Datamining Suncam

Datamining Suncam

suncam.com

Suncam.com is a great source for continuing education courses for Professional Engineers and Land Surveyors. All courses and exams are available for free download.

Recently, while taking a course, I noticed that the course and worksheet filenames follow an easily predictable pattern, so I built the Python script below to retrieve all of the courses and worksheets.

"""
Brandon Taylor, PE
Suncam Course Retriever
"""

import bs4
import os
import time
import requests

# local copy of the course listing page, saved on the first run and reused after
inv_html: str = ".//data//suncam.html"
# page that is fetched and parsed for the course entries
courses_url: str = "https://www.suncam.com/continuing-education/all_courses.html"
# common prefix of the course ('docs/') and worksheet ('worksheets/') PDF urls
url_prefix: str = "https://s3.amazonaws.com/suncam/"


def main():
    """Retrieve every SunCam course PDF and worksheet not already on disk.

    The course listing page is fetched once and cached at ``inv_html``; later
    runs parse the cached copy.  Each ``span`` with class ``course`` (except
    the first, which is not a course) yields a three-character course id and
    a course name, from which the local filenames and download urls are
    built.  Files that already exist locally are not requested again, and the
    script pauses after every request so the server is not hammered.

    Raises:
        requests.HTTPError: if the course listing page cannot be fetched.
        FileNotFoundError: if the listing came back empty and no cached copy
            exists to parse.
    """
    docs_dir = './/data//docs//'
    wrksht_dir = './/data//worksheets//'
    pause_seconds = 10

    # create the output folders up front; open() does not create them
    for folder in (os.path.dirname(inv_html), docs_dir, wrksht_dir):
        if folder:
            os.makedirs(folder, exist_ok=True)

    # if inventory has not been retrieved then retrieve and save
    if not os.path.isfile(inv_html):
        response = requests.get(url=courses_url, timeout=60)
        response.raise_for_status()

        # an empty body is not cached, so the next run tries again
        if response.text:
            # create a beautiful soup object from all_courses.html
            soup = bs4.BeautifulSoup(response.text, 'html.parser')

            # save copy of html file locally
            with open(inv_html, "w", encoding='utf-8') as file:
                file.write(str(soup))

    # opened in binary mode so bs4 does the decoding itself; reading the
    # file as text raised UnicodeDecodeError
    with open(inv_html, 'rb') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')

    # grab all courses, dropping the first entry (not a course) so the
    # count and the progress total match what is actually processed
    courses = soup.find_all("span", {'class': 'course'})[1:]
    total = len(courses)

    print(f'SunCam Courses: {total}')

    for idx, entry in enumerate(courses, 1):
        txt = entry.text

        # courseid is first three digits
        courseid = txt[:3]
        coursename = _course_name_from_title(txt)

        # build course and worksheet filenames
        course_path = os.path.join(docs_dir, courseid + '_' + coursename + '.pdf')
        wrksht_path = os.path.join(wrksht_dir, courseid + '_WorkSheet.pdf')
        course_url = url_prefix + 'docs/' + courseid + '.pdf'
        wrksht_url = url_prefix + 'worksheets/' + courseid + '_Worksheet.pdf'

        print(f'[{idx}/{total}] {coursename}')

        # retrieve whichever of the two files is not already on disk,
        # pausing after every request whether or not it succeeded
        for url, path in ((course_url, course_path), (wrksht_url, wrksht_path)):
            if not os.path.isfile(path):
                _download(url, path)
                time.sleep(pause_seconds)


def _course_name_from_title(title):
    """Return the filename-safe course name taken from a listing title."""
    # the name is everything after the first dash; without a dash, fall back
    # to the text following the three-character course id
    _, dash, name = title.partition("-")
    if not dash:
        name = title[3:]

    name = name.replace("-", "").replace(" ", "_").replace('"', "")

    # a path separator in the title would point into a folder that is not there
    return name.replace("/", "_").replace("\\", "_")


def _download(url, path):
    """Save the body of *url* to *path*; return True if the file was written."""
    response = requests.get(url, timeout=60)

    # do not store an error page under a .pdf name; it would also make the
    # file look "already retrieved" on every later run
    if not response.ok:
        print(f'    skipped (HTTP {response.status_code}): {url}')
        return False

    with open(path, 'wb') as f:
        f.write(response.content)
    return True


# run the retriever only when this file is executed as a script
if __name__ == '__main__':
    main()