Datamining Suncam
Datamining Suncam
Suncam.com is a great source of continuing-education courses for Professional Engineers and Land Surveyors. All courses and exams are available for free download.
Recently, while taking a course, I noticed that the course and worksheet filenames follow an easily identifiable pattern, so I built the Python script below to retrieve all of the courses and worksheets.
"""
Brandon Taylor, PE
Suncam Course Retriever
"""
import bs4
import os
import time
import requests
# Local cache of the SunCam "all courses" inventory page.
inv_html = './/data//suncam.html'
# Live URL of the page listing every available course.
courses_url = 'https://www.suncam.com/continuing-education/all_courses.html'
# S3 bucket prefix under which course and worksheet PDFs are hosted.
url_prefix = 'https://s3.amazonaws.com/suncam/'
def main():
    """Download every SunCam course and worksheet PDF.

    Fetches (and caches locally) the course inventory page, parses the
    course entries out of it, then downloads each course PDF and its
    worksheet that is not already present on disk, sleeping between
    requests to be polite to the server.

    Raises:
        requests.HTTPError: if the inventory page cannot be fetched.
    """
    # Retrieve and cache the inventory page on first run only.
    if not os.path.isfile(inv_html):
        response = requests.get(url=courses_url)
        response.raise_for_status()
        if response.text:
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            # Save a local copy so subsequent runs skip the network fetch.
            with open(inv_html, "w", encoding='utf-8') as file:
                file.write(str(soup))
    # Re-parse from disk in binary mode: avoids a UnicodeDecodeError and
    # lets BeautifulSoup sniff the encoding itself.
    with open(inv_html, 'rb') as html:
        soup = bs4.BeautifulSoup(html, 'html.parser')
    courses = soup.find_all("span", {'class': 'course'})
    print(f'SunCam Courses: {len(courses)}')
    # Ensure the target directories exist before writing into them
    # (the original crashed on first run without them).
    os.makedirs('.//data//docs//', exist_ok=True)
    os.makedirs('.//data//worksheets//', exist_ok=True)
    # First <span class="course"> is not a course entry -- skip it, and
    # use the skipped-adjusted count as the progress total (the original
    # printed len(courses), overstating the total by one).
    total = len(courses) - 1
    for idx, entry in enumerate(courses[1:], 1):
        txt = entry.text
        # Course ID is the first three characters of the entry text.
        courseid = txt[:3]
        # Everything from the first dash onward is the course name;
        # strip characters that are awkward in filenames.
        coursename = txt[txt.find("-"):].replace("-", "").replace(" ", "_").replace('"', "")
        # Build local paths and remote URLs for the course and worksheet.
        course_path = os.path.join('.//data//docs//', courseid + '_' + coursename + '.pdf')
        wrksht_path = os.path.join('.//data//worksheets//', courseid + '_WorkSheet.pdf')
        course_url = url_prefix + 'docs/' + courseid + '.pdf'
        wrksht_url = url_prefix + 'worksheets/' + courseid + '_Worksheet.pdf'
        print(f'[{idx}/{total}] {coursename}')
        # Retrieve the course PDF unless it is already on disk.
        if not os.path.isfile(course_path):
            pdf = requests.get(course_url)
            # Only save a successful response -- the original wrote the
            # body unconditionally, caching HTML error pages as .pdf files.
            if pdf.ok:
                with open(course_path, 'wb') as f:
                    f.write(pdf.content)
            else:
                print(f'  skipped course {courseid}: HTTP {pdf.status_code}')
            time.sleep(10)  # rate-limit between downloads
        # Retrieve the worksheet PDF unless it is already on disk.
        if not os.path.isfile(wrksht_path):
            pdf = requests.get(wrksht_url)
            if pdf.ok:
                with open(wrksht_path, 'wb') as f:
                    f.write(pdf.content)
            else:
                print(f'  skipped worksheet {courseid}: HTTP {pdf.status_code}')
            time.sleep(10)
# Run the retriever only when executed as a script, not on import.
if __name__ == '__main__':
    main()