-
Notifications
You must be signed in to change notification settings - Fork 1
/
chunking.py
129 lines (95 loc) · 3.51 KB
/
chunking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import bs4 as bs
import urllib.request
from course import Course
from constants import WEBSOC
import time
def get_chunks_for(course_codes: [str], all_course_codes: [[str]], *,
                   max_span: int = 900, max_batch: int = 8) -> [[str]]:
    """
    Groups the requested course codes into the lists a search should be run with.

    The grouping is done in two passes:

    1. The sorted codes are split into "spans". A code joins the current span
       while its position in all_course_codes is fewer than max_span places
       after the position of the span's first code; otherwise it starts a new
       span.
    2. The spans are packed into the returned batches. A span that still fits
       is merged into the pending batch, which is emitted as soon as it holds
       exactly max_batch codes. A span that does not fit is emitted unchanged
       as a batch of its own, so it can appear before the pending batch and
       can hold more than max_batch codes.

    :param course_codes: the codes to search for, in any order
    :param all_course_codes: every known code, in the order that defines a
        code's position; either a flat list of codes or a list of chunks
        (lists of codes), which is flattened in order
    :param max_span: exclusive upper bound on the position distance between
        the first code of a span and any other code in it (presumably mirrors
        a per-search result limit of WebSoc -- confirm)
    :param max_batch: largest number of codes merged into one batch
        (presumably the number of codes one search accepts -- confirm)
    :return: the batches, each a list of course codes
    :raises ValueError: if max_span or max_batch is smaller than 1
    :raises KeyError: if a code whose position has to be compared is missing
        from all_course_codes
    """
    if max_span < 1 or max_batch < 1:
        raise ValueError("max_span and max_batch must both be at least 1")

    # Lists cannot be dictionary keys, so chunked input is flattened first;
    # a flat list of codes passes through unchanged.
    known_codes = []
    for entry in all_course_codes:
        if isinstance(entry, list):
            known_codes.extend(entry)
        else:
            known_codes.append(entry)
    position = {code: index for index, code in enumerate(known_codes)}

    # Pass 1: split the sorted codes into spans of limited position distance
    spans = []
    current_span = []
    for code in sorted(course_codes):
        if current_span and position[code] - position[current_span[0]] >= max_span:
            spans.append(current_span)
            current_span = []
        current_span.append(code)
    if current_span:
        spans.append(current_span)

    # Pass 2: merge small spans into batches of at most max_batch codes
    batches = []
    pending = []
    for span in spans:
        if len(pending) + len(span) <= max_batch:
            pending.extend(span)
        else:
            # Does not fit: the span is searched on its own, whatever its size
            batches.append(span)
        if len(pending) == max_batch:
            batches.append(pending)
            pending = []
    if pending:
        batches.append(pending)
    return batches
def get_chunks(term, chunk_size: int = 900) -> [[str]]:
    """
    Splits every course code of a term into consecutive, sorted chunks.

    All codes returned by get_all_codes(term) are sorted and cut into lists of
    chunk_size codes each; the last list holds whatever remains and may be
    shorter. No codes yields an empty list.

    :param term: the term value forwarded to get_all_codes
    :param chunk_size: number of codes per chunk
    :return: the chunks, in sorted order
    :raises ValueError: if chunk_size is smaller than 1
    """
    # Validate before any page is fetched
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    course_codes = sorted(get_all_codes(term))
    return [course_codes[start:start + chunk_size]
            for start in range(0, len(course_codes), chunk_size)]
def _get_courses_in_page(url) -> [Course]:
    """
    Given a WebSoc search URL, yields a Course for each course row of the results page
    """
    # Download the results page and parse it
    with urllib.request.urlopen(url) as response:
        page = bs.BeautifulSoup(response, "html.parser")
    # Every table row is a candidate course
    for table_row in page.find_all("tr"):
        # The string of each cell in this row (None where a cell has no single string)
        column_values = [cell.string for cell in table_row.find_all("td")]
        # Only rows with 15, 16 or 17 cells are turned into a Course
        if len(column_values) not in {15, 16, 17}:
            continue
        yield Course(column_values)
def _get_department_urls(term) -> [str]:
    """
    Creates a generator over the URLs of each department's WebSOC search results page

    One URL is produced per option of the "Dept" menu on the WEBSOC page. Each
    URL asks for the given term with finals, comments and cancelled courses
    included.

    :param term: the value sent as the YearTerm field
    :raises AttributeError: if the page has no select element named "Dept"
    """
    # Imported explicitly: the module-level "import urllib.request" makes
    # urllib.parse reachable only as a side effect of its own imports.
    from urllib.parse import urlencode

    # Get the page that lists all the departments; the response is closed
    # before the first URL is yielded
    with urllib.request.urlopen(WEBSOC) as source:
        soup = bs.BeautifulSoup(source, "html.parser")
    # Extract the department codes from the department menu
    department_menu = soup.find("select", {"name": "Dept"})
    for dept_option in department_menu.find_all("option"):
        url_fields = [("YearTerm", term),
                      ("ShowFinals", '1'),
                      ("ShowComments", '1'),
                      ("Dept", dept_option.get("value")),
                      ("CancelledCourses", 'Include')]
        # Encode the URL that shows courses in this department
        yield f'{WEBSOC}?{urlencode(url_fields)}'
def get_all_codes(term) -> [str]:
    """
    Collects the code of every course listed on WebSoc for the given term

    The department pages are visited one after another, with a one second
    pause before each page is read.
    """
    collected = []
    for department_url in _get_department_urls(term):
        # Pause before each department page is requested
        time.sleep(1)
        collected.extend(course.code for course in _get_courses_in_page(department_url))
    return collected