-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
160 lines (136 loc) · 5.72 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import asyncio
import os
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfMerger
from weasyprint import HTML, CSS
class BojToPdf:
    """Download acmicpc.net problem pages and bundle them into one merged PDF.

    Each problem number is fetched from ``https://www.acmicpc.net/problem/<n>``,
    stripped of page chrome, rendered to ``./outputs/<n>.pdf`` with WeasyPrint,
    and finally merged into ``./outputs/<YYYY-MM-DD>-문제집-<set_number>.pdf``.
    The per-problem PDFs and temporary HTML files are removed afterwards.
    """

    # Seconds to wait on a single HTTP request; without a timeout
    # ``requests.get`` can block a worker thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, problem_numbers, set_number):
        """Store the problems to fetch and the set number used in the output name.

        Args:
            problem_numbers: Re-iterable collection (e.g. list) of problem ids.
            set_number: Identifier embedded in the merged PDF's file name.
        """
        self.problem_numbers = problem_numbers
        self.set_number = set_number
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

    @staticmethod
    def _font_css(font_selectors):
        """Return stylesheet text that applies the bundled Noto Sans font.

        Args:
            font_selectors: CSS selector list that should be forced to Noto Sans
                in addition to ``body``.
        """
        # Path.as_uri() yields a well-formed, percent-encoded file:// URL on
        # every platform; naive "file://" + path breaks on Windows drive paths
        # and on directories containing spaces.
        font_uri = (Path(os.path.abspath('./fonts')) / 'NotoSans-Regular.ttf').as_uri()
        # NOTE(review): the original rule used the bare type selector
        # `container`, which matches no HTML element; the class selector
        # `.container` is presumably what was meant -- confirm against the
        # page markup.
        return f'''
            @font-face {{
                font-family: 'Noto Sans';
                src: url('{font_uri}') format('truetype');
            }}
            body {{
                font-family: 'Noto Sans', sans-serif;
                max-width: 100%;
            }}
            .container {{
                width: 100%;
            }}
            {font_selectors} {{
                font-family: 'Noto Sans', monospace;
            }}
        '''

    def download_and_clean_html(self, link):
        """Fetch a problem page and return its cleaned HTML.

        Args:
            link: URL of the problem page.

        Returns:
            The cleaned HTML as a string, or ``None`` when the page could not
            be retrieved (network error, HTTP 403, or any other non-success
            status); a message is printed in that case.
        """
        try:
            response = requests.get(
                link, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            # Best-effort: one unreachable problem must not abort the set.
            print(f"Request failed for {link}: {exc}")
            return None

        if response.status_code == 403:
            print(f"Access forbidden for {link}")
            return None
        if not response.ok:
            # Do not render 404/5xx error pages into the problem set.
            print(f"Unexpected HTTP status {response.status_code} for {link}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements with specific class names (site footer and header).
        for class_name in ('footer', 'header'):
            for element in soup.find_all(class_=class_name):
                element.decompose()
        for element in soup.find_all(class_='page-header'):
            element['style'] = 'margin: 0;'
        for element in soup.find_all(id='problem_association'):
            element.decompose()

        # Add a style tag that configures the font.
        style_tag = soup.new_tag('style')
        style_tag.string = self._font_css('pre, code, kbd, samp')
        if soup.head is not None:
            soup.head.append(style_tag)
        else:
            # A document without <head> would otherwise raise AttributeError.
            soup.insert(0, style_tag)

        return str(soup)

    async def save_html_to_file(self, html, output_path):
        """Write ``html`` next to ``output_path`` and return the HTML file path.

        Args:
            html: HTML text to store.
            output_path: Target PDF path; the HTML file uses the same stem with
                an ``.html`` extension.

        Returns:
            Path of the temporary HTML file that was written.
        """
        # Swap only the extension; str.replace would also rewrite any ".pdf"
        # that happens to occur in a directory name.
        temp_html_path = os.path.splitext(output_path)[0] + '.html'
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self._write_file, temp_html_path, html)
        print(f"Saved cleaned HTML to {temp_html_path}")
        return temp_html_path

    def _write_file(self, temp_html_path, html):
        """Write ``html`` to ``temp_html_path`` as UTF-8 (blocking)."""
        with open(temp_html_path, 'w', encoding='utf-8') as file:
            file.write(html)

    def convert_html_to_pdf(self, temp_html_path, output_path):
        """Render the HTML file to ``output_path`` and delete the HTML file.

        Args:
            temp_html_path: HTML file produced by ``save_html_to_file``.
            output_path: Destination PDF path.
        """
        css = CSS(string=self._font_css(
            'h1, h2, h3, h4, h5, p, div, span, text, pre, code, kbd, samp'
        ))
        try:
            # Convert the HTML to PDF.
            HTML(temp_html_path).write_pdf(output_path, stylesheets=[css])
        finally:
            # Delete the temporary HTML file even when rendering fails.
            os.remove(temp_html_path)

    async def process_problems(self):
        """Download, convert and merge every problem of this set.

        Downloads and PDF conversions run in a thread pool. Problems whose
        download failed are skipped. When no PDF was produced at all, no merged
        file is written.
        """
        os.makedirs('./outputs', exist_ok=True)
        loop = asyncio.get_running_loop()

        with ThreadPoolExecutor() as executor:
            # Download and clean the HTML of every problem concurrently.
            html_contents = await asyncio.gather(*(
                loop.run_in_executor(
                    executor,
                    self.download_and_clean_html,
                    f"https://www.acmicpc.net/problem/{problem_number}",
                )
                for problem_number in self.problem_numbers
            ))

            # Save the HTML of each successfully downloaded problem.
            conversions = []
            for problem_number, html in zip(self.problem_numbers, html_contents):
                if html is None:
                    continue
                output_pdf_path = f"./outputs/{problem_number}.pdf"
                temp_html_path = await self.save_html_to_file(html, output_pdf_path)
                conversions.append((temp_html_path, output_pdf_path))

            # Convert the saved HTML files to PDF concurrently.
            await asyncio.gather(*(
                loop.run_in_executor(
                    executor, self.convert_html_to_pdf, html_path, pdf_path
                )
                for html_path, pdf_path in conversions
            ))

        # Merge only the PDFs produced in this run, so stale files left in
        # ./outputs by an earlier run are never picked up.
        pdf_paths = [
            pdf_path for _, pdf_path in conversions if os.path.exists(pdf_path)
        ]
        if not pdf_paths:
            print(f"No PDFs were generated for set {self.set_number}; nothing to merge")
            return

        # Build the output file name from today's date.
        today_str = datetime.today().strftime('%Y-%m-%d')
        merged_pdf_path = f"./outputs/{today_str}-문제집-{self.set_number}.pdf"

        merger = PdfMerger()
        try:
            for pdf_path in pdf_paths:
                merger.append(pdf_path)
            merger.write(merged_pdf_path)
        finally:
            merger.close()

        # Delete the individual PDF files (existence check tolerates duplicates).
        for pdf_path in pdf_paths:
            if os.path.exists(pdf_path):
                os.remove(pdf_path)

        print(f"Created merged PDF: {merged_pdf_path}")
async def main(problem_sets=None):
    """Create one merged PDF per problem set, processing all sets concurrently.

    Args:
        problem_sets: Iterable of problem-number collections. Each collection
            is handed to ``BojToPdf`` together with its 1-based position, which
            becomes the set number. Defaults to no sets (the previous
            hard-coded value), in which case nothing is done. Example::

                [
                    [14499, 13460, 12100, 14891],
                    [17144, 15683, 3055, 14890],
                    [1202, 2589, 9466, 11000],
                ]
    """
    if problem_sets is None:
        problem_sets = []

    await asyncio.gather(*(
        BojToPdf(problem_set, set_number).process_problems()
        for set_number, problem_set in enumerate(problem_sets, start=1)
    ))
# Run the main() coroutine to completion.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this also
# executes when the module is imported -- confirm that is intended.
asyncio.run(main())