-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobkorea_protocal.py
83 lines (63 loc) · 2.3 KB
/
jobkorea_protocal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import jobkorea
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import ssl
import os
import sqlite3
# Read the local secrets file and pull out the login URL used by the crawler.
with open("secrets.json", "r") as fh:
    secrets = json.load(fh)
jobkorea_login_url = secrets["LOGIN_URL"]

# Connect to the SQLite database and make sure the `data` table
# (Question, Answer) exists before anything is written to it.
conn = sqlite3.connect("crawling_data.db")
cursor = conn.cursor()
cursor.execute(
    "CREATE TABLE IF NOT EXISTS data\n"
    "(Question TEXT, Answer TEXT)"
)
# Data storage function
def save_to_db(question, answer, *, connection=None):
    """Insert a (question, answer) pair into the `data` table unless it exists.

    Args:
        question: Question text stored in the `Question` column (may be None).
        answer: Answer text stored in the `Answer` column (may be None).
        connection: Optional sqlite3 connection to write to. When omitted,
            the module-level ``conn`` is used, as before.

    The insert is committed immediately; nothing is written (and no commit
    is issued) when an identical row is already present.
    """
    db = conn if connection is None else connection

    # Duplicate check. `IS` behaves like `=` for non-NULL values but also
    # matches NULL against NULL, so pairs containing None are not inserted
    # again on every call. Only existence matters, hence SELECT 1 ... LIMIT 1.
    existing = db.execute(
        "SELECT 1 FROM data WHERE Question IS ? AND Answer IS ? LIMIT 1",
        (question, answer),
    ).fetchone()
    if existing is None:
        db.execute(
            "INSERT INTO data (Question, Answer) VALUES (?, ?)",
            (question, answer),
        )
        db.commit()
# Work around SSL certificate errors by making the default HTTPS context
# factory the unverified one.
# NOTE(review): this disables certificate verification for stdlib HTTPS
# clients in this process; presumably it is only needed for the chromedriver
# download done by webdriver_manager - confirm and narrow if possible.
ssl._create_default_https_context = ssl._create_unverified_context

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
)
# add_argument() mutates the options object and returns None, so the flag
# must be added here and the options object itself handed to the driver.
# Passing the call's result as `options=` would hand the driver None and
# drop both the user-agent and the headless flag.
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)
# Skip the first n lines of the link file.
# NOTE(review): presumably these are links already crawled by an earlier
# run - confirm before changing the offset.
crawled = 499
read_count = crawled
qa_data = []

try:
    # The context manager closes the link file even if the crawl is aborted.
    with open("/Users/jang-youngjoon/학교/2023-2학기/졸프/jobkorea_link.txt", "r") as link_file:
        driver.get(jobkorea_login_url)

        # enumerate() yields the 0-based line index, so the first processed
        # line is reported as `crawled`. Skipping by index also copes with a
        # file shorter than `crawled` lines (the loop simply does nothing).
        for read_count, file_url in enumerate(link_file):
            if read_count < crawled:
                continue
            print(read_count, "번째 줄")

            # Best-effort crawl: a failure on one link is reported and the
            # loop moves on to the next link.
            try:
                qa_result = jobkorea.self_introduction_crawl(driver=driver, file_url=file_url)
                # Questions and answers are paired positionally; zip() stops
                # at the shorter list if their lengths differ.
                for question, answer in zip(
                    qa_result["question_list"], qa_result["answer_list"]
                ):
                    # Save to the DB
                    save_to_db(question, answer)
                    qa_data.append({
                        "Question": question,
                        "Answer": answer
                    })
                print(len(qa_data))
            except Exception as e:
                print(f"{read_count}번째에서 다음 에러가 발생했습니다: {e}")
finally:
    # Release the browser and the database connection however the run ends.
    try:
        driver.quit()
    finally:
        conn.close()