relevant_professors.py
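"""Find relevant professors by scanning Google Scholar profiles.

Reads the professor list produced by csranking_top_professors.py, visits each
professor's Google Scholar profile with Selenium, records recent works and
keyword-based relevance, and saves the results to a CSV file.
"""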
import time
import argparse

from bs4 import BeautifulSoup
from selenium import webdriver

# utils provides clean_text, load_universities_to_csv, and
# save_relevant_professors_to_csv; config provides SLEEP_TIME,
# DEFAULT_MAX_COUNT, and DEFAULT_MAX_SEARCH_COUNT.
from utils import *
from config import *


def check_relevant_professors_in_scholar(items=None, keywords=None, max_count=DEFAULT_MAX_COUNT, max_search_count=DEFAULT_MAX_SEARCH_COUNT):
    items = items or []
    keywords = keywords or []
    relevant_professors = []
    search_count = 0
    count = 0
    driver = webdriver.Chrome()
    try:
        for prof_item in items:
            relevant_professor = {
                "school": prof_item.get("school", ""),
                "name": prof_item.get("name", ""),
                "home_page": prof_item.get("home_page", ""),
                "google_scholar": prof_item.get("google_scholar", ""),
            }
            prof_recent_highlights = []
            prof_relevance = 0
            scholar_url = prof_item.get("google_scholar", "")
            if scholar_url:
                # Sort the profile's works by year
                scholar_year_url = scholar_url + "&view_op=list_works&sortby=pubdate"
                driver.get(scholar_year_url)
                time.sleep(SLEEP_TIME)  # Adjust this sleep time w.r.t. your network speed.
                # Record recent highlights: recent works whose titles contain a keyword
                soup = BeautifulSoup(driver.page_source, "html.parser")
                titles = soup.find_all("a", class_="gsc_a_at")
                for title in titles:
                    for keyword in keywords:
                        if title.text.lower().count(keyword) > 0:
                            prof_recent_highlights.append(clean_text(title.text))
                            break  # Add each title at most once
                # Sort the profile's works by citation count
                scholar_citation_url = scholar_url + "&view_op=list_works"
                driver.get(scholar_citation_url)
                time.sleep(SLEEP_TIME)  # Adjust this sleep time w.r.t. your network speed.
                # Record domain relevance: keyword occurrences in highly cited works
                soup = BeautifulSoup(driver.page_source, "html.parser")
                titles = soup.find_all("a", class_="gsc_a_at")
                prof_relevance = sum(
                    title.text.lower().count(keyword)
                    for title in titles
                    for keyword in keywords
                )
            relevant_professor.update(
                {
                    "recent_highlights_num": len(prof_recent_highlights),
                    "recent_highlights": "[" + ", ".join(prof_recent_highlights) + "]",
                    "relevance": prof_relevance,
                }
            )
            # Keep this professor if they have recent highlights or nonzero relevance
            if (
                relevant_professor["recent_highlights_num"] > 0
                or relevant_professor["relevance"] > 0
            ):
                relevant_professors.append(relevant_professor)
                count += 1
                if count >= max_count:
                    break
            # Check the maximum search count
            search_count += 1
            if search_count >= max_search_count:
                break
            # Adjust this sleep time w.r.t. your network speed
            time.sleep(SLEEP_TIME)
    finally:
        driver.quit()  # Close the browser even when the loop exits early
    return relevant_professors

def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Search for relevant professors based on keywords in paper titles."
    )
    parser.add_argument(
        "--filename",
        type=str,
        required=True,
        help="Datasource file, the output of csranking_top_professors.py (e.g., 2020-2024-sec.csv)",
    )
    parser.add_argument(
        "--keywords",
        type=str,
        required=True,
        help="Comma-separated list of relevant keywords in paper titles (e.g., \"adversarial, blockchain, LLM\")",
    )
    parser.add_argument(
        "--max_count",
        type=int,
        default=DEFAULT_MAX_COUNT,
        help=f"Maximum number of professors to output (default: {DEFAULT_MAX_COUNT})",
    )
    parser.add_argument(
        "--max_search_count",
        type=int,
        default=DEFAULT_MAX_SEARCH_COUNT,
        help=f"Maximum number of professors to search for (default: {DEFAULT_MAX_SEARCH_COUNT})",
    )
    parser.add_argument(
        "--schools",
        type=str,
        default="",
        help="Comma-separated list of university names to filter by (e.g., \"CISPA Helmholtz Center, Stanford University, Zhejiang University\")",
    )
    args = parser.parse_args()
    # Cap the requested counts at the configured maximums
    if args.max_count > DEFAULT_MAX_COUNT:
        args.max_count = DEFAULT_MAX_COUNT
    if args.max_search_count > DEFAULT_MAX_SEARCH_COUNT:
        args.max_search_count = DEFAULT_MAX_SEARCH_COUNT
    keywords = [keyword.strip().lower() for keyword in args.keywords.replace("\"", "").split(",")]
    school_filter = (
        [school.strip() for school in args.schools.replace("\"", "").split(",")]
        if args.schools
        else None
    )
    return args.filename, keywords, args.max_count, args.max_search_count, school_filter

if __name__ == "__main__":
    filename, keywords, max_count, max_search_count, school_filter = parse_arguments()
    # Read the datasource file
    print("Loading universities...")
    datasource = load_universities_to_csv(filename, school_filter=school_filter)
    print(f"Keywords: {keywords}")
    print("Start obtaining Google Scholar info...")
    relevant_profs = check_relevant_professors_in_scholar(
        datasource, keywords, max_count=max_count, max_search_count=max_search_count
    )
    print("Finished obtaining Google Scholar info.")
    # Sort by the number of recent highlights, descending
    relevant_profs_sorted = sorted(
        relevant_profs, key=lambda x: x["recent_highlights_num"], reverse=True
    )
    save_filename = "relevant-profs-" + "-".join(keywords) + ".csv"
    save_relevant_professors_to_csv(save_filename, relevant_profs_sorted)
    print(f"Relevant professors' information has been saved to {save_filename}")