# OG Author: Pierre Luc Bacon
# Updated by: Koustuv Sinha
import feedparser
from operator import itemgetter
from collections import defaultdict
from bs4 import BeautifulSoup
import yaml
# Profs with their DBLP RSS feeds and start dates.
# Include `dblp` and `startyear` tags in the _data/professors.yml file
# to search DBLP for their papers.
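# Example entry (the RSS URL shape is illustrative, not prescriptive):
#   - name: Jane Doe
#     dblp: https://dblp.org/pid/00/0000.rss
#     startyear: 2015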
with open("_data/professors.yml") as f:
    professors = yaml.safe_load(f)
profs_with_dblp = [p for p in professors if ("dblp" in p) and ("startyear" in p)]
print(f"Fetching paper details for {len(profs_with_dblp)} professors and affiliates")
# File where the papers will be written. This file is included by `_pages/publications.md`
output_file = "_includes/dblp_pubs.html"
publications = defaultdict(dict)
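# Maps publication year -> {DBLP entry id -> formatted citation}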
# Track paper titles already seen, so duplicates are skipped
seen_entries = set()
total = 0
print("Fetching papers from DBLP...")
for prof in profs_with_dblp:
    prof_name = prof["name"]
    print(f"Fetching papers for author: {prof_name}")
    dblp = prof["dblp"]
    start_year = prof["startyear"]
    d = feedparser.parse(dblp, sanitize_html=False)
    print(f"Got {len(d['entries'])} papers")
    ct = 0
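    # Each RSS entry is expected to expose `title`, `link`, `id`, `published`
    # (the publication year) and `summary` (HTML with schema.org microdata),
    # as consumed by the loop below.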
    for entry in d["entries"]:
        # Skip papers published before the prof's start year at McGill
        if int(entry["published"]) < int(start_year):
            continue
        # DBLP also returns arXiv preprints. Assuming the conference version
        # is the latest one, we keep only the first entry with a given title.
        # This also assumes no two papers have the same title.
        if entry["title"] not in seen_entries:
            # Parse the summary, which is formatted HTML.
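            # The summary is expected to carry schema.org microdata, roughly
            # (structure assumed from the itemprop lookups below):
            #   <span itemprop="author">Jane Doe</span> ...
            #   <span itemprop="isPartOf">NeurIPS</span>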
            soup = BeautifulSoup(entry["summary"], "html.parser")
            authors = soup.find_all(itemprop="author")
            author_string = ",".join([str(auth.contents[0]) for auth in authors])
            # Not all papers have a correctly tagged venue; fall back to an empty string
            isPartOf = soup.find_all(itemprop="isPartOf")
            if len(isPartOf) == 0:
                venue = ""
            else:
                isPartOfContents = isPartOf[0].contents
                if len(isPartOfContents) == 0:
                    venue = ""
                else:
                    venue = isPartOfContents[0].get_text() + f" ({entry['published']})"
            title_with_link = f"<i><a href='{entry['link']}'>{entry['title']}</a></i>"
            formatted_cite = author_string + "; " + title_with_link + " " + venue
            publications[int(entry["published"])][entry["id"]] = formatted_cite
            seen_entries.add(entry["title"])
            ct += 1
    print(f"Collected {ct} papers")
    total += ct
print(f"Writing total of {total} papers in {output_file}")
# Write in the output file
with open(output_file, "w") as f:
    # Add an index of years for quick navigation; sort newest-first and apply
    # the same year cutoff as the body below, so every link has a target
    years = sorted((y for y in publications if y > 1997), reverse=True)
    header = "Years active:" + ",".join([f" [{y}](#{y})" for y in years]) + "\n\n"
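    # e.g. "Years active: [2024](#2024), [2023](#2023), ..."; each link targets
    # the corresponding "## <year>" heading written below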
    f.write(header)
    for year, pubs in sorted(publications.items(), key=itemgetter(0), reverse=True):
        if year > 1997:
            pubsyear = ["<li>" + pub + "</li>\n\n" for pub in pubs.values()]
            f.write("## {0}\n\n <ul>{1}</ul>\n\n".format(year, "".join(pubsyear)))
print("Done")