-
Notifications
You must be signed in to change notification settings - Fork 0
/
paperhelp_MICCAI_2021.py
89 lines (74 loc) · 3.34 KB
/
paperhelp_MICCAI_2021.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import sys
import requests
import os
import re
import html
from typing import List
from tqdm import tqdm
# Base URL of the MICCAI 2021 open-access paper index; both the index page
# and the per-paper pages are requested relative to it.
CVF_URL = "https://miccai2021.org/openaccess/paperlinks/"
def download_file(url, dir, filename):
    """Download ``url`` and save the response body as ``<dir>/<filename>.pdf``.

    Args:
        url: Address of the file to fetch; redirects are followed.
        dir: Directory to write into. The parameter name shadows the builtin
            but is kept because callers pass it by keyword.
        filename: Base name for the saved file, without extension. The
            characters ``: ? / + ' " < >`` are replaced by spaces and
            characters in the range U+4E00-U+9FA5 are removed before use.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If the target file cannot be written.
    """
    r = requests.get(url, stream=False, allow_redirects=True)
    # Fail loudly instead of silently saving an error page as a ".pdf".
    r.raise_for_status()
    # Raw string: the original non-raw pattern relied on the invalid escape
    # "\?"; the character class matched here is unchanged.
    filename = re.sub(r'[:?/+\'"<>]', " ", filename)
    filename = re.sub("[\u4e00-\u9fa5]", "", filename)
    # Context manager so the handle is flushed and closed deterministically.
    with open(f"{dir}/{filename}.pdf", 'wb') as fp:
        fp.write(r.content)
class PaperHelper:
    """Look up MICCAI open-access papers by title keyword and download them.

    Construction scrapes the index page under ``CVF_URL`` for paper titles
    and resolves one PDF link per paper. The resolved links are cached in
    ``MICCAI<year>/MICCAI<year>_list.txt`` so later runs skip the per-paper
    requests; an existing cache file is trusted as-is.

    Attributes:
        year: ``year`` converted to ``str``; used in folder/file names and
            in the pattern that picks paper links out of the index page.
        titles: Paper titles in index-page order.
        urls: Lines of the cached list file (each still ending in a
            newline); entry ``i`` is used as the PDF link for ``titles[i]``.
    """

    def __init__(self, year):
        """Scrape the index page and load (or build) the PDF link list.

        Args:
            year: Conference year, e.g. ``2021``.

        Raises:
            IndexError: If a paper page contains no DOI link while the
                link list is being built.
        """
        self.year = str(year)
        # CVF_URL ends with "/"; strip it so joined URLs have no "//".
        base_url = CVF_URL.rstrip('/')
        webpage = self._fetch_page(f"{base_url}/index.html", 'temp.html')

        # Each match starts with the 6-character 'html">' that closes the
        # link's href; slicing it off leaves the anchor text (the title).
        matches = re.findall('(?=html">).+(?=</a>)', webpage)
        self.titles = [match[6:] for match in matches]
        print(f'there are {len(self.titles)} papers in MICCAI{self.year}')

        out_dir = f'MICCAI{self.year}'
        list_path = f'{out_dir}/MICCAI{self.year}_list.txt'
        os.makedirs(out_dir, exist_ok=True)

        if not os.path.exists(list_path):
            # Relative page paths start at "<year>/" and end before ".html".
            self.url = re.findall(rf"(?={self.year}/).+(?=\.html)", webpage)
            # Plain string formatting: os.path.join is for filesystem paths,
            # not URLs.
            self.urls = [f"{base_url}/{rel}.html" for rel in self.url]
            self.dois = []
            for page_url in tqdm(self.urls):
                page = self._fetch_page(page_url, 'doi.html')
                found = re.findall('(?=DOI: <a href=").+(?=">https)', page)
                if not found:
                    # Same exception type as the unchecked found[0] would
                    # raise, but naming the page that caused it.
                    raise IndexError(f"no DOI link found on {page_url}")
                # Drop the 14-character 'DOI: <a href="' prefix.
                doi = found[0][14:]
                doi = doi.replace('https://doi.org/',
                                  'https://link.springer.com/content/pdf/')
                self.dois.append(doi + '.pdf')
            self.urls = self.dois
            with open(list_path, 'w', encoding='utf-8') as fp:
                fp.writelines(f"{item}\n" for item in self.urls)

        with open(list_path, encoding='utf-8') as fp:
            self.urls = fp.readlines()

    @staticmethod
    def _fetch_page(url, cache_path):
        """Fetch ``url``, keep a copy at ``cache_path``, return unescaped HTML."""
        text = requests.get(url).text
        with open(cache_path, 'w', encoding='utf-8') as fp:
            fp.write(text)
        # Read back with the encoding used for writing; relying on the
        # locale default could fail or garble text on non-UTF-8 systems.
        with open(cache_path, encoding='utf-8') as fp:
            text = fp.read()
        return html.unescape(text)

    def search_keyword(self, kw: str) -> List[int]:
        """Return the indices of all titles containing ``kw``.

        The match is a case-insensitive substring test. The number of hits
        is printed before returning.
        """
        needle = kw.lower()
        result = [idx for idx, title in enumerate(self.titles)
                  if needle in title.lower()]
        print(f"found {len(result)} papers")
        return result

    def download_paper(self, idx, save_to):
        """Download paper number ``idx`` into directory ``save_to``.

        The file is named after the paper's title.
        """
        # Entries come from readlines() and still carry their newline.
        url = self.urls[idx].strip('\n')
        download_file(url=url, dir=save_to, filename=self.titles[idx])

    def download_keyword(self, kw):
        """Download every paper whose title contains ``kw``.

        Files go to ``./MICCAI<year>/MICCAI<year>-<kw>/``. If nothing
        matches, a message is printed and nothing is created. A failure on
        one paper is reported and the remaining papers are still attempted.
        """
        paper_idx_list = self.search_keyword(kw)
        if not paper_idx_list:
            print(f"{kw} paper does not found")
            return

        download_dir = f"./MICCAI{self.year}/MICCAI{self.year}-{kw}/"
        print (f"Downloading in {download_dir}...")
        os.makedirs(download_dir, exist_ok=True)
        bar = tqdm(paper_idx_list)
        for paper_idx in bar:
            bar.set_description(
                f"Downloading \"{self.titles[paper_idx][:15]}...\"")
            try:
                self.download_paper(paper_idx, download_dir)
            except Exception as err:
                # Best effort, but say what failed instead of claiming that
                # no paper was found; KeyboardInterrupt is no longer eaten.
                print(f"failed to download \"{self.titles[paper_idx]}\": {err}")
if __name__ == '__main__':
    # The search keyword is taken from the first command-line argument when
    # one is given; without arguments the previous default is used.
    kw = sys.argv[1] if len(sys.argv) > 1 else "prognosis"
    # CVF_URL points at the 2021 site, so the year only affects folder and
    # file names and the link pattern applied to the index page.
    year = 2021
    helper = PaperHelper(year)
    print(f"Searching for \"{kw}\"...")
    helper.download_keyword(kw)