-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_nature_spaces_between_ref_numbers.py
123 lines (114 loc) · 4.07 KB
/
scrape_nature_spaces_between_ref_numbers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import argparse
import re

# Gene and histone-mark names re-spelled so a speech engine reads them
# letter by letter / number by number.  Every digit in a replacement is
# preceded by a space, which is what protects it from the citation-number
# stripping that runs afterwards.  Order matters: longer names (H3K9ac) must
# come before their prefixes (H3K9, H3).
_PRONUNCIATION_RULES = [
    (r'Pax9', 'Pax 9'),
    (r'Pax3', 'Pax 3'),
    (r'Suv39h', 'Suv 39 h'),
    (r'P53', 'P 53'),
    (r'Sox2', 'Sox 2'),
    (r'Oct4', 'Oct 4'),
    (r'Klf4', 'Klf 4'),
    (r'P66a', 'P 66 a'),
    (r'Thy1', 'Thy 1'),
    (r'Gatad2a', 'Gatad 2 a'),
    (r'Mbd3', 'Mbd 3'),
    (r'SSEA1', 'S SEA 1'),
    (r'Chd4', 'Chd 4'),
    (r'HP1', 'HP 1 '),
    (r'H3K9ac', 'H 3 Kay 9 AC'),
    (r'H3K27ac', 'H 3 Kay 2 7 AC'),
    (r'H3K9', 'H 3 Kay 9 '),
    (r'H4K20', 'H 4 Kay 20 '),
    (r'H3K4', 'H 3 Kay 4 '),
    (r'H3K27', 'H 3 Kay 2 7 '),
    (r'H4K16', 'H 4 Kay 16 '),
    (r'H2A', 'H 2 A '),
    (r'H2B', 'H 2 B '),
    (r'me3', 'ME 3'),
    (r'me2', 'ME 2'),
    (r'me1', 'ME 1'),
    (r'H1', 'H 1'),
    (r'H3', 'H 3'),
    (r'ESCs', 'E S sees'),
    (r'et al\.', 'et al'),
    (r'3D', 'three D'),
]

# Digits (optionally a ", "-separated list of them) glued directly to a
# preceding non-space, non-digit character, e.g. "word12, 13".
_CITATION_NUMBERS = re.compile(r'([^ 0-9])([0-9]+(?:, [0-9]+)*)')

# Symbols spelled out (or dropped) for speech; applied after the citation
# numbers have been stripped.
_SYMBOL_RULES = [
    # Mac-Roman mojibake of thin-space+degree, minus, en dash and the two
    # curly single quotes.  Kept so text that arrives already garbled is
    # still cleaned the way the original rules cleaned it.
    (u'\u201a\xc4\xe2\xac\u221e', ' degrees '),
    (u'\u201a\xe0\xed', ' minus '),
    (u'\u201a\xc4\xec', '-'),
    (u'\u201a\xc4\xf2', ''),
    (u'\u201a\xc4\xf4', ''),
    (u'\u2018', ''),
    # The same symbols as they appear in correctly decoded text.
    (u'\u2009?\xb0', ' degrees '),
    (u'\u2212', ' minus '),
    (u'\u2013', '-'),
    (u'\u2019', ''),
    # Greek letters and maths symbols.
    (u'\u0394', 'delta'),
    (u'\u03b2', 'beta'),
    (u'\xc5', 'angstrom'),   # was mislabelled 'delta'
    (u'\u03b1', 'alpha'),    # was mislabelled 'angstrom'
    (u'\u03b6', 'zeta'),
    (u'\u03b5', 'epsilon'),
    (u'\u03a9', 'omega'),
    (u'\u03bb', 'lambda'),
    (u'\u2208', 'epsilon'),
    (u'\u2211', 'sum of'),
]


def remove_bracketted_content(test_str):
    """Return ``test_str`` without anything enclosed in round brackets.

    Nested brackets are handled by tracking the nesting depth.  A ``)`` with
    no matching ``(`` is kept as ordinary text; an unclosed ``(`` swallows
    everything after it.
    """
    kept = []
    depth = 0
    for char in test_str:
        if char == '(':
            depth += 1
        elif char == ')' and depth > 0:
            depth -= 1
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)


def clean_text_for_speech(text):
    """Turn scraped article text into something a speech engine can read.

    Steps, in order: drop bracketed asides, re-spell gene/histone names,
    strip citation numbers glued to words, spell out symbols, and cut the
    text at the first line starting with "References" (if there is one).

    ``text`` is expected to be decoded (unicode) text.
    """
    text = remove_bracketted_content(text)
    for pattern, spoken in _PRONUNCIATION_RULES:
        text = re.sub(pattern, spoken, text)
    # Must operate on the re-spelled text, not on the raw input, otherwise
    # all of the substitutions above are thrown away.
    text = _CITATION_NUMBERS.sub(r'\1', text)
    for pattern, spoken in _SYMBOL_RULES:
        text = re.sub(pattern, spoken, text)
    references_at = text.find('\nReferences')
    # find() returns -1 when absent; slicing with that would silently drop
    # the final character.
    if references_at != -1:
        text = text[:references_at]
    return text


def main():
    """Download an article, write its cleaned text to a file, optionally speak it."""
    # Third-party imports stay local so the helpers above can be imported
    # without them installed.
    import requests
    import bs4

    parser = argparse.ArgumentParser()
    parser.add_argument("--index_url", help="e.g. https://www.nature.com/nature/journal/v551/n7678/full/nature24471.html", type=str, default="https://www.nature.com/nature/journal/v551/n7678/full/nature24471.html")
    parser.add_argument("--play",action='store_true')
    parser.add_argument('--output_file',type=str,help="if not specified- will default to title of the paper")
    parser.add_argument('--speech_rate',type=int,help="default is 220 wpm",default=220)
    parser.add_argument('--voice',type=str,help='Choose any from: Alex\nalice\nalva\namelie\nanna\ncarmit\ndamayanti\ndaniel\ndiego\nellen\nfiona\nFred\nioana\njoana\njorge\njuan\nkanya\nkaren\nkyoko\nlaura\nlekha\nluca\nluciana\nmaged\nmariska\nmei-jia\nmelina\nmilena\nmoira.premium\nmonica\nnora\npaulina\nsamantha\nsara\nsatu\nsin-ji\ntessa\nthomas\nting-ting\nveena\nVictoria\nxander\nyelda\nyuna\nyuri\nzosia\nzuzana\n',default='Alex')
    args = parser.parse_args()
    print('voice '+args.voice)

    response = requests.get(args.index_url)
    # Fail on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    heading = soup.find('h1', attrs={'class': 'article-heading'})
    if heading is None:
        sys.exit('no <h1 class="article-heading"> found at ' + args.index_url)
    header = heading.text
    print('header ' + header)

    # Keep everything as decoded text and encode once, at the file boundary.
    body = ''.join(section.text for section in soup.find_all('section'))
    x = clean_text_for_speech(body)

    output_path = args.output_file or header + '.txt'
    with open(output_path, 'wb') as out:
        out.write(x.encode('utf8'))

    if args.play:
        import pyttsx3
        engine = pyttsx3.init()
        engine.setProperty('rate', args.speech_rate)
        # First installed voice whose id contains the requested name.
        chosen = next(
            (v for v in engine.getProperty('voices') if args.voice in str(v.id)),
            None,
        )
        if chosen is None:
            # Previously the last installed voice was used silently here.
            sys.stderr.write(
                'voice ' + args.voice + ' not found; using the default voice\n')
        else:
            engine.setProperty('voice', chosen.id)
        engine.say(x)
        engine.runAndWait()
        engine.stop()


if __name__ == "__main__":
    main()