forked from rishabgit/genomic-info-from-papers
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtextpresso.py
120 lines (102 loc) · 5.03 KB
/
textpresso.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import calendar
import json
import logging
from datetime import datetime
from typing import List

import nltk
import requests
from wbtools.literature.corpus import CorpusManager
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def textpresso_paper_text(wbpid, token):
    """Fetch a paper from the Textpresso Central API as a list of sentences.

    Args:
        wbpid: Paper accession, e.g. ``WBPaper00056731``.
        token: Textpresso API token, sent in the JSON request body.

    Returns:
        The sentences of the abstract followed by the sentences of the
        fulltext. An empty list is returned when the response status is not
        200, when the body is not valid JSON, or when the search returns no
        document.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    url = 'https://textpressocentral.org:18080/v1/textpresso/api/search_documents'
    headers = {'Accept': 'application/json',
               'Content-Type': 'application/json'}
    body = json.dumps({'token': token,
                       'query': {'accession': wbpid,
                                 'type': 'document',
                                 'corpora': ['C. elegans']},
                       'include_fulltext': True})
    # NOTE(review): verify=False disables TLS certificate verification, so the
    # token is sent to an unauthenticated endpoint. Kept as-is because turning
    # verification on may break the call if the server certificate does not
    # validate - confirm and enable verification if possible.
    response = requests.post(url, data=body, headers=headers, verify=False)
    if response.status_code != 200:
        return []

    # Parse the body once; a non-JSON body is treated like any other failed
    # lookup instead of propagating a decoding error.
    try:
        documents = response.json()
    except ValueError:
        logger.warning("Textpresso returned a non-JSON body for %s", wbpid)
        return []

    # Covers both a JSON ``null`` and an empty result list.
    if not documents:
        return []

    paper = documents[0]
    # A missing or null field is tokenized as empty text (no sentences).
    abstract = tokenizer.tokenize(paper.get('abstract') or '')
    fulltext = tokenizer.tokenize(paper.get('fulltext') or '')
    return abstract + fulltext
def wbtools_paper_text(settings, wbpid):
    """Load a single paper through wbtools and return its text as sentences.

    ``settings['wb_database']`` supplies the database and file-server
    connection values. ``wbpid`` is an id such as ``WBPaper00056731``; its
    first seven characters are dropped to obtain the id passed to wbtools.
    """
    wb_database = settings['wb_database']
    connection = {
        key: wb_database[key]
        for key in ('db_name', 'db_user', 'db_password', 'db_host',
                    'file_server_host', 'file_server_user',
                    'file_server_passwd')
    }

    corpus = CorpusManager()
    paper_id = wbpid[7:]
    corpus.load_from_wb_database(
        paper_ids=[paper_id],
        load_bib_info=False,
        load_afp_info=False,
        load_curation_info=False,
        **connection
    )

    # No sections are removed here: sectioning might not always be correct,
    # and text processing is done separately in the pipeline.
    paper = corpus.get_paper(paper_id)
    return paper.get_text_docs(remove_sections=[], split_sentences=True)
def wbtools_get_papers_last_month(settings, day=None, max_num_papers: int = None):
    """Return the ids of papers loaded from the last day of the previous month.

    Args:
        settings: Mapping whose ``'wb_database'`` entry holds the database
            and file-server connection values.
        day: Reference date; defaults to ``datetime.now()``. The query date
            is the last day of the month preceding this date.
        max_num_papers: Passed through to ``load_from_wb_database``.

    Returns:
        A list with the ``paper_id`` of every paper in the loaded corpus.
    """
    if day is None:
        day = datetime.now()
    logger.info("getting papers from %s", day.strftime("%Y-%m-%d"))

    # Step back one month, wrapping January to December of the prior year.
    if day.month == 1:
        previous_month = 12
        year = day.year - 1
    else:
        previous_month = day.month - 1
        year = day.year

    # monthrange returns (weekday of the first day, number of days in the
    # month); only the day count is needed to build the last day.
    _, last_day = calendar.monthrange(year, previous_month)
    query_date = datetime(year, previous_month, last_day)

    wb_database = settings['wb_database']
    db_name = wb_database['db_name']
    db_user = wb_database['db_user']
    db_password = wb_database['db_password']
    db_host = wb_database['db_host']
    file_server_host = wb_database['file_server_host']
    file_server_user = wb_database['file_server_user']
    file_server_passwd = wb_database['file_server_passwd']

    cm = CorpusManager()
    cm.load_from_wb_database(
        db_name=db_name, db_user=db_user, db_password=db_password,
        db_host=db_host, from_date=query_date.strftime("%Y-%m-%d"),
        file_server_host=file_server_host, file_server_user=file_server_user,
        file_server_passwd=file_server_passwd,
        max_num_papers=max_num_papers)
    return [paper.paper_id for paper in cm.get_all_papers()]
def wbtools_get_papers(settings, paper_ids: List[str]):
    """Load the given papers through wbtools and return the corpus manager.

    ``settings['wb_database']`` supplies the database and file-server
    connection values; ``paper_ids`` is passed to wbtools unchanged.
    """
    wb_database = settings['wb_database']
    connection = {
        key: wb_database[key]
        for key in ('db_name', 'db_user', 'db_password', 'db_host',
                    'file_server_host', 'file_server_user',
                    'file_server_passwd')
    }

    corpus = CorpusManager()
    corpus.load_from_wb_database(
        paper_ids=paper_ids,
        load_bib_info=False,
        load_afp_info=False,
        load_curation_info=False,
        **connection
    )
    return corpus
if __name__ == "__main__":
    # Manual check: fetch one paper from Textpresso and print its sentences.
    from settings import setSettings

    settings = setSettings()
    textpresso_token = settings['db_config']['textpresso']['token']
    sentences = textpresso_paper_text('WBPaper00002627', textpresso_token)
    print(sentences)

    # Other manual checks, left disabled:
    # print(wbtools_get_papers_last_month(settings['db_config']))
    # feb_day = datetime.strptime('2022-02-15', '%Y-%m-%d')
    # print(wbtools_get_papers_last_month(settings['db_config'], day=feb_day))