From 644750805e61444a7ab444ac841679085393a642 Mon Sep 17 00:00:00 2001 From: americast Date: Sun, 9 May 2021 03:38:08 -0400 Subject: [PATCH 1/5] add resource extractor --- extract_resource_covid.py | 35 +++++++++++++++++++++++++++++++++++ location.py | 10 ++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 extract_resource_covid.py diff --git a/extract_resource_covid.py b/extract_resource_covid.py new file mode 100644 index 0000000..4491d08 --- /dev/null +++ b/extract_resource_covid.py @@ -0,0 +1,35 @@ +text_org = input("Enter text, or press return for an example: ") +import location +import pudb +if text_org == "": + text_org = "Oxygen producing unit at Princess Esra Hospital (Owaisi Group of Hospitals). #Oxygen #IndiaNeedsOxygen #IndiaFightsCOVID19 @aimim_national @imShaukatAli @asadowaisi @imAkbarOwaisi @warispathan @syedasimwaqar @Syed_Ruknuddin5 @ShahnawazAIMIM_ @Akhtaruliman5 https://t.co/vdZamB1wJl" +text = text_org.lower() + +places = location.return_location_list(text) +each_loc = [place[0] for place in places] +resources = { + "oxygen": "Oxygen", + "o2": "Oxygen", + "ventilator": "Ventilator", + "bed": "Beds", + "icu": "Beds", + "remdes": "Remdesivir", + "plasma": "Plasma", + "consultation": "Doctor" +} + +places_to_remove = [] +resource_text = "" +for resource in resources: + if resource in each_loc: + places_to_remove.append(each_loc.index(resource)) + if resource in text: + resource_text = resource_text+resources[resource]+" " + +places_to_remove.sort(reverse=True) +for ptr in places_to_remove: + del places[ptr] + +print("\n\n\nText: "+str(text_org)) +print("\nLocation: "+str(places)) +print("\nResources: "+resource_text) \ No newline at end of file diff --git a/location.py b/location.py index c3cd9e9..9f13edf 100644 --- a/location.py +++ b/location.py @@ -25,9 +25,10 @@ import random import wordsegment import jellyfish -from para_sentence import split_into_sentences +# from para_sentence import split_into_sentences import networkx as nx import geocoder +import pudb ps_stemmer=porter.PorterStemmer() @@ -46,7 +47,7 @@ stop_words_2=['i','me','we','us','you','u','she','her','his','he','him','it','they','them','who','which','whom','whose','that','this','these','those','anyone','someone','some','all','most','himself','herself','myself','itself','hers','ours','yours','theirs','to','in','at','for','from','etc',' ',','] stop_words.extend(stop_words_2) -stop_words.extend(['with', 'at', 'from', 'into', 'during', 'including', 'until', 'against', 'among', 'throughout', 'despite', 'towards', 'upon', 'concerning', 'of', 'to', 'in', 'for', 'on', 'by', 'about', 'like', 'through', 'over', 'before', 'between', 'after', 'since', 'without', 'under', 'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus', 'except', 'but', 'up', 'out', 'around', 'down', 'off', 'above', 'near', 'and', 'or', 'but', 'nor', 'so', 'for', 'yet', 'after', 'although', 'as', 'as', 'if', 'long', 'because', 'before', 'even', 'if', 'even though', 'once', 'since', 'so', 'that', 'though', 'till', 'unless', 'until', 'what', 'when', 'whenever', 'wherever', 'whether', 'while', 'the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'yours', 'his', 'her', 'its', 'ours', 'their', 'few', 'many', 'little', 'much', 'many', 'lot', 'most', 'some', 'any', 'enough', 'all', 'both', 'half', 'either', 'neither', 'each', 'every', 'other', 'another', 'such', 'what', 'rather', 'quite']) +stop_words.extend(['with', 'at', 'from', 'into', 'during', 'including', 'until', 'against', 'among', 'throughout', 
'despite', 'towards', 'upon', 'concerning', 'of', 'to', 'in', 'for', 'on', 'by', 'about', 'like', 'through', 'over', 'before', 'between', 'after', 'since', 'without', 'under', 'within', 'along', 'following', 'across', 'behind', 'beyond', 'plus', 'except', 'but', 'up', 'out', 'around', 'down', 'off', 'above', 'near', 'and', 'or', 'but', 'nor', 'so', 'for', 'yet', 'after', 'although', 'as', 'as', 'if', 'long', 'because', 'before', 'even', 'if', 'even though', 'once', 'since', 'so', 'that', 'though', 'till', 'unless', 'until', 'what', 'when', 'whenever', 'wherever', 'whether', 'while', 'the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'yours', 'his', 'her', 'its', 'ours', 'their', 'few', 'many', 'little', 'much', 'many', 'lot', 'most', 'some', 'any', 'enough', 'all', 'both', 'half', 'either', 'neither', 'each', 'every', 'other', 'another', 'such', 'what', 'rather', 'quite', 'oxygen', 'ventilator', 'bed', 'remdesivir', 'consultation', 'plasma', 'vir', 'se']) stop_words=list(set(stop_words)) stopword_file=open("DATA/Process_resources/stopword.txt",'r') stop_words.extend([line.rstrip() for line in stopword_file]) @@ -458,7 +459,7 @@ def NP_chunk(doc,text): return dep_places -with open('DATA/NP/NP_loc.p','rb') as handle: +with open('DATA/NP/IN_loc.p','rb') as handle: curr_loc_dict=pickle.load(handle) # false_names=false_names-set([i for i in curr_loc_dict]) @@ -476,6 +477,7 @@ def NP_chunk(doc,text): starttime=time.time() def return_location_list(text): + # pu.db lat_long=[] try: # print('\n') @@ -544,7 +546,7 @@ def return_location_list(text): if i =='' or i in false_names or ps_stemmer.stem(i) in false_names: continue if i.endswith('hospital') and len(i.split())>=3: - g=geocoder.osm(i+', Nepal') + g=geocoder.osm(i+', India') # print(g) if g.json!=None: lat_long.append((i,(g.json['lat'],g.json['lng']))) From 9322e2c33acd3315d12d6920204be5c2427db673 Mon Sep 17 00:00:00 2001 From: americast Date: Sun, 9 May 2021 03:48:08 -0400 Subject: [PATCH 2/5] add resource type --- extract_resource_covid.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/extract_resource_covid.py b/extract_resource_covid.py index 4491d08..a9b8e19 100644 --- a/extract_resource_covid.py +++ b/extract_resource_covid.py @@ -15,7 +15,8 @@ "icu": "Beds", "remdes": "Remdesivir", "plasma": "Plasma", - "consultation": "Doctor" + "consultation": "Doctor", + "ambulance": "Ambulance" } places_to_remove = [] @@ -32,4 +33,11 @@ print("\n\n\nText: "+str(text_org)) print("\nLocation: "+str(places)) -print("\nResources: "+resource_text) \ No newline at end of file +print("\nResources: "+resource_text) + +if "need" in text or "require" in text: + print("\nType: Need") +elif "availab" in text or len(resource_text) != 0: + print("\nType: Availability") +else: + print("\nType: Other") From e9aae29f9b4770986c1b44b2d80564ee15ebcea2 Mon Sep 17 00:00:00 2001 From: Ritika Agarwal Date: Sun, 9 May 2021 23:34:33 +0530 Subject: [PATCH 3/5] added new logic/heuristics for classification and extraction --- app_covid.py | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 app_covid.py diff --git a/app_covid.py b/app_covid.py new file mode 100644 index 0000000..3b34e39 --- /dev/null +++ b/app_covid.py @@ -0,0 +1,113 @@ +import os +import flask +from flask import Flask +app = Flask(__name__) + + +import re +import json +from urllib.parse import unquote +import location +import pudb + +## CORS +from flask_cors import CORS, cross_origin +cors = CORS(app) 
+app.config['CORS_HEADERS'] = 'Content-Type' + +tel_no="([+]?[0]?[1-9][0-9\s]*[-]?[0-9\s]+)" +email="([a-zA-Z0-9]?[a-zA-Z0-9_.]+[@][a-zA-Z]+[.](com|net|edu|in|org|en))" +http_url='http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\)]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' + +def get_contact(text): + contacts=[] + numbers=re.findall(tel_no,text) + temp=set() + for i in numbers: + if len(i.replace(' ',''))>=7: + temp.add(i) + contacts.append(temp) + temp=set() + mails= re.findall(email,text) + for i in mails: + temp.add(i) + contacts.append(temp) + temp=set() + urls= re.findall(http_url,text) + for i in urls: + temp.add(i) + contacts.append(temp) + return contacts + +def get_classification(text, resource_list): + if "need" in text or "require" in text: + label = 0 + elif "availab" in text or len(resource_list) != 0: + label = 1 + else: + label = 2 + return (label) + +resources = { + "oxygen": "Oxygen", + "o2": "Oxygen", + "ventilator": "Ventilator", + "bed": "Beds", + "icu": "Beds", + "remdes": "Remdesivir", + "plasma": "Plasma", + "consultation": "Doctor", + "ambulance": "Ambulance" +} +def get_location_covid(text): + text = text.lower() + places = location.return_location_list(text) + each_loc = [place[0] for place in places] + places_to_remove = [] + resource_text = "" + for resource in resources: + if resource in each_loc: + places_to_remove.append(each_loc.index(resource)) + if resource in text: + resource_text = resource_text+resources[resource]+" " + places_to_remove.sort(reverse=True) + for ptr in places_to_remove: + del places[ptr] + return resource_text, places + +@app.route('/parse', methods=['GET', 'POST', 'OPTIONS']) +@cross_origin() +def parseResources(): + resource, line = {}, '' + print(flask.request.json) + print(unquote(flask.request.query_string.decode('utf-8'))) + if flask.request and flask.request.json and'text' in flask.request.json: + line = flask.request.json['text'] + else: + line = json.loads(unquote(flask.request.query_string.decode('utf-8')))['text'] + print('Received for parsing: ', line) + contacts = get_contact(line) + resource_text, locations = get_location_covid(line) + print(resource_text,locations) + resource['Contact'] = {'Phone number': list(contacts[0]), "Email": list(contacts[1])} + resource['Sources'] = {} + resource['ResourceWords'] = resource_text.strip(" ").split(" ") + resource['Locations'], resource['Resources'] = dict(), {} + resource['Resources'] = {"resources": resource['ResourceWords']} + for each in locations: + resource['Locations'][each[0]] = {"long": float(each[1][0][1]), "lat": float(each[1][0][0])} + resource['Classification'] = int(get_classification(line, resource['ResourceWords'])) + print('Returning', resource) + return flask.jsonify(resource) + +# add routes for nodejs backend via here as well +@app.route('/', methods=['GET', 'OPTIONS']) +@cross_origin() +def base(): + with open('index.html', 'r') as f: + txt = f.readlines() + return ''.join(txt) + +if __name__ == '__main__': + port = int(os.environ.get('PORT', 5000)) + app.run(host='0.0.0.0', port=port, debug=True) From e8086306598beb14931a0fcb796b2a8d39b60e71 Mon Sep 17 00:00:00 2001 From: rakaar Date: Wed, 12 May 2021 18:39:35 +0000 Subject: [PATCH 4/5] Fixes #12 --- classify_tweets_covid_infer.py | 2 +- requirements.txt | 69 ++++++++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/classify_tweets_covid_infer.py b/classify_tweets_covid_infer.py index 2b8dd54..9fb6cb2 100644 --- a/classify_tweets_covid_infer.py +++ b/classify_tweets_covid_infer.py @@ 
-198,7 +198,7 @@ def __init__(self, module): def forward(self, x): return self.module(x) -config = {'hidden_dropout_prob':0.3, 'num_labels':3,'model_name':'bert-base-uncased', 'hidden_size':768, 'data_dir':'saved_models/',} +config = {'hidden_dropout_prob':0.3, 'num_labels':3,'model_name':'bert-base-uncased', 'hidden_size':768, 'data_dir':'saved_models',} config = SimpleNamespace(**config) # model = BertModel.from_pretrained('bert-base-uncased') diff --git a/requirements.txt b/requirements.txt index 02bb73f..d446526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,64 @@ +blis==0.7.4 +catalogue==1.0.0 +certifi==2020.12.5 +chardet==4.0.0 +click==8.0.0 +cycler==0.10.0 +cymem==2.0.5 +decorator==4.4.2 +emoji==0.5.1 +en-core-web-sm==2.3.1 +filelock==3.0.12 Flask==1.0.2 -stop_words==2018.7.23 -wordsegment==1.3.1 +Flask-Cors==3.0.7 +future==0.18.2 +geocoder==1.38.1 +gunicorn==19.9.0 +idna==2.10 +itsdangerous==2.0.0 +jellyfish==0.7.1 +Jinja2==3.0.0 +joblib==1.0.1 +kiwisolver==1.3.1 +MarkupSafe==2.0.0 +matplotlib==3.4.2 +murmurhash==1.0.5 networkx==2.5.1 nltk==3.4 -jellyfish==0.7.1 -geocoder==1.38.1 +numpy==1.20.3 +packaging==20.9 +pandas==1.2.4 +Pillow==8.2.0 +plac==1.1.3 +preshed==3.0.5 +pudb==2019.1 +Pygments==2.9.0 +pyparsing==2.4.7 +python-dateutil==2.8.1 +pytz==2021.1 +ratelim==0.1.6 +regex==2021.4.4 +requests==2.25.1 +sacremoses==0.0.45 +scikit-learn==0.24.2 +scipy==1.6.3 +singledispatch==3.6.1 +six==1.16.0 +sklearn==0.0 spacy==2.3.5 -emoji==0.5.1 -Flask-Cors==3.0.7 -word2number==1.1 -gunicorn==19.9.0 -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz +srsly==1.0.5 +stop-words==2018.7.23 +thinc==7.4.5 +threadpoolctl==2.1.0 +tokenizers==0.10.2 torch==1.8.0 torchvision==0.9.0 -pudb==2019.1 -transformers==4.5.1 \ No newline at end of file +tqdm==4.60.0 +transformers==4.5.1 +typing-extensions==3.10.0.0 +urllib3==1.26.4 +urwid==2.1.2 +wasabi==0.8.2 +Werkzeug==2.0.0 +word2number==1.1 +wordsegment==1.3.1 From 81d479280c707c697c6723bd0d1fee455ed4fffc Mon Sep 17 00:00:00 2001 From: rakaar Date: Wed, 12 May 2021 18:48:55 +0000 Subject: [PATCH 5/5] Added wsgi file --- wsgi.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 wsgi.py diff --git a/wsgi.py b/wsgi.py new file mode 100644 index 0000000..6026b0f --- /dev/null +++ b/wsgi.py @@ -0,0 +1,4 @@ +from app import app + +if __name__ == "__main__": + app.run()
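
A note on the approach across patches 1-3: resource extraction and need/availability classification are pure keyword heuristics, with no model in the loop. Below is a minimal self-contained sketch of that logic; the resource table is copied from the series, while the geocoding through location.py is left out so the sketch runs without the repo's DATA/ files, and the sample tweet is hypothetical:

    # Keyword heuristic from patches 1-3: loose substring matching against a
    # fixed resource table, then a need/availability rule on the raw text.
    resources = {
        "oxygen": "Oxygen", "o2": "Oxygen", "ventilator": "Ventilator",
        "bed": "Beds", "icu": "Beds", "remdes": "Remdesivir",
        "plasma": "Plasma", "consultation": "Doctor", "ambulance": "Ambulance",
    }

    def extract_resources(text):
        text = text.lower()
        # "remdes" also matches "remdesivir", and "bed" and "icu" both map to
        # "Beds", so duplicate labels are possible -- exactly as in the scripts,
        # which concatenate matches into a space-separated resource_text.
        return [label for keyword, label in resources.items() if keyword in text]

    def classify(text, resource_list):
        text = text.lower()
        if "need" in text or "require" in text:
            return 0  # Need
        if "availab" in text or resource_list:
            return 1  # Availability
        return 2  # Other

    tweet = "ICU bed and remdesivir available at ABC Hospital"  # hypothetical input
    found = extract_resources(tweet)
    print(found)                   # ['Beds', 'Beds', 'Remdesivir']
    print(classify(tweet, found))  # 1 (Availability)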
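
The contact extraction in app_covid.py (patch 3) is worth exercising in isolation, because the email pattern contains two capture groups: re.findall therefore returns (address, tld) tuples rather than bare strings, and those tuples are what get_contact puts into the set that becomes the "Email" field of the /parse response. A quick check with the patterns from the patch (written here as raw strings to avoid the invalid-escape warning; the regexes themselves are identical, and the sample text is hypothetical):

    import re

    # Patterns from app_covid.py, as raw strings.
    tel_no = r"([+]?[0]?[1-9][0-9\s]*[-]?[0-9\s]+)"
    email = r"([a-zA-Z0-9]?[a-zA-Z0-9_.]+[@][a-zA-Z]+[.](com|net|edu|in|org|en))"

    text = "Call +91 98765 43210 or write to help.desk@example.org"

    # The >= 7 length filter (ignoring spaces) mirrors get_contact.
    print([n for n in re.findall(tel_no, text) if len(n.replace(" ", "")) >= 7])
    # ['+91 98765 43210 ']  -- note the trailing whitespace the pattern captures

    print(re.findall(email, text))
    # [('help.desk@example.org', 'org')]  -- tuples, because of the two groups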
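
To exercise the new /parse route end to end, run app_covid.py (it listens on 0.0.0.0:5000 unless PORT is set) and post the tweet text as JSON. A sketch using requests, which is already pinned in requirements.txt; the URL and sample tweet are assumptions, and the actual values returned depend on the geocoder and the DATA/ pickles:

    import requests  # assumes the server from app_covid.py is running locally

    resp = requests.post(
        "http://localhost:5000/parse",
        json={"text": "Oxygen and ICU beds available at XYZ Hospital"},  # hypothetical
    )
    result = resp.json()
    # Response keys, per parseResources: Contact, Sources, ResourceWords,
    # Locations, Resources, Classification.
    # result["Resources"]      -> {"resources": ["Oxygen", "Beds", ...]}
    # result["Classification"] -> 0 = Need, 1 = Availability, 2 = Other
    print(result["Classification"], result["Resources"])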
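
One thing to double-check before deploying patch 5: wsgi.py does "from app import app", but the Flask application added in this series lives in app_covid.py. Unless the repository already contains a separate app.py, gunicorn (pinned in requirements.txt) will fail at import time. A minimal sketch of the corrected entry point, assuming app_covid.py is the intended module:

    # wsgi.py -- entry point for gunicorn; imports the Flask app defined in
    # app_covid.py (assumed; adjust if a separate app.py module exists).
    from app_covid import app

    if __name__ == "__main__":
        app.run()

which would then be served with, e.g., gunicorn wsgi:app.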