Commit
Add SentenceClassificationAPI for sentence classification tasks
Added a new API (sentence_classification_api.py) that handles sentence classification tasks: it loads the sentence classifiers and the sent2vec model and processes POST requests for classifying sentences. Refactored curator_dashboard.py to make a POST request to this new API for sentence classification. This gives a better separation of concerns, as sentence classification has been abstracted out of the curator dashboard. This commit also includes a Dockerfile for the new API and a change to the existing API Dockerfile adding an environment variable that points to the new service.
valearna committed Nov 22, 2023
1 parent 7ef1f6b commit 69750cf
Showing 5 changed files with 134 additions and 66 deletions.
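
The new request/response contract, sketched from the diff below (the host and port come from the SENTENCE_CLASSIFICATION_API value in the API Dockerfile; the example sentences and printed labels are illustrative assumptions):

import requests

# Minimal sketch of the new contract: POST a JSON list of sentences and get
# back, for each topic ("expression", "kinase") and classifier ("all_info",
# "curatable", "language"), one predicted label per input sentence.
api_base = "http://textpressocentral.org:8002"  # SENTENCE_CLASSIFICATION_API
sentences = ["GFP expression was observed in the pharynx.",
             "Worms were grown at 20 degrees."]
res = requests.post(f"{api_base}/api/sentence_classification/classify_sentences",
                    json={"sentences": sentences})
classes = res.json()["classes"]
print(classes["expression"]["curatable"])  # e.g. [1, 0], aligned with sentences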
3 changes: 2 additions & 1 deletion src/backend/api/Dockerfile
@@ -20,8 +20,9 @@ ENV TAZENDRA_PASSWORD=""
ENV DEV_MODE=""
ENV LOG_FILE=/var/log/afp_db_api.log
ENV INFO_FILE=/var/log/afp_db_api.info
ENV SENTENCE_CLASSIFICATION_API=http://textpressocentral.org:8002

ENV PYTHONPATH=$PYTHONPATH:/usr/src/app/

EXPOSE ${PORT}
CMD python3 src/backend/api/afp_api.py -N ${DB_NAME} -U ${DB_USER} -P "${DB_PASSWD}" -H ${DB_HOST} -p ${PORT} -a ${ADMINS} -e ${EMAIL_PASSWD} -u ${AFP_BASE_URL} -w ${TAZENDRA_USER} -z ${TAZENDRA_PASSWORD} ${DEV_MODE} -l ${LOG_FILE} -L DEBUG >> ${INFO_FILE} 2>&1
CMD python3 src/backend/api/afp_api.py -N ${DB_NAME} -U ${DB_USER} -P "${DB_PASSWD}" -H ${DB_HOST} -p ${PORT} -a ${ADMINS} -e ${EMAIL_PASSWD} -u ${AFP_BASE_URL} -w ${TAZENDRA_USER} -z ${TAZENDRA_PASSWORD} ${DEV_MODE} -l ${LOG_FILE} -L DEBUG -c ${SENTENCE_CLASSIFICATION_API} >> ${INFO_FILE} 2>&1
12 changes: 6 additions & 6 deletions src/backend/api/afp_api.py
@@ -3,7 +3,7 @@
import argparse
import logging
import falcon
import joblib
import os
from wsgiref import simple_server
from falcon import HTTPStatus
from wbtools.db.dbmanager import WBDBManager
@@ -46,9 +46,11 @@ def main():
parser.add_argument("-u", "--afp-base-url", metavar="afp_base_url", dest="afp_base_url", type=str)
parser.add_argument("-w", "--tazendra-username", metavar="tazendra_user", dest="tazendra_user", type=str)
parser.add_argument("-z", "--tazendra-password", metavar="tazendra_password", dest="tazendra_password", type=str)
parser.add_argument("-c", "--sentence-cassification-api", metavar="sentence_classification_api",
dest="sentence_classification_api", type=str)
parser.add_argument("-d", "--dev-mode", dest="dev_mode", action="store_true")
args = parser.parse_args()

os.environ["SENTENCE_CLASSIFICATION_API"] = args.sentence_classification_api
logging.basicConfig(filename=args.log_file, level=args.log_level,
format='%(asctime)s - %(name)s - %(levelname)s:%(message)s')

@@ -65,8 +67,7 @@ def main():
curator_dashboard_reader = CuratorDashboardReader(db_manager=db_manager,
afp_base_url=args.afp_base_url,
tazendra_username=args.tazendra_user,
tazendra_password=args.tazendra_password,
sentence_classifiers_path=sentence_classifiers_path)
tazendra_password=args.tazendra_password)
app.add_route('/api/read_admin/{req_type}', curator_dashboard_reader)
author_papers_reader = AuthorPapersPageReader(db_manager=db_manager, afp_base_url=args.afp_base_url,
email_passwd=args.email_passwd)
@@ -99,8 +100,7 @@ def main():
curator_dashboard_reader = CuratorDashboardReader(db_manager=db_manager,
afp_base_url=os.environ['AFP_BASE_URL'],
tazendra_username=os.environ['AFP_TAZENDRA_USER'],
tazendra_password=os.environ['AFP_TAZENDRA_PASSWORD'],
sentence_classifiers_path=sentence_classifier_path)
tazendra_password=os.environ['AFP_TAZENDRA_PASSWORD'])
app.add_route('/api/read_admin/{req_type}', curator_dashboard_reader)
author_papers_reader = AuthorPapersPageReader(db_manager=db_manager,
afp_base_url=os.environ['AFP_BASE_URL'],
66 changes: 7 additions & 59 deletions src/backend/api/endpoints/curator_dashboard.py
@@ -1,7 +1,8 @@
import json
import os
import re

import joblib
import requests
import numpy as np
import sent2vec
import falcon
@@ -21,48 +22,11 @@

class CuratorDashboardReader:

def __init__(self, db_manager: WBDBManager, afp_base_url: str, tazendra_username, tazendra_password,
sentence_classifiers_path):
def __init__(self, db_manager: WBDBManager, afp_base_url: str, tazendra_username, tazendra_password):
self.db = db_manager
self.afp_base_url = afp_base_url
self.tazendra_username = tazendra_username
self.tazendra_password = tazendra_password
self.sentence_classifiers = self.load_sentence_classifiers(sentence_classifiers_path)
self.sent2vec_model = self.load_sent2vec_model(f"{sentence_classifiers_path}/biosentvec.bin")

@staticmethod
def load_sentence_classifiers(models_path):
logger.info("Loading sentence classifiers...")
sentence_classifier_all_info_expression = joblib.load(f"{models_path}/all_info_expression.joblib")
sentence_classifier_curatable_expression = joblib.load(f"{models_path}/curatable_expression.joblib")
sentence_classifier_language_expression = joblib.load(f"{models_path}/language_expression.joblib")
sentence_classifier_all_info_kinase = joblib.load(f"{models_path}/all_info_kinase.joblib")
sentence_classifier_curatable_kinase = joblib.load(f"{models_path}/curatable_kinase.joblib")
sentence_classifier_language_kinase = joblib.load(f"{models_path}/language_kinase.joblib")
logger.info("All sentence classifiers loaded")
return {
"expression": {
"all_info": sentence_classifier_all_info_expression,
"curatable": sentence_classifier_curatable_expression,
"language": sentence_classifier_language_expression
},
"kinase": {
"all_info": sentence_classifier_all_info_kinase,
"curatable": sentence_classifier_curatable_kinase,
"language": sentence_classifier_language_kinase
}
}

@staticmethod
def load_sent2vec_model(sent2vec_model_path):
logger.info("Loading sentence embedding model...")
biosentvec_model = sent2vec.Sent2vecModel()
try:
biosentvec_model.load_model(sent2vec_model_path)
except Exception as e:
logger.error(e)
logger.info("Sentence embedding model loaded")
return biosentvec_model

@staticmethod
def transform_none_to_string(val):
@@ -215,26 +179,10 @@ def get_text_from_pdfs(self, paper_id):
sentences = [sentence for sentence in sentences if len(sentence) > 20 and len(sentence.split(" ")) > 2]
paper.abstract = paper.abstract if paper.abstract else ""
paper.title = paper.title if paper.title else ""
sentence_embeddings = self.sent2vec_model.embed_sentences(sentences)
classes_all_info_expression = self.sentence_classifiers["expression"]["all_info"].predict(sentence_embeddings)
classes_curatable_expression = self.sentence_classifiers["expression"]["curatable"].predict(sentence_embeddings)
classes_language_expression = self.sentence_classifiers["expression"]["language"].predict(sentence_embeddings)
classes_all_info_kinase = self.sentence_classifiers["kinase"]["all_info"].predict(sentence_embeddings)
classes_curatable_kinase = self.sentence_classifiers["kinase"]["curatable"].predict(sentence_embeddings)
classes_language_kinase = self.sentence_classifiers["kinase"]["language"].predict(sentence_embeddings)
classes = {
"expression": {
"all_info": classes_all_info_expression.tolist(),
"curatable": classes_curatable_expression.tolist(),
"language": classes_language_expression.tolist()
},
"kinase": {
"all_info": classes_all_info_kinase.tolist(),
"curatable": classes_curatable_kinase.tolist(),
"language": classes_language_kinase.tolist()
}
}
return fulltext, sentences, json.dumps(classes)
res = requests.post(f"{os.environ['SENTENCE_CLASSIFICATION_API']}/api/sentence_classification/"
f"classify_sentences",
{"sentences": sentences})
return fulltext, sentences, json.dumps(res.json()["classes"])

def on_post(self, req, resp, req_type):
with self.db:
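
The new requests.post call above ships without a timeout or error handling; a more defensive client might look like this sketch (the timeout value, the exception set, and the None fallback are assumptions, not part of this commit):

import os
import requests

def classify_sentences(sentences, timeout=60):
    # Illustrative hardening of the call in get_text_from_pdfs; the timeout
    # and the None fallback are assumptions, not part of this commit.
    url = (f"{os.environ['SENTENCE_CLASSIFICATION_API']}"
           f"/api/sentence_classification/classify_sentences")
    try:
        res = requests.post(url, json={"sentences": sentences}, timeout=timeout)
        res.raise_for_status()
        return res.json()["classes"]
    except (requests.RequestException, KeyError, ValueError):
        return None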
12 changes: 12 additions & 0 deletions src/backend/sentence_classification_api/Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.8-slim

WORKDIR /usr/src/app/
ADD requirements.txt .
RUN pip3 install -r requirements.txt
RUN python3 -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')"
COPY src/backend/sentence_classification_api src/backend/sentence_classification_api

ENV PYTHONPATH=$PYTHONPATH:/usr/src/app/

EXPOSE 8002
CMD python3 src/backend/sentence_classification_api/sentence_classification_api.py >> /var/log/sentence_classification_api.log 2>&1
107 changes: 107 additions & 0 deletions src/backend/sentence_classification_api/sentence_classification_api.py
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
import json
import logging

import joblib
import sent2vec
import falcon
import os

from wsgiref import simple_server
from falcon import HTTPStatus


logger = logging.getLogger(__name__)


class HandleCORS(object):
def process_request(self, req, resp):
allow_headers = req.get_header(
'Access-Control-Request-Headers',
default='*'
)
resp.set_header('Access-Control-Allow-Origin', '*')
resp.set_header('Access-Control-Allow-Methods', '*')
resp.set_header('Access-Control-Allow-Headers', allow_headers)
resp.set_header('Access-Control-Max-Age', 1728000) # 20 days
if req.method == 'OPTIONS':
raise HTTPStatus(falcon.HTTP_200, body='\n')


class SentenceClassificationReader:

def __init__(self):
self.sentence_classifiers = self.load_sentence_classifiers("/var/sentence_classification_models/")
self.sent2vec_model = self.load_sent2vec_model("/var/sentence_classification_models/biosentvec.bin")

@staticmethod
def load_sent2vec_model(sent2vec_model_path):
logger.info("Loading sentence embedding model...")
biosentvec_model = sent2vec.Sent2vecModel()
try:
biosentvec_model.load_model(sent2vec_model_path)
except Exception as e:
logger.error(e)
logger.info("Sentence embedding model loaded")
return biosentvec_model

@staticmethod
def load_sentence_classifiers(models_path):
logger.info("Loading sentence classifiers...")
sentence_classifier_all_info_expression = joblib.load(f"{models_path}/all_info_expression.joblib")
sentence_classifier_curatable_expression = joblib.load(f"{models_path}/curatable_expression.joblib")
sentence_classifier_language_expression = joblib.load(f"{models_path}/language_expression.joblib")
sentence_classifier_all_info_kinase = joblib.load(f"{models_path}/all_info_kinase.joblib")
sentence_classifier_curatable_kinase = joblib.load(f"{models_path}/curatable_kinase.joblib")
sentence_classifier_language_kinase = joblib.load(f"{models_path}/language_kinase.joblib")
logger.info("All sentence classifiers loaded")
return {
"expression": {
"all_info": sentence_classifier_all_info_expression,
"curatable": sentence_classifier_curatable_expression,
"language": sentence_classifier_language_expression
},
"kinase": {
"all_info": sentence_classifier_all_info_kinase,
"curatable": sentence_classifier_curatable_kinase,
"language": sentence_classifier_language_kinase
}
}

def on_post(self, req, resp, req_type):
if req_type != "classify_sentences" or "sentences" not in req.media:
raise falcon.HTTPError(falcon.HTTP_BAD_REQUEST)
sentence_embeddings = self.sent2vec_model.embed_sentences(req.media["sentences"])
classes_all_info_expression = self.sentence_classifiers["expression"]["all_info"].predict(sentence_embeddings)
classes_curatable_expression = self.sentence_classifiers["expression"]["curatable"].predict(sentence_embeddings)
classes_language_expression = self.sentence_classifiers["expression"]["language"].predict(sentence_embeddings)
classes_all_info_kinase = self.sentence_classifiers["kinase"]["all_info"].predict(sentence_embeddings)
classes_curatable_kinase = self.sentence_classifiers["kinase"]["curatable"].predict(sentence_embeddings)
classes_language_kinase = self.sentence_classifiers["kinase"]["language"].predict(sentence_embeddings)
classes = {
"expression": {
"all_info": classes_all_info_expression.tolist(),
"curatable": classes_curatable_expression.tolist(),
"language": classes_language_expression.tolist()
},
"kinase": {
"all_info": classes_all_info_kinase.tolist(),
"curatable": classes_curatable_kinase.tolist(),
"language": classes_language_kinase.tolist()
}
}
resp.body = json.dumps({"classes": classes})
resp.status = falcon.HTTP_200


def main():
logging.basicConfig(level='INFO', format='%(asctime)s - %(name)s - %(levelname)s:%(message)s')
app = falcon.App(middleware=[HandleCORS()])
sentence_classification_reader = SentenceClassificationReader()
app.add_route('/api/sentence_classification/{req_type}', sentence_classification_reader)

httpd = simple_server.make_server('0.0.0.0', 8002, app)
httpd.serve_forever()


main()
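
A quick local smoke test for the new service, as a sketch (localhost and the example sentences are assumptions; the route and response shape come from the code above):

import requests

sentences = ["daf-16 is expressed in the intestine.",
             "Strains were obtained from the CGC."]
res = requests.post("http://localhost:8002/api/sentence_classification/classify_sentences",
                    json={"sentences": sentences})
classes = res.json()["classes"]
# Pair each sentence with its predicted labels; label values depend on the
# trained classifiers (typically 0/1 for binary models).
for sentence, label in zip(sentences, classes["kinase"]["curatable"]):
    print(f"curatable={label}  {sentence}")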
