From 54bc3f25e64d03ec747bc7d2147f2af8a3c87103 Mon Sep 17 00:00:00 2001
From: Haider Ali
Date: Wed, 15 Nov 2023 06:36:31 -0500
Subject: [PATCH 1/6] Delete Test.py

---
 Test.py | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 Test.py

diff --git a/Test.py b/Test.py
deleted file mode 100644
index 50047bb..0000000
--- a/Test.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from Questgen import main
-
-payload = {
-    "input_text": """Computer vision is a field of artificial intelligence (AI) that enables computers and systems to derive meaningful information from digital images, videos and other visual inputs — and take actions or make recommendations based on that information. If AI enables computers to think, computer vision enables them to see, observe and understand.
-
-Computer vision works much the same as human vision, except humans have a head start. Human sight has the advantage of lifetimes of context to train how to tell objects apart, how far away they are, whether they are moving and whether there is something wrong in an image.
-
-Computer vision trains machines to perform these functions, but it has to do it in much less time with cameras, data and algorithms rather than retinas, optic nerves and a visual cortex. Because a system trained to inspect products or watch a production asset can analyze thousands of products or processes a minute, noticing imperceptible defects or issues, it can quickly surpass human capabilities.
-
-Computer vision is used in industries ranging from energy and utilities to manufacturing and automotive – and the market is continuing to grow. It is expected to reach USD 48.6 billion by 2022.""",
-    "input_question": ["What is Computer Vision?", "Computer Vision is used in which industries?"]
-}
-
-
-predictor = main.AnswerPredictor()
-answers = predictor.predict_answer(payload)
-
-
-print(answers)
\ No newline at end of file

From bed0ca74525b0cc92e399be9945740ca94e2a88c Mon Sep 17 00:00:00 2001
From: Haider Ali
Date: Wed, 15 Nov 2023 07:55:04 -0500
Subject: [PATCH 2/6] Delete Questgen/main.py

---
 Questgen/main.py | 288 -----------------------------------------------
 1 file changed, 288 deletions(-)
 delete mode 100644 Questgen/main.py

diff --git a/Questgen/main.py b/Questgen/main.py
deleted file mode 100644
index cc9178d..0000000
--- a/Questgen/main.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import numpy as np # linear algebra
-import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
-import time
-import torch
-from transformers import T5ForConditionalGeneration,T5Tokenizer
-import random
-import spacy
-import zipfile
-import os
-import json
-from sense2vec import Sense2Vec
-import requests
-from collections import OrderedDict
-import string
-import pke
-import nltk
-import numpy
-from nltk import FreqDist
-nltk.download('brown', quiet=True, force=True)
-nltk.download('stopwords', quiet=True, force=True)
-nltk.download('popular', quiet=True, force=True)
-from nltk.corpus import stopwords
-from nltk.corpus import brown
-from similarity.normalized_levenshtein import NormalizedLevenshtein
-from nltk.tokenize import sent_tokenize
-from flashtext import KeywordProcessor
-from Questgen.encoding.encoding import beam_search_decoding
-from Questgen.mcq.mcq import tokenize_sentences
-from Questgen.mcq.mcq import get_keywords
-from Questgen.mcq.mcq import get_sentences_for_keyword
-from Questgen.mcq.mcq import generate_questions_mcq
-from Questgen.mcq.mcq import generate_normal_questions
-import time
-
-class QGen:
-
-    def __init__(self):
-
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
-        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model.to(device)
-        # model.eval()
-        self.device = device
-        self.model = model
-        self.nlp = spacy.load('en_core_web_sm', quiet=True)
-
-        self.s2v = Sense2Vec().from_disk('s2v_old')
-
-        self.fdist = FreqDist(brown.words())
-        self.normalized_levenshtein = NormalizedLevenshtein()
-        self.set_seed(42)
-
-    def set_seed(self,seed):
-        numpy.random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(seed)
-
-    def predict_mcq(self, payload):
-        start = time.time()
-        inp = {
-            "input_text": payload.get("input_text"),
-            "max_questions": payload.get("max_questions", 4)
-        }
-
-        text = inp['input_text']
-        sentences = tokenize_sentences(text)
-        joiner = " "
-        modified_text = joiner.join(sentences)
-
-
-        keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
-
-
-        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
-
-        for k in keyword_sentence_mapping.keys():
-            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
-            keyword_sentence_mapping[k] = text_snippet
-
-
-        final_output = {}
-
-        if len(keyword_sentence_mapping.keys()) == 0:
-            return final_output
-        else:
-            try:
-                generated_questions = generate_questions_mcq(keyword_sentence_mapping,self.device,self.tokenizer,self.model,self.s2v,self.normalized_levenshtein)
-
-            except:
-                return final_output
-            end = time.time()
-
-            final_output["statement"] = modified_text
-            final_output["questions"] = generated_questions["questions"]
-            final_output["time_taken"] = end-start
-
-            if torch.device=='cuda':
-                torch.cuda.empty_cache()
-
-            return final_output
-
-    def predict_shortq(self, payload):
-        inp = {
-            "input_text": payload.get("input_text"),
-            "max_questions": payload.get("max_questions", 4)
-        }
-
-        text = inp['input_text']
-        sentences = tokenize_sentences(text)
-        joiner = " "
-        modified_text = joiner.join(sentences)
-
-
-        keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
-
-
-        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
-
-        for k in keyword_sentence_mapping.keys():
-            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
-            keyword_sentence_mapping[k] = text_snippet
-
-        final_output = {}
-
-        if len(keyword_sentence_mapping.keys()) == 0:
-            print('ZERO')
-            return final_output
-        else:
-
-            generated_questions = generate_normal_questions(keyword_sentence_mapping,self.device,self.tokenizer,self.model)
-            print(generated_questions)
-
-
-        final_output["statement"] = modified_text
-        final_output["questions"] = generated_questions["questions"]
-
-        if torch.device=='cuda':
-            torch.cuda.empty_cache()
-
-        return final_output
-
-
-    def paraphrase(self,payload):
-        start = time.time()
-        inp = {
-            "input_text": payload.get("input_text"),
-            "max_questions": payload.get("max_questions", 3)
-        }
-
-        text = inp['input_text']
-        num = inp['max_questions']
-
-        self.sentence= text
-        self.text= "paraphrase: " + self.sentence + " "
-
-        encoding = self.tokenizer.encode_plus(self.text,pad_to_max_length=True, return_tensors="pt")
-        input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
-
-        beam_outputs = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_masks,
-            max_length= 50,
-            num_beams=50,
-            num_return_sequences=num,
-            no_repeat_ngram_size=2,
-            early_stopping=True
-            )
-
-# print ("\nOriginal Question ::")
-# print (text)
-# print ("\n")
-# print ("Paraphrased Questions :: ")
-        final_outputs =[]
-        for beam_output in beam_outputs:
-            sent = self.tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
-            if sent.lower() != self.sentence.lower() and sent not in final_outputs:
-                final_outputs.append(sent)
-
-        output= {}
-        output['Question']= text
-        output['Count']= num
-        output['Paraphrased Questions']= final_outputs
-
-        for i, final_output in enumerate(final_outputs):
-            print("{}: {}".format(i, final_output))
-
-        if torch.device=='cuda':
-            torch.cuda.empty_cache()
-
-        return output
-
-
-class BoolQGen:
-
-    def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
-        model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions')
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model.to(device)
-        # model.eval()
-        self.device = device
-        self.model = model
-        self.set_seed(42)
-
-    def set_seed(self,seed):
-        numpy.random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(seed)
-
-    def random_choice(self):
-        a = random.choice([0,1])
-        return bool(a)
-
-
-    def predict_boolq(self,payload):
-        start = time.time()
-        inp = {
-            "input_text": payload.get("input_text"),
-            "max_questions": payload.get("max_questions", 4)
-        }
-
-        text = inp['input_text']
-        num= inp['max_questions']
-        sentences = tokenize_sentences(text)
-        joiner = " "
-        modified_text = joiner.join(sentences)
-        answer = self.random_choice()
-        form = "truefalse: %s passage: %s " % (modified_text, answer)
-
-        encoding = self.tokenizer.encode_plus(form, return_tensors="pt")
-        input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
-
-        output = beam_search_decoding (input_ids, attention_masks,self.model,self.tokenizer)
-        if torch.device=='cuda':
-            torch.cuda.empty_cache()
-
-        final= {}
-        final['Text']= text
-        final['Count']= num
-        final['Boolean Questions']= output
-
-        return final
-
-class AnswerPredictor:
-
-    def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=512)
-        model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"cpu") - model.to(device) - # model.eval() - self.device = device - self.model = model - self.set_seed(42) - - def set_seed(self,seed): - numpy.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - - def greedy_decoding (inp_ids,attn_mask,model,tokenizer): - greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256) - Question = tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True) - return Question.strip().capitalize() - - def predict_answer(self,payload): - answers = [] - inp = { - "input_text": payload.get("input_text"), - "input_question" : payload.get("input_question") - } - for ques in payload.get("input_question"): - - context = inp["input_text"] - question = ques - input = "question: %s context: %s " % (question,context) - - encoding = self.tokenizer.encode_plus(input, return_tensors="pt") - input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device) - greedy_output = self.model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=256) - Question = self.tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True) - answers.append(Question.strip().capitalize()) - - return answers From 252fcad90d157b8575c2ca529895f81c9f2218d6 Mon Sep 17 00:00:00 2001 From: Haider Ali Date: Wed, 15 Nov 2023 07:55:28 -0500 Subject: [PATCH 3/6] Added BERT model --- Questgen/main.py | 310 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 Questgen/main.py diff --git a/Questgen/main.py b/Questgen/main.py new file mode 100644 index 0000000..86d591e --- /dev/null +++ b/Questgen/main.py @@ -0,0 +1,310 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. 
+import time
+import torch
+from transformers import T5ForConditionalGeneration,T5Tokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import random
+import spacy
+import zipfile
+import os
+import json
+from sense2vec import Sense2Vec
+import requests
+from collections import OrderedDict
+import string
+import pke
+import nltk
+import numpy
+from nltk import FreqDist
+nltk.download('brown', quiet=True, force=True)
+nltk.download('stopwords', quiet=True, force=True)
+nltk.download('popular', quiet=True, force=True)
+from nltk.corpus import stopwords
+from nltk.corpus import brown
+from similarity.normalized_levenshtein import NormalizedLevenshtein
+from nltk.tokenize import sent_tokenize
+from flashtext import KeywordProcessor
+from Questgen.encoding.encoding import beam_search_decoding
+from Questgen.mcq.mcq import tokenize_sentences
+from Questgen.mcq.mcq import get_keywords
+from Questgen.mcq.mcq import get_sentences_for_keyword
+from Questgen.mcq.mcq import generate_questions_mcq
+from Questgen.mcq.mcq import generate_normal_questions
+import time
+
+class QGen:
+
+    def __init__(self):
+
+        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
+        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model.to(device)
+        # model.eval()
+        self.device = device
+        self.model = model
+        self.nlp = spacy.load('en_core_web_sm', quiet=True)
+
+        self.s2v = Sense2Vec().from_disk('s2v_old')
+
+        self.fdist = FreqDist(brown.words())
+        self.normalized_levenshtein = NormalizedLevenshtein()
+        self.set_seed(42)
+
+    def set_seed(self,seed):
+        numpy.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+
+    def predict_mcq(self, payload):
+        start = time.time()
+        inp = {
+            "input_text": payload.get("input_text"),
+            "max_questions": payload.get("max_questions", 4)
+        }
+
+        text = inp['input_text']
+        sentences = tokenize_sentences(text)
+        joiner = " "
+        modified_text = joiner.join(sentences)
+
+
+        keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
+
+
+        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
+
+        for k in keyword_sentence_mapping.keys():
+            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+            keyword_sentence_mapping[k] = text_snippet
+
+
+        final_output = {}
+
+        if len(keyword_sentence_mapping.keys()) == 0:
+            return final_output
+        else:
+            try:
+                generated_questions = generate_questions_mcq(keyword_sentence_mapping,self.device,self.tokenizer,self.model,self.s2v,self.normalized_levenshtein)
+
+            except:
+                return final_output
+            end = time.time()
+
+            final_output["statement"] = modified_text
+            final_output["questions"] = generated_questions["questions"]
+            final_output["time_taken"] = end-start
+
+            if torch.device=='cuda':
+                torch.cuda.empty_cache()
+
+            return final_output
+
+    def predict_shortq(self, payload):
+        inp = {
+            "input_text": payload.get("input_text"),
+            "max_questions": payload.get("max_questions", 4)
+        }
+
+        text = inp['input_text']
+        sentences = tokenize_sentences(text)
+        joiner = " "
+        modified_text = joiner.join(sentences)
+
+
+        keywords = get_keywords(self.nlp,modified_text,inp['max_questions'],self.s2v,self.fdist,self.normalized_levenshtein,len(sentences) )
+
+
+        keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)
+
+        for k in keyword_sentence_mapping.keys():
+            text_snippet = " ".join(keyword_sentence_mapping[k][:3])
".join(keyword_sentence_mapping[k][:3]) + keyword_sentence_mapping[k] = text_snippet + + final_output = {} + + if len(keyword_sentence_mapping.keys()) == 0: + print('ZERO') + return final_output + else: + + generated_questions = generate_normal_questions(keyword_sentence_mapping,self.device,self.tokenizer,self.model) + print(generated_questions) + + + final_output["statement"] = modified_text + final_output["questions"] = generated_questions["questions"] + + if torch.device=='cuda': + torch.cuda.empty_cache() + + return final_output + + + def paraphrase(self,payload): + start = time.time() + inp = { + "input_text": payload.get("input_text"), + "max_questions": payload.get("max_questions", 3) + } + + text = inp['input_text'] + num = inp['max_questions'] + + self.sentence= text + self.text= "paraphrase: " + self.sentence + " " + + encoding = self.tokenizer.encode_plus(self.text,pad_to_max_length=True, return_tensors="pt") + input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device) + + beam_outputs = self.model.generate( + input_ids=input_ids, + attention_mask=attention_masks, + max_length= 50, + num_beams=50, + num_return_sequences=num, + no_repeat_ngram_size=2, + early_stopping=True + ) + +# print ("\nOriginal Question ::") +# print (text) +# print ("\n") +# print ("Paraphrased Questions :: ") + final_outputs =[] + for beam_output in beam_outputs: + sent = self.tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True) + if sent.lower() != self.sentence.lower() and sent not in final_outputs: + final_outputs.append(sent) + + output= {} + output['Question']= text + output['Count']= num + output['Paraphrased Questions']= final_outputs + + for i, final_output in enumerate(final_outputs): + print("{}: {}".format(i, final_output)) + + if torch.device=='cuda': + torch.cuda.empty_cache() + + return output + + +class BoolQGen: + + def __init__(self): + self.tokenizer = T5Tokenizer.from_pretrained('t5-large') + model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions') + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + # model.eval() + self.device = device + self.model = model + self.set_seed(42) + + def set_seed(self,seed): + numpy.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + def random_choice(self): + a = random.choice([0,1]) + return bool(a) + + + def predict_boolq(self,payload): + start = time.time() + inp = { + "input_text": payload.get("input_text"), + "max_questions": payload.get("max_questions", 4) + } + + text = inp['input_text'] + num= inp['max_questions'] + sentences = tokenize_sentences(text) + joiner = " " + modified_text = joiner.join(sentences) + answer = self.random_choice() + form = "truefalse: %s passage: %s " % (modified_text, answer) + + encoding = self.tokenizer.encode_plus(form, return_tensors="pt") + input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device) + + output = beam_search_decoding (input_ids, attention_masks,self.model,self.tokenizer) + if torch.device=='cuda': + torch.cuda.empty_cache() + + final= {} + final['Text']= text + final['Count']= num + final['Boolean Questions']= output + + return final + +class AnswerPredictor: + + def __init__(self, model_name = "T5"): + self.model_name = model_name + if self.model_name: + self.tokenizer = T5Tokenizer.from_pretrained('t5-large', 
+            model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
+
+        if self.model_name == "BERT":
+            from transformers import AutoModelForQuestionAnswering, AutoTokenizer
+            model = AutoModelForQuestionAnswering.from_pretrained("Falconsai/question_answering")
+            self.tokenizer = AutoTokenizer.from_pretrained("Falconsai/question_answering")
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model.to(device)
+        # model.eval()
+        self.device = device
+        self.model = model
+        self.set_seed(42)
+
+    def set_seed(self,seed):
+        numpy.random.seed(seed)
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+
+    def greedy_decoding (inp_ids,attn_mask,model,tokenizer):
+        greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask)
+        Question = tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
+        return Question.strip().capitalize()
+
+    def predict_answer(self,payload):
+        answers = []
+        inp = {
+            "input_text": payload.get("input_text"),
+            "input_question" : payload.get("input_question")
+        }
+
+        for ques in payload.get("input_question"):
+
+            context = inp["input_text"]
+            question = ques
+            input = "question: %s context: %s " % (question,context)
+
+            inputs = self.tokenizer(question, context, return_tensors="pt")
+            if self.model_name == "T5":
+                encoding = self.tokenizer.encode_plus(input, return_tensors="pt")
+                input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
+                encoding = self.tokenizer.encode_plus(input, return_tensors="pt")
+                input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
+                greedy_output = self.model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=256)
+                Answer = self.tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
+                answers.append(Answer.strip().capitalize())
+            if self.model_name == "BERT":
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                answer_start_index = outputs.start_logits.argmax()
+                answer_end_index = outputs.end_logits.argmax()
+                predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+                answer = self.tokenizer.decode(predict_answer_tokens)
+                answers.append(answer.strip().capitalize())
+
+        return answers

From e8e8b63a8b1deed5d03900d1f4e73bef2cf1103b Mon Sep 17 00:00:00 2001
From: Haider Ali
Date: Wed, 15 Nov 2023 08:09:06 -0500
Subject: [PATCH 4/6] Update README.md

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 37643ed..51541a1 100644
--- a/README.md
+++ b/README.md
@@ -188,8 +188,10 @@ pprint (output)
 
 ### 2.5 Question Answering (Simple)
 
+A BERT model has also been added for Question Answering, for faster inference and shorter answers; the default is T5 Large.
+A list of questions can be passed to receive a list of answers.
 ```
-answer = main.AnswerPredictor()
+answer = main.AnswerPredictor(model_name="BERT")
 payload3 = {
     "input_text" : '''Sachin Ramesh Tendulkar is a former international cricketer from India and a former captain of the Indian national team. He is widely regarded
@@ -210,6 +212,7 @@ Sachin ramesh tendulkar is a former international cricketer from india and a for
 
 ### 2.6 Question Answering (Boolean)
 
+
 ```
 payload4 = {
     "input_text" : '''Sachin Ramesh Tendulkar is a former international cricketer from
@@ -233,6 +236,7 @@ Yes, sachin tendulkar is a former cricketer.
 
 For maintaining meaningfulness in Questions, Questgen uses three T5 models. One for Boolean Question generation, one for MCQs, FAQs, Paraphrasing and one for answer generation.
 
+
 ### Online Demo website.
 https://questgen.ai/

From cf7d6457ab8cfdf21684ea3713fa6127d6004eef Mon Sep 17 00:00:00 2001
From: Haider Ali
Date: Wed, 15 Nov 2023 13:33:33 -0500
Subject: [PATCH 5/6] Update setup.py

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index d29c44d..41da7ac 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
         'strsim==0.0.3',
         'six==1.16.0',
         'networkx==3.1',
-        'numpy==1.22.4',
+        'numpy',
         'scipy==1.10.1',
         'scikit-learn==1.2.2',
         'unidecode==1.3',
@@ -23,7 +23,7 @@
         'pytz==2022.7.1',
         'python-dateutil==2.8.2',
         'flashtext==2.7',
-        'pandas==1.5.3',
+        'pandas',
         'sentencepiece==0.1.99'
     ],
     package_data={'Questgen': ['questgen.py', 'mcq.py', 'train_gpu.py', 'encoding.py']}

From 4a488accbac6398d1929d60d272619365192eb39 Mon Sep 17 00:00:00 2001
From: Haider Ali
Date: Wed, 15 Nov 2023 21:56:57 -0500
Subject: [PATCH 6/6] Update main.py

---
 Questgen/main.py | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/Questgen/main.py b/Questgen/main.py
index 86d591e..a10d03b 100644
--- a/Questgen/main.py
+++ b/Questgen/main.py
@@ -1,5 +1,5 @@
-import numpy as np # linear algebra
-import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+import numpy as np
+import pandas as pd
 import time
 import torch
 from transformers import T5ForConditionalGeneration,T5Tokenizer
@@ -44,7 +44,7 @@ def __init__(self):
         # model.eval()
         self.device = device
         self.model = model
-        self.nlp = spacy.load('en_core_web_sm', quiet=True)
+        self.nlp = spacy.load('en_core_web_sm')
 
         self.s2v = Sense2Vec().from_disk('s2v_old')
@@ -166,7 +166,7 @@ def paraphrase(self,payload):
             num_beams=50,
             num_return_sequences=num,
             no_repeat_ngram_size=2,
-            early_stopping=True
+            early_stopping=True,
             )
 
 # print ("\nOriginal Question ::")
@@ -196,7 +196,7 @@ class BoolQGen:
 
     def __init__(self):
-        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
+        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
         model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions')
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model.to(device)
@@ -249,14 +249,15 @@ class AnswerPredictor:
 
     def __init__(self, model_name = "T5"):
         self.model_name = model_name
-        if self.model_name:
-            self.tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=512)
+        if self.model_name == "T5":
+            self.tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)
             model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
+            # print(model, self.tokenizer)
 
         if self.model_name == "BERT":
             from transformers import AutoModelForQuestionAnswering, AutoTokenizer
             model = AutoModelForQuestionAnswering.from_pretrained("Falconsai/question_answering")
-            self.tokenizer = AutoTokenizer.from_pretrained("Falconsai/question_answering")
+            self.tokenizer = AutoTokenizer.from_pretrained("Falconsai/question_answering", model_max_length=512)
 
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model.to(device)
@@ -272,20 +273,15 @@ def set_seed(self,seed):
             torch.cuda.manual_seed_all(seed)
 
     def greedy_decoding (inp_ids,attn_mask,model,tokenizer):
-        greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask)
+        greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
         Question = tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
         return Question.strip().capitalize()
 
     def predict_answer(self,payload):
         answers = []
-        inp = {
-            "input_text": payload.get("input_text"),
-            "input_question" : payload.get("input_question")
-        }
-
+        context = payload["input_text"]
+
         for ques in payload.get("input_question"):
-
-            context = inp["input_text"]
             question = ques
             input = "question: %s context: %s " % (question,context)
@@ -293,11 +289,11 @@ def predict_answer(self,payload):
             if self.model_name == "T5":
                 encoding = self.tokenizer.encode_plus(input, return_tensors="pt")
                 input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
-                encoding = self.tokenizer.encode_plus(input, return_tensors="pt")
-                input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
-                greedy_output = self.model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=256)
+                greedy_output = self.model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=512)
                 Answer = self.tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
-                answers.append(Answer.strip().capitalize())
+                output = Answer.strip().capitalize()
+                answers.append(output)
+                break
             if self.model_name == "BERT":
                 with torch.no_grad():
                     outputs = self.model(**inputs)
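
For reference, a minimal usage sketch of the `AnswerPredictor` API as it stands after PATCH 6/6. This snippet is not part of the patches: the `model_name="BERT"` argument and the payload keys are taken from the series above (and from the Test.py removed in PATCH 1/6), and it assumes the Questgen package and the `Falconsai/question_answering` weights are available locally.

```
from Questgen import main

# BERT backend added in PATCH 3/6: extractive QA, one short answer per question.
predictor = main.AnswerPredictor(model_name="BERT")

payload = {
    "input_text": "Sachin Ramesh Tendulkar is a former international cricketer "
                  "from India and a former captain of the Indian national team.",
    "input_question": ["Who is Sachin Tendulkar?", "Which team did he captain?"],
}

# Returns a list of answers, in the same order as the input questions.
print(predictor.predict_answer(payload))
```

Note that after PATCH 6/6 the T5 branch of `predict_answer` breaks out of the loop after the first question, so only the BERT backend returns one answer per question; with the default `model_name="T5"`, the returned list contains a single answer.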