forked from amitKr85/project_student_allocation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_verb_adj.py
41 lines (33 loc) · 1.05 KB
/
extract_verb_adj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import docx
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import re
def getText(filename):
    """Return the text of a .docx file, one paragraph per line.

    Opens ``filename`` with ``docx.Document`` and joins the ``text`` of
    every entry in the document's ``paragraphs`` with newline characters.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
# Driver code: extract the text of the sample abstracts document.
docxText = getText("test_files/abstracts.docx")

# Every token of the document together with its part-of-speech tag;
# iterated below as (word, tag) pairs.
allSamplePos = pos_tag(word_tokenize(docxText))

# Words to eliminate from the result.
stop_words = set(stopwords.words('english'))

# Maps each retained word to its POS tag. Only tags starting with "V"
# (verbs) or "J" (adjectives) are kept.
# NOTE: a word that occurs with several tags keeps only the last tag seen.
words = {}
for word, pos in allSamplePos:
    # The stop-word list is lower-case, so compare case-insensitively;
    # otherwise capitalised stop words (e.g. a sentence-initial "Is")
    # would slip through the filter.
    if pos.startswith(("V", "J")) and word.lower() not in stop_words:
        words[word] = pos

# Print all verbs, tab-separated and padded to 20 columns.
print("\nall appeared verbs..\n")
for word, pos in words.items():
    if pos.startswith("V"):
        print(word.ljust(20), end="\t")

# Print all adjectives in the same layout.
print("\nall appeared adjectives..\n")
for word, pos in words.items():
    if pos.startswith("J"):
        print(word.ljust(20), end="\t")

# Terminate the final output line, which would otherwise end on a tab.
print()