-
Notifications
You must be signed in to change notification settings - Fork 2
/
chunk.py
59 lines (40 loc) · 1.86 KB
/
chunk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# Chunk one sentence with NLTK: build a single-rule RegexpParser per pattern,
# parse the POS-tagged sentence with each, and collect the matched phrases
# under the pattern that produced them.
#
# NOTE(review): the original header also mentions a list `q` holding the rows
# of a previous run; nothing in this script defines it yet (see the
# commented-out sketch at the end of the file).

# Names of all patterns: n (noun), nn (noun noun), an (adjective noun),
# npn (noun preposition noun), a (adjective), aa (adjective adjective).
our_p = ["n", "nn", "an", "npn", "a", "aa"]

# 2-D list, one row per pattern: [pattern, chunk, chunk, ...].
chunks = [["n"], ["nn"], ["an"], ["npn"], ["a"], ["aa"]]

# One chunk-grammar rule per pattern, written as "<name>:{<tag pattern>}".
# The rule name becomes the label of every subtree the rule produces.
# "npn" uses <IN>, the Penn Treebank preposition tag; the previous <PRP> is
# the personal-pronoun tag and contradicted the documented intent.
patterns = [
    "n:{<NN>}",
    "nn:{<NN><NN>}",
    "an:{<JJ><NN>}",
    "npn:{<NN><IN><NN>}",
    "a:{<JJ>}",
    "aa:{<JJ><JJ>}",
]

# For now the program handles a single sentence, not a whole corpus.
sen = "He is driving a red car"

# List of (word, POS tag) pairs for the sentence.
pos_tags = pos_tag(word_tokenize(sen))

# Row lookup by pattern name. The values are the very same list objects that
# live in `chunks`, so appending through this mapping updates `chunks`.
rows_by_name = dict((row[0], row) for row in chunks)

# Run every pattern over the tagged sentence.
for patt in patterns:
    par = nltk.RegexpParser(patt)
    result = par.parse(pos_tags)
    for n in result:
        # Children of the root (S) are plain (word, tag) leaves unless the
        # rule matched, in which case they are labelled subtrees.
        if not isinstance(n, nltk.tree.Tree):
            continue
        # NLTK 3 replaced the `Tree.node` attribute with `Tree.label()`;
        # fall back to the attribute only on older releases.
        label = n.label() if hasattr(n, "label") else n.node
        row = rows_by_name.get(label)
        if row is None:
            continue
        # Join the words of the subtree's leaves, e.g.
        # (red, JJ), (car, NN) -> "red car" (no leading space).
        row.append(" ".join(leaf[0] for leaf in n))

# Result layout: [pattern, chunk, chunk, ...], [pattern, chunk, chunk, ...], ...
print(chunks)
#extraction of chunks
# p[i].append(#chunks)
#save to an external file
#for i in p:
# for j in p[i]:
# for k in q[i]:
# for l in q[j]:
# if(p[i][j] != q[k][l]):
# new.append(p[i][j])