-
Notifications
You must be signed in to change notification settings - Fork 1
/
IBM-M2.py
90 lines (80 loc) · 2.23 KB
/
IBM-M2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
author: Gaurav Ahuja
date: November 15, 2013
IBM Model 2
"""
import time
import sys
import cPickle as pickle
from collections import defaultdict
from corpus import *
# File handed to loadP() in IBM_M2 to obtain the initial translation table t.
# The usage text says IBM-M1.py must have been run first, so presumably that
# script is what produces this file -- confirm against IBM-M1.py.
ibmm1FileName = 'ibmm1.dat'
# Output files: IBM_M2 passes these to writeP() after the last iteration,
# the first for the translation table t, the second for the alignment table q.
ibmm2tFileName = 'ibmm2t.dat'
ibmm2qFileName = 'ibmm2q.dat'
def calcDenominator_2(fs, es, t, q):
    """Compute the normalisation term for every foreign word position.

    For sentence pair k and 1-based foreign position i, the result holds
        sum over j of q[(j, i, lk, mk)] * t[es[k][j]][fs[k][i-1]]
    where j runs over every position of es[k], mk = len(fs[k]) and
    lk = len(es[k]) - 1 (es[k] includes the null word, hence the -1).

    Args:
        fs: indexable collection of foreign sentences (sequences of words).
        es: English sentences, aligned with fs by index.
        t: nested mapping, read as t[english_word][foreign_word].
        q: mapping keyed by (j, i, lk, mk).

    Returns:
        A defaultdict(float) keyed by (k, i).
    """
    denominators = defaultdict(float)
    for k, foreign_sentence in enumerate(fs):
        english_sentence = es[k]
        mk = len(foreign_sentence)
        lk = len(english_sentence) - 1  # null word is included, hence -1
        # i is 1-based over the foreign words, j is 0-based over the English.
        for i, fw in enumerate(foreign_sentence, 1):
            position = (k, i)
            for j, ew in enumerate(english_sentence):
                denominators[position] += q[(j, i, lk, mk)] * t[ew][fw]
    return denominators
def updateT(t, wnc, wdc):
    """Overwrite entries of t in place with count ratios.

    For every (english_word, foreign_word) key of wnc, sets
    t[english_word][foreign_word] = wnc[(english_word, foreign_word)] / wdc[english_word].

    Args:
        t: nested mapping, written as t[english_word][foreign_word].
        wnc: mapping from (english_word, foreign_word) to a numerator count.
        wdc: mapping from english_word to a denominator count.
    """
    for (ew, fw), pair_count in wnc.items():
        t[ew][fw] = pair_count / wdc[ew]
def updateQ(q, pnc, pdc):
    """Overwrite entries of q in place with count ratios.

    For every (j, i, lk, mk) key of pnc, sets
    q[(j, i, lk, mk)] = pnc[(j, i, lk, mk)] / pdc[(i, lk, mk)].

    Args:
        q: mapping keyed by (j, i, lk, mk); updated in place.
        pnc: mapping from (j, i, lk, mk) to a numerator count.
        pdc: mapping from (i, lk, mk) to a denominator count.
    """
    for alignment, alignment_count in pnc.items():
        j, i, lk, mk = alignment
        q[alignment] = alignment_count / pdc[(i, lk, mk)]
def IBM_M2(fs, es, S = 5):
    """Run S rounds of the IBM Model 2 count/update loop and save the tables.

    t is obtained from loadP(ibmm1FileName) and q from initializeQ(fs, es);
    both helpers are defined outside this block (presumably in the corpus
    module pulled in by the star import -- confirm). Each round accumulates
    fractional counts over every (sentence, foreign position, English
    position) triple, then rewrites t and q through updateT / updateQ.
    After the final round t and q are handed to writeP().

    Args:
        fs: indexable collection of foreign sentences (sequences of words).
        es: English sentences, aligned with fs by index; each includes the
            null word.
        S: number of iterations to run.

    Returns:
        The translation table t after the last iteration.
    """
    t = loadP(ibmm1FileName)
    q = initializeQ(fs, es)
    for s in range(1, S + 1):
        print("Iteration # %d" % s)
        started = time.time()

        # Counts are rebuilt from scratch every iteration.
        wnc = defaultdict(float)  # keyed by (english_word, foreign_word)
        wdc = defaultdict(float)  # keyed by english_word
        pnc = defaultdict(float)  # keyed by (j, i, lk, mk)
        pdc = defaultdict(float)  # keyed by (i, lk, mk)

        # Normalisers come from the current t and q, before any update.
        d = calcDenominator_2(fs, es, t, q)

        for k, foreign_sentence in enumerate(fs):
            english_sentence = es[k]
            mk = len(foreign_sentence)
            lk = len(english_sentence) - 1  # null word is included, hence -1
            for i, fw in enumerate(foreign_sentence, 1):
                normaliser_key = (k, i)
                length_key = (i, lk, mk)
                for j, ew in enumerate(english_sentence):
                    alignment = (j, i, lk, mk)
                    delta = q[alignment] * t[ew][fw] / d[normaliser_key]
                    wnc[(ew, fw)] += delta
                    wdc[ew] += delta
                    pnc[alignment] += delta
                    pdc[length_key] += delta

        updateT(t, wnc, wdc)
        updateQ(q, pnc, pdc)

        elapsed = time.time() - started
        print("Time taken to complete iteration# %d: %.2f(s)" % (s, elapsed))

    writeP(t, ibmm2tFileName)
    writeP(q, ibmm2qFileName)
    return t
if __name__ == '__main__':
    # Script entry point: expects exactly two command-line arguments, the
    # foreign corpus and the English corpus, which are passed unchanged to
    # extractSentences() (defined outside this file's visible code).
    if len(sys.argv) != 3:
        # A bad invocation is an error: sys.exit() with a string writes the
        # message to stderr and exits with status 1, so calling scripts can
        # detect the failure instead of seeing a "successful" exit code 0.
        sys.exit("Usage: python IBM-M2.py foreignCorpus.gz englishCorpus.gz\n Assumes IBM-M1.py was executed before")
    started = time.time()
    (fs, es) = extractSentences(sys.argv[1], sys.argv[2])
    # Train for 5 iterations; IBM_M2 also writes the resulting tables to disk.
    t = IBM_M2(fs, es, 5)
    elapsed = time.time() - started
    print("Time taken to calculate IBM-M2: %.2f(s)" % elapsed)