-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcharacter.py
177 lines (164 loc) · 6.82 KB
/
character.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import string
import operator
import itertools
import collections
import scipy.special
from utils import info, success, warning, error, alphabet
def validate_charset(cs, verbose):
    """Compare a character set with well-known alphabets and print findings.

    The known alphabet with the smallest symmetric difference to ``cs`` is
    selected (the first one listed wins ties) and used as the reference.

    Args:
        cs: set of characters observed in the tokens. It must be a ``set``
            or ``frozenset`` because set operators are applied to it.
        verbose: when true, also report that the character set is complete.

    Returns:
        None. Findings are printed; warnings go through ``warning`` imported
        from ``utils`` (presumably a message formatter -- confirm there).
    """
    # Try to match the character set with something known.
    # The ascii_* constants are used because string.letters and friends are
    # locale-dependent on Python 2 and do not exist on Python 3.
    known_sets = [
        (string.printable, None),
        (string.ascii_letters + string.digits + "/+", "base64"),
        (string.ascii_letters + string.digits + "_-", "base64_urlsafe"),
        (string.ascii_letters + string.digits, None),
        (string.ascii_letters, None),
        (string.ascii_uppercase + string.digits, None),
        (string.ascii_uppercase, None),
        (string.ascii_lowercase + string.digits, None),
        # TODO: base32
        (string.ascii_lowercase, None),
        (string.hexdigits, "hexadecimal"),
        ("0123456789ABCDEF", "hexadecimal"),
        ("0123456789abcdef", "hexadecimal"),
        (string.digits, None),
        (string.octdigits, "octal"),
        ("01", "binary"),
    ]
    candidates = [
        (set(chars), len(set(chars) ^ cs), name) for chars, name in known_sets
    ]
    # min() returns the first minimal entry, like a stable sort followed by [0].
    closest, diff, sugg = min(candidates, key=operator.itemgetter(1))

    if diff == 0:
        if verbose:
            print(" The character set seems complete.")
        if sugg:
            print(warning(" It seems to be {}, consider decoding and use the binary tests".format(sugg)))
        return

    if len(cs) <= 4:
        print(warning(" Character set is tiny"))
        return

    reference = "".join(sorted(closest))
    if cs <= closest:
        # cs is a strict subset, so diff is exactly the number of missing chars.
        missing = "".join(sorted(closest - cs))
        print(warning(" It seems that {0} character(s) is/are missing: {1}".format(
            diff, missing)))
    else:
        # diff also counts characters missing from cs, so report the number
        # of extra characters actually listed rather than diff.
        extra = sorted(cs - closest)
        print(warning(" It seems that {0} character(s) is/are extra: {1}".format(
            len(extra), "".join(extra))))
    print(warning(" (compared to {})".format(reference)))
def expand_tokens_chars(tks):
    """Transpose tokens into per-position character sequences.

    Args:
        tks: iterable of tokens (strings or other sequences of characters).

    Returns:
        A list of tuples; the i-th tuple holds the i-th character of every
        token. ``zip`` stops at the shortest token, so positions beyond that
        length are dropped.
    """
    # Build a real list: on Python 3 a bare zip object can only be walked
    # once, whereas the result is iterated several times by the test runner.
    return list(zip(*tks))
def global_freq_test(csqs, verbose):
    """Run the frequency test on all character positions pooled together.

    Args:
        csqs: iterable of per-position character sequences.
        verbose: when true, print the global character set and the
            statistic computed by ``freq_test``.

    Returns:
        The ``(p_value, reason)`` pair produced by ``freq_test``, with a hint
        appended to ``reason`` about positions using different character sets.
    """
    # csqs is walked twice below; keep a list so a one-shot iterator works.
    csqs = list(csqs)
    # alphabet() comes from utils; presumably it returns the set of distinct
    # characters of its argument -- confirm against utils.
    gcs = alphabet(["".join(alphabet(csq)) for csq in csqs])
    if verbose:
        print(" Global Character Set: " + "".join(sorted(gcs)))
    validate_charset(gcs, verbose)

    pooled = "".join("".join(csq) for csq in csqs)
    p_value, reason = freq_test(pooled, gcs, verbose)
    reason += ("\nThis could mean that some position does not use the same "
               "character set.\nConsider verifying the result of freq_test")
    return p_value, reason
def freq_test(csq, cs, verbose):
    """Chi-square goodness-of-fit test for uniform character frequencies.

    Args:
        csq: sequence of characters (string, tuple or list) to test.
        cs: iterable of the distinct characters expected in ``csq``.
        verbose: when true, print the chi-square statistic.

    Returns:
        ``(p_value, reason)``: the p-value for the hypothesis that every
        character of ``cs`` is equally likely, and a text listing the
        occurrence counts from most to least frequent.

    Raises:
        ZeroDivisionError: if ``cs`` is empty, or if ``csq`` is empty (the
            expected count is then zero).
    """
    n = len(csq)
    f = {c: csq.count(c) for c in cs}
    expected = float(n) / len(cs)
    v_obs = sum((count - expected) ** 2 / expected for count in f.values())
    if verbose:
        print(" X^2 = %f" % v_obs)
    # Chi-square survival function with len(cs) - 1 degrees of freedom.
    # gammaincc is the complement of gammainc; computing it directly keeps
    # precision for very small p-values, which "1 - gammainc" rounds to 0.
    p_value = scipy.special.gammaincc((len(cs) - 1) / 2.0, v_obs / 2)

    ranked = sorted(f.items(), key=operator.itemgetter(1), reverse=True)
    reason = "Some characters appear not or too often. Below is the occurrence numbers:\n"
    reason += "Min={0} Max={1} E.Avg={2}\n".format(
        min(f.values()), max(f.values()), expected)
    reason += "{"
    reason += "\n ".join("'" + k + "'" + ": " + str(v) for k, v in ranked)
    reason += "}"
    return p_value, reason
def serial_test_nonoverlap(csq, cs, verbose):
    """Chi-square test on non-overlapping pairs of consecutive characters.

    ``csq`` is cut into pairs ``(csq[0], csq[1])``, ``(csq[2], csq[3])``, ...
    and the pair counts are compared with a uniform distribution over all
    ``len(cs) ** 2`` possible pairs. A trailing unpaired character is ignored.

    Args:
        csq: sequence of characters (string, tuple or list) to test.
        cs: iterable of the distinct characters that may appear in ``csq``.
        verbose: when true, print the chi-square statistic.

    Returns:
        ``(p_value, reason)``: the p-value for the hypothesis that all pairs
        are equally likely, and a text listing the pair counts from most to
        least frequent.

    Raises:
        KeyError: if ``csq`` contains a character that is not in ``cs``.
        ZeroDivisionError: if ``cs`` is empty.
    """
    pair_total = len(csq) // 2
    dv = {"".join(pair): 0 for pair in itertools.product(cs, repeat=2)}
    for i in range(pair_total):
        dv["".join(csq[2 * i:2 * i + 2])] += 1

    categories = len(cs) ** 2
    # The expectation is based on the number of pairs actually tallied, so
    # observed and expected counts sum to the same total for odd lengths too.
    expected = float(pair_total) / categories
    if pair_total:
        v_obs = sum((obs - expected) ** 2 / expected for obs in dv.values())
    else:
        # No complete pair: all observed and expected counts are zero.
        v_obs = 0.0
    if verbose:
        print(" X^2 = %f" % v_obs)
    # Chi-square survival function with categories - 1 degrees of freedom.
    # Divide by 2.0 so the degrees of freedom are not truncated by integer
    # division on Python 2.
    p_value = scipy.special.gammaincc((categories - 1) / 2.0, v_obs / 2)

    ranked = sorted(dv.items(), key=operator.itemgetter(1), reverse=True)
    reason = "Some character transitions are more probable than others:\n"
    reason += "Min={0} Max={1} E.Avg={2}\n".format(
        min(dv.values()), max(dv.values()), expected)
    reason += "{"
    reason += "\n ".join("'" + k + "'" + ": " + str(v) for k, v in ranked)
    reason += "}"
    return p_value, reason
#def _phi_m(csq, cs, l, m, verbose):
# v = list(itertools.product(cs, repeat=m))
# dv = dict(zip([ "".join(x) for x in v], [0,] * len(v)))
# ecsq = csq + csq[:m-1]
# n = len(csq)
# for i in range(len(csq)):
# dv["".join(ecsq[i:i+m])] += 1
# print dv
# print n, sum(dv.values())
# assert n == sum(dv.values())
# s = sum([ v**2 for k,v in dv.iteritems()])
# return ((float(l**m) * s) / n) - n
#
#def serial_test_overlap(csq, cs, verbose):
# l = len(cs)
# n = len(csq)
# phi_3 = _phi_m(csq, cs, l, 3, verbose)
# phi_2 = _phi_m(csq, cs, l, 2, verbose)
# phi_1 = _phi_m(csq, cs, l, 1, verbose)
# print phi_3, phi_2, phi_1
# d1 = phi_3 - phi_2
# d2 = phi_3 - 2*phi_2 + phi_1
# print d1, d2
# p_value1 = 1 - scipy.special.gammainc(2, d1/2)
# p_value2 = 1 - scipy.special.gammainc(1, d2/2)
# print p_value1, p_value2
# return p_value, reason
def correlation_test(csq1, csq2, cs):
    """Print the least and most frequent character pairings of two positions.

    Characters of ``csq1`` and ``csq2`` are paired index by index, and the
    occurrences of every combination ``(c1, c2)`` with both characters taken
    from ``cs`` are counted.

    Args:
        csq1: first sequence of characters.
        csq2: second sequence of characters.
        cs: iterable of characters; every ordered pair of its elements is
            considered.

    Returns:
        None. Prints the ``(pair, count)`` entries with the lowest and the
        highest count, separated by a space.

    Raises:
        IndexError: if ``cs`` is empty (there is no entry to print).
    """
    # One counting pass instead of a list scan per combination; this also
    # works on Python 3 where zip() objects have no count() method.
    occurrences = collections.Counter(zip(csq1, csq2))
    dpairs = [("".join([c1, c2]), occurrences[(c1, c2)])
              for c1 in cs for c2 in cs]
    sdpairs = sorted(dpairs, key=operator.itemgetter(1))
    print("{0} {1}".format(sdpairs[0], sdpairs[-1]))
def diff_test(csq, cs):
    """Print a histogram of circular distances between consecutive characters.

    Each character is mapped to its position in the alphabet ``cs``; the
    distance between two neighbours of ``csq`` is the shorter way around
    the alphabet seen as a ring.

    Args:
        csq: sequence of characters (string, tuple or list).
        cs: the alphabet. An ordered sequence (string, list, tuple) is used
            as given; anything without an ``index`` method, such as a set,
            is put in sorted order first (assumed to be the intended
            ordering -- confirm with callers).

    Returns:
        None. Prints the ``(distance, occurrences)`` pairs sorted by
        increasing number of occurrences.

    Raises:
        ValueError: if ``csq`` contains a character that is not in ``cs``.
    """
    order = cs if hasattr(cs, "index") else sorted(cs)
    size = len(order)
    ddiff = collections.defaultdict(int)
    for current, following in zip(csq, csq[1:]):
        gap = abs(order.index(current) - order.index(following))
        # Shorter of the two ways around the ring of `size` positions.
        ddiff[min(gap, size - gap)] += 1
    print(sorted(ddiff.items(), key=operator.itemgetter(1)))
def analyse(tks, alpha, verbose):
    """Entry point: run the character-level tests on a collection of tokens.

    Args:
        tks: sized collection of tokens (``len`` is taken for the banner).
        alpha: significance level; a test fails when its p-value is below it.
        verbose: forwarded to the tests to enable detailed output.

    Returns:
        None. All results are printed.
    """
    print(info("Running the character analysis on {} tokens...".format(len(tks))))
    # Materialise the per-position columns: run_all_tests walks them once per
    # test, and a lazy zip object (Python 3) would be empty after one pass.
    csqs = list(expand_tokens_chars(tks))
    run_all_tests(csqs, alpha, verbose)
def _report_outcome(name, p_value, reason, alpha, verbose, position=None):
    """Print the failure (always) or success (verbose only) line of one test."""
    if position is None:
        detail = "p-value={0}".format(p_value)
    else:
        detail = "character position={0}, p-value={1}".format(position, p_value)
    if p_value < alpha:
        print(error(" {0} has failed ({1})".format(name, detail)))
        print(" Reason:\n {}".format(reason.replace("\n", "\n ")))
    elif verbose:
        print(success(" {0} has passed ({1})".format(name, detail)))


def run_all_tests(csqs, alpha, verbose):
    """Run the global test, then every per-position test, and print results.

    Args:
        csqs: iterable of per-position character sequences.
        alpha: significance level; a test fails when its p-value is below it.
        verbose: when true, also print character sets, statistics and
            successful results.

    Returns:
        None. All results are printed.
    """
    # The sequences are iterated once per test below, so keep them in a list.
    csqs = list(csqs)

    for test in [global_freq_test, ]:
        print("Running {}".format(test.__name__))
        p_value, reason = test(csqs, verbose)
        _report_outcome(test.__name__, p_value, reason, alpha, verbose)

    for test in [freq_test, serial_test_nonoverlap, ]:
        for i, csq in enumerate(csqs):
            print("Running {} at position {}".format(test.__name__, i))
            # alphabet() comes from utils; presumably the set of distinct
            # characters at this position -- confirm against utils.
            lcs = alphabet(csq)
            if verbose:
                print(" Local Character Set: " + "".join(sorted(lcs)))
            validate_charset(lcs, verbose)
            p_value, reason = test(csq, lcs, verbose)
            _report_outcome(test.__name__, p_value, reason, alpha, verbose,
                            position=i)
# diff_test(csq,cgs)
#for csq1, csq2 in combinations(csqs, 2):
# correlation_test(csq1, csq2, gcs)