lmh_stats.py

#!/usr/bin/env python3

"""
Creates statistics about SMGloM.

This script analyzes the data collected with lmh_harvest.py.
A verbosity level can be set to control what kinds of errors
are displayed during data collection.

TODO: CREATE TABLE DATA INDEPENDENTLY OF PRESENTATION
"""

import os

import lmh_harvest as harvest
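
# Note: the statistics below assume that harvest.DataGatherer exposes list-of-dict
# attributes named sigfiles, langfiles, modules, symis, defis, and trefis, as
# produced by lmh_harvest.py (this is how the gatherer is consumed in this script).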

def partition(entries, key):
    result = {}
    for entry in entries:
        k = key(entry)
        if k not in result:
            result[k] = []
        result[k].append(entry)
    return result
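
# Illustration of partition (the entry dicts are made up; real entries come from
# lmh_harvest):
#   partition([{"repo": "a"}, {"repo": "b"}, {"repo": "a"}], lambda e: e["repo"])
#   == {"a": [{"repo": "a"}, {"repo": "a"}], "b": [{"repo": "b"}]}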

def unique_list(l):
    return sorted(list(set(l)))

def frac2str(a, b):
    if b == 0:
        return f"{'n/a':>9}"
    s = "%.1f" % (100 * a/b)
    return f"{s+'%':>9}"
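
# frac2str renders a ratio as a percentage, right-aligned in a 9-character column,
# e.g. frac2str(1, 3) == "    33.3%" and frac2str(0, 0) == "      n/a".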

def print_stats(gatherer):
    repos = unique_list([e["repo"] for e in gatherer.sigfiles + gatherer.langfiles + gatherer.modules])
    langs = unique_list([e["lang"] for e in gatherer.langfiles])

    sigf_part = partition(gatherer.sigfiles, lambda e: e["repo"])
    langf_part = partition(gatherer.langfiles, lambda e: e["repo"])
    symi_part = partition(gatherer.symis, lambda e: e["repo"])
    defi_part = partition(gatherer.defis, lambda e: (e["repo"], e["lang"]))
    trefi_part = partition(gatherer.trefis, lambda e: e["repo"])

    # table header: one coverage column per language found in the data
    print(f"{'repo':20}{'modules':>9}{'aligned':>9}{'symbols':>9}{'aligned':>9}{'trefis':>9}"+"".join([f"{lang:>9}" for lang in langs])+f"{'views':>9}")
    print("-"*(20+9+9+9+9+9+9+9*len(langs)))

    # one row per repository
    for repo in repos:
        suffix = ""
        aligned_symbols = 0
        symbols = 0
        if repo in symi_part:
            symbols = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo]]))
            aligned_symbols = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo] if e["align"] and e["align"] != "noalign"]))
        # verbalization coverage per language
        for lang in langs:
            if (repo, lang) not in defi_part:
                verbs = 0
            else:
                verbs = len(set([(e["mod_name"], e["name"]) for e in defi_part[(repo, lang)]]))
            if repo in symi_part:
                symbols_withverb = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo] if e["noverb"] != "all" and lang not in e["noverb"]]))
            else:
                symbols_withverb = 0
            suffix += frac2str(verbs, symbols_withverb)
        modsigs = 0
        gviewsigs = 0
        aligned_modsigs = 0
        if repo in sigf_part:
            modsigs = len([e for e in sigf_part[repo] if e['type'] == 'modsig'])
            aligned_modsigs = len([e for e in sigf_part[repo] if e['type'] == 'modsig' and e['align'] and e['align'] != "noalign"])
            gviewsigs = len([e for e in sigf_part[repo] if e['type'] == 'gviewsig'])
        trefis = 0
        if repo in trefi_part:
            trefis = len(trefi_part[repo])
        print(f"{repo:20}" +
              f"{modsigs:9}" + frac2str(aligned_modsigs, modsigs) +
              f"{symbols:9}" + frac2str(aligned_symbols, symbols) +
              f"{trefis:9}" +
              suffix +
              f"{gviewsigs:9}")

    # TOTAL row over all repositories
    print("-"*(20+9+9+9+9+9+9+9*len(langs)))
    suffix = ""
    symbols = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis]))
    aligned_symbols = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis if e["align"] and e["align"] != "noalign"]))
    for lang in langs:
        verbs = len(set([(e["mod_name"], e["name"]) for e in gatherer.defis if e["lang"] == lang]))
        symbols_withverb = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis if e["noverb"] != "all" and lang not in e["noverb"]]))
        suffix += frac2str(verbs, symbols_withverb)
    modsigs = len([e for e in gatherer.sigfiles if e['type'] == 'modsig'])
    aligned_modsigs = len([e for e in gatherer.sigfiles if e['type'] == 'modsig' and e["align"] and e["align"] != "noalign"])
    print(f"{'TOTAL':20}" +
          f"{modsigs:9}" + frac2str(aligned_modsigs, modsigs) +
          f"{symbols:9}" + frac2str(aligned_symbols, symbols) +
          f"{len(gatherer.trefis):9}" +
          suffix +
          f"{len([e for e in gatherer.sigfiles if e['type']=='gviewsig']):9}")

def create_csv(gatherer):
    repos = unique_list([e["repo"] for e in gatherer.sigfiles + gatherer.langfiles + gatherer.modules])
    langs = unique_list([e["lang"] for e in gatherer.langfiles])

    sigf_part = partition(gatherer.sigfiles, lambda e: e["repo"])
    langf_part = partition(gatherer.langfiles, lambda e: e["repo"])
    symi_part = partition(gatherer.symis, lambda e: e["repo"])
    defi_part = partition(gatherer.defis, lambda e: (e["repo"], e["lang"]))
    trefi_part = partition(gatherer.trefis, lambda e: e["repo"])

    with open("stats.csv", "w") as fp:
        fp.write("repo, modules, modules aligned, symbols, symbols aligned, total trefis, " +
                 ", ".join([f"coverage {l}" for l in langs]) + ", " +
                 ", ".join([f"synonymity {l}" for l in langs]) + ", views\n")
        # one CSV row per repository
        for repo in repos:
            coverages = []
            synonymity = []
            aligned_symbols = 0
            symbols = 0
            if repo in symi_part:
                symbols = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo]]))
                aligned_symbols = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo] if e["align"] and e["align"] != "noalign"]))
            for lang in langs:
                if (repo, lang) not in defi_part:
                    verbs = 0
                else:
                    verbs = len(set([(e["mod_name"], e["name"]) for e in defi_part[(repo, lang)]]))
                    verb_syns = len(set([(e["mod_name"], e["name"], e["string"]) for e in defi_part[(repo, lang)]]))
                if repo in symi_part:
                    symbols_withverb = len(set([(e["mod_name"], e["name"]) for e in symi_part[repo] if e["noverb"] != "all" and lang not in e["noverb"]]))
                else:
                    symbols_withverb = 0
                coverages += [str(verbs / symbols_withverb) if symbols_withverb > 0 else "n/a"]
                synonymity += [str(verb_syns / verbs) if verbs > 0 else "n/a"]
            modsigs = 0
            gviewsigs = 0
            aligned_modsigs = 0
            if repo in sigf_part:
                modsigs = len([e for e in sigf_part[repo] if e['type'] == 'modsig'])
                aligned_modsigs = len([e for e in sigf_part[repo] if e['type'] == 'modsig' and e['align'] and e['align'] != "noalign"])
                gviewsigs = len([e for e in sigf_part[repo] if e['type'] == 'gviewsig'])
            trefis = 0
            if repo in trefi_part:
                trefis = len(trefi_part[repo])
            fp.write(f"{repo}, {modsigs}, {aligned_modsigs / modsigs if modsigs else 'n/a'}, "
                     f"{symbols}, {aligned_symbols / symbols if symbols else 'n/a'}, "
                     f"{trefis}, {', '.join(coverages)}, {', '.join(synonymity)}, {gviewsigs}\n")

        # TOTAL row over all repositories
        symbols = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis]))
        aligned_symbols = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis if e["align"] and e["align"] != "noalign"]))
        coverages = []
        synonymity = []
        for lang in langs:
            verbs = len(set([(e["mod_name"], e["name"]) for e in gatherer.defis if e["lang"] == lang]))
            verb_syns = len(set([(e["mod_name"], e["name"], e["string"]) for e in gatherer.defis if e["lang"] == lang]))
            symbols_withverb = len(set([(e["mod_name"], e["name"]) for e in gatherer.symis if e["noverb"] != "all" and lang not in e["noverb"]]))
            coverages += [str(verbs / symbols_withverb) if symbols_withverb > 0 else "n/a"]
            synonymity += [str(verb_syns / verbs) if verbs > 0 else "n/a"]
        modsigs = len([e for e in gatherer.sigfiles if e['type'] == 'modsig'])
        aligned_modsigs = len([e for e in gatherer.sigfiles if e['type'] == 'modsig' and e["align"] and e["align"] != "noalign"])
        fp.write(f"TOTAL, {modsigs}, {aligned_modsigs / modsigs if modsigs else 'n/a'}, "
                 f"{symbols}, {aligned_symbols / symbols if symbols else 'n/a'}, "
                 f"{len(gatherer.trefis)}, {', '.join(coverages)}, {', '.join(synonymity)}, "
                 f"{len([e for e in gatherer.sigfiles if e['type']=='gviewsig'])}\n")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Script for printing SMGloM statistics",
                                     epilog="Example call: lmh_stats.py -v0 /path/to/MathHub/smglom")
    parser.add_argument("-v", "--verbosity", type=int, default=1, choices=range(4), help="the verbosity (default: 1)")
    parser.add_argument("-c", "--csv", action="store_true", help="generate a CSV table")
    parser.add_argument("DIRECTORY", nargs="+", help="git repo or higher level directory for which statistics are generated")
    args = parser.parse_args()

    if args.verbosity >= 2:
        print("GATHERING DATA\n")
    logger = harvest.SimpleLogger(args.verbosity)
    mathhub_dir = harvest.get_mathhub_dir(os.path.abspath(args.DIRECTORY[0]))
    ctx = harvest.HarvestContext(logger, harvest.DataGatherer(), mathhub_dir)
    for directory in args.DIRECTORY:
        harvest.gather_data_for_all_repos(directory, ctx)

    if args.verbosity >= 2 or logger.something_was_logged:
        print("\n\nSTATISTICS\n")
    print_stats(ctx.gatherer)

    if args.csv:
        create_csv(ctx.gatherer)
        print("\n\nCreated stats.csv")