-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats.py
95 lines (84 loc) · 2.85 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import requests
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import argparse
import numpy as np
from random import seed, sample
from os.path import join, splitext, split, exists
from os import makedirs, listdir
from shutil import rmtree
from hashlib import sha512
import logging
import imagehash
from utils import clean_ext
'''
python stats.py
'''
# Command-line options; both have defaults, so the script runs without arguments.
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", "-i", default='./synsets/silver_synsets.json')
# NOTE(review): --input_sample_file is parsed but never read in the visible code.
parser.add_argument("--input_sample_file", "-is", default='./samples/silver_sample_synsets.json')
args = parser.parse_args()

# Running totals, one triple per part of speech plus one overall triple:
# (number of synsets, number of gold images, number of images).
noun_synsets, noun_gold_images, noun_images = (0,) * 3
adj_synsets, adj_gold_images, adj_images = (0,) * 3
adv_synsets, adv_gold_images, adv_images = (0,) * 3
verb_synsets, verb_gold_images, verb_images = (0,) * 3
synsets, gold_images, images = (0,) * 3

# Number of images seen per (cleaned) file extension.
img_exts = {}

# Load the synset records; the context manager closes the file handle as soon
# as parsing finishes instead of leaving it to the garbage collector.
with open(args.input_file, 'r') as input_handle:
    in_ = json.load(input_handle)
# 1 NOUN
# 2 VERB
# 3 ADJECTIVE
# 4 ADVERB
# 5 ADJECTIVE SATELLITE
# Walk every synset record once, updating the overall totals, the histogram of
# image file extensions, and the totals of the synset's part of speech.
for synset in in_:
    synset_id = synset['id']
    g_image_len = len(synset['goldImages'])
    image_len = len(synset['images'])

    synsets += 1
    gold_images += g_image_len
    images += image_len

    # Count each image under the extension of its URL, as normalised by clean_ext.
    for image in synset['images']:
        _, ext = splitext(image['url'])
        ext = clean_ext(ext)
        img_exts[ext] = img_exts.get(ext, 0) + 1

    # The last character of the synset id selects the part of speech.
    pos = synset_id[-1:]
    if pos == 'n':
        noun_synsets += 1
        noun_gold_images += g_image_len
        noun_images += image_len
    elif pos == 'v':
        verb_synsets += 1
        verb_gold_images += g_image_len
        verb_images += image_len
    elif pos in ('a', 's'):
        # 'a' and 's' are both reported under "Adjective".
        adj_synsets += 1
        # Gold images go to the gold counter; they were previously added to
        # adj_images, leaving adj_gold_images at 0 and inflating adj_images.
        adj_gold_images += g_image_len
        adj_images += image_len
    elif pos == 'r':
        adv_synsets += 1
        # Same correction as for adjectives: count gold images as gold.
        adv_gold_images += g_image_len
        adv_images += image_len
    else:
        raise Exception('Unexpected POS: {}'.format(pos))
# Report: one line per part of speech (plus the overall totals), then the
# share of every image file extension, then the combined share of the four
# common image formats.
print('Statistics')
print('----------')
for s, g, i, n in [
    (noun_synsets, noun_gold_images, noun_images, 'Noun'), (verb_synsets, verb_gold_images, verb_images, 'Verb'),
    (adj_synsets, adj_gold_images, adj_images, 'Adjective'), (adv_synsets, adv_gold_images, adv_images, 'Adverb'),
    (synsets, gold_images, images, 'All'),
]:
    # Guard on the divisor itself: a category without synsets has no averages.
    if s == 0:
        q_g = 0.
        q = 0.
    else:
        q_g = g / s
        q = i / s
    print('{} synsets: {}\t{} images (gold / all): {} / {}\t\t{} images per synset (gold / all): {:.2f} / {:.2f}'.format(n, s, n, g, i, n, q_g, q))
print()

# Most frequent extensions first. The body only runs when at least one image
# was counted, so `images` is non-zero here.
for e, c in sorted(img_exts.items(), key=lambda x: x[1], reverse=True):
    print('{} files showed up {} time(s) in all images making up {:.3f}%'.format(e, c, c*100./images))
print()

# An extension that never occurred counts as 0 instead of raising KeyError,
# and an input without any images reports 0% instead of dividing by zero.
common_total = sum(img_exts.get(common_ext, 0) for common_ext in ('.jpg', '.png', '.svg', '.gif'))
common_share = common_total * 100. / images if images else 0.
print('jpgs, pngs, svgs and gifs made up {:.3f}% of the dataset'.format(common_share))