-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.py
104 lines (78 loc) · 3.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import requests
from bs4 import BeautifulSoup
from csv import DictReader
from pathlib import Path
import re
import uuid
import json
import db_upload
import make_embeddings_upload
# TODO: this is definitely not efficient as it generates vectors for all the images,
# not just the ones with geo data. To fix, copy only the geo-tagged originals from
# payloads_non_list into the directory used by the vector-making step.
# TODO: there is also repeated code everywhere, and hard-coded values for too many things.
image_path = Path("images_000")
my_file = Path('train_attribution_geo.json')
# Maps image id -> payload dict (picture, filename, url, location). Consumed below.
payloads_non_list = {}
if my_file.is_file():
    # Cached metadata from a previous run exists: load it instead of re-scraping.
    with open('train_attribution_geo.json') as incoming_json:
        payloads_non_list = json.load(incoming_json)
    print("We already have metadata load the data into python non-list")
else:
    metadata_path = "train_attribution.csv"
    # Index images on disk by the trailing 16-character id in the .jpg filename.
    image_names = {}
    for path in image_path.rglob('*.*'):
        match = re.search(r'.*(\w{16})\.jpg$', str(path))
        # Guard: rglob('*.*') also yields non-jpg files; skip them instead of crashing.
        if match is not None:
            image_names[match.group(1)] = path
    # 'with' guarantees the CSV handle is closed even if a row raises.
    with open(metadata_path, "r", encoding='utf-8') as csvfile:
        for line in DictReader(csvfile):
            if line['id'] not in image_names:
                continue
            try:
                # Scrape the attribution page for an OSM map link carrying coordinates.
                page = requests.get(line['url'])
                html = BeautifulSoup(page.text, features="html.parser")
                our_tag = html.find('a', {"data-style": "osm-intl"})
                if our_tag is not None and "data-lat" in our_tag.attrs and "data-lon" in our_tag.attrs:
                    lat = float(our_tag.attrs["data-lat"])
                    lon = float(our_tag.attrs["data-lon"])
                    # We have our payload at this point
                    print("found one " + line['id'] + " : " + line['url'] + " coords: " + str(lat) + ", " + str(lon))
                    payloads_non_list[line['id']] = {"picture": line['id'], "filename": str(image_names[line['id']]), "url": line['url'], "location": {"lon": lon, "lat": lat}}
            except Exception:
                # Best-effort scrape: a failed request or malformed coordinate for one
                # image should not abort the whole run. (Was a bare 'except:', which
                # also swallowed KeyboardInterrupt/SystemExit.)
                print("Threw an exception on: " + line['id'])
    # Write our payloads out to file so subsequent runs skip the scraping step.
    with open('train_attribution_geo.json', 'w') as out_file:
        json.dump(payloads_non_list, out_file, sort_keys=True, indent=4,
                  ensure_ascii=False)
# Now create our vectors; get_features() returns {image_id: numpy vector}.
vectors_non_list = make_embeddings_upload.get_features()
ids, vectors, payloads = [], [], []
# Parallel lists feed the uploader; id_payload_vectors accumulates the combined
# {id, payload, vector} entries for the JSON output file. Iterating one dict
# keeps all of them aligned (this is the "sorting problem" workaround).
id_payload_vectors = []
for key, payload in payloads_non_list.items():
    payloads.append(payload)
    vector = vectors_non_list[key]
    vectors.append(vector)
    # Deterministic UUID derived from the source URL, so re-runs upsert the
    # same points instead of duplicating them.
    id_string = str(uuid.uuid3(uuid.NAMESPACE_DNS, payload["url"]))
    ids.append(id_string)
    id_payload_vectors.append({
        "id": id_string,
        "payload": payload,
        # astype(float) + list() makes the numpy vector JSON-serializable.
        "vector": list(vector.astype(float)),
    })
# Write ids, payloads, and vectors out to a JSON file.
with open("id_payload_vector.json", 'w') as full_file:
    json.dump(id_payload_vectors, full_file, sort_keys=True, indent=4,
              ensure_ascii=False)
# Now insert into the collection (512-dim vectors, collection "images").
uploader = db_upload.DBUpload(512, "images")
uploader.upsert_vectors(ids, vectors, payloads)
print("finished")