-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_resources.py
119 lines (95 loc) · 3.98 KB
/
retrieve_resources.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Standard library imports
from pathlib import Path
import json
import os
import string
# Third-party library imports
import requests
import unidecode
def get_datasets_map() -> dict:
    """Load the mapping of repo name to datapackage.json URL.

    The mapping is read from ``datasets.json`` in the current working
    directory and returned exactly as the JSON parser produced it.
    """
    with open("datasets.json") as source:
        return json.load(source)
def convert_string_to_proper_title(title: str) -> str:
    """Turn an arbitrary dataset name into a dash-separated repo title.

    The name is lowercased and passed through ``unidecode`` to get rid of
    diacritics. ASCII letters, digits and dashes are kept; spaces, dots and
    underscores become dashes; every other character is dropped. Runs of
    dashes are collapsed into one and dashes at either end are removed.
    """
    keep = string.ascii_letters + string.digits + "-"
    to_dash = (" ", ".", "_")
    ascii_title = unidecode.unidecode(title.lower())  # no more diacritics
    slug = "".join(
        ch if ch in keep else "-"
        for ch in ascii_title
        if ch in keep or ch in to_dash
    )
    # Splitting on dashes and discarding the empty segments collapses
    # repeated dashes and trims leading/trailing ones in a single pass.
    return "-".join(segment for segment in slug.split("-") if segment)
def generate_repo_titles_from_dataset_map(datasets_map: dict) -> dict:
    """Return a copy of `datasets_map` keyed by proper repo titles.

    Every key is run through `convert_string_to_proper_title`; the values
    are carried over unchanged. The resulting titles are the ones used to
    create the repos on GitHub. When two names convert to the same title,
    the value of the later one wins.
    """
    return {
        convert_string_to_proper_title(raw_name): package_url
        for raw_name, package_url in datasets_map.items()
    }
def generate_repos_empty_list_resources(datasets_map: dict) -> dict:
    """Build a dict with one fresh, empty list per repo name.

    The keys are the keys of `datasets_map`, in the same order; each value
    is its own new list (the lists are not shared between keys).
    """
    return {repo_name: [] for repo_name in datasets_map}
def download_data_packages(datasets_map: dict):
    """Download the datapackage.json file for each repo.

    For every ``repo_name -> package_url`` pair in `datasets_map`, the URL
    is fetched and the response text is saved as
    ``data/<repo_name>/datapackage.json``. The target directory, including
    the top-level ``data`` folder, is created when it does not exist yet.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so an error page is never stored as if it were a data package.
    """
    for repo_name, package_url in datasets_map.items():
        print(f"[{repo_name}] Retrieving {package_url}...")
        repo_dir = Path("data/" + repo_name)
        # parents=True: on a fresh checkout there is no "data" directory
        # yet, and a plain os.mkdir() fails with FileNotFoundError there.
        repo_dir.mkdir(parents=True, exist_ok=True)
        repo_file = repo_dir / "datapackage.json"
        r = requests.get(package_url)
        # Fail loudly on 4xx/5xx instead of writing the error body to disk.
        r.raise_for_status()
        with open(repo_file, "w") as f:
            f.write(r.text)
# NOTE: The script was hanging without chunking big files.
# This solved the issue.
def download_file(url: str, path: str) -> str:
    """Stream `url` to `path` in 8 KiB chunks and return `path`.

    The body is first written to ``<path>.part`` and only moved onto `path`
    once the whole response has been received. An interrupted download
    therefore never leaves a truncated, non-empty file at `path` that a
    later run could mistake for a finished one.

    `path` may be a string or a ``pathlib.Path``; it is returned unchanged.
    Raises ``requests.HTTPError`` when the server answers with an error
    status.
    """
    partial_path = f"{path}.part"
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    # Reached only when every chunk was written without an exception.
    os.replace(partial_path, path)
    return path
def get_resources(datasets_map: dict) -> dict:
    """Download all the resources for each repo.

    For every repo in `datasets_map`, the previously downloaded
    ``data/<repo_name>/datapackage.json`` is read and each entry of its
    ``"resources"`` list is fetched from the directory of the package URL
    into ``data/<repo_name>/<resource path>``. Resources that already exist
    locally with a non-zero size are not downloaded again, and resources
    whose URL contains "test" are skipped.

    Returns a dict mapping each repo name to the list of local paths
    (``pathlib.Path``) of its resources that are on disk after the call:
    the ones that were already present plus the ones just downloaded.
    """
    package_file_name = "datapackage.json"
    repos_list_resources = generate_repos_empty_list_resources(datasets_map)
    for repo_name, local_resources in repos_list_resources.items():
        repo_path = Path("data/" + repo_name)
        repo_file = repo_path / package_file_name
        repo_package_url = datasets_map[repo_name]
        # Remove the file name as a real suffix. str.rstrip() takes a set
        # of characters, so it keeps eating any trailing d/a/t/p/c/k/g/e/
        # ./j/s/o/n and corrupts URLs that do not end in
        # "/datapackage.json".
        if repo_package_url.endswith(package_file_name):
            repo_package_url = repo_package_url[: -len(package_file_name)]
        with open(repo_file) as f:
            json_content = json.load(f)
        for resource in json_content["resources"]:
            resource_path = resource["path"]
            resource_url = repo_package_url + resource_path
            resource_local_path = repo_path / resource_path
            if (
                resource_local_path.exists()
                and resource_local_path.stat().st_size != 0
            ):
                # Already downloaded on a previous run.
                local_resources.append(resource_local_path)
                continue
            # NOTE(review): this matches "test" anywhere in the URL, so a
            # URL containing e.g. "latest" is skipped too - confirm intent.
            if "test" in resource_url.lower():
                continue
            print(f"[{resource_local_path}] Retrieving {resource_url}...")
            # Create whatever directory the resource lives in, not just a
            # hard-coded "data" folder, so nested paths can be written.
            resource_local_path.parent.mkdir(parents=True, exist_ok=True)
            download_file(url=resource_url, path=resource_local_path)
            local_resources.append(resource_local_path)
    return repos_list_resources
if __name__ == "__main__":
    # Read the configured datasets and key them by their GitHub repo title,
    # then fetch every data package followed by the resources it lists.
    DATASETS_MAP = generate_repo_titles_from_dataset_map(get_datasets_map())
    download_data_packages(DATASETS_MAP)
    get_resources(DATASETS_MAP)