Check Last-Modified #152

Open · wants to merge 2 commits into main
9 changes: 8 additions & 1 deletion data/ares/main.py
@@ -3,10 +3,17 @@
 import os
 import tarfile
 from tempfile import NamedTemporaryFile
+from typing import List
 from urllib.request import urlretrieve

 import lxml.etree

+URL_BULK = "https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz"
+
+
+def resources() -> List[str]:
+    return [URL_BULK]
+
+
 def attr(root, parts, nsmap):
     ret = []
@@ -65,7 +72,7 @@ def organi(root, ico, nsmap):

 def main(outdir: str, partial: bool = False):
     with NamedTemporaryFile() as vfn:
-        urlretrieve("https://wwwinfo.mfcr.cz/ares/ares_vreo_all.tar.gz", vfn.name)
+        urlretrieve(URL_BULK, vfn.name)
         with tarfile.open(vfn.name, "r:gz") as tf, open(
             os.path.join(outdir, "firmy.csv"), "w", encoding="utf8"
         ) as ud, open(
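
A quick usage sketch (not part of the diff itself) of the contract this PR introduces: each data module now exposes a resources() function, so a caller can enumerate the source URLs without running the scraper:

    from data.ares.main import resources

    for url in resources():
        print(url)  # -> the URL_BULK tarball above
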
22 changes: 14 additions & 8 deletions data/datovky/main.py
@@ -2,10 +2,24 @@
 import gzip
 import json
 import os
+from typing import List
 from urllib.request import urlopen

 import lxml.etree

+BASE_URL = "https://www.mojedatovaschranka.cz/sds/datafile.do?format=xml&service="
+urls = {
+    "po": BASE_URL + "seznam_ds_po",
+    "pfo": BASE_URL + "seznam_ds_pfo",
+    "fo": BASE_URL + "seznam_ds_fo",
+    "ovm": BASE_URL + "seznam_ds_ovm",
+}
+
+
+def resources() -> List[str]:
+    return list(urls.values())
+
+
 mapping = {
     "id": "id",
     "type": "type",
@@ -89,14 +103,6 @@ def parse_xml(source, target_fn, partial):


 def main(outdir: str, partial: bool = False):
-    BASE_URL = "https://www.mojedatovaschranka.cz/sds/datafile.do?format=xml&service="
-    urls = {
-        "po": BASE_URL + "seznam_ds_po",
-        "pfo": BASE_URL + "seznam_ds_pfo",
-        "fo": BASE_URL + "seznam_ds_fo",
-        "ovm": BASE_URL + "seznam_ds_ovm",
-    }
-
     tdir = os.path.join(outdir, "datovky")
     os.makedirs(tdir, exist_ok=True)
     for ds, url in urls.items():
14 changes: 10 additions & 4 deletions data/dotinfo/main.py
@@ -5,8 +5,17 @@
 import zipfile
 from datetime import datetime
 from tempfile import TemporaryDirectory
+from typing import List
 from urllib.request import urlretrieve

+URL_DUMP = "https://data.mfcr.cz/sites/default/files/DotInfo_report_29_01_2020.zip"
+
+
+def resources() -> List[str]:
+    return []  # TODO(PR): skipping for now due to TLS issues
+    # return [URL_DUMP]
+
+
 header = {
     "Evidenční číslo dotace": "evidencni_cislo_dotace",
     "Identifikator dotace": "identifikator_dotace",
@@ -26,10 +35,7 @@ def main(outdir: str, partial: bool = False):
     ssl._create_default_https_context = ssl._create_unverified_context
     with TemporaryDirectory() as tmpdir:
         rawpath = os.path.join(tmpdir, "raw.zip")
-        urlretrieve(
-            "https://data.mfcr.cz/sites/default/files/DotInfo_report_29_01_2020.zip",
-            rawpath,
-        )
+        urlretrieve(URL_DUMP, rawpath)

         with zipfile.ZipFile(rawpath) as zf, zf.open(
             "DotInfo_report_29_01_2020.csv"
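
The ssl monkeypatch above disables certificate verification for every later HTTPS request in the process. A minimal sketch of an alternative, assuming only this one download needs it: fetch_unverified is a hypothetical helper (it swaps urlretrieve for urlopen, which accepts a context argument), not something this PR proposes.

    import ssl
    from urllib.request import urlopen

    def fetch_unverified(url: str, dest: str) -> None:
        # the unverified context is scoped to this single request only
        ctx = ssl._create_unverified_context()
        with urlopen(url, context=ctx) as resp, open(dest, "wb") as f:
            f.write(resp.read())
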
6 changes: 6 additions & 0 deletions data/iissp/main.py
@@ -2,6 +2,7 @@
 import gzip
 import os
 from datetime import date
+from typing import List
 from urllib.request import Request, urlopen

 import lxml.etree
@@ -10,6 +11,11 @@
 url = "https://monitor.statnipokladna.cz/data/xml/ucjed.xml"
 table_name = "ucetni_jednotky"

+
+def resources() -> List[str]:
+    return [url]
+
+
 # the XSD doesn't have everything, dafuq
 cols = [
     "ucjed_id",
6 changes: 3 additions & 3 deletions data/justice/main.py
@@ -76,9 +76,9 @@ def main(outdir: str, partial: bool = False):
     # can't filter this??? so we'll just do it client-side
     url_pl = "https://dataor.justice.cz/api/3/action/package_list"

-    r = urlopen(url_pl, timeout=HTTP_TIMEOUT)
-    data = json.load(r)
-    assert data["success"]
+    with urlopen(url_pl, timeout=HTTP_TIMEOUT) as r:
+        data = json.load(r)
+        assert data["success"]

     dss = [ds for ds in data["result"] if "-full-" in ds]
     print(f"{len(dss)} datasets in total, but filtering to this year's only")
8 changes: 8 additions & 0 deletions data/res/main.py
@@ -1,13 +1,21 @@
 import gzip
 import os
 import shutil
+from typing import List
 from urllib.request import Request, urlopen

 DATA = ("https://opendata.czso.cz/data/od_org03/res_data.csv", "subjekty.csv")
 NACE = ("https://opendata.czso.cz/data/od_org03/res_pf_nace.csv", "nace.csv")
 HTTP_TIMEOUT = 30


+def resources() -> List[str]:
+    return [
+        DATA[0],
+        NACE[0],
+    ]
+
+
 def download_gzipped(url: str, filename: str):
     req = Request(url)
     req.add_header("Accept-Encoding", "gzip")
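
The body of download_gzipped is collapsed in this view; the sketch below is one plausible shape for such a helper, not the author's implementation. It streams the response to disk and decompresses only when the server actually honoured the gzip request (download_gzipped_sketch and the inline timeout are assumptions).

    import gzip
    import shutil
    from urllib.request import Request, urlopen

    def download_gzipped_sketch(url: str, filename: str) -> None:
        req = Request(url)
        req.add_header("Accept-Encoding", "gzip")
        with urlopen(req, timeout=30) as resp, open(filename, "wb") as fw:
            if resp.headers.get("Content-Encoding") == "gzip":
                # server sent gzip; decompress while copying to disk
                with gzip.open(resp, "rb") as gz:
                    shutil.copyfileobj(gz, fw)
            else:
                # server ignored Accept-Encoding; copy the body through as-is
                shutil.copyfileobj(resp, fw)
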
6 changes: 6 additions & 0 deletions data/szif/main.py
@@ -3,6 +3,7 @@
 import shutil
 from contextlib import closing
 from tempfile import NamedTemporaryFile
+from typing import List
 from urllib.request import urlopen
 from zipfile import ZipFile

@@ -13,12 +14,17 @@
     "dokumenty_ke_stazeni%2Fpkp%2Fspd%2Fopendata%2F"
 )
 urls = {
+    2020: BASE_URL + "1622192829773.zip",
     2019: BASE_URL + "1590753721920.zip",
     2018: BASE_URL + "1563197121858.zip",
     2017: BASE_URL + "1563197147275.zip",
 }


+def resources() -> List[str]:
+    return list(urls.values())
+
+
 def main(outdir: str, partial: bool = False):
     id_prijemce = 1

7 changes: 7 additions & 0 deletions data/udhpsh/main.py
@@ -1,6 +1,7 @@
 import csv
 import json
 import os
+from typing import List
 from urllib.request import urlopen

 HTTP_TIMEOUT = 60
@@ -10,9 +11,15 @@
     "2018": "https://zpravy.udhpsh.cz/zpravy/vfz2018.json",
     "2019": "https://zpravy.udhpsh.cz/zpravy/vfz2019.json",
     "2020": "https://zpravy.udhpsh.cz/zpravy/vfz2020.json",
+    "2021": "https://zpravy.udhpsh.cz/zpravy/vfz2021.json",
 }
 years = sorted(indices.keys())

+
+def resources() -> List[str]:
+    return list(indices.values())
+
+
 mappings = {
     "penizefo": {
         "date": "datum",
41 changes: 41 additions & 0 deletions resource_check.py
@@ -0,0 +1,41 @@
+from importlib import import_module
+from urllib.request import urlopen
+
+modules = [
+    "ares",
+    "res",
+    "udhpsh",
+    "cssz",
+    "datovky",
+    "dotinfo",
+    "eufondy",
+    "iissp",
+    "justice",
+    "psp",
+    "steno",
+    "smlouvy",
+    "szif",
+    "upv",
+    "wikidata",
+    "zakazky",
+    "volby",
+    "icij",
+]
+
+# TODO(PR): docs
+if __name__ == "__main__":
+    # TODO(PR): issues:
+    # - udhpsh and a few others (no Last-Modified)
+    # - justice (a huge pile of datafiles)
+
+    for module in modules:
+        # TODO(PR): this triggers the imports in main.py, so maybe the URLs
+        # should be split out somewhere else (or this should run in a venv)
+        try:
+            resources = import_module(f"data.{module}.main").resources()
+        except AttributeError:
+            print(module, "not found")
+            continue
+        for resource in resources:
+            with urlopen(resource) as req:
+                print(module, "\t", resource, "\t", req.headers.get("Last-Modified"))
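
As written, the checker downloads each resource in full just to read one header, and per the TODO some hosts (udhpsh among them) send no Last-Modified at all. A possible refinement, sketched under the assumption that these servers answer HEAD requests: fetch headers only and fall back to ETag as a change marker. last_seen_marker is a hypothetical helper, not part of the PR.

    from urllib.request import Request, urlopen

    def last_seen_marker(url: str) -> str:
        # HEAD returns the headers without the body; not every server supports it
        req = Request(url, method="HEAD")
        with urlopen(req, timeout=30) as resp:
            return (
                resp.headers.get("Last-Modified")
                or resp.headers.get("ETag")
                or "n/a"
            )
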