From 8fabc822c4329b7a6dc6a8ca3fcd39c5a9555c9d Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 24 Jul 2024 23:29:00 -0400 Subject: [PATCH] fix: handle weird wordpress URLs --- .github/workflows/analyse.yml | 2 +- alexi/download.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/analyse.yml b/.github/workflows/analyse.yml index b5f2a0b..081dfd9 100644 --- a/.github/workflows/analyse.yml +++ b/.github/workflows/analyse.yml @@ -52,7 +52,7 @@ jobs: fi done done - alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/index.html -o download/vsadm --all-pdf-links + alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/ -o download/vsadm --all-pdf-links - name: Extract run: | alexi -v extract -m download/index.json download/*.pdf diff --git a/alexi/download.py b/alexi/download.py index cfa3a2b..9403a64 100644 --- a/alexi/download.py +++ b/alexi/download.py @@ -57,6 +57,7 @@ def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: def main(args: argparse.Namespace) -> None: u = urllib.parse.urlparse(args.url) LOGGER.info("Downloading %s", args.url) + args.outdir.mkdir(parents=True, exist_ok=True) try: subprocess.run( [ @@ -75,7 +76,10 @@ def main(args: argparse.Namespace) -> None: raise excludes = [re.compile(r) for r in args.exclude] paths = [] - with open(args.outdir / Path(u.path).name) as infh: + index = args.outdir / Path(u.path).name + if not index.exists(): + index = args.outdir / "index.html" + with open(index) as infh: soup = BeautifulSoup(infh, "lxml") if args.all_pdf_links: for a in soup.find_all("a"):