Skip to content

Commit

Permalink
fix: handle weird wordpress URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Jul 25, 2024
1 parent 5ca849c commit 8fabc82
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/analyse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
fi
done
done
- alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/index.html -o download/vsadm --all-pdf-links
+ alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/ -o download/vsadm --all-pdf-links
- name: Extract
run: |
alexi -v extract -m download/index.json download/*.pdf
Expand Down
6 changes: 5 additions & 1 deletion alexi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
def main(args: argparse.Namespace) -> None:
u = urllib.parse.urlparse(args.url)
LOGGER.info("Downloading %s", args.url)
+ args.outdir.mkdir(parents=True, exist_ok=True)
try:
subprocess.run(
[
Expand All @@ -75,7 +76,10 @@ def main(args: argparse.Namespace) -> None:
raise
excludes = [re.compile(r) for r in args.exclude]
paths = []
- with open(args.outdir / Path(u.path).name) as infh:
+ index = args.outdir / Path(u.path).name
+ if not index.exists():
+ index = args.outdir / "index.html"
+ with open(index) as infh:
soup = BeautifulSoup(infh, "lxml")
if args.all_pdf_links:
for a in soup.find_all("a"):
Expand Down

0 comments on commit 8fabc82

Please sign in to comment.