Skip to content

Commit

Permalink
Refactored hathi_images command & added unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
laurejt committed Nov 13, 2024
1 parent 3776d44 commit 85bc6b1
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 79 deletions.
234 changes: 155 additions & 79 deletions ppa/archive/management/commands/hathi_images.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
"""
**hathi_images** is a custom manage command for downloading both full-size
and thumbnail page images for a list of HathiTrust volumes.
"""
import argparse
from collections import Counter
from collections.abc import Iterable
import requests
from pathlib import Path
from time import sleep
from typing import Self

import progressbar
from django.core.management.base import BaseCommand, CommandError
Expand All @@ -11,6 +19,43 @@
from ppa.archive.templatetags.ppa_tags import page_image_url


class DownloadStats:
ACTION_TYPES = {"fetch", "skip"}
def __init__(self):
# Stats for full size images
self.full = Counter()
# Stats for thumbnail images
self.thumbnail = Counter()

def _log_action(self, image_type: str, action: str) -> None:
if action not in self.ACTION_TYPES:
raise ValueError(f"Unknown action type '{action}'")
if image_type == "full":
self.full[action] += 1
elif image_type == "thumbnail":
self.thumbnail[action] += 1
else:
raise ValueError(f"Unknown image type '{image_type}'")

def log_download(self, image_type: str) -> None:
self._log_action(image_type, "fetch")

def log_skip(self, image_type: str) -> None:
self._log_action(image_type, "skip")

Check warning on line 44 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L44

Added line #L44 was not covered by tests

def update(self, other: Self) -> None:
self.full.update(other.full)
self.thumbnail.update(other.thumbnail)

def get_report(self) -> str:
return (
f"Fetched {self.full['fetch']} images & "
f"{self.thumbnail['fetch']} thumbnails; "
f"Skipped {self.full['skip']} images & "
f"{self.thumbnail['skip']} thumbnails"
)


class Command(BaseCommand):
"""
Download HathiTrust page image data via image server
Expand All @@ -21,46 +66,128 @@ class Command(BaseCommand):
#: normal verbosity level
v_normal = 1
verbosity = v_normal
#: crawl delay (in seconds)
crawl_delay=1

# Argument parsing
def add_arguments(self, parser):
    """
    Configure additional CLI arguments

    Registers the required positional output directory plus optional
    settings for volume selection, crawl delay, image widths and
    progress-bar display.
    """
    parser.add_argument(
        "output_dir",
        type=Path,
        help="Top-level output directory",
    )
    parser.add_argument(
        "--htids",
        # when the flag is given, at least one id must follow
        nargs="+",
        help="Optional list of HathiTrust ids (by default, downloads images for all public HathiTrust volumes)",
    )
    parser.add_argument(
        "--crawl-delay",
        type=int,
        help="Delay to be applied between each download in seconds. Default: 1",
        default=1,
    )
    parser.add_argument(
        "--image-width",
        type=int,
        help="Width for full-size images in pixels. Default: 800",
        default=800,
    )
    parser.add_argument(
        "--thumbnail-width",
        type=int,
        help="Width for thumbnail images in pixels. Must be at most 250 pixels. Default: 250",
        default=250,
    )
    parser.add_argument(
        "--progress",
        # provides both --progress and --no-progress
        action=argparse.BooleanOptionalAction,
        help="Display progress bars to track download progress",
        default=True,
    )
def download_image(self, page_url: str, out_file: Path) -> bool:
    """
    Fetch the image at ``page_url`` and save it to ``out_file``.

    The file is only written when the server responds with an OK status.
    On any other status a warning is written to stdout, but only when
    verbosity is above normal. The crawl delay is applied after every
    request, successful or not.

    :returns: True if the image was fetched and written, False otherwise.
    """
    response = requests.get(page_url)
    success = False
    if response.status_code == requests.codes.ok:
        with out_file.open(mode="wb") as writer:
            writer.write(response.content)
        success = True
    elif self.verbosity > self.v_normal:
        # stdout is an output wrapper, not a callable; use its write method
        self.stdout.write(f"Warning: Failed to fetch image {out_file.name}")
    # Apply crawl delay after request
    sleep(self.crawl_delay)
    return success

Check warning on line 122 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L121-L122

Added lines #L121 - L122 were not covered by tests


def download_volume_images(self, vol_id: str, page_range: Iterable) -> DownloadStats:
    """
    Download any missing page images for a single volume.

    For every page number in ``page_range`` a thumbnail and then a
    full-size image are fetched, unless the target file already exists.
    Full-size images are stored in the volume directory and thumbnails in
    its ``thumbnails`` subdirectory; both are created as needed.

    :param vol_id: HathiTrust volume id
    :param page_range: page numbers to fetch; must also support ``len()``
        when progress bars are enabled
    :returns: fetch / skip counts for this volume
    """
    # Determine output volume & thumbnail directories (create as needed)
    vol_dir = self.output_dir / get_vol_dir(vol_id)
    vol_dir.mkdir(parents=True, exist_ok=True)
    thumbnail_dir = vol_dir / "thumbnails"
    thumbnail_dir.mkdir(exist_ok=True)

    # Get filename-friendly version of htid
    clean_htid = encode_htid(vol_id)

    # Setup volume-level progress bar
    volume_progress = None
    if self.show_progress:
        volume_progress = progressbar.ProgressBar(
            line_offset=1,
            redirect_stdout=True,
            max_value=len(page_range),
            max_error=False,
        )
        volume_progress.start()

    # (image type, destination directory, requested width);
    # thumbnails are handled before full-size images for each page
    targets = (
        ("thumbnail", thumbnail_dir, self.thumbnail_width),
        ("full", vol_dir, self.full_width),
    )

    # Fetch images
    stats = DownloadStats()
    for page_num in page_range:
        image_name = f"{clean_htid}.{page_num:08d}.jpg"

        for image_type, image_dir, width in targets:
            image_path = image_dir / image_name
            # Only fetch when the file does not exist yet
            if image_path.is_file():
                stats.log_skip(image_type)
                continue
            image_url = page_image_url(vol_id, page_num, width)
            self.download_image(image_url, image_path)
            # TODO: Should we log something different if the download fails?
            # For now every attempted download is counted as a fetch.
            stats.log_download(image_type)

        # Update volume-specific progress bar
        if volume_progress:
            volume_progress.increment()

    # Finish volume-specific progress bar
    if volume_progress:
        volume_progress.finish()
    return stats

Check warning on line 173 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L171-L173

Added lines #L171 - L173 were not covered by tests


def handle(self, *args, **kwargs):
self.verbosity = kwargs.get("verbosity", self.v_normal)
self.options = kwargs

# validate output directory
if not kwargs["out"]:
raise CommandError("An output directory must be specified")
output_dir = kwargs["out"]
if not output_dir.is_dir():
self.output_dir = kwargs["output_dir"]
self.crawl_delay = kwargs["crawl_delay"]
self.full_width = kwargs["image_width"]
self.thumbnail_width = kwargs["thumbnail_width"]
self.verbosity = kwargs.get("verbosity", self.verbosity)
self.show_progress = kwargs["progress"]

Check warning on line 182 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L177-L182

Added lines #L177 - L182 were not covered by tests

# Validate input arguments
if not self.output_dir.is_dir():
raise CommandError(

Check warning on line 186 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L185-L186

Added lines #L185 - L186 were not covered by tests
f"Output directory '{output_dir}' does not exist or is not a directory"
f"Output directory '{self.output_dir}' does not exist or is not a directory"
)
if self.thumbnail_width > 250:
raise CommandError(f"Thumbnail width cannot be more than 250 pixels")

Check warning on line 190 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L189-L190

Added lines #L189 - L190 were not covered by tests

# use ids specified via command line when present
htids = kwargs.get("htids", [])

Check warning on line 193 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L193

Added line #L193 was not covered by tests
Expand All @@ -82,84 +209,33 @@ def handle(self, *args, **kwargs):
self.stdout.write("No records to download; stopping")
return

Check warning on line 210 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L208-L210

Added lines #L208 - L210 were not covered by tests

self.stdout.write(

Check warning on line 212 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L212

Added line #L212 was not covered by tests
f"Downloading images for {digworks.count()} record{pluralize(digworks)}"
)

# setup main progress bar
overall_progress = None
if self.options["progress"]:
if self.show_progress:
overall_progress = progressbar.ProgressBar(

Check warning on line 219 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L217-L219

Added lines #L217 - L219 were not covered by tests
redirect_stdout=True, max_value=digworks.count(), max_error=False
line_offset=0, redirect_stdout=True, max_value=digworks.count(), max_error=False
)
overall_progress.start()

Check warning on line 222 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L222

Added line #L222 was not covered by tests

self.stdout.write(
f"Downloading images for {digworks.count()} record{pluralize(digworks)}"
)

overall_stats = DownloadStats()
for digwork in digworks:
vol_id = digwork.source_id

Check warning on line 226 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L224-L226

Added lines #L224 - L226 were not covered by tests

# Determine output volume & thumbnail directories (create as needed)
vol_dir = output_dir / get_vol_dir(vol_id)
vol_dir.mkdir(parents=True, exist_ok=True)
thumbnail_dir = vol_dir / "thumbnails"
thumbnail_dir.mkdir(exist_ok=True)

# Get filename-friendly version of htid
clean_htid = encode_htid(vol_id)

# Determine page range
if digwork.item_type == DigitizedWork.FULL:
page_range = range(1, digwork.page_count+1)

Check warning on line 229 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L228-L229

Added lines #L228 - L229 were not covered by tests
else:
page_range = digwork.page_span

Check warning on line 231 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L231

Added line #L231 was not covered by tests

# Setup volume-level progress bar
volume_progress = None
if self.options["progress"]:
volume_progress = progressbar.ProgressBar(
redirect_stdout=True, max_value=len(page_range), max_error=False
)
volume_progress.start()

# Fetch images
stats = {
"image": {"fetch": 0, "skip": 0},
"thumbnail": {"fetch": 0, "skip": 0}
}
for page_num in page_range:
image_name = f"{clean_htid}.{page_num:08d}.jpg"

# Fetch thumbnail if file does not exist
page_thumbnail = thumbnail_dir / image_name
if not page_thumbnail.is_file():
thumbnail_url = page_image_url(vol_id, page_num, 250)
self.download_image(thumbnail_url, page_thumbnail)
stats["thumbnail"]["fetch"] += 1
else:
stats["thumbnail"]["skip"] += 1

# Fetch "full" image if file does not exist
page_image = vol_dir / image_name
if not page_image.is_file():
image_url = page_image_url(vol_id, page_num, 800)
#self.download_image(image_url, page_image)
stats["image"]["fetch"] += 1
else:
stats["image"]["skip"] += 1

# Update volume-specific progress bar
if volume_progress:
volume_progress.increment()
# Finish volume-specific progress bar
if volume_progress:
volume_progress.finish()
self.stdout.write(
f"{vol_id}: Fetched {stats['image']['fetch']} images & "
f"{stats['thumbnail']['fetch']} thumbnails; "
f"Skipped {stats['image']['skip']} images & "
f"{stats['thumbnail']['skip']} thumbnails"
)
vol_stats = self.download_volume_images(vol_id, page_range)
overall_stats.update(vol_stats)

Check warning on line 234 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L233-L234

Added lines #L233 - L234 were not covered by tests
# Update overall progress bar
if overall_progress:
overall_progress.increment()
if overall_progress:
overall_progress.finish()
self.stdout.write("\n\n") # To avoid overwriting progress bars
self.stdout.write(self.style.SUCCESS(overall_stats.get_report()))

Check warning on line 241 in ppa/archive/management/commands/hathi_images.py

View check run for this annotation

Codecov / codecov/patch

ppa/archive/management/commands/hathi_images.py#L236-L241

Added lines #L236 - L241 were not covered by tests
98 changes: 98 additions & 0 deletions ppa/archive/tests/test_hathi_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from unittest.mock import call, patch

import pytest
import requests


from ppa.archive.templatetags.ppa_tags import page_image_url
from ppa.archive.management.commands.hathi_images import (
DownloadStats,
)


class TestDownloadStats:
    """Unit tests for :class:`DownloadStats`."""

    def check_stats(
        self,
        stats: DownloadStats,
        full_fetch: int,
        full_skip: int,
        thumbnail_fetch: int,
        thumbnail_skip: int,
    ) -> None:
        """Helper function to check stats"""
        assert stats.full["fetch"] == full_fetch
        assert stats.full["skip"] == full_skip
        assert stats.thumbnail["fetch"] == thumbnail_fetch
        assert stats.thumbnail["skip"] == thumbnail_skip

    def test_init(self):
        stats = DownloadStats()
        self.check_stats(stats, 0, 0, 0, 0)

    def test_log_action(self):
        stats = DownloadStats()
        # unknown action type
        with pytest.raises(ValueError, match="Unknown action type 'bad_action'"):
            stats._log_action("image_type", "bad_action")

        # unknown image type
        with pytest.raises(ValueError, match="Unknown image type 'image_type'"):
            stats._log_action("image_type", "fetch")

        # Add one to each image type & action
        stats._log_action("full", "fetch")
        self.check_stats(stats, 1, 0, 0, 0)
        stats._log_action("full", "skip")
        self.check_stats(stats, 1, 1, 0, 0)
        stats._log_action("thumbnail", "fetch")
        self.check_stats(stats, 1, 1, 1, 0)
        stats._log_action("thumbnail", "skip")
        self.check_stats(stats, 1, 1, 1, 1)

        # Add another one to each image type & action
        stats._log_action("thumbnail", "skip")
        self.check_stats(stats, 1, 1, 1, 2)
        stats._log_action("full", "skip")
        self.check_stats(stats, 1, 2, 1, 2)
        stats._log_action("full", "fetch")
        self.check_stats(stats, 2, 2, 1, 2)
        stats._log_action("thumbnail", "fetch")
        self.check_stats(stats, 2, 2, 2, 2)

    @patch.object(DownloadStats, "_log_action")
    def test_log_download(self, mock_log_action):
        stats = DownloadStats()
        stats.log_download("image_type")
        # assert_called_once_with is the real assertion; a bare
        # called_once_with just creates a child mock and always passes
        mock_log_action.assert_called_once_with("image_type", "fetch")

    @patch.object(DownloadStats, "_log_action")
    def test_log_skip(self, mock_log_action):
        stats = DownloadStats()
        stats.log_skip("image_type")
        mock_log_action.assert_called_once_with("image_type", "skip")

    def test_update(self):
        stats_a = DownloadStats()
        stats_b = DownloadStats()
        stats_b.full.update({"fetch": 5, "skip": 1})
        stats_b.thumbnail.update({"fetch": 3, "skip": 2})
        self.check_stats(stats_b, 5, 1, 3, 2)

        # updating adds the other's counts and leaves the other untouched
        stats_a.update(stats_b)
        self.check_stats(stats_a, 5, 1, 3, 2)
        self.check_stats(stats_b, 5, 1, 3, 2)

        stats_a.update(stats_b)
        self.check_stats(stats_a, 10, 2, 6, 4)
        self.check_stats(stats_b, 5, 1, 3, 2)

    def test_report(self):
        stats_a = DownloadStats()
        report_a = "Fetched 0 images & 0 thumbnails; Skipped 0 images & 0 thumbnails"
        assert stats_a.get_report() == report_a

        stats_b = DownloadStats()
        stats_b.full.update({"fetch": 5, "skip": 1})
        stats_b.thumbnail.update({"fetch": 3, "skip": 2})
        report_b = "Fetched 5 images & 3 thumbnails; Skipped 1 images & 2 thumbnails"
        assert stats_b.get_report() == report_b

0 comments on commit 85bc6b1

Please sign in to comment.