Skip to content

Commit

Permalink
Add unit test for checksum checking.
Browse files Browse the repository at this point in the history
  • Loading branch information
markpbaggett committed May 15, 2024
1 parent 2e7608f commit cdca875
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
source_identifier,model,remote_files,parents,has_work_type,primary_identifier,local_identifier,ark,acquisition_identifier,oclc,issn,isbn,title,alternative_title,abstract,table_of_contents,date_created,date_issued,date_other,date_created_d,date_issued_d,date_other_d,publisher,utk_publisher,publication_place,utk_place_of_publication,note,extent,instrumentation,first_line,intended_audience,rights_statement,spatial,spatial_local,coordinates,temporal,call_number,bibliographic_citation,provider,intermediate_provider,repository,archival_collection,subject,keyword,form,resource_type,form_local,language,sheetmusic_hostitem,is_part_of,rdf_type,file_language,visibility
mpaekefauver:248_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:248_MODS.xml,mpaekefauver:248_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:309_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:309_MODS.xml,mpaekefauver:309_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:118_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:118_OBJ.jp2,mpaekefauver:118_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:118,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:99_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:99_MODS.xml,mpaekefauver:99_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:374_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:374_MODS.xml,mpaekefauver:374_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:465_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:465_OBJ.jp2,mpaekefauver:465_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:356_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:356_MODS.xml,mpaekefauver:356_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:356,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:254_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:254_OBJ.jp2,mpaekefauver:254_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:254,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:451_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:451_OBJ.jp2,mpaekefauver:451_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:451,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:104_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:104_MODS.xml,mpaekefauver:104_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:104,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:419_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:419_MODS.xml,mpaekefauver:419_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:318_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:318_OBJ.jp2,mpaekefauver:318_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:318,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:463_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:463_MODS.xml,mpaekefauver:463_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:463,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:350_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:350_MODS.xml,mpaekefauver:350_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
mpaekefauver:108_OBJ_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:108_OBJ.jp2,mpaekefauver:108_OBJ,,,,,,,,,OBJ,,OBJ for mpaekefauver:108,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/use#PreservationFile | http://pcdm.org/use#IntermediateFile,,restricted
mpaekefauver:420_MODS_fileset,FileSet,https://digital.lib.utk.edu/migration/mpaekefauver/mpaekefauver:420_MODS.xml,mpaekefauver:420_MODS,,,,,,,,,MODS,,MODS for mpaekefauver:420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,http://pcdm.org/file-format-types#Markup,,restricted
28 changes: 28 additions & 0 deletions tests/test_checksum_checksum_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest
from utk_exodus.checksum import HashSheet
from pathlib import Path

# Directory holding the test fixtures, resolved relative to this test module
# so the tests do not depend on the current working directory.
fixtures_path = Path(__file__).parent / "fixtures"

@pytest.fixture(
    params=[
        {
            "filename": "bad_imports",
            "expected_results": {
                "url": "https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml",
                "checksum": "081a51fae0200f266d2933756d48441c4ea77b1e",
            },
        },
    ]
)
def fixture(request):
    """Provide one checksum test case with its fixture directory resolved.

    Returns:
        dict: The parametrized case (``filename`` and ``expected_results``)
            plus a ``fixtures_path`` entry pointing at
            ``fixtures_path / filename``.
    """
    # The param dicts are created once, when the decorator is evaluated, and
    # the same objects are handed to every test that uses this fixture.
    # Return a new dict instead of writing into request.param so no state
    # leaks between tests.
    return {
        **request.param,
        "fixtures_path": fixtures_path / request.param["filename"],
    }

def test_checksum_file(fixture):
    """Check that ``checksum_file`` returns the expected url/checksum pair.

    The URL under test is taken from the fixture rather than repeated here,
    so every parametrized case is checked against its own expectation.
    NOTE: ``checksum_file`` downloads the URL, so this test needs network
    access.
    """
    expected = fixture["expected_results"]
    hs = HashSheet(fixture["fixtures_path"], "example.csv")
    results = hs.checksum_file(expected["url"])
    assert results == expected
35 changes: 27 additions & 8 deletions utk_exodus/checksum/checksum.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,34 @@ def walk_sheets(path):
def checksum(self):
    """Compute a checksum record for every entry in ``self.all_files``.

    Each entry is handed to ``checksum_file`` exactly once; the hashing
    logic lives there so it can be tested on its own.

    Returns:
        list: One ``{"url": ..., "checksum": ...}`` dict per entry, in
            iteration order.
    """
    # tqdm only wraps the iterable; the values passed on are unchanged.
    return [self.checksum_file(file) for file in tqdm(self.all_files)]

@staticmethod
def checksum_file(file, timeout=60):
    """Download a remote file and calculate its sha1 checksum.

    The response body is streamed and hashed in 8192-byte chunks, so the
    whole file is never held in memory at once.

    Args:
        file (str): The URL of the file to checksum.
        timeout (float | tuple | None): Passed straight to ``requests.get``.
            Defaults to 60 seconds so a stalled server cannot hang the run;
            pass ``None`` to wait indefinitely.

    Returns:
        dict: A dictionary with the url and checksum of the file.

    Raises:
        requests.exceptions.HTTPError: If the server responds with an
            error status.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout``.

    Examples:
        >>> hs = HashSheet("tests/fixtures/bad_imports", "example.csv")
        >>> hs.checksum_file("https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml")
        {'url': 'https://raw.githubusercontent.com/utkdigitalinitiatives/utk-exodus/main/tests/fixtures/colloquy_202.xml', 'checksum': '081a51fae0200f266d2933756d48441c4ea77b1e'}
    """
    sha1 = hashlib.sha1()
    # A streamed response keeps its connection checked out until it is fully
    # read or closed. The with-block releases it even when raise_for_status()
    # or the download fails part-way through.
    with requests.get(file, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                sha1.update(chunk)
    return {"url": file, "checksum": sha1.hexdigest()}

def write(self):
with open(self.output, "w") as csvfile:
writer = DictWriter(csvfile, fieldnames=["url", "checksum"])
Expand All @@ -43,7 +62,7 @@ def write(self):


if __name__ == "__main__":
    # Manual run against the fixture sheets bundled with the tests.
    # NOTE(review): assumes the "delete/" output directory already exists.
    path = "tests/fixtures/bad_imports"
    output = "delete/sample_checksums.csv"
    checksum = HashSheet(path, output)
    checksum.write()

0 comments on commit cdca875

Please sign in to comment.