Skip to content

Commit

Permalink
Merge pull request #58 from openaddresses/gzip-compression
Browse files Browse the repository at this point in the history
Add support for gzip compression
  • Loading branch information
iandees authored Nov 19, 2024
2 parents 364296a + e925ae0 commit 2424e3c
Showing 1 changed file with 28 additions and 0 deletions.
28 changes: 28 additions & 0 deletions openaddr/conform.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import os
import errno
import gzip
import tempfile
import mimetypes
import json
Expand Down Expand Up @@ -53,6 +54,7 @@ def gdal_error_handler(err_class, err_num, err_msg):
]

UNZIPPED_DIRNAME = 'unzipped'
UNGZIPPED_DIRNAME = 'ungzipped'

# extracts:
# - '123' from '123 Main St'
Expand Down Expand Up @@ -156,6 +158,8 @@ def from_format_string(clz, format_string):
return GuessDecompressTask()
elif format_string.lower() == 'zip':
return ZipDecompressTask()
elif format_string.lower() == 'gzip':
return GzipDecompressTask()
else:
raise KeyError("I don't know how to decompress for format {}".format(format_string))

Expand All @@ -173,6 +177,10 @@ def decompress(self, source_paths, workdir, filenames):
substitute_task = ZipDecompressTask()
_L.info('Guessing zip compression based on file names')
return substitute_task.decompress(source_paths, workdir, filenames)
elif 'gzip' in types:
substitute_task = GzipDecompressTask()
_L.info('Guessing gzip compression based on file names')
return substitute_task.decompress(source_paths, workdir, filenames)

_L.warning('Could not guess a single compression from file names')
return source_paths
Expand Down Expand Up @@ -221,6 +229,26 @@ def decompress(self, source_paths, workdir, filenames):

return output_files

class GzipDecompressTask(DecompressionTask):
    ''' Decompression task for gzip-compressed source files.
    '''
    def decompress(self, source_paths, workdir, filenames):
        ''' Expand each gzipped source file into a directory under workdir.

            Every path in source_paths is decompressed to a file named after
            the source's basename, with a trailing ".gz" extension removed
            when one is present. The filenames argument is part of the shared
            decompress() signature and is not used here.

            Return the list of decompressed file paths, in the same order
            as source_paths.
        '''
        # Imported locally so this block does not rely on a module-level import.
        import shutil

        output_files = []
        expand_path = os.path.join(workdir, UNGZIPPED_DIRNAME)
        mkdirsp(expand_path)

        for source_path in source_paths:
            # Build a file name for the decompressed file without the .gz
            # extension. Only strip when the extension is really there: a
            # source selected via an explicit "gzip" format may be named
            # anything, and blindly dropping three characters mangles it.
            expanded_name = os.path.basename(source_path)
            if len(expanded_name) > 3 and expanded_name.lower().endswith('.gz'):
                expanded_name = expanded_name[:-3]
            expanded_path = os.path.join(expand_path, expanded_name)

            # Open the source first so an unreadable input does not leave an
            # empty output file behind, then stream in chunks rather than
            # holding the whole decompressed payload in memory.
            with gzip.open(source_path, 'rb') as gz_fp:
                with open(expanded_path, 'wb') as temp_fp:
                    shutil.copyfileobj(gz_fp, temp_fp)

            output_files.append(expanded_path)
            _L.debug("Ungzipped file {}".format(output_files[-1]))

        return output_files

def elaborate_filenames(filename):
''' Return a list of filenames for a single name from conform file tag.
Expand Down

0 comments on commit 2424e3c

Please sign in to comment.