Skip to content

Commit

Permalink
processors/archive_webpages: add skip_failed: True/False module option
Browse files Browse the repository at this point in the history
- default False
- when set to True, don't attempt to archive items for which archive_error is set to True
  • Loading branch information
nodiscc committed Dec 20, 2023
1 parent 236d804 commit 6d6999b
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions hecat/processors/archive_webpages.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
output_directory: 'tests/webpages' # path to the output directory for archived pages
skip_already_archived: True # (default True) skip processing when item already has an 'archive_path': key
clean_removed: True # (default False) remove existing archived pages which do not match any id in the data file
skip_failed: False # (default False) don't attempt to archive items for which the previous archival attempt failed (archive_error: True)
# $ hecat --config tests/.hecat.archive_webpages.yml
Expand Down Expand Up @@ -203,6 +204,8 @@ def archive_webpages(step):
items = load_yaml_data(step['module_options']['data_file'])
if 'clean_removed' not in step['module_options']:
step['module_options']['clean_removed'] = False
if 'skip_failed' not in step['module_options']:
step['module_options']['skip_failed'] = False
for item in items:
# skip already archived items when skip_already_archived: True
if (('skip_already_archived' not in step['module_options'].keys() or
Expand All @@ -213,6 +216,10 @@ def archive_webpages(step):
elif ('exclude_tags' in step['module_options'] and any(tag in item['tags'] for tag in step['module_options']['exclude_tags'])):
logging.debug('skipping %s (id %s): one or more tags are present in exclude_tags', item['url'], item['id'])
skipped_count = skipped_count +1
# skip failed items when skip_failed: True
elif (step['module_options']['skip_failed'] and 'archive_error' in item.keys() and item['archive_error']):
logging.debug('skipping %s (id %s): the previous archival attempt failed, and skip_failed is set to True')
skipped_count = skipped_count +1
# archive items matching only_tags
elif list(set(step['module_options']['only_tags']) & set(item['tags'])):
logging.info('archiving %s (id %s)', item['url'], item ['id'])
Expand Down

0 comments on commit 6d6999b

Please sign in to comment.