Merge pull request #125 from diging/develop
Develop
jdamerow authored Apr 6, 2018
2 parents afaed95 + ed1734e commit fc3f323
Showing 25 changed files with 542 additions and 122 deletions.
6 changes: 1 addition & 5 deletions cookies/accession/__init__.py
@@ -12,9 +12,7 @@
 from datetime import datetime, timedelta
 
 
-logging.basicConfig()
-logger = logging.getLogger(__name__)
-logger.setLevel(settings.LOGLEVEL)
+logger = settings.LOGGER
 
 from itertools import repeat, imap
 
@@ -355,8 +353,6 @@ def create_resource(self, resource_data, relation_data, container=None):
         try:
             resource = Resource.objects.create(**data)
         except Exception as E:
-            print data
-            print E
             raise E
         if container is None:
             container = ResourceContainer.objects.create(primary=resource,
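
A recurring change in this commit replaces per-module logger setup (logging.basicConfig() plus getLogger and setLevel) with a shared settings.LOGGER. A minimal sketch of what the settings-side definition could look like, assuming LOGLEVEL is already defined there; the repository's actual settings module may differ:

    # settings.py -- illustrative sketch, not the repository's actual code
    import logging

    logging.basicConfig()
    LOGLEVEL = logging.DEBUG  # assumption; the real level is configured elsewhere
    LOGGER = logging.getLogger('cookies')  # hypothetical logger name
    LOGGER.setLevel(LOGLEVEL)

Consumers then share one configured logger, as the filters.py change below does with "from django.conf import settings; logger = settings.LOGGER".
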
1 change: 0 additions & 1 deletion cookies/accession/giles.py
@@ -16,7 +16,6 @@ def build_auth_headers(self):
         Generates the authorization header for Giles requests.
         """
         access_token = self.access_token_generator(self.get_auth_token)
-        print access_token
         return {'Authorization': 'token %s' % access_token}
 
     def get_auth_token(self):
2 changes: 0 additions & 2 deletions cookies/accession/hathitrust.py
@@ -179,8 +179,6 @@ def process_metadata(self, identifier, data):
                 'entity_type': ['http://purl.org/dc/dcmitype/Text']
             }
         except Exception as E:
-            print E
-            print data
             raise E
 
     def process_content_metadata(self, identifier, raw):
125 changes: 103 additions & 22 deletions cookies/aggregate.py
@@ -27,14 +27,28 @@
 import os, urlparse, mimetypes
 import unicodecsv as csv
 
-from cookies.models import Resource
+from django.utils.text import slugify
+from cookies.models import Resource, Value
 
 cache = caches['remote_content']
 
-logging.basicConfig()
-logger = logging.getLogger(__name__)
-logger.setLevel(settings.LOGLEVEL)
-
+logger = settings.LOGGER
+
+METADATA_CSV_HEADER = [
+    'resource_name',
+    'resource_uri',
+    'resource_type',
+    'resource_type_uri',
+    'collection_name',
+    'collection_uri',
+    'creator_name',
+    'creator_id',
+    'date_created',
+    'predicate',
+    'predicate_uri',
+    'target',
+    'target_uri',
+]
 
 def get_filename(resource):
     if resource.is_external:
@@ -45,13 +59,8 @@ def get_filename(resource):
     else:
         filename = os.path.split(resource.file.path)[-1]
 
-    # Try to append a file extension, one is not already present.
-    filename, ext = os.path.splitext(filename)
-    if not ext:
-        ext = mimetypes.guess_extension(resource.content_type)
-    if ext:
-        filename += '.' + ext
-    return filename.replace(':', '-').replace('/', '_').replace('..', '.')
+    ext = mimetypes.guess_extension(resource.content_type)
+    return slugify(filename) + ext
 
 
 def get_content(content_resource):
@@ -77,6 +86,36 @@ def get_content(content_resource):
             return f.read()
     return
 
+def write_metadata_csv(filehandle, resource=None, write_header=False):
+    writer = csv.writer(filehandle)
+
+    if write_header:
+        writer.writerow(METADATA_CSV_HEADER)
+
+    if not resource:
+        return
+
+    for relation in resource.relations_from.all():
+        row = {
+            'resource_name' : resource.name,
+            'resource_uri' : resource.uri,
+            'resource_type' : str(resource.entity_type),
+            'resource_type_uri' : resource.entity_type.uri,
+            'collection_name' : resource.container.part_of.name,
+            'collection_uri' : resource.container.part_of.uri,
+            'creator_name' : resource.created_by.username,
+            'creator_id' : resource.created_by_id,
+            'date_created' : resource.created.isoformat(),
+            'predicate' : str(relation.predicate),
+            'predicate_uri' : relation.predicate.uri,
+            'target' : getattr(relation.target, "name", None),
+            'target_uri' : None
+        }
+        if relation.target and not isinstance(relation.target, Value):
+            row['target_uri'] = relation.target.uri
+
+        writer.writerow([row[c] for c in METADATA_CSV_HEADER])
+
 
 def aggregate_content_resources_fast(container, content_type=None, part_uri='http://purl.org/dc/terms/isPartOf'):
     return Resource.objects.filter(content_resource=True, container=container).order_by('parent__for_resource__relations_from_resource__sort_order')
@@ -168,6 +207,12 @@ def aggregate_content(queryset, proc=lambda content, rsrc: content, **kwargs):
     aggregator = aggregate_content_resources(queryset, **kwargs)
     return (proc(get_content(resource), resource) for resource in aggregator)
 
+def aggregate_part_resources(queryset):
+    part_uri = 'http://purl.org/dc/terms/isPartOf'
+    for resource in queryset:
+        yield resource
+        for part_rel in resource.relations_to.filter(predicate__uri=part_uri):
+            yield part_rel.source
 
 def export(queryset, target_path, fname=get_filename, **kwargs):
     """
@@ -202,26 +247,41 @@ def export_zip(queryset, target_path, fname=get_filename, **kwargs):
     if not target_path.endswith('.zip'):
         target_path += '.zip'
 
+    has_metadata = kwargs.pop('has_metadata', False)
     proc = kwargs.pop('proc', lambda content, resource: content)
     export_proc = lambda content, resource: (content, resource)
-    aggregator = aggregate_content(queryset, proc=export_proc, **kwargs)
     base = 'amphora/'
     log = []
     index = cStringIO.StringIO()
+    metadata = cStringIO.StringIO()
     index_writer = csv.writer(index)
     index_writer.writerow(['ID', 'Name', 'PrimaryID', 'PrimaryURI', 'PrimaryName', 'Location'])
 
+    # Write header only
+    write_metadata_csv(metadata, resource=None, write_header=True)
+
     with zipfile.ZipFile(target_path, 'w', allowZip64=True) as target:
-        for content, resource in aggregator:
-            if content is None:
-                log.append('No content for resource %i (%s)' % (resource.id, resource.name))
-            elif isinstance(content, Exception):
-                log.append('Encountered exception while retrieving content for %i (%s): %s' % (resource.id, resource.name, content.message))
-            else:
-                filename = fname(resource)
-                index_writer.writerow([resource.id, resource.name, resource.container.primary.id, resource.container.primary.uri, resource.container.primary.name, filename])
-                target.writestr(base + filename, content)
+        for queryset_resource in queryset:
+            for content, resource in aggregate_content([queryset_resource], proc=export_proc, **kwargs):
+                if content is None:
+                    log.append('No content for resource %i (%s)' % (resource.id, resource.name))
+                elif isinstance(content, Exception):
+                    log.append('Encountered exception while retrieving content for %i (%s): %s' % (resource.id, resource.name, content.message))
+                else:
+                    filename = fname(resource)
+                    index_writer.writerow([resource.id, resource.name, resource.container.primary.id, resource.container.primary.uri, resource.container.primary.name, filename])
+                    target.writestr(base + filename, content)
+
+            if has_metadata:
+                for resource in aggregate_part_resources([queryset_resource]):
+                    write_metadata_csv(metadata, resource, write_header=False)
 
         target.writestr(base + 'MANIFEST.txt', manifest(log))
         target.writestr(base + 'index.csv', index.getvalue())
+        if has_metadata:
+            target.writestr(base + 'metadata.csv', metadata.getvalue())
 
+    metadata.close()
     index.close()
     return target_path

@@ -272,3 +332,24 @@ def get_parent_resource(collection):
             name = name[1:]
         return name
     return export_zip(queryset, target_path, fname=recursive_filename, **kwargs)
+
+def export_metadata(queryset, target_path):
+    """
+    Stream metadata into a zip archive at ``target_path``.
+    """
+    logger.debug('aggregate.export_metadata: target: %s' % (target_path))
+    if not target_path.endswith('.zip'):
+        target_path += '.zip'
+
+    aggregator = aggregate_part_resources(queryset)
+
+    base = 'amphora/'
+    log = []
+    metadata = cStringIO.StringIO()
+    with zipfile.ZipFile(target_path, 'w', allowZip64=True) as target:
+        write_metadata_csv(metadata, aggregator.next(), write_header=True)
+        for resource in aggregator:
+            write_metadata_csv(metadata, resource, write_header=False)
+        target.writestr(base + 'metadata.csv', metadata.getvalue())
+    metadata.close()
+    return target_path
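
Taken together, aggregate_part_resources, write_metadata_csv, and the new has_metadata flag let an export carry a metadata.csv describing each resource and its parts. A hedged usage sketch; the queryset construction is an assumption, since the calling views are not part of this diff:

    from cookies.aggregate import export_zip, export_metadata

    # `queryset` is assumed to be an iterable of primary Resource objects,
    # e.g. the resources selected for a snapshot.
    export_zip(queryset, '/tmp/snapshot.zip', has_metadata=True)   # content + index.csv + metadata.csv
    export_metadata(queryset, '/tmp/metadata.zip')                 # metadata.csv only

Note that export_metadata calls aggregator.next() (Python 2) to write the header alongside the first row, so an empty queryset would raise StopIteration.
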
1 change: 0 additions & 1 deletion cookies/autocomplete.py
@@ -70,7 +70,6 @@ def get_model_for_value(self, value):
     def create_object(self, text):
         """Create an object given a text."""
         model = self.get_model_for_value(text)
-        print model
         if model:
             return model.objects.create(**{self.create_field: model.pytype(text)})
         return None
21 changes: 16 additions & 5 deletions cookies/filters.py
@@ -5,6 +5,8 @@
 from cookies.models import *
 from cookies import authorization
 
+from django.conf import settings
+logger = settings.LOGGER
 
 class GilesUploadFilter(django_filters.FilterSet):
     class Meta:
@@ -56,7 +58,7 @@ def filter_has_predicate(self, queryset, name, value):
         try:
             queryset = queryset.filter(relations_from__predicate=value)
         except Exception as E:
-            print str(E)
+            logger.exception(E)
 
         return queryset
 
@@ -74,14 +76,21 @@ class Meta:
 
 
 class ResourceContainerFilter(django_filters.FilterSet):
-    name = django_filters.CharFilter(method='lookup_name_in_parts')
+    name = django_filters.CharFilter(method='lookup_using_name_index')
     content = django_filters.CharFilter(name='primary__indexable_content',
                                         lookup_expr='icontains')
     part_of = django_filters.ModelChoiceFilter(name='part_of', queryset=Collection.objects.all())
 
+    # FIXME: The following statement results in a very expensive Postgres query.
+    # entity_type = django_filters.ModelChoiceFilter(
+    #     name='primary__entity_type',
+    #     queryset=Type.objects.annotate(num_instances=Count('resource'))\
+    #                  .filter(num_instances__gt=0)
+    # )
+    # As a temporary workaround, use a static list for choices.
     entity_type = django_filters.ModelChoiceFilter(
         name='primary__entity_type',
-        queryset=Type.objects.annotate(num_instances=Count('resource'))\
-                     .filter(num_instances__gt=0)
+        queryset=Type.objects.all(),
     )
 
     # FIXME: The following statement results in a very expensive Postgres query.
@@ -140,7 +149,6 @@ def filter_tag(self, queryset, value):
         return queryset.filter(primary__tags__tag__id=value)
 
     def filter_content_type(self, queryset, name, value):
-        print value
         if not value:
             return queryset
         return queryset.filter(Q(content_relations__content_type__in=value)).distinct('id')
@@ -151,6 +159,9 @@ def lookup_name_in_parts(self, queryset, name, value):
             q &= Q(primary__name__icontains=part)
         return queryset.filter(q)
 
+    def lookup_using_name_index(self, queryset, name, value):
+        return queryset.filter(primary__name_index__plain_tsquery=value)
+
     o = django_filters.OrderingFilter(
         # tuple-mapping retains order
         fields=(
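
The new lookup_using_name_index method relies on a plain_tsquery lookup against a name_index field. That lookup is not built into Django, so it is presumably registered elsewhere in the codebase against a Postgres full-text index. A sketch of how such a custom lookup can be defined, as an illustration rather than the repository's actual implementation:

    from django.contrib.postgres.search import SearchVectorField
    from django.db.models import Lookup

    class PlainTSQuery(Lookup):
        """Match a tsvector column against plainto_tsquery(<value>)."""
        lookup_name = 'plain_tsquery'

        def as_sql(self, compiler, connection):
            lhs, lhs_params = self.process_lhs(compiler, connection)
            rhs, rhs_params = self.process_rhs(compiler, connection)
            return '%s @@ plainto_tsquery(%s)' % (lhs, rhs), lhs_params + rhs_params

    # Assumes name_index is a SearchVectorField; the real field type may differ.
    SearchVectorField.register_lookup(PlainTSQuery)

Unlike the replaced lookup_name_in_parts, which chained icontains filters and forced sequential scans, a tsquery against an indexed tsvector column can use a GIN index.
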
21 changes: 17 additions & 4 deletions cookies/forms.py
@@ -399,8 +399,8 @@ def to_python(self, value):
         return super(HiddenModelMultipleChoiceField, self).to_python(value)
 
     def validate(self, value):
-
-        print 'validate', value
+        # print 'validate', value
+        pass
 
 
 class AddTagForm(forms.Form):
@@ -460,12 +460,14 @@ class Meta:
 
 
 class SnapshotForm(forms.Form):
-    content_type = forms.MultipleChoiceField(choices=[])
+    content_type = forms.MultipleChoiceField(choices=[], required=False)
+    include_metadata = forms.BooleanField(label="Metadata", required=False, initial=True)
+    include_content = forms.BooleanField(label="Content", required=False, initial=True)
     export_structure = forms.ChoiceField(choices=[
         ('flat', 'Flat'),
         ('collection', 'Preserve collection structure'),
         ('parts', 'Preserve resource hierarchy')
-    ])
+    ], required=False, label="Content export structure")
 
     def __init__(self, *args, **kwargs):
         super(SnapshotForm, self).__init__(*args, **kwargs)
@@ -516,6 +518,17 @@ def __init__(self, *args, **kwargs):
         ]
         self.fields['content_type'].choices = [('__all__', 'All')] + zip(content_type_choices, content_type_choices)
 
+    def clean(self):
+        cleaned_data = super(SnapshotForm, self).clean()
+        if not (cleaned_data.get('include_metadata') or cleaned_data.get('include_content')):
+            raise forms.ValidationError('At least one of "include_content", "include_metadata" is required')
+
+        if (cleaned_data.get('include_content')
+            and not (cleaned_data.get('content_type')
+                     and cleaned_data.get('export_structure'))):
+            raise forms.ValidationError('Content type and export structure required')
+        return cleaned_data
+
 class GilesLogForm(forms.Form):
     UPLOAD_ALL = 'all'
     UPLOAD_SELECTED = 'selected'
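
The new clean() method enforces cross-field requirements now that the individual fields are optional. A quick illustration with hypothetical form data (note that __init__ populates the content_type choices, which may involve database access):

    # Metadata-only snapshot: valid, because content_type and
    # export_structure are only required when content is included.
    form = SnapshotForm(data={'include_metadata': 'on'})
    form.is_valid()   # True

    # Content requested without a content type or export structure:
    # rejected by clean().
    form = SnapshotForm(data={'include_content': 'on'})
    form.is_valid()   # False
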
(The remaining 18 of the 25 changed files are not shown.)
