Skip to content

Commit

Permalink
Merge branch 'release/1.3.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Nov 11, 2019
2 parents e9b768b + c3b0887 commit 7b784ca
Show file tree
Hide file tree
Showing 44 changed files with 1,026 additions and 402 deletions.
7 changes: 4 additions & 3 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,11 @@ def update_query(self, **kwargs):

def request_media(self, document):
'''
Get a list of urls from where media associated
with a document can be fetched.
Get a dictionary with
'media': list of urls from where media associated with a document can be fetched,
'info': information for file download
'''
return []
return {'media': None, 'info': None}

def es_mapping(self):
'''
Expand Down
14 changes: 10 additions & 4 deletions backend/addcorpus/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,26 @@ def build_partial_pdf(pages, input_pdf):

def retrieve_pdf(path):
'''
Retrieve the pdf as a file object, and gather some additional information.
Retrieve the pdf as a file object.
'''
pdf = PdfFileReader(path, 'rb')

return pdf

def get_pdf_info(path):
'''
Gather pdf information.
'''
pdf = PdfFileReader(path, 'rb')
title = pdf.getDocumentInfo().title
_dir, filename = split(path)
num_pages = pdf.getNumPages()

info = {
'filename': title if title else filename,
'filesize': sizeof_fmt(getsize(path)),
'all_pages': list(range(0, num_pages))
}

return pdf, info
return info

def sizeof_fmt(num, suffix='B'):
'''
Expand Down
8 changes: 7 additions & 1 deletion backend/api/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,13 @@ def create_filename(route):
def create_csv(results, fields, filename):
entries = []
for result in results:
entry = {field: result['_source'][field] for field in fields}
entry={}
for field in fields:
if field in result['_source']:
entry.update( {field:result['_source'][field]} )
#the id field lives one level higher and is named '_id' by elastic search
if field=="id" and "_id" in result:
entry.update( {field: result['_id']} )
entries.append(entry)
csv.register_dialect('myDialect', delimiter=',', quotechar='"',
quoting=csv.QUOTE_NONNUMERIC, skipinitialspace=True)
Expand Down
12 changes: 5 additions & 7 deletions backend/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,15 +503,13 @@ def api_get_media():
if len(list(request.args.keys()))>2:
# there are more arguments, currently used for pdf retrieval only
try:
out, info = backend_corpus.get_media(request.args)
out = backend_corpus.get_media(request.args)
except Exception as e:
current_app.logger.error(e)
abort(400)
header = json.dumps(info)
if not out:
abort(404)
response = make_response(send_file(out, attachment_filename="scan.pdf", as_attachment=True, mimetype=backend_corpus.scan_image_type))
response.headers['pdfinfo'] = header
return response
else:
absolute_path = join(backend_corpus.data_directory, image_path)
Expand All @@ -523,7 +521,7 @@ def api_get_media():

@api.route('/request_media', methods=['POST'])
@login_required
def api_request_images():
def api_request_media():
if not request.json:
abort(400)
corpus_index = request.json['corpus_index']
Expand All @@ -533,10 +531,10 @@ def api_request_images():
else:
data = backend_corpus.request_media(request.json['document'])
current_app.logger.info(data)
if len(data)==0:
if len(data['media'])==0:
return jsonify({'success': False})
output = {'success': True, 'media': data}
return jsonify(output)
data['success'] = True
return jsonify(data)


@api.route('/get_related_words', methods=['POST'])
Expand Down
43 changes: 25 additions & 18 deletions backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from addcorpus.extract import XML, Metadata, Combined
from addcorpus.filters import MultipleChoiceFilter, RangeFilter
from addcorpus.corpus import XMLCorpus, Field
from addcorpus.image_processing import retrieve_pdf, pdf_pages, build_partial_pdf
from addcorpus.image_processing import get_pdf_info, retrieve_pdf, pdf_pages, build_partial_pdf


class DutchAnnualReports(XMLCorpus):
Expand Down Expand Up @@ -211,35 +211,42 @@ def sources(self, start=min_date, end=max_date):
)
]

def request_media(self, document):
def request_media(self, document):
image_path = document['fieldValues']['image_path']
pdf_info = get_pdf_info(op.join(self.data_directory, image_path))
pages_returned = 5 #number of pages that is displayed. must be odd number.
#the page corresponding to the document
home_page = int(document['fieldValues']['page'])
pages, home_page_index = pdf_pages(pdf_info['all_pages'], pages_returned, home_page)
pdf_info = {
"pageNumbers": [p for p in pages], #change from 0-indexed to real page
"homePageIndex": home_page_index+1, #change from 0-indexed to real page
"fileName": pdf_info['filename'],
"fileSize": pdf_info['filesize']
}
image_url = url_for('api.api_get_media',
corpus=self.es_index,
image_path=document['fieldValues']['image_path'],
page_no=document['fieldValues']['page'],
image_path=image_path,
start_page=pages[0]-1,
end_page=pages[-1],
_external=True
)
return [image_url]
return {'media': [image_url], 'info': pdf_info}


def get_media(self, request_args):
'''
Given the image path and page number of the search result,
construct a new pdf which contains 2 pages before and after.
'''
image_path = request_args['image_path']
home_page = int(request_args['page_no'])
start_page = int(request_args['start_page'])
end_page = int(request_args['end_page'])
absolute_path = op.join(self.data_directory, image_path)
if not op.isfile(absolute_path):
return None
input_pdf, pdf_info = retrieve_pdf(absolute_path)
pages_returned = 5 #number of pages that is displayed. must be odd number.
#the page corresponding to the document
pages, home_page_index = pdf_pages(pdf_info['all_pages'], pages_returned, home_page)
out = build_partial_pdf(pages, input_pdf)
pdf_info = {
"pageNumbers": [p+1 for p in pages], #change from 0-indexed to real page
"homePageIndex": home_page_index+1, #change from 0-indexed to real page
"fileName": pdf_info['filename'],
"fileSize": pdf_info['filesize']
}
return out, pdf_info
input_pdf = retrieve_pdf(absolute_path)
pages = range(start_page, end_page)
out = build_partial_pdf(pages, input_pdf)
return out

10 changes: 6 additions & 4 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from addcorpus.extract import Combined, Metadata, XML
from addcorpus import filters
from addcorpus.corpus import XMLCorpus, Field, consolidate_start_end_years, string_contains
from addcorpus.image_processing import retrieve_pdf
from addcorpus.image_processing import get_pdf_info, retrieve_pdf


# Source files ################################################################
Expand Down Expand Up @@ -240,21 +240,23 @@ def fields(self):


def request_media(self, document):
image_path = document['fieldValues']['image_path']
image_url = url_for('api.api_get_media',
corpus=self.es_index,
image_path=document['fieldValues']['image_path'],
image_path=image_path,
page_no=document['fieldValues']['page'],
_external=True
)
return [image_url]
pdf_stats = get_pdf_info(join(self.data_directory, image_path))
return {'media': [image_url], 'info': pdf_stats}


def get_media(self, request_args):
image_path = request_args['image_path']
filename = '{}.pdf'.format(split(image_path)[1])
full_path = join(image_path, filename)
page_no = request_args['page_no']
pdf_data, pdf_stats = retrieve_pdf(full_path)
pdf_data = retrieve_pdf(full_path)
pdf_info = {
"pageNumbers": pdf_stats['all_pages'], #change from 0-indexed to real page
"homePageIndex": page_no, #change from 0-indexed to real page
Expand Down
40 changes: 21 additions & 19 deletions backend/corpora/guardianobserver/guardianobserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import glob
import re
from pathlib import Path # needed for Python 3.4, as glob does not support recursive argument
import os.path as op
from os.path import join, getsize
from os import getcwd
from datetime import date, datetime
from zipfile import ZipFile
Expand All @@ -20,6 +20,7 @@
from addcorpus import extract
from addcorpus import filters
from addcorpus.corpus import XMLCorpus, Field, until, after, string_contains, consolidate_start_end_years
from addcorpus.image_processing import sizeof_fmt

PROCESSED = "corpora/guardianobserver/processed.txt"

Expand Down Expand Up @@ -177,16 +178,16 @@ def request_media(self, document):
# applicable for post-1910 data
image_path = field_vals['image_path']
# define subdirectory in the zip archive
filename = op.join(field_vals['image_path'].split('/')[2][:-10], target_filename)
filename = join(field_vals['image_path'].split('/')[2][:-10], target_filename)
elif field_vals['date']<'1909-31-12':
path = op.join('1791-1909', 'PDF', field_vals['pub_id'])
path = join('1791-1909', 'PDF', field_vals['pub_id'])
zipname = "{}_{}.zip".format(*field_vals['date'].split("-")[:2])
image_path = op.join(path, zipname)
image_path = join(path, zipname)
# pre-1910, the zip archives contain folders year -> month -> pdfs
filename = op.join(zipname[:4], zipname[5:7], target_filename)
filename = join(zipname[:4], zipname[5:7], target_filename)
else:
path = op.join('1910-2003', 'PDF')
global_path = op.join(self.data_directory, path)
path = join('1910-2003', 'PDF')
global_path = join(self.data_directory, path)
zipname_pattern = "{}_*_{}.zip".format(
field_vals['date'][:4],
field_vals['pub_id']
Expand All @@ -196,15 +197,15 @@ def request_media(self, document):
pdfs = ZipFile(str(zipfile)).namelist()
correct_file = next((pdf for pdf in pdfs if pdf.split("/")[1]==target_filename), None)
if correct_file:
image_path = op.join(path, zipfile.name)
image_path = join(path, zipfile.name)
update_body = {
"doc": {
"image_path": image_path
}
}
update_document(self.es_index, self.es_doctype, document, update_body)
# define subdirectory in the zip archive
filename = op.join(correct_file.split('/')[0], target_filename)
filename = join(correct_file.split('/')[0], target_filename)
break
if not image_path:
return []
Expand All @@ -215,7 +216,13 @@ def request_media(self, document):
filename=filename,
_external=True
)]
return image_urls
pdf_info = {
"pageNumbers": [1], #change from 0-indexed to real page
"homePageIndex": 1, #change from 0-indexed to real page
"fileName": filename,
"fileSize": sizeof_fmt(getsize(join(self.data_directory, image_path)))
}
return {'media': image_urls, 'info': pdf_info}


def get_media(self, request_args):
Expand All @@ -226,17 +233,12 @@ def get_media(self, request_args):
'''
image_path = request_args['image_path']
filename = request_args['filename']
pdf_info = {
"pageNumbers": [1], #change from 0-indexed to real page
"homePageIndex": 1, #change from 0-indexed to real page
"fileName": filename
}

pdf_data = None
with ZipFile(op.join(self.data_directory, image_path), mode='r') as zipped:
with ZipFile(join(self.data_directory, image_path), mode='r') as zipped:
zip_info = zipped.getinfo(filename)
pdf_data = zipped.read(zip_info)
if pdf_data:
pdf_info.update({'fileSize': zip_info.file_size})
return BytesIO(pdf_data), pdf_info
return BytesIO(pdf_data)
else:
return None, None
return None
2 changes: 1 addition & 1 deletion backend/corpora/periodicals/periodicals.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,4 @@ def request_media(self, document):
))
else:
continue
return image_list
return {'media': image_list}
2 changes: 1 addition & 1 deletion backend/corpora/times/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,4 +476,4 @@ def request_media(self, document):
_external=True
)]
else: image_urls = []
return image_urls
return {'media': image_urls }
6 changes: 2 additions & 4 deletions frontend/angular.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@
"node_modules/primeng/resources/primeng.css",
"src/styles.scss",
"node_modules/font-awesome/css/font-awesome.min.css",
"node_modules/material-icons/iconfont/material-icons.scss",
"node_modules/ng2-image-viewer/imageviewer.scss"
"src/imageviewer.scss"
],
"scripts": [
"node_modules/chart.js/dist/Chart.js"
Expand Down Expand Up @@ -82,8 +81,7 @@
"node_modules/primeicons/primeicons.css",
"node_modules/primeng/resources/primeng.css",
"src/styles.scss",
"node_modules/font-awesome/css/font-awesome.min.css",
"node_modules/ng2-image-viewer/imageviewer.scss"
"node_modules/font-awesome/css/font-awesome.min.css"
],
"assets": [
"src/assets"
Expand Down
Loading

0 comments on commit 7b784ca

Please sign in to comment.