From f6a7837b4ec3576c147cd4246761b9ef35a061c9 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 3 Oct 2019 17:55:24 +0200 Subject: [PATCH 1/5] new version ng2-image-viewer, adjusted tif conversion script to skip processed pdfs --- backend/corpora/ecco/ecco_image_convert.py | 7 +++++-- frontend/package-lock.json | 8 ++++---- frontend/package.json | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/corpora/ecco/ecco_image_convert.py b/backend/corpora/ecco/ecco_image_convert.py index c6ac706b0..110635299 100644 --- a/backend/corpora/ecco/ecco_image_convert.py +++ b/backend/corpora/ecco/ecco_image_convert.py @@ -1,5 +1,5 @@ import os -from os.path import split, join, splitext +from os.path import exists, split, join, splitext import subprocess def convert_tif_to_pdf(data_dir): @@ -20,7 +20,10 @@ def convert_tif_to_pdf(data_dir): book_id = split(directory)[1] pdf_name = join(directory, '{}.pdf'.format(book_id)) print(pdf_name) - if splitext(filenames[0])[1]=='.tif': + if exists(pdf_name): + print("exists, skipping") + continue + elif splitext(filenames[0])[1]=='.tif': magick_call = ['magick', '-quiet', '*.tif', pdf_name] elif splitext(filenames[0])[1]=='.TIF': magick_call = ['magick', '-quiet', '*.TIF', pdf_name] diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 338370655..e3d34299d 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,6 +1,6 @@ { "name": "frontend", - "version": "1.0.2", + "version": "1.2.0", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -6725,9 +6725,9 @@ "dev": true }, "ng2-image-viewer": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/ng2-image-viewer/-/ng2-image-viewer-3.0.4.tgz", - "integrity": "sha512-iNAvO65vWXLBvdmFmFIsB4vkCwtm93OIKafEKttQJ5K36n24mHPd9nVum3NBUaOCJqCBh3Wxcv0ZuoL02OOvEg==" + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/ng2-image-viewer/-/ng2-image-viewer-3.0.5.tgz", + "integrity": "sha512-VxNjNMCyjr+KGuGScgrWPCw+uzBdKrPRo2i4N/1VYnQn3haZcr8tIeUL4CXBnZGYC+0IjHRRLaI4kZFXqSxr1A==" }, "ng2-pdf-viewer": { "version": "5.2.4", diff --git a/frontend/package.json b/frontend/package.json index f71ff48cd..5f93eeba4 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -43,7 +43,7 @@ "marked": "^0.7.0", "material-icons": "^0.3.1", "moment": "^2.24.0", - "ng2-image-viewer": "^3.0.4", + "ng2-image-viewer": "^3.0.5", "ng2-pdf-viewer": "^5.2.4", "ngx-cookie-service": "^2.2.0", "ngx-md": "^6.0.9", From dab398dfb00ea191b5a1d0c4026fff800a7dffe0 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Mon, 14 Oct 2019 12:09:23 +0200 Subject: [PATCH 2/5] bugfix in es_update --- backend/es/es_update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/es/es_update.py b/backend/es/es_update.py index 5441fbdb9..b266c317b 100644 --- a/backend/es/es_update.py +++ b/backend/es/es_update.py @@ -28,7 +28,8 @@ def update_index(corpus, corpus_definition, query_model): hits = len(results['hits']['hits']) total_hits = results['hits']['total'] for doc in results['hits']['hits']: - update_document(client, corpus, corpus_definition, doc_type, doc) + update_body = corpus_definition.update_body(doc) + update_document(corpus, doc_type, doc, update_body, client) while hits Date: Mon, 14 Oct 2019 14:05:07 +0200 Subject: [PATCH 3/5] adjusting ecco image processing to convert, then combine --- backend/corpora/ecco/ecco_image_convert.py | 27 +++++++++++++++------- backend/es/es_update.py | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/backend/corpora/ecco/ecco_image_convert.py b/backend/corpora/ecco/ecco_image_convert.py index 110635299..175558f38 100644 --- a/backend/corpora/ecco/ecco_image_convert.py +++ b/backend/corpora/ecco/ecco_image_convert.py @@ -1,6 +1,7 @@ import os from os.path import exists, split, join, splitext import subprocess +import glob def convert_tif_to_pdf(data_dir): lib_call = ['export', 'DYLD_LIBRARY_PATH="$MAGICK_HOME/lib/"'] @@ -23,12 +24,22 @@ def convert_tif_to_pdf(data_dir): if exists(pdf_name): print("exists, skipping") continue - elif splitext(filenames[0])[1]=='.tif': - magick_call = ['magick', '-quiet', '*.tif', pdf_name] - elif splitext(filenames[0])[1]=='.TIF': - magick_call = ['magick', '-quiet', '*.TIF', pdf_name] - else: - print(splitext(filenames[0])) - continue os.chdir(directory) - subprocess.check_call(magick_call) \ No newline at end of file + for filename in filenames: + name, ext = splitext(filename) + if ext=='.tif' or ext=='.TIF': + magick_call = ['convert', filename, '-quiet', name+'.pdf'] + subprocess.check_call(magick_call) + with open("files.txt", "w") as f: + f.write(' '.join(sorted(glob.glob("*.pdf")))) + # combine all pdfs in one file + ghostscript_call = ['gs', '-sDEVICE=pdfwrite', '-dBATCH', '-dNOPAUSE', + '-sOutputFile='+pdf_name, '@files.txt'] + subprocess.check_call(ghostscript_call) + # remove temporary files + remove_call = ['find', '.', '-type', 'f', '-name', book_id+'0*.pdf', '-delete'] + subprocess.check_call(remove_call) + remove_call = ['rm', 'files.txt'] + subprocess.check_call(remove_call) + + \ No newline at end of file diff --git a/backend/es/es_update.py b/backend/es/es_update.py index b266c317b..9209ef62d 100644 --- a/backend/es/es_update.py +++ b/backend/es/es_update.py @@ -44,7 +44,7 @@ def update_index(corpus, corpus_definition, query_model): def update_document(corpus, doc_type, doc, update_body, client=None): if not client: client = get_client(corpus) - doc_id = doc['id'] + doc_id = doc['_id'] client.update(index=corpus, doc_type=doc_type, id=doc_id, body=update_body) From b44c5f7943be87e0d9cbdcd44f71e815fa9c7777 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Mon, 14 Oct 2019 15:21:33 +0200 Subject: [PATCH 4/5] added holdings field --- backend/addcorpus/corpus.py | 13 +++++++++++-- backend/corpora/ecco/ecco.py | 15 +++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index f5e14943d..79334ad53 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -427,6 +427,11 @@ def metadata_from_xml(self, filename, tags): if 'attribute' in tag: right_tag = next((candidate for candidate in candidates if candidate.attrs == tag['attribute']), None) + elif 'list' in tag: + if 'subtag' in tag: + right_tag = [candidate.find(tag['subtag']) for candidate in candidates] + else: + right_tag = candidates elif 'subtag' in tag: right_tag = next((candidate.find(tag['subtag']) for candidate in candidates if candidate.find(tag['subtag'])), None) @@ -436,9 +441,13 @@ def metadata_from_xml(self, filename, tags): if not right_tag: continue if 'save_as' in tag: - out_dict[tag['save_as']] = right_tag.text + out_tag = tag['save_as'] else: - out_dict[tag['tag']] = right_tag.text + out_tag = tag['tag'] + if 'list' in tag: + out_dict[out_tag] = [t.text for t in right_tag] + else: + out_dict[out_tag] = right_tag.text return out_dict diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 8f97d65a5..08d7d835d 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -73,9 +73,10 @@ def sources(self, start=min_date, end=max_date): meta_tags = [ 'collation', {'tag': 'author', 'subtag': 'composed'}, + {'tag': 'holdings', 'subtag': 'libraryName', 'list': True}, 'fullTitle', 'imprintFull', - 'libraryName', + {'tag': 'sourceLibrary', 'subtag': 'libraryName'}, 'ocr', 'pubDateStart', 'publicationPlaceComposed', @@ -214,9 +215,15 @@ def fields(self): ), Field( name='library', - display_name='Holding library', - description='The main holding library of the book', - extractor=Metadata('libraryName') + display_name='Source library', + description='The source library of the book', + extractor=Metadata('sourceLibrary') + ), + Field( + name='holdings', + display_name='Holding libraries', + description='Libraries holding a copy of the book', + extractor=Metadata('holdings') ), Field( name='volume', From 5dea44ff9cb658faed22ac5fc6261529c34de61e Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Mon, 14 Oct 2019 15:22:23 +0200 Subject: [PATCH 5/5] bumped version number --- frontend/package.json | 2 +- package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/package.json b/frontend/package.json index 5f93eeba4..2744ba075 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "frontend", - "version": "1.2.0", + "version": "1.2.1", "license": "MIT", "scripts": { "ng": "ng", diff --git a/package.json b/package.json index 30fea6736..0bdb9162b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "i-analyzer", - "version": "1.2.0", + "version": "1.2.1", "license": "MIT", "scripts": { "postinstall": "npm run install-backend && npm run install-frontend",