From 078d3c06f819ea65908e08879a191edf96d561ca Mon Sep 17 00:00:00 2001 From: Pedro Gaudencio Date: Wed, 18 Mar 2015 13:05:36 +0200 Subject: [PATCH] BibCheck: update CrossRef checker Signed-off-by: Pedro Gaudencio --- .../bibcheck/lib/plugins/crossref_checker.py | 184 ++++++++++++------ 1 file changed, 129 insertions(+), 55 deletions(-) diff --git a/modules/bibcheck/lib/plugins/crossref_checker.py b/modules/bibcheck/lib/plugins/crossref_checker.py index 36b8faae8c..e193392d1e 100644 --- a/modules/bibcheck/lib/plugins/crossref_checker.py +++ b/modules/bibcheck/lib/plugins/crossref_checker.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ## ## This file is part of Invenio. -## Copyright (C) 2013 CERN. +## Copyright (C) 2013, 2015 CERN. ## ## Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as @@ -22,115 +22,191 @@ metadata returned by crossref. """ -from invenio.crossrefutils import get_metadata_for_dois +from invenio.crossrefutils import get_metadata_for_doi from invenio.bibknowledge import get_kbr_values -import difflib + +from difflib import SequenceMatcher import re + def compare_str(str1, str2): - """ Return similarity (0.0 to 1.0) between the two strings """ - return difflib.SequenceMatcher(None, str1, str2).ratio() + """Return similarity (0.0 to 1.0) between the two strings.""" + return SequenceMatcher(None, str1, str2).ratio() -def xpath_text(doc, query): - """ Get the text inside the element result of the xpath query """ - nodes = doc.xpath(query) - if len(nodes) == 0: - return None - return nodes[0].text def get_value(record, tag): - """ Get the value of a (unique) field or null """ + """Get the value of a (unique) field or null.""" record_values = list(record.iterfield(tag)) if len(record_values) == 0: return None return record_values[0][1] + +def find_volume_in_title(journal_title): + """Search for volume letter in journal title.""" + volume_search = re.search("( [A-Z] )|( [A-Z]$)", journal_title) + + return volume_search.group(0).strip() if volume_search else "" + + def compare_metadata(metadata, rec): - """ - Compare a record with the metadata returned by crossref + """Compare a record with the metadata returned by CrossRef. + @param rec Record @param doc xml.etree.ElementTree representation of the xml returned by crossref """ confidence_different = 0 msgs = [] + log_msgs = [] # Check title - title_crossref = metadata["title"] + titles_crossref = metadata.get("container-title", []) title_record = get_value(rec, "773__p") title_similarity = None volume_extra = "" - if title_crossref != "" and title_record is not None: + confidence_different_journals = 0 + different_journals = 0 + if len(titles_crossref) and title_record is not None: + titles_crossref = map(lambda x: str(x[0]) if isinstance(x, tuple) else str(x), + titles_crossref) # Remove Volume number from the title - title_crossref = re.sub(":.*$", "", title_crossref) - if re.search(" [A-Z]$", title_crossref): - volume_extra = title_crossref[-1] - title_crossref = title_crossref[:-2] - title_crossref = re.sub(" (Section|Volume)$", "", title_crossref) - abbr_title = get_kbr_values("JOURNALS", title_crossref, searchtype='e') - title_similarity = compare_str(abbr_title, title_record) - confidence_different += (1 - title_similarity)*2 - if title_similarity < 0.6: + for title_crossref in titles_crossref: + title_crossref = re.sub(":.*$", "", title_crossref) + volume_extra = find_volume_in_title(title_crossref) + title_crossref = re.sub(" (Section|Volume)$", "", title_crossref) + mapped_title = get_kbr_values("JOURNALS", + title_crossref, + searchtype='e') + mapped_similarity = compare_str(mapped_title[0][0], title_record) if \ + mapped_title else None + original_similarity = compare_str(title_crossref, title_record) + title_similarity = max(mapped_similarity, original_similarity) if \ + mapped_similarity is not None else original_similarity + confidence_different_journals += (1 - title_similarity)*2 + if title_similarity < 0.6: + different_journals += 1 + # if all the journal names fail to compare + if len(titles_crossref) == different_journals: + # import ipdb; ipdb.set_trace() + confidence_different += confidence_different_journals msgs.append("Incorrect journal name (773__p) or wrongly assigned DOI") + log_msgs.append("journal name (773__p)\n" + "recd journal name: {0}\n" + "cref journal name: \n{1}\n".format(title_record, + '\n'.join(titles_crossref))) # Check issn - issn_crossref = metadata["issn"] + issn_crossref = metadata.get("ISSN", []) issn_record = get_value(rec, "022__a") - if issn_crossref != "" and issn_record is not None and issn_crossref != issn_record: + # TODO: check every issn + if len(issn_crossref) and issn_record is not None and \ + issn_crossref[0] != issn_record: confidence_different += 3 + # import ipdb; ipdb.set_trace() msgs.append("Invalid ISSN (022__a) or wrongly assigned DOI") + log_msgs.append("ISSN (022__a)\n" + "recd issn: {0}\n" + "cref issn: {1}\n".format(issn_record, + issn_crossref)) # Check page number - page_crossref = metadata["page"] + page_crossref = metadata.get("page") page_record = get_value(rec, "773__c") - if page_record is not None and page_crossref != "": + if page_record is not None and page_crossref is not None: page_record = page_record.split("-")[0] - page_crossref = page_crossref.split("-")[0] - if page_record != page_crossref: - confidence_different += 3 - msgs.append("Invalid page number (773__c) or wrongly assigned DOI") + page_crossref = str(page_crossref).split("-")[0] + if page_record != page_crossref and page_record.find(page_crossref) != 1: + # ignores proceedings + if not (page_record.startswith("pp.") and + page_record[3:].strip() != page_record): + # import ipdb; ipdb.set_trace() + confidence_different += 3 + msgs.append("Invalid page number (773__c) or wrongly assigned DOI") + log_msgs.append("page number (773__c)\n" + "recd page number: {0}\n" + "cref page number: {1}\n".format(page_record, + page_crossref)) # Check author - author_crossref = metadata["author"] + author_crossref = ', '.join(filter(None, + [metadata.get("author")[0].get('family'), + metadata.get("author")[0].get('given')])) if \ + metadata.get("author") else None author_record = get_value(rec, "100__a") - if author_crossref != "" and author_record is not None: - author_similarity = compare_str(author_crossref, author_record) + if author_crossref is not None and author_record is not None: + author_similarity = compare_str(str(author_crossref), author_record) confidence_different += (1 - author_similarity)*1.5 if author_similarity < 0.7: + # import ipdb; ipdb.set_trace() msgs.append("Invalid author (100__a) or wrongly assigned DOI") + log_msgs.append("author (100__a)\n" + "recd author: {0}\n" + "cref author: {1}\n".format(author_record, + author_crossref)) # Check issue - issue_crossref = metadata["issue"] + issue_crossref = metadata.get("issue") issue_record = get_value(rec, "773__n") - if issue_crossref != "" and issue_record is not None and issue_crossref != issue_record: + if issue_crossref is not None and issue_record is not None and \ + issue_crossref != issue_record: confidence_different += 2 + # import ipdb; ipdb.set_trace() msgs.append("Invalid issue (773__n) or wrongly assigned DOI") - + log_msgs.append("issue (773__n)\n" + "recd issue: {0}\n" + "cref issue: {1}\n".format(issue_record, + issn_crossref)) # Check year - year_crossref = metadata["year"] + year_crossref = str(metadata.get("issued").get("date-parts")[0][0]) if \ + isinstance(metadata.get("issued").get("date-parts"), list) else None year_record = get_value(rec, "773__y") - if year_crossref != "" and year_record is not None and year_crossref != year_record: + if year_crossref is not None and year_record is not None and \ + year_crossref != year_record: confidence_different += 2 + # import ipdb; ipdb.set_trace() msgs.append("Invalid year (773__y) or wrongly assigned DOI") + log_msgs.append("year (773__y)\n" + "recd year: {0}\n" + "cref year: {1}\n".format(year_record, + year_crossref)) # Check volume - volume_crossref = metadata["volume"] + volume_crossref = metadata.get("volume") volume_record = get_value(rec, "773__v") - if volume_crossref != "" and volume_record is not None: - volume_crossref = volume_extra + volume_crossref - if volume_crossref != volume_record: + if volume_crossref is not None and volume_record is not None: + volume_crossref = str(volume_crossref) + volume_crossref_extra = volume_extra + volume_crossref + if volume_crossref != volume_record and \ + volume_crossref_extra != volume_record and \ + (volume_record.find(volume_crossref) != 1 and + volume_crossref.find(volume_record) != 1): confidence_different += 2 + # import ipdb; ipdb.set_trace() msgs.append("Invalid volume (773__v) or wrongly assigned DOI") + log_msgs.append("volume (773__v)\n" + "recd volume: {0}\n" + "cref volume: {1}\n".format(volume_record, + volume_crossref)) + + # DEBUG: + if log_msgs: + log_msgs.insert(0, "record {}".format(rec.record_id)) + log_msgs.insert(0, "-"*50) if confidence_different > 4: - for msg in msgs: - rec.set_invalid(msg) + # log_msgs.insert(1, "WARNING") + rec.warn(msgs) + # rec.set_invalid('\n'.join(msgs)) + # DEBUG: + for msg in log_msgs: + print msg def check_records(records, doi_field="0247_a"): - """ - Check the metadata of the records that contain a DOI by comparing it to the - metadata returned by crossref. + """Check the metadata of the records that contain a DOI. + + Comparing it to the metadata returned by CrossRef. """ records_to_check = {} for record in records: @@ -138,9 +214,7 @@ def check_records(records, doi_field="0247_a"): for _, doi in record.iterfield(doi_field): records_to_check[doi] = record - metadatas = get_metadata_for_dois(records_to_check.keys()) - for doi, metadata in metadatas.iteritems(): - # Can't compare books yet - if not metadata["is_book"]: + for doi in records_to_check.keys(): + metadata = get_metadata_for_doi(doi) + if doi != "" and metadata: compare_metadata(metadata, records_to_check[doi]) -