# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2013, 2015 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## [... the rest of the license header lies outside the reviewed hunks ...]

"""
Check the metadata of records that contain a DOI by comparing it to the
metadata returned by crossref.
"""

import re
from difflib import SequenceMatcher

from invenio.bibknowledge import get_kbr_values
from invenio.crossrefutils import get_metadata_for_doi


def compare_str(str1, str2):
    """Return similarity (0.0 to 1.0) between the two strings."""
    return SequenceMatcher(None, str1, str2).ratio()


def get_value(record, tag):
    """Get the value of a (unique) field or None.

    @param record object exposing ``iterfield(tag)``; the second item of the
        first pair it yields is returned
    @param tag field tag to look up (e.g. "773__p")
    @return the value of the first matching field, or None when there is none
    """
    for field in record.iterfield(tag):
        return field[1]
    return None


def _first_author_name(metadata):
    """Return "family, given" for the first author in metadata, or None."""
    authors = metadata.get("author")
    if not authors:
        return None
    first_author = authors[0]
    name_parts = [first_author.get('family'), first_author.get('given')]
    # An author entry without any name part cannot be compared: treat it as
    # missing instead of comparing the record against an empty string.
    return ', '.join(part for part in name_parts if part) or None


def _issued_year(metadata):
    """Return the first "issued"/"date-parts" value as a string, or None."""
    issued = metadata.get("issued") or {}
    date_parts = issued.get("date-parts")
    if not isinstance(date_parts, list) or not date_parts:
        return None
    first_date = date_parts[0]
    if not first_date or first_date[0] is None:
        return None
    return str(first_date[0])


def _mismatch_log(heading, label, record_value, crossref_value):
    """Format one debug entry: the record value next to the CrossRef one."""
    return ("{0}\n"
            "recd {1}: {2}\n"
            "cref {1}: {3}\n").format(heading, label, record_value,
                                      crossref_value)


def compare_metadata(metadata, rec):
    """Compare a record with the metadata returned by CrossRef.

    Every field present on both sides is compared and each mismatch raises a
    score; once the score exceeds 4 the collected messages are passed to
    ``rec.warn``.

    @param metadata dict of CrossRef metadata; the keys read here are
        "container-title", "ISSN", "page", "author", "issue", "issued" and
        "volume", all of them optional
    @param rec Record (must provide ``iterfield``, ``warn`` and ``record_id``)
    """
    confidence_different = 0
    msgs = []
    log_msgs = []

    # Check title
    titles_crossref = metadata.get("container-title") or []
    title_record = get_value(rec, "773__p")
    volume_extra = ""
    if titles_crossref and title_record is not None:
        # Build a real list: len() and join() are applied to it below.
        titles_crossref = [
            str(title[0]) if isinstance(title, tuple) else str(title)
            for title in titles_crossref
        ]
        confidence_different_journals = 0
        different_journals = 0
        for title_crossref in titles_crossref:
            # Remove Volume number from the title
            title_crossref = re.sub(":.*$", "", title_crossref)
            if re.search(" [A-Z]$", title_crossref):
                # A trailing capital letter is kept as a volume prefix
                volume_extra = title_crossref[-1]
                title_crossref = title_crossref[:-2]
            title_crossref = re.sub(" (Section|Volume)$", "", title_crossref)
            abbr_title = get_kbr_values("JOURNALS", title_crossref,
                                        searchtype='e')
            if abbr_title:
                title_similarity = compare_str(abbr_title[0][0], title_record)
            else:
                # No knowledge-base entry: compare the cleaned title itself
                title_similarity = compare_str(title_crossref, title_record)
            confidence_different_journals += (1 - title_similarity) * 2
            if title_similarity < 0.6:
                different_journals += 1
        # if all the journal names fail
        if len(titles_crossref) == different_journals:
            confidence_different += confidence_different_journals
            msgs.append(
                "Incorrect journal name (773__p) or wrongly assigned DOI")
            log_msgs.append(_mismatch_log("journal name (773__p)",
                                          "journal name",
                                          title_record,
                                          '; '.join(titles_crossref)))

    # Check issn
    issn_crossref = metadata.get("ISSN") or []
    issn_record = get_value(rec, "022__a")
    # The metadata holds a list of ISSNs; the record is fine when it matches
    # any of them, not only the first one.
    if issn_crossref and issn_record is not None \
            and issn_record not in issn_crossref:
        confidence_different += 3
        msgs.append("Invalid ISSN (022__a) or wrongly assigned DOI")
        log_msgs.append(_mismatch_log("ISSN (022__a)", "issn",
                                      issn_record, issn_crossref))

    # Check page number
    page_crossref = metadata.get("page")
    page_record = get_value(rec, "773__c")
    if page_record is not None and page_crossref is not None:
        # Only the first page of a "first-last" range is compared
        page_record = page_record.split("-")[0]
        page_crossref = str(page_crossref).split("-")[0]
        if page_record != page_crossref:
            confidence_different += 3
            msgs.append("Invalid page number (773__c) or wrongly assigned DOI")
            log_msgs.append(_mismatch_log("page number (773__c)",
                                          "page number",
                                          page_record, page_crossref))

    # Check author
    author_crossref = _first_author_name(metadata)
    author_record = get_value(rec, "100__a")
    if author_crossref is not None and author_record is not None:
        author_similarity = compare_str(author_crossref, author_record)
        confidence_different += (1 - author_similarity) * 1.5
        if author_similarity < 0.7:
            msgs.append("Invalid author (100__a) or wrongly assigned DOI")
            log_msgs.append(_mismatch_log("author (100__a)", "author",
                                          author_record, author_crossref))

    # Check issue
    issue_crossref = metadata.get("issue")
    issue_record = get_value(rec, "773__n")
    if issue_crossref is not None and issue_record is not None:
        # Coerce to str like page and volume, so 3 and "3" compare equal
        issue_crossref = str(issue_crossref)
        if issue_crossref != issue_record:
            confidence_different += 2
            msgs.append("Invalid issue (773__n) or wrongly assigned DOI")
            log_msgs.append(_mismatch_log("issue (773__n)", "issue",
                                          issue_record, issue_crossref))

    # Check year
    year_crossref = _issued_year(metadata)
    year_record = get_value(rec, "773__y")
    if year_crossref is not None and year_record is not None \
            and year_crossref != year_record:
        confidence_different += 2
        msgs.append("Invalid year (773__y) or wrongly assigned DOI")
        log_msgs.append(_mismatch_log("year (773__y)", "year",
                                      year_record, year_crossref))

    # Check volume
    volume_crossref = metadata.get("volume")
    volume_record = get_value(rec, "773__v")
    # A missing volume must be skipped: str(None) would never match.
    if volume_crossref not in (None, "") and volume_record is not None:
        volume_crossref = volume_extra + str(volume_crossref)
        if volume_crossref != volume_record:
            confidence_different += 2
            msgs.append("Invalid volume (773__v) or wrongly assigned DOI")
            log_msgs.append(_mismatch_log("volume (773__v)", "volume",
                                          volume_record, volume_crossref))

    # DEBUG:
    if log_msgs:
        log_msgs.insert(0, "record {0}".format(rec.record_id))
        log_msgs.insert(0, "-" * 50)

    if confidence_different > 4:
        # NOTE(review): warn() receives the whole list of messages here;
        # confirm it accepts a list rather than a single string.
        rec.warn(msgs)
        # rec.set_invalid('\n'.join(msgs))
        # DEBUG:
        # NOTE(review): the nesting of this loop could not be recovered from
        # the reviewed diff; it is assumed to belong to this branch.
        for msg in log_msgs:
            print(msg)


def check_records(records, doi_field="0247_a"):
    """Check the metadata of the records that contain a DOI.

    The metadata of every DOI found is fetched with get_metadata_for_doi and
    compared to the record by compare_metadata.

    @param records iterable of records exposing ``iterfield``
    @param doi_field tag of the field holding the DOI
    """
    records_to_check = {}
    for record in records:
        # NOTE(review): the reviewed diff leaves out one line between its two
        # hunks at this point; confirm nothing is lost here.
        for _, doi in record.iterfield(doi_field):
            records_to_check[doi] = record

    for doi, record in records_to_check.items():
        # Skip empty DOIs before asking for their metadata
        if doi == "":
            continue
        metadata = get_metadata_for_doi(doi)
        if metadata:
            compare_metadata(metadata, record)