Skip to content
This repository has been archived by the owner on Sep 20, 2021. It is now read-only.

Commit

Permalink
BibCheck: update CrossRef checker
Browse files Browse the repository at this point in the history
Signed-off-by: Pedro Gaudencio <[email protected]>
  • Loading branch information
pedrogaudencio committed Mar 26, 2015
1 parent a7d4862 commit 078d3c0
Showing 1 changed file with 129 additions and 55 deletions.
184 changes: 129 additions & 55 deletions modules/bibcheck/lib/plugins/crossref_checker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2013 CERN.
## Copyright (C) 2013, 2015 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
Expand All @@ -22,125 +22,199 @@
metadata returned by crossref.
"""

from invenio.crossrefutils import get_metadata_for_dois
from invenio.crossrefutils import get_metadata_for_doi
from invenio.bibknowledge import get_kbr_values
import difflib

from difflib import SequenceMatcher
import re


def compare_str(str1, str2):
    """Return similarity (0.0 to 1.0) between the two strings.

    @param str1: first sequence to compare
    @param str2: second sequence to compare
    @return: difflib's SequenceMatcher ratio; 1.0 means identical
    """
    return SequenceMatcher(None, str1, str2).ratio()

def xpath_text(doc, query):
    """Get the text inside the first element matched by an xpath query.

    @param doc: object exposing an xpath(query) method that returns a
        sequence of nodes
    @param query: xpath expression handed to doc.xpath()
    @return: the .text attribute of the first node, or None when the query
        matches nothing
    """
    nodes = doc.xpath(query)
    if not nodes:
        return None
    return nodes[0].text

def get_value(record, tag):
    """Get the value of a (unique) field or None.

    @param record: object exposing iterfield(tag), which yields
        (position, value) pairs
    @param tag: field tag to look up, e.g. "773__p"
    @return: the value of the first matching field, or None when the record
        has no such field
    """
    # Only the first occurrence matters, so stop iterating right after it.
    first = next(iter(record.iterfield(tag)), None)
    if first is None:
        return None
    return first[1]


def find_volume_in_title(journal_title):
    """Search for volume letter in journal title.

    A volume letter is a single capital letter preceded by a space and
    followed by either a space or the end of the title.

    @param journal_title: journal title to inspect
    @return: the first such letter, or "" when there is none (also for an
        empty or missing title)
    """
    if not journal_title:
        return ""
    volume_search = re.search(r"( [A-Z] )|( [A-Z]$)", journal_title)
    return volume_search.group(0).strip() if volume_search else ""


def compare_metadata(metadata, rec):
    """Compare a record with the metadata returned by CrossRef.

    Every field that is available on both sides (journal title, ISSN, first
    page, first author, issue, year, volume) is compared. Each mismatch adds
    a weight to a running score; when the score exceeds 4 the collected
    messages are attached to the record as a warning.

    @param metadata: dict-like CrossRef metadata for the DOI. The keys read
        here are "container-title", "ISSN", "page", "author", "issue",
        "issued" and "volume"; all of them are optional.
    @param rec: record being checked. It must provide iterfield(), warn()
        and a record_id attribute.
    """
    confidence_different = 0
    msgs = []
    log_msgs = []

    # Check title
    titles_crossref = metadata.get("container-title", [])
    title_record = get_value(rec, "773__p")
    volume_extra = ""
    if titles_crossref and title_record is not None:
        # Build a real list: len() and join() are applied to it below.
        titles_crossref = [str(x[0]) if isinstance(x, tuple) else str(x)
                           for x in titles_crossref]
        confidence_different_journals = 0
        different_journals = 0
        for title_crossref in titles_crossref:
            # Remove Volume number from the title
            title_crossref = re.sub(":.*$", "", title_crossref)
            # The value from the last title is the one used by the volume
            # check further down.
            volume_extra = find_volume_in_title(title_crossref)
            title_crossref = re.sub(" (Section|Volume)$", "", title_crossref)
            mapped_title = get_kbr_values("JOURNALS",
                                          title_crossref,
                                          searchtype='e')
            # Keep the better of the raw title and its knowledge-base mapping
            title_similarity = compare_str(title_crossref, title_record)
            if mapped_title:
                title_similarity = max(
                    title_similarity,
                    compare_str(mapped_title[0][0], title_record))
            confidence_different_journals += (1 - title_similarity)*2
            if title_similarity < 0.6:
                different_journals += 1
        # if all the journal names fail to compare
        if len(titles_crossref) == different_journals:
            confidence_different += confidence_different_journals
            msgs.append("Incorrect journal name (773__p) or wrongly assigned DOI")
            log_msgs.append("journal name (773__p)\n"
                            "recd journal name: {0}\n"
                            "cref journal name: \n{1}\n".format(title_record,
                                                               '\n'.join(titles_crossref)))

    # Check issn
    issn_crossref = metadata.get("ISSN", [])
    issn_record = get_value(rec, "022__a")
    # The record is accepted when it matches any ISSN CrossRef lists, not
    # only the first one.
    if issn_crossref and issn_record is not None and \
            issn_record not in issn_crossref:
        confidence_different += 3
        msgs.append("Invalid ISSN (022__a) or wrongly assigned DOI")
        log_msgs.append("ISSN (022__a)\n"
                        "recd issn: {0}\n"
                        "cref issn: {1}\n".format(issn_record,
                                                  issn_crossref))

    # Check page number
    page_crossref = metadata.get("page")
    page_record = get_value(rec, "773__c")
    if page_record is not None and page_crossref is not None:
        # Only the first page of a range is compared
        page_record = page_record.split("-")[0]
        page_crossref = str(page_crossref).split("-")[0]
        # find() == 1 means the CrossRef value starts at the second character
        # of the record value.
        # NOTE(review): presumably tolerates a one-character prefix; confirm
        # that `== -1` was not intended.
        if page_record != page_crossref and page_record.find(page_crossref) != 1:
            # ignores proceedings
            if not page_record.startswith("pp."):
                confidence_different += 3
                msgs.append("Invalid page number (773__c) or wrongly assigned DOI")
                log_msgs.append("page number (773__c)\n"
                                "recd page number: {0}\n"
                                "cref page number: {1}\n".format(page_record,
                                                                 page_crossref))

    # Check author
    authors = metadata.get("author")
    author_crossref = None
    if authors:
        name_parts = [authors[0].get('family'), authors[0].get('given')]
        # An entry with neither part gives nothing to compare against
        author_crossref = ', '.join([p for p in name_parts if p]) or None
    author_record = get_value(rec, "100__a")
    if author_crossref is not None and author_record is not None:
        author_similarity = compare_str(str(author_crossref), author_record)
        confidence_different += (1 - author_similarity)*1.5
        if author_similarity < 0.7:
            msgs.append("Invalid author (100__a) or wrongly assigned DOI")
            log_msgs.append("author (100__a)\n"
                            "recd author: {0}\n"
                            "cref author: {1}\n".format(author_record,
                                                        author_crossref))

    # Check issue
    issue_crossref = metadata.get("issue")
    issue_record = get_value(rec, "773__n")
    if issue_crossref is not None and issue_record is not None and \
            issue_crossref != issue_record:
        confidence_different += 2
        msgs.append("Invalid issue (773__n) or wrongly assigned DOI")
        log_msgs.append("issue (773__n)\n"
                        "recd issue: {0}\n"
                        "cref issue: {1}\n".format(issue_record,
                                                   issue_crossref))

    # Check year
    date_parts = (metadata.get("issued") or {}).get("date-parts")
    year_crossref = None
    # Guard against a missing "issued" entry and empty date parts
    if isinstance(date_parts, list) and date_parts and date_parts[0] and \
            date_parts[0][0] is not None:
        year_crossref = str(date_parts[0][0])
    year_record = get_value(rec, "773__y")
    if year_crossref is not None and year_record is not None and \
            year_crossref != year_record:
        confidence_different += 2
        msgs.append("Invalid year (773__y) or wrongly assigned DOI")
        log_msgs.append("year (773__y)\n"
                        "recd year: {0}\n"
                        "cref year: {1}\n".format(year_record,
                                                  year_crossref))

    # Check volume
    volume_crossref = metadata.get("volume")
    volume_record = get_value(rec, "773__v")
    if volume_crossref is not None and volume_record is not None:
        volume_crossref = str(volume_crossref)
        # Volume prefixed with the letter found in the journal title
        volume_crossref_extra = volume_extra + volume_crossref
        # find() == 1: one value sits inside the other starting at its
        # second character (same test as for the page number).
        if volume_crossref != volume_record and \
                volume_crossref_extra != volume_record and \
                (volume_record.find(volume_crossref) != 1 and
                 volume_crossref.find(volume_record) != 1):
            confidence_different += 2
            msgs.append("Invalid volume (773__v) or wrongly assigned DOI")
            log_msgs.append("volume (773__v)\n"
                            "recd volume: {0}\n"
                            "cref volume: {1}\n".format(volume_record,
                                                        volume_crossref))

    # DEBUG:
    if log_msgs:
        log_msgs.insert(0, "record {}".format(rec.record_id))
        log_msgs.insert(0, "-"*50)

    if confidence_different > 4:
        rec.warn(msgs)
        # DEBUG:
        # NOTE(review): debug output goes to stdout; confirm whether it should
        # stay, and whether it belongs inside this threshold check.
        for msg in log_msgs:
            print(msg)


def check_records(records, doi_field="0247_a"):
    """Check the metadata of the records that contain a DOI.

    The metadata of each record is compared to the metadata returned by
    CrossRef for its DOI.

    @param records: iterable of records; each must provide iterfield()
    @param doi_field: tag of the field holding the DOI
    """
    records_to_check = {}
    for record in records:
        # FIXME: check the type of the identifier
        for _, doi in record.iterfield(doi_field):
            # Skip empty identifiers up front so no lookup is made for them
            if doi:
                records_to_check[doi] = record

    for doi, record in records_to_check.items():
        metadata = get_metadata_for_doi(doi)
        if metadata:
            compare_metadata(metadata, record)

0 comments on commit 078d3c0

Please sign in to comment.