diff --git a/bibtexparser/middlewares/__init__.py b/bibtexparser/middlewares/__init__.py
index 89b0af1..3fd1a36 100644
--- a/bibtexparser/middlewares/__init__.py
+++ b/bibtexparser/middlewares/__init__.py
@@ -1,5 +1,6 @@
 from bibtexparser.middlewares.enclosing import AddEnclosingMiddleware
 from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware
+from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys
 from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware
 from bibtexparser.middlewares.latex_encoding import LatexDecodingMiddleware
 from bibtexparser.middlewares.latex_encoding import LatexEncodingMiddleware
diff --git a/bibtexparser/middlewares/fieldkeys.py b/bibtexparser/middlewares/fieldkeys.py
new file mode 100644
index 0000000..edb5a7e
--- /dev/null
+++ b/bibtexparser/middlewares/fieldkeys.py
@@ -0,0 +1,57 @@
+import logging
+from typing import Dict
+from typing import List
+
+from bibtexparser.library import Library
+from bibtexparser.model import Entry
+from bibtexparser.model import Field
+
+from .middleware import BlockMiddleware
+
+logger = logging.getLogger(__name__)
+
+
+class NormalizeFieldKeys(BlockMiddleware):
+    """Normalize field keys to lowercase.
+
+    In case of conflicts (e.g. both 'author' and 'Author' exist in the same entry),
+    a warning is emitted, and the last value wins. The surviving field keeps the
+    position of the first occurrence of its normalized key.
+
+    Some other middlewares, such as `SeparateCoAuthors`, assume lowercase key names.
+
+    :param allow_inplace_modification: forwarded unchanged to `BlockMiddleware`;
+        parallel execution is always enabled for this middleware.
+    """
+
+    def __init__(self, allow_inplace_modification: bool = True):
+        super().__init__(
+            allow_inplace_modification=allow_inplace_modification,
+            allow_parallel_execution=True,
+        )
+
+    # docstr-coverage: inherited
+    def transform_entry(self, entry: Entry, library: "Library") -> Entry:
+        # Keyed by normalized name: re-assigning an existing key replaces the
+        # value ("last one wins") but keeps the slot of its first occurrence.
+        normalized_fields: Dict[str, Field] = {}
+        for field in entry.fields:
+            normalized_key: str = field.key.lower()
+            if normalized_key in normalized_fields:
+                # Logged before `field.key` is rewritten, so the message still
+                # shows the original spelling of the key that overrides.
+                logger.warning(
+                    "NormalizeFieldKeys: in entry '%s': "
+                    "duplicate normalized key '%s' "
+                    "(original '%s'); overriding previous value",
+                    entry.key,
+                    normalized_key,
+                    field.key,
+                )
+            field.key = normalized_key
+            normalized_fields[normalized_key] = field
+
+        new_fields: List[Field] = list(normalized_fields.values())
+        entry.fields = new_fields
+
+        return entry
diff --git a/tests/middleware_tests/test_fieldkeys.py b/tests/middleware_tests/test_fieldkeys.py
new file mode 100644
index 0000000..acff042
--- /dev/null
+++ b/tests/middleware_tests/test_fieldkeys.py
@@ -0,0 +1,78 @@
+import re
+
+from bibtexparser import Library
+from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys
+from bibtexparser.model import Entry
+from bibtexparser.model import Field
+
+entries = {
+    "article": {
+        "author": '"Smith, J."',
+        "title": '"A Test Article"',
+        "journal": '"J. of Testing"',
+        "month": '"jan"',
+        "year": '"2022"',
+    },
+    "book": {
+        "author": '"Doe, J."',
+        "title": '"A Test Book"',
+        "publisher": '"Test Pub."',
+        "year": '"2021"',
+        "month": "apr",
+    },
+    "inproceedings": {
+        "author": '"Jones, R."',
+        "title": '"A Test Conf. Paper"',
+        "booktitle": '"Proc. of the Intl. Test Conf."',
+        "year": '"2023"',
+        "month": "8",
+    },
+}
+
+ref = Library()
+for i, (entry_type, fields) in enumerate(entries.items()):
+    f = [Field(key=k, value=v) for k, v in fields.items()]
+    ref.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f))
+
+
+def test_normalize_fieldkeys():
+    """
+    Check library with lowercase field keys.
+    """
+
+    lib = Library()
+    for i, (entry_type, fields) in enumerate(entries.items()):
+        f = [Field(key=k, value=v) for k, v in fields.items()]
+        lib.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f))
+
+    lib = NormalizeFieldKeys().transform(lib)
+
+    for key in lib.entries_dict:
+        assert lib.entries_dict[key] == ref.entries_dict[key]
+
+
+def test_normalize_fieldkeys_force_last(caplog):
+    """
+    Check library with uppercase field keys and duplicate normalized keys.
+    """
+    lib = Library()
+    for i, (entry_type, fields) in enumerate(entries.items()):
+        f = [Field(key=k.lower(), value="dummyvalue") for k in fields]
+        f += [Field(key=k.upper(), value=v) for k, v in fields.items()]
+        lib.add(Entry(entry_type=entry_type, key=f"entry{i}", fields=f))
+
+    lib = NormalizeFieldKeys().transform(lib)
+
+    # Match on the message itself so the check depends neither on the logger
+    # name nor on pytest's log line format.
+    duplicate_warnings = [
+        record
+        for record in caplog.records
+        if record.levelname == "WARNING"
+        and re.match(r"NormalizeFieldKeys: in entry 'entry\d+': ", record.getMessage())
+    ]
+    # Every upper-case field collides with its lower-case twin exactly once.
+    assert len(duplicate_warnings) == sum(len(fields) for fields in entries.values())
+
+    for key in lib.entries_dict:
+        assert lib.entries_dict[key] == ref.entries_dict[key]