Skip to content
This repository has been archived by the owner on Feb 4, 2020. It is now read-only.

#109 permissive marcreader #144

Merged
merged 7 commits into from
Dec 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ script: python setup.py test

matrix:
include:
- python: "2.6"
dist: "trusty"
- python: "2.7"
- python: "3.3"
dist: "trusty"
Expand Down
61 changes: 53 additions & 8 deletions pymarc/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from six import BytesIO, StringIO

from pymarc import Record, Field
from pymarc.exceptions import RecordLengthInvalid
from pymarc.exceptions import PymarcException, RecordLengthInvalid


class Reader(Iterator):
"""
Expand Down Expand Up @@ -58,9 +59,43 @@ class MARCReader(Reader):
if you have a file in an incorrect encoding and you know what it is, you can
try to pass your encoding in the parameter "file_encoding".

You may want to parse data in a permissive way to avoid stopping on the first
bad record and to read as many records as possible:

reader = MARCReader(file('file.dat'), permissive=True)

In such a case ``None`` is returned by the iterator for each record that
cannot be parsed. This gives you full control to implement the expected
behavior: the caught exception is available under
``reader.current_exception`` and the offending chunk of data under
``reader.current_chunk``:

reader = MARCReader(file('file.dat'), permissive=True)
for record in reader:
if record is None:
print(
"Current chunk: ",
reader.current_chunk,
" was ignored because the following exception raised: ",
reader.current_exception
)
else:
# do something with record

"""
# Raw data of the record most recently read by ``__next__``; stays ``None``
# until a first record has been read.
_current_chunk = None
# Exception caught while parsing the current chunk in permissive mode;
# ``__next__`` resets it to ``None`` before each record is parsed.
_current_exception = None

# Read-only accessor for the raw data of the current record.
@property
def current_chunk(self):
return self._current_chunk

# Read-only accessor for the parsing error of the current record, if any.
@property
def current_exception(self):
return self._current_exception

def __init__(self, marc_target, to_unicode=True, force_utf8=False,
hide_utf8_warnings=False, utf8_handling='strict',file_encoding = 'iso8859-1'):
hide_utf8_warnings=False, utf8_handling='strict',file_encoding = 'iso8859-1',
permissive=False):
"""
The constructor to which you can pass either raw marc or a file-like
object. Basically the argument you pass in should be raw MARC in
Expand All @@ -72,6 +107,7 @@ def __init__(self, marc_target, to_unicode=True, force_utf8=False,
self.hide_utf8_warnings = hide_utf8_warnings
self.utf8_handling = utf8_handling
self.file_encoding = file_encoding
self.permissive = permissive
if (hasattr(marc_target, "read") and callable(marc_target.read)):
self.file_handle = marc_target
else:
Expand Down Expand Up @@ -99,12 +135,21 @@ def __next__(self):

chunk = self.file_handle.read(length - 5)
chunk = first5 + chunk
record = Record(chunk,
to_unicode=self.to_unicode,
force_utf8=self.force_utf8,
hide_utf8_warnings=self.hide_utf8_warnings,
utf8_handling=self.utf8_handling,
file_encoding = self.file_encoding)
self._current_chunk = chunk
self._current_exception = None
try:
record = Record(chunk,
to_unicode=self.to_unicode,
force_utf8=self.force_utf8,
hide_utf8_warnings=self.hide_utf8_warnings,
utf8_handling=self.utf8_handling,
file_encoding = self.file_encoding)
except (PymarcException, UnicodeDecodeError, ValueError) as ex:
if self.permissive:
self._current_exception = ex
record = None
else:
raise ex
return record

def map_records(f, *files):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@
long_description_content_type = "text/markdown",
classifiers = list(filter(None, classifiers.split('\n'))),
test_suite = 'test',
python_requires='>=2.6, !=3.0.*, !=3.1.*, !=3.2.*',
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*',
)
1 change: 1 addition & 0 deletions test/bad_records.mrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
00127 2200037 450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127 2299937 450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127 2200000 450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00128 2200038 4500245008900000101aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00128 2200038 4500245ù0890000101aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127 22f0037 450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00026 2200025 450000127 2200037 450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.
129 changes: 106 additions & 23 deletions test/reader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
# -*- coding: utf-8 -*-
import re
import unittest

import six
import pymarc

from six.moves.urllib.request import urlopen

class MARCReaderFileTest(unittest.TestCase):
class MARCReaderBaseTest(object):
    """
    Reader checks shared by the file-based and string-based test cases.

    This is a mixin: the concrete class must also derive from
    ``unittest.TestCase`` (for the ``assert*`` helpers) and must assign
    ``self.reader`` -- an iterable of records -- before each test runs.
    """

    def test_iterator(self):
        """Iterating the reader yields exactly 10 records."""
        # Only the number of records matters here, not their content.
        count = sum(1 for _ in self.reader)
        self.assertEqual(
            count, 10,
            'found expected number of MARC21 records')

    def test_string(self):
        """Each record stringifies to a leader line followed by tag lines."""
        # Raw strings keep ``\d`` a regex escape instead of an invalid
        # string escape, which newer Python versions warn about.
        starts_with_leader = re.compile(r'^=LDR')
        has_numeric_tag = re.compile(r'\n=\d\d\d ')
        for record in self.reader:
            text = str(record)
            self.assertTrue(starts_with_leader.search(text), 'got leader')
            self.assertTrue(has_numeric_tag.search(text), 'got a tag')


class MARCReaderFileTest(unittest.TestCase, MARCReaderBaseTest):
"""
Tests for the pymarc.MARCReader class which provides iterator
based access to a MARC file.
Expand All @@ -19,15 +39,9 @@ def tearDown(self):
if self.reader:
self.reader.close()

def test_iterator(self):
count = 0
for record in self.reader:
count += 1
self.assertEqual(count, 10,
'found expected number of MARC21 records')

def test_map_records(self):
self.count = 0

def f(r):
self.count += 1
with open('test/test.dat', 'rb') as fh:
Expand All @@ -36,6 +50,7 @@ def f(r):

def test_multi_map_records(self):
self.count = 0

def f(r):
self.count += 1
fh1 = open('test/test.dat', 'rb')
Expand All @@ -45,27 +60,20 @@ def f(r):
fh1.close()
fh2.close()

def test_string(self):
## basic test of stringification
starts_with_leader = re.compile('^=LDR')
has_numeric_tag = re.compile('\n=\d\d\d ')
for record in self.reader:
text = str(record)
self.assertTrue(starts_with_leader.search(text), 'got leader')
self.assertTrue(has_numeric_tag.search(text), 'got a tag')

def disabled_test_codecs(self):
import codecs
with codecs.open('test/test.dat', encoding='utf-8') as fh:
reader = pymarc.MARCReader(fh)
record = next(reader)
self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
self.assertEqual(
record['245']['a'], u'ActivePerl with ASP and ADO /')

def test_bad_subfield(self):
with open('test/bad_subfield_code.dat', 'rb') as fh:
reader = pymarc.MARCReader(fh)
record = next(reader)
self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
self.assertEqual(
record['245']['a'], u'ActivePerl with ASP and ADO /')

def test_bad_indicator(self):
with open('test/bad_indicator.dat', 'rb') as fh:
Expand All @@ -82,8 +90,17 @@ def test_regression_45(self):
self.assertEqual(record['752']['b'], 'Kostroma Oblast')
self.assertEqual(record['752']['d'], 'Kostroma')

# In the default (non-permissive) mode a malformed record must propagate its
# exception; reading bad_records.mrc is expected to end in BaseAddressInvalid.
def test_strict_mode(self):
with self.assertRaises(pymarc.exceptions.BaseAddressInvalid), \
open('test/bad_records.mrc', 'rb') as fh:
reader = pymarc.MARCReader(fh)
for record in reader:
# Every record yielded before the failure must have its chunk recorded.
self.assertIsNotNone(reader.current_chunk)

# inherit same tests from MARCReaderBaseTest


class MARCReaderStringTest(MARCReaderFileTest):
class MARCReaderStringTest(unittest.TestCase, MARCReaderBaseTest):

def setUp(self):
fh = open('test/test.dat')
Expand All @@ -92,13 +109,79 @@ def setUp(self):

self.reader = pymarc.reader.MARCReader(six.b(raw))

# inherit same tests from MARCReaderTestFile
# inherit same tests from MARCReaderBaseTest


class MARCReaderFilePermissiveTest(unittest.TestCase):
    """
    Tests for the pymarc.MARCReader class which provides iterator
    based access to a MARC file in a permissive way.

    In permissive mode a record that cannot be parsed is yielded as
    ``None`` instead of raising; the failure is then exposed through
    ``reader.current_exception`` and the raw data through
    ``reader.current_chunk``.
    """

    def setUp(self):
        """Open the fixture of deliberately broken records permissively."""
        self.reader = pymarc.MARCReader(
            open('test/bad_records.mrc', 'rb'), permissive=True)

    def tearDown(self):
        """Close the reader created in ``setUp``."""
        if self.reader:
            self.reader.close()

    def test_permissive_mode(self):
        """In bad_records.mrc we expect following records in the given order

        * working record
        * BaseAddressInvalid (base_address (99937) >= len(marc))
        * BaseAddressNotFound (base_address (00000) <= 0)
        * RecordDirectoryInvalid (len(directory) % DIRECTORY_ENTRY_LEN != 0)
        * UnicodeDecodeError (directory with non ascii code (245ù0890000))
        * ValueError (base_address with literal (f0037))
        * NoFieldsFound (record of length 26 that contains no fields)
        * last record should be ok
        """
        # ``None`` marks a record that must parse cleanly; any other entry
        # is the exception type the reader must have caught for that record.
        expected_exceptions = [
            None,
            pymarc.exceptions.BaseAddressInvalid,
            pymarc.exceptions.BaseAddressNotFound,
            pymarc.exceptions.RecordDirectoryInvalid,
            UnicodeDecodeError,
            ValueError,
            pymarc.exceptions.NoFieldsFound,
            None,
        ]
        for exception_type in expected_exceptions:
            record = next(self.reader)
            # The raw chunk is recorded whether or not parsing succeeded.
            self.assertIsNotNone(self.reader.current_chunk)
            if exception_type is None:
                self.assertIsNotNone(record)
                self.assertIsNone(self.reader.current_exception)
                self.assertEqual(
                    record["245"]["a"], 'The pragmatic programmer : ')
                self.assertEqual(
                    record["245"]["b"], 'from journeyman to master /')
                self.assertEqual(
                    record["245"]["c"], 'Andrew Hunt, David Thomas.')
            else:
                self.assertIsNone(
                    record,
                    "expected parsing error with the following "
                    "exception %r" % exception_type
                )
                self.assertIsInstance(
                    self.reader.current_exception, exception_type,
                    "expected %r exception, "
                    "received: %r" % (
                        exception_type, self.reader.current_exception)
                )


def suite():
file_suite = unittest.makeSuite(MARCReaderFileTest, 'test')
string_suite = unittest.makeSuite(MARCReaderStringTest, 'test')
test_suite = unittest.TestSuite((file_suite, string_suite))
permissive_file_suite = unittest.makeSuite(
MARCReaderFilePermissiveTest, 'test')
test_suite = unittest.TestSuite(
(file_suite, string_suite, permissive_file_suite))
return test_suite


if __name__ == '__main__':
unittest.main()