edsu · edsu · Dec 5, 2019 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -5,8 +5,6 @@ script: python setup.py test
 
 matrix:
   include:
-    - python: "2.6"
-      dist: "trusty"
     - python: "2.7"
     - python: "3.3"
       dist: "trusty"

diff --git a/pymarc/reader.py b/pymarc/reader.py
@@ -6,7 +6,8 @@
 from six import BytesIO, StringIO
 
 from pymarc import Record, Field
-from pymarc.exceptions import RecordLengthInvalid
+from pymarc.exceptions import PymarcException, RecordLengthInvalid
+
 
 class Reader(Iterator):
     """
@@ -58,9 +59,43 @@ class MARCReader(Reader):
     if you have a file in incorrect encode and you know what it is, you can
     try to use your encode in parameter "file_encoding".
 
+    You may want to parse data in a permissive way, so that reading does not
+    stop at the first bad record and as many records as possible are read:
+
+        reader = MARCReader(file('file.dat'), permissive=True)
+
+    In that case the iterator returns ``None`` for each record it cannot parse.
+    This gives you full control over how to react: the raw data of the record
+    is available as ``reader.current_chunk`` and the exception that was caught
+    as ``reader.current_exception``:
+
+        reader = MARCReader(file('file.dat'), permissive=True)
+        for record in reader:
+            if record is None:
+                print(
+                    "Current chunk: ",
+                    reader.current_chunk,
+                    " was ignored because the following exception raised: ",
+                    reader.current_exception
+                )
+            else:
+                # do something with record
+
     """
+    _current_chunk = None
+    _current_exception = None
+
+    @property
+    def current_chunk(self):
+        """Raw data of the record most recently read by the iterator.
+
+        ``None`` until the first record has been read.
+        """
+        return self._current_chunk
+
+    @property
+    def current_exception(self):
+        """Exception caught while parsing the current chunk, or ``None``.
+
+        Only set in permissive mode; reset to ``None`` before each record
+        is parsed.
+        """
+        return self._current_exception
+
     def __init__(self, marc_target, to_unicode=True, force_utf8=False,
-        hide_utf8_warnings=False, utf8_handling='strict',file_encoding = 'iso8859-1'):
+        hide_utf8_warnings=False, utf8_handling='strict',file_encoding = 'iso8859-1',
+        permissive=False):
         """
         The constructor to which you can pass either raw marc or a file-like
         object. Basically the argument you pass in should be raw MARC in
@@ -72,6 +107,7 @@ def __init__(self, marc_target, to_unicode=True, force_utf8=False,
         self.hide_utf8_warnings = hide_utf8_warnings
         self.utf8_handling = utf8_handling
         self.file_encoding = file_encoding
+        self.permissive = permissive
         if (hasattr(marc_target, "read") and callable(marc_target.read)):
             self.file_handle = marc_target
         else:
@@ -99,12 +135,21 @@ def __next__(self):
 
         chunk = self.file_handle.read(length - 5)
         chunk = first5 + chunk
-        record = Record(chunk,
-                        to_unicode=self.to_unicode,
-                        force_utf8=self.force_utf8,
-                        hide_utf8_warnings=self.hide_utf8_warnings,
-                        utf8_handling=self.utf8_handling,
-                        file_encoding = self.file_encoding)
+        self._current_chunk = chunk
+        self._current_exception = None
+        try:
+            record = Record(chunk,
+                            to_unicode=self.to_unicode,
+                            force_utf8=self.force_utf8,
+                            hide_utf8_warnings=self.hide_utf8_warnings,
+                            utf8_handling=self.utf8_handling,
+                            file_encoding = self.file_encoding)
+        except (PymarcException, UnicodeDecodeError, ValueError) as ex:
+            if self.permissive:
+                self._current_exception = ex
+                record = None
+            else:
+                raise ex
         return record
 
 def map_records(f, *files):

diff --git a/setup.py b/setup.py
@@ -37,5 +37,5 @@
     long_description_content_type = "text/markdown",
     classifiers = list(filter(None, classifiers.split('\n'))),
     test_suite = 'test',
-    python_requires='>=2.6, !=3.0.*, !=3.1.*, !=3.2.*',
+    python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*',
 )
diff --git a/test/bad_records.mrc b/test/bad_records.mrc
@@ -0,0 +1 @@
+00127     2200037   450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127     2299937   450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127     2200000   450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00128     2200038   4500245008900000101aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00128     2200038   4500245ù0890000101aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00127     22f0037   450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.00026     2200025   450000127     2200037   450024500890000001aThe pragmatic programmer : bfrom journeyman to master /cAndrew Hunt, David Thomas.
diff --git a/test/reader.py b/test/reader.py
@@ -1,12 +1,32 @@
+# -*- coding: utf-8 -*-
 import re
 import unittest
 
 import six
 import pymarc
 
-from six.moves.urllib.request import urlopen
 
-class MARCReaderFileTest(unittest.TestCase):
+class MARCReaderBaseTest(object):
+    """
+    Reader tests shared by the file-based and string-based test cases.
+
+    Meant to be mixed into a ``unittest.TestCase`` that provides
+    ``self.reader``.
+    """
+
+    def test_iterator(self):
+        # the reader under test is expected to yield exactly 10 records
+        count = 0
+        for record in self.reader:
+            count += 1
+        self.assertEqual(
+            count, 10,
+            'found expected number of MARC21 records')
+
+    def test_string(self):
+        # basic test of stringification
+        starts_with_leader = re.compile('^=LDR')
+        # raw string: '\d' in a plain literal is an invalid escape sequence
+        has_numeric_tag = re.compile(r'\n=\d\d\d ')
+        for record in self.reader:
+            text = str(record)
+            self.assertTrue(starts_with_leader.search(text), 'got leader')
+            self.assertTrue(has_numeric_tag.search(text), 'got a tag')
+
+
+class MARCReaderFileTest(unittest.TestCase, MARCReaderBaseTest):
     """
     Tests for the pymarc.MARCReader class which provides iterator
     based access to a MARC file.
@@ -19,15 +39,9 @@ def tearDown(self):
         if self.reader:
             self.reader.close()
 
-    def test_iterator(self):
-        count = 0
-        for record in self.reader:
-            count += 1
-        self.assertEqual(count, 10,
-                'found expected number of MARC21 records')
-
     def test_map_records(self):
         self.count = 0
+
         def f(r):
             self.count += 1
         with open('test/test.dat', 'rb') as fh:
@@ -36,6 +50,7 @@ def f(r):
 
     def test_multi_map_records(self):
         self.count = 0
+
         def f(r):
             self.count += 1
         fh1 = open('test/test.dat', 'rb')
@@ -45,27 +60,20 @@ def f(r):
         fh1.close()
         fh2.close()
 
-    def test_string(self):
-        ## basic test of stringification
-        starts_with_leader = re.compile('^=LDR')
-        has_numeric_tag = re.compile('\n=\d\d\d ')
-        for record in self.reader:
-            text = str(record)
-            self.assertTrue(starts_with_leader.search(text), 'got leader')
-            self.assertTrue(has_numeric_tag.search(text), 'got a tag')
-
     def disabled_test_codecs(self):
         import codecs
         with codecs.open('test/test.dat', encoding='utf-8') as fh:
             reader = pymarc.MARCReader(fh)
             record = next(reader)
-            self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
+            self.assertEqual(
+                record['245']['a'], u'ActivePerl with ASP and ADO /')
 
     def test_bad_subfield(self):
         with open('test/bad_subfield_code.dat', 'rb') as fh:
             reader = pymarc.MARCReader(fh)
             record = next(reader)
-            self.assertEqual(record['245']['a'], u'ActivePerl with ASP and ADO /')
+            self.assertEqual(
+                record['245']['a'], u'ActivePerl with ASP and ADO /')
 
     def test_bad_indicator(self):
         with open('test/bad_indicator.dat', 'rb') as fh:
@@ -82,8 +90,17 @@ def test_regression_45(self):
             self.assertEqual(record['752']['b'], 'Kostroma Oblast')
             self.assertEqual(record['752']['d'], 'Kostroma')
 
+    def test_strict_mode(self):
+        # default (non-permissive) reader: iterating bad_records.mrc must
+        # raise on a bad record instead of yielding None for it
+        with self.assertRaises(pymarc.exceptions.BaseAddressInvalid), \
+                open('test/bad_records.mrc', 'rb') as fh:
+            reader = pymarc.MARCReader(fh)
+            for record in reader:
+                self.assertIsNotNone(reader.current_chunk)
+
+    # inherit same tests from MARCReaderBaseTest
+
 
-class MARCReaderStringTest(MARCReaderFileTest):
+class MARCReaderStringTest(unittest.TestCase, MARCReaderBaseTest):
 
     def setUp(self):
         fh = open('test/test.dat')
@@ -92,13 +109,79 @@ def setUp(self):
 
         self.reader = pymarc.reader.MARCReader(six.b(raw))
 
-    # inherit same tests from MARCReaderTestFile
+    # inherit same tests from MARCReaderBaseTest
+
+
+class MARCReaderFilePermissiveTest(unittest.TestCase):
+    """
+    Tests for the pymarc.MARCReader class which provides iterator
+    based access to a MARC file in a permissive way
+
+    """
+    def setUp(self):
+        # NOTE(review): the file object is not closed here; presumably
+        # reader.close() in tearDown takes care of it -- confirm
+        self.reader = pymarc.MARCReader(
+            open('test/bad_records.mrc', 'rb'), permissive=True)
+
+    def tearDown(self):
+        if self.reader:
+            self.reader.close()
+
+    def test_permissive_mode(self):
+        """In bad_records.mrc we expect following records in the given order
+
+        * working record
+        * BaseAddressInvalid (base_address (99937) >= len(marc))
+        * BaseAddressNotFound (base_address (00000) <= 0)
+        * RecordDirectoryInvalid (len(directory) % DIRECTORY_ENTRY_LEN != 0)
+        * UnicodeDecodeError (directory with non ascii code (245ù0890000))
+        * ValueError (base_address with literal (f0037))
+        * NoFieldsFound (00026 record: leader followed by an empty directory)
+        * last record should be ok
+        """
+        # one entry per record in the file, in order; None means the record
+        # is expected to parse successfully
+        expected_exceptions = [
+            None,
+            pymarc.exceptions.BaseAddressInvalid,
+            pymarc.exceptions.BaseAddressNotFound,
+            pymarc.exceptions.RecordDirectoryInvalid,
+            UnicodeDecodeError,
+            ValueError,
+            pymarc.exceptions.NoFieldsFound,
+            None,
+        ]
+        for exception_type in expected_exceptions:
+            record = next(self.reader)
+            # the raw chunk is exposed whether or not parsing succeeded
+            self.assertIsNotNone(self.reader.current_chunk)
+            if exception_type is None:
+                self.assertIsNotNone(record)
+                self.assertIsNone(self.reader.current_exception)
+                self.assertEqual(
+                    record["245"]["a"], 'The pragmatic programmer : ')
+                self.assertEqual(
+                    record["245"]["b"], 'from journeyman to master /')
+                self.assertEqual(
+                    record["245"]["c"], 'Andrew Hunt, David Thomas.')
+            else:
+                # permissive mode: a bad record is yielded as None and the
+                # caught exception is kept on the reader
+                self.assertIsNone(
+                    record,
+                    "expected parsing error with the following "
+                    "exception %r" % exception_type
+                )
+                self.assertTrue(
+                    isinstance(self.reader.current_exception, exception_type),
+                    "expected %r exception, "
+                    "received: %r" % (
+                        exception_type, self.reader.current_exception)
+                )
+
 
 def suite():
+    # Build one suite from the 'test'-prefixed methods of every reader test
+    # case, including the permissive-mode tests.
+    # NOTE(review): unittest.makeSuite is deprecated in recent Python
+    # versions -- consider TestLoader.loadTestsFromTestCase.
     file_suite = unittest.makeSuite(MARCReaderFileTest, 'test')
     string_suite = unittest.makeSuite(MARCReaderStringTest, 'test')
-    test_suite = unittest.TestSuite((file_suite, string_suite))
+    permissive_file_suite = unittest.makeSuite(
+        MARCReaderFilePermissiveTest, 'test')
+    test_suite = unittest.TestSuite(
+        (file_suite, string_suite, permissive_file_suite))
     return test_suite
 
+
 if __name__ == '__main__':
     unittest.main()