iso2709.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# ISO-2709 file reader
#
# Copyright (C) 2010 BIREME/PAHO/WHO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 2.1 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from struct import unpack

CR =  '\x0D' # \r
LF =  '\x0A' # \n
IS1 = '\x1F' # ECMA-48 Unit Separator
IS2 = '\x1E' # ECMA-48 Record Separator / ISO-2709 field separator
IS3 = '\x1D' # ECMA-48 Group Separator / ISO-2709 record separator
LABEL_LEN = 24
LABEL_FORMAT = '5s c 4s c c 5s 3s c c c c'
TAG_LEN = 3
DEFAULT_ENCODING = 'ASCII'
SUBFIELD_DELIMITER = '^'

class IsoFile(object):

    def __init__(self, filename, encoding = DEFAULT_ENCODING):
        self.file = open(filename, 'rb')
        self.encoding = encoding

    def __iter__(self):
        return self

    def next(self):
        return IsoRecord(self)

    __next__ = next # Python 3 compatibility

    def read(self, size):
        ''' read and drop all CR and LF characters '''
        # TODO: this is inneficient but works, patches accepted!
        # NOTE: our fixtures include files which have no linebreaks,
        # files with CR-LF linebreaks and files with LF linebreaks
        chunks = []
        count = 0
        while count < size:
            chunk = self.file.read(size-count)
            if len(chunk) == 0:
                break
            chunk = chunk.replace(CR+LF,'')
            if CR in chunk:
                chunk = chunk.replace(CR,'')
            if LF in chunk:
                chunk = chunk.replace(LF,'')
            count += len(chunk)
            chunks.append(chunk)
        return ''.join(chunks)

    def close(self):
        self.file.close()

class IsoRecord(object):
    label_part_names = ('rec_len rec_status impl_codes indicator_len identifier_len'
                        ' base_addr user_defined'
                        # directory map:
                        ' fld_len_len start_len impl_len reserved').split()
    rec_len = 0

    def __init__(self, iso_file=None):
        self.iso_file = iso_file
        self.load_label()
        self.load_directory()
        self.load_fields()

    def __len__(self):
        return self.rec_len

    def load_label(self):
        label = self.iso_file.read(LABEL_LEN)
        if len(label) == 0:
            raise StopIteration
        elif len(label) != 24:
            raise ValueError('Invalid record label: "%s"' % label)
        parts = unpack(LABEL_FORMAT, label)
        for name, part in zip(self.label_part_names, parts):
            if name.endswith('_len') or name.endswith('_addr'):
                part = int(part)
            setattr(self, name, part)

    def show_label(self):
        for name in self.label_part_names:
            print('%15s : %r' % (name, getattr(self, name)))

    def load_directory(self):
        fmt_dir = '3s %ss %ss %ss' % (self.fld_len_len, self.start_len, self.impl_len)
        entry_len = TAG_LEN + self.fld_len_len + self.start_len + self.impl_len
        self.directory = []
        while True:
            char = self.iso_file.read(1)
            if char.isdigit():
                entry = char + self.iso_file.read(entry_len-1)
                entry = Field(* unpack(fmt_dir, entry))
                self.directory.append(entry)
            else:
                break

    def load_fields(self):
        for field in self.directory:
            if self.indicator_len > 0:
                field.indicator = self.iso_file.read(self.indicator_len)
            # XXX: lilacs30.iso has an identifier_len == 2,
            # but we need to ignore it to succesfully read the field contents
            # TODO: find out when to ignore the idenfier_len,
            # or fix the lilacs30.iso fixture
            #
            ##if self.identifier_len > 0: #
            ##    field.identifier = self.iso_file.read(self.identifier_len)
            value = self.iso_file.read(len(field))
            assert len(value) == len(field)
            field.value = value[:-1] # remove trailing field separator
        self.iso_file.read(1) # discard record separator

    def __iter__(self):
        return self

    def next(self):
        for field in self.directory:
            yield(field)

    __next__ = next # Python 3 compatibility

    def dump(self):
        for field in self.directory:
            print('%3s %r' % (field.tag, field.value))

class Field(object):

    def __init__(self, tag, len, start, impl):
        self.tag = tag
        self.len = int(len)
        self.start = int(start)
        self.impl = impl

    def show(self):
        for name in 'tag len start impl'.split():
            print('%15s : %r' % (name, getattr(self, name)))

    def __len__(self):
        return self.len

def test():
    import doctest
    doctest.testfile('iso2709_test.txt')


if __name__=='__main__':
    test()