
Feature/redesign xml reader #18

Merged

merged 26 commits on Jun 26, 2024

Commits (26)
4181965
fix typo
lukavdplas Apr 25, 2024
a97d292
define XMLTag class
lukavdplas Apr 29, 2024
445158b
add _soup_and_metadata_from_source method
lukavdplas Apr 29, 2024
566761c
implement XMLTag for tag_toplevel and tag_entry
lukavdplas Apr 29, 2024
2ad3c03
use XMLTag in XML extractor
lukavdplas Apr 29, 2024
a670a3e
adjust sibling tag argument
lukavdplas Apr 29, 2024
efa860c
implement ParentTag class
lukavdplas Apr 29, 2024
9b45ef1
add FindParentTag class
lukavdplas Apr 29, 2024
d7f169b
add SiblingTag class
lukavdplas Apr 29, 2024
2fc70a0
clearer code for resolving tags
lukavdplas Apr 29, 2024
ad7765c
make find_in_soup return an iterable
lukavdplas Apr 29, 2024
c1069ee
refactor _select method
lukavdplas Apr 29, 2024
2d1cdcc
remove sibling_tag argument to XML
lukavdplas Apr 29, 2024
1bfa2a9
replace transform_soup_func with TransformTag
lukavdplas Apr 29, 2024
173a215
use CurrentTag class instead of None for tags
lukavdplas Apr 29, 2024
ef42f1d
use filename in logging messages
lukavdplas Apr 29, 2024
e4f00f9
clarify handling of external files
lukavdplas Apr 30, 2024
fca8da0
update docstrings
lukavdplas Apr 30, 2024
8cfe5bd
unify _resolve_tag methods
lukavdplas Apr 30, 2024
b0a1084
renaming for clarity
lukavdplas Apr 30, 2024
3519e1c
use * for tags input in XML extractor
lukavdplas Apr 30, 2024
59d176d
improve documentation
lukavdplas Apr 30, 2024
3065d00
more tag classes
lukavdplas Apr 30, 2024
33637da
allow no tags in XML input
lukavdplas May 1, 2024
21b78dc
fix callables for tag_entry / tag_toplevel
lukavdplas May 1, 2024
64b00db
remove stray sentence in docstring
lukavdplas Jun 26, 2024
6 changes: 6 additions & 0 deletions docs/api.md
@@ -35,3 +35,9 @@ __Module:__ `ianalyzer_readers.readers.html`
__Module:__ `ianalyzer_readers.extract`

::: ianalyzer_readers.extract

## XML tags

__Module:__ `ianalyzer_readers.xml_tag`

::: ianalyzer_readers.xml_tag
240 changes: 67 additions & 173 deletions ianalyzer_readers/extract.py
@@ -11,9 +11,11 @@
import re
import logging
import traceback
from typing import Any, Dict, Callable, Union, List, Pattern, Optional
from typing import Any, Dict, Callable, List, Optional, Iterable
logger = logging.getLogger()

from .xml_tag import TagSpecification, resolve_tag_specification


class Extractor(object):
'''
@@ -258,165 +260,108 @@ class XML(Extractor):
setting `external_file`.
- Choose where to start searching. The default searching point is the entry tag
for the document, but you can also start from the top of the document by setting
`toplevel`. For either of these tags, you can set `parent_level` to select
an ancestor to search from. For instance, `parent_level=1` will search from the
parent of the selected tag.
- Choose the query to describe the tag(s) you need. Set `tag`, `recursive`,
`secondary_tag`.
`toplevel`.
- Describe the tag(s) you're looking for as a Tag object. You can also provide multiple
tags to chain queries.
- If you need to return _all_ matching tags, rather than the first match, set
`multiple=True`.
- If needed, set `transform_soup_func` to further modify the matched tag. For
instance, you could use built-in parameters to select a tag, and then add a
`transform_soup_func` to select a child from it with a more complex condition.
- Choose how to extract a value: set `attribute`, `flatten`, or `extract_soup_func`
if needed.
- The extracted value is a string, or the output of `extract_soup_func`. To further
transform it, add a function for `transform`.

Parameters:
tag: Tag to select. Can be:
- a string
- a compiled regular expression (the output of `re.compile`).
- a list of strings or regular expression patterns. In that case, it is read
as a path to select successive children.
- `None`, if the information is in an attribute of the current head of the
tree.
parent_level: If set, the extractor will ascend the tree before looking for the
indicated `tag`. Useful when you need to select information from a tag's
sibling or parent.
attribute: By default, the extractor will extract the text content of the tag.
Set this property to extract the value of an _attribute_ instead.
flatten: When extracting the text content of a tag, `flatten` determines whether
tags:
Tags to select. Each of these can be a `Tag` object, or a callable that
takes the document metadata as input and returns a `Tag`.

If no tags are provided, the extractor will work from the starting tag.

Tags represent a query to select tags from the current tag (e.g. the entry tag of
the document). If you provide multiple tags, they are chained: each Tag query is
applied to the results from the previous one.
attribute:
By default, the extractor will extract the text content of the tag. Set this
property to extract the value of an _attribute_ instead.
flatten:
When extracting the text content of a tag, `flatten` determines whether
the contents of non-text children are flattened. If `False`, only the direct
text content of the tag is extracted. This parameter does nothing if
`attribute=True` is set.
toplevel: If `True`, the extractor will search from the toplevel tag of the XML
text content of the tag is extracted.

This parameter does nothing if `attribute=True` is set.
toplevel:
If `True`, the extractor will search from the toplevel tag of the XML
document, rather than the entry tag for the document.
recursive: If `True`, the extractor will search for `tag` recursively. If `False`,
it only looks for direct children.
multiple: If `False`, the extractor will extract the first matching element. If
multiple:
If `False`, the extractor will extract the first matching element. If
`True`, it will extract a list of all matching elements.
secondary_tag: Adds a condition that the tag must have a sibling tag for which the
text content matches a metadata field or a string. The value is a dictionary,
with two keys: `'tag'` gives the name of the sibling tag. The other key can be
`'exact'`, which gives a string to match, or `'match'`, which gives the name of
a metadata field against which to match the content. If this field has
`external_file=True`, then `'match'` can also give the name of another field in
the reader, which has `external_file=False`.
external_file: This property can be set to look through a secondary XML file
(usually one containing metadata). It requires that the passed metadata have an
`'external_file'` key that specifies the path to the file. This parameter
specifies the toplevel tag and entry level tag for that file; if set, the
extractor will extract this field from the external file instead of the current
source file.
The value for `entry_tag` is only used if the extractor does not have a `se
transform_soup_func: A function to transform the soup directly after the tag is
selected, before further processing (attributes, flattening, etc) to extract
the value from it. Keep in mind that the soup passed could be `None` if no
matching tag is found.
external_file:
If `True`, the extractor will look through a secondary XML file (usually one
containing metadata). It requires that the passed metadata have an
`'external_file'` key that specifies the path to the file.

Note: this option is not supported when this extractor is nested in another
extractor (like `Combined`).
extract_soup_func: A function to extract a value directly from the soup element,
instead of using the content string or an attribute. Keep in mind
that the soup passed could be `None`.
instead of using the content string or an attribute.
`attribute` and `flatten` will do nothing if this property is set.
**kwargs: additional options to pass on to `Extractor`.
'''
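As the parameter list above notes, each entry in `tags` can be a `Tag` object or a callable that maps the document metadata to a `Tag`; the `resolve_tag_specification` function imported at the top of the module presumably normalises these. A minimal sketch of that idea, using a stand-in `Tag` class rather than the library's actual `xml_tag` API:

```python
# Sketch of resolving a tag specification: a spec may be a Tag object,
# or a callable that builds a Tag from the document metadata.
# `Tag` here is an illustrative stand-in, not the library's class.

class Tag:
    def __init__(self, name):
        self.name = name

def resolve_tag_specification(spec, metadata):
    # callables are resolved against the metadata; plain Tags pass through
    if callable(spec):
        return spec(metadata)
    return spec

fixed = Tag('title')
dynamic = lambda metadata: Tag(metadata['entry_tag'])

print(resolve_tag_specification(fixed, {}).name)                       # title
print(resolve_tag_specification(dynamic, {'entry_tag': 'item'}).name)  # item
```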

def __init__(self,
tag: Union[str, Pattern, List[Union[str, Pattern]], None] = [],
parent_level: Optional[int] = None,
*tags: TagSpecification,
attribute: Optional[str] = None,
flatten: bool = False,
toplevel: bool = False,
recursive: bool = False,
multiple: bool = False,
secondary_tag: Dict = {
'tag': None,
'match': None,
'exact': None,
},
external_file: Dict = {
'xml_tag_toplevel': None,
'xml_tag_entry': None
},
transform_soup_func: Optional[Callable] = None,
external_file: bool = False,
extract_soup_func: Optional[Callable] = None,
*nargs,
**kwargs
):

self.tag = tag
self.parent_level = parent_level
self.tags = tags
self.attribute = attribute
self.flatten = flatten
self.toplevel = toplevel
self.recursive = recursive
self.multiple = multiple
self.secondary_tag = secondary_tag if secondary_tag['tag'] != None else None
self.external_file = external_file if external_file['xml_tag_toplevel'] else None
self.transform_soup_func = transform_soup_func
self.external_file = external_file
self.extract_soup_func = extract_soup_func
super().__init__(*nargs, **kwargs)
super().__init__(**kwargs)

def _select(self, soup, metadata=None):
def _select(self, tags: Iterable[TagSpecification], soup: bs4.PageElement, metadata=None):
'''
Return the BeautifulSoup element that matches the constraints of this
extractor.
'''
# If the tag was a path, walk through it before continuing
tag = self.tag
if not tag:
return soup
if isinstance(self.tag, list):
if len(tag) == 0:
return soup
for i in range(0, len(self.tag)-1):
if self.tag[i] == '..':
soup = soup.parent
elif self.tag[i] == '.':
pass
else:
soup = soup.find(self.tag[i], recursive=self.recursive)
if not soup:
return None
tag = self.tag[-1]

# Find and return a tag which is a sibling of a secondary tag
# e.g., we need a category tag associated with a specific id
if self.secondary_tag:
# match metadata field
if self.secondary_tag.get('match') is not None:
match_string = metadata[self.secondary_tag['match']]
elif self.secondary_tag.get('exact') is not None:
match_string = self.secondary_tag['exact']
sibling = soup.find(self.secondary_tag['tag'], string=match_string)
if sibling:
return sibling.parent.find(tag)

# Find and return (all) relevant BeautifulSoup element(s)
if self.multiple:
return soup.find_all(tag, recursive=self.recursive)
elif self.parent_level:
count = 0
while count < self.parent_level:
soup = soup.parent
count += 1
return soup.find(tag, recursive=self.recursive)

if len(tags) > 1:
tag = resolve_tag_specification(tags[0], metadata)
for element in tag.find_in_soup(soup):
for result in self._select(tags[1:], element, metadata):
yield result
elif len(tags) == 1:
tag = resolve_tag_specification(tags[0], metadata)
for result in tag.find_in_soup(soup):
yield result
else:
return soup.find(tag, recursive=self.recursive)
yield soup
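The new `_select` is a recursive generator: it resolves the first tag specification, finds its matches, and applies the remaining specifications to each match. A self-contained sketch of this chaining on a toy tree (`Node` and `Tag` below are illustrative stand-ins, not BeautifulSoup or the library's tag classes):

```python
# Toy illustration of chained tag queries, mirroring the recursive
# `_select` generator above. Not the library's implementation.

class Node:
    def __init__(self, name, children=None, text=''):
        self.name = name
        self.children = children or []
        self.text = text

    def find_all(self, name):
        # depth-first search over all descendants with a matching name
        for child in self.children:
            if child.name == name:
                yield child
            yield from child.find_all(name)

class Tag:
    def __init__(self, name):
        self.name = name

    def find_in_soup(self, node):
        yield from node.find_all(self.name)

def select(tags, node):
    # each Tag query is applied to the results of the previous one
    if not tags:
        yield node
        return
    first, rest = tags[0], tags[1:]
    for element in first.find_in_soup(node):
        yield from select(rest, element)

doc = Node('doc', [
    Node('entry', [Node('title', text='first')]),
    Node('entry', [Node('title', text='second')]),
])

print([n.text for n in select([Tag('entry'), Tag('title')], doc)])
# ['first', 'second']
```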


def _apply(self, soup_top, soup_entry, *nargs, **kwargs):
if 'metadata' in kwargs:
# pass along the metadata to the _select method
soup = self._select(
soup_top if self.toplevel else soup_entry, metadata=kwargs['metadata'])
# Select appropriate BeautifulSoup element
results_generator = self._select(
self.tags,
soup_top if self.toplevel else soup_entry,
metadata=kwargs.get('metadata')
)

if self.multiple:
results = list(results_generator)
return list(map(self._extract, results))
else:
soup = self._select(soup_top if self.toplevel else soup_entry)
if self.transform_soup_func:
if type(soup) == bs4.element.ResultSet:
soup = [self.transform_soup_func(bowl) for bowl in soup]
else:
soup = self.transform_soup_func(soup)
result = next(results_generator, None)
return self._extract(result)
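In `_apply`, the `multiple` flag decides how the generator from `_select` is consumed: exhaust it into a list, or take only the first element with `next(generator, None)`, so an empty result yields `None` rather than raising `StopIteration`. The pattern in isolation:

```python
# The two consumption modes `_apply` uses for the `_select` generator.

def matches():
    # stand-in for the generator returned by `_select`
    yield 'first'
    yield 'second'

# multiple=True: extract every match
all_results = list(matches())

# multiple=False: extract only the first match
first_result = next(matches(), None)

# an empty generator falls back to the default instead of raising
empty_result = next(iter(()), None)

print(all_results, first_result, empty_result)
```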

def _extract(self, soup: Optional[bs4.PageElement]):
if not soup:
return None

@@ -429,7 +374,7 @@ def _apply(self, soup_top, soup_entry, *nargs, **kwargs):
if self.flatten:
return self._flatten(soup)
else:
return self._string(soup)
return self._string(soup)

def _string(self, soup):
'''
@@ -481,57 +426,6 @@ def _attr(self, soup):
]


class FilterAttribute(XML):
'''
This extractor extracts attributes or contents from a BeautifulSoup node.

It is an extension of the `XML` extractor and adds a single parameter,
`attribute_filter`.

Parameters:
attribute_filter: Specify an attribute / value pair by which to select content
**kwargs: additional options to pass on to `XML`.
'''

def __init__(self,
attribute_filter: Dict = {
'attribute': None,
'value': None},
*nargs,
**kwargs
):
super().__init__(*nargs, **kwargs)
self.attribute_filter = attribute_filter

def _select(self, soup, metadata):
'''
Return the BeautifulSoup element that matches the constraints of this
extractor.
'''
# If the tag was a path, walk through it before continuing
tag = self.tag

if isinstance(self.tag, list):
if len(tag) == 0:
return soup
for i in range(0, len(self.tag)-1):
if self.tag[i] == '..':
soup = soup.parent
elif self.tag[i] == '.':
pass
else:
soup = soup.find(self.tag[i], recursive=self.recursive)

if not soup:
return None
tag = self.tag[-1]

# Find and return (all) relevant BeautifulSoup element(s)
if self.multiple:
return soup.find_all(tag, recursive=self.recursive)
else:
return(soup.find(tag, {self.attribute_filter['attribute']: self.attribute_filter['value']}))

class CSV(Extractor):
'''
This extractor extracts values from a list of CSV or spreadsheet rows.
2 changes: 1 addition & 1 deletion ianalyzer_readers/readers/csv.py
@@ -71,7 +71,7 @@ def source2dicts(self, source: Source) -> Iterable[Document]:

# make sure the field size is as big as the system permits
csv.field_size_limit(sys.maxsize)
self._reject_extractors(extract.XML, extract.FilterAttribute)
self._reject_extractors(extract.XML)

if isinstance(source, str):
filename = source
10 changes: 4 additions & 6 deletions ianalyzer_readers/readers/html.py
@@ -22,8 +22,7 @@ class HTMLReader(XMLReader):
It is based on the XMLReader and supports the same options (`tag_toplevel` and
`tag_entry`).

In addition to generic extractor classes, this reader supports the `XML` and
`FilterAttribute` extractors.
In addition to generic extractor classes, this reader supports the `XML` extractor.
'''

def source2dicts(self, source: Source) -> Iterable[Document]:
@@ -55,12 +54,11 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
tag0 = self.tag_toplevel
tag = self.tag_entry

bowl = soup.find(tag0) if tag0 else soup
bowl = tag0.find_next_in_soup(soup) if tag0 else soup

# if there is a entry level tag, with html this is not always the case
# if there is an entry level tag; with html this is not always the case
if bowl and tag:
# Note that this is non-recursive: will only find direct descendants of the top-level tag
for i, spoon in enumerate(bowl.find_all(tag)):
for i, spoon in enumerate(tag.find_in_soup(soup)):
# yield
yield {
field.name: field.extractor.apply(
2 changes: 1 addition & 1 deletion ianalyzer_readers/readers/xlsx.py
@@ -62,7 +62,7 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
are based on the extractor of each field.
'''

self._reject_extractors(extract.XML, extract.FilterAttribute)
self._reject_extractors(extract.XML)

if isinstance(source, str):
filename = source