add sentence splitter & tsv-converter

UB-Mannheim · Sep 9, 2022 · 44f58e3 · 44f58e3
1 parent f5dd342
commit 44f58e3
Show file tree

Hide file tree

Showing 5 changed files with 193 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,7 @@
 # Blatt
 
+[![PyPI version](https://badge.fury.io/py/blatt.svg)](https://badge.fury.io/py/blatt)
+
 NLP-helper for OCR-ed pages in [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) format.
 
 ## Table of contents
@@ -22,9 +24,36 @@ pip install .
 
 ## How to use
 
-### PAGE XML reader, hyphen remover and converter
+### Page object
+
+On initialization, the Page class reads the file `PAGEXML` and stores TextRegions, TextLines and Baseline Coordinates in the Page object `p`.
+```
+from blatt import Page
+p = Page(PAGEXML)
+```
+
+The Page-object stores unprocessed and processed TextLines as attributes.
+```
+print(p)
+[('root', 2),
+ ('namespace', 63),
+ ('filename', 24),
+ ('text_regions_xml', 38),
+ ('text_lines_xml', 260),
+ ('text_regions', 260),
+ ('text_lines', 260),
+ ('baselines', 3651),
+ ('text_with_linebreaks', 12111),
+ ('text_without_linebreaks', 11979),
+ ('sentences', 102),
+ ('x_baselines', 3651),
+ ('y_baselines', 3651),
+ ('center_baseline', 2)]
+```
+
+### Hyphen remover & converter to_txt()
 
-On initiation the Page-class reads the file `PAGEXML` and stores TextRegions, TextLines and Baseline Coordinates in the Page-object `p`. The plain text can be saved to `TXT`:
+The plain text can be saved to `TXT`:
 ```
 from blatt import Page
 p = Page(PAGEXML)
@@ -33,33 +62,63 @@ p.to_txt(TXT)
 
 By default it saves the plain text without line breaks (the hyphens '-', '-', '⹀' and '⸗' are removed and the corresponding words are merged). If you need line breaks, use `p.to_txt(TXT, linebreak=True)`.
 
+### Sentence splitter & converter to_tsv()
+
+The TextLines or sentences can be saved to `TSV`:
+```
+from blatt import Page
+p = Page(PAGEXML)
+p.to_tsv(TSV)
+```
+
+By default it saves TextLines, TextRegionID, TextLineID and Coordinates to TSV. If you use `p.to_tsv(TSV, sentence=True)`, it saves sentences (not lines!) into separate lines of TSV. The sentences are split from the plain text without hyphens using the [SegTok](https://github.com/fnl/segtok) library.
+
 ## Command Line Interface
 
 ```
-% blatt
+% blatt        
 Usage: blatt [OPTIONS] COMMAND [ARGS]...
 
-  BLATT CLI: NLP-helper for OCR-ed pages in PAGE XML format. To get help for a
+  Blatt CLI: NLP-helper for OCR-ed pages in PAGE XML format. To get help for a
   particular COMMAND, use `blatt COMMAND -h`.
 
 Options:
-  --help  Show this message and exit.
+  -h, --help  Show this message and exit.
 
 Commands:
-  convert  Converts PAGE XML files to plain text TXT files
+  to_tsv  Converts PAGE XML files to TSV files with TextLines or sentences
+  to_txt  Converts PAGE XML files to TXT files with or without line breaks &
+          hyphens
 ```
 
 ```
-% blatt convert -h
-Usage: blatt convert [OPTIONS] PAGE_FOLDER TEXT_FOLDER
+% blatt to_txt -h
+Usage: blatt to_txt [OPTIONS] PAGE_FOLDER TEXT_FOLDER
 
-  blatt convert: converts all PAGE XML files in PAGE_FOLDER to TXT files in
-  TEXT_FOLDER.
+  blatt to_txt: converts all PAGE XML files in PAGE_FOLDER to TXT files
+  with/without hyphens in TEXT_FOLDER.
 
 Options:
   -lb, --linebreak BOOLEAN  If linebreak==False, it removes hyphens at the end
                             of lines and merges the lines without line breaks.
                             Otherwise, it merges the lines using line breaks.
                             [default: False]
   -h, --help                Show this message and exit.
-```
+```
+
+```
+% blatt to_tsv -h
+Usage: blatt to_tsv [OPTIONS] PAGE_FOLDER TSV_FOLDER
+
+  blatt to_tsv: converts all PAGE XML files in PAGE_FOLDER to TSV files in
+  TSV_FOLDER.
+
+Options:
+  -s, --sentence BOOLEAN  If sentence==False, it saves TextLines,
+                          TextRegionID, TextLineID and Coordinates to TSV.
+                          Otherwise, it saves sentences (not lines!) into
+                          separate lines of TSV. The sentences are split from
+                          the plain text without hyphens using the SegTok
+                          library.  [default: False]
+  -h, --help              Show this message and exit.
+```
diff --git a/blatt/cli.py b/blatt/cli.py
@@ -6,13 +6,15 @@
 CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
 
 
-@click.group()
+@click.group(context_settings=CONTEXT_SETTINGS)
 def cli():
     """Blatt CLI: NLP-helper for OCR-ed pages in PAGE XML format. To get help for a particular COMMAND, use `blatt
     COMMAND -h`. """
 
 
-@cli.command('convert', short_help='Converts PAGE XML files to plain text TXT files', context_settings=CONTEXT_SETTINGS)
+@cli.command('to_txt',
+             short_help='Converts PAGE XML files to TXT files with or without line breaks & hyphens',
+             context_settings=CONTEXT_SETTINGS)
 @click.option('--linebreak',
               '-lb',
               type=bool,
@@ -22,14 +24,36 @@ def cli():
                    "breaks. Otherwise, it merges the lines using line breaks.")
 @click.argument('page_folder', type=click.Path(exists=True))
 @click.argument('text_folder', type=click.Path())
-def convert(page_folder, text_folder, linebreak):
-    """blatt convert: converts all PAGE XML files in PAGE_FOLDER to TXT files in TEXT_FOLDER."""
+def to_txt(page_folder, text_folder, linebreak):  # parameter names must match the click.argument names above
+    """blatt to_txt: converts all PAGE XML files in PAGE_FOLDER to TXT files with/without hyphens in TEXT_FOLDER."""
     file_paths = Path(page_folder).glob('*.xml')
     for file_path in tqdm(file_paths):
         output_file = Path(text_folder, file_path.stem + '.txt').as_posix()
         p = Page(file_path.as_posix())
         p.to_txt(output_file, linebreak)
 
 
+@cli.command('to_tsv',
+             short_help='Converts PAGE XML files to TSV files with TextLines or sentences',
+             context_settings=CONTEXT_SETTINGS)
+@click.option('--sentence',
+              '-s',
+              type=bool,
+              default=False,
+              show_default=True,
+              help="If sentence==False, it saves TextLines, TextRegionID, TextLineID and Coordinates to TSV. "
+                   "Otherwise, it saves sentences (not lines!) into separate lines of TSV. The sentences are split " 
+                   "from the plain text without hyphens using the SegTok library.")
+@click.argument('page_folder', type=click.Path(exists=True))  # input folder must already exist
+@click.argument('tsv_folder', type=click.Path())  # no exists=True here: the output folder is not checked by click
+def to_tsv(page_folder, tsv_folder, sentence):
+    """blatt to_tsv: converts all PAGE XML files in PAGE_FOLDER to TSV files in TSV_FOLDER."""
+    file_paths = Path(page_folder).glob('*.xml')  # non-recursive: only *.xml files directly in PAGE_FOLDER
+    for file_path in tqdm(file_paths):
+        output_file = Path(tsv_folder, file_path.stem + '.tsv').as_posix()  # same stem as the input, '.tsv' suffix
+        p = Page(file_path.as_posix())
+        p.to_tsv(output_file, sentence)
+
+
 if __name__ == '__main__':
     cli()
diff --git a/blatt/page.py b/blatt/page.py
@@ -2,13 +2,15 @@
 from pprint import pformat
 from typing import List, Tuple
 from pathlib import Path
+from segtok.segmenter import split_multi
+import csv
 
 
 class Page:
     """
-    Class Page: Reads PAGE-xml file. Stores TextRegions, TextLines and Baseline coordinates.
+    Class Page: Reads PAGE XML file. Stores TextRegions, TextLines and Baseline coordinates.
     Removes hyphens from the text lines. Computes the coordinates of the mid-range average of baseline points.
-    Saves plain text with or without line breaks to file.
+    Saves plain text with or without line breaks to TXT file. Splits plain text into sentences and saves it as TSV.
     """
 
     def __init__(self, filename: Path = ''):
@@ -22,6 +24,7 @@ def __init__(self, filename: Path = ''):
             self._parse_page_xml()
             self.text_with_linebreaks = '\n'.join(self.text_lines)
             self.text_without_linebreaks = self.remove_hyphens(self.text_lines)
+            self.sentences = self.split_sentences(self.text_without_linebreaks)
             self.baselines: List
             self.x_baselines: List
             self.y_baselines: List
@@ -39,7 +42,7 @@ def __str__(self):
 
     @staticmethod
     def _open_page_xml(filename: Path = '') -> Tuple[ET.Element, ET._ElementTree, str]:
-        """Opens a PAGE-xml file and returns its tree, root and namespace."""
+        """Opens a PAGE XML file and returns its tree, root and namespace."""
         tree = ET.parse(filename)
         root = tree.getroot()
         namespace = tree.xpath('namespace-uri(.)')
@@ -96,6 +99,11 @@ def remove_hyphens(lines: list) -> str:
                     text += ' ' + lines[i + 1]
         return text
 
+    @staticmethod
+    def split_sentences(text: str) -> List[str]:
+        """Returns the list of sentences that SegTok's split_multi (https://github.com/fnl/segtok) finds in text."""
+        return list(split_multi(text))
+
     def to_txt(self, filename: Path, linebreak: bool = False):
         """Saves TextLines as plain text into filename. If linebreak==True, the lines are separated by line breaks.
         Otherwise, the plain text contains no line breaks and hyphens [this is default]."""
@@ -104,3 +112,15 @@ def to_txt(self, filename: Path, linebreak: bool = False):
                 f.write(self.text_with_linebreaks)
             else:
                 f.write(self.text_without_linebreaks)
+
+    def to_tsv(self, filename: Path, sentence: bool = False):
+        """If sentence==False [default], it saves TextLines, TextRegionID, TextLineID and Coordinates to TSV.
+        Otherwise, it saves sentences (not lines!) into separate lines of TSV. The sentences are split from the plain
+        text without hyphens using the SegTok library. The file is written as UTF-8, independent of the locale."""
+        with open(filename, 'w', newline='', encoding='utf-8') as f:  # newline='' leaves line endings to csv
+            tsv = csv.writer(f, delimiter='\t')
+            if sentence:
+                # One single-column row per sentence, so all rows share the writer's line terminator.
+                tsv.writerows([s] for s in self.sentences)
+            else:
+                tsv.writerows(self.text_regions)
diff --git a/docs/index.md b/docs/index.md
@@ -8,6 +8,8 @@ permalink: /
 
 # Home
 
+[![PyPI version](https://badge.fury.io/py/blatt.svg)](https://badge.fury.io/py/blatt)
+
 NLP-helper for OCR-ed pages in [PAGE XML](https://github.com/PRImA-Research-Lab/PAGE-XML) format.
 
 ## Table of contents
@@ -30,9 +32,36 @@ pip install .
 
 ## How to use
 
-### PAGE XML reader, hyphen remover and converter
+### Page object
+
+On initialization, the Page class reads the file `PAGEXML` and stores TextRegions, TextLines and Baseline Coordinates in the Page object `p`.
+```
+from blatt import Page
+p = Page(PAGEXML)
+```
+
+The Page-object stores unprocessed and processed TextLines as attributes.
+```
+print(p)
+[('root', 2),
+ ('namespace', 63),
+ ('filename', 24),
+ ('text_regions_xml', 38),
+ ('text_lines_xml', 260),
+ ('text_regions', 260),
+ ('text_lines', 260),
+ ('baselines', 3651),
+ ('text_with_linebreaks', 12111),
+ ('text_without_linebreaks', 11979),
+ ('sentences', 102),
+ ('x_baselines', 3651),
+ ('y_baselines', 3651),
+ ('center_baseline', 2)]
+```
+
+### Hyphen remover & converter to_txt()
 
-On initiation the Page-class reads the file `PAGEXML` and stores TextRegions, TextLines and Baseline Coordinates in the Page-object `p`. The plain text can be saved to `TXT`:
+The plain text can be saved to `TXT`:
 ```
 from blatt import Page
 p = Page(PAGEXML)
@@ -41,28 +70,41 @@ p.to_txt(TXT)
 
 By default it saves the plain text without line breaks (the hyphens '-', '-', '⹀' and '⸗' are removed and the corresponding words are merged). If you need line breaks, use `p.to_txt(TXT, linebreak=True)`.
 
+### Sentence splitter & converter to_tsv()
+
+The TextLines or sentences can be saved to `TSV`:
+```
+from blatt import Page
+p = Page(PAGEXML)
+p.to_tsv(TSV)
+```
+
+By default it saves TextLines, TextRegionID, TextLineID and Coordinates to TSV. If you use `p.to_tsv(TSV, sentence=True)`, it saves sentences (not lines!) into separate lines of TSV. The sentences are split from the plain text without hyphens using the [SegTok](https://github.com/fnl/segtok) library.
+
 ## Command Line Interface
 
 ```
-% blatt
+% blatt        
 Usage: blatt [OPTIONS] COMMAND [ARGS]...
 
-  BLATT CLI: NLP-helper for OCR-ed pages in PAGE XML format. To get help for a
+  Blatt CLI: NLP-helper for OCR-ed pages in PAGE XML format. To get help for a
   particular COMMAND, use `blatt COMMAND -h`.
 
 Options:
-  --help  Show this message and exit.
+  -h, --help  Show this message and exit.
 
 Commands:
-  convert  Converts PAGE XML files to plain text TXT files
+  to_tsv  Converts PAGE XML files to TSV files with TextLines or sentences
+  to_txt  Converts PAGE XML files to TXT files with or without line breaks &
+          hyphens
 ```
 
 ```
-% blatt convert -h
-Usage: blatt convert [OPTIONS] PAGE_FOLDER TEXT_FOLDER
+% blatt to_txt -h
+Usage: blatt to_txt [OPTIONS] PAGE_FOLDER TEXT_FOLDER
 
-  blatt convert: converts all PAGE XML files in PAGE_FOLDER to TXT files in
-  TEXT_FOLDER.
+  blatt to_txt: converts all PAGE XML files in PAGE_FOLDER to TXT files
+  with/without hyphens in TEXT_FOLDER.
 
 Options:
   -lb, --linebreak BOOLEAN  If linebreak==False, it removes hyphens at the end
@@ -71,3 +113,20 @@ Options:
                             [default: False]
   -h, --help                Show this message and exit.
 ```
+
+```
+% blatt to_tsv -h
+Usage: blatt to_tsv [OPTIONS] PAGE_FOLDER TSV_FOLDER
+
+  blatt to_tsv: converts all PAGE XML files in PAGE_FOLDER to TSV files in
+  TSV_FOLDER.
+
+Options:
+  -s, --sentence BOOLEAN  If sentence==False, it saves TextLines,
+                          TextRegionID, TextLineID and Coordinates to TSV.
+                          Otherwise, it saves sentences (not lines!) into
+                          separate lines of TSV. The sentences are split from
+                          the plain text without hyphens using the SegTok
+                          library.  [default: False]
+  -h, --help              Show this message and exit.
+```
diff --git a/setup.py b/setup.py
@@ -5,14 +5,14 @@
 
 setup(
     name="blatt",
-    version='0.1.0',
+    version='0.1.5',
     author="Renat Shigapov",
     license="MIT",
     description="NLP-helper for OCR-ed pages in PAGE XML format.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/UB-Mannheim/blatt",
-    install_requires=['lxml', 'tqdm', 'click'],
+    install_requires=['lxml', 'tqdm', 'click', 'segtok'],
     packages=find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",