Skip to content

Commit

Permalink
all-examples: files for handling pagified lines split with '/'
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlup committed Feb 20, 2024
1 parent 3fd4a68 commit 39c0109
Show file tree
Hide file tree
Showing 5 changed files with 4,110 additions and 0 deletions.
5 changes: 5 additions & 0 deletions all-examples/split-sentence_handling/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
`extract_split_sentences.py` extracts all lines split with `/` from `pagified.html`.
The output is contained in `splits_unsplit.html`.

`cursory_split.py` performs an initial parse of `splits_unsplit.html` to extract the contained sentences in each line.
The output, along with some manual additions, is contained in `splits.html`.
21 changes: 21 additions & 0 deletions all-examples/split-sentence_handling/cursory_split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import re

# Matches a "/"-separated pair of <u>/<small-caps> spans.
#   group 1 - the first tagged span (before the "/")
#   group 4 - the separator: closing tag, optional <sub>i</sub>, "/",
#             optional "*", and the re-opening tag
#   group 8 - the second tagged span (after the "/"), including an
#             optional trailing <sub>i</sub>
RE_SPLIT = re.compile(r'(<(u|small-caps)>[A-Za-z _\.]+ ?</(u|small-caps)>)( ?</(em|small-caps|u)>(<sub>i</sub>)? ?/ ?\*?<(em|small-caps|u)> ?)(<(u|small-caps)> ?[A-Za-z _\.]+</(u|small-caps)>(<sub>i</sub>)?)')


def split_alternatives(line):
    """Return the two variants of *line* produced by resolving its "/" split.

    Every occurrence of ``RE_SPLIT`` in *line* is replaced by its first
    tagged span (group 1) to build the first variant, and by its second
    tagged span (group 8) to build the second variant. The rest of the line
    is left untouched.

    Args:
        line: One line of markup, typically still carrying its newline.

    Returns:
        A ``(first, second)`` tuple of strings, or ``None`` when *line*
        contains no match for ``RE_SPLIT``.
    """
    if RE_SPLIT.search(line) is None:
        return None
    return RE_SPLIT.sub(r'\1', line), RE_SPLIT.sub(r'\8', line)


def main():
    """Print every input line followed by its two split variants, if any.

    For each matching line the original line and both variants are echoed
    as they are found; afterwards the whole collected document (original
    lines with their variants inserted right after them) is printed.
    """
    # NOTE(review): the accompanying README names this input
    # 'splits_unsplit.html' - confirm which file name is current.
    # The context manager closes the handle as soon as the lines are read.
    with open('split_sentences_unsplit.html', 'r', encoding="utf-8") as source:
        lines = source.readlines()

    collected = []
    for line in lines:
        collected.append(line)
        variants = split_alternatives(line)
        if variants is None:
            continue
        first, second = variants
        # Progress echo; lines keep their own newline, so print() leaves a
        # blank line after each of these.
        print(line)
        print(first)
        print(second)
        collected.extend(variants)

    # Lines already end with their newline, hence the empty separator.
    print(''.join(collected))


if __name__ == '__main__':
    main()
14 changes: 14 additions & 0 deletions all-examples/split-sentence_handling/extract_split_sentences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import re

# A "/" used as a split marker: preceded by ">" or a space and followed by
# "*", a space or "<". A closing tag such as "</em>" does not match,
# because there the "/" is preceded by "<".
RE_SPLIT = re.compile(r'(>| )/(\*| |<)')


def find_split_lines(lines):
    """Return the lines that contain a "/" split marker, in input order.

    Args:
        lines: Iterable of strings to scan; each is returned unmodified.

    Returns:
        A list of the lines in which ``RE_SPLIT`` matches at least once.
    """
    return [line for line in lines if RE_SPLIT.search(line) is not None]


def main():
    """Print every line of ``../pagified.html`` that contains a split."""
    # The context manager closes the handle as soon as the lines are read.
    with open('../pagified.html', 'r', encoding="utf-8") as source:
        pagified = source.readlines()

    splits = find_split_lines(pagified)

    # Lines still carry their own newline, so joining with '\n' leaves a
    # blank line between entries. Kept as-is so the output format does not
    # change for anything consuming it.
    print('\n'.join(splits))


if __name__ == '__main__':
    main()
Loading

0 comments on commit 39c0109

Please sign in to comment.