Skip to content

Commit

Permalink
walk.py: xml outlining for debugging.
Browse files Browse the repository at this point in the history
  • Loading branch information
jessicah committed Aug 21, 2024
1 parent 01335d3 commit 66e716f
Showing 1 changed file with 41 additions and 0 deletions.
41 changes: 41 additions & 0 deletions walk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3

from bs4 import BeautifulSoup
from bs4.builder import XMLParsedAsHTMLWarning
import warnings
import re
import sys

# Ignore bs4's XMLParsedAsHTMLWarning: main() deliberately parses its input
# with the 'lxml' HTML parser, so the warning would only add noise.
warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)

def text(node):
    """Return all text fragments of *node* concatenated, without outer whitespace.

    The fragments are taken from ``node.strings`` in iteration order and
    joined with no separator; only leading/trailing whitespace of the
    combined result is stripped.
    """
    combined = ''.join(node.strings)
    return combined.strip()

def walk(node, depth, max_depth=8):
    """Print an indented outline of *node* and its descendants.

    One line is printed per element: three spaces of indentation per
    level, then the element name, its list of classes and, for elements
    whose classes include ``titlepage``, the element's stripped text.

    Args:
        node: Element to outline. Anything without a ``children``
            attribute (including ``None``) is skipped silently.
        depth: Nesting level of *node*; drives indentation and the cut-off.
        max_depth: Elements at this depth or deeper are not printed.
            Defaults to 8, the limit that used to be hard-coded.
    """
    # Only nodes that can contain other nodes are part of the outline.
    if not hasattr(node, 'children'):
        return
    # ``>=`` rather than ``==`` so a walk started below the limit is still
    # cut off instead of ignoring the limit entirely.
    if depth >= max_depth:
        return

    classes = node.get('class', [])
    title = text(node) if 'titlepage' in classes else ''

    print(' ' * (depth * 3), node.name, classes, title)
    for child in node.children:
        walk(child, depth + 1, max_depth)

def main():
    """Outline the document named by the first command-line argument.

    Parses the file with BeautifulSoup's ``lxml`` parser and walks the
    first element matching ``body > div.section``, printing its outline
    to stdout. Exits with a usage message when no file argument is given,
    and reports on stderr when the document has no matching element.
    """
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} FILE')

    path = sys.argv[1]
    # Binary mode: give BeautifulSoup the raw bytes so it can detect the
    # document's encoding itself rather than relying on the locale default.
    with open(path, 'rb') as file:
        soup = BeautifulSoup(file, 'lxml')

    print(f'Processing {path}...')

    root = soup.select_one('body > div.section')
    if root is None:
        # Without this, walk() would return immediately and print nothing.
        print(f'{path}: no "body > div.section" element found', file=sys.stderr)
        return

    walk(root, 0)

# Script entry point: run the outline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()

0 comments on commit 66e716f

Please sign in to comment.