Skip to content

Commit

Permalink
Merge pull request #973 from PrimozGodec/replace-lxml
Browse files Browse the repository at this point in the history
[FIX] import_documents - replace lxml xml parser with ElementTree
  • Loading branch information
PrimozGodec authored May 11, 2023
2 parents c18873c + 5becc50 commit 58c7568
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 27 deletions.
43 changes: 17 additions & 26 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,32 @@
import os
import pathlib
import re

import httpx
import yaml

from conllu import parse_incr
from requests.exceptions import ConnectionError
import xml.etree.ElementTree as ET
from collections import namedtuple
from tempfile import NamedTemporaryFile
from types import SimpleNamespace as namespace
from typing import List, Tuple, Callable, Optional
from typing import Callable, List, Optional, Tuple
from unicodedata import normalize

import docx2txt
import httpx
import numpy as np
import pandas as pd

import docx2txt
from odf.opendocument import load
from odf import text, teletype

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from bs4 import BeautifulSoup

import serverfiles

from Orange.data import DiscreteVariable, Domain, StringVariable, \
guess_data_type
import yaml
from conllu import parse_incr
from odf import teletype, text
from odf.opendocument import load
from Orange.data import DiscreteVariable, Domain, StringVariable, guess_data_type
from Orange.data.io import detect_encoding, sanitize_variable
from Orange.data.util import get_unique_names
from Orange.util import Registry, dummy_callback
from Orange.misc.utils.embedder_utils import get_proxies
from Orange.util import Registry, dummy_callback
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfparser import PDFDocument, PDFParser
from requests.exceptions import ConnectionError

from orangecontrib.text.corpus import Corpus

Expand Down Expand Up @@ -165,10 +158,8 @@ class XmlReader(Reader):
ext = [".xml"]

def read_file(self):
    """Parse the XML file at ``self.path`` and store its text in ``self.content``.

    The document is parsed with the standard-library ElementTree parser.
    Every text fragment yielded by ``itertext()`` on the root element is
    stripped of surrounding whitespace, whitespace-only fragments are
    dropped, and the remaining fragments are joined with newlines in
    document order. Attribute values are not part of the result.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed
            XML; the error is not caught here.
    """
    root = ET.parse(self.path).getroot()
    # Strip each fragment once, then keep only the non-empty ones.
    fragments = (chunk.strip() for chunk in root.itertext())
    self.content = "\n".join(fragment for fragment in fragments if fragment)


class CsvMetaReader(Reader):
Expand Down
42 changes: 42 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import unittest
from os.path import join, splitext
from tempfile import NamedTemporaryFile
from unittest.mock import patch, MagicMock, call

import numpy as np
Expand All @@ -14,6 +15,7 @@
UrlProxyReader,
TxtReader,
TextData,
XmlReader,
)


Expand Down Expand Up @@ -254,5 +256,45 @@ def test_url_errors(self, _, __):
self.assertGreater(len(errors), 0)


# Sample XML document used by TestXMLReader. It has an attribute on the root,
# text directly inside the root ("The Tree"), nested child and grandchild
# elements, and trailing text after the last child element ("After").
# test_file expects only the element text back, one stripped fragment per line.
XML_EXAMPLE = """<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<root testAttr="testValue">
The Tree
<children>
<child name="Jack">First</child>
<child name="Rose">Second</child>
<child name="Blue Ivy">
Third
<grandchildren>
<data>One</data>
<data>Two</data>
<unique>Twins</unique>
</grandchildren>
</child>
<child name="Jane">Fourth</child>
</children>
After
</root>"""


class TestXMLReader(unittest.TestCase):
    """Tests for XmlReader on a well-formed XML file and on a non-XML file."""

    def _write_temp_file(self, content):
        """Write ``content`` to a named temporary file and return its path.

        The file is closed before the path is returned, so the data is
        flushed to disk before a reader opens it. Removal is registered with
        ``addCleanup`` so the file is deleted even when an assertion fails.
        """
        with NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as fp:
            fp.write(content)
        self.addCleanup(os.remove, fp.name)
        return fp.name

    def test_file(self):
        """Element text is returned in document order, one fragment per line."""
        exp = "The Tree\nFirst\nSecond\nThird\nOne\nTwo\nTwins\nFourth\nAfter"
        path = self._write_temp_file(XML_EXAMPLE)
        res = XmlReader(path).read()[0]
        self.assertEqual(exp, res.content)

    def test_error(self):
        """A non-XML file yields None first and the file's base name second."""
        path = self._write_temp_file("Test")
        res = XmlReader(path).read()
        self.assertIsNone(res[0])
        self.assertEqual(os.path.basename(path), res[1])


if __name__ == "__main__":
unittest.main()
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ gensim>=4.3.0,!=4.3.1 # gensim 4.3.1 is built on numpy 1.24, causing error on o
httpx!=0.23.1 # temporary fix - semantic search fails (but only in tests)
langdetect
lemmagen3
lxml
nltk>=3.0.5 # TweetTokenizer introduced in 3.0.5
numpy
odfpy>=1.3.5
Expand Down

0 comments on commit 58c7568

Please sign in to comment.