diff --git a/requirements/python b/requirements/python index 1c200697..a9d6f61d 100644 --- a/requirements/python +++ b/requirements/python @@ -5,9 +5,9 @@ argcomplete~=1.10.0 beautifulsoup4~=4.8.0 chardet==3.* docx2txt~=0.8 -extract-msg<=0.29.* #Last with python2 support +extract-msg<=0.29.6 #Last with python2 support pdfminer.six==20191110 #Last with python2 support python-pptx~=0.6.18 -six~=1.12.0 +six~=1.16.0 SpeechRecognition~=3.8.1 xlrd~=1.2.0 diff --git a/textract/parsers/html_parser.py b/textract/parsers/html_parser.py index b3dc894e..3fe3ec40 100644 --- a/textract/parsers/html_parser.py +++ b/textract/parsers/html_parser.py @@ -6,10 +6,22 @@ from .utils import BaseParser +HTML_TAG_RE = re.compile(r'(<[^>]+>)') +HTML_SPACE_SQUASH_RE = re.compile(r'\s+') +HTML_SPACE_RE = re.compile(r'\s') + + class Parser(BaseParser): """Extract text from html file using beautifulsoup4. Filter text to only show the visible parts of the page. Insipration from `here `_. + By default it preserves spaces and tries to render tables with the ASCII + symbols '|' and '-'. This may be undesirable if, for example, you want to + feed the extracted text into a full-text search engine. + To collapse runs of whitespace into a single space, pass the option + `squash_html_spaces=True` to the `textract.process` function. + To skip table rendering (and just extract the text), pass the argument + `strip_html_tables=True` to `textract.process`. """ _disallowed_names = [ @@ -41,7 +53,7 @@ def _inline(self, element): return True return False - def _find_any_text(self, tag): + def _find_any_text(self, tag, squash_spaces=False): """Looks for any possible text within given tag.
""" text = '' @@ -49,10 +61,15 @@ def _find_any_text(self, tag): text = six.text_type(tag) text = re.sub(r'(<[^>]+>)', '', text) text = re.sub(r'\s', ' ', text) + text = re.sub(HTML_TAG_RE, '', text) + if squash_spaces: + text = re.sub(HTML_SPACE_SQUASH_RE, ' ', text) + else: + text = re.sub(HTML_SPACE_RE, ' ', text) text = text.strip() return text - def _parse_tables(self, soup): + def _parse_tables(self, soup, squash_spaces): """Returns array containing basic informations about tables for ASCII replacement (look: _replace_tables()). """ @@ -66,7 +83,9 @@ def _parse_tables(self, soup): tds = tr.find_all('th') + tr.find_all('td') if len(tds) > 0: for i, td in enumerate(tds): - td_text = self._find_any_text(td) + td_text = self._find_any_text( + td, squash_spaces=squash_spaces + ) length = len(td_text) if i in t_dict['col_width']: t_dict['col_width'][i] = max( @@ -85,10 +104,21 @@ def _parse_tables(self, soup): tables.append(t_dict) return tables - def _replace_tables(self, soup, v_separator=' | ', h_separator='-'): + def _strip_tables(self, soup, squash_spaces=False): + tables = self._parse_tables(soup, squash_spaces) + for t in tables: + html = '' + for tr in t['trs']: + html += u'{0}\n'.format(u' '.join(td['text'] for td in tr)) + new_table = soup.new_tag('div') + new_table.string = html + t['table'].replace_with(new_table) + return soup + + def _replace_tables(self, soup, squash_spaces=False, v_separator=' | ', h_separator='-'): """Replaces elements with its ASCII equivalent. 
""" - tables = self._parse_tables(soup) + tables = self._parse_tables(soup, squash_spaces) v_sep_len = len(v_separator) v_left_sep = v_separator.lstrip() for t in tables: @@ -124,12 +154,21 @@ def _join_inlines(self, soup): elem.unwrap() return soup - def extract(self, filename, **kwargs): + def extract( + self, + filename, + strip_html_tables=False, + squash_html_spaces=False, + **kwargs + ): with open(filename, "rb") as stream: soup = BeautifulSoup(stream, 'lxml') # Convert tables to ASCII ones - soup = self._replace_tables(soup) + if strip_html_tables: + soup = self._strip_tables(soup, squash_spaces=squash_html_spaces) + else: + soup = self._replace_tables(soup, squash_spaces=squash_html_spaces) # Join inline elements soup = self._join_inlines(soup) @@ -141,7 +180,9 @@ def extract(self, filename, **kwargs): for elem in elements: string = elem.string if string is None: - string = self._find_any_text(elem) + string = self._find_any_text( + elem, squash_spaces=squash_html_spaces + ) string = string.strip() if len(string) > 0: html += "\n" + string + "\n"