-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
27 lines (24 loc) · 1.24 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from boilerpipe.extract import Extractor
import argparse
import re
# Command-line interface: one input file of URLs and one output directory
# per output format (HTML and plain text).
parser = argparse.ArgumentParser(description='Text content extractor')
# --urls is opened as a file further down, so the help describes it as a path.
parser.add_argument('--urls', type=str, required=True,
                    help='Path to a file containing a new-line separated '
                         'list of URLs to extract content from')
# Each help text names the format that is actually written to that directory:
# .html files go to --outputdirhtml, .txt files go to --outputdirtxt.
parser.add_argument('--outputdirhtml', type=str, required=True,
                    help='The HTML output dir to store the content from each URL')
parser.add_argument('--outputdirtxt', type=str, required=True,
                    help='The text output dir to store the content from each URL')
# All three options are required; argparse exits with a usage message if any
# of them is missing.
args = parser.parse_args()
# Characters outside this safe set are replaced with '_' so that a URL can be
# used as a file name. Raw string: '\.' and '\-' are regex escapes, not
# string escapes. Compiled once because it is reused for every URL.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^a-zA-Z0-9_\.\-]')

# Process the URL list one line at a time; every URL produces one .txt file
# in args.outputdirtxt and one .html file in args.outputdirhtml.
with open(args.urls, 'r') as url_file:
    for url in url_file:
        url = url.strip()
        if not url:
            # Blank lines (e.g. a trailing newline at the end of the list)
            # are not URLs; skipping them avoids an extraction attempt on ''
            # and output files named just '.txt' / '.html'.
            continue

        # Derive both output file names from the sanitized URL.
        url_base_filename = _UNSAFE_FILENAME_CHARS.sub('_', url)
        url_txt_filename = url_base_filename + ".txt"
        url_html_filename = url_base_filename + ".html"

        # NOTE(review): presumably the Extractor downloads the page at `url`
        # when constructed -- confirm against the boilerpipe wrapper docs.
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        url_txt_contents = extractor.getText()
        url_html_contents = extractor.getHTML()

        # NOTE(review): files are opened with the platform default encoding;
        # confirm that this is acceptable for non-ASCII page content.
        with open(args.outputdirtxt + '/' + url_txt_filename, 'w') as txt_content_file:
            txt_content_file.write(url_txt_contents)
        with open(args.outputdirhtml + '/' + url_html_filename, 'w') as html_content_file:
            html_content_file.write(url_html_contents)