-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.py
135 lines (104 loc) · 4.54 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from collections import namedtuple
import os
import sys

from bs4 import BeautifulSoup
import lxml
import lxml.etree
import readability
import whoosh.fields
import whoosh.index
import whoosh.qparser
import whoosh.writing

from paths import *
import path_utils
# Lightweight container for the pieces extracted from one HTML page.
ParsedDocument = namedtuple("ParsedDocument", ["title", "content"])


def parse_html_string(html_string):
    """Extract the title and readable body text from a raw HTML string.

    Uses readability to isolate the main article content, then strips the
    remaining markup with BeautifulSoup to get plain text.
    """
    doc = readability.Document(html_string)
    # TODO(ajayjain): use document.short_title()?
    summary_html = doc.summary(html_partial=True)
    plain_text = BeautifulSoup(summary_html, 'lxml').get_text().strip()
    return ParsedDocument(title=doc.title(), content=plain_text)
def make_preview(text, max_length=250):
    """Return a one-line excerpt of *text*, at most *max_length* chars.

    Newlines are collapsed to spaces; text longer than the limit is cut and
    suffixed with "...".
    """
    flattened = text.replace("\n", " ").strip()
    if len(flattened) <= max_length:
        return flattened
    return flattened[:max_length - 3].strip() + "..."
class Index(object):
    """Full-text search index over downloaded HTML pages, backed by Whoosh.

    The underlying ``whoosh.index`` object can be shared between threads;
    writes go through ``whoosh.writing.AsyncWriter``.
    """

    def __init__(self):
        # Schema for index creation: title/body_text are searchable text,
        # url is a stored unique ID so re-indexing a page replaces the old
        # document rather than duplicating it.
        schema = whoosh.fields.Schema(title=whoosh.fields.TEXT(stored=True),
                                      url=whoosh.fields.ID(stored=True, unique=True),
                                      body_text=whoosh.fields.TEXT(stored=True))

        # Create index and index object. self.index can be shared between threads.
        if not os.path.exists(INDEX_DIR):
            print("Creating search index at {}".format(INDEX_DIR))
            os.mkdir(INDEX_DIR)
            self.index = whoosh.index.create_in(INDEX_DIR, schema)
        else:
            print("Loading search index at {}".format(INDEX_DIR))
            self.index = whoosh.index.open_dir(INDEX_DIR)

    def index_html(self, remote_url, local_path):
        """Parse a downloaded HTML file and add it to the index.

        remote_url: original URL of the page (unique key in the index).
        local_path: path relative to WGET_DOWNLOADS of the saved HTML file.
        """
        # TODO(ajayjain): Switch to boilerpipe / a python wrapper
        # TODO(ajayjain): Deduplicate with Luis's code

        # Load HTML file. NOTE(review): no explicit encoding is passed, so the
        # platform default applies; files that fail to decode are skipped via
        # the handler below — confirm this matches the downloader's encoding.
        content = ""
        with open(os.path.join(WGET_DOWNLOADS, local_path), 'r') as html_file:
            try:
                content = html_file.read()
            except UnicodeDecodeError as e:
                print('UnicodeDecodeError in Index, reading a file', e)
                return

        try:
            parsed = parse_html_string(content)
            # Add to the index
            self.index_parsed(parsed.title, remote_url, parsed.content)
        except lxml.etree.ParserError as err:
            print("ParserError while parsing HTML content:", err)
            print("\tremote url:", remote_url)
            print("\tlocal path:", local_path)

    def index_parsed(self, title, url, body_text):
        """Write one (title, url, body_text) document into the index.

        Documents with empty body_text are logged and skipped.
        """
        preview = make_preview(body_text, max_length=100)
        print("[index index_parsed] Indexing...")
        print("\turl: ", url)
        print("\ttitle:", title)
        print("\tbody: ", preview)

        if body_text:
            # TODO(ajayjain): Bulk write documents to the index
            # Wrapping the AsyncWriter in a with clause seems to cause errors:
            # "whoosh.writing.IndexingError: This writer is closed"
            writer = whoosh.writing.AsyncWriter(self.index)
            # update_document (not add_document): the unique `url` field makes
            # this replace any previously indexed version of the page.
            writer.update_document(
                    title=title,
                    url=url,
                    body_text=body_text)
            writer.commit()
        else:
            print("No content extracted, not indexing\n")

    def search(self, query_string):
        """Search for results in the index by a query string.

        Returns a list of dicts with keys 'title', 'url', 'path', and
        'body_text' (title/body highlighted at query-term matches, with
        fallbacks to the full title / an excerpt).
        """
        # Parse user query string against all three fields
        query_parser = whoosh.qparser.MultifieldParser(["title", "body_text", "url"], self.index.schema)
        query = query_parser.parse(query_string)

        results = []
        # FIX: the original never closed the searcher, leaking index file
        # handles on every query. The context manager closes it; all hit data
        # is materialized into plain dicts before the searcher closes.
        with self.index.searcher() as searcher:
            raw_results = searcher.search(query, terms=True)
            for hit in raw_results:
                result = {}

                # Display highlighted text at query word matches in title
                result['title'] = hit.highlights('title')
                if not result['title']:
                    # No hits on the title, return the full title
                    result['title'] = hit['title']

                result['url'] = hit['url']
                # Remove scheme from URL displayed to user
                result['path'] = path_utils.strip_scheme(hit['url'])

                # Display highlighted text at query word matches in body
                result['body_text'] = hit.highlights("body_text")
                if not result['body_text']:
                    # No hits on the body, return an excerpt
                    result['body_text'] = make_preview(hit['body_text'], 100)

                results.append(result)
        return results