Skip to content

Commit

Permalink
Merge pull request #234 from dwvisser/smoke-test-issue-130
Browse files Browse the repository at this point in the history
pysolr.extract() handles spaces and special characters in filenames (with added passing test cases)
  • Loading branch information
acdha authored Jul 9, 2018
2 parents 6edce48 + 15b239f commit fcbf73e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 4 deletions.
15 changes: 11 additions & 4 deletions pysolr.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@
# Python 2.X
from urllib import urlencode

try:
# Python 3.X
from urllib.parse import quote
except ImportError:
# Python 2.X
from urllib import quote

try:
# Python 3.X
import html.entities as htmlentities
Expand Down Expand Up @@ -1027,13 +1034,13 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
"wt": "json",
}
params.update(kwargs)

filename = quote(file_obj.name.encode('utf-8'))
try:
# We'll provide the file using its true name as Tika may use that
# as a file type hint:
resp = self._send_request('post', handler,
body=params,
files={'file': (file_obj.name, file_obj)})
files={'file': (filename, file_obj)})
except (IOError, SolrError) as err:
self.log.error("Failed to extract document metadata: %s", err,
exc_info=True)
Expand All @@ -1046,10 +1053,10 @@ def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs
exc_info=True)
raise

data['contents'] = data.pop(file_obj.name, None)
data['contents'] = data.pop(filename, None)
data['metadata'] = metadata = {}

raw_metadata = data.pop("%s_metadata" % file_obj.name, None)
raw_metadata = data.pop("%s_metadata" % filename, None)

if raw_metadata:
# The raw format is somewhat annoying: it's a flat list of
Expand Down
46 changes: 46 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
except ImportError:
from urllib import unquote_plus

try:
from urllib.parse import quote
except ImportError:
from urllib import quote


class UtilsTestCase(unittest.TestCase):
def test_unescape_html(self):
Expand Down Expand Up @@ -855,6 +860,47 @@ def test_extract(self):
# round-trip:
self.assertEqual(['Test Title ☃☃'], m['title'])

def test_extract_special_char_in_filename(self):
    """extract() must survive a filename containing non-ASCII characters."""
    doc = StringIO("""
<html>
<head>
<meta charset="utf-8">
<meta name="haystack-test" content="test 1234">
<title>Test Title ☃&#x2603;</title>
</head>
<body>foobar</body>
</html>
""")
    doc.name = u"test☃.html"

    extracted = self.solr.extract(doc)

    # Without an explicit handler, 'update/extract' is the default target.
    args, kwargs = self.solr._send_request.call_args
    self.assertTrue(args[1].startswith('update/extract'))

    # A caller-supplied handler must be used verbatim, even though the
    # bogus one makes the request fail.
    with self.assertRaises(SolrError):
        self.solr.extract(doc, handler='fakehandler')
    args, kwargs = self.solr._send_request.call_args
    self.assertTrue(args[1].startswith('fakehandler'))

    # The documented response keys must both be present.
    self.assertIn('contents', extracted)
    self.assertIn('metadata', extracted)

    self.assertIn('foobar', extracted['contents'])

    metadata = extracted['metadata']

    # Solr echoes back the (percent-quoted) stream name it was given.
    self.assertEqual([quote(doc.name.encode('utf-8'))], metadata['stream_name'])

    self.assertIn('haystack-test', metadata, "HTML metadata should have been extracted!")
    self.assertEqual(['test 1234'], metadata['haystack-test'])

    # Note the underhanded use of a double snowman to verify both that Tika
    # correctly decoded entities and that our UTF-8 characters survived the
    # round-trip:
    self.assertEqual(['Test Title ☃☃'], metadata['title'])

def test_full_url(self):
self.solr.url = 'http://localhost:8983/solr/core0'
full_url = self.solr._create_full_url(path='/update')
Expand Down

0 comments on commit fcbf73e

Please sign in to comment.