From 88bded2febbe5b55a07c51f7cee681d095b6538a Mon Sep 17 00:00:00 2001 From: Ville Vaten Date: Tue, 2 Jan 2018 11:29:15 +0200 Subject: [PATCH] added netloc support for hdfs URIs; removed the special handling of HDFS URIs which was against the URI specification --- smart_open/smart_open_lib.py | 9 ++++++--- smart_open/tests/test_smart_open.py | 28 ++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 52f5c97c..310dc1d3 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -346,7 +346,8 @@ class ParseUri(object): * s3://my_key:my_secret@my_bucket/my_key * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key * hdfs:///path/file - * hdfs://path/file + * hdfs://host/path/file + * hdfs://host:port/path/file * webhdfs://host:port/path/file * ./local/path/file * ~/local/path/file @@ -355,6 +356,9 @@ class ParseUri(object): * file:///home/user/file * file:///home/user/file.bz2 + NOTE: hdfs://path/file does no longer work as it is against the URI + specification. 'path' here is nowadays interpreted as the host of the URI + """ def __init__(self, uri, default_scheme="file"): """ @@ -370,8 +374,7 @@ def __init__(self, uri, default_scheme="file"): self.scheme = parsed_uri.scheme if parsed_uri.scheme else default_scheme if self.scheme == "hdfs": - self.uri_path = parsed_uri.netloc + parsed_uri.path - self.uri_path = "/" + self.uri_path.lstrip("/") + self.uri_path = uri if not self.uri_path: raise RuntimeError("invalid HDFS URI: %s" % uri) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index e37a6546..7a0330ce 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -325,12 +325,17 @@ def test_hdfs(self, mock_subprocess): smart_open_object = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs:///tmp/test.txt")) smart_open_object.__iter__() # called with the correct params? - mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "/tmp/test.txt"], stdout=mock_subprocess.PIPE) + mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "hdfs:///tmp/test.txt"], stdout=mock_subprocess.PIPE) - # second possibility of schema - smart_open_object = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs://tmp/test.txt")) + # network location host in HDFS schema + smart_open_object = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs://host/tmp/test.txt")) smart_open_object.__iter__() - mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "/tmp/test.txt"], stdout=mock_subprocess.PIPE) + mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "hdfs://host/tmp/test.txt"], stdout=mock_subprocess.PIPE) + + # network location host and port in HDFS schema + smart_open_object = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs://host:port/tmp/test.txt")) + smart_open_object.__iter__() + mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "hdfs://host:port/tmp/test.txt"], stdout=mock_subprocess.PIPE) @mock.patch('smart_open.smart_open_lib.subprocess') def test_hdfs_encoding(self, mock_subprocess): @@ -530,14 +535,21 @@ def test_hdfs(self, mock_subprocess): smart_open_object.write("test") # called with the correct params? mock_subprocess.Popen.assert_called_with( - ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE + ["hdfs", "dfs", "-put", "-f", "-", "hdfs:///tmp/test.txt"], stdin=mock_subprocess.PIPE + ) + + # network location host in HDFS schema + smart_open_object = smart_open.HdfsOpenWrite(smart_open.ParseUri("hdfs://host/tmp/test.txt")) + smart_open_object.write("test") + mock_subprocess.Popen.assert_called_with( + ["hdfs", "dfs", "-put", "-f", "-", "hdfs://host/tmp/test.txt"], stdin=mock_subprocess.PIPE ) - # second possibility of schema - smart_open_object = smart_open.HdfsOpenWrite(smart_open.ParseUri("hdfs://tmp/test.txt")) + # network location host and port in HDFS schema + smart_open_object = smart_open.HdfsOpenWrite(smart_open.ParseUri("hdfs://host:port/tmp/test.txt")) smart_open_object.write("test") mock_subprocess.Popen.assert_called_with( - ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE + ["hdfs", "dfs", "-put", "-f", "-", "hdfs://host:port/tmp/test.txt"], stdin=mock_subprocess.PIPE ) @unittest.skip('Not sure how to implement unsecured mode with boto3')