diff --git a/.gitignore b/.gitignore
index bb8e0a4..b05a1ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ dist/
 *venv*/
 
 # test
-.tox/
\ No newline at end of file
+.tox/
+
+.idea/
\ No newline at end of file
diff --git a/src/protego.py b/src/protego.py
index 6f89123..0d330f0 100644
--- a/src/protego.py
+++ b/src/protego.py
@@ -174,6 +174,8 @@ def _quote_path(self, path):
     def _quote_pattern(self, pattern):
         if pattern.startswith("https://") or pattern.startswith("http://"):
             pattern = "/" + pattern
+        if pattern.startswith("//"):
+            pattern = "//" + pattern
 
         # Corner case for query only (e.g. '/abc?') and param only (e.g. '/abc;') URLs.
         # Save the last character otherwise, urlparse will kill it.
diff --git a/tests/test_protego.py b/tests/test_protego.py
index 9f6520e..67337a8 100644
--- a/tests/test_protego.py
+++ b/tests/test_protego.py
@@ -1122,6 +1122,16 @@ def test_bytestrings(self):
 
         self.assertEqual("Protego.parse expects str, got bytes", str(context.exception))
 
+    def test_leading_double_slash_in_pattern(self):
+        content = "User-Agent: *\nDisallow: //folder/*\n"
+        rp = Protego.parse(content)
+        self.assertTrue(rp.can_fetch("http://example.com/", "FooBot"))
+        self.assertTrue(rp.can_fetch("http://example.com/folder", "FooBot"))
+        self.assertTrue(rp.can_fetch("http://example.com/folder/", "FooBot"))
+        self.assertTrue(rp.can_fetch("http://example.com/folder/page", "FooBot"))
+        self.assertTrue(rp.can_fetch("http://example.com//folder", "FooBot"))
+        self.assertFalse(rp.can_fetch("http://example.com//folder/page", "FooBot"))
+
     def test_visit_time(self):
         """Some website specified allow time for crawling in UTC"""
         content = "User-Agent: *\nVisit-time: 0200 0630\nUser-Agent: NoTime"