diff --git a/news/5838.bugfix b/news/5838.bugfix
new file mode 100644
index 00000000000..b83a9fa91a9
--- /dev/null
+++ b/news/5838.bugfix
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
diff --git a/src/pip/_internal/index.py b/src/pip/_internal/index.py
index 866318262dc..012e87a8205 100644
--- a/src/pip/_internal/index.py
+++ b/src/pip/_internal/index.py
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()
 
     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()
 
-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check that the Content-Type is HTML, to avoid downloading a large
+       file. Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network
+       failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly return cached data for
+            # /simple/, because authors generally expect that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all; we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the URL ends with
+    # something that looks like an archive. However, that is not a
+    # requirement of a URL. Unless we issue a HEAD request on every
+    # URL, we cannot know ahead of time for sure if something is HTML
+    # or not. However, we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
 
 
 def _handle_get_page_fail(link, reason, url, meth=None):
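To make the new control flow concrete, here is a minimal standalone sketch of the same HEAD-before-GET technique using plain `requests`. It is illustrative only, not part of the patch: `looks_like_archive`, `fetch_html`, and the suffix tuple are hypothetical stand-ins for `_is_url_like_archive()`, `_get_html_response()`, and `ARCHIVE_EXTENSIONS`.

```python
# Hedged sketch of the HEAD-before-GET technique; not the patch itself.
import requests

ARCHIVE_SUFFIXES = (".zip", ".tar.gz", ".whl")  # assumed subset of ARCHIVE_EXTENSIONS


def looks_like_archive(url):
    # str.endswith accepts a tuple of suffixes.
    return url.split("#", 1)[0].endswith(ARCHIVE_SUFFIXES)


def fetch_html(url):
    if looks_like_archive(url):
        # A HEAD response carries headers but no body, so a multi-megabyte
        # sdist can be rejected without downloading it.
        head = requests.head(url, allow_redirects=True)
        head.raise_for_status()
        ctype = head.headers.get("Content-Type", "")
        if not ctype.lower().startswith("text/html"):
            raise ValueError("expected text/html, got %r" % ctype)
    resp = requests.get(url, headers={"Accept": "text/html",
                                      "Cache-Control": "max-age=0"})
    resp.raise_for_status()
    return resp
```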
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
             "_get_html_page() missing 1 required keyword argument: 'session'"
         )
 
-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]
 
     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None
 
-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
 
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
 
 
 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)
 
-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue
@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):
 
         return InstallationCandidate(search.supplied, version, link)
 
-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-
 
 def egg_info_matches(
         egg_info, search_name, link,
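One detail in the rewritten `_get_html_page()` is worth spelling out: the trailing slash added before `urljoin()` is load-bearing, because `urljoin` drops the final path segment of a base URL that does not end in a slash. A quick interpreter check, using the same vendored import as `index.py` (the example values are illustrative):

```python
from pip._vendor.six.moves.urllib import parse as urllib_parse

# Without the trailing slash, urljoin trims the final segment:
urllib_parse.urljoin("file:///srv/packages", "index.html")
# -> 'file:///srv/index.html'

# With the trailing slash, the directory is preserved:
urllib_parse.urljoin("file:///srv/packages/", "index.html")
# -> 'file:///srv/packages/index.html'
```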
diff --git a/tests/unit/test_index_html_page.py b/tests/unit/test_index_html_page.py
new file mode 100644
index 00000000000..c872ad06563
--- /dev/null
+++ b/tests/unit/test_index_html_page.py
@@ -0,0 +1,162 @@
+import logging
+
+import mock
+import pytest
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.download import PipSession
+from pip._internal.index import (
+    Link, _get_html_page, _get_html_response, _NotHTML, _NotHTTP,
+)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "ftp://python.org/python-3.7.1.zip",
+        "file:///opt/data/pip-18.0.tar.gz",
+    ],
+)
+def test_get_html_response_archive_to_naive_scheme(url):
+    """
+    `_get_html_response()` should error on an archive-like URL if the scheme
+    does not allow "poking" without getting data.
+    """
+    with pytest.raises(_NotHTTP):
+        _get_html_response(url, session=mock.Mock(PipSession))
+
+
+@pytest.mark.parametrize(
+    "url, content_type",
+    [
+        ("http://python.org/python-3.7.1.zip", "application/zip"),
+        ("https://pypi.org/pip-18.0.tar.gz", "application/gzip"),
+    ],
+)
+def test_get_html_response_archive_to_http_scheme(url, content_type):
+    """
+    `_get_html_response()` should send a HEAD request on an archive-like URL
+    if the scheme supports it, and raise `_NotHTML` if the response isn't
+    HTML.
+    """
+    session = mock.Mock(PipSession)
+    session.head.return_value = mock.Mock(**{
+        "request.method": "HEAD",
+        "headers": {"Content-Type": content_type},
+    })
+
+    with pytest.raises(_NotHTML) as ctx:
+        _get_html_response(url, session=session)
+
+    session.assert_has_calls([
+        mock.call.head(url, allow_redirects=True),
+    ])
+    assert ctx.value.args == (content_type, "HEAD")
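The tests above configure nested response attributes in a single call. In `mock`, a keyword whose name contains dots configures attributes of child mocks, which is how `session.head.return_value` ends up with a usable `request.method`. A minimal demonstration of that mechanism (the Content-Type value here is just an example):

```python
import mock

resp = mock.Mock(**{
    "request.method": "HEAD",                        # sets resp.request.method
    "headers": {"Content-Type": "application/zip"},  # plain attribute
})
assert resp.request.method == "HEAD"
assert resp.headers["Content-Type"] == "application/zip"
```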
+ """ + session = mock.Mock(PipSession) + session.head.return_value = mock.Mock(**{ + "request.method": "HEAD", + "headers": {"Content-Type": content_type}, + }) + + with pytest.raises(_NotHTML) as ctx: + _get_html_response(url, session=session) + + session.assert_has_calls([ + mock.call.head(url, allow_redirects=True), + ]) + assert ctx.value.args == (content_type, "HEAD") + + +@pytest.mark.parametrize( + "url", + [ + "http://python.org/python-3.7.1.zip", + "https://pypi.org/pip-18.0.tar.gz", + ], +) +def test_get_html_response_archive_to_http_scheme_is_html(url): + """ + `_get_html_response()` should work with archive-like URLs if the HEAD + request is responded with text/html. + """ + session = mock.Mock(PipSession) + session.head.return_value = mock.Mock(**{ + "request.method": "HEAD", + "headers": {"Content-Type": "text/html"}, + }) + session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) + + resp = _get_html_response(url, session=session) + + assert resp is not None + assert session.mock_calls == [ + mock.call.head(url, allow_redirects=True), + mock.call.head().raise_for_status(), + mock.call.get(url, headers={ + "Accept": "text/html", "Cache-Control": "max-age=0", + }), + mock.call.get().raise_for_status(), + ] + + +@pytest.mark.parametrize( + "url", + [ + "https://pypi.org/simple/pip", + "https://pypi.org/simple/pip/", + "https://python.org/sitemap.xml", + ], +) +def test_get_html_response_no_head(url): + """ + `_get_html_response()` shouldn't send a HEAD request if the URL does not + look like an archive, only the GET request that retrieves data. + """ + session = mock.Mock(PipSession) + + # Mock the headers dict to ensure it is accessed. + session.get.return_value = mock.Mock(headers=mock.Mock(**{ + "get.return_value": "text/html", + })) + + resp = _get_html_response(url, session=session) + + assert resp is not None + assert session.head.call_count == 0 + assert session.get.mock_calls == [ + mock.call(url, headers={ + "Accept": "text/html", "Cache-Control": "max-age=0", + }), + mock.call().raise_for_status(), + mock.call().headers.get("Content-Type", ""), + ] + + +@pytest.mark.parametrize( + "url, vcs_scheme", + [ + ("svn+http://pypi.org/something", "svn"), + ("git+https://github.com/pypa/pip.git", "git"), + ], +) +def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme): + """`_get_html_page()` should error if an invalid scheme is given. + + Only file:, http:, https:, and ftp: are allowed. + """ + with caplog.at_level(logging.DEBUG): + page = _get_html_page(Link(url), session=mock.Mock(PipSession)) + + assert page is None + assert caplog.record_tuples == [ + ( + "pip._internal.index", + logging.DEBUG, + "Cannot look at {} URL {}".format(vcs_scheme, url), + ), + ] + + +def test_get_html_page_directory_append_index(tmpdir): + """`_get_html_page()` should append "index.html" to a directory URL. + """ + dirpath = tmpdir.mkdir("something") + dir_url = "file:///{}".format( + urllib_request.pathname2url(dirpath).lstrip("/"), + ) + + session = mock.Mock(PipSession) + with mock.patch("pip._internal.index._get_html_response") as mock_func: + _get_html_page(Link(dir_url), session=session) + assert mock_func.mock_calls == [ + mock.call( + "{}/index.html".format(dir_url.rstrip("/")), + session=session, + ), + ]