diff --git a/news/5838.bugfix b/news/5838.bugfix
new file mode 100644
index 00000000000..b83a9fa91a9
--- /dev/null
+++ b/news/5838.bugfix
@@ -0,0 +1 @@
+Fix content type detection if a directory named like an archive is used as a package source.
diff --git a/src/pip/_internal/index.py b/src/pip/_internal/index.py
index 866318262dc..012e87a8205 100644
--- a/src/pip/_internal/index.py
+++ b/src/pip/_internal/index.py
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()
 
     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()
 
-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check that the Content-Type is HTML, to avoid downloading a large
+       file. Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network
+       failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly return cached data for
+            # /simple/, because authors generally expect that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all; we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the URL ends with
+    # something that looks like an archive. However, that is not a
+    # requirement of a URL. Unless we issue a HEAD request on every
+    # URL, we cannot know ahead of time for sure if something is HTML
+    # or not. However, we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp
 
 
 def _handle_get_page_fail(link, reason, url, meth=None):
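To make the new control flow concrete, here is a minimal standalone sketch of the same HEAD-before-GET technique using plain `requests`. It is illustrative only, not part of the patch: `looks_like_archive`, `fetch_html`, and the suffix tuple are hypothetical stand-ins for `_is_url_like_archive()`, `_get_html_response()`, and `ARCHIVE_EXTENSIONS`.

```python
# Hedged sketch of the HEAD-before-GET technique; not the patch itself.
import requests

ARCHIVE_SUFFIXES = (".zip", ".tar.gz", ".whl")  # assumed subset of ARCHIVE_EXTENSIONS


def looks_like_archive(url):
    # str.endswith accepts a tuple of suffixes.
    return url.split("#", 1)[0].endswith(ARCHIVE_SUFFIXES)


def fetch_html(url):
    if looks_like_archive(url):
        # A HEAD response carries headers but no body, so a multi-megabyte
        # sdist can be rejected without downloading it.
        head = requests.head(url, allow_redirects=True)
        head.raise_for_status()
        ctype = head.headers.get("Content-Type", "")
        if not ctype.lower().startswith("text/html"):
            raise ValueError("expected text/html, got %r" % ctype)
    resp = requests.get(url, headers={"Accept": "text/html",
                                      "Cache-Control": "max-age=0"})
    resp.raise_for_status()
    return resp
```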
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
             "_get_html_page() missing 1 required keyword argument: 'session'"
         )
 
-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]
 
     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None
 
-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)
 
-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly returned cached data for
-                # /simple/, because authors generally expecting that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)
 
 
 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)
 
-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue
@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):
 
         return InstallationCandidate(search.supplied, version, link)
 
-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-
 
 def egg_info_matches(
         egg_info, search_name, link,
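One detail in the rewritten `_get_html_page()` is worth spelling out: the trailing slash added before `urljoin()` is load-bearing, because `urljoin` drops the final path segment of a base URL that does not end in a slash. A quick interpreter check, using the same vendored import as `index.py` (the example values are illustrative):

```python
from pip._vendor.six.moves.urllib import parse as urllib_parse

# Without the trailing slash, urljoin trims the final segment:
urllib_parse.urljoin("file:///srv/packages", "index.html")
# -> 'file:///srv/index.html'

# With the trailing slash, the directory is preserved:
urllib_parse.urljoin("file:///srv/packages/", "index.html")
# -> 'file:///srv/packages/index.html'
```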
diff --git a/tests/unit/test_index_html_page.py b/tests/unit/test_index_html_page.py
new file mode 100644
index 00000000000..c872ad06563
--- /dev/null
+++ b/tests/unit/test_index_html_page.py
@@ -0,0 +1,162 @@
+import logging
+
+import mock
+import pytest
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.download import PipSession
+from pip._internal.index import (
+    Link, _get_html_page, _get_html_response, _NotHTML, _NotHTTP,
+)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "ftp://python.org/python-3.7.1.zip",
+        "file:///opt/data/pip-18.0.tar.gz",
+    ],
+)
+def test_get_html_response_archive_to_naive_scheme(url):
+    """
+    `_get_html_response()` should error on an archive-like URL if the scheme
+    does not allow "poking" without getting data.
+    """
+    with pytest.raises(_NotHTTP):
+        _get_html_response(url, session=mock.Mock(PipSession))
+
+
+@pytest.mark.parametrize(
+    "url, content_type",
+    [
+        ("http://python.org/python-3.7.1.zip", "application/zip"),
+        ("https://pypi.org/pip-18.0.tar.gz", "application/gzip"),
+    ],
+)
+def test_get_html_response_archive_to_http_scheme(url, content_type):
+    """
+    `_get_html_response()` should send a HEAD request on an archive-like URL
+    if the scheme supports it, and raise `_NotHTML` if the response isn't
+    HTML.
+    """
+    session = mock.Mock(PipSession)
+    session.head.return_value = mock.Mock(**{
+        "request.method": "HEAD",
+        "headers": {"Content-Type": content_type},
+    })
+
+    with pytest.raises(_NotHTML) as ctx:
+        _get_html_response(url, session=session)
+
+    session.assert_has_calls([
+        mock.call.head(url, allow_redirects=True),
+    ])
+    assert ctx.value.args == (content_type, "HEAD")
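The tests above configure nested response attributes in a single call. In `mock`, a keyword whose name contains dots configures attributes of child mocks, which is how `session.head.return_value` ends up with a usable `request.method`. A minimal demonstration of that mechanism (the Content-Type value here is just an example):

```python
import mock

resp = mock.Mock(**{
    "request.method": "HEAD",                        # sets resp.request.method
    "headers": {"Content-Type": "application/zip"},  # plain attribute
})
assert resp.request.method == "HEAD"
assert resp.headers["Content-Type"] == "application/zip"
```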
+ """ + session = mock.Mock(PipSession) + session.head.return_value = mock.Mock(**{ + "request.method": "HEAD", + "headers": {"Content-Type": content_type}, + }) + + with pytest.raises(_NotHTML) as ctx: + _get_html_response(url, session=session) + + session.assert_has_calls([ + mock.call.head(url, allow_redirects=True), + ]) + assert ctx.value.args == (content_type, "HEAD") + + +@pytest.mark.parametrize( + "url", + [ + "http://python.org/python-3.7.1.zip", + "https://pypi.org/pip-18.0.tar.gz", + ], +) +def test_get_html_response_archive_to_http_scheme_is_html(url): + """ + `_get_html_response()` should work with archive-like URLs if the HEAD + request is responded with text/html. + """ + session = mock.Mock(PipSession) + session.head.return_value = mock.Mock(**{ + "request.method": "HEAD", + "headers": {"Content-Type": "text/html"}, + }) + session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) + + resp = _get_html_response(url, session=session) + + assert resp is not None + assert session.mock_calls == [ + mock.call.head(url, allow_redirects=True), + mock.call.head().raise_for_status(), + mock.call.get(url, headers={ + "Accept": "text/html", "Cache-Control": "max-age=0", + }), + mock.call.get().raise_for_status(), + ] + + +@pytest.mark.parametrize( + "url", + [ + "https://pypi.org/simple/pip", + "https://pypi.org/simple/pip/", + "https://python.org/sitemap.xml", + ], +) +def test_get_html_response_no_head(url): + """ + `_get_html_response()` shouldn't send a HEAD request if the URL does not + look like an archive, only the GET request that retrieves data. + """ + session = mock.Mock(PipSession) + + # Mock the headers dict to ensure it is accessed. + session.get.return_value = mock.Mock(headers=mock.Mock(**{ + "get.return_value": "text/html", + })) + + resp = _get_html_response(url, session=session) + + assert resp is not None + assert session.head.call_count == 0 + assert session.get.mock_calls == [ + mock.call(url, headers={ + "Accept": "text/html", "Cache-Control": "max-age=0", + }), + mock.call().raise_for_status(), + mock.call().headers.get("Content-Type", ""), + ] + + +@pytest.mark.parametrize( + "url, vcs_scheme", + [ + ("svn+http://pypi.org/something", "svn"), + ("git+https://github.com/pypa/pip.git", "git"), + ], +) +def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme): + """`_get_html_page()` should error if an invalid scheme is given. + + Only file:, http:, https:, and ftp: are allowed. + """ + with caplog.at_level(logging.DEBUG): + page = _get_html_page(Link(url), session=mock.Mock(PipSession)) + + assert page is None + assert caplog.record_tuples == [ + ( + "pip._internal.index", + logging.DEBUG, + "Cannot look at {} URL {}".format(vcs_scheme, url), + ), + ] + + +def test_get_html_page_directory_append_index(tmpdir): + """`_get_html_page()` should append "index.html" to a directory URL. + """ + dirpath = tmpdir.mkdir("something") + dir_url = "file:///{}".format( + urllib_request.pathname2url(dirpath).lstrip("/"), + ) + + session = mock.Mock(PipSession) + with mock.patch("pip._internal.index._get_html_response") as mock_func: + _get_html_page(Link(dir_url), session=session) + assert mock_func.mock_calls == [ + mock.call( + "{}/index.html".format(dir_url.rstrip("/")), + session=session, + ), + ]