Skip to content

Commit

Permalink
Merge pull request #93 from ericdill/new-maps
Browse files Browse the repository at this point in the history
REF update to new metadata package
  • Loading branch information
ericdill authored May 5, 2023
2 parents 0853610 + 729b4a3 commit d7dce34
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 89 deletions.
13 changes: 10 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,28 @@ jobs:
uses: mamba-org/provision-with-micromamba@v15
with:
environment-file: false

- name: Python ${{ matrix.python-version }}
run: |
micromamba create --name TEST python=${{ matrix.python-version }} pip --file requirements-dev.txt --channel conda-forge
micromamba activate TEST
- name: test w/o conda-forge-metadata
run: |
micromamba activate TEST
depfinder --help
pip install -e . --force-reinstall
coverage run -m pytest -vrsx test.py
- name: test
- name: test w/ conda-forge-metadata
run: |
micromamba activate TEST
pip install -e .[conda-forge] --force-reinstall
coverage run -m pytest -vrsx test.py
- name: coverage
run: |
micromamba activate TEST
coverage report -m
codecov
6 changes: 3 additions & 3 deletions depfinder/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def simple_import_search_conda_forge_import_map(path_to_source_code, builtins=No
for name, md in total_import.items():
total_imports[name].update(md)
from .reports import report_conda_forge_names_from_import_map
imports, _, _ = report_conda_forge_names_from_import_map(
imports, _ = report_conda_forge_names_from_import_map(
total_imports, builtin_modules=builtins, ignore=ignore
)
return {k: sorted(list(v)) for k, v in imports.items()}
Expand Down Expand Up @@ -288,7 +288,7 @@ def simple_import_to_pkg_map(path_to_source_code, builtins=None, ignore=None, cu
for name, md in total_import.items():
total_imports[name].update(md)
from .reports import report_conda_forge_names_from_import_map
_, _, import_to_artifact = report_conda_forge_names_from_import_map(
_, import_to_pkg = report_conda_forge_names_from_import_map(
total_imports, builtin_modules=builtins, ignore=ignore
)
return import_to_artifact
return import_to_pkg
84 changes: 25 additions & 59 deletions depfinder/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,45 +30,15 @@
from __future__ import print_function, division, absolute_import

import logging
import sys
from concurrent.futures._base import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
from fnmatch import fnmatch
from functools import lru_cache

import requests

from .stdliblist import builtin_modules as _builtin_modules
from .utils import SKETCHY_TYPES_TABLE

logger = logging.getLogger('depfinder')


@lru_cache()
def _import_map_num_letters():
    """Return how many leading letters the libcfgraph import maps are sharded by.

    Fetches the import-map metadata JSON from the regro/libcfgraph repository
    and returns its ``num_letters`` field as an ``int``. The result is cached,
    so the network round-trip happens at most once per process.
    """
    metadata_url = ('https://raw.githubusercontent.com/regro/libcfgraph/master'
                    '/import_maps_meta.json')
    response = requests.get(metadata_url)
    response.raise_for_status()
    return int(response.json()['num_letters'])


@lru_cache()
def _import_map_cache(import_first_letters):
    """Fetch one shard of the libcfgraph import map.

    Parameters
    ----------
    import_first_letters : str
        The first N letters of the import name (N comes from
        ``_import_map_num_letters``); lower-cased before building the URL.

    Returns
    -------
    dict
        Mapping of import name to the set of artifact names that supply it,
        or an empty dict when the shard could not be fetched.
    """
    req = requests.get(
        f'https://raw.githubusercontent.com/regro/libcfgraph'
        f'/master/import_maps/{import_first_letters.lower()}.json')
    if not req.ok:
        # Report through the module logger rather than print() so failures
        # show up in normal logging output, consistent with the rest of the
        # module. NOTE(review): because of @lru_cache, a transient network
        # failure (the empty dict) stays memoized for the process lifetime.
        logger.error('Request to %s failed', req.url)
        return {}
    return {k: set(v['elements']) for k, v in req.json().items()}


# Full recursive file listing of the regro/libcfgraph repo. Fetched eagerly,
# i.e. this performs a network request at module-import time.
FILE_LISTING = requests.get('https://raw.githubusercontent.com/regro/libcfgraph/master/.file_listing.json').json()
# TODO: upstream this to libcfgraph so we just request it, so we reduce bandwidth requirements
# Map an artifact's file stem (path basename with its extension stripped) to
# the package name (second path component) for every 'artifacts' entry.
ARTIFACT_TO_PKG = {v.split('/')[-1].rsplit('.', 1)[0]: v.split('/')[1] for v in FILE_LISTING if 'artifacts' in v}
# Package names from cf-graph-countyfair's ranked hubs/authorities listing —
# presumably ordered by centrality score; used downstream to pick the most
# likely package when several supply the same import (TODO confirm ordering).
hubs_auths = requests.get(
    'https://raw.githubusercontent.com/regro/cf-graph-countyfair/master/ranked_hubs_authorities.json').json()
logger = logging.getLogger('depfinder')


def extract_pkg_from_import(name):
Expand All @@ -82,29 +52,25 @@ def extract_pkg_from_import(name):
Returns
-------
most_likely_pkg : str
The most likely conda-forge package.
import_to_pkg : dict mapping str to sets
A dict mapping the import name to a set of possible packages that supply that import.
"""
num_letters = _import_map_num_letters()
original_name = name
while True:
try:
fllt = name[:min(len(name), num_letters)]
import_map = _import_map_cache(fllt)
supplying_artifacts = import_map[name]
except KeyError:
if '.' not in name:
return original_name, {}, {}
name = name.rsplit('.', 1)[0]
pass
else:
break
import_to_artifact = {name: supplying_artifacts}
# TODO: launder supplying_pkgs through centrality scoring so we have one thing
# but keep the rest for the more detailed reports
supplying_pkgs = {ARTIFACT_TO_PKG[k] for k in supplying_artifacts}
import_to_pkg = {name: supplying_pkgs}

return next(iter(k for k in hubs_auths if k in supplying_pkgs), original_name), import_to_artifact, import_to_pkg
from conda_forge_metadata.autotick_bot import map_import_to_package
from conda_forge_metadata.libcfgraph import get_libcfgraph_pkgs_for_import
try:
supplying_pkgs, _ = get_libcfgraph_pkgs_for_import(name)
best_import = map_import_to_package(name)
except Exception:
logger.exception(
"could not get package name from conda-forge metadata "
f"for import {name} due to an error"
)
supplying_pkgs = set()
best_import = name
import_to_pkg = {name: supplying_pkgs or set()}
return best_import, import_to_pkg


def recursively_search_for_name(name, module_names):
Expand All @@ -126,7 +92,6 @@ def report_conda_forge_names_from_import_map(total_imports, builtin_modules=None
report_keys = ['required', 'questionable', 'builtin', 'questionable no match', 'required no match']
report = {k: set() for k in report_keys}
import_to_pkg = {k: {} for k in report_keys}
import_to_artifact = {k: {} for k in report_keys}
futures = {}

with ThreadPoolExecutor() as pool:
Expand All @@ -140,28 +105,29 @@ def report_conda_forge_names_from_import_map(total_imports, builtin_modules=None
futures[future] = md
for future in as_completed(futures):
md = futures[future]
most_likely_pkg, _import_to_artifact, _import_to_pkg = future.result()
most_likely_pkg, _import_to_pkg = future.result()

for (filename, lineno), import_metadata in md.items():
# Make certain to throw out imports, since an import can happen multiple times
# under different situations, import matplotlib is required by a test file
# but is questionable for a regular file
if any(fnmatch(filename, ignore_element) for ignore_element in ignore):
continue
_name = list(_import_to_pkg.keys())[0]
if any(import_metadata.get(v, False) for v in SKETCHY_TYPES_TABLE.values()):
# if we couldn't find any artifacts to represent this then it doesn't exist in our maps
if not _import_to_artifact:
if not _import_to_pkg[_name]:
report_key = 'questionable no match'
else:
report_key = 'questionable'
else:
# if we couldn't find any artifacts to represent this then it doesn't exist in our maps
if not _import_to_artifact:
if not _import_to_pkg[_name]:
report_key = 'required no match'
else:
report_key = 'required'

report[report_key].add(most_likely_pkg)
import_to_pkg[report_key].update(_import_to_pkg)
import_to_artifact[report_key].update(_import_to_artifact)
return report, import_to_artifact, import_to_pkg

return report, import_to_pkg
14 changes: 10 additions & 4 deletions depfinder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import sys

import requests
import requests.exceptions
import yaml
from .stdliblist import builtin_modules

logger = logging.getLogger("depfinder")

SKETCHY_TYPES_TABLE = {}

Expand Down Expand Up @@ -68,10 +70,14 @@
Loader=yaml_loader,
)

req = requests.get('https://raw.githubusercontent.com/regro/cf-graph-countyfair/master/mappings/pypi/name_mapping.yaml')
if req.status_code == 200:
mapping_list = yaml.load(req.text, Loader=yaml_loader)
else:
try:
import conda_forge_metadata.autotick_bot
mapping_list = conda_forge_metadata.autotick_bot.get_pypi_name_mapping()
except (ImportError, AttributeError, requests.exceptions.HTTPError):
logger.exception(
"could not get the conda-forge metadata pypi-to-conda name mapping "
"due to error. defaulting to an internal one which may be out of date."
)
mapping_list = yaml.load(
pkgutil.get_data(__name__, 'pkg_data/name_mapping.yml').decode(),
Loader=yaml_loader,
Expand Down
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,17 @@ install_requires =
pyyaml
stdlib-list; python_version < "3.10"
requests

python_requires = >=2.7
packages = find:

[options.entry_points]
console_scripts =
depfinder = depfinder.cli:cli

[options.extras_require]
conda-forge =
conda-forge-metadata>=0.3.0

[flake8]
max-line-length=300
90 changes: 70 additions & 20 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
from depfinder.reports import report_conda_forge_names_from_import_map, extract_pkg_from_import, \
recursively_search_for_name, _builtin_modules

try:
import conda_forge_metadata # noqa
HAS_CF_METADATA = True
except ImportError:
HAS_CF_METADATA = False

random.seed(12345)

# Testing spec:
Expand Down Expand Up @@ -449,36 +455,56 @@ def test_get_top_level_import():
assert top_level_name == 'google.cloud.storage'


@pytest.mark.skipif(
not HAS_CF_METADATA,
reason="test of optional conda-forge-metadata integration",
)
def test_report_conda_forge_names_from_import_map():
m, f, c = parse_file(join(dirname(depfinder.__file__), 'utils.py'))
report, import_to_artifact, import_to_pkg = report_conda_forge_names_from_import_map(c.total_imports)
report, import_to_pkg = report_conda_forge_names_from_import_map(c.total_imports)
assert report['required'] == {'pyyaml', 'requests'}


@pytest.mark.skipif(
not HAS_CF_METADATA,
reason="test of optional conda-forge-metadata integration",
)
def test_report_conda_forge_names_from_import_map_ignore():
m, f, c = parse_file(join(dirname(depfinder.__file__), 'inspection.py'))
report, import_to_artifact, import_to_pkg = report_conda_forge_names_from_import_map(c.total_imports,
ignore=['*insp*'])
report, import_to_pkg = report_conda_forge_names_from_import_map(
c.total_imports,
ignore=['*insp*'],
)
assert report['required'] == set()


@pytest.mark.skipif(
    not HAS_CF_METADATA,
    reason="test of optional conda-forge-metadata integration",
)
def test_simple_import_search_conda_forge_import_map():
    # Run the conda-forge import-map search over depfinder's own sources.
    source_dir = dirname(depfinder.__file__)
    report = simple_import_search_conda_forge_import_map(source_dir)
    # depfinder itself must require exactly pyyaml and requests.
    assert report['required'] == sorted({"pyyaml", "requests"})


@pytest.mark.skipif(
not HAS_CF_METADATA,
reason="test of optional conda-forge-metadata integration",
)
@pytest.mark.parametrize('import_name, expected_result', [
('six.moves', 'six'),
('win32com.shell', 'pywin32'),
('win32com', 'pywin32'),
# these need special casing elsewhere
# ('win32com.shell', 'pywin32'),
# ('win32com', 'pywin32'),
("scipy.interpolate", "scipy"),
# this comes from cython but doesn't seem to be a real pkg
('refnanny.hi', 'refnanny.hi')
])
def test_extract_pkg_from_import_for_complex_imports(import_name, expected_result):
result, _, _ = extract_pkg_from_import(import_name)
assert result == expected_result
result, allpkgs = extract_pkg_from_import(import_name)
assert result == expected_result, allpkgs


@pytest.mark.parametrize('import_name, expected_result', [
Expand All @@ -489,19 +515,43 @@ def test_search_for_name(import_name, expected_result):
assert builtin_name_maybe == expected_result


@pytest.mark.skipif(
not HAS_CF_METADATA,
reason="test of optional conda-forge-metadata integration",
)
def test_simple_import_to_pkg_map():
path_to_source = dirname(depfinder.__file__)
import_to_artifact = simple_import_to_pkg_map(path_to_source)
expected_result = {'builtin': {},
'questionable': {'stdlib_list': {'stdlib-list'}, 'IPython.core.inputsplitter': {'ipython', 'autovizwidget'}},
'questionable no match': {},
'required': {'requests': {'apache-libcloud',
'arm_pyart',
'autovizwidget',
'dbxfs',
'google-api-core',
'google-cloud-bigquery-storage-core',
'requests'},
'yaml': {'google-cloud-bigquery-storage-core', 'pyyaml'}},
'required no match': {}}
assert import_to_artifact == expected_result
expected_result = {
'builtin': {},
'questionable': {
'stdlib_list': {'stdlib-list'},
'IPython.core.inputsplitter': {'ipython', 'autovizwidget'},
'conda_forge_metadata.autotick_bot': {'conda-forge-metadata'},
'conda_forge_metadata.libcfgraph': {'conda-forge-metadata'},
},
'questionable no match': {},
'required': {
'requests': {
'apache-libcloud',
'arm_pyart',
'autovizwidget',
'dbxfs',
'google-api-core',
'google-cloud-bigquery-storage-core',
'requests'
},
'requests.exceptions': {
'apache-libcloud',
'arm_pyart',
'autovizwidget',
'dbxfs',
'google-api-core',
'google-cloud-bigquery-storage-core',
'requests'
},
'yaml': {'google-cloud-bigquery-storage-core', 'pyyaml', 'rosco'}
},
'required no match': {}
}
assert import_to_artifact == expected_result

0 comments on commit d7dce34

Please sign in to comment.