Skip to content

Commit

Permalink
Fix support for pyproject.toml dependencies with multiple selectors (
Browse files Browse the repository at this point in the history
…#597)

* Change approach to merging toml and setup deps

We want to prioritize toml deps over setup deps.  We don't want
to remove redundant packages in toml deps because these can
have corresponding, different selectors.

* Change approach to merging pypi and sdist deps

We want to prioritize sdist deps over pypi deps.  We don't want
to remove redundant packages in sdist deps because these can
have corresponding, different selectors.

The progressbar has become a bit more complicated because we
iterate separately over sdist and pypi dependencies.

* Make sure `is_arch` is `True` if any selectors

While `is_arch` gets set to true if there are extras, we also
want to do so if selectors have already been defined.

* Don't remove "duplicates" with different selectors

When removing duplicate packages, we want to consider different
selectors as unique.  Thus, we only remove dependencies if they
have the same package name *and* the same selector.

* Don't drop selectors when formatting deps

Previously, selectors were getting dropped as tags and comments

* Add several pypi tests

These tests are designed to make sure multiple selectors for the
same package are preserved correctly at every stage of the process
of merging dependencies from different sources.

When parsing poetry dependencies, we introduce selectors early
in the process, whereas for other approaches, we retain "extras"
(separated by semicolons) for much of the merging process. Packages
employing each strategy are used in the added tests.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
xylar and pre-commit-ci[bot] authored Dec 30, 2024
1 parent 32c230d commit f3e910f
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 32 deletions.
36 changes: 24 additions & 12 deletions grayskull/strategy/py_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,18 +696,30 @@ def download_sdist_pkg(sdist_url: str, dest: str, name: str | None = None):

def merge_deps_toml_setup(setup_deps: list, toml_deps: list) -> list:
    """Merge the dependency lists from ``pyproject.toml`` and ``setup.py``.

    Dependencies from ``pyproject.toml`` take priority: every non-empty toml
    dependency is kept as-is, and a ``setup.py`` dependency is only appended
    when no toml dependency refers to the same package (accepting ``-``/``_``
    spelling variants of the name).  Redundant toml entries are deliberately
    NOT collapsed because the same package may legitimately appear several
    times with different selectors.

    :param setup_deps: dependency strings collected from ``setup.py``
    :param toml_deps: dependency strings collected from ``pyproject.toml``
    :return: merged list — toml deps first, then non-duplicate setup deps
    """
    # split on whitespace or any version-constraint operator; the first
    # token is the bare package name
    re_split = re.compile(r"\s+|>|=|<|~|!")

    # drop any empty deps
    setup_deps = [dep for dep in setup_deps if dep.strip()]
    toml_deps = [dep for dep in toml_deps if dep.strip()]

    # set for O(1) membership tests below
    toml_dep_names = {re_split.split(dep)[0] for dep in toml_deps}

    # prefer toml over setup; only add setup deps if not found in toml
    merged_deps = list(toml_deps)
    for dep in setup_deps:
        dep_name = re_split.split(dep)[0]
        if not dep_name.strip():
            continue
        # "-" and "_" are interchangeable in package names, so try both
        alternatives = (
            dep_name,
            dep_name.replace("_", "-"),
            dep_name.replace("-", "_"),
        )
        # only add the setup dep if no alternative name was found in toml
        if not any(alt in toml_dep_names for alt in alternatives):
            merged_deps.append(dep)

    return merged_deps


def merge_setup_toml_metadata(setup_metadata: dict, pyproject_metadata: dict) -> dict:
Expand Down
54 changes: 39 additions & 15 deletions grayskull/strategy/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,16 @@ def merge_requires_dist(pypi_metadata: dict, sdist_metadata: dict) -> list:
:param sdist_metadata: sdist metadata
:return: list with all requirements
"""
all_deps = sdist_metadata.get("install_requires") or []
all_deps += pypi_metadata.get("requires_dist") or []

re_search = re.compile(r";\s*extra")
all_deps = [pkg for pkg in all_deps if not re_search.search(pkg)]
current_pkg = pypi_metadata.get("name", "")

requires_dist = []
pypi_deps_name = set()
with progressbar_with_status(len(all_deps)) as bar:
for pos, sdist_pkg in enumerate(all_deps, 1):
match_deps = RE_DEPS_NAME.match(sdist_pkg)
def _get_pkg_names(deps, current_pkg, bar, pos):
"""
Get pkg names for each dep, dropping deps where no pkg name is found
"""
new_deps = []
pkg_names = []
for dep in deps:
pos += 1
match_deps = RE_DEPS_NAME.match(dep)
if not match_deps:
bar.update(pos)
continue
Expand All @@ -205,11 +203,32 @@ def merge_requires_dist(pypi_metadata: dict, sdist_metadata: dict) -> list:
bar.update(pos, pkg_name=pkg_name)
if current_pkg and current_pkg == pkg_name:
continue
if pkg_name in pypi_deps_name:
continue
new_deps.append(dep.replace(match_deps, pkg_name))
pkg_names.append(pkg_name)
return pkg_names, new_deps

sdist_deps = sdist_metadata.get("install_requires") or []
pypi_deps = pypi_metadata.get("requires_dist") or []

re_search = re.compile(r";\s*extra")
sdist_deps = [pkg for pkg in sdist_deps if not re_search.search(pkg)]
pypi_deps = [pkg for pkg in pypi_deps if not re_search.search(pkg)]
current_pkg = pypi_metadata.get("name", "")

count = len(sdist_deps) + len(pypi_deps)
# progress bar here because calling normalize_pkg_name() on each package
# takes a while
with progressbar_with_status(count) as bar:
pos = 0
sdist_pkgs, sdist_deps = _get_pkg_names(sdist_deps, current_pkg, bar, pos)
pypi_pkgs, pypi_deps = _get_pkg_names(pypi_deps, current_pkg, bar, pos)

# prefer sdist over pypi; only add pypi deps if not found in sdist
requires_dist = sdist_deps
for pkg_name, dep in zip(pypi_pkgs, pypi_deps):
if pkg_name not in sdist_pkgs:
requires_dist.append(dep)

pypi_deps_name.add(pkg_name)
requires_dist.append(sdist_pkg.replace(match_deps, pkg_name))
return requires_dist


Expand Down Expand Up @@ -322,8 +341,13 @@ def get_pypi_metadata(config: Configuration) -> PypiMetadata:
def get_run_req_from_requires_dist(requires_dist: list, config: Configuration) -> list:
"""Get the run requirements looking for the `requires_dist` key
present in the metadata"""
re_selector = re.compile(r"\s+#\s+\[.*\]", re.DOTALL)
run_req = []
for req in requires_dist:
if re_selector.search(req):
# if there's already a selector, make sure we're arch
config.is_arch = True

list_raw_requirements = req.split(";")
selector = ""
if len(list_raw_requirements) > 1:
Expand Down
24 changes: 19 additions & 5 deletions grayskull/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,32 +128,41 @@ def rm_duplicated_deps(all_requirements: list | set | None) -> list | None:
# and underscores converted to dashes. The value is the requirement itself,
# as it should be added.
# (This is order-preserving since dicts are ordered by first insertion.)
new_reqs: dict[str, str] = {}
new_reqs: dict[tuple[str, str], str] = {}
re_split = re.compile(r"\s+(|>|=|<|~|!|#)+")
for dep in all_requirements:
if dep.strip().startswith(("{{", "<{")):
new_reqs[dep] = dep
continue
dep_name, *constrains = re_split.split(dep.strip())
dep_name = dep_name.strip()

if "#" in dep:
selector = dep.split("#")[-1]
else:
selector = ""

constrains = [
c.strip()
for c in constrains
if c.strip() not in {"=*", "==*", "*", "*.*", "*.*.*", ""}
]
canonicalized = dep_name.replace("_", "-").lower()
constrains.insert(0, dep_name)
if canonicalized in new_reqs:
# a canonicalized dependency is only redundant if it also has the same
# selector as a previous dependency
key = (canonicalized, selector)
if key in new_reqs:
# In order to break ties deterministically, we prioritize the requirement
# which is alphanumerically lowest. This happens to prioritize the "-"
# character over "_".
# Example: given "importlib_metadata" and "importlib-metadata", we will
# keep "importlib-metadata" because it is alphabetically lower.
previous_req = new_reqs[canonicalized]
previous_req = new_reqs[key]
if len(dep) > len(previous_req) or "-" in dep_name:
new_reqs[canonicalized] = " ".join(constrains)
new_reqs[key] = " ".join(constrains)
else:
new_reqs[canonicalized] = " ".join(constrains)
new_reqs[key] = " ".join(constrains)
return [re.sub(r"\s+(#)", " \\1", v.strip()) for v in new_reqs.values()]


Expand All @@ -168,6 +177,7 @@ def format_dependencies(all_dependencies: list, name: str) -> list:
formatted_dependencies = []
re_deps = re.compile(r"^\s*([\.a-zA-Z0-9_-]+)\s*(.*)\s*$", re.MULTILINE | re.DOTALL)
re_remove_space = re.compile(r"([<>!=]+)\s+")
re_selector = re.compile(r"\s+#\s+\[.*\]", re.DOTALL)
re_remove_tags = re.compile(r"\s*(\[.*\])", re.DOTALL)
re_remove_comments = re.compile(r"\s+#.*", re.DOTALL)
for req in all_dependencies:
Expand All @@ -184,6 +194,10 @@ def format_dependencies(all_dependencies: list, name: str) -> list:
if len(match_req) > 1:
deps_name = " ".join(match_req)
deps_name = re_remove_space.sub(r"\1", deps_name.strip())
if re_selector.search(deps_name):
# don't want to remove selectors
formatted_dependencies.append(deps_name)
continue
deps_name = re_remove_tags.sub(r" ", deps_name.strip())
deps_name = re_remove_comments.sub("", deps_name)
formatted_dependencies.append(deps_name.strip())
Expand Down
162 changes: 162 additions & 0 deletions tests/test_pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,119 @@ def test_merge_pypi_sdist_metadata():
)


def test_pypi_metadata_constraints_for_python_versions():
    """PyPI metadata must keep duplicate packages that differ only by marker."""
    expected_trino = [
        "apache-airflow-providers-common-sql>=1.20.0",
        "apache-airflow>=2.9.0",
        'pandas<2.2,>=1.5.3; python_version < "3.9"',
        'pandas<2.2,>=2.1.2; python_version >= "3.9"',
        "trino>=0.318.0",
        'apache-airflow-providers-google; extra == "google"',
        'apache-airflow-providers-openlineage; extra == "openlineage"',
    ]
    cfg = Configuration(name="apache-airflow-providers-trino", version="6.0.0")
    metadata = get_pypi_metadata(cfg)
    assert sorted(metadata["requires_dist"]) == sorted(expected_trino)

    expected_databricks = [
        'alembic<2.0.0,>=1.0.11; extra == "alembic"',
        "lz4<5.0.0,>=4.0.2",
        'numpy>=1.16.6; python_version >= "3.8" and python_version < "3.11"',
        'numpy>=1.23.4; python_version >= "3.11"',
        "oauthlib<4.0.0,>=3.1.0",
        "openpyxl<4.0.0,>=3.0.10",
        'pandas<2.3.0,>=1.2.5; python_version >= "3.8"',
        "pyarrow>=14.0.1",
        "requests<3.0.0,>=2.18.1",
        'sqlalchemy>=2.0.21; extra == "sqlalchemy" or extra == "alembic"',
        "thrift<0.21.0,>=0.16.0",
        "urllib3>=1.26",
    ]
    cfg = Configuration(name="databricks-sql-connector", version="3.7.0")
    metadata = get_pypi_metadata(cfg)
    assert sorted(metadata["requires_dist"]) == sorted(expected_databricks)


def test_sdist_metadata_from_toml_project_dependencies():
    """sdist metadata from [project] deps keeps per-marker pandas entries."""
    cfg = Configuration(name="apache-airflow-providers-trino", version="6.0.0")
    pypi_meta = get_pypi_metadata(cfg)
    sdist_meta = get_sdist_metadata(pypi_meta["sdist_url"], cfg)
    expected = [
        "apache-airflow-providers-common-sql>=1.20.0",
        "apache-airflow>=2.9.0",
        'pandas>=1.5.3,<2.2;python_version<"3.9"',
        'pandas>=2.1.2,<2.2;python_version>="3.9"',
        "trino>=0.318.0",
        "python ~=3.9",
    ]
    assert sorted(sdist_meta["install_requires"]) == sorted(expected)


def test_sdist_metadata_from_toml_poetry_dependencies():
    """sdist metadata from poetry deps keeps per-selector numpy entries."""
    cfg = Configuration(name="databricks-sql-connector", version="3.7.0")
    pypi_meta = get_pypi_metadata(cfg)
    sdist_meta = get_sdist_metadata(pypi_meta["sdist_url"], cfg)
    expected = [
        "python >=3.8.0,<4.0.0",
        "thrift >=0.16.0,<0.21.0",
        "pandas >=1.2.5,<2.3.0 # [py>=38]",
        "pyarrow >=14.0.1",
        "lz4 >=4.0.2,<5.0.0",
        "requests >=2.18.1,<3.0.0",
        "oauthlib >=3.1.0,<4.0.0",
        "numpy >=1.16.6 # [py>=38 and py<311]",
        "numpy >=1.23.4 # [py>=311]",
        "openpyxl >=3.0.10,<4.0.0",
        "urllib3 >=1.26",
    ]
    assert sorted(sdist_meta["install_requires"]) == sorted(expected)


def test_merge_pypi_sdist_metadata_from_toml():
    """Merging pypi + sdist metadata must preserve per-selector duplicates."""

    def merged_requires(name, version):
        # run the full pypi -> sdist -> merge pipeline for one package
        cfg = Configuration(name=name, version=version)
        pypi_meta = get_pypi_metadata(cfg)
        sdist_meta = get_sdist_metadata(pypi_meta["sdist_url"], cfg)
        merged = merge_pypi_sdist_metadata(pypi_meta, sdist_meta, cfg)
        return sorted(merged["requires_dist"])

    # tests merging pyproject.toml dependencies from poetry with pypi data,
    # including multiple numpy constraints with python version selectors
    assert merged_requires("databricks-sql-connector", "3.7.0") == sorted(
        [
            "python >=3.8.0,<4.0.0",
            "thrift >=0.16.0,<0.21.0",
            "pandas >=1.2.5,<2.3.0 # [py>=38]",
            "pyarrow >=14.0.1",
            "lz4 >=4.0.2,<5.0.0",
            "requests >=2.18.1,<3.0.0",
            "oauthlib >=3.1.0,<4.0.0",
            "numpy >=1.16.6 # [py>=38 and py<311]",
            "numpy >=1.23.4 # [py>=311]",
            "openpyxl >=3.0.10,<4.0.0",
            "urllib3 >=1.26",
        ]
    )

    # tests merging pyproject.toml project dependencies with pypi data,
    # including multiple pandas constraints with python version selectors
    assert merged_requires("apache-airflow-providers-trino", "6.0.0") == sorted(
        [
            "apache-airflow-providers-common-sql>=1.20.0",
            "apache-airflow>=2.9.0",
            'pandas>=1.5.3,<2.2;python_version<"3.9"',
            'pandas>=2.1.2,<2.2;python_version>="3.9"',
            "trino>=0.318.0",
            "python ~=3.9",
        ]
    )


def test_update_requirements_with_pin():
req = {
"build": ["<{ compiler('c') }}"],
Expand Down Expand Up @@ -787,6 +900,55 @@ def test_run_requirements_sdist():
]
)

# a more complex example with selectors
config = Configuration(name="apache-airflow-providers-trino", version="6.0.0")
recipe = GrayskullFactory.create_recipe("pypi", config)
assert "noarch" not in recipe["build"]
selectors = {
"python >=3.9,<4.dev0": None,
"apache-airflow-providers-common-sql >=1.20.0": None,
"apache-airflow >=2.9.0": None,
"pandas >=1.5.3,<2.2": "[py<39]",
"pandas >=2.1.2,<2.2": "[py>=39]",
"trino-python-client >=0.318.0": None,
}

for dep in recipe["requirements"]["run"]:
dep_str = str(dep)
assert dep_str in selectors
selector = selectors[dep_str]
if dep.inline_comment is None:
assert selector is None
else:
assert str(dep.inline_comment) == selector

# another example with selectors
config = Configuration(name="databricks-sql-connector", version="3.7.0")
recipe = GrayskullFactory.create_recipe("pypi", config)
assert "noarch" not in recipe["build"]
selectors = {
"python >=3.8.0,<4.0.0": None,
"thrift >=0.16.0,<0.21.0": None,
"pandas >=1.2.5,<2.3.0": "[py>=38]",
"pyarrow >=14.0.1": None,
"lz4 >=4.0.2,<5.0.0": None,
"requests >=2.18.1,<3.0.0": None,
"oauthlib >=3.1.0,<4.0.0": None,
"numpy >=1.16.6": "[py>=38 and py<311]",
"numpy >=1.23.4": "[py>=311]",
"openpyxl >=3.0.10,<4.0.0": None,
"urllib3 >=1.26": None,
}

for dep in recipe["requirements"]["run"]:
dep_str = str(dep)
assert dep_str in selectors
selector = selectors[dep_str]
if dep.inline_comment is None:
assert selector is None
else:
assert str(dep.inline_comment) == selector


def test_format_host_requirements():
assert sorted(format_dependencies(["setuptools>=40.0", "pkg2"], "pkg1")) == sorted(
Expand Down

0 comments on commit f3e910f

Please sign in to comment.