From 108f5b46657bb50d784dcd48d9289129a1c2ccb0 Mon Sep 17 00:00:00 2001 From: Romain Date: Tue, 25 Apr 2023 11:29:19 -0700 Subject: [PATCH] Improved requirements.txt parsing. Updated documentation. (#9) Also introduced the possibility of having non python conda packages in a pure pip-only environment. A few bug fixes. Requires Metaflow 2.8.3+ now as there was a breaking internal change in that version. --- README.md | 37 +++++++++- .../cmd/environment/environment_cmd.py | 66 ++++++++++++----- .../netflix_ext/plugins/conda/conda.py | 71 ++++++++++++++++--- .../plugins/conda/conda_environment.py | 4 +- .../plugins/conda/conda_step_decorator.py | 21 +++++- setup.py | 4 +- 6 files changed, 167 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index cc24bfc..12d1979 100644 --- a/README.md +++ b/README.md @@ -104,8 +104,7 @@ The useful configuration values are listed below: - `CONDA_PREFERRED_FORMAT`: `.tar.bz2` or `.conda`. Prefer `.conda` for speed gains; any package not available in the preferred format will be transmuted to it automatically. If left empty, whatever package is found will be used (ie: there is no preference) -- `CONDA_DEFAULT_PIP_SOURCES`: list of additional mirrors to search for packages. Useful - if your company has an internal mirror. +- `CONDA_DEFAULT_PIP_SOURCE`: mirror to use for PIP. ##### Azure specific setup For Azure, you need to do the following two steps once during setup: @@ -210,6 +209,13 @@ environments are resolved. There are three cases: resolving the conda environment and then using `poetry` to resolve the additional `pip` packages within the confines of the defined `conda` environment. +Note that to support a bit more flexibility, you can have a pure Pip environment +as well as non-Python conda packages. This is similar to the mixed environment +but, in some cases, pip is more flexible than conda-lock in requirement specification +(for example, pip supports GIT repositories) so it makes it possible to gain the +flexibility of installing non python packages in your environment and still use +pip to resolve your python dependencies. + #### Additional command-line tool An additional `environment` command-line tool is available invoked as follows: `python myflow.py --environment=conda environment --help`. @@ -227,6 +233,33 @@ has the following sub-commands: - `alias`: aliases an environment giving it a name so it can be used later - `get`: fetches an environment from the remote environment store locally +#### Supported format for requirements.txt + +The requirements.txt file, which can be used to specify a pip only environment, supports +the following syntax (a subset of the full syntax supported by pip): +- Comment lines starting with '#' +- `--extra-index-url`: to spcify an additional repository to look at. Note that to + specify the `--index-url`, set it with the `METAFLOW_CONDA_DEFAULT_PIP_SOURCE + environment variable. +- `-f`, `--find-links` and `--trusted-host`: passed directly to pip with the + corresponding argument +- `--pre`, `--no-index`: passed directly to pip +- `--conda-pkg`: extension allowing you to specify a conda package that does not + need Python +- a requirement specification. This can include GIT repositories or local directories + as well as more classic package specification. Constraints and environment + markers are not supported. + +Note that GIT repositories, local directories and non wheel packages are not +compatible with cross-architecture resolution. Metaflow will build the wheel on the fly +when resolving the environment and this is only possible if the same architecture is used. + +If possible, it is best to specify pre-built packages. + +#### Supported format for environment.yml + +The environment.yml format is the same as the one for conda-lock. + #### Named environments Environments can optionally be given aliases. diff --git a/metaflow_extensions/netflix_ext/cmd/environment/environment_cmd.py b/metaflow_extensions/netflix_ext/cmd/environment/environment_cmd.py index 7f8b2a8..175cb5f 100644 --- a/metaflow_extensions/netflix_ext/cmd/environment/environment_cmd.py +++ b/metaflow_extensions/netflix_ext/cmd/environment/environment_cmd.py @@ -1,6 +1,7 @@ import json import os import platform +import re import shutil import subprocess import tempfile @@ -48,6 +49,9 @@ ) +REQ_SPLIT_LINE = re.compile(r"([^~<=>]*)([~<=>]+.*)?") + + class CommandObj: def __init__(self): pass @@ -470,6 +474,7 @@ def resolve( new_conda_deps = {} # type: Dict[str, str] new_pip_deps = {} # type: Dict[str, str] + new_np_conda_deps = {} # type: Dict[str, str] new_sources = [] # type: List[TStr] new_extras = [] # type: List[TStr] @@ -482,10 +487,10 @@ def resolve( archs = list(arch) if arch else [arch_id()] base_env_id = None base_env_conda_deps = {} # type: Dict[str, str] + base_env_np_conda_deps = {} # type: Dict[str, str] base_env_pip_deps = {} # type: Dict[str, str] - base_env_conda_channels = [] # type: List[str] - base_env_pip_sources = [] # type: List[str] base_env_extras = [] # type: List[TStr] + base_env_sources = [] # type: List[TStr] base_env_python = python base_env = None # type: Optional[ResolvedEnvironment] if using_str: @@ -522,15 +527,11 @@ def resolve( else: # We will re-add python later base_env_conda_deps[vals[0]] = vals[1] + elif d.category == "npconda": + base_env_np_conda_deps[vals[0]] = vals[1] # Now of channels/sources - all_sources = base_env.sources - base_env_conda_channels.extend( - [s.value for s in all_sources if s.category == "conda"] - ) - base_env_pip_sources.extend( - [s.value for s in all_sources if s.category == "pip"] - ) + base_env_sources = base_env.sources # Finally the extras base_env_extras = base_env.extras @@ -539,7 +540,9 @@ def resolve( if yml_file: _parse_yml_file(yml_file, new_extras, new_sources, new_conda_deps, new_pip_deps) if req_file: - _parse_req_file(req_file, new_extras, new_sources, new_pip_deps) + _parse_req_file( + req_file, new_extras, new_sources, new_pip_deps, new_np_conda_deps + ) if base_env_python is None: base_env_python = platform.python_version() @@ -560,11 +563,13 @@ def resolve( conda_deps.update(base_env_conda_deps) conda_deps.update(new_conda_deps) + np_conda_deps = dict(base_env_np_conda_deps) + np_conda_deps.update(new_np_conda_deps) # Compute the sources - seen = set() - sources = [] # type: List[str] - for c in chain(base_env_conda_channels, base_env_pip_sources, new_sources): + seen = set() # ttype: List[TStr] + sources = [] # type: List[TStr] + for c in chain(base_env_sources, new_sources): if c in seen: continue seen.add(c) @@ -581,6 +586,10 @@ def resolve( TStr("pip", "%s==%s" % (name, ver) if ver else name) for name, ver in pip_deps.items() ), + ( + TStr("npconda", "%s==%s" % (name, ver) if ver else name) + for name, ver in np_conda_deps.items() + ), ) ) @@ -812,7 +821,11 @@ def get(obj, default: bool, arch: Optional[str], source_env: str): def _parse_req_file( - file_name: str, extra_args: List[TStr], sources: List[TStr], deps: Dict[str, str] + file_name: str, + extra_args: List[TStr], + sources: List[TStr], + deps: Dict[str, str], + np_deps: Dict[str, str], ): with open(file_name, mode="r", encoding="utf-8") as f: for line in f: @@ -838,6 +851,18 @@ def _parse_req_file( ) elif first_word in ("--pre", "--no-index"): extra_args.append(TStr(category="pip", value=first_word)) + elif first_word == "--conda-pkg": + # Special extension to allow non-python conda package specification + split_res = REQ_SPLIT_LINE.match(splits[1]) + if split_res is None: + raise InvalidEnvironmentException( + "Could not parse conda package '%s'" % splits[1] + ) + s = split_res.groups() + if s[1] is None: + np_deps[s[0].replace(" ", "")] = "" + else: + np_deps[s[0].replace(" ", "")] = s[1].replace(" ", "").lstrip("=") elif first_word.startswith("#"): continue elif first_word.startswith("-"): @@ -855,11 +880,16 @@ def _parse_req_file( first_word = rem[1:].strip() deps[first_word] = "" else: - splits = line.split("=", 1) - if len(splits) == 1: - deps[line.strip()] = "" + split_res = REQ_SPLIT_LINE.match(line) + if split_res is None: + raise InvalidEnvironmentException("Could not parse '%s'" % line) + splits = split_res.groups() + if splits[1] is None: + deps[splits[0].replace(" ", "")] = "" else: - deps[splits[0].strip()] = splits[1].lstrip(" =").rstrip() + deps[splits[0].replace(" ", "")] = ( + splits[1].replace(" ", "").lstrip("=") + ) def _parse_yml_file( diff --git a/metaflow_extensions/netflix_ext/plugins/conda/conda.py b/metaflow_extensions/netflix_ext/plugins/conda/conda.py index e4ad090..5c43c8a 100644 --- a/metaflow_extensions/netflix_ext/plugins/conda/conda.py +++ b/metaflow_extensions/netflix_ext/plugins/conda/conda.py @@ -4,6 +4,7 @@ import json import os import platform +import re import requests import shutil import socket @@ -256,8 +257,31 @@ def resolve( ) else: # Pip only mode + # In this mode, we also allow (as a workaround for poor support for + # more advanced options in conda-lock (like git repo, local support, + # etc)) the inclusion of conda packages that are *not* python packages. + # To ensure this, we check the npconda packages, create an environment + # for it and check if that environment doesn't contain python deps. + # If that is the case, we then create the actual environment including + # both conda and npconda packages and re-resolve. We could maybe + # optimize to not resolve from scratch twice but given this is a rare + # situation and the cost is only during resolution, it doesn't seem + # worth it. + npconda_deps = [d for d in deps if d.category == "npconda"] + if npconda_deps: + npconda_pkgs = self._resolve_env_with_conda( + npconda_deps, sources, architecture + ) + if any((p.filename.startswith("python-") for p in npconda_pkgs)): + raise InvalidEnvironmentException( + "Cannot specify a non-python Conda dependency that uses " + "python: %s. Please use the mixed mode instead." + % ", ".join([d.value for d in npconda_deps]) + ) packages = self._resolve_env_with_conda(deps, sources, architecture) - conda_only_deps = [d for d in deps if d.category == "conda"] + conda_only_deps = [ + d for d in deps if d.category in ("conda", "npconda") + ] conda_only_sources = [s for s in sources if s.category == "conda"] builder_resolved_env = ResolvedEnvironment( conda_only_deps, @@ -1503,7 +1527,7 @@ def _resolve_env_with_micromamba_server( self, deps: Sequence[TStr], channels: Sequence[TStr], architecture: str ) -> List[PackageSpecification]: - deps = [d for d in deps if d.category == "conda"] + deps = [d for d in deps if d.category in ("conda", "npconda")] if not self._have_micromamba_server: raise CondaException( @@ -1513,7 +1537,8 @@ def _resolve_env_with_micromamba_server( self._start_micromamba_server() # Form the payload to send to the server req = { - "specs": [d.value for d in deps if d.category == "conda"] + ["pip"], + "specs": [d.value for d in deps if d.category in ("conda", "npconda")] + + ["pip"], "platform": architecture, "channels": [c.value for c in channels if c.category == "conda"], } @@ -1560,7 +1585,7 @@ def _resolve_env_with_conda( architecture: str, ) -> List[PackageSpecification]: - deps = [d for d in deps if d.category == "conda"] + deps = [d for d in deps if d.category in ("conda", "npconda")] result = [] with tempfile.TemporaryDirectory() as mamba_dir: @@ -1578,7 +1603,11 @@ def _resolve_env_with_conda( if not have_channels: have_channels = any( - ["::" in d.value for d in deps if d.category == "conda"] + [ + "::" in d.value + for d in deps + if d.category in ("conda", "npconda") + ] ) if have_channels: # Add no-channel-priority because otherwise if a version @@ -1702,9 +1731,25 @@ def _resolve_env_with_pip( args.extend(["--extra-index-url", c]) supported_tags = pip_tags_from_arch(python_version, architecture) - if python_version != platform.python_version() or architecture != arch_id(): - clean_version = ".".join(python_version.split(".")[:2]) - args.extend(["--only-binary=:all:", "--python-version", clean_version]) + pip_python_version = self._call_binary(["-V"], binary="pip").decode( + encoding="utf-8" + ) + matched_version = re.search( + r"\(python ([23])\.([0-9]+)\)", pip_python_version + ) + if not matched_version: + raise InvalidEnvironmentException( + "Cannot identify Python for PIP; got %s" % pip_python_version + ) + clean_python_version = ".".join(python_version.split(".")[:2]) + if ( + clean_python_version + != ".".join([matched_version.group(1), matched_version.group(2)]) + or architecture != arch_id() + ): + args.extend( + ["--only-binary=:all:", "--python-version", clean_python_version] + ) if architecture != arch_id(): implementations = [] # type: List[str] @@ -2129,7 +2174,7 @@ def _resolve_env_with_conda_lock( ) -> List[PackageSpecification]: outfile_name = None my_arch = arch_id() - if any([d.category not in ("pip", "conda") for d in deps]): + if any([d.category not in ("pip", "conda", "npconda") for d in deps]): raise CondaException( "Cannot resolve dependencies that include non-Conda/Pip dependencies: %s" % "; ".join(map(str, deps)) @@ -2172,7 +2217,9 @@ def _poetry_exec(cmd: str, *args: str): ) ) - pip_channels = ([CONDA_DEFAULT_PIP_SOURCE] if CONDA_DEFAULT_PIP_SOURCE else []) + [ + pip_channels = ( + [CONDA_DEFAULT_PIP_SOURCE] if CONDA_DEFAULT_PIP_SOURCE else [] + ) + [ c.value for c in channels if c.category == "pip" ] # type: List[str] try: @@ -2182,7 +2229,9 @@ def _poetry_exec(cmd: str, *args: str): # library to avoid adding another dep pip_deps = [d.value for d in deps if d.category == "pip"] - conda_deps = [d.value for d in deps if d.category == "conda"] + ["pip"] + conda_deps = [ + d.value for d in deps if d.category in ("conda", "npconda") + ] + ["pip"] # Add channels lines = ["channels:\n"] lines.extend( diff --git a/metaflow_extensions/netflix_ext/plugins/conda/conda_environment.py b/metaflow_extensions/netflix_ext/plugins/conda/conda_environment.py index 64387ee..c5c4e87 100644 --- a/metaflow_extensions/netflix_ext/plugins/conda/conda_environment.py +++ b/metaflow_extensions/netflix_ext/plugins/conda/conda_environment.py @@ -226,5 +226,5 @@ def get_client_info( def get_package_commands(self, code_package_url: str, datastore_type: str): return self.base_env.get_package_commands(code_package_url, datastore_type) - def get_environment_info(self): - return self.base_env.get_environment_info() + def get_environment_info(self, include_ext_info=False): + return self.base_env.get_environment_info(include_ext_info) diff --git a/metaflow_extensions/netflix_ext/plugins/conda/conda_step_decorator.py b/metaflow_extensions/netflix_ext/plugins/conda/conda_step_decorator.py index eeea0ff..f0527eb 100644 --- a/metaflow_extensions/netflix_ext/plugins/conda/conda_step_decorator.py +++ b/metaflow_extensions/netflix_ext/plugins/conda/conda_step_decorator.py @@ -177,6 +177,10 @@ def step_deps(self) -> Sequence[TStr]: TStr("conda", "%s==%s" % (name, ver) if ver else name) for name, ver in self._conda_deps().items() ) + deps.extend( + TStr("npconda", "%s==%s" % (name, ver) if ver else name) + for name, ver in self._np_conda_deps().items() + ) # If we have an empty version, we consider that the name is a direct # link to a package like a URL deps.extend( @@ -257,6 +261,7 @@ def step_init( # Information about the environment this environment is built from self._from_env = None # type: Optional[ResolvedEnvironment] self._from_env_conda_deps = None # type: Optional[Dict[str, str]] + self._from_env_np_conda_deps = None # type: Optional[Dict[str, str]] self._from_env_conda_channels = None # type: Optional[List[str]] self._from_env_pip_deps = None # type: Optional[Dict[str, str]] self._from_env_pip_sources = None # type: Optional[List[str]] @@ -297,7 +302,9 @@ def runtime_init(self, flow: FlowSpec, graph: FlowGraph, package: Any, run_id: s with open( os.path.join(self._metaflow_home, "INFO"), mode="wt", encoding="utf-8" ) as f: - f.write(json.dumps(self._env.get_environment_info())) + f.write( + json.dumps(self._env.get_environment_info(include_ext_info=True)) + ) # Do the same for EXT_PKG try: @@ -471,6 +478,7 @@ def _compute_from_env(self): # We either compute all or nothing so it means we computed none self._from_env_conda_deps = {} self._from_env_pip_deps = {} + self._from_env_np_conda_deps = {} self._from_env_conda_channels = [] self._from_env_pip_sources = [] @@ -484,6 +492,8 @@ def _compute_from_env(self): self._from_env_pip_deps[vals[0]] = vals[1] elif d.category == "conda": self._from_env_conda_deps[vals[0]] = vals[1] + elif d.category == "npconda": + self._from_env_np_conda_deps[vals[0]] = vals[1] # Now of channels/sources all_sources = base.sources @@ -498,6 +508,10 @@ def _from_conda_deps(self) -> Optional[Dict[str, str]]: self._compute_from_env() return self._from_env_conda_deps + def _from_np_conda_deps(self) -> Optional[Dict[str, str]]: + self._compute_from_env() + return self._from_env_np_conda_deps + def _from_pip_deps(self) -> Optional[Dict[str, str]]: self._compute_from_env() return self._from_env_pip_deps @@ -517,6 +531,11 @@ def _from_env_python(self) -> Optional[str]: return conda_deps["python"] return None + def _np_conda_deps(self) -> Dict[str, str]: + if self.from_env: + return dict(cast(Dict[str, str], self._from_np_conda_deps())) + return {} + def _conda_deps(self) -> Dict[str, str]: deps = get_pinned_conda_libs(self._python_version(), self._flow_datastore_type) diff --git a/setup.py b/setup.py index 21845ca..2959d9c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_namespace_packages -version = "0.1.0rc5" +version = "0.1.0" setup( name="metaflow-netflixext", @@ -35,5 +35,5 @@ "metaflow_extensions.netflix_ext.plugins.conda.resources": ["*.png", "*.svg"] }, python_requires=">=3.7.2", - install_requires=["metaflow>=2.7.22"], + install_requires=["metaflow>=2.8.3"], )