diff --git a/conda_docker/cli.py b/conda_docker/cli.py
index 4b0d105..5bf3469 100644
--- a/conda_docker/cli.py
+++ b/conda_docker/cli.py
@@ -2,14 +2,15 @@
 import logging
 import argparse
 
-from conda_docker.conda import (
+from .conda_models import Context
+from .conda import (
     build_docker_environment,
     find_user_conda,
     conda_info,
     find_precs,
     fetch_precs,
 )
-from conda_docker.logging import init_logging
+from .logging import init_logging
 
 
 def cli(args):
@@ -89,6 +90,7 @@ def handle_conda_build(args):
     channels = info.get("channels", [])
     conda_default_channels = info.get("conda_default_channels", [])
     channels_remap = info.get("channels_remap", [])
+    context = Context()
     precs = find_precs(
         user_conda,
         download_dir,
@@ -99,6 +101,7 @@ def handle_conda_build(args):
         prefix=args.prefix,
         package_specs=args.package_specs,
         solver=args.solver,
+        context=context,
     )
     records = fetch_precs(download_dir, precs)
     # now build image
diff --git a/conda_docker/conda.py b/conda_docker/conda.py
index 9b1f86f..cc1d6fe 100644
--- a/conda_docker/conda.py
+++ b/conda_docker/conda.py
@@ -1,5 +1,5 @@
 """Interface for finding, grabbing, and installing conda pakcages into docker image"""
-# Significant portions of this file were originally forked from conda constuctor
+# Significant portions of this file were originally forked from conda & conda constructor
 # (c) 2016 Anaconda, Inc. / https://anaconda.com
 # constructor is distributed under the terms of the BSD 3-clause license.
 import os
@@ -11,32 +11,23 @@
 import tempfile
 import subprocess
 
-from conda.exports import download
+import requests
+from requests import ConnectionError, HTTPError
+from requests.exceptions import (
+    InvalidSchema,
+    SSLError,
+    ProxyError as RequestsProxyError,
+)
 
-try:
-    from conda import __version__ as CONDA_INTERFACE_VERSION
-
-    conda_interface_type = "conda"
-except ImportError:
-    raise RuntimeError(
-        "Conda must be installed for python interpreter\n"
-        f"with sys.prefix: {sys.prefix}"
-    )
-from conda.models.channel import all_channel_urls
-
-try:
-    from conda.models.records import PackageCacheRecord
-except ImportError:
-    from conda.models.package_cache_record import PackageCacheRecord
-from conda.models.dist import Dist
-
-from conda_docker.docker.base import Image
-from conda_docker.registry.client import pull_image
-from conda_docker.utils import timer, md5_files
+from .docker.base import Image
+from .registry.client import pull_image
+from .utils import timer, md5_files
+from .download import download, disable_ssl_verify_warning, join_url
+from .conda_models import Context, all_channel_urls, PackageCacheRecord, Dist
 
 
 LOGGER = logging.getLogger(__name__)
-CONDA_MAJOR_MINOR = tuple(int(x) for x in CONDA_INTERFACE_VERSION.split(".")[:2])
+REPODATA_FN = "repodata.json"
 
 
 def conda_file_filter(trim_static_libs=True, trim_js_maps=True):
@@ -67,42 +58,197 @@ def get_final_url(channels_remap, url):
     return url
 
 
-def get_repodata(url):
-    """Obtain the repodata from a channel URL"""
-    if CONDA_MAJOR_MINOR >= (4, 5):
-        from conda.core.subdir_data import fetch_repodata_remote_request
+def _ensure_text_type(value):
+    if hasattr(value, "decode"):
+        return value.decode("utf-8")
+    return value
 
-        raw_repodata_str = fetch_repodata_remote_request(url, None, None)
-    elif CONDA_MAJOR_MINOR >= (4, 4):
-        from conda.core.repodata import fetch_repodata_remote_request
 
-        raw_repodata_str = fetch_repodata_remote_request(url, None, None)
-    elif CONDA_MAJOR_MINOR >= (4, 3):
-        from conda.core.repodata import fetch_repodata_remote_request
+def _maybe_decompress(filename, 
resp_content): + if filename.endswith(".bz2"): + import bz2 - repodata_obj = fetch_repodata_remote_request(None, url, None, None) - raw_repodata_str = json.dumps(repodata_obj) - else: - raise NotImplementedError( - f"unsupported version of conda: {CONDA_INTERFACE_VERSION}" + resp_content = bz2.decompress(resp_content) + return _ensure_text_type(resp_content).strip() + + +def _add_http_value_to_dict(resp, http_key, d, dict_key): + value = resp.headers.get(http_key) + if value: + d[dict_key] = value + + +def fetch_repodata_remote_request( + url, + etag=None, + mod_stamp=None, + repodata_fn=REPODATA_FN, + ssl_verify=True, + remote_connect_timeout_secs=9.15, + remote_read_timeout_secs=60.0, + proxies=None, + context=None, +): + """Get raw repodata string""" + if not ssl_verify: + disable_ssl_verify_warning() + + headers = {} + if etag: + headers["If-None-Match"] = etag + if mod_stamp: + headers["If-Modified-Since"] = mod_stamp + + headers["Accept-Encoding"] = "gzip, deflate, compress, identity" + headers["Content-Type"] = "application/json" + filename = repodata_fn + + try: + timeout = remote_connect_timeout_secs, remote_read_timeout_secs + resp = requests.get( + join_url(url, filename), headers=headers, proxies=proxies, timeout=timeout ) + if LOGGER.isEnabledFor(logging.DEBUG): + LOGGER.debug(str(resp)[:256]) + resp.raise_for_status() + except RequestsProxyError: + raise + except InvalidSchema as e: + if "SOCKS" in str(e): + message = ( + "Requests has identified that your current working environment is configured " + "to use a SOCKS proxy, but pysocks is not installed. To proceed, remove your " + "proxy configuration, run `conda install pysocks`, and then you can re-enable " + "your proxy configuration." + ) + raise RuntimeError(message) + else: + raise + except (ConnectionError, HTTPError, SSLError) as e: + # status_code might not exist on SSLError + status_code = getattr(e.response, "status_code", None) + if status_code in (403, 404): + if not url.endswith("/noarch"): + LOGGER.info( + "Unable to retrieve repodata (response: %d) for %s", + status_code, + url + "/" + repodata_fn, + ) + return None + else: + if context is None: + context = Context() + if context.allow_non_channel_urls: + LOGGER.warning( + "Unable to retrieve repodata (response: %d) for %s", + status_code, + url + "/" + repodata_fn, + ) + return None + else: + raise + elif status_code == 401: + raise + elif status_code is not None and 500 <= status_code < 600: + help_message = ( + "A remote server error occurred when trying to retrieve this URL. " + "A 500-type error (e.g. 500, 501, 502, 503, etc.) indicates the server failed to " + "fulfill a valid request. The problem may be spurious, and will resolve itself if you " + "try your request again. If the problem persists, consider notifying the maintainer " + "of the remote server." + ) + + else: + if url.startswith("https://repo.anaconda.com/"): + help_message = ( + "An HTTP error occurred when trying to retrieve this URL. " + "HTTP errors are often intermittent, and a simple retry will get you on your way. " + "If your current network has https://www.anaconda.com blocked, please file " + "a support request with your network engineering team. " + f"{url}" + ) + else: + help_message = ( + "An HTTP error occurred when trying to retrieve this URL. " + "HTTP errors are often intermittent, and a simple retry will get you on your way. 
" + f"{url}" + ) + raise HTTPError( + help_message, + join_url(url, filename), + status_code, + getattr(e.response, "reason", None), + getattr(e.response, "elapsed", None), + e.response, + caused_by=e, + ) + + if resp.status_code == 304: + raise RuntimeError("Response 304: Content Unchanged") + + json_str = _maybe_decompress(filename, resp.content) + + saved_fields = {"_url": url} + _add_http_value_to_dict(resp, "Etag", saved_fields, "_etag") + _add_http_value_to_dict(resp, "Last-Modified", saved_fields, "_mod") + _add_http_value_to_dict(resp, "Cache-Control", saved_fields, "_cache_control") + + # add extra values to the raw repodata json + if json_str and json_str != "{}": + raw_repodata_str = "{0}, {1}".format( + json.dumps(saved_fields)[:-1], # remove trailing '}' + json_str[1:], # remove first '{' + ) + else: + raw_repodata_str = _ensure_text_type(json.dumps(saved_fields)) + return raw_repodata_str + + +def get_repodata( + url, + ssl_verify=True, + remote_connect_timeout_secs=9.15, + remote_read_timeout_secs=60.0, + proxies=None, + context=None, +): + """Obtain the repodata from a channel URL""" + if context is None: + context = Context() + raw_repodata_str = fetch_repodata_remote_request( + url, + ssl_verify=ssl_verify, + remote_connect_timeout_secs=remote_connect_timeout_secs, + remote_read_timeout_secs=remote_read_timeout_secs, + proxies=proxies, + context=context, + ) full_repodata = json.loads(raw_repodata_str) return full_repodata def load_repodatas( - download_dir, channels=(), conda_default_channels=(), channels_remap=() + download_dir, + channels=(), + conda_default_channels=(), + channels_remap=(), + context=None, ): """Load all repodatas into a single dict""" + if context is None: + context = Context() cache_dir = os.path.join(download_dir, "cache") os.makedirs(cache_dir, exist_ok=True) remaps = {url["src"].rstrip("/"): url["dest"].rstrip("/") for url in channels_remap} urls = all_channel_urls( - url.rstrip("/") - for url in list(remaps) + list(channels) + list(conda_default_channels) + list( + url.rstrip("/") + for url in list(remaps) + list(channels) + list(conda_default_channels) + ), + context=context, ) - repodatas = {url: get_repodata(url) for url in urls} + repodatas = {url: get_repodata(url, context=context) for url in urls} return repodatas @@ -174,10 +320,13 @@ def precs_from_package_specs( channels=(), conda_default_channels=(), channels_remap=(), + context=None, ): """Get the package records from a list of package names/specs, as you might type them in on the command line. This has to perform a solve. 
""" + if context is None: + context = Context() # perform solve solver_conda = find_solver_conda(solver, user_conda) LOGGER.info("solving conda environment") @@ -197,7 +346,10 @@ def precs_from_package_specs( with timer(LOGGER, "loading repodata"): used_channels = {f"{x['base_url']}/{x['platform']}" for x in listing} repodatas = load_repodatas( - download_dir, channels=used_channels, channels_remap=channels_remap, + download_dir, + channels=used_channels, + channels_remap=channels_remap, + context=context, ) # now, create PackageCacheRecords @@ -235,12 +387,15 @@ def find_precs( channels=(), conda_default_channels=(), channels_remap=(), + context=None, ): if name is not None: precs = precs_from_environment_name(name, download_dir, user_conda) elif prefix is not None: precs = precs_from_environment_prefix(prefix, download_dir, user_conda) elif package_specs is not None: + if context is None: + context = Context() precs = precs_from_package_specs( package_specs, solver, @@ -249,6 +404,7 @@ def find_precs( channels=channels, conda_default_channels=conda_default_channels, channels_remap=channels_remap, + context=context, ) else: raise RuntimeError("could not determine package list") @@ -297,15 +453,9 @@ def fetch_precs(download_dir, precs): LOGGER.info(f"fetching: {prec.fn}") download(prec.url, os.path.join(download_dir, prec.fn)) - if not os.path.isdir(extracted_package_dir): - from conda.gateways.disk.create import extract_tarball - - extract_tarball(package_tarball_full_path, extracted_package_dir) - - repodata_record_path = os.path.join( - extracted_package_dir, "info", "repodata_record.json" - ) - + info_dir = os.path.join(extracted_package_dir, "info") + os.makedirs(info_dir, exist_ok=True) + repodata_record_path = os.path.join(info_dir, "repodata_record.json") with open(repodata_record_path, "w") as fh: json.dump(prec.dump(), fh, indent=2, sort_keys=True, separators=(",", ": ")) diff --git a/conda_docker/conda_models.py b/conda_docker/conda_models.py new file mode 100644 index 0000000..16883e1 --- /dev/null +++ b/conda_docker/conda_models.py @@ -0,0 +1,829 @@ +"""Substitute containers for conda information""" +# Significant portions of this file were originally forked from conda +# (c) 2016 Anaconda, Inc. / https://anaconda.com +# constructor is distributed under the terms of the BSD 3-clause license. 
+import os +import re +import sys +import struct +import platform +from collections import OrderedDict + +from toolz import unique, concat, concatv +from urllib3.util.url import Url + +from .download import ( + join_url, + path_to_url, + split_scheme_auth_token, + urlparse, + split_anaconda_token, +) + + +ON_WIN = bool(sys.platform == "win32") +DEFAULT_CUSTOM_CHANNELS = { + "pkgs/pro": "https://repo.anaconda.com", +} +DEFAULT_CHANNEL_ALIAS = "https://conda.anaconda.org" +DEFAULT_CHANNELS_UNIX = ( + "https://repo.anaconda.com/pkgs/main", + "https://repo.anaconda.com/pkgs/r", +) +DEFAULT_CHANNELS_WIN = ( + "https://repo.anaconda.com/pkgs/main", + "https://repo.anaconda.com/pkgs/r", + "https://repo.anaconda.com/pkgs/msys2", +) +DEFAULT_CHANNELS = DEFAULT_CHANNELS_WIN if ON_WIN else DEFAULT_CHANNELS_UNIX +DEFAULTS_CHANNEL_NAME = "defaults" +UNKNOWN_CHANNEL = "" +_PLATFORM_MAP = { + "linux2": "linux", + "linux": "linux", + "darwin": "osx", + "win32": "win", + "zos": "zos", +} +NON_X86_LINUX_MACHINES = frozenset( + {"armv6l", "armv7l", "aarch64", "ppc64", "ppc64le", "s390x",} +) +KNOWN_SUBDIRS = PLATFORM_DIRECTORIES = ( + "noarch", + "linux-32", + "linux-64", + "linux-aarch64", + "linux-armv6l", + "linux-armv7l", + "linux-ppc64", + "linux-ppc64le", + "linux-s390x", + "osx-64", + "win-32", + "win-64", + "zos-z", +) + + +def path_expand(path): + return os.path.abspath(os.path.expanduser(os.path.expandvars(path))) + + +def conda_in_private_env(): + """Is conda located in its own private environment named '_conda_'""" + envs_dir, env_name = os.path.split(sys.prefix) + return env_name == "_conda_" and os.path.basename(envs_dir) == "envs" + + +class Context: + """Context stub""" + + def __init__( + self, + subdir=None, + default_channels=None, + restore_free_channel=False, + croot="", + bld_path="", + root_prefix="", + force_32bit=False, + channel_alias=DEFAULT_CHANNEL_ALIAS, + custom_channels=None, + migrated_channel_aliases=(), + migrated_custom_channels=None, + allow_non_channel_urls=False, + ): + self._subdir = subdir + self._default_channels = ( + DEFAULT_CHANNELS if default_channels is None else default_channels + ) + self._custom_multichannels = {} + self._croot = croot + self._root_prefix = root_prefix + self.bld_path = bld_path + self.restore_free_channel = restore_free_channel + self.force_32bit = force_32bit + self._channel_alias = channel_alias + self._channel_alias_obj = None + self._custom_channels = ( + DEFAULT_CUSTOM_CHANNELS if custom_channels is None else custom_channels + ) + self._custom_channels_obj = None + self._migrated_channel_aliases = migrated_channel_aliases + self.migrated_custom_channels = ( + {} if migrated_custom_channels is None else migrated_custom_channels + ) + self.allow_non_channel_urls = allow_non_channel_urls + + @property + def subdir(self): + if self._subdir: + return self._subdir + m = platform.machine() + if m in NON_X86_LINUX_MACHINES: + self._subdir = f"linux-{m}" + elif self.platform == "zos": + self._subdir = "zos-z" + else: + self._subdir = f"{self.platform}-{self.bits}" + return self._subdir + + @property + def subdirs(self): + return (self.subdir, "noarch") + + @property + def known_subdirs(self): + return frozenset(concatv(KNOWN_SUBDIRS, self.subdirs)) + + @property + def root_prefix(self): + if self._root_prefix: + return os.path.abspath(os.path.expanduser(self._root_prefix)) + elif conda_in_private_env(): + return os.path.abspath(os.path.join(self.conda_prefix, "..", "..")) + else: + return self.conda_prefix + + @property + def conda_prefix(self): 
+ return os.path.abspath(sys.prefix) + + @property + def conda_build_local_paths(self): + # does file system reads to make sure paths actually exist + return tuple( + unique( + full_path + for full_path in ( + path_expand(d) + for d in ( + self._croot, + self.bld_path, + # self.conda_build.get('root-dir'), # not doing it this way + os.path.join(self.root_prefix, "conda-bld"), + "~/conda-bld", + ) + if d + ) + if os.path.isdir(full_path) + ) + ) + + @property + def conda_build_local_urls(self): + return tuple(map(path_to_url, self.conda_build_local_paths)) + + @property + def custom_multichannels(self): + if self._custom_multichannels: + return self._custom_multichannels + default_channels = list(self._default_channels) + if self.restore_free_channel: + default_channels.insert(1, "https://repo.anaconda.com/pkgs/free") + + reserved_multichannel_urls = OrderedDict( + ( + (DEFAULTS_CHANNEL_NAME, default_channels), + ("local", self.conda_build_local_urls), + ) + ) + reserved_multichannels = OrderedDict( + ( + name, + tuple( + Channel.make_simple_channel(self.channel_alias, url) for url in urls + ), + ) + for name, urls in reserved_multichannel_urls.items() + ) + custom_multichannels = OrderedDict( + ( + name, + tuple( + Channel.make_simple_channel(self.channel_alias, url) for url in urls + ), + ) + for name, urls in self._custom_multichannels.items() + ) + all_multichannels = OrderedDict( + (name, channels) + for name, channels in concat( + map( + list, + ( + custom_multichannels.items(), + reserved_multichannels.items(), # reserved comes last, so reserved overrides custom + ), + ) + ) + ) + self._custom_multichannels = all_multichannels + return all_multichannels + + @property + def custom_channels(self): + if self._custom_channels_obj is not None: + return self._custom_channels_obj + custom_channels = ( + Channel.make_simple_channel(self.channel_alias, url, name) + for name, url in self._custom_channels.items() + ) + channels_from_multichannels = concat( + channel for channel in self.custom_multichannels.values() + ) + all_channels = OrderedDict( + (x.name, x) + for x in ( + ch for ch in concatv(channels_from_multichannels, custom_channels,) + ) + ) + self._custom_channels_obj = all_channels + return self._custom_channels_obj + + @property + def platform(self): + return _PLATFORM_MAP.get(sys.platform, "unknown") + + @property + def bits(self): + if self.force_32bit: + return 32 + else: + return 8 * struct.calcsize("P") + + @property + def channel_alias(self): + if self._channel_alias_obj is not None: + return self._channel_alias_obj + location, scheme, auth, token = split_scheme_auth_token(self._channel_alias) + self._channel_alias_obj = Channel( + scheme=scheme, auth=auth, location=location, token=token + ) + return self._channel_alias_obj + + @property + def migrated_channel_aliases(self): + return tuple( + Channel(scheme=scheme, auth=auth, location=location, token=token) + for location, scheme, auth, token in ( + split_scheme_auth_token(c) for c in self._migrated_channel_aliases + ) + ) + + +RE_HAS_SCHEME = re.compile(r"[a-z][a-z0-9]{0,11}://") + + +def has_scheme(value): + """Returns scheme""" + return RE_HAS_SCHEME.match(value) + + +RE_WIN_PATH_BACKOUT = re.compile(r"(\\(?! 
))") + + +def win_path_backout(path): + """Replace all backslashes except those escaping spaces + if we pass a file url, something like file://\\unc\path\on\win, make sure + we clean that up too + """ + return RE_WIN_PATH_BACKOUT.sub(r"/", path).replace(":////", "://") + + +def _split_platform_re(known_subdirs): + _platform_match_regex = r"/(%s)(?:/|$)" % r"|".join( + r"%s" % d for d in known_subdirs + ) + return re.compile(_platform_match_regex, re.IGNORECASE) + + +def split_platform(known_subdirs, url): + """ + Examples: + >>> from conda.base.constants import KNOWN_SUBDIRS + >>> split_platform(KNOWN_SUBDIRS, "https://1.2.3.4/t/tk-123/linux-ppc64le/path") + (u'https://1.2.3.4/t/tk-123/path', u'linux-ppc64le') + """ + _platform_match = _split_platform_re(known_subdirs).search(url) + platform = _platform_match.groups()[0] if _platform_match else None + cleaned_url = url.replace("/" + platform, "", 1) if platform is not None else url + return cleaned_url.rstrip("/"), platform + + +def strip_pkg_extension(path): + """ + Examples: + >>> strip_pkg_extension("/path/_license-1.1-py27_1.tar.bz2") + ('/path/_license-1.1-py27_1', '.tar.bz2') + >>> strip_pkg_extension("/path/_license-1.1-py27_1.conda") + ('/path/_license-1.1-py27_1', '.conda') + >>> strip_pkg_extension("/path/_license-1.1-py27_1") + ('/path/_license-1.1-py27_1', None) + """ + # NOTE: not using CONDA_TARBALL_EXTENSION_V1 or CONDA_TARBALL_EXTENSION_V2 to comply with + # import rules and to avoid a global lookup. + if path[-6:] == ".conda": + return path[:-6], ".conda" + elif path[-8:] == ".tar.bz2": + return path[:-8], ".tar.bz2" + elif path[-5:] == ".json": + return path[:-5], ".json" + else: + return path, None + + +def split_conda_url_easy_parts(known_subdirs, url): + # scheme, auth, token, platform, package_filename, host, port, path, query + cleaned_url, token = split_anaconda_token(url) + cleaned_url, platform = split_platform(known_subdirs, cleaned_url) + _, ext = strip_pkg_extension(cleaned_url) + cleaned_url, package_filename = ( + cleaned_url.rsplit("/", 1) if ext else (cleaned_url, None) + ) + url_parts = urlparse(cleaned_url) + return ( + url_parts.scheme, + url_parts.auth, + token, + platform, + package_filename, + url_parts.host, + url_parts.port, + url_parts.path, + url_parts.query, + ) + + +def tokenized_startswith(test_iterable, startswith_iterable): + return all(t == sw for t, sw in zip(test_iterable, startswith_iterable)) + + +def tokenized_conda_url_startswith(test_url, startswith_url): + test_url, startswith_url = urlparse(test_url), urlparse(startswith_url) + if test_url.host != startswith_url.host or test_url.port != startswith_url.port: + return False + norm_url_path = lambda url: url.path.strip("/") or "/" + return tokenized_startswith( + norm_url_path(test_url).split("/"), norm_url_path(startswith_url).split("/") + ) + + +def _read_channel_configuration(scheme, host, port, path, context=None): + # return location, name, scheme, auth, token + path = path and path.rstrip("/") + test_url = Url(host=host, port=port, path=path).url + + # Step 1. No path given; channel name is None + if not path: + return ( + Url(host=host, port=port).url.rstrip("/"), + None, + scheme or None, + None, + None, + ) + + # Step 2. 
migrated_custom_channels matches
+    for name, location in sorted(
+        context.migrated_custom_channels.items(), reverse=True, key=lambda x: len(x[0])
+    ):
+        location, _scheme, _auth, _token = split_scheme_auth_token(location)
+        if tokenized_conda_url_startswith(test_url, join_url(location, name)):
+            # translate location to new location, with new credentials
+            subname = test_url.replace(join_url(location, name), "", 1).strip("/")
+            channel_name = join_url(name, subname)
+            channel = context.custom_channels.get(channel_name)
+            if channel is None:
+                # fall back to the channel alias (stand-in for conda's
+                # _get_channel_for_name helper, which is not vendored here)
+                ca = context.channel_alias
+                channel = Channel(
+                    scheme=ca.scheme,
+                    auth=ca.auth,
+                    location=ca.location,
+                    token=ca.token,
+                    name=channel_name,
+                    context=context,
+                )
+            return (
+                channel.location,
+                channel_name,
+                channel.scheme,
+                channel.auth,
+                channel.token,
+            )
+
+    # Step 3. migrated_channel_aliases matches
+    for migrated_alias in context.migrated_channel_aliases:
+        if test_url.startswith(migrated_alias.location):
+            name = test_url.replace(migrated_alias.location, "", 1).strip("/")
+            ca = context.channel_alias
+            return ca.location, name, ca.scheme, ca.auth, ca.token
+
+    # Step 4. custom_channels matches
+    for name, channel in sorted(
+        context.custom_channels.items(), reverse=True, key=lambda x: len(x[0])
+    ):
+        that_test_url = join_url(channel.location, channel.name)
+        if tokenized_startswith(test_url.split("/"), that_test_url.split("/")):
+            subname = test_url.replace(that_test_url, "", 1).strip("/")
+            return (
+                channel.location,
+                join_url(channel.name, subname),
+                scheme,
+                channel.auth,
+                channel.token,
+            )
+
+    # Step 5. channel_alias match
+    ca = context.channel_alias
+    if ca.location and tokenized_startswith(
+        test_url.split("/"), ca.location.split("/")
+    ):
+        name = test_url.replace(ca.location, "", 1).strip("/") or None
+        return ca.location, name, scheme, ca.auth, ca.token
+
+    # Step 6. not-otherwise-specified file://-type urls
+    if host is None:
+        # this should probably only happen with a file:// type url
+        assert port is None
+        location, name = test_url.rsplit("/", 1)
+        if not location:
+            location = "/"
+        _scheme, _auth, _token = "file", None, None
+        return location, name, _scheme, _auth, _token
+
+    # Step 7. fall through to host:port as channel_location and path as channel_name
+    # but bump the first token of paths starting with /conda for compatibility with
+    # Anaconda Enterprise Repository software.
+    bump = None
+    path_parts = path.strip("/").split("/")
+    if path_parts and path_parts[0] == "conda":
+        bump, path = "conda", "/".join(path_parts[1:])
+    return (
+        Url(host=host, port=port, path=bump).url.rstrip("/"),
+        path.strip("/") or None,
+        scheme or None,
+        None,
+        None,
+    )
+
+
+def parse_conda_channel_url(url, context=None):
+    """Parses conda URLs"""
+    if context is None:
+        context = Context()
+    (
+        scheme,
+        auth,
+        token,
+        platform,
+        package_filename,
+        host,
+        port,
+        path,
+        query,
+    ) = split_conda_url_easy_parts(context.known_subdirs, url)
+    # recombine host, port, path to get a channel_name and channel_location
+    (
+        channel_location,
+        channel_name,
+        configured_scheme,
+        configured_auth,
+        configured_token,
+    ) = _read_channel_configuration(scheme, host, port, path, context=context)
+    return Channel(
+        configured_scheme or "https",
+        auth or configured_auth,
+        channel_location,
+        token or configured_token,
+        channel_name,
+        platform,
+        package_filename,
+        context=context,
+    )
+
+
+RE_PATH_MATCH = re.compile(
+    r"\./"  # ./
+    r"|\.\."  # ..
+ r"|~" # ~ + r"|/" # / + r"|[a-zA-Z]:[/\\]" # drive letter, colon, forward or backslash + r"|\\\\" # windows UNC path + r"|//" # windows UNC path +) + + +def is_path(value): + if "://" in value: + return False + return RE_PATH_MATCH.match(value) + + +def is_package_file(path): + """ + Examples: + >>> is_package_file("/path/_license-1.1-py27_1.tar.bz2") + True + >>> is_package_file("/path/_license-1.1-py27_1.conda") + True + >>> is_package_file("/path/_license-1.1-py27_1") + False + """ + # NOTE: not using CONDA_TARBALL_EXTENSION_V1 or CONDA_TARBALL_EXTENSION_V2 to comply with + # import rules and to avoid a global lookup. + return path[-6:] == ".conda" or path[-8:] == ".tar.bz2" + + +class Channel: + """Channel stub""" + + def __init__( + self, + scheme=None, + auth=None, + location=None, + token=None, + name=None, + platform=None, + package_filename=None, + context=None, + ): + self.scheme = scheme + self.auth = auth + self.location = location + self.token = token + self.name = name + self.platform = platform + self.package_filename = package_filename + self.context = context + + @property + def canonical_name(self): + if hasattr(self, "__canonical_name"): + return self.__canonical_name + + context = self.context + for multiname, channels in context.custom_multichannels.items(): + for channel in channels: + if self.name == channel.name: + cn = self.__canonical_name = multiname + return cn + + for that_name in context.custom_channels: + if self.name and tokenized_startswith( + self.name.split("/"), that_name.split("/") + ): + cn = self.__canonical_name = self.name + return cn + + if any( + c.location == self.location + for c in concatv( + (context.channel_alias,), context.migrated_channel_aliases, + ) + ): + cn = self.__canonical_name = self.name + return cn + + # fall back to the equivalent of self.base_url + # re-defining here because base_url for MultiChannel is None + if self.scheme: + cn = self.__canonical_name = "%s://%s" % ( + self.scheme, + join_url(self.location, self.name), + ) + return cn + else: + cn = self.__canonical_name = join_url(self.location, self.name).lstrip("/") + return cn + + def urls(self, with_credentials=False, subdirs=None): + if subdirs is None: + subdirs = self.context.subdirs + + if self.canonical_name == UNKNOWN_CHANNEL: + return Channel(DEFAULTS_CHANNEL_NAME).urls( + with_credentials=with_credentials, subdirs=subdirs + ) + + base = [self.location] + if with_credentials and self.token: + base.extend(["t", self.token]) + base.append(self.name) + base = join_url(*base) + + def _platforms(): + if self.platform: + yield self.platform + if self.platform != "noarch": + yield "noarch" + else: + for subdir in subdirs: + yield subdir + + bases = (join_url(base, p) for p in _platforms()) + + if with_credentials and self.auth: + return ["%s://%s@%s" % (self.scheme, self.auth, b) for b in bases] + else: + return ["%s://%s" % (self.scheme, b) for b in bases] + + @staticmethod + def from_url(url, context=None): + return parse_conda_channel_url(url, context=context) + + @staticmethod + def from_value(value, context=None): + if value in (None, "", "None:///", "None"): + return Channel(name=UNKNOWN_CHANNEL, context=context) + value = str(value) + if has_scheme(value): + if value.startswith("file:"): + value = win_path_backout(value) + return Channel.from_url(value, context=context) + elif is_path(value): + return Channel.from_url(path_to_url(value), context=context) + elif is_package_file(value): + if value.startswith("file:"): + value = win_path_backout(value) + return 
Channel.from_url(value, context=context)
+        else:
+            # at this point assume we don't have a bare (non-scheme) url
+            # e.g. this would be bad: repo.anaconda.com/pkgs/free
+            _stripped, platform = split_platform(context.known_subdirs, value)
+            if _stripped in context.custom_multichannels:
+                return MultiChannel(
+                    _stripped,
+                    context.custom_multichannels[_stripped],
+                    platform,
+                    context=context,
+                )
+            else:
+                # resolve a bare channel name against custom_channels, falling
+                # back to the channel alias (stand-in for conda's
+                # Channel.from_channel_name, which is not vendored here)
+                channel = context.custom_channels.get(_stripped)
+                if channel is not None:
+                    return channel
+                ca = context.channel_alias
+                return Channel(
+                    scheme=ca.scheme,
+                    auth=ca.auth,
+                    location=ca.location,
+                    token=ca.token,
+                    name=_stripped,
+                    platform=platform,
+                    context=context,
+                )
+
+    @staticmethod
+    def make_simple_channel(channel_alias, channel_url, name=None, context=None):
+        ca = channel_alias
+        test_url, scheme, auth, token = split_scheme_auth_token(channel_url)
+        if name and scheme:
+            return Channel(
+                scheme=scheme,
+                auth=auth,
+                location=test_url,
+                token=token,
+                name=name.strip("/"),
+                context=context,
+            )
+        if scheme:
+            if ca.location and test_url.startswith(ca.location):
+                location, name = ca.location, test_url.replace(ca.location, "", 1)
+            else:
+                url_parts = urlparse(test_url)
+                location = Url(host=url_parts.host, port=url_parts.port).url
+                name = url_parts.path or ""
+            return Channel(
+                scheme=scheme,
+                auth=auth,
+                location=location,
+                token=token,
+                name=name.strip("/"),
+                context=context,
+            )
+        else:
+            return Channel(
+                scheme=ca.scheme,
+                auth=ca.auth,
+                location=ca.location,
+                token=ca.token,
+                name=name and name.strip("/") or channel_url.strip("/"),
+                context=context,
+            )
+
+    @property
+    def base_url(self):
+        # needed by MultiChannel.base_urls
+        if self.scheme is None:
+            return None
+        return "%s://%s" % (self.scheme, join_url(self.location, self.name))
+
+    def dump(self):
+        # needed by MultiChannel.__init__
+        return {
+            "scheme": self.scheme,
+            "auth": self.auth,
+            "location": self.location,
+            "token": self.token,
+            "name": self.name,
+            "platform": self.platform,
+            "package_filename": self.package_filename,
+        }
+
+
+class MultiChannel(Channel):
+    def __init__(self, name, channels, platform=None, context=None):
+        self.name = name
+        self.location = None
+
+        if platform:
+            c_dicts = tuple(c.dump() for c in channels)
+            for cd in c_dicts:
+                cd.update(platform=platform)
+            self._channels = tuple(Channel(context=context, **cd) for cd in c_dicts)
+        else:
+            self._channels = channels
+
+        self.scheme = None
+        self.auth = None
+        self.token = None
+        self.platform = platform
+        self.package_filename = None
+
+    @property
+    def channel_location(self):
+        return self.location
+
+    @property
+    def canonical_name(self):
+        return self.name
+
+    def urls(self, with_credentials=False, subdirs=None):
+        from itertools import chain
+
+        _channels = self._channels
+        return list(
+            chain.from_iterable(c.urls(with_credentials, subdirs) for c in _channels)
+        )
+
+    @property
+    def base_url(self):
+        return None
+
+    @property
+    def base_urls(self):
+        return tuple(c.base_url for c in self._channels)
+
+    def url(self, with_credentials=False):
+        return None
+
+    def dump(self):
+        return {"name": self.name, "channels": tuple(c.dump() for c in self._channels)}
+
+
+def all_channel_urls(channels, subdirs=None, with_credentials=True, context=None):
+    """Finds channel URLs"""
+    result = set()
+    for chn in channels:
+        channel = Channel.from_value(chn, context=context)
+        result.update(channel.urls(with_credentials, subdirs))
+    return result
+
+
+class PackageRecord:
+    """PackageRecord stub"""
+
+    def __init__(
+        self,
+        name=None,
+        version=None,
+        url=None,
+        md5=None,
+        fn=None,
+        base_url=None,
+        build_number=None,
+        build_string=None,
+        channel=None,
+        dist_name=None,
+    ):
+        self.name = name
+        self.version = version
+        self.md5 = md5
+        self.url = url
+        self.fn = fn
+        self.base_url = base_url
+        self.build_number = build_number
+        self.build_string = build_string
+        self.channel = channel
+        self.dist_name = dist_name
+
+    def dump(self):
+        attrs = (
+            "name",
+            "version",
+            "md5",
+            "url",
+            "fn",
+            "base_url",
+            "build_number",
+            "build_string",
+            "channel",
+            "dist_name",
+        )
+        return {k: getattr(self, k, None) for k in attrs}
+
+
+class PackageCacheRecord(PackageRecord):
+    """PackageCacheRecord stub"""
+
+    def __init__(
+        self, package_tarball_full_path=None, extracted_package_dir=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.package_tarball_full_path = package_tarball_full_path
+        self.extracted_package_dir = extracted_package_dir
+
+    @classmethod
+    def from_objects(
+        cls, prec, package_tarball_full_path=None, extracted_package_dir=None
+    ):
+        d = prec.dump()
+        return cls(
+            package_tarball_full_path=package_tarball_full_path,
+            extracted_package_dir=extracted_package_dir,
+            **d,
+        )
+
+
+class Dist:
+    """Distribution stub"""
+
+    def __init__(self, channel, dist_name=None, url=None, base_url=None):
+        self.channel = channel
+        self.dist_name = dist_name
+        self.url = url
+        self.base_url = base_url
+
+    @property
+    def full_name(self):
+        return self.__str__()
+
+    def __str__(self):
+        return f"{self.channel}::{self.dist_name}" if self.channel else self.dist_name
diff --git a/conda_docker/download.py b/conda_docker/download.py
new file mode 100644
index 0000000..9e28890
--- /dev/null
+++ b/conda_docker/download.py
@@ -0,0 +1,393 @@
+"""Tools for downloading URLs"""
+# -*- coding: utf-8 -*-
+# Originally forked from conda
+# Copyright (C) 2012 Anaconda, Inc
+# SPDX-License-Identifier: BSD-3-Clause
+import hashlib
+import logging
+import tempfile
+import warnings
+import sys
+import os
+import re
+import shutil
+import ctypes
+from ctypes.util import find_library
+
+import requests
+from requests import ConnectionError, HTTPError
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from requests.exceptions import (
+    InvalidSchema,
+    SSLError,
+    ProxyError as RequestsProxyError,
+)
+from urllib3.util.url import Url, parse_url
+
+
+LOGGER = logging.getLogger(__name__)
+ON_WIN = bool(sys.platform == "win32")
+file_scheme = "file://"  # URL scheme prefix recognized by path_to_url
+
+
+def join_url(*args):
+    """Joins URL parts into a single string"""
+    start = "/" if not args[0] or args[0].startswith("/") else ""
+    return start + "/".join(y for y in (x.strip("/") for x in args if x) if y)
+
+
+def path_to_url(path):
+    if not path:
+        raise ValueError("Not allowed: %r" % path)
+    if path.startswith(file_scheme):
+        try:
+            path.encode("ascii")
+        except UnicodeEncodeError:
+            raise ValueError(
+                "Non-ascii not allowed for things claiming to be URLs: %r" % path
+            )
+        return path
+    path = os.path.abspath(os.path.expanduser(path)).replace("\\", "/")
+    # We do not use urljoin here because we want to take our own
+    # *very* explicit control of how paths get encoded into URLs.
+    # We should not follow any RFCs on how to encode and decode
+    # them, we just need to make sure we can represent them in a
+    # way that will not cause problems for whatever amount of
+    # urllib processing we *do* need to do on them (which should
+    # be none anyway, but I doubt that is the case). I have gone
+    # for ASCII and % encoding of everything not alphanumeric or
+    # not in `!'()*-._/:`. This should be pretty safe.
+    #
+    # To avoid risking breaking the internet, this code only runs
+    # for `file://` URLs.
+    #
+    percent_encode_chars = "!'()*-._/\\:"
+    percent_encode = lambda s: "".join(
+        ["%%%02X" % ord(c), c][c < "{" and c.isalnum() or c in percent_encode_chars]
+        for c in s
+    )
+    if any(ord(char) >= 128 for char in path):
+        path = percent_encode(
+            path.decode("unicode-escape")
+            if hasattr(path, "decode")
+            else bytes(path, "utf-8").decode("unicode-escape")
+        )
+
+    # https://blogs.msdn.microsoft.com/ie/2006/12/06/file-uris-in-windows/
+    if len(path) > 1 and path[1] == ":":
+        path = file_scheme + "/" + path
+    else:
+        path = file_scheme + path
+    return path
+
+
+def urlparse(url):
+    if ON_WIN and url.startswith("file:"):
+        url = url.replace("\\", "/")
+    return parse_url(url)
+
+
+_ANACONDA_TOKEN_RE = re.compile(r"/t/([a-zA-Z0-9-]*)")
+
+
+def split_anaconda_token(url):
+    """
+    Examples:
+        >>> split_anaconda_token("https://1.2.3.4/t/tk-123-456/path")
+        (u'https://1.2.3.4/path', u'tk-123-456')
+        >>> split_anaconda_token("https://1.2.3.4/t//path")
+        (u'https://1.2.3.4/path', u'')
+        >>> split_anaconda_token("https://some.domain/api/t/tk-123-456/path")
+        (u'https://some.domain/api/path', u'tk-123-456')
+        >>> split_anaconda_token("https://1.2.3.4/conda/t/tk-123-456/path")
+        (u'https://1.2.3.4/conda/path', u'tk-123-456')
+        >>> split_anaconda_token("https://1.2.3.4/path")
+        (u'https://1.2.3.4/path', None)
+        >>> split_anaconda_token("https://10.2.3.4:8080/conda/t/tk-123-45")
+        (u'https://10.2.3.4:8080/conda', u'tk-123-45')
+    """
+    _token_match = _ANACONDA_TOKEN_RE.search(url)
+    token = _token_match.groups()[0] if _token_match else None
+    cleaned_url = url.replace("/t/" + token, "", 1) if token is not None else url
+    return cleaned_url.rstrip("/"), token
+
+
+def split_scheme_auth_token(url):
+    """
+    Examples:
+        >>> split_scheme_auth_token("https://u:p@conda.io/t/x1029384756/more/path")
+        ('conda.io/more/path', 'https', 'u:p', 'x1029384756')
+        >>> split_scheme_auth_token(None)
+        (None, None, None, None)
+    """
+    if not url:
+        return None, None, None, None
+    cleaned_url, token = split_anaconda_token(url)
+    url_parts = urlparse(cleaned_url)
+    remainder_url = Url(
+        host=url_parts.host,
+        port=url_parts.port,
+        path=url_parts.path,
+        query=url_parts.query,
+    ).url
+    return remainder_url, url_parts.scheme, url_parts.auth, token
+
+
+def disable_ssl_verify_warning():
+    """Disables insecure request warnings"""
+    warnings.simplefilter("ignore", InsecureRequestWarning)
+
+
+def preload_openssl():
+    """Because our openssl library lives in Library/bin, and because that may not be on PATH
+    if conda.exe in Scripts is called directly, try this preload to avoid user issues."""
+    libbin_path = os.path.join(sys.prefix, "Library", "bin")
+    libssl_dllname = "libssl"
+    libcrypto_dllname = "libcrypto"
+    libssl_version = "-1_1"
+    libssl_arch = ""
+    if sys.maxsize > 2 ** 32:
+        libssl_arch = "-x64"
+    so_name = libssl_dllname + libssl_version + libssl_arch
+    libssl_path2 = os.path.join(libbin_path, so_name)
+    # if version 1.1 is not found, try to load 1.0
+    if not os.path.exists(libssl_path2 + ".dll"):
+        libssl_version = ""
+        libssl_arch = ""
+        libssl_dllname = "ssleay32"
+        libcrypto_dllname = "libeay32"
+        so_name = libssl_dllname
+        libssl_path2 = os.path.join(libbin_path, so_name)
+    libssl_path = find_library(so_name)
+    if not libssl_path:
+        libssl_path = libssl_path2
+    # the crypto library might exist ...
+    so_name = libcrypto_dllname + libssl_version + libssl_arch
+    libcrypto_path = find_library(so_name)
+    if not libcrypto_path:
+        libcrypto_path = os.path.join(sys.prefix, "Library", "bin", so_name)
+    kernel32 = ctypes.windll.kernel32
+    h_mod = kernel32.GetModuleHandleA(libcrypto_path)
+    if not h_mod:
+        ctypes.WinDLL(libcrypto_path)
+    h_mod = kernel32.GetModuleHandleA(libssl_path)
+    if not h_mod:
+        ctypes.WinDLL(libssl_path)
+
+
+def download(
+    url,
+    target_full_path,
+    md5=None,
+    sha256=None,
+    size=None,
+    progress_update_callback=None,
+    ssl_verify=True,
+    remote_connect_timeout_secs=9.15,
+    remote_read_timeout_secs=60.0,
+    proxies=None,
+):
+    if os.path.exists(target_full_path):
+        raise IOError(f"Target {target_full_path} for {url} already exists")
+    if sys.platform == "win32":
+        preload_openssl()
+    if not ssl_verify:
+        disable_ssl_verify_warning()
+
+    try:
+        timeout = remote_connect_timeout_secs, remote_read_timeout_secs
+        resp = requests.get(url, stream=True, proxies=proxies, timeout=timeout)
+        if LOGGER.isEnabledFor(logging.DEBUG):
+            LOGGER.debug(str(resp)[:256])
+        resp.raise_for_status()
+
+        content_length = int(resp.headers.get("Content-Length", 0))
+
+        # prefer sha256 over md5 when both are available
+        checksum_builder = checksum_type = checksum = None
+        if sha256:
+            checksum_builder = hashlib.new("sha256")
+            checksum_type = "sha256"
+            checksum = sha256
+        elif md5:
+            checksum_builder = hashlib.new("md5")
+            checksum_type = "md5"
+            checksum = md5
+
+        size_builder = 0
+        try:
+            with open(target_full_path, "wb") as fh:
+                streamed_bytes = 0
+                for chunk in resp.iter_content(2 ** 14):
+                    # chunk could be the decompressed form of the real data
+                    # but we want the exact number of bytes read till now
+                    streamed_bytes = resp.raw.tell()
+                    try:
+                        fh.write(chunk)
+                    except IOError as e:
+                        message = (
+                            f"Failed to write to {target_full_path}\n  errno: {e.errno}"
+                        )
+                        raise RuntimeError(message) from e
+
+                    checksum_builder and checksum_builder.update(chunk)
+                    size_builder += len(chunk)
+
+                    if content_length and 0 <= streamed_bytes <= content_length:
+                        if progress_update_callback:
+                            progress_update_callback(streamed_bytes / content_length)
+
+            if content_length and streamed_bytes != content_length:
+                message = (
+                    "Downloaded bytes did not match Content-Length\n"
+                    f"  url: {url}\n"
+                    f"  target_path: {target_full_path}\n"
+                    f"  Content-Length: {content_length}\n"
+                    f"  downloaded bytes: {streamed_bytes}\n"
+                )
+                raise RuntimeError(message)
+
+        except (IOError, OSError) as e:
+            if e.errno == 104:
+                # Connection reset by peer
+                LOGGER.debug("%s, trying again" % e)
+            raise
+
+        if checksum:
+            actual_checksum = checksum_builder.hexdigest()
+            if actual_checksum != checksum:
+                LOGGER.debug(
+                    "%s mismatch for download: %s (%s != %s)",
+                    checksum_type,
+                    url,
+                    actual_checksum,
+                    checksum,
+                )
+                raise RuntimeError(
+                    url, target_full_path, checksum_type, checksum, actual_checksum
+                )
+        if size is not None:
+            actual_size = size_builder
+            if actual_size != size:
+                LOGGER.debug(
+                    "size mismatch for download: %s (%s != %s)", url, actual_size, size
+                )
+                raise RuntimeError(url, target_full_path, "size", size, actual_size)
+
+    except RequestsProxyError:
+        raise
+
+    except InvalidSchema as e:
+        if "SOCKS" in str(e):
+            message = (
+                "Requests has identified that your current working environment is configured "
+                "to use a SOCKS proxy, but pysocks is not installed. To proceed, remove your "
+                "proxy configuration, run 'conda install pysocks', and then you can re-enable "
+                "your proxy configuration."
+            )
+            raise RuntimeError(message) from e
+        else:
+            raise
+
+    except (ConnectionError, HTTPError, SSLError) as e:
+        help_message = (
+            "An HTTP error occurred when trying to retrieve this URL. "
+            "HTTP errors are often intermittent, and a simple retry will get you on your way."
+        )
+        raise RuntimeError(
+            help_message,
+            url,
+            getattr(e.response, "status_code", None),
+            getattr(e.response, "reason", None),
+            getattr(e.response, "elapsed", None),
+        ) from e
+
+
+def download_text(
+    url,
+    ssl_verify=True,
+    remote_connect_timeout_secs=9.15,
+    remote_read_timeout_secs=60.0,
+    proxies=None,
+):
+    if sys.platform == "win32":
+        preload_openssl()
+    if not ssl_verify:
+        disable_ssl_verify_warning()
+    try:
+        timeout = remote_connect_timeout_secs, remote_read_timeout_secs
+        response = requests.get(url, stream=True, proxies=proxies, timeout=timeout)
+        if LOGGER.isEnabledFor(logging.DEBUG):
+            LOGGER.debug(str(response)[:256])
+        response.raise_for_status()
+    except RequestsProxyError:
+        raise
+    except InvalidSchema as e:
+        if "SOCKS" in str(e):
+            message = (
+                "Requests has identified that your current working environment is configured "
+                "to use a SOCKS proxy, but pysocks is not installed. To proceed, remove your "
+                "proxy configuration, run `conda install pysocks`, and then you can re-enable "
+                "your proxy configuration."
+            )
+            raise RuntimeError(message) from e
+        else:
+            raise
+    except (ConnectionError, HTTPError, SSLError) as e:
+        status_code = getattr(e.response, "status_code", None)
+        if status_code == 404:
+            help_message = (
+                "An HTTP error occurred when trying to retrieve this URL. "
+                "The URL does not exist."
+            )
+        else:
+            help_message = (
+                "An HTTP error occurred when trying to retrieve this URL. "
+                "HTTP errors are often intermittent, and a simple retry will get you on your way."
+            )
+        raise RuntimeError(
+            help_message,
+            url,
+            status_code,
+            getattr(e.response, "reason", None),
+            getattr(e.response, "elapsed", None),
+        ) from e
+    return response.text
+
+
+class TmpDownload(object):
+    """
+    Context manager to handle downloads to a tempfile
+    """
+
+    def __init__(self, url, verbose=True):
+        self.url = url
+        self.verbose = verbose
+
+    def __enter__(self):
+        if "://" not in self.url:
+            # if we provide the file itself, no tmp dir is created
+            self.tmp_dir = None
+            return self.url
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+            dst = os.path.join(self.tmp_dir, os.path.basename(self.url))
+            download(self.url, dst)
+            return dst
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.tmp_dir:
+            shutil.rmtree(self.tmp_dir)
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 693465e..4d3efa3 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -23,7 +23,9 @@ requirements:
     - pip
   run:
     - python
-    - conda
+    - requests
+    - toolz
+    - urllib3
     - mamba
     - conda-standalone
     - fakechroot
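
---

Taken together, the vendored helpers replace the previous `conda.*` imports end to end: `Context` stands in for conda's global configuration, `all_channel_urls` expands channels into per-subdir URLs, and `fetch_repodata_remote_request`/`download` do the HTTP work. A rough sketch of how they compose — illustrative only; the channel URL, filename, and md5 below are placeholders, and real callers should keep going through `find_precs`/`fetch_precs`:

```python
"""Illustrative sketch of the vendored helpers (not part of the diff)."""
import json

from conda_docker.conda import fetch_repodata_remote_request
from conda_docker.conda_models import Context, all_channel_urls
from conda_docker.download import download

context = Context()  # stand-in for conda's global context object

# Expand a channel base URL into per-subdir repodata endpoints,
# e.g. .../conda-forge/linux-64 and .../conda-forge/noarch.
urls = all_channel_urls(["https://conda.anaconda.org/conda-forge"], context=context)

for url in sorted(urls):
    raw = fetch_repodata_remote_request(url, context=context)
    if raw is None:  # a 403/404 on a non-noarch subdir is tolerated
        continue
    repodata = json.loads(raw)
    print(url, "->", len(repodata.get("packages", {})), "packages")

# Fetch a single artifact with checksum verification (placeholder name/md5):
# download(
#     "https://conda.anaconda.org/conda-forge/noarch/example-1.0-0.tar.bz2",
#     "/tmp/example-1.0-0.tar.bz2",
#     md5="<expected-md5>",
# )
```

This mirrors what `load_repodatas` and `fetch_precs` do internally, minus the on-disk cache directory and channel remapping.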