-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_packages_to_prefetch.py
executable file
·422 lines (331 loc) · 15.7 KB
/
generate_packages_to_prefetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/usr/bin/python
"""Generate list of packages to be prefetched in Cachi2 and used in Konflux for hermetic build.

usage: generate_packages_to_prefetch.py [-h] [-p] [-c] [-w WORK_DIRECTORY] [-f]

options:
  -h, --help            show this help message and exit
  -p, --process-special-packages
                        Enable or disable processing special packages like torch etc.
  -c, --cleanup         Enable or disable work directory cleanup
  -w WORK_DIRECTORY, --work-directory WORK_DIRECTORY
                        Work directory to store files generated during different stages
                        of processing
  -f, --filter-packages
                        Enable or disable filtering packages not compatible with
                        specified platform

This script performs several steps:
1. removes torch+cpu dependency from project file
2. generates requirements.txt file from pyproject.toml + pdm.lock
3. removes all torch dependencies (including CUDA/Nvidia packages)
4. downloads torch+cpu wheel
5. computes hashes for this wheel
6. adds the URL to wheel + hash to resulting requirements.txt file
7. downloads script pip_find_builddeps from the Cachito project
8. generates requirements-build.in file
9. compiles requirements-build.in file into requirements-build.txt file

Please note that this script depends on a tool that is downloaded from the repository
containing the Cachito system. This tool is run locally without any additional security
checks, so some care is needed (e.g. run this script from within a containerized environment).
"""
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from os.path import join
from urllib.request import urlretrieve
import requests
from packaging import tags
# just these files are needed as project stub, no other configs and/or sources are needed
PROJECT_FILES = ("pyproject.toml", "pdm.lock", "LICENSE", "README.md")
# registry with Torch wheels (CPU variant)
TORCH_REGISTRY = "https://download.pytorch.org/whl/cpu"
# version of Torch whose wheel is downloaded and hashed
TORCH_VERSION = "2.2.2"
# file name of the Torch wheel inside the registry; the name is pinned to the
# cp311-cp311-linux_x86_64 tags, "%2B" is the URL-encoded "+" of the "+cpu" suffix
TORCH_WHEEL = f"torch-{TORCH_VERSION}%2Bcpu-cp311-cp311-linux_x86_64.whl"
# URL to static content of repository containing Cachito system
CACHITO_URL = "https://raw.githubusercontent.com/containerbuildsystem/cachito"
# path to helper script to generate list of packages that will need to be built
# This script is part of Cachito system (predecessor of Cachi2) and has to be used
BUILDDEPS_SCRIPT_PATH = "master/bin" # wokeignore:rule=master
# file name of the helper script that generates list of packages that will need to be built
BUILDDEPS_SCRIPT_NAME = "pip_find_builddeps.py"
# name of standard pip requirement file
REQUIREMENTS_FILE = "requirements.txt"
# name of the requirements file produced when hashes are filtered for the platform
FILTERED_REQUIREMENTS_FILE = "requirements_filtered.txt"
def shell(command, directory):
    """Execute a command through the system shell and return what it wrote to stdout.

    The command runs with ``directory`` as its working directory.  The captured
    standard output is returned as bytes; a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    captured = subprocess.check_output(  # noqa: S602
        command,
        shell=True,
        cwd=directory,
    )
    return captured
def copy_project_stub(directory):
    """Populate the given directory with every file listed in PROJECT_FILES."""
    for stub_name in PROJECT_FILES:
        # source paths are relative, i.e. resolved against the current directory
        shutil.copy(stub_name, directory)
def remove_torch_dependency(directory):
    """Run ``pdm remove torch`` inside the given directory to drop the torch dependency."""
    pdm_command = "pdm remove torch"
    shell(pdm_command, directory)
def generate_requirements_file(work_directory):
    """Run pip-compile in the work directory to get a requirements file with hashes."""
    pip_compile_command = "pip-compile -vv pyproject.toml --generate-hashes"
    shell(pip_compile_command, work_directory)
def remove_package(directory, source, target, package_prefix):
    """Copy a requirements file while dropping packages whose line starts with a prefix.

    ``source`` is read from and ``target`` is written into ``directory``.  A
    line starting with ``package_prefix`` opens a dropped block; the block
    lasts over all following lines that start with a space (hashes etc.) and
    ends at the first line that does not.  Each dropped package line is
    printed.
    """
    skipping = False
    source_path = join(directory, source)
    target_path = join(directory, target)

    with open(source_path) as fin, open(target_path, "w") as fout:
        for line in fin:
            if line.startswith(package_prefix):
                # first line of a package that has to be filtered out
                print(line)
                skipping = True
            elif skipping and not line.startswith(" "):
                # an unindented line ends the block of the dropped package
                skipping = False

            if not skipping:
                fout.write(line)
def remove_unwanted_dependencies(directory):
    """Strip unwanted packages from the requirements file in two chained passes.

    Every pass reads the result of the previous one and writes a new
    intermediate file (``step1.txt``, then ``step2.txt``) into ``directory``.
    """
    filtering_steps = (
        # the torch itself
        (REQUIREMENTS_FILE, "step1.txt", "torch"),
        # all CUDA-related packages (torch depends on them)
        ("step1.txt", "step2.txt", "nvidia"),
    )
    for source, target, prefix in filtering_steps:
        remove_package(directory, source, target, prefix)
def wheel_url(registry, wheel):
    """Return the wheel URL: the registry and the wheel name joined by a slash."""
    return "{}/{}".format(registry, wheel)
def download_wheel(directory, registry, wheel):
    """Fetch the wheel from the registry and store it in directory under the wheel name."""
    destination = join(directory, wheel)
    urlretrieve(wheel_url(registry, wheel), destination)  # noqa: S310
def generate_hash(directory, registry, wheel, target):
    """Write a requirements entry (wheel URL + hash) for the torch wheel into target.

    The hash is obtained by running ``pip hash`` on the wheel inside
    ``directory``; the ``target`` file is created in the same directory.
    """
    pip_output = shell(f"pip hash {wheel}", directory).decode("ascii")
    # the second line of the `pip hash` output is what gets stored with the URL
    hash_line = pip_output.splitlines()[1]
    url = wheel_url(registry, wheel)
    entry_lines = (
        f"torch @ {url} \\\n",
        f" {hash_line}\n",
    )
    with open(join(directory, target), "w") as fout:
        fout.writelines(entry_lines)
def retrieve_supported_tags():
    """Return the set of tag strings reported by ``tags.sys_tags()``, printing each one."""
    environment_tags = set(map(str, tags.sys_tags()))

    # show the tags so that the filtering can be checked in the script output
    print("Supported tags for current environment:")
    for environment_tag in environment_tags:
        print(environment_tag)

    return environment_tags
def filter_hashes(hashes, package_line, supported_tags):
    """Return the first hash that belongs to a file usable in the current environment.

    Every hash is looked up through ``get_package_file_info()``.  A wheel is
    accepted when at least one of the tags derived from its file name is in
    ``supported_tags``; any other file is treated as a source distribution
    and accepted unconditionally.  Hashes without file information are skipped.

    Args:
        hashes: hash lines taken from the requirements file, each expected to
            contain ``sha256:<digest>``.
        package_line: pinned requirement in the ``name==version`` form,
            optionally with extras (``name[extra]==version``) and/or an
            environment marker after ``;``.
        supported_tags: set of tag strings accepted by the environment.

    Returns:
        The sha256 digest (without the ``sha256:`` prefix) of the first
        compatible file, or None when no hash qualifies.

    Raises:
        ValueError: when ``package_line`` is not pinned with ``==``.
    """
    # An environment marker may itself contain "==" (e.g. sys_platform == "win32"),
    # so it has to be cut off before the name and the version are separated.
    requirement = package_line.split(";", 1)[0]
    package_name, separator, package_version = requirement.partition("==")
    if not separator:
        raise ValueError(f"Requirement is not pinned with '==': {package_line}")
    # extras are not part of the project name used in the PyPI URL
    package_name = package_name.split("[", 1)[0].strip()
    package_version = package_version.strip()

    for hash_line in hashes:
        # Extract the hash from the hash line; lines without a digest are ignored
        match = re.search(r"sha256:(\w+)", hash_line)
        if match is None:
            continue
        hash_value = match.group(1)

        # Fetch file metadata from PyPI
        file_info = get_package_file_info(package_name, package_version, hash_value)
        if not file_info:
            continue
        filename = file_info["filename"]

        if not filename.endswith(".whl"):
            # For source distributions (.tar.gz or .zip),
            # assume compatibility with all platforms
            print(f"\nSource distribution {filename} is compatible.")
            return hash_value

        # Extract the tags directly from the filename
        # (e.g., "cp310-cp310-manylinux_2_17_x86_64")
        wheel_tags = extract_tags_from_filename(filename)
        print(f"\nTags for {filename} from PyPI:")
        for tag in wheel_tags:
            print(tag)

        common_tags = wheel_tags & supported_tags
        if common_tags:
            # the best tag is reported only; the hash of this wheel is returned
            best_tag = select_best_tag(common_tags)
            print(f"Best tag selected for {filename}: {best_tag}")
            return hash_value

    return None
def extract_tags_from_filename(filename):
    """Extract the set of compatibility tags encoded in a wheel file name.

    A wheel is named ``name-version[-build]-python-abi-platform.whl``.  Each of
    the last three fields may be a compressed set of dot-separated values; the
    result is the cross product of these values, every item formatted as
    ``python-abi-platform``.

    Example file names:
        aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl
        six-1.16.0-py2.py3-none-any.whl  (gives py2-none-any and py3-none-any)

    Args:
        filename: wheel file name, with or without the ``.whl`` extension.

    Returns:
        Set of tag strings; an empty set (after printing a message) when the
        name has fewer than the five mandatory fields.
    """
    # Remove the .whl extension (only as a suffix, never from the middle of the name)
    extension = ".whl"
    stem = filename[: -len(extension)] if filename.endswith(extension) else filename

    parts = stem.split("-")
    if len(parts) < 5:
        print(f"Unexpected filename format: {filename}")
        return set()

    # The tag fields are always the last three ones; counting from the end keeps
    # the result correct when the optional build tag is present.
    python_tags, abi_tags, platform_tags = (part.split(".") for part in parts[-3:])

    # Expand the compressed tag sets into all combinations
    return {
        f"{python_tag}-{abi_tag}-{platform_tag}"
        for python_tag in python_tags
        for abi_tag in abi_tags
        for platform_tag in platform_tags
    }
def select_best_tag(common_tags):
    """Select the tag with the highest manylinux version from the given tags.

    Tags are ranked by the ``(major, minor)`` version found in
    ``manylinux_<major>_<minor>``.  The legacy aliases ``manylinux2014``,
    ``manylinux2010`` and ``manylinux1`` are ranked as 2.17, 2.12 and 2.5
    respectively; any other tag gets the lowest rank ``(0, 0)``.

    Args:
        common_tags: non-empty iterable of tag strings.

    Returns:
        The tag with the highest rank.

    Raises:
        ValueError: when ``common_tags`` is empty (raised by ``max``).
    """
    # legacy manylinux aliases and the versions they are normalized to
    legacy_aliases = {
        "manylinux2014": (2, 17),
        "manylinux2010": (2, 12),
        "manylinux1": (2, 5),
    }

    def tag_priority(tag):
        """Return the (major, minor) manylinux version of the tag, (0, 0) if unknown."""
        match = re.search(r"manylinux_(\d+)_(\d+)", tag)
        if match:
            return int(match.group(1)), int(match.group(2))
        for alias, version in legacy_aliases.items():
            if alias in tag:
                return version
        return 0, 0  # Default to lowest version if no match

    return max(common_tags, key=tag_priority)
def get_package_file_info(package_name, package_version, hash_value):
    """Look up on PyPI the file of a package release that has the given sha256 digest.

    Returns a dictionary with the single key ``filename`` for the first file
    whose digest equals ``hash_value``.  None is returned when the request
    does not end with the OK status or when no file matches.
    """
    release_url = f"https://pypi.org/pypi/{package_name}/{package_version}/json"
    reply = requests.get(release_url, timeout=60)
    if reply.status_code != requests.codes.ok:
        return None

    matching_files = (
        # only the file name is needed by callers to derive the tags
        {"filename": release_file["filename"]}
        for release_file in reply.json().get("urls", [])
        if release_file["digests"]["sha256"] == hash_value
    )
    return next(matching_files, None)
def append_package(outfile, hashes, package_line, supported_tags):
    """Write the package with its single selected hash into the output file.

    Nothing happens when the package line or the list of hashes is empty.
    When no hash is selected by ``filter_hashes()``, the package is not
    written and a warning is printed instead.
    """
    if not (package_line and hashes):
        return

    chosen_hash = filter_hashes(hashes, package_line, supported_tags)
    if not chosen_hash:
        # If no valid hash was found for this package, issue a warning
        print(f"WARNING: No valid hash found for {package_line}, skipping.")
        return

    outfile.write(f"{package_line} \\\n")
    outfile.write(f" --hash=sha256:{chosen_hash}\n")
def filter_packages_for_platform(input_file: str, output_file: str):
    """Rewrite a requirements file so that each package keeps one compatible hash.

    A line containing ``==`` starts a new package; lines containing
    ``--hash=sha256:`` are collected as hashes of the package read last.  Each
    finished package is handed over to ``append_package()``.  Other lines are
    not copied into the output file.
    """
    # NOTE(review): a requirement line without "==" (for example a direct
    # "name @ url" reference) does not start a new package here, so its hash
    # lines are collected under the preceding package - confirm it is intended.
    supported_tags = retrieve_supported_tags()
    current_package = None
    collected_hashes = []

    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for raw_line in infile:
            if "==" in raw_line:
                # a new package begins: flush the previous one first
                append_package(outfile, collected_hashes, current_package, supported_tags)
                # keep the requirement without the trailing continuation character
                current_package = raw_line.strip().rstrip("\\").strip()
                collected_hashes = []
                continue
            if "--hash=sha256:" in raw_line:
                collected_hashes.append(raw_line.strip())

        # the last package in the file has not been flushed by the loop
        append_package(outfile, collected_hashes, current_package, supported_tags)
def generate_list_of_packages(
    work_directory: str, process_special_packages: bool, filter_packages: bool
):
    """Produce the requirements file in the work directory and copy it to the project.

    With ``process_special_packages`` enabled, torch is removed from the
    project stub before the requirements are generated, torch and nvidia
    packages are filtered out of the generated file and an entry for the
    torch wheel (URL + hash) is appended.  With ``filter_packages`` enabled,
    the file copied back is the one filtered for the current platform.
    """
    copy_project_stub(work_directory)

    if process_special_packages:
        remove_torch_dependency(work_directory)

    generate_requirements_file(work_directory)

    if process_special_packages:
        remove_unwanted_dependencies(work_directory)
        download_wheel(work_directory, TORCH_REGISTRY, TORCH_WHEEL)
        cleaned_requirements = join(work_directory, "step2.txt")
        shutil.copy(cleaned_requirements, REQUIREMENTS_FILE)
        generate_hash(work_directory, TORCH_REGISTRY, TORCH_WHEEL, "hash.txt")
        # requirements without torch/nvidia packages followed by the torch wheel entry
        shell("cat step2.txt hash.txt > " + REQUIREMENTS_FILE, work_directory)

    generated_requirements = join(work_directory, REQUIREMENTS_FILE)

    if not filter_packages:
        # copy the newly generated requirements file back to the project
        shutil.copy(generated_requirements, REQUIREMENTS_FILE)
        return

    filtered_requirements = join(work_directory, FILTERED_REQUIREMENTS_FILE)
    filter_packages_for_platform(generated_requirements, filtered_requirements)
    # copy the newly generated (filtered) requirements file back to the project
    shutil.copy(filtered_requirements, REQUIREMENTS_FILE)
def generate_packages_to_be_build(work_directory):
    """Produce requirements-build.in and requirements-build.txt and copy them to the project.

    The helper script is downloaded into the work directory and run there to
    create the ``.in`` file, which is then compiled by pip-compile into the
    ``.txt`` file.
    """
    # download helper script to generate list of packages
    script_url = "/".join((CACHITO_URL, BUILDDEPS_SCRIPT_PATH, BUILDDEPS_SCRIPT_NAME))
    script_path = join(work_directory, BUILDDEPS_SCRIPT_NAME)
    urlretrieve(script_url, script_path)  # noqa: S310

    build_in = "requirements-build.in"
    build_txt = "requirements-build.txt"

    # generate file requirements-build.in
    find_builddeps_command = (
        "python pip_find_builddeps.py requirements.txt --append "
        f"--only-write-on-update --ignore-errors --allow-binary -o {build_in}"
    )
    shell(find_builddeps_command, work_directory)

    # generate requirements-build.txt file
    compile_command = (
        f"pip-compile {build_in} --allow-unsafe --generate-hashes -o {build_txt}"
    )
    shell(compile_command, work_directory)

    # copy everything back to project
    for generated_file in (build_in, build_txt):
        shutil.copy(join(work_directory, generated_file), generated_file)
def args_parser(args: list[str]) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        args: argument strings to be parsed (typically ``sys.argv[1:]``).

    Returns:
        Namespace with the attributes ``process_special_packages``,
        ``work_directory``, ``cleanup`` and ``filter_packages``.  When no work
        directory is given, a new temporary directory is created and used.
    """
    parser = argparse.ArgumentParser()

    # flag that enables processing special packages ('pytorch' and 'nvidia-' at this moment)
    parser.add_argument(
        "-p",
        "--process-special-packages",
        default=False,
        action="store_true",
        help="Enable or disable processing special packages like torch etc.",
    )

    # work directory
    parser.add_argument(
        "-w",
        "--work-directory",
        default=None,
        type=str,
        help="Work directory to store files generated during different stages of processing",
    )

    # flag that enables cleaning up work directory
    parser.add_argument(
        "-c",
        "--cleanup",
        default=False,
        action="store_true",
        help="Enable or disable work directory cleanup",
    )

    # flag that enables filtering package SHAs that are not compatible with specified platform
    parser.add_argument(
        "-f",
        "--filter-packages",
        default=False,
        action="store_true",
        help="Enable or disable filtering packages not compatible with specified platform",
    )

    # parse the arguments given by the caller instead of the implicit sys.argv
    parsed = parser.parse_args(args)

    if parsed.work_directory is None:
        # the temporary directory is created only when it is really going to be
        # used, so no stray directory is left behind when -w is specified
        parsed.work_directory = tempfile.mkdtemp()

    return parsed
def main() -> None:
    """Run the whole generation: requirements file first, build requirements second."""
    options = args_parser(sys.argv[1:])

    # sanitize work directory: normalize the path as if it was rooted at "/",
    # drop one leading slash again and fall back to the current directory
    normalized = os.path.normpath("/" + options.work_directory)
    work_directory = normalized.removeprefix("/") or "."
    print(f"Work directory {work_directory}")

    generate_list_of_packages(
        work_directory, options.process_special_packages, options.filter_packages
    )
    generate_packages_to_be_build(work_directory)

    # optional cleanup step
    # (for debugging purposes it might be better to see 'steps' files to check
    # if everything's ok)
    if options.cleanup:
        shutil.rmtree(work_directory)


if __name__ == "__main__":
    main()