Skip to content

Commit

Permalink
Merge pull request #180 from nexB/fix-filter-by-checksums
Browse files Browse the repository at this point in the history
Fix filter by checksums
  • Loading branch information
JonoYang authored Aug 31, 2023
2 parents 3b820dd + f814dd8 commit 2e824e1
Show file tree
Hide file tree
Showing 8 changed files with 264 additions and 11 deletions.
6 changes: 6 additions & 0 deletions matchcode-toolkit/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changelog
=========

v1.1.3
------

*2023-08-31* -- Do not fingerprint empty directories.
*2023-08-31* -- Track fingerprints to ignore in ``matchcode_toolkit.fingerprinting.IGNORED_DIRECTORY_FINGERPRINTS``.

v1.1.2
------

Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "matchcode-toolkit"
version = "1.1.1"
version = "1.1.3"

[build-system]
requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"]
Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = matchcode-toolkit
version = 1.1.2
version = 1.1.3
license = Apache-2.0

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
10 changes: 9 additions & 1 deletion matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
from matchcode_toolkit.halohash import BitAverageHaloHash


# Directory fingerprints that carry no useful matching information and that we
# therefore want to ignore.
IGNORED_DIRECTORY_FINGERPRINTS = [
# The all-zero fingerprint: this is both the directory content and the
# directory structure fingerprint for an empty directory.
'0000000000000000000000000000000000000000',
]


def _create_directory_fingerprint(inputs):
"""
Return a 128-bit BitAverageHaloHash fingerprint in hex from `inputs`
Expand Down Expand Up @@ -75,7 +83,7 @@ def _compute_directory_fingerprints(directory, codebase):
"""
# We do not want to add empty files to our fingerprint
children = [r for r in directory.walk(codebase) if r.is_file and r.size]
if len(children) == 1:
if len(children) <= 1:
return

directory_content_fingerprint = create_content_fingerprint(children)
Expand Down
19 changes: 19 additions & 0 deletions matchcode-toolkit/tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,22 @@ def test_compute_codebase_directory_fingerprints(self):
expected_directory_structure = '000000034f9bf110673bdf06197cd514a799a66c'
self.assertEqual(expected_directory_content, directory_content)
self.assertEqual(expected_directory_structure, directory_structure)

def test_do_not_compute_fingerprint_for_empty_dirs(self):
    """
    Check that directories that contain no files do not get directory
    fingerprints, while the codebase root is still fingerprinted.
    """
    scan_loc = self.get_test_loc('test.json')
    vc = VirtualCodebase(location=scan_loc)
    vc = compute_codebase_directory_fingerprints(vc)

    # The root has several non-empty files below it, so it must still be
    # fingerprinted.
    directory_content = vc.root.extra_data['directory_content']
    directory_structure = vc.root.extra_data['directory_structure']
    expected_directory_content = '000000032a5fa8d01922536b53e8fc6e3d43766f'
    expected_directory_structure = '000000030a399ce2b947a6f611821965a4fcc577'
    self.assertEqual(expected_directory_content, directory_content)
    self.assertEqual(expected_directory_structure, directory_structure)

    # These directories have no files in the test scan, so they should not
    # have fingerprints generated or stored in extra_data. A single check
    # of the whole mapping covers both the content and the structure
    # fingerprint entries.
    for empty_dir_path in ('test/test', 'test/test/test2'):
        empty_dir = vc.get_resource(empty_dir_path)
        self.assertEqual({}, empty_dir.extra_data)
211 changes: 211 additions & 0 deletions matchcode-toolkit/tests/testfiles/fingerprinting/test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"headers": [
{
"tool_name": "scancode-toolkit",
"tool_version": "32.0.6",
"options": {
"input": [
"/home/jono/Desktop/test"
],
"--info": true,
"--json-pp": "/home/jono/test.json"
},
"notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
"start_timestamp": "2023-08-31T215533.874398",
"end_timestamp": "2023-08-31T215533.977407",
"output_format_version": "3.0.0",
"duration": 0.10302162170410156,
"message": null,
"errors": [],
"warnings": [],
"extra_data": {
"system_environment": {
"operating_system": "linux",
"cpu_architecture": "64",
"platform": "Linux-5.4.0-150-generic-x86_64-with-glibc2.27",
"platform_version": "#167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023",
"python_version": "3.10.8 (main, Nov 20 2022, 18:43:48) [GCC 7.5.0]"
},
"spdx_license_list_version": "3.21",
"files_count": 3
}
}
],
"files": [
{
"path": "test",
"type": "directory",
"name": "test",
"base_name": "test",
"extension": "",
"size": 0,
"date": null,
"sha1": null,
"md5": null,
"sha256": null,
"mime_type": null,
"file_type": null,
"programming_language": null,
"is_binary": false,
"is_text": false,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 3,
"dirs_count": 3,
"size_count": 55,
"scan_errors": []
},
{
"path": "test/package.json",
"type": "file",
"name": "package.json",
"base_name": "package",
"extension": ".json",
"size": 3,
"date": "2023-08-31",
"sha1": "f10e2821bbbea527ea02200352313bc059445190",
"md5": "7815696ecbf1c96e6894b779456d330e",
"sha256": "688787d8ff144c502c7f5cffaafe2cc588d86079f9de88304c26b0cb99ce91c6",
"mime_type": "text/plain",
"file_type": "ASCII text, with no line terminators",
"programming_language": null,
"is_binary": false,
"is_text": true,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 0,
"dirs_count": 0,
"size_count": 0,
"scan_errors": []
},
{
"path": "test/src",
"type": "directory",
"name": "src",
"base_name": "src",
"extension": "",
"size": 0,
"date": null,
"sha1": null,
"md5": null,
"sha256": null,
"mime_type": null,
"file_type": null,
"programming_language": null,
"is_binary": false,
"is_text": false,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 2,
"dirs_count": 0,
"size_count": 52,
"scan_errors": []
},
{
"path": "test/src/bar.txt",
"type": "file",
"name": "bar.txt",
"base_name": "bar",
"extension": ".txt",
"size": 3,
"date": "2023-08-31",
"sha1": "62cdb7020ff920e5aa642c3d4066950dd1f01f4d",
"md5": "37b51d194a7513e45b56f6524f2d51f2",
"sha256": "fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9",
"mime_type": "text/plain",
"file_type": "ASCII text, with no line terminators",
"programming_language": null,
"is_binary": false,
"is_text": true,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 0,
"dirs_count": 0,
"size_count": 0,
"scan_errors": []
},
{
"path": "test/src/foo.js",
"type": "file",
"name": "foo.js",
"base_name": "foo",
"extension": ".js",
"size": 49,
"date": "2023-07-26",
"sha1": "fef9e8e1746b8f2175b500c57a9c6d250623885b",
"md5": "54149367c4c4523241c945701eee1a02",
"sha256": "8ab7888ffceb5004ff3d14417c71d7b56812e04d0ac86545e1592208c6d56d04",
"mime_type": "text/plain",
"file_type": "ASCII text",
"programming_language": "JavaScript",
"is_binary": false,
"is_text": true,
"is_archive": false,
"is_media": false,
"is_source": true,
"is_script": false,
"files_count": 0,
"dirs_count": 0,
"size_count": 0,
"scan_errors": []
},
{
"path": "test/test",
"type": "directory",
"name": "test",
"base_name": "test",
"extension": "",
"size": 0,
"date": null,
"sha1": null,
"md5": null,
"sha256": null,
"mime_type": null,
"file_type": null,
"programming_language": null,
"is_binary": false,
"is_text": false,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 0,
"dirs_count": 1,
"size_count": 0,
"scan_errors": []
},
{
"path": "test/test/test2",
"type": "directory",
"name": "test2",
"base_name": "test2",
"extension": "",
"size": 0,
"date": null,
"sha1": null,
"md5": null,
"sha256": null,
"mime_type": null,
"file_type": null,
"programming_language": null,
"is_binary": false,
"is_text": false,
"is_archive": false,
"is_media": false,
"is_source": false,
"is_script": false,
"files_count": 0,
"dirs_count": 0,
"size_count": 0,
"scan_errors": []
}
]
}
23 changes: 16 additions & 7 deletions packagedb/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,9 +531,11 @@ def filter_by_checksums(self, request, *args, **kwargs):
This will return Packages whose sha1 or md5 matches those values.
"""
data = dict(request.data)

unsupported_fields = []
supported_fields = ['md5', 'sha1', 'sha256', 'sha512', 'enhance_package_data']
for field, value in data.items():
if field not in ('md5', 'sha1', 'sha256', 'sha512', 'enhance_package_data'):
if field not in supported_fields:
unsupported_fields.append(field)

if unsupported_fields:
Expand All @@ -544,14 +546,21 @@ def filter_by_checksums(self, request, *args, **kwargs):
return Response(response_data)

enhance_package_data = data.pop('enhance_package_data', False)
q = Q()
if not data:
response_data = {
'status': 'No values provided'
}
return Response(response_data)

lookups = Q()
for field, value in data.items():
value = value or []
# We create this intermediate dictionary so we can modify the field
# name to have __in at the end
d = {f'{field}__in': value}
q |= Q(**d)
lookups |= Q(**d)

qs = Package.objects.filter(q)
qs = Package.objects.filter(lookups)
paginated_qs = self.paginate_queryset(qs)
if enhance_package_data:
serialized_package_data = [get_enhanced_package(package=package) for package in paginated_qs]
Expand Down Expand Up @@ -685,7 +694,7 @@ class PackageSetViewSet(viewsets.ReadOnlyModelViewSet):
def get_resolved_purls(packages):
"""
Take a list of dict containing purl or version-less purl along with vers
and return a list of resolved purls, a list of unsupported purls, and a
and return a list of resolved purls, a list of unsupported purls, and a
list of unsupported vers.
"""
unique_resolved_purls = set()
Expand Down Expand Up @@ -765,9 +774,9 @@ def get_all_versions(purl: PackageURL):

package_name = get_api_package_name(purl)
versionAPI = get_version_fetcher(purl)

if not package_name or not versionAPI:
return
return

all_versions = versionAPI().fetch(package_name)
versionClass = VERSION_CLASS_BY_PACKAGE_TYPE.get(purl.type)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ install_requires =
rubymarshal == 1.0.3
scancode-toolkit[full] == 32.0.6
urlpy == 0.5
matchcode-toolkit == 1.1.1
matchcode-toolkit >= 1.1.1
univers == 30.10.0
setup_requires = setuptools_scm[toml] >= 4

Expand Down

0 comments on commit 2e824e1

Please sign in to comment.