diff --git a/matchcode-toolkit/CHANGELOG.rst b/matchcode-toolkit/CHANGELOG.rst index 541ddfa8..c0a0e6b3 100644 --- a/matchcode-toolkit/CHANGELOG.rst +++ b/matchcode-toolkit/CHANGELOG.rst @@ -1,6 +1,12 @@ Changelog ========= +v1.1.3 +------ + +*2023-08-31* -- Do not fingerprint empty directories. +*2023-08-31* -- Track fingerprints to ignore in ``matchcode_toolkit.fingerprinting.IGNORED_DIRECTORY_FINGERPRINTS``. + v1.1.2 ------ diff --git a/matchcode-toolkit/pyproject.toml b/matchcode-toolkit/pyproject.toml index b64839f2..d9e6da52 100644 --- a/matchcode-toolkit/pyproject.toml +++ b/matchcode-toolkit/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "matchcode-toolkit" -version = "1.1.1" +version = "1.1.3" [build-system] requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"] diff --git a/matchcode-toolkit/setup.cfg b/matchcode-toolkit/setup.cfg index 4cfec167..34045a93 100644 --- a/matchcode-toolkit/setup.cfg +++ b/matchcode-toolkit/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = matchcode-toolkit -version = 1.1.2 +version = 1.1.3 license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py index 943e6f97..69217b45 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py +++ b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py @@ -12,6 +12,14 @@ from matchcode_toolkit.halohash import BitAverageHaloHash +# A collection of directory fingerprints that we want to avoid +IGNORED_DIRECTORY_FINGERPRINTS = [ + # This is both the directory content and directory structure fingerprint for + # an empty directory. + '0000000000000000000000000000000000000000', +] + + def _create_directory_fingerprint(inputs): """ Return a 128-bit BitAverageHaloHash fingerprint in hex from `inputs` @@ -75,7 +83,7 @@ def _compute_directory_fingerprints(directory, codebase): """ # We do not want to add empty files to our fingerprint children = [r for r in directory.walk(codebase) if r.is_file and r.size] - if len(children) == 1: + if len(children) <= 1: return directory_content_fingerprint = create_content_fingerprint(children) diff --git a/matchcode-toolkit/tests/test_fingerprinting.py b/matchcode-toolkit/tests/test_fingerprinting.py index 5b37de8f..b1f9b920 100644 --- a/matchcode-toolkit/tests/test_fingerprinting.py +++ b/matchcode-toolkit/tests/test_fingerprinting.py @@ -105,3 +105,22 @@ def test_compute_codebase_directory_fingerprints(self): expected_directory_structure = '000000034f9bf110673bdf06197cd514a799a66c' self.assertEqual(expected_directory_content, directory_content) self.assertEqual(expected_directory_structure, directory_structure) + + def test_do_not_compute_fingerprint_for_empty_dirs(self): + scan_loc = self.get_test_loc('test.json') + vc = VirtualCodebase(location=scan_loc) + vc = compute_codebase_directory_fingerprints(vc) + directory_content = vc.root.extra_data['directory_content'] + directory_structure = vc.root.extra_data['directory_structure'] + expected_directory_content = '000000032a5fa8d01922536b53e8fc6e3d43766f' + expected_directory_structure = '000000030a399ce2b947a6f611821965a4fcc577' + self.assertEqual(expected_directory_content, directory_content) + self.assertEqual(expected_directory_structure, directory_structure) + # These directories should not have fingerprints generated or stored in + # extra_data + empty_dir_1 = vc.get_resource('test/test') + empty_dir_2 = vc.get_resource('test/test/test2') + self.assertEqual({}, empty_dir_1.extra_data) + self.assertEqual({}, empty_dir_1.extra_data) + self.assertEqual({}, empty_dir_2.extra_data) + self.assertEqual({}, empty_dir_2.extra_data) diff --git a/matchcode-toolkit/tests/testfiles/fingerprinting/test.json b/matchcode-toolkit/tests/testfiles/fingerprinting/test.json new file mode 100644 index 00000000..5f6a9930 --- /dev/null +++ b/matchcode-toolkit/tests/testfiles/fingerprinting/test.json @@ -0,0 +1,211 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "32.0.6", + "options": { + "input": [ + "/home/jono/Desktop/test" + ], + "--info": true, + "--json-pp": "/home/jono/test.json" + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "start_timestamp": "2023-08-31T215533.874398", + "end_timestamp": "2023-08-31T215533.977407", + "output_format_version": "3.0.0", + "duration": 0.10302162170410156, + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-5.4.0-150-generic-x86_64-with-glibc2.27", + "platform_version": "#167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023", + "python_version": "3.10.8 (main, Nov 20 2022, 18:43:48) [GCC 7.5.0]" + }, + "spdx_license_list_version": "3.21", + "files_count": 3 + } + } + ], + "files": [ + { + "path": "test", + "type": "directory", + "name": "test", + "base_name": "test", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 3, + "dirs_count": 3, + "size_count": 55, + "scan_errors": [] + }, + { + "path": "test/package.json", + "type": "file", + "name": "package.json", + "base_name": "package", + "extension": ".json", + "size": 3, + "date": "2023-08-31", + "sha1": "f10e2821bbbea527ea02200352313bc059445190", + "md5": "7815696ecbf1c96e6894b779456d330e", + "sha256": "688787d8ff144c502c7f5cffaafe2cc588d86079f9de88304c26b0cb99ce91c6", + "mime_type": "text/plain", + "file_type": "ASCII text, with no line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/src", + "type": "directory", + "name": "src", + "base_name": "src", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 52, + "scan_errors": [] + }, + { + "path": "test/src/bar.txt", + "type": "file", + "name": "bar.txt", + "base_name": "bar", + "extension": ".txt", + "size": 3, + "date": "2023-08-31", + "sha1": "62cdb7020ff920e5aa642c3d4066950dd1f01f4d", + "md5": "37b51d194a7513e45b56f6524f2d51f2", + "sha256": "fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9", + "mime_type": "text/plain", + "file_type": "ASCII text, with no line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/src/foo.js", + "type": "file", + "name": "foo.js", + "base_name": "foo", + "extension": ".js", + "size": 49, + "date": "2023-07-26", + "sha1": "fef9e8e1746b8f2175b500c57a9c6d250623885b", + "md5": "54149367c4c4523241c945701eee1a02", + "sha256": "8ab7888ffceb5004ff3d14417c71d7b56812e04d0ac86545e1592208c6d56d04", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": "JavaScript", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/test", + "type": "directory", + "name": "test", + "base_name": "test", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 1, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/test/test2", + "type": "directory", + "name": "test2", + "base_name": "test2", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/packagedb/api.py b/packagedb/api.py index 3a2aa981..77bc9d94 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -531,9 +531,11 @@ def filter_by_checksums(self, request, *args, **kwargs): This will return Packages whose sha1 or md5 matches those values. """ data = dict(request.data) + unsupported_fields = [] + supported_fields = ['md5', 'sha1', 'sha256', 'sha512', 'enhance_package_data'] for field, value in data.items(): - if field not in ('md5', 'sha1', 'sha256', 'sha512', 'enhance_package_data'): + if field not in supported_fields: unsupported_fields.append(field) if unsupported_fields: @@ -544,14 +546,21 @@ def filter_by_checksums(self, request, *args, **kwargs): return Response(response_data) enhance_package_data = data.pop('enhance_package_data', False) - q = Q() + if not data: + response_data = { + 'status': 'No values provided' + } + return Response(response_data) + + lookups = Q() for field, value in data.items(): + value = value or [] # We create this intermediate dictionary so we can modify the field # name to have __in at the end d = {f'{field}__in': value} - q |= Q(**d) + lookups |= Q(**d) - qs = Package.objects.filter(q) + qs = Package.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) if enhance_package_data: serialized_package_data = [get_enhanced_package(package=package) for package in paginated_qs] @@ -685,7 +694,7 @@ class PackageSetViewSet(viewsets.ReadOnlyModelViewSet): def get_resolved_purls(packages): """ Take a list of dict containing purl or version-less purl along with vers - and return a list of resolved purls, a list of unsupported purls, and a + and return a list of resolved purls, a list of unsupported purls, and a list of unsupported vers. """ unique_resolved_purls = set() @@ -765,9 +774,9 @@ def get_all_versions(purl: PackageURL): package_name = get_api_package_name(purl) versionAPI = get_version_fetcher(purl) - + if not package_name or not versionAPI: - return + return all_versions = versionAPI().fetch(package_name) versionClass = VERSION_CLASS_BY_PACKAGE_TYPE.get(purl.type) diff --git a/setup.cfg b/setup.cfg index 2a2aa21f..b7e95f8f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = rubymarshal == 1.0.3 scancode-toolkit[full] == 32.0.6 urlpy == 0.5 - matchcode-toolkit == 1.1.1 + matchcode-toolkit >= 1.1.1 univers == 30.10.0 setup_requires = setuptools_scm[toml] >= 4