From 45755a9f46fdf836cdccea717bab6cec1b53ba0b Mon Sep 17 00:00:00 2001 From: Krushna Date: Wed, 19 Jul 2023 12:03:02 +0530 Subject: [PATCH 1/2] Updated the Spacy library version upper limit to 3.5.0 because of the dagster>=1.3.3 dependency (refer https://github.com/datahub-project/datahub/pull/8384) --- datahub-classify/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-classify/setup.py b/datahub-classify/setup.py index f970942..4788e61 100644 --- a/datahub-classify/setup.py +++ b/datahub-classify/setup.py @@ -18,7 +18,7 @@ def get_long_description(): "schwifty>=2022.9.0", "python-stdnum>=1.17", "ipaddress>=1.0.23", - "spacy>=3.4.1,<=3.4.3", + "spacy>=3.4.1,<=3.5.0", "phonenumbers>=8.12.56,<=8.13.0", } From d9d0d3a2cb7eb8733f85cb5c788ddeb5fb98179e Mon Sep 17 00:00:00 2001 From: Krushna Date: Wed, 19 Jul 2023 15:52:49 +0530 Subject: [PATCH 2/2] Updated the expected results with new spacy version --- .../expected_output/expected_infotypes_UNIT_TESTING.json | 4 ++-- .../expected_output/expected_infotypes_confidence_slabs.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datahub-classify/tests/expected_output/expected_infotypes_UNIT_TESTING.json b/datahub-classify/tests/expected_output/expected_infotypes_UNIT_TESTING.json index 65a4949..36bdc1a 100644 --- a/datahub-classify/tests/expected_output/expected_infotypes_UNIT_TESTING.json +++ b/datahub-classify/tests/expected_output/expected_infotypes_UNIT_TESTING.json @@ -52,7 +52,7 @@ "General Manager": "Full_Name" }, "2018-seattle-business-districts": { - "Address": "Street_Address", + "Address": "no_infotype", "E-mail": "Email_Address" }, "Customer_Segmentation": { @@ -129,7 +129,7 @@ "Full_Name": "Full_Name" }, "1-MB-Test": { - "First and Last Name": "no_infotype", + "First and Last Name": "Full_Name", "SSN" :"US_Social_Security_Number", "Credit Card Number" : "Credit_Debit_Card_Number" }, diff --git a/datahub-classify/tests/expected_output/expected_infotypes_confidence_slabs.json b/datahub-classify/tests/expected_output/expected_infotypes_confidence_slabs.json index 4220bf3..693c5fa 100644 --- a/datahub-classify/tests/expected_output/expected_infotypes_confidence_slabs.json +++ b/datahub-classify/tests/expected_output/expected_infotypes_confidence_slabs.json @@ -54,7 +54,6 @@ "General Manager": 0.6 }, "2018-seattle-business-districts": { - "Address": 0.6, "E-mail": 1.0 }, "Customer_Segmentation": { @@ -129,6 +128,7 @@ "Full_Name": 0.9 }, "1-MB-Test": { + "First and Last Name": 0.7, "SSN": 1.0, "Credit Card Number": 1.0 },