diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py
index b15389402b..cf8d50f110 100644
--- a/cumulusci/tasks/bulkdata/select_utils.py
+++ b/cumulusci/tasks/bulkdata/select_utils.py
@@ -1,22 +1,37 @@
+import logging
 import random
 import re
 import typing as T
 from enum import Enum
 
-import numpy as np
-import pandas as pd
-from annoy import AnnoyIndex
 from pydantic import Field, root_validator, validator
-from sklearn.feature_extraction.text import HashingVectorizer
-from sklearn.preprocessing import StandardScaler
 
 from cumulusci.core.enums import StrEnum
 from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
     DEFAULT_DECLARATIONS,
 )
 from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
+from cumulusci.utils import get_cci_upgrade_command
 from cumulusci.utils.yaml.model_parser import CCIDictModel
 
+logger = logging.getLogger(__name__)
+try:
+    import numpy as np
+    import pandas as pd
+    from annoy import AnnoyIndex
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.preprocessing import StandardScaler
+
+    OPTIONAL_DEPENDENCIES_AVAILABLE = True
+except ImportError:
+    logger.warning(
+        f"Optional dependencies are missing. "
+        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
+        "as optimizations for this feature are currently disabled. "
+        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
+    )
+    OPTIONAL_DEPENDENCIES_AVAILABLE = False
+
 
 class SelectStrategy(StrEnum):
     """Enum defining the different selection strategies requested."""
@@ -308,7 +323,7 @@ def similarity_post_process(
     select_records = []
     insert_records = []
 
-    if complexity_constant < 1000:
+    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
         select_records, insert_records = levenshtein_post_process(
             load_records, query_records, fields, weights, threshold
         )
diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
index 447cdccef6..589f66806a 100644
--- a/cumulusci/tasks/bulkdata/tests/test_select_utils.py
+++ b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
@@ -1,7 +1,7 @@
-import pandas as pd
 import pytest
 
 from cumulusci.tasks.bulkdata.select_utils import (
+    OPTIONAL_DEPENDENCIES_AVAILABLE,
     SelectOperationExecutor,
     SelectStrategy,
     add_limit_offset_to_user_filter,
@@ -15,6 +15,14 @@
     vectorize_records,
 )
 
+# Check for pandas availability
+try:
+    import pandas as pd
+
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
+
 
 # Test Cases for standard_generate_query
 def test_standard_generate_query_with_default_record_declaration():
@@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
     assert "Records must be same size as fields (weights)." in str(e.value)
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_numeric_columns():
     df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
     df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
@@ -526,6 +538,10 @@ def test_all_numeric_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_numeric_columns__one_non_numeric():
     df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
     df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
@@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_boolean_columns():
     df_db = pd.DataFrame(
         {"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
@@ -560,6 +580,10 @@ def test_all_boolean_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_categorical_columns():
     df_db = pd.DataFrame(
         {"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
@@ -579,6 +603,10 @@ def test_all_categorical_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_mixed_types():
     df_db = pd.DataFrame(
         {
@@ -606,6 +634,10 @@ def test_mixed_types():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_vectorize_records_mixed_numerical_boolean_categorical():
     # Test data with mixed types: numerical and categorical only
     db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
@@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
     ), "Query vectors column count mismatch"
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -659,6 +695,10 @@ def test_annoy_post_process():
     assert not insert_records
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records_with_polymorphic_fields():
     # Test data
     load_records = [
@@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_single_record_match_annoy_post_process():
     # Mock data where only the first query record matches the first load record
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
diff --git a/pyproject.toml b/pyproject.toml
index 7dec9eedab..d840b1eb9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,6 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-    "annoy",
     "click>=8.1",
     "cryptography",
     "python-dateutil",
@@ -35,8 +34,6 @@ dependencies = [
     "defusedxml",
     "lxml",
     "MarkupSafe",
-    "numpy",
-    "pandas",
     "psutil",
     "pydantic<2",
     "PyJWT",
@@ -53,7 +50,6 @@ dependencies = [
     "rst2ansi>=0.1.5",
     "salesforce-bulk",
     "sarge",
-    "scikit-learn",
     "selenium<4",
     "simple-salesforce==1.11.4",
     "snowfakery>=4.0.0",
@@ -88,6 +84,14 @@ lint = [
     "pre-commit>=3.5.0",
 ]
 
+[project.optional-dependencies]
+select = [
+    "annoy",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+]
+
 [project.scripts]
 cci = "cumulusci.cli.cci:main"
 snowfakery = "snowfakery.cli:main"
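
For reference, a minimal sketch of the guarded-import pattern this diff introduces: probe for the optional libraries once at import time, record the result in a module-level flag, and fall back to the pure-Python path when they are absent. The helper `choose_post_process` below is an illustrative assumption, not a CumulusCI API; it only mirrors the condition added in `similarity_post_process`.

```python
# Illustrative sketch only -- not CumulusCI code.
import logging

logger = logging.getLogger(__name__)

try:
    # Only importable when the optional "select" extra is installed.
    from annoy import AnnoyIndex  # noqa: F401

    OPTIONAL_DEPENDENCIES_AVAILABLE = True
except ImportError:
    logger.warning(
        "Optional 'select' dependencies are missing; falling back to the slower path."
    )
    OPTIONAL_DEPENDENCIES_AVAILABLE = False


def choose_post_process(complexity_constant: int) -> str:
    """Pick a matching strategy the same way the patched condition does."""
    # Small workloads, or missing optional deps, use the Levenshtein fallback.
    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
        return "levenshtein"
    # Large workloads with the extras installed take the annoy-based vectorized path.
    return "annoy"
```

With the new `[project.optional-dependencies]` table, the optimized path is enabled by installing the extra (e.g. `pip install "cumulusci[select]"`); at runtime the warning points users at the equivalent upgrade command via `get_cci_upgrade_command()`.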