Set ANNOY related dependencies to be optional and skip related pytests

SFDO-Tooling · Dec 13, 2024 · 39261c4 · 39261c4
1 parent 31e711b
commit 39261c4
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 11 deletions.
diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py
@@ -1,22 +1,37 @@
+import logging
 import random
 import re
 import typing as T
 from enum import Enum
 
-import numpy as np
-import pandas as pd
-from annoy import AnnoyIndex
 from pydantic import Field, root_validator, validator
-from sklearn.feature_extraction.text import HashingVectorizer
-from sklearn.preprocessing import StandardScaler
 
 from cumulusci.core.enums import StrEnum
 from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
     DEFAULT_DECLARATIONS,
 )
 from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
+from cumulusci.utils import get_cci_upgrade_command
 from cumulusci.utils.yaml.model_parser import CCIDictModel
 
+logger = logging.getLogger(__name__)
+try:
+    import numpy as np
+    import pandas as pd
+    from annoy import AnnoyIndex
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.preprocessing import StandardScaler
+
+    OPTIONAL_DEPENDENCIES_AVAILABLE = True
+except ImportError:
+    logger.warning(
+        f"Optional dependencies are missing. "
+        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
+        "as optimizations for this feature are currently disabled. "
+        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
+    )
+    OPTIONAL_DEPENDENCIES_AVAILABLE = False
+
 
 class SelectStrategy(StrEnum):
     """Enum defining the different selection strategies requested."""
@@ -308,7 +323,7 @@ def similarity_post_process(
     select_records = []
     insert_records = []
 
-    if complexity_constant < 1000:
+    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
         select_records, insert_records = levenshtein_post_process(
             load_records, query_records, fields, weights, threshold
         )

diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py
@@ -1,7 +1,7 @@
-import pandas as pd
 import pytest
 
 from cumulusci.tasks.bulkdata.select_utils import (
+    OPTIONAL_DEPENDENCIES_AVAILABLE,
     SelectOperationExecutor,
     SelectStrategy,
     add_limit_offset_to_user_filter,
@@ -15,6 +15,14 @@
     vectorize_records,
 )
 
+# Check for pandas availability
+try:
+    import pandas as pd
+
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
+
 
 # Test Cases for standard_generate_query
 def test_standard_generate_query_with_default_record_declaration():
@@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
     assert "Records must be same size as fields (weights)." in str(e.value)
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_numeric_columns():
     df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
     df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
@@ -526,6 +538,10 @@ def test_all_numeric_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_numeric_columns__one_non_numeric():
     df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
     df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
@@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_boolean_columns():
     df_db = pd.DataFrame(
         {"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
@@ -560,6 +580,10 @@ def test_all_boolean_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_all_categorical_columns():
     df_db = pd.DataFrame(
         {"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
@@ -579,6 +603,10 @@ def test_all_categorical_columns():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_mixed_types():
     df_db = pd.DataFrame(
         {
@@ -606,6 +634,10 @@ def test_mixed_types():
     assert determine_field_types(df_db, df_query, weights) == expected_output
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_vectorize_records_mixed_numerical_boolean_categorical():
     # Test data with mixed types: numerical and categorical only
     db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
@@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
     ), "Query vectors column count mismatch"
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -659,6 +695,10 @@ def test_annoy_post_process():
     assert not insert_records
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records():
     # Test data
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
@@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_annoy_post_process__insert_records_with_polymorphic_fields():
     # Test data
     load_records = [
@@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
     ]  # The first insert record should match the second load record
 
 
+@pytest.mark.skipif(
+    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
+    reason="requires optional dependencies for annoy",
+)
 def test_single_record_match_annoy_post_process():
     # Mock data where only the first query record matches the first load record
     load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,6 @@ classifiers = [
     "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-    "annoy",
     "click>=8.1",
     "cryptography",
     "python-dateutil",
@@ -35,8 +34,6 @@ dependencies = [
     "defusedxml",
     "lxml",
     "MarkupSafe",
-    "numpy",
-    "pandas",
     "psutil",
     "pydantic<2",
     "PyJWT",
@@ -53,7 +50,6 @@ dependencies = [
     "rst2ansi>=0.1.5",
     "salesforce-bulk",
     "sarge",
-    "scikit-learn",
     "selenium<4",
     "simple-salesforce==1.11.4",
     "snowfakery>=4.0.0",
@@ -88,6 +84,14 @@ lint = [
     "pre-commit>=3.5.0",
 ]
 
+[project.optional-dependencies]
+select = [
+    "annoy",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+]
+
 [project.scripts]
 cci = "cumulusci.cli.cci:main"
 snowfakery = "snowfakery.cli:main"