Skip to content

Commit

Permalink
Set ANNOY related dependencies to be optional and skip related pytests
Browse files Browse the repository at this point in the history
  • Loading branch information
aditya-balachander committed Dec 13, 2024
1 parent 31e711b commit 39261c4
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 11 deletions.
27 changes: 21 additions & 6 deletions cumulusci/tasks/bulkdata/select_utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import random
import re
import typing as T
from enum import Enum

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from pydantic import Field, root_validator, validator
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import StandardScaler

from cumulusci.core.enums import StrEnum
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
DEFAULT_DECLARATIONS,
)
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
from cumulusci.utils import get_cci_upgrade_command
from cumulusci.utils.yaml.model_parser import CCIDictModel

logger = logging.getLogger(__name__)
# The packages imported below are declared under the optional ``select`` extra
# rather than as hard requirements. Probe for them once at import time and
# record the outcome in OPTIONAL_DEPENDENCIES_AVAILABLE so the rest of the
# module can check the flag instead of re-attempting the imports.
try:
    import numpy as np
    import pandas as pd
    from annoy import AnnoyIndex
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.preprocessing import StandardScaler
except ImportError:
    # At least one optional package is missing: tell the user how to install
    # the extra, then flag the optimized code path as unavailable.
    logger.warning(
        "Optional dependencies are missing. "
        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
        "as optimizations for this feature are currently disabled. "
        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
    )
    OPTIONAL_DEPENDENCIES_AVAILABLE = False
else:
    # Every import succeeded, so the optimized code path can be used.
    OPTIONAL_DEPENDENCIES_AVAILABLE = True


class SelectStrategy(StrEnum):
"""Enum defining the different selection strategies requested."""
Expand Down Expand Up @@ -308,7 +323,7 @@ def similarity_post_process(
select_records = []
insert_records = []

if complexity_constant < 1000:
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
select_records, insert_records = levenshtein_post_process(
load_records, query_records, fields, weights, threshold
)
Expand Down
50 changes: 49 additions & 1 deletion cumulusci/tasks/bulkdata/tests/test_select_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

from cumulusci.tasks.bulkdata.select_utils import (
OPTIONAL_DEPENDENCIES_AVAILABLE,
SelectOperationExecutor,
SelectStrategy,
add_limit_offset_to_user_filter,
Expand All @@ -15,6 +15,14 @@
vectorize_records,
)

# Probe for pandas so tests that build DataFrames can be skipped when it is
# not installed. Only the import itself is guarded; the flag is set in the
# except/else branches so the try body stays minimal.
try:
    import pandas as pd
except ImportError:
    PANDAS_AVAILABLE = False
else:
    PANDAS_AVAILABLE = True


# Test Cases for standard_generate_query
def test_standard_generate_query_with_default_record_declaration():
Expand Down Expand Up @@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
assert "Records must be same size as fields (weights)." in str(e.value)


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_numeric_columns():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
Expand All @@ -526,6 +538,10 @@ def test_all_numeric_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_numeric_columns__one_non_numeric():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
Expand All @@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_boolean_columns():
df_db = pd.DataFrame(
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
Expand All @@ -560,6 +580,10 @@ def test_all_boolean_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_categorical_columns():
df_db = pd.DataFrame(
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
Expand All @@ -579,6 +603,10 @@ def test_all_categorical_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_mixed_types():
df_db = pd.DataFrame(
{
Expand Down Expand Up @@ -606,6 +634,10 @@ def test_mixed_types():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_vectorize_records_mixed_numerical_boolean_categorical():
# Test data with mixed types: numerical and categorical only
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
Expand Down Expand Up @@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
), "Query vectors column count mismatch"


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand All @@ -659,6 +695,10 @@ def test_annoy_post_process():
assert not insert_records


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down Expand Up @@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records_with_polymorphic_fields():
# Test data
load_records = [
Expand Down Expand Up @@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_single_record_match_annoy_post_process():
# Mock data where only the first query record matches the first load record
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down
12 changes: 8 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"annoy",
"click>=8.1",
"cryptography",
"python-dateutil",
Expand All @@ -35,8 +34,6 @@ dependencies = [
"defusedxml",
"lxml",
"MarkupSafe",
"numpy",
"pandas",
"psutil",
"pydantic<2",
"PyJWT",
Expand All @@ -53,7 +50,6 @@ dependencies = [
"rst2ansi>=0.1.5",
"salesforce-bulk",
"sarge",
"scikit-learn",
"selenium<4",
"simple-salesforce==1.11.4",
"snowfakery>=4.0.0",
Expand Down Expand Up @@ -88,6 +84,14 @@ lint = [
"pre-commit>=3.5.0",
]

[project.optional-dependencies]
select = [
"annoy",
"numpy",
"pandas",
"scikit-learn",
]

[project.scripts]
cci = "cumulusci.cli.cci:main"
snowfakery = "snowfakery.cli:main"
Expand Down

0 comments on commit 39261c4

Please sign in to comment.