diff --git a/requirements.txt b/requirements.txt
index c55b1c3..0e68b1f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ scikit-learn == 1.2.2
 seaborn == 0.11.2
 tqdm == 4.64.0
 typer == 0.9.0
-valentine == 0.1.6
\ No newline at end of file
+valentine == 0.1.6
+qpsolvers[open_source_solvers]
\ No newline at end of file
diff --git a/src/feature_discovery/cli.py b/src/feature_discovery/cli.py
index 8e0f9f8..455d75f 100644
--- a/src/feature_discovery/cli.py
+++ b/src/feature_discovery/cli.py
@@ -210,14 +210,15 @@ def ingest_data(
     discover_connections_data_lake: Annotated[
         bool, typer.Option(help="Run dataset discovery to find more connections within the entire data lake")
     ] = False,
+
 ):
     """
     Ingest all dataset from specified "data" folder.
     """
-    ingest_nodes()
+    files = ingest_nodes()
 
     if data_discovery_threshold and discover_connections_data_lake:
-        profile_valentine_all(valentine_threshold=data_discovery_threshold)
+        profile_valentine_all(valentine_threshold=data_discovery_threshold, files=files)
         return
 
     if data_discovery_threshold and not discover_connections_data_lake:
diff --git a/src/feature_discovery/config.py b/src/feature_discovery/config.py
index df8b1e5..0708767 100644
--- a/src/feature_discovery/config.py
+++ b/src/feature_discovery/config.py
@@ -5,6 +5,8 @@
     os.getenv("TFD_ROOT_FOLDER", Path(os.path.abspath(__file__)).parent.parent.parent.resolve())
 ).resolve()
 
+SLASH = Path("/")
+
 CONNECTIONS = "connections.csv"
 
 DATASET_TYPE = "benchmark"
@@ -12,7 +14,7 @@
 DATA = "data"
 DATA_FOLDER = ROOT_FOLDER / DATA / DATASET_TYPE
 # RESULTS_FOLDER = ROOT_FOLDER / "results" / "revision-test"
-RESULTS_FOLDER = ROOT_FOLDER / "results_polars"
+RESULTS_FOLDER = ROOT_FOLDER / "results"
 AUTO_GLUON_FOLDER = ROOT_FOLDER / "AutogluonModels"
 
 ### CREDENTIALS ###
@@ -24,3 +26,5 @@
 
 # NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", DATASET_TYPE)
 NEO4J_DATABASE = "lake"
+# NEO4J_DATABASE = "benchmark"
+
diff --git a/src/feature_discovery/dataset_relation_graph/dataset_discovery.py b/src/feature_discovery/dataset_relation_graph/dataset_discovery.py
index 1695121..7fcabbf 100644
--- a/src/feature_discovery/dataset_relation_graph/dataset_discovery.py
+++ b/src/feature_discovery/dataset_relation_graph/dataset_discovery.py
@@ -6,15 +6,16 @@
 from joblib import Parallel, delayed
 from tqdm import tqdm
 from valentine import valentine_match
-from valentine.algorithms import Coma
+from valentine.algorithms import Coma, Cupid
 
-from feature_discovery.config import DATA_FOLDER, CONNECTIONS
+from feature_discovery.config import DATA_FOLDER, CONNECTIONS, SLASH
 from feature_discovery.graph_processing.neo4j_transactions import merge_nodes_relation_tables
 
 
-def profile_valentine_all(valentine_threshold: float = 0.55):
-    files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
-    files = [f for f in files if CONNECTIONS not in f]
+def profile_valentine_all(valentine_threshold: float = 0.55, files: List[str] = None):
+    if files is None:
+        files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
+        files = [f for f in files if CONNECTIONS not in f]
 
     profile_valentine_logic(files, valentine_threshold)
@@ -30,18 +31,20 @@ def profile_valentine_logic(files: List[str], valentine_threshold: float = 0.55):
     def profile(table_pair):
         (tab1, tab2) = table_pair
-        a_table_path = tab1.partition(f"{DATA_FOLDER}/")[2]
-        b_table_path = tab2.partition(f"{DATA_FOLDER}/")[2]
+        a_table_path = tab1.partition(f"{DATA_FOLDER}{SLASH}")[2]
+        b_table_path = tab2.partition(f"{DATA_FOLDER}{SLASH}")[2]
 
-        a_table_name = a_table_path.split("/")[-1]
-        b_table_name = b_table_path.split("/")[-1]
+        a_table_name = a_table_path.split(f"{SLASH}")[-1]
+        b_table_name = b_table_path.split(f"{SLASH}")[-1]
 
-        print(f"Processing the match between:\n\t{a_table_path}\n\t{b_table_path}")
+        print(f"\nProcessing the match between:\n\t{a_table_path}\n\t{b_table_path}")
 
         df1 = pd.read_csv(tab1, encoding="utf8")
         df2 = pd.read_csv(tab2, encoding="utf8")
-        matches = valentine_match(df1, df2, Coma(strategy="COMA_OPT"))
+
+        matches = valentine_match(df1, df2, Coma(strategy="COMA_OPT"))  # COMA_OPT_INST
 
         for item in matches.items():
+            print(item)
             ((_, col_from), (_, col_to)), similarity = item
             if similarity > valentine_threshold:
                 print(f"Similarity {similarity} between:\n\t{a_table_path} -- {col_from}\n\t{b_table_path} -- {col_to}")
diff --git a/src/feature_discovery/dataset_relation_graph/ingest_data.py b/src/feature_discovery/dataset_relation_graph/ingest_data.py
index 9dd6f09..d6b794c 100644
--- a/src/feature_discovery/dataset_relation_graph/ingest_data.py
+++ b/src/feature_discovery/dataset_relation_graph/ingest_data.py
@@ -1,8 +1,9 @@
 import glob
+from pathlib import Path
 
 import pandas as pd
 
-from feature_discovery.config import CONNECTIONS, DATA_FOLDER
+from feature_discovery.config import CONNECTIONS, DATA_FOLDER, SLASH
 from feature_discovery.dataset_relation_graph.dataset_discovery import profile_valentine_all, profile_valentine_dataset
 from feature_discovery.experiments.dataset_object import Dataset
 from feature_discovery.graph_processing.neo4j_transactions import merge_nodes_relation_tables, create_node
@@ -52,14 +53,17 @@ def ingest_nodes(dataset_folder_name: str = None) -> None:
         files = glob.glob(f"{DATA_FOLDER / dataset_folder_name}/**/*.csv", recursive=True)
     else:
         files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
-
+
     for f in files:
         if "datasets.csv" in f:
            continue
-        table_path = f.partition(f"{DATA_FOLDER}/")[2]
-        table_name = table_path.split("/")[-1]
+        table_path = f.partition(f"{DATA_FOLDER}{SLASH}")[2]
+        table_name = table_path.split(f"{SLASH}")[-1]
+
         create_node(table_path, table_name)
 
+    return files
+
 
 def ingest_data_with_pk_fk(dataset: Dataset, profile_valentine: bool = False, mix_datasets: bool = False):
     mapping = ingest_unprocessed_data(dataset.base_table_label)