delftdata · timmy1691 · Mar 16, 2024 · Mar 16, 2024 · Mar 18, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -11,4 +11,5 @@ scikit-learn == 1.2.2
 seaborn == 0.11.2
 tqdm == 4.64.0
 typer == 0.9.0
-valentine == 0.1.6
+valentine == 0.1.6
+qpsolvers[open_source_solvers]
diff --git a/src/feature_discovery/cli.py b/src/feature_discovery/cli.py
@@ -210,14 +210,15 @@ def ingest_data(
     discover_connections_data_lake: Annotated[
         bool, typer.Option(help="Run dataset discovery to find more connections within the entire data lake")
     ] = False,
+
 ):
     """
     Ingest all dataset from specified "data" folder.
     """
-    ingest_nodes()
+    files = ingest_nodes()
 
     if data_discovery_threshold and discover_connections_data_lake:
-        profile_valentine_all(valentine_threshold=data_discovery_threshold)
+        profile_valentine_all(valentine_threshold=data_discovery_threshold, files=files)
         return
 
     if data_discovery_threshold and not discover_connections_data_lake:

diff --git a/src/feature_discovery/config.py b/src/feature_discovery/config.py
@@ -5,14 +5,16 @@
     os.getenv("TFD_ROOT_FOLDER", Path(os.path.abspath(__file__)).parent.parent.parent.resolve())
 ).resolve()
 
+SLASH = Path("/")
+
 CONNECTIONS = "connections.csv"
 
 DATASET_TYPE = "benchmark"
 
 DATA = "data"
 DATA_FOLDER = ROOT_FOLDER / DATA / DATASET_TYPE
 # RESULTS_FOLDER = ROOT_FOLDER / "results" / "revision-test"
-RESULTS_FOLDER = ROOT_FOLDER / "results_polars"
+RESULTS_FOLDER = ROOT_FOLDER / "results"
 AUTO_GLUON_FOLDER = ROOT_FOLDER / "AutogluonModels"
 
 ### CREDENTIALS ###
@@ -24,3 +26,5 @@
 
 # NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", DATASET_TYPE)
 NEO4J_DATABASE = "lake"
+# NEO4J_DATABASE = "benchmark"
+
diff --git a/src/feature_discovery/dataset_relation_graph/dataset_discovery.py b/src/feature_discovery/dataset_relation_graph/dataset_discovery.py
@@ -6,15 +6,16 @@
 from joblib import Parallel, delayed
 from tqdm import tqdm
 from valentine import valentine_match
-from valentine.algorithms import Coma
+from valentine.algorithms import Coma, Cupid
 
-from feature_discovery.config import DATA_FOLDER, CONNECTIONS
+from feature_discovery.config import DATA_FOLDER, CONNECTIONS,SLASH
 from feature_discovery.graph_processing.neo4j_transactions import merge_nodes_relation_tables
 
 
-def profile_valentine_all(valentine_threshold: float = 0.55):
-    files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
-    files = [f for f in files if CONNECTIONS not in f]
+def profile_valentine_all(valentine_threshold: float = 0.55, files: List[str] = None):
+    if files is None:  # no explicit list supplied: discover every CSV under DATA_FOLDER
+        files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
+    files = [f for f in files if CONNECTIONS not in f]  # always drop connections.csv, also for caller-supplied lists
 
     profile_valentine_logic(files, valentine_threshold)
 
@@ -30,18 +31,20 @@ def profile_valentine_logic(files: List[str], valentine_threshold: float = 0.55)
     def profile(table_pair):
         (tab1, tab2) = table_pair
 
-        a_table_path = tab1.partition(f"{DATA_FOLDER}/")[2]
-        b_table_path = tab2.partition(f"{DATA_FOLDER}/")[2]
+        a_table_path = tab1.partition(f"{DATA_FOLDER}{SLASH}")[2]
+        b_table_path = tab2.partition(f"{DATA_FOLDER}{SLASH}")[2]
 
-        a_table_name = a_table_path.split("/")[-1]
-        b_table_name = b_table_path.split("/")[-1]
+        a_table_name = a_table_path.split(f"{SLASH}")[-1]
+        b_table_name = b_table_path.split(f"{SLASH}")[-1]
 
-        print(f"Processing the match between:\n\t{a_table_path}\n\t{b_table_path}")
+        print(f"\nProcessing the match between:\n\t{a_table_path}\n\t{b_table_path}")
         df1 = pd.read_csv(tab1, encoding="utf8")
         df2 = pd.read_csv(tab2, encoding="utf8")
-        matches = valentine_match(df1, df2, Coma(strategy="COMA_OPT"))
+
+        matches = valentine_match(df1, df2, Coma(strategy="COMA_OPT")) #COMA_OPT_INST
 
         for item in matches.items():
+            print(item)
             ((_, col_from), (_, col_to)), similarity = item
             if similarity > valentine_threshold:
                 print(f"Similarity {similarity} between:\n\t{a_table_path} -- {col_from}\n\t{b_table_path} -- {col_to}")

diff --git a/src/feature_discovery/dataset_relation_graph/ingest_data.py b/src/feature_discovery/dataset_relation_graph/ingest_data.py
@@ -1,8 +1,9 @@
 import glob
+from pathlib import Path
 
 import pandas as pd
 
-from feature_discovery.config import CONNECTIONS, DATA_FOLDER
+from feature_discovery.config import CONNECTIONS, DATA_FOLDER, SLASH
 from feature_discovery.dataset_relation_graph.dataset_discovery import profile_valentine_all, profile_valentine_dataset
 from feature_discovery.experiments.dataset_object import Dataset
 from feature_discovery.graph_processing.neo4j_transactions import merge_nodes_relation_tables, create_node
@@ -52,14 +53,17 @@ def ingest_nodes(dataset_folder_name: str = None) -> None:
         files = glob.glob(f"{DATA_FOLDER / dataset_folder_name}/**/*.csv", recursive=True)
     else:
         files = glob.glob(f"{DATA_FOLDER}/**/*.csv", recursive=True)
-
+    
     for f in files:
         if "datasets.csv" in f:
             continue
-        table_path = f.partition(f"{DATA_FOLDER}/")[2]
-        table_name = table_path.split("/")[-1]
+        table_path = f.partition(f"{DATA_FOLDER}{SLASH}")[2]
+        table_name = table_path.split(f"{SLASH}")[-1]
+
         create_node(table_path, table_name)
 
+    return files
+
 
 def ingest_data_with_pk_fk(dataset: Dataset, profile_valentine: bool = False, mix_datasets: bool = False):
     mapping = ingest_unprocessed_data(dataset.base_table_label)