Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Package #3

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions carte/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from carte.src import *
from carte.configs import *
#from carte.data import *
from carte.scripts import *
4 changes: 4 additions & 0 deletions carte/configs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from carte.configs.carte_configs import *
from carte.configs.directory import *
from carte.configs.model_parameters import *
from carte.configs.visuailization import *
166 changes: 166 additions & 0 deletions carte/configs/carte_configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""Specific configurations for the CARTE paper."""

## Dataset names
carte_datalist = [
"anime_planet",
"babies_r_us",
"beer_ratings",
"bikedekho",
"bikewale",
"buy_buy_baby",
"cardekho",
"chocolate_bar_ratings",
"clear_corpus",
"coffee_ratings",
"company_employees",
"employee_remuneration",
"employee_salaries",
"fifa22_players",
"filmtv_movies",
"journal_jcr",
"journal_sjr",
"jp_anime",
"k_drama",
"michelin",
"mlds_salaries",
"movies",
"museums",
"mydramalist",
"nba_draft",
"prescription_drugs",
"ramen_ratings",
"roger_ebert",
"rotten_tomatoes",
"spotify",
"us_accidents_counts",
"us_accidents_severity",
"us_presidential",
"used_cars_24",
"used_cars_benz_italy",
"used_cars_dot_com",
"used_cars_pakistan",
"used_cars_saudi_arabia",
"videogame_sales",
"whisky",
"wikiliq_beer",
"wikiliq_spirit",
"wina_pl",
"wine_dot_com_prices",
"wine_dot_com_ratings",
"wine_enthusiasts_prices",
"wine_enthusiasts_ratings",
"wine_vivino_price",
"wine_vivino_rating",
"yelp",
"zomato",
]

## Dictionary of baseline methods
carte_singletable_baselines = dict()
carte_singletable_baselines["full"] = [
"carte-gnn",
"catboost",
"sentence-llm-concat-num_histgb",
"sentence-llm-concat-num_xgb",
"sentence-llm-embed-num_histgb",
"sentence-llm-embed-num_xgb",
"tablevectorizer-fasttext_histgb",
"tablevectorizer-fasttext_xgb",
"tablevectorizer-llm_histgb",
"tablevectorizer-llm_xgb",
"tablevectorizer_histgb",
"tablevectorizer_logistic",
"tablevectorizer_mlp",
"tablevectorizer_randomforest",
"tablevectorizer_resnet",
"tablevectorizer_ridge",
"tablevectorizer_xgb",
"tablevectorizer_tabpfn",
"target-encoder_histgb",
"target-encoder_logistic",
"target-encoder_mlp",
"target-encoder_randomforest",
"target-encoder_resnet",
"target-encoder_ridge",
"target-encoder_xgb",
"target-encoder_tabpfn",
]

carte_singletable_baselines["reduced"] = [
"carte-gnn",
"catboost",
"sentence-llm-concat-num_xgb",
"sentence-llm-embed-num_xgb",
"tablevectorizer_logistic",
"tablevectorizer_mlp",
"tablevectorizer_randomforest",
"tablevectorizer_resnet",
"tablevectorizer_ridge",
"tablevectorizer_xgb",
"target-encoder_tabpfn",
]

carte_multitable_baselines = [
"original_carte-multitable",
"matched_carte-multitable",
"original_catboost-multitable",
"matched_catboost-multitable",
"original-sentence-llm_histgb-multitable",
"matched-sentence-llm_histgb-multitable",
]


## Dictionary of method mapping
carte_singletable_baseline_mapping = dict()
carte_singletable_baseline_mapping["carte-gnn"] = "CARTE"

# Preprocessings
carte_singletable_baseline_mapping["tablevectorizer_"] = "TabVec-"
carte_singletable_baseline_mapping["tablevectorizer-"] = "TabVec-"
carte_singletable_baseline_mapping["target-encoder_"] = "TarEnc-"
carte_singletable_baseline_mapping["fasttext_"] = "FT-"
carte_singletable_baseline_mapping["llm_"] = "LLM-"
carte_singletable_baseline_mapping["sentence-llm-concat-num_"] = "S-LLM-CN-"
carte_singletable_baseline_mapping["sentence-llm-embed-num_"] = "S-LLM-EN-"

# Estimators
carte_singletable_baseline_mapping["catboost"] = "CatBoost"
carte_singletable_baseline_mapping["xgb"] = "XGB"
carte_singletable_baseline_mapping["histgb"] = "HGB"
carte_singletable_baseline_mapping["randomforest"] = "RF"
carte_singletable_baseline_mapping["ridge"] = "Ridge"
carte_singletable_baseline_mapping["logistic"] = "Logistic"
carte_singletable_baseline_mapping["mlp"] = "MLP"
carte_singletable_baseline_mapping["resnet"] = "ResNet"
carte_singletable_baseline_mapping["tabpfn"] = "TabPFN"

# Bagging
carte_singletable_baseline_mapping["bagging"] = "Bagging"

## Colors for visualization
carte_singletable_color_palette = dict()
carte_singletable_color_palette["CARTE"] = "C3"
carte_singletable_color_palette["CatBoost"] = "C0"
carte_singletable_color_palette["TabVec-XGB"] = "C1"
carte_singletable_color_palette["TabVec-RF"] = "C2"
carte_singletable_color_palette["TabVec-Ridge"] = "C4"
carte_singletable_color_palette["TabVec-Logistic"] = "C5"
carte_singletable_color_palette["S-LLM-CN-XGB"] = "C6"
carte_singletable_color_palette["S-LLM-EN-XGB"] = "C7"
carte_singletable_color_palette["TabVec-ResNet"] = "C8"
carte_singletable_color_palette["TabVec-MLP"] = "C9"
carte_singletable_color_palette["TarEnc-TabPFN"] = "#A9561E"

## Markers for visualization
carte_singletable_markers = dict()
carte_singletable_markers["CARTE"] = "o"
carte_singletable_markers["TabVec-XGB"] = (4, 0, 45)
carte_singletable_markers["TabVec-RF"] = "P"
carte_singletable_markers["CatBoost"] = "X"
carte_singletable_markers["S-LLM-CN-XGB"] = (4, 0, 0)
carte_singletable_markers["S-LLM-EN-XGB"] = "d"
carte_singletable_markers["TabVec-Ridge"] = "v"
carte_singletable_markers["TabVec-Logistic"] = "v"
carte_singletable_markers["TabVec-ResNet"] = "^"
carte_singletable_markers["TabVec-MLP"] = "p"
carte_singletable_markers["TarEnc-TabPFN"] = (5, 1, 0)
24 changes: 24 additions & 0 deletions carte/configs/directory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Configurations for directory
"""

from pathlib import Path

base_path = Path().cwd()
config_directory = dict()
config_directory["base_path"] = base_path

config_directory["data"] = str(base_path / "data/")
config_directory["pretrained_model"] = str(base_path / "data/etc/kg_pretrained.pt")
config_directory["data_raw"] = str(base_path / "data/data_raw/")
config_directory["data_singletable"] = str(base_path / "data/data_singletable/")
config_directory["data_yago"] = str(base_path / "data/data_yago/")
config_directory["etc"] = str(base_path / "data/etc/")

config_directory["results"] = str(base_path / "results/")
config_directory["compiled_results"] = str(base_path / "results/compiled_results/")
config_directory["visualization"] = str(base_path / "visualization/")

# Specify the directory in which you have downloaded each
config_directory["fasttext"] = str(base_path / "data/etc/cc.en.300.bin")
config_directory["ken_embedding"] = str(base_path / "data/etc/ken_embedding.parquet")
148 changes: 148 additions & 0 deletions carte/configs/model_parameters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""
Parameter distributions for hyperparameter optimization
"""

import numpy as np
from scipy.stats import loguniform, randint, uniform, norm
import copy


class loguniform_int:
    """Log-uniform distribution whose samples are truncated to integers.

    Wraps ``scipy.stats.loguniform`` so it can be used as a parameter
    distribution in randomized hyperparameter search.
    """

    def __init__(self, a, b):
        # Frozen continuous log-uniform distribution on [a, b].
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw sample(s) from the distribution, truncated to int."""
        continuous_sample = self._distribution.rvs(*args, **kwargs)
        return continuous_sample.astype(int)


class norm_int:
    """Integer valued version of the normal distribution, clamped below at 1.

    Wraps ``scipy.stats.norm(a, b)`` (mean ``a``, std ``b``) so it can be
    used as a parameter distribution in randomized hyperparameter search.
    """

    def __init__(self, a, b):
        # Frozen normal distribution with mean a and standard deviation b.
        self._distribution = norm(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample, truncated to int and clamped to >= 1.

        The original implementation drew TWO independent samples — one for
        the ``< 1`` check and a different one for the return — so with a
        stateful RNG the returned value could still be below 1. Drawing a
        single sample and clamping it fixes that.
        """
        sample = self._distribution.rvs(*args, **kwargs).astype(int)
        # np.maximum handles both scalar samples and arrays (size=... kwarg).
        return np.maximum(sample, 1)


# Search spaces for randomized hyperparameter optimization, keyed by the
# internal estimator name. Values are dicts mapping parameter name to either
# a scipy frozen distribution (sampled via .rvs) or a list of candidates.
param_distributions_total = dict()

# carte-gnn: only the learning rate is searched, over a fixed grid.
param_distributions = {
    "learning_rate": [1e-4, 2.5e-4, 5e-4, 7.5e-4, 1e-3],
}
param_distributions_total["carte-gnn"] = param_distributions

# histgb
param_distributions = {
    "learning_rate": loguniform(1e-2, 10),
    "max_depth": [None, 2, 3, 4],
    "max_leaf_nodes": norm_int(31, 5),
    "min_samples_leaf": norm_int(20, 2),
    "l2_regularization": loguniform(1e-6, 1e3),
}
param_distributions_total["histgb"] = param_distributions

# catboost
param_distributions = {
    "max_depth": randint(2, 11),
    "learning_rate": loguniform(1e-5, 1),
    "bagging_temperature": uniform(0, 1),
    "l2_leaf_reg": loguniform(1, 10),
    "iterations": randint(400, 1001),
    "one_hot_max_size": randint(2, 26),
}
param_distributions_total["catboost"] = param_distributions

# xgb (uniform(0.5, 1 - 0.5) is scipy's loc/width form of U[0.5, 1])
param_distributions = {
    "n_estimators": randint(50, 1001),
    "max_depth": randint(2, 11),
    "min_child_weight": loguniform(1, 100),
    "subsample": uniform(0.5, 1 - 0.5),
    "learning_rate": loguniform(1e-5, 1),
    "colsample_bylevel": uniform(0.5, 1 - 0.5),
    "colsample_bytree": uniform(0.5, 1 - 0.5),
    "gamma": loguniform(1e-8, 7),
    "lambda": loguniform(1, 4),
    "alpha": loguniform(1e-8, 100),
}
param_distributions_total["xgb"] = param_distributions

# RandomForest
param_distributions = {
    "n_estimators": randint(50, 250),
    "max_depth": [None, 2, 3, 4],
    # Either a named heuristic, all features (None), or a fraction 0.1-0.9.
    "max_features": ["sqrt", "log2", None, *(x / 10 for x in range(1, 10))],
    "min_samples_leaf": loguniform_int(0.5, 50.5),
    "bootstrap": [True, False],
    "min_impurity_decrease": [0.0, 0.01, 0.02, 0.05],
}
param_distributions_total["randomforest"] = param_distributions


# resnet
param_distributions = {
    "normalization": ["batchnorm", "layernorm"],
    "num_layers": randint(1, 9),
    "hidden_dim": randint(32, 513),
    "hidden_factor": randint(1, 3),
    "hidden_dropout_prob": uniform(0.0, 0.5),
    "residual_dropout_prob": uniform(0.0, 0.5),
    "learning_rate": loguniform(1e-5, 1e-2),
    "weight_decay": loguniform(1e-8, 1e-2),
    "batch_size": [16, 32],
}
param_distributions_total["resnet"] = param_distributions

# mlp (hidden_dim grid is the powers of two 2**4 .. 2**10)
param_distributions = {
    "hidden_dim": [16, 32, 64, 128, 256, 512, 1024],
    "num_layers": randint(1, 5),
    "dropout_prob": uniform(0.0, 0.5),
    "learning_rate": loguniform(1e-5, 1e-2),
    "weight_decay": loguniform(1e-8, 1e-2),
    "batch_size": [16, 32],
}
param_distributions_total["mlp"] = param_distributions

# ridge regression
param_distributions = {
    "solver": ["svd", "cholesky", "lsqr", "sag"],
    "alpha": loguniform(1e-5, 100),
}
param_distributions_total["ridge"] = param_distributions

# logistic regression
param_distributions = {
    "solver": ["newton-cg", "lbfgs", "liblinear"],
    "penalty": ["none", "l1", "l2", "elasticnet"],
    "C": loguniform(1e-5, 100),
}
param_distributions_total["logistic"] = param_distributions

# tabpfn: no hyperparameters are searched.
param_distributions = {}
param_distributions_total["tabpfn"] = param_distributions

# Multi-table (transfer) variants reuse the base search space and add the
# fraction of source-domain rows mixed into training.
for _base_name in ("catboost", "histgb"):
    param_distributions = copy.deepcopy(param_distributions_total[_base_name])
    param_distributions["source_fraction"] = uniform(0, 1)
    param_distributions_total[_base_name + "-multitable"] = param_distributions
43 changes: 43 additions & 0 deletions carte/configs/visuailization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
Visualization configurations
"""

# Main models
model_color_palette = dict()
model_color_palette["CARTE"] = "C3"
model_color_palette["CatBoost"] = "C0"
model_color_palette["TabVec-XGB"] = "C1"
model_color_palette["TabVec-RF"] = "C2"
model_color_palette["TabVec-Ridge"] = "C4"
model_color_palette["TabVec-Logistic"] = "C5"
model_color_palette["S-LLM-CN-XGB"] = "C6" # ""
model_color_palette["S-LLM-EN-XGB"] = "C7" # "C7" "#C875C4" mediumorchid
model_color_palette["ResNet"] = "C8"
model_color_palette["MLP"] = "C9"
model_color_palette["TabPFN"] = "#A9561E"

model_color_palette["TabVec-RandomForest"] = "C2"
model_color_palette["TabVec-ResNet"] = "C8"
model_color_palette["TabVec-MLP"] = "C9"
model_color_palette["TarEnc-TabPFN"] = "#A9561E"


# model_color_palette["CARTE-B"] = "C3"
# model_color_palette["CatBoost-B"] = "C0"
# model_color_palette["TabVec-XGB-B"] = "C1"
# model_color_palette["TabVec-RF-B"] = "C2"
# model_color_palette["TabVec-Ridge-B"] = "C4"
# model_color_palette["TabVec-Logistic-B"] = "C5"
# model_color_palette["S-LLM-CN-XGB-B"] = "C6"
# model_color_palette["S-LLM-EN-XGB-B"] = "C7"
# model_color_palette["ResNet-B"] = "C8"
# model_color_palette["MLP-B"] = "C9"
# model_color_palette["TabPFN-B"] = "#A9561E"


# model_color_palette["TabVec-HGB"] = "#650021"
# model_color_palette["TabVec-TabPFN"] = "#650021"
# model_color_palette["TabVec-FT-XGB"] = "#650021"
# model_color_palette["TabVec-FT-HGB"] = "#650021"

# model_color_palette["TabLLM"] = "#653700"
1 change: 1 addition & 0 deletions carte/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from carte.data.data_singletable import *
Loading
Loading