Merge pull request #20 from cbib/new-bivar
New: bivariate analysis
johaGL authored Jan 22, 2024
2 parents 64211e2 + 5931508 commit e5ba409
Showing 7 changed files with 705 additions and 46 deletions.
1 change: 0 additions & 1 deletion src/dimet/__main__.py
@@ -25,7 +25,6 @@ def main_run_analysis(cfg: DictConfig) -> None:
        config=hydra.utils.instantiate(cfg.analysis.dataset))
    dataset.preload()
    dataset.split_datafiles_by_compartment()
    dataset.save_datafiles_split_by_compartment()
    method: Method = hydra.utils.instantiate(
        cfg.analysis.method).build()  # method factory

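Note on this change: the deleted line here is the `save_datafiles_split_by_compartment()` call, the only step that wrote the compartment-split tables to disk, so after this commit `main_run_analysis` keeps the split dataframes in memory only. A minimal sketch of the remaining flow, assuming the usual Hydra entry point (the `Dataset(...)` constructor line sits just above this hunk, and the final `method.run(...)` dispatch is inferred from the `Method` subclasses, not shown in this hunk):

```python
import hydra
from omegaconf import DictConfig

from dimet.data import Dataset
from dimet.method import Method


def run_analysis(cfg: DictConfig) -> None:
    # Mirror of the flow that remains in main_run_analysis after this commit.
    dataset: Dataset = Dataset(
        config=hydra.utils.instantiate(cfg.analysis.dataset))
    dataset.preload()                         # read the raw tab-separated tables
    dataset.split_datafiles_by_compartment()  # split kept in memory; nothing written to disk
    method: Method = hydra.utils.instantiate(cfg.analysis.method).build()  # method factory
    method.run(cfg, dataset)                  # assumed dispatch to the selected analysis
```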
26 changes: 26 additions & 0 deletions src/dimet/config/analysis/method/bivariate_analysis.yaml
@@ -0,0 +1,26 @@
_target_: dimet.method.BivariateAnalysisConfig

label: bivariate analysis
name: Computation of the correlation of MDV profiles, or the metabolite time course profiles

# (**) : will run automatically

conditions_MDV_comparison: # (**) if >= 2 conditions and >=1 timepoint (timepoints run separately)
  isotopologue_proportions: pearson

timepoints_MDV_comparison: # (**) if >= 1 condition and >=2 timepoints
  isotopologue_proportions: pearson

conditions_metabolite_time_profiles: # (**) if >= 2 conditions AND >=2 time points in data
  abundances: pearson
  mean_enrichment: pearson

correction_method: fdr_bh

impute_values:
  abundances: "min"
  mean_enrichment: "min"
  isotopologues: "min"
  isotopologue_proportions: "min"

output_include_gmean_arr_columns: True # if False, the 'gmean_arr_..' columns are excluded
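Because `_target_` points at `dimet.method.BivariateAnalysisConfig` (added in `src/dimet/method/__init__.py` below), Hydra can resolve this file into a config object and then into the analysis class through its `build()` factory. A minimal standalone sketch, assuming the YAML is loaded outside the normal Hydra composition (DiMet itself composes it into `cfg.analysis.method`) and that the extra keys are tolerated by the pydantic config:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Load the method config shown above and let Hydra resolve _target_.
# The path and the standalone usage are illustrative only.
method_cfg = OmegaConf.load(
    "src/dimet/config/analysis/method/bivariate_analysis.yaml")

config = instantiate(method_cfg)  # -> BivariateAnalysisConfig instance
method = config.build()           # -> BivariateAnalysis, via the factory added below

print(config.correction_method)                 # "fdr_bh"
print(config.output_include_gmean_arr_columns)  # True
```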
30 changes: 9 additions & 21 deletions src/dimet/data/__init__.py
@@ -47,7 +47,6 @@ def build(self) -> "Dataset":
class Dataset(BaseModel):
    config: DatasetConfig
    raw_data_folder: str = None
    processed_data_folder: str = None
    sub_folder_absolute: str = None
    metadata_df: Optional[pd.DataFrame] = None
    abundances_df: Optional[pd.DataFrame] = None
@@ -77,8 +76,7 @@ def preload(self):
        else:
            self.sub_folder_absolute = self.config.subfolder
        self.raw_data_folder = os.path.join(self.sub_folder_absolute, "raw")
        self.processed_data_folder = os.path.join(self.sub_folder_absolute,
                                                   "processed")

        # start loading the dataframes
        file_paths = [
            ("metadata", os.path.join(self.raw_data_folder,
@@ -106,10 +104,14 @@ def preload(self):
                dfs.append(pd.read_csv(file_path, sep="\t", header=0))
                self.available_datasets.add(label)
            except FileNotFoundError:
                logger.critical(
                    "File %s not found, continuing, "
                    "but this might fail miserably",
                    file_path)
                if file_path.endswith(self.config.isotopologues + ".csv"):
                    message_detail = "isotopologue absolute values missing"
                    logger.critical(
                        "File %s not found (%s), continuing"
                        % (file_path, message_detail))
                else:
                    logger.critical("File %s not found, continuing",
                                    file_path)
                dfs.append(None)
            except Exception as e:
                logger.error(
@@ -169,19 +171,6 @@ def split_datafiles_by_compartment(self) -> None:
        frames_dict = set_samples_names(frames_dict, self.metadata_df)
        self.compartmentalized_dfs = frames_dict

    def save_datafiles_split_by_compartment(self) -> None:
        os.makedirs(self.processed_data_folder, exist_ok=True)
        out_data_path = self.processed_data_folder
        for file_name in self.compartmentalized_dfs.keys():
            for compartment in self.compartmentalized_dfs[file_name].keys():
                df = self.compartmentalized_dfs[file_name][compartment]
                tmp_file_name = self.get_file_for_label(file_name)
                output_file_name = f"{tmp_file_name}-{compartment}.csv"
                df.to_csv(os.path.join(out_data_path, output_file_name),
                          sep="\t", header=True, index=False)
                logger.info(
                    f"Saved the {compartment} compartment version "
                    f"of {file_name} in {out_data_path}")

    def get_file_for_label(self, label):
        if label == "abundances":
@@ -210,7 +199,6 @@ class DataIntegration(Dataset):
    def set_dataset_integration_config(self):
        self.preload()
        self.split_datafiles_by_compartment()
        self.save_datafiles_split_by_compartment()

        self.integration_files_folder_absolute = os.path.join(
            self.sub_folder_absolute, "integration_files")
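With the `processed/` on-disk copies gone, the output of `split_datafiles_by_compartment()` lives only in `compartmentalized_dfs`, a nested dict keyed first by datatype label and then by compartment; this is the structure the new bivariate analysis checks below. A small sketch of how downstream code reads it, assuming the dataset construction shown in `__main__.py` (compartment names depend entirely on the metadata):

```python
# dataset: Dataset is assumed to have been built from the Hydra config,
# exactly as in __main__.py; its construction is omitted here.
dataset.preload()
dataset.split_datafiles_by_compartment()

# Nested access: compartmentalized_dfs[datatype_label][compartment] -> DataFrame
if "isotopologue_proportions" in dataset.compartmentalized_dfs:
    for compartment, df in dataset.compartmentalized_dfs[
            "isotopologue_proportions"].items():
        print(compartment, df.shape)
```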
88 changes: 88 additions & 0 deletions src/dimet/method/__init__.py
@@ -15,6 +15,7 @@
    metabolites_values_for_metabologram)
from dimet.data import DataIntegration, Dataset
from dimet.helpers import flatten
from dimet.processing.bivariate_analysis import bivariate_comparison
from dimet.processing.differential_analysis import (differential_comparison,
                                                    multi_group_compairson,
                                                    time_course_analysis)
@@ -184,6 +185,18 @@ def build(self) -> "MetabologramIntegration":
        return MetabologramIntegration(config=self)


class BivariateAnalysisConfig(MethodConfig):
    """
    Sets default values or fills them for the bi-variate analysis
    """
    correction_method: str = "fdr_bh"
    output_include_gmean_arr_columns: bool = True

    def build(self) -> "BivariateAnalysis":
        return BivariateAnalysis(config=self)



class Method(BaseModel):
    config: MethodConfig

@@ -852,3 +865,78 @@ def check_expectations_config_metabo(
        except ValueError as e:
            logger.error(f"Data inconsistency: {e}")
            sys.exit(1)


class BivariateAnalysis(Method):
    config: BivariateAnalysisConfig

    def run(self, cfg: DictConfig, dataset: Dataset) -> None:
        """
        Runs bivariate analysis, the 'behavior' is the type of comparison:
        - conditions_MDV_comparison
        - timepoints_MDV_comparison
        - conditions_metabolite_time_profiles
        """
        logger.info(
            "Will compute bi-variate analysis, with the following config: %s",
            self.config)

        out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
        os.makedirs(out_table_dir, exist_ok=True)
        self.check_expectations(cfg, dataset)

        datatype = "isotopologue_proportions"
        if datatype in dataset.compartmentalized_dfs.keys():
            logger.info(f"Running bi-variate analysis with "
                        f"{datatype}:")
            if len(cfg.analysis.conditions) >= 2:
                logger.info("assessing MDV (Mass Distribution Vector) "
                            "between conditions")
                bivariate_comparison(
                    datatype, dataset, cfg,
                    behavior="conditions_MDV_comparison",
                    out_table_dir=out_table_dir)
            if len(dataset.metadata_df["timepoint"].unique()) >= 2:
                logger.info("assessing MDV (Mass Distribution Vector) "
                            "between time-points")
                bivariate_comparison(
                    datatype, dataset, cfg,
                    behavior="timepoints_MDV_comparison",
                    out_table_dir=out_table_dir)

        if (len(cfg.analysis.conditions) >= 2) and (
                len(dataset.metadata_df["timepoint"].unique()) >= 2):
            for datatype in ["abundances", "mean_enrichment"]:
                if datatype in dataset.compartmentalized_dfs.keys():
                    logger.info(f"Running bi-variate analysis with "
                                f"{datatype} to compare "
                                f"time course profiles between conditions")
                    bivariate_comparison(
                        datatype, dataset, cfg,
                        behavior="conditions_metabolite_time_profiles",
                        out_table_dir=out_table_dir)

    def check_expectations(self, cfg: DictConfig, dataset: Dataset) -> None:
        # check that necessary information is provided in the analysis config
        try:
            if ((len(cfg.analysis.conditions) < 2) and
                    (len(dataset.metadata_df["timepoint"].unique()) < 2)):
                raise ValueError("Less than 2 conditions, "
                                 "AND less than 2 timepoints, "
                                 "impossible to run bi-variate analysis, "
                                 "aborting")
            if not set(cfg.analysis.conditions).issubset(
                    set(dataset.metadata_df['condition'])):
                raise ValueError(
                    "Conditions provided for bi-variate analysis "
                    "in the config file "
                    "are not present in the metadata file, aborting"
                )
        except ConfigAttributeError as e:
            logger.error(
                f"Mandatory parameter not provided in the config file:{e}, "
                f"aborting")
            sys.exit(1)
        except ValueError as e:
            logger.error(f"Data inconsistency:{e}")
            sys.exit(1)
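Every branch of `run()` above ends in the same call to `bivariate_comparison`; for reference, a hypothetical direct invocation mirroring the conditions-MDV case (`cfg` and `dataset` are assumed to be the Hydra config and preloaded `Dataset` that `run()` receives):

```python
import os

from dimet.processing.bivariate_analysis import bivariate_comparison

# out_table_dir mirrors the directory that run() creates for the result tables.
out_table_dir = os.path.join(os.getcwd(), cfg.table_path)
os.makedirs(out_table_dir, exist_ok=True)

bivariate_comparison(
    "isotopologue_proportions", dataset, cfg,
    behavior="conditions_MDV_comparison",  # or "timepoints_MDV_comparison",
    out_table_dir=out_table_dir)           # or "conditions_metabolite_time_profiles"
```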