From 08ad85c67dd9131c3ad922a8e61e60c8032450e5 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 18 Nov 2024 14:35:27 -0800 Subject: [PATCH] addresses first set of comments --- README.md | 16 ++++----- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 34 +++++++++--------- .../src/bionemo/geneformer/run/main.py | 36 +++++++++---------- .../geneformer/scripts/test_pydantic_train.py | 2 +- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 455edd023..0ce2bcb94 100644 --- a/README.md +++ b/README.md @@ -247,14 +247,14 @@ and DataModule types. ``` bionemo-esm2-train \ ---data-config-t bionemo.esm2.run.config_models.ESM2DataConfig \ ---model-config-t bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \ +--data-config-cls bionemo.esm2.run.config_models.ESM2DataConfig \ +--model-config-cls bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \ --config my_config.yaml ``` -> NOTE: both data-config-t and model-config-t have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig +> NOTE: both data-config-cls and model-config-cls have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig -DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import +DataConfigCls and ModelConfigCls can also refer to locally defined types by the user. As long as python knows how to import the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule type, and then pass in the config type to the training recipe. @@ -346,14 +346,14 @@ the default pretraining DataConfig and DataModule will be insufficient. See ESM2 ```bash bionemo-geneformer-train \ ---data-config-t bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \ ---model-config-t bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \ +--data-config-cls bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \ +--model-config-cls bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \ --config my_config.yaml ``` -> NOTE: both data-config-t and model-config-t have default values corresponding to GeneformerPretrainingDataConfig and ExposedGeneformerPretrainConfig +> NOTE: both data-config-cls and model-config-cls have default values corresponding to GeneformerPretrainingDataConfig and ExposedGeneformerPretrainConfig -DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import +DataConfigCls and ModelConfigCls can also refer to locally defined types by the user. As long as python knows how to import the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule type, and then pass in the config type to the training recipe. diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index 1d0ca2659..857db8ad4 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -29,13 +29,13 @@ def parse_args(): parser = argparse.ArgumentParser(description="Run ESM2 pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") parser.add_argument( - "--model-config-t", + "--model-config-cls", default=ExposedESM2PretrainConfig, required=False, help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedESM2PretrainConfig.", ) parser.add_argument( - "--data-config-t", + "--data-config-cls", default=ESM2DataConfig, required=False, help="fully resolvable python import path to the ModelConfig object.", @@ -87,32 +87,32 @@ def string_to_class(path: str): module = importlib.import_module(module_path) return getattr(module, class_name) - def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig: + def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig: with open(config_path, "r") as f: config_dict = yaml.safe_load(f) - # model/data_config_t is used to select the parser dynamically. - if model_config_t is None or model_config_t == "ExposedESM2PretrainConfig": - model_config_t = ExposedESM2PretrainConfig - elif model_config_t == "ExposedFineTuneSeqModel": + # model/data_config_cls is used to select the parser dynamically. + if model_config_cls is None or model_config_cls == "ExposedESM2PretrainConfig": + model_config_cls = ExposedESM2PretrainConfig + elif model_config_cls == "ExposedFineTuneSeqModel": # Hardcoded path for those who do not know the full path - # model_config_t = ExposedFineTuneSeqLenBioBertConfig + # model_config_cls = ExposedFineTuneSeqLenBioBertConfig raise NotImplementedError() - elif model_config_t == "ExposedFineTuneTokenModel": + elif model_config_cls == "ExposedFineTuneTokenModel": raise NotImplementedError() - elif isinstance(model_config_t, str): + elif isinstance(model_config_cls, str): # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig' - model_config_t = string_to_class(model_config_t) + model_config_cls = string_to_class(model_config_cls) - if data_config_t is None: - data_config_t = ESM2DataConfig - elif isinstance(data_config_t, str): - data_config_t = string_to_class(data_config_t) + if data_config_cls is None: + data_config_cls = ESM2DataConfig + elif isinstance(data_config_cls, str): + data_config_cls = string_to_class(data_config_cls) - return MainConfig[model_config_t, data_config_t](**config_dict) + return MainConfig[model_config_cls, data_config_cls](**config_dict) args = parse_args() - config = load_config(args.config, args.model_config_t, args.data_config_t) + config = load_config(args.config, args.model_config_cls, args.data_config_cls) if args.nsys_profiling: nsys_config = NsysConfig( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index b7a99947e..4b49946ce 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -33,16 +33,16 @@ def parse_args(): parser = argparse.ArgumentParser(description="Run Geneformer pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") parser.add_argument( - "--model-config-t", + "--model-config-cls", default=ExposedGeneformerPretrainConfig, required=False, - help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.", + help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.", ) parser.add_argument( - "--data-config-t", + "--data-config-cls", default=GeneformerPretrainingDataConfig, required=False, - help="fully resolvable python import path to the ModelConfig object.", + help="fully resolvable python import path to the class.", ) parser.add_argument( "--resume-if-exists", @@ -92,28 +92,28 @@ def string_to_class(path: str): module = importlib.import_module(module_path) return getattr(module, class_name) - def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig: + def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig: with open(config_path, "r") as f: config_dict = yaml.safe_load(f) - # model/data_config_t is used to select the parser dynamically. - if model_config_t is None or model_config_t == "ExposedGeneformerPretrainConfig": - model_config_t = ExposedGeneformerPretrainConfig - elif model_config_t == "ExposedFineTuneSeqLenBioBertConfig": + # model/data_config_cls is used to select the parser dynamically. + if model_config_cls is None or model_config_cls == "ExposedGeneformerPretrainConfig": + model_config_cls = ExposedGeneformerPretrainConfig + elif model_config_cls == "ExposedFineTuneSeqLenBioBertConfig": # Hardcoded path for those who do not know the full path - model_config_t = ExposedFineTuneSeqLenBioBertConfig - elif isinstance(model_config_t, str): + model_config_cls = ExposedFineTuneSeqLenBioBertConfig + elif isinstance(model_config_cls, str): # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig' - model_config_t = string_to_class(model_config_t) + model_config_cls = string_to_class(model_config_cls) - if data_config_t is None: - data_config_t = GeneformerPretrainingDataConfig - elif isinstance(data_config_t, str): - data_config_t = string_to_class(data_config_t) - return MainConfig[model_config_t, data_config_t](**config_dict) + if data_config_cls is None: + data_config_cls = GeneformerPretrainingDataConfig + elif isinstance(data_config_cls, str): + data_config_cls = string_to_class(data_config_cls) + return MainConfig[model_config_cls, data_config_cls](**config_dict) args = parse_args() - config = load_config(args.config, args.model_config_t, args.data_config_t) + config = load_config(args.config, args.model_config_cls, args.data_config_cls) if args.nsys_profiling: nsys_config = NsysConfig( diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py index 5d7a821d0..707c3dc36 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py @@ -145,7 +145,7 @@ def test_finetune_cli(tmpdir): if result.returncode != 0: raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") - cmd_str = f"bionemo-geneformer-train --conf {config} --model-config-t ExposedFineTuneSeqLenBioBertConfig" + cmd_str = f"bionemo-geneformer-train --conf {config} --model-config-cls ExposedFineTuneSeqLenBioBertConfig" env = dict(**os.environ) # a local copy of the environment open_port = find_free_network_port() env["MASTER_PORT"] = str(open_port)