
Commit

addresses first set of comments
skothenhill-nv committed Nov 18, 2024
1 parent da7105b commit 08ad85c
Showing 4 changed files with 44 additions and 44 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -247,14 +247,14 @@ and DataModule types.
 ```
 bionemo-esm2-train \
-    --data-config-t bionemo.esm2.run.config_models.ESM2DataConfig \
-    --model-config-t bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \
+    --data-config-cls bionemo.esm2.run.config_models.ESM2DataConfig \
+    --model-config-cls bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \
     --config my_config.yaml
 ```

-> NOTE: both data-config-t and model-config-t have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig
+> NOTE: both data-config-cls and model-config-cls have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig
-DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import
+DataConfigCls and ModelConfigCls can also refer to locally defined types by the user. As long as python knows how to import
 the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to
 mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule
 type, and then pass in the config type to the training recipe.
@@ -346,14 +346,14 @@ the default pretraining DataConfig and DataModule will be insufficient. See ESM2
 ```bash
 bionemo-geneformer-train \
-    --data-config-t bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \
-    --model-config-t bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \
+    --data-config-cls bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \
+    --model-config-cls bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \
     --config my_config.yaml
 ```

-> NOTE: both data-config-t and model-config-t have default values corresponding to GeneformerPretrainingDataConfig and ExposedGeneformerPretrainConfig
+> NOTE: both data-config-cls and model-config-cls have default values corresponding to GeneformerPretrainingDataConfig and ExposedGeneformerPretrainConfig
-DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import
+DataConfigCls and ModelConfigCls can also refer to locally defined types by the user. As long as python knows how to import
 the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to
 mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule
 type, and then pass in the config type to the training recipe.
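The README text above describes pointing `--data-config-cls` at a locally defined config class. Below is a minimal, hypothetical sketch of what that might look like; `MyDataModule`, `MyDataConfig`, the `bionemo.llm.run.config_models.DataConfig` import path, and the `construct_data_module` hook are assumptions for illustration, not verbatim bionemo API.

```python
# Hypothetical sketch: a user-defined DataConfig whose generic parameter is a
# custom DataModule, usable with --data-config-cls if its dotted path imports.
from bionemo.llm.run.config_models import DataConfig  # assumed base-class location

from my_project.data import MyDataModule  # your own Lightning-style DataModule


class MyDataConfig(DataConfig[MyDataModule]):
    """Config specialized on MyDataModule; field values come from your YAML."""

    data_dir: str = "/data/my_dataset"
    micro_batch_size: int = 8

    def construct_data_module(self, global_batch_size: int) -> MyDataModule:
        # Build the DataModule from the validated config fields (assumed hook name).
        return MyDataModule(data_dir=self.data_dir, global_batch_size=global_batch_size)
```

With a class like this importable as `my_project.configs.MyDataConfig`, the training recipe would be invoked with `--data-config-cls my_project.configs.MyDataConfig --config my_config.yaml`.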
34 changes: 17 additions & 17 deletions sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py
@@ -29,13 +29,13 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Run ESM2 pretraining")
     parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file")
     parser.add_argument(
-        "--model-config-t",
+        "--model-config-cls",
         default=ExposedESM2PretrainConfig,
         required=False,
         help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedESM2PretrainConfig.",
     )
     parser.add_argument(
-        "--data-config-t",
+        "--data-config-cls",
         default=ESM2DataConfig,
         required=False,
         help="fully resolvable python import path to the ModelConfig object.",
@@ -87,32 +87,32 @@ def string_to_class(path: str):
     module = importlib.import_module(module_path)
     return getattr(module, class_name)

-def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig:
+def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
     with open(config_path, "r") as f:
         config_dict = yaml.safe_load(f)

-    # model/data_config_t is used to select the parser dynamically.
-    if model_config_t is None or model_config_t == "ExposedESM2PretrainConfig":
-        model_config_t = ExposedESM2PretrainConfig
-    elif model_config_t == "ExposedFineTuneSeqModel":
+    # model/data_config_cls is used to select the parser dynamically.
+    if model_config_cls is None or model_config_cls == "ExposedESM2PretrainConfig":
+        model_config_cls = ExposedESM2PretrainConfig
+    elif model_config_cls == "ExposedFineTuneSeqModel":
         # Hardcoded path for those who do not know the full path
-        # model_config_t = ExposedFineTuneSeqLenBioBertConfig
+        # model_config_cls = ExposedFineTuneSeqLenBioBertConfig
         raise NotImplementedError()
-    elif model_config_t == "ExposedFineTuneTokenModel":
+    elif model_config_cls == "ExposedFineTuneTokenModel":
         raise NotImplementedError()
-    elif isinstance(model_config_t, str):
+    elif isinstance(model_config_cls, str):
         # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig'
-        model_config_t = string_to_class(model_config_t)
+        model_config_cls = string_to_class(model_config_cls)

-    if data_config_t is None:
-        data_config_t = ESM2DataConfig
-    elif isinstance(data_config_t, str):
-        data_config_t = string_to_class(data_config_t)
+    if data_config_cls is None:
+        data_config_cls = ESM2DataConfig
+    elif isinstance(data_config_cls, str):
+        data_config_cls = string_to_class(data_config_cls)

-    return MainConfig[model_config_t, data_config_t](**config_dict)
+    return MainConfig[model_config_cls, data_config_cls](**config_dict)

 args = parse_args()
-config = load_config(args.config, args.model_config_t, args.data_config_t)
+config = load_config(args.config, args.model_config_cls, args.data_config_cls)

 if args.nsys_profiling:
     nsys_config = NsysConfig(
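The hunk above shows only the tail of `string_to_class`; for context, here is a self-contained sketch of the same dotted-path resolution. The `rsplit` line is an assumption (it is not visible in the hunk), but the `importlib.import_module`/`getattr` tail matches the diff.

```python
import importlib


def string_to_class(path: str):
    """Resolve a fully qualified import path, e.g.
    'bionemo.esm2.run.config_models.ESM2DataConfig', to the class object."""
    # Split "pkg.module.ClassName" into the module path and the class name.
    module_path, class_name = path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# Any importable dotted path works, e.g. a stdlib class:
print(string_to_class("json.decoder.JSONDecoder"))  # <class 'json.decoder.JSONDecoder'>
```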
36 changes: 18 additions & 18 deletions sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py
@@ -33,16 +33,16 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Run Geneformer pretraining")
     parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file")
     parser.add_argument(
-        "--model-config-t",
+        "--model-config-cls",
         default=ExposedGeneformerPretrainConfig,
         required=False,
-        help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
+        help="fully resolvable python import path to the ModelConfig class. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.",
     )
     parser.add_argument(
-        "--data-config-t",
+        "--data-config-cls",
         default=GeneformerPretrainingDataConfig,
         required=False,
-        help="fully resolvable python import path to the ModelConfig object.",
+        help="fully resolvable python import path to the class.",
     )
     parser.add_argument(
         "--resume-if-exists",
@@ -92,28 +92,28 @@ def string_to_class(path: str):
     module = importlib.import_module(module_path)
     return getattr(module, class_name)

-def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig:
+def load_config(config_path: str, model_config_cls: Optional[str], data_config_cls: Optional[str]) -> MainConfig:
     with open(config_path, "r") as f:
         config_dict = yaml.safe_load(f)

-    # model/data_config_t is used to select the parser dynamically.
-    if model_config_t is None or model_config_t == "ExposedGeneformerPretrainConfig":
-        model_config_t = ExposedGeneformerPretrainConfig
-    elif model_config_t == "ExposedFineTuneSeqLenBioBertConfig":
+    # model/data_config_cls is used to select the parser dynamically.
+    if model_config_cls is None or model_config_cls == "ExposedGeneformerPretrainConfig":
+        model_config_cls = ExposedGeneformerPretrainConfig
+    elif model_config_cls == "ExposedFineTuneSeqLenBioBertConfig":
         # Hardcoded path for those who do not know the full path
-        model_config_t = ExposedFineTuneSeqLenBioBertConfig
-    elif isinstance(model_config_t, str):
+        model_config_cls = ExposedFineTuneSeqLenBioBertConfig
+    elif isinstance(model_config_cls, str):
         # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig'
-        model_config_t = string_to_class(model_config_t)
+        model_config_cls = string_to_class(model_config_cls)

-    if data_config_t is None:
-        data_config_t = GeneformerPretrainingDataConfig
-    elif isinstance(data_config_t, str):
-        data_config_t = string_to_class(data_config_t)
-    return MainConfig[model_config_t, data_config_t](**config_dict)
+    if data_config_cls is None:
+        data_config_cls = GeneformerPretrainingDataConfig
+    elif isinstance(data_config_cls, str):
+        data_config_cls = string_to_class(data_config_cls)
+    return MainConfig[model_config_cls, data_config_cls](**config_dict)

 args = parse_args()
-config = load_config(args.config, args.model_config_t, args.data_config_t)
+config = load_config(args.config, args.model_config_cls, args.data_config_cls)

 if args.nsys_profiling:
     nsys_config = NsysConfig(
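As the `load_config` change above shows, `--model-config-cls` accepts either a built-in short name or a fully resolvable import path. A minimal standalone sketch of that dispatch, with placeholder names standing in for the real bionemo config classes:

```python
import importlib
from typing import Optional, Type


def resolve_config_cls(value: Optional[str], default_cls: Type, builtins: dict) -> Type:
    """Mirror of load_config's selection logic: None falls back to the default
    class, a known short name maps directly, and any other string is treated
    as a dotted import path (as string_to_class does in the diff above)."""
    if value is None:
        return default_cls
    if value in builtins:
        return builtins[value]
    module_path, class_name = value.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)
```

In the geneformer entrypoint the default and short-name table correspond to `ExposedGeneformerPretrainConfig` and `ExposedFineTuneSeqLenBioBertConfig`; the resolved classes then parameterize `MainConfig[model_config_cls, data_config_cls]` before the YAML dictionary is validated.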
1 change: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ def test_finetune_cli(tmpdir):
     if result.returncode != 0:
         raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}")

-    cmd_str = f"bionemo-geneformer-train --conf {config} --model-config-t ExposedFineTuneSeqLenBioBertConfig"
+    cmd_str = f"bionemo-geneformer-train --conf {config} --model-config-cls ExposedFineTuneSeqLenBioBertConfig"
     env = dict(**os.environ)  # a local copy of the environment
     open_port = find_free_network_port()
     env["MASTER_PORT"] = str(open_port)
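The test change only renames the flag inside `cmd_str`; the actual subprocess invocation lies outside the visible hunk. A hedged sketch of how such a command is typically run against a local copy of the environment (`find_free_network_port`, the real config path, and the exact `subprocess` call are assumptions here):

```python
import os
import shlex
import subprocess

config = "my_config.yaml"  # placeholder for the config file the test writes out
cmd_str = f"bionemo-geneformer-train --conf {config} --model-config-cls ExposedFineTuneSeqLenBioBertConfig"

env = dict(**os.environ)      # a local copy of the environment
env["MASTER_PORT"] = "29401"  # placeholder for find_free_network_port()

result = subprocess.run(shlex.split(cmd_str), env=env, capture_output=True, text=True)
if result.returncode != 0:
    raise Exception(f"Finetune recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}")
```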

