From ad4e9a6c3dfd82ffdf12c18d59c26f72d8cbdf25 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 21 Nov 2024 09:01:31 -0500 Subject: [PATCH 1/4] Add initial 5.0.0 files and configurations --- mlperf_logging/benchmark_meta.py | 9 + mlperf_logging/compliance_checker/README.md | 38 +-- .../compliance_checker/mlp_compliance.py | 2 +- .../compliance_checker/mlp_parser/__init__.py | 3 + .../mlp_parser/ruleset_500.py | 105 ++++++ .../training_5.0.0/closed_bert.yaml | 48 +++ .../training_5.0.0/closed_common.yaml | 11 + .../training_5.0.0/closed_dlrm_dcnv2.yaml | 59 ++++ .../training_5.0.0/closed_gnn.yaml | 21 ++ .../training_5.0.0/closed_gpt3.yaml | 79 +++++ .../closed_llama2_70b_lora.yaml | 42 +++ .../training_5.0.0/closed_retinanet.yaml | 35 ++ .../closed_stable_diffusion.yaml | 74 +++++ .../training_5.0.0/common.yaml | 146 +++++++++ .../training_5.0.0/open_bert.yaml | 7 + .../training_5.0.0/open_common.yaml | 6 + .../training_5.0.0/open_dlrm_dcnv2.yaml | 7 + .../training_5.0.0/open_gnn.yaml | 7 + .../training_5.0.0/open_gpt3.yaml | 79 +++++ .../training_5.0.0/open_llama2_70b_lora.yaml | 7 + .../training_5.0.0/open_retinanet.yaml | 7 + .../training_5.0.0/open_stable_diffusion.yaml | 33 ++ mlperf_logging/mllog/constants.py | 1 + mlperf_logging/package_checker/README.md | 2 +- .../package_checker/package_checker.py | 6 +- mlperf_logging/rcp_checker/README.md | 4 +- mlperf_logging/rcp_checker/rcp_checker.py | 6 +- .../rcp_checker/training_5.0.0/rcps_bert.json | 303 ++++++++++++++++++ .../training_5.0.0/rcps_dlrm_dcnv2.json | 133 ++++++++ .../rcp_checker/training_5.0.0/rcps_gnn.json | 90 ++++++ .../rcp_checker/training_5.0.0/rcps_gpt3.json | 93 ++++++ .../training_5.0.0/rcps_llama2_70b_lora.json | 91 ++++++ .../training_5.0.0/rcps_retinanet.json | 163 ++++++++++ .../training_5.0.0/rcps_stable_diffusion.json | 112 +++++++ mlperf_logging/repo_checker/README.md | 2 +- mlperf_logging/repo_checker/repo_checker.py | 4 +- mlperf_logging/result_summarizer/config.yaml | 9 + 
scripts/verify_for_v5.0_training.sh | 18 ++ 38 files changed, 1830 insertions(+), 32 deletions(-) create mode 100644 mlperf_logging/compliance_checker/mlp_parser/ruleset_500.py create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_bert.yaml create mode 100755 mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_dlrm_dcnv2.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml create mode 100755 mlperf_logging/compliance_checker/training_5.0.0/closed_llama2_70b_lora.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_retinanet.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_stable_diffusion.yaml create mode 100755 mlperf_logging/compliance_checker/training_5.0.0/common.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_bert.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_dlrm_dcnv2.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml create mode 100755 mlperf_logging/compliance_checker/training_5.0.0/open_llama2_70b_lora.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_retinanet.yaml create mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_stable_diffusion.yaml create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_bert.json create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_dlrm_dcnv2.json create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json create mode 100644 
mlperf_logging/rcp_checker/training_5.0.0/rcps_llama2_70b_lora.json create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json create mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_stable_diffusion.json create mode 100755 scripts/verify_for_v5.0_training.sh diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 6b56ef5f..42540133 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -131,6 +131,15 @@ 'stable_diffusion', 'llama2_70b_lora', 'gnn' + ], + '5.0': [ + 'bert', + 'dlrm_dcnv2', + 'gpt3', + 'retinanet', + 'stable_diffusion', + 'llama2_70b_lora', + 'gnn' ] }, diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 36e4603c..5fbd4881 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,7 +10,7 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME -By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`. +By default, 5.0.0 training edition rules are used and the default config is set to `5.0.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 @@ -22,23 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ ### Existing config files for training submissions - 4.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file - 4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks - 4.1.0/open_common.yaml - the common rules file for open submissions. 
These rules apply to all benchmarks - 4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. - 4.1.0/closed_bert.yaml - 4.1.0/closed_dlrm_dcnv2.yaml - 4.1.0/closed_gpt3.yaml - 4.1.0/closed_gnn.yaml - 4.1.0/closed_llama2_70b_lora.yaml - 4.1.0/closed_stable_diffusion.yaml - 4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions. - 4.1.0/open_bert.yaml - 4.1.0/open_dlrm_dcnv2.yaml - 4.1.0/open_gpt3.yaml - 4.1.0/open_gnn.yaml - 4.1.0/open_llama2_70b_lora.yaml - 4.1.0/open_stable_diffusion.yaml + 5.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file + 5.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks + 5.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks + 5.0.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. + 5.0.0/closed_bert.yaml + 5.0.0/closed_dlrm_dcnv2.yaml + 5.0.0/closed_gpt3.yaml + 5.0.0/closed_gnn.yaml + 5.0.0/closed_llama2_70b_lora.yaml + 5.0.0/closed_stable_diffusion.yaml + 5.0.0/open_ssd.yaml - Per-benchmark rules, open submissions.
+ 5.0.0/open_bert.yaml + 5.0.0/open_dlrm_dcnv2.yaml + 5.0.0/open_gpt3.yaml + 5.0.0/open_gnn.yaml + 5.0.0/open_llama2_70b_lora.yaml + 5.0.0/open_stable_diffusion.yaml ### Existing config files for HPC submissions @@ -173,7 +173,7 @@ Tested and confirmed working using the following software versions: - Python 2.7.12 + PyYAML 3.11 - Python 3.6.8 + PyYAML 5.1 - Python 2.9.2 + PyYAML 5.3.1 -- Python 3.9.10 + PyYAML 5.4.1 +- Python 3.9.10 + PyYAML 5.4.1 ### How to install PyYaML diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py index aa031de2..89a3aa9c 100644 --- a/mlperf_logging/compliance_checker/mlp_compliance.py +++ b/mlperf_logging/compliance_checker/mlp_compliance.py @@ -315,7 +315,7 @@ def get_parser(): parser.add_argument('--usage', type=str, default='training', choices=usage_choices(), help='what WG do the benchmarks come from') - parser.add_argument('--ruleset', type=str, default='4.1.0', + parser.add_argument('--ruleset', type=str, default='5.0.0', choices=rule_choices(), help='what version of rules to check the log against') parser.add_argument('--config', type=str, diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 6fa7b9cc..03e886eb 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -8,6 +8,7 @@ from .ruleset_310 import parse_file as parse_file_310 from .ruleset_400 import parse_file as parse_file_400 from .ruleset_410 import parse_file as parse_file_410 +from .ruleset_500 import parse_file as parse_file_500 def parse_file(filename, ruleset='0.6.0'): @@ -31,5 +32,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_400(filename) elif ruleset == '4.1.0': return parse_file_410(filename) + elif ruleset == '5.0.0': + return parse_file_500(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') diff
--git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_500.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_500.py new file mode 100644 index 00000000..e30b08d2 --- /dev/null +++ b/mlperf_logging/compliance_checker/mlp_parser/ruleset_500.py @@ -0,0 +1,105 @@ +''' +Parses a text MLPerf log into a structured format. +''' + +from __future__ import print_function + +import collections +import json +import re +import sys +from dataclasses import dataclass + +from io import open + +@dataclass +class LogLine: + """Class for keeping track of an item in inventory.""" + full_string: str + timestamp: float + key: str + value: str + lineno: int + +TOKEN = ':::MLLOG ' + + +def parse_line(line): + if not line.startswith(TOKEN): + return None + + return json.loads(line[len(TOKEN):]) + + +def string_to_logline(lineno, string): + ''' Returns a LogLine or raises a ValueError ''' + m = parse_line(string) + + if m is None: + raise ValueError('does not match regex') + + args = [] + args.append(string) # full string + + ts = float(m['time_ms']) # may raise error, e.g. 
"1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_bert.yaml new file mode 100644 index 00000000..408f669b --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_bert.yaml @@ -0,0 +1,48 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: 
opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: num_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: start_warmup_step + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_lamb_weight_decay_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml new file mode 100755 index 00000000..501cf1f6 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora'] " + POST: " enqueue_config('training_5.0.0/closed_{}.yaml'.format(v['value'])) " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_dlrm_dcnv2.yaml new file mode 100644 index 00000000..45344bd2 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_dlrm_dcnv2.yaml @@ -0,0 +1,59 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adagrad' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adagrad_learning_rate_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_initial_accumulator_value + REQ:
EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_start_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml new file mode 100644 index 00000000..2c1f7286 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml @@ -0,0 +1,21 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml new file mode 100644 index 00000000..8007184a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama2_70b_lora.yaml new file mode 100755 index 00000000..46de03ef --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_llama2_70b_lora.yaml @@ -0,0 +1,42 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_alpha + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_rank + REQ: EXACTLY_ONE + CHECK: " v['value'] == 16" + +- 
KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_retinanet.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_retinanet.yaml new file mode 100644 index 00000000..794ab7ab --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_retinanet.yaml @@ -0,0 +1,35 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_epochs + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: opt_learning_rate_warmup_factor + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.340 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_stable_diffusion.yaml new file mode 100644 index 00000000..3cdc3e64 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_stable_diffusion.yaml @@ -0,0 +1,74 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged seperatly. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + + +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.999 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.01 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: aggregated_eval_accuracy + REQ: AT_LEAST(2) + CHECK: + - "'FID' in v['value']" + - "'CLIP' in v['value']" + - "'samples_count' in v['value']" + ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/common.yaml 
b/mlperf_logging/compliance_checker/training_5.0.0/common.yaml new file mode 100755 index 00000000..7a201ac9 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/common.yaml @@ -0,0 +1,146 @@ +# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. +# In the order of the appearance in the log, for each line will execute the code specified under CHECK for the KEY in that line. +# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will +# be executed before any checks. +# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will +# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. 
+# +# KEY record: +# NAME +# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_5.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm less than 20min since the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in
v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "'epoch_num' in v['metadata']" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "'epoch_num' in v['metadata']" + +# making sure previous eval did print it's accuracy result +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_bert.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_bert.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_bert.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git 
a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml new file mode 100644 index 00000000..3e174774 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml @@ -0,0 +1,6 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora'] " + POST: " enqueue_config('training_5.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_dlrm_dcnv2.yaml new file mode 100644 index 00000000..7f70c0c3 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_dlrm_dcnv2.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml new file mode 100644 index 00000000..8007184a --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml @@ -0,0 +1,79 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2048 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME:
opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adam_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adam_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adam_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_llama2_70b_lora.yaml new file mode 100755 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_retinanet.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_retinanet.yaml new file mode 100644 index 00000000..14c4176d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_retinanet.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in 
v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_stable_diffusion.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_stable_diffusion.yaml new file mode 100644 index 00000000..fe25e312 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_stable_diffusion.yaml @@ -0,0 +1,33 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged seperatly. Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +- BEGIN: + CODE: | + from dataclasses import replace + agg_eval_lines = {} + for line in loglines: + if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: + samples_count = line.value['metadata']['samples_count'] + if samples_count not in agg_eval_lines: + new_line = replace(line) # Make a copy + new_line.key = "aggregated_eval_accuracy" + new_line.full_string = "" # Not needed + new_line.lineno = -1 # Not needed + new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} + agg_eval_lines[samples_count] = new_line + + agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) + agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] + loglines.extend(agg_eval_lines.values()) + +- KEY: + NAME: aggregated_eval_accuracy + REQ: AT_LEAST(2) + CHECK: + - "'FID' in v['value']" + - "'CLIP' in v['value']" + - "'samples_count' in v['value']" + ATLEAST_ONE_CHECK: "v['value']['FID'] >= 0.0 and v['value']['CLIP'] <= 1.0" diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index 79905ac9..c9e39057 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -44,6 +44,7 @@ NCF = "ncf" RESNET = "resnet" SSD = "ssd" +RETINANET = "retinanet" STABLE_DIFFUSION 
= "stable_diffusion" TRANSFORMER = "transformer" RNNT = "rnnt" diff --git a/mlperf_logging/package_checker/README.md b/mlperf_logging/package_checker/README.md index 60562381..3445ad01 100644 --- a/mlperf_logging/package_checker/README.md +++ b/mlperf_logging/package_checker/README.md @@ -10,7 +10,7 @@ To check an organization's submission package for compliance: python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"] are supported. +Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0"] are supported. The package checker checks: 1. The number of result files for each benchmark matches the required count. If diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py index 8495c81d..50b22c61 100644 --- a/mlperf_logging/package_checker/package_checker.py +++ b/mlperf_logging/package_checker/package_checker.py @@ -182,14 +182,14 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, logging.error(" %d files do not comply, directory cannot be accepted", errors_found) # Check if each run use unique seeds. 
- if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'} and division == 'closed': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'} and division == 'closed': seed_checker_bypass = (global_seed_checker_bypass or system_seed_checker_bypass or result_seed_checker_bypass) if not seed_checker.check_seeds(result_files, seed_checker_bypass): too_many_errors = True logging.error('Seed checker failed') # Run RCP checker for >= 1.0.0 - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'} and division == 'closed' and benchmark != 'minigo': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'} and division == 'closed' and benchmark != 'minigo': # Now go again through result files to do RCP checks rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass) rcp_pass, rcp_msg, _ = rcp_checker.check_directory( @@ -243,7 +243,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc. """ too_many_errors = False - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'}: + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'}: logging.info(' Checking System Description Files') system_description_pass = check_systems(folder, usage, ruleset) too_many_errors = too_many_errors or not system_description_pass diff --git a/mlperf_logging/rcp_checker/README.md b/mlperf_logging/rcp_checker/README.md index 79dcdf25..7b05bec7 100644 --- a/mlperf_logging/rcp_checker/README.md +++ b/mlperf_logging/rcp_checker/README.md @@ -8,10 +8,10 @@ Run Reference Convergence Point checks for a submission directory. This consists of testing whether a submission does not converge statistically faster than the reference. 
-For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_4.1.0/*.json +For training, RCPs are loaded from directory mlperf_logging/rcp_checker/training_5.0.0/*.json The RCP checker supports only the 1.0.0 version onwards. -The current training version is 4.1.0. +The current training version is 5.0.0. ## Usage diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 4ee5cb36..c3479122 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -161,8 +161,8 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples): class RCP_Checker: def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None): - if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"}: - raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, and 4.1.0') + if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0"}: + raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0 and 5.0.0') self.usage = usage self.ruleset = ruleset self.benchmark = benchmark @@ -530,7 +530,7 @@ def get_parser(): parser.add_argument('--rcp_usage', type=str, default='training', choices=['training', 'hpc'], help='what WG does the benchmark come from to check the log against') - parser.add_argument('--rcp_version', type=str, default='4.1.0', + parser.add_argument('--rcp_version', type=str, default='5.0.0', help='what version of rules to check the log against') parser.add_argument('--verbose', action='store_true') parser.add_argument('--bert_train_samples', action='store_true', diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_bert.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_bert.json new file mode 100644 index 00000000..71490803 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_bert.json @@ 
-0,0 +1,303 @@ +{ + + "bert_ref_256": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-16 / TF1, TF version ~2.4", + "BS": 256, + "Hyperparams": { + "opt_base_learning_rate": 0.00035, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 13700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.9, + "opt_lamb_beta_2": 0.999, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2834944, 2508800, 2709504, 2609152, 2383360, 2308096, 2910208, 2333184, 2283008, 2935296, + 2483712, 2558976, 2709504, 2232832, 2333184, 2533888, 2709504, 2257920, 2609152, 2809856] + }, + + "bert_ref_448": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 2.1 submission, with Habana's HP set", + "Platform": "TPU-v4-32 / TF1, TF version ~2.10", + "BS": 448, + "Hyperparams": { + "opt_base_learning_rate": 0.000425, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 6700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.9, + "opt_lamb_beta_2": 0.999, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2132480, 2333184, 2408448, 2483712, 2684416, 2107392, 2157568, 2709504, 2533888, 2584064, + 1981952, 2182656, 2408448, 2433536, 2333184, 2533888, 2458624, 2558976, 2584064, 2358272, + 2358272, 2358272, 2759680] + }, + + "bert_ref_1536": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 1536, + "Hyperparams": { + "opt_base_learning_rate": 0.002, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 2254, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.996, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 2836240, 2801664, 2801664, 2727936, 2801664, 
2875392, 2899968, 2727936, 2777088, 2875392, + 2777088, 2801664, 2678784, 2801664, 2703360, 2629632, 2727936, 2703360, 2654208, 2949120] + }, + + "bert_ref_4096": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.1 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.0024, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 855, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 16 + }, + "Epochs to converge": [ + 2801664, 3022848, 2801664, 3022848, 3047424, 2727936, 2973696, 2703360, 2924544, 2629632, + 2678784, 2850816, 2777088, 2826240, 2801664, 2850816, 2924544, 2924544, 2727936, 2850816] + }, + + + "bert_ref_3072": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 3072, + "Hyperparams": { + "opt_base_learning_rate": 0.002, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 1141, + "num_warmup_steps": 100, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.66, + "opt_lamb_beta_2": 0.998, + "opt_lamb_weight_decay_rate": 0.01, + "gradient_accumulation_steps": 96 + }, + "Epochs to converge": [ + 2703360, 2482176, 3072000, 2654208, 2580480, 2727936, 2605056, 2801664, 2777088, 2580480, + 2875392, 2826240, 2973696, 2850816, 2678784, 2919120, 3121152, 2605056, 2678784, 2850816] + }, + + "bert_ref_4608": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 2.0 submission", + "Platform": "TPU-v4-16 / TF1, TF version ~2.8", + "BS": 4608, + "Hyperparams": { + "opt_base_learning_rate": 0.0035, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, + "num_warmup_steps": 0, + "start_warmup_step": 0, + "opt_lamb_beta_1": 0.62, + "opt_lamb_beta_2": 0.9, + "opt_lamb_weight_decay_rate": 0.01, + 
"gradient_accumulation_steps": 144 + }, + "Epochs to converge": [ + 2626560, 2833920, 2787840, 2949120, 2880000, 2810880, 2880000, 3041280, 2787840, 2833920, + 2741760, 2810880, 2649600, 2718720, 2488320, 2603520, 2833920, 2787840, 2810880, 3018240] + }, + + "bert_ref_6144": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 6144, + "Hyperparams": { + "opt_base_learning_rate": 0.0029293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, + "num_warmup_steps": 0, + "start_warmup_step": -700, + "opt_lamb_beta_1": 0.7206, + "opt_lamb_beta_2": 0.78921, + "opt_lamb_weight_decay_rate": 0.001, + "gradient_accumulation_steps": 24 + }, + "Epochs to converge": [ + 3366912, 3244032, 3219456, 3686400, 3317760, 3293184, 3416064, 3317760, 3391488, 2998272, + 3317760, 3072000, 3416064, 3293184, 3391488, 3514368, 3194880, 3465216, 3244032, 3268608] + }, + + "bert_ref_6912": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "At 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 6912, + "Hyperparams": { + "opt_base_learning_rate": 0.0029293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 700, + "num_warmup_steps": 0, + "start_warmup_step": -700, + "opt_lamb_beta_1": 0.7206, + "opt_lamb_beta_2": 0.78921, + "opt_lamb_weight_decay_rate": 0.001, + "gradient_accumulation_steps": 27 + }, + "Epochs to converge": [ + 3621888, 3677184, 3400704, 3594240, 3483648, 3732480, 3677184, 3797776, 3621888, 3760128, + 3649536, 3483648, 3566592, 3649536, 3621888, 3483648, 3290112, 3704832, 3594240, 3511296] + }, + + "bert_ref_8192": + { + "Benchmark": "bert", + "Creator": "Google", + "When": "Prior to 1.0 submission", + "Platform": "TPU-v4-128 / TF1, TF version ~2.4", + "BS": 8192, + "Hyperparams": { + "opt_base_learning_rate": 0.00288293, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 287, + 
"start_warmup_step": -76, + "opt_lamb_beta_1": 0.88, + "opt_lamb_beta_2": 0.88, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 16 + }, + "Epochs to converge": [ + 4251648, 4153344, 4055040, 4177920, 4177920, 4079616, 4276224, 4128768, 4177920, 4153344, + 4177920, 4079616, 4300800, 4153344, 4276224, 4423680, 4276224, 4104192, 4251648, 4153344] + }, + + "bert_ref_8704": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 8704, + "Hyperparams": { + "opt_base_learning_rate": 0.002971656225, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 287, + "start_warmup_step": -76, + "opt_lamb_beta_1": 0.88, + "opt_lamb_beta_2": 0.88, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 34 + }, + "Epochs to converge": [ + 4343040, 4143360, 4143360, 4442880, 4392960, 4243200, 4193280, 4542720, 4492800, 4243200, + 4243200, 4392960, 4243200, 4193280, 4093440, 4392960, 4093440, 4243200, 4093440, 4392960] + }, + + "bert_ref_12288": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 12288, + "Hyperparams": { + "opt_base_learning_rate": 0.0031, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 500, + "num_warmup_steps": 300, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.80, + "opt_lamb_beta_2": 0.925, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 32 + }, + "Epochs to converge": [ + 4542720, 4392960, 4642560, 4542720, 4542720, 4492800, 4343040, 4343040, 4442880, 4442880, + 4442880, 4442880, 4442880, 4692480, 4492800, 4442880, 4442880, 4442880, 4492800, 4343040] + }, + + "bert_ref_13056": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 1.1 submission", + "Platform": "TBD", + "BS": 13056, + "Hyperparams": { + "opt_base_learning_rate": 0.00319540686, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 500, + 
"num_warmup_steps": 300, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.80, + "opt_lamb_beta_2": 0.925, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 34 + }, + "Epochs to converge": [ + 4442880, 4592640, 4642560, 4842240, 4742400, 4592640, 4642560, 4692480, 4942080, 4542720, + 4592640, 4093440, 4442880, 4792320, 4642560, 4592640, 4592640, 4892160, 4742400, 4592640] + }, + + "bert_ref_16384": + { + "Benchmark": "bert", + "Creator": "NVIDIA", + "When": "At 2.0 submission", + "Platform": "TPU-v3-128", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.0033, + "opt_epsilon": 1e-6, + "opt_learning_rate_training_steps": 600, + "num_warmup_steps": 290, + "start_warmup_step": -100, + "opt_lamb_beta_1": 0.75, + "opt_lamb_beta_2": 0.9, + "opt_lamb_weight_decay_rate": 0.0166629, + "gradient_accumulation_steps": 32 + }, + "Epochs to converge": [ + 5619712, 5770240, 5720064, 5419008, 5519360, 5569536, 5218304, 5469184, 5419008, 5218304, + 5669888, 5669888, 5519360, 5569536, 5368832, 5469184, 5569536, 5469184, 5368832, 5469184] + } +} diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_dlrm_dcnv2.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_dlrm_dcnv2.json new file mode 100644 index 00000000..35c34bc0 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_dlrm_dcnv2.json @@ -0,0 +1,133 @@ +{ + + "dlrm_dcnv2_ref_32768": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 32768, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, + 0.75, 0.7, 0.7, 0.7, 0.75, 
0.75, 0.75, 0.7, 0.7, 0.7, + 0.7, 0.7, 0.75, 0.7, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7 + ] + }, + + "dlrm_dcnv2_ref_55296": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "BS": 55296, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.75, 0.75, 0.7, 0.8, 0.75, 0.75, 0.75, 0.75, 0.75, + 0.9, 0.7, 0.75, 0.8, 0.7, 0.8, 0.7, 0.7, 0.75, 0.7, + 0.7, 0.9, 0.75, 0.7, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, + 0.9, 0.75, 0.8, 0.75, 0.8, 0.75, 0.75, 0.75, 0.7, 0.75, + 0.75, 0.8, 0.75, 0.8, 0.8, 0.9, 0.75, 0.75, 0.7, 0.75 + ] + }, + + "dlrm_dcnv2_ref_65536": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 65536, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, 0.9, 0.95, 0.75, + 0.75, 0.75, 0.85, 0.85, 0.7, 0.75, 0.75, 0.9, 0.85, 0.8, + 0.7, 0.75, 0.75, 0.75, 0.8, 0.9, 0.75, 0.8, 0.85, 0.8 + ] + }, + + "dlrm_dcnv2_ref_102400": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "BS": 102400, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + 
"opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.85, 0.95, 0.95, 0.85, 0.9, 0.8, 0.85, 0.9, 0.9, 0.9, + 0.95, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.85, 0.9, 0.9, + 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.9, 0.9, + 0.9, 0.95, 0.85, 0.9, 0.9, 0.9, 0.85, 0.9, 0.95, 0.9, + 0.85, 0.95, 0.9, 0.9, 0.8, 0.9, 0.9, 0.9, 0.85, 0.9 + ] + }, + + "dlrm_dcnv2_ref_135168": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "BS": 135168, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.0034, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95, 0.95, 0.9, + 0.95, 0.95, 0.95, 1.0, 0.85, 0.9, 0.9, 0.95, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 1.0, 0.9, 0.95, 0.95, + 0.85, 0.95, 0.95, 0.95, 0.9, 0.95, 0.9, 0.9, 1.0, 0.9, + 0.95, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.9, 0.9, 0.9, + 0.9, 0.9, 0.9, 0.9, 0.95, 0.85, 0.95, 0.95, 0.9, 0.95, + 0.95, 0.95, 0.95, 1.0, 0.9, 0.95, 0.9, 1.0, 0.85, 0.9, + 0.9, 0.95, 0.95, 0.9, 0.95, 0.9, 0.95, 0.85, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.95, 0.9, 0.95, 0.9, 1.0 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json new file mode 100644 index 00000000..54aef9c6 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json @@ -0,0 +1,90 @@ +{ + + "gnn_ref_4096": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "1xDGX-A100 and 8xDGX-A100", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.001 
+ }, + "Epochs to converge": [ + 0.85,0.75,0.75,0.80,0.80,0.75, + 0.75,0.85,0.75,0.75,0.80,0.80, + 0.80,0.75,0.80,0.80,0.80,0.80, + 0.80,0.85 ] + }, + + "gnn_ref_16384": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "8xDGX-A100", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.002 + }, + "Epochs to converge": [ + 0.85,0.95,0.85,0.80,0.90,0.75, + 0.80,0.90,0.90,0.85,0.90,0.85, + 0.85,0.85,0.85,0.90,0.85,0.85, + 0.85,0.90 ] + }, + + "gnn_ref_32768": + { + "Benchmark": "gnn", + "Creator": "Intel", + "When": "Reference RCPs before v4.0", + "Platform": "16xSPR-2S", + "BS": 32768, + "Hyperparams": { + "opt_base_learning_rate": 0.002 + }, + "Epochs to converge": [ + 1.00,0.95,0.90,0.95,0.95,1.00, + 0.90,0.95,0.95,0.95,1.00,0.90, + 0.95,0.95,0.95,0.90,0.95,0.90, + 0.90,0.90 ] + }, + + "gnn_ref_65536": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "32xDGX-A100", + "BS": 65536, + "Hyperparams": { + "opt_base_learning_rate": 0.003 + }, + "Epochs to converge": [ + 1.25,1.20,1.25,1.20,1.15,1.15, + 1.15,1.20,1.15,1.20,1.25,1.15, + 1.20,1.20,1.15,1.25,1.20,1.15, + 1.10,1.15 + ] + }, + + "gnn_ref_262144": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "128xDGX-H100", + "BS": 262144, + "Hyperparams": { + "opt_base_learning_rate": 0.005 + }, + "Epochs to converge": [ + 2.40,2.55,2.35,2.45,2.50,2.35, + 2.45,2.60,2.35,2.55,2.60,2.40, + 2.40,2.30,2.30,2.45,2.60,2.50, + 2.75,2.45 + ] + } +} + diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json new file mode 100644 index 00000000..a37bec48 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json @@ -0,0 +1,93 @@ +{ + + "gpt3_ref_1536": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-1536 / 
PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 1536, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1157627904, 1157627904, 1157627904, 1258291200, 1207959552, 1258291200 + ] + }, + + "gpt3_ref_2048": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 2048, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1157627904, 1207959552, 1157627904, 1207959552, 1207959552, 1157627904, 1157627904 + ] + }, + + "gpt3_ref_3072": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 3072, + "Hyperparams": { + "opt_base_learning_rate": 2e-5 + }, + "Epochs to converge": [ + 1258291200, 1207959552, 1207959552, 1207959552, 1207959552, 1207959552, 13790871552 + ] + }, + + "gpt3_ref_4096": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 3e-5 + }, + "Epochs to converge": [ + 1258291200, 1258291200, 1308622848, 1258291200, 1258291200, 1258291200 + ] + }, + + "gpt3_ref_6144": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 H100-80GB / Megatron-LM", + "BS": 6144, + "Hyperparams": { + "opt_base_learning_rate": 3e-5 + }, + "Epochs to converge": [ + 1409286144, 1409286144, 1409286144, 1409286144, 1409286144, 1409286144 + ] + }, + + "gpt3_ref_8192": + { + "Benchmark": "gpt3", + "Creator": "Google & NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", + "BS": 8192, + "Hyperparams": { + "opt_base_learning_rate": 3e-5 + }, + "Epochs to converge": [ + 1610612736, 
1660944384, 1660944384, 1610612736, 1610612736, 1610612736 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama2_70b_lora.json new file mode 100644 index 00000000..b96a329d --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_llama2_70b_lora.json @@ -0,0 +1,91 @@ +{ + "llama2_70b_lora_ref_8": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "BS": 8, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688, + 3456,3072,3072,3072,3840,3456,2688,3072,3456,3456 + ] + }, + + "llama2_70b_lora_ref_16": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840, + 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456 + ] + }, + "llama2_70b_lora_ref_32": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 
5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, + 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 + ] + }, + "llama2_70b_lora_ref_128": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 1e-3, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 11520,13056,10752,12672,12288,11136,10752,13056, 10752,9984, + 11136,11136,11136,10752,11520,11136,11136,10752,11136,9984 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json new file mode 100644 index 00000000..2e19f356 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json @@ -0,0 +1,163 @@ +{ + + "ssd_ref_256": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "1xDGX-A100", + "BS": 256, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] + }, + + "ssd_ref_320": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.1", + "Platform": "2xDGX-A100", + "BS": 320, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5] + }, + + "ssd_ref_512": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "8xDGX-A100", + "BS": 512, + "Hyperparams": { + 
"opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] + }, + + "ssd_ref_768": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.0", + "Platform": "8xDGX-A100", + "BS": 768, + "Hyperparams": { + "opt_base_learning_rate": 0.00013, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] + }, + + "ssd_ref_1024": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.0", + "Platform": "8xDGX-A100", + "BS": 1024, + "Hyperparams": { + "opt_base_learning_rate": 0.00011, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] + }, + + "ssd_ref_1280": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.1", + "Platform": "16xDGX-A100", + "BS": 1280, + "Hyperparams": { + "opt_base_learning_rate": 0.00013, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7] + }, + + "ssd_ref_2048": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.1", + "Platform": "16xDGX-A100", + "BS": 2048, + "Hyperparams": { + "opt_base_learning_rate": 0.000135, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 9] + }, + + "ssd_ref_2560": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "20xDGX-H100", + "BS": 2560, + 
"Hyperparams": { + "opt_base_learning_rate": 0.000145, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, + 9, 9, 9] + }, + + "ssd_ref_4096": + { + "Benchmark": "ssd", + "Creator": "NVIDIA", + "When": "Reference RCPs before v2.0", + "Platform": "16xDGX-A100", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.0001, + "opt_learning_rate_warmup_factor": 1e-3, + "opt_learning_rate_warmup_epochs": 1, + "opt_weight_decay": 0 + }, + "Epochs to converge": [ + 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 16, 16, 16] + } +} + diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_stable_diffusion.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_stable_diffusion.json new file mode 100644 index 00000000..2a35896d --- /dev/null +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_stable_diffusion.json @@ -0,0 +1,112 @@ +{ + + "sd_ref_256": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.1", + "Platform": "4xDGX-H100", + "BS": 256, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.8e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 2048000, 2048000, 2048000, 2048000, + 2048000, 2048000, 2048000, 1536000, + 2048000, 2048000, 2048000, 2048000, + 2048000, 2048000, 2048000, 1536000, + 2048000, 2048000, 2560000, 2560000] + }, + + + "sd_ref_384": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "16xDGX-H100", + "BS": 384, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.25e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 2049024, 
2049024, 2049024, 2561280, + 2561280, 2561280, 2561280, 2561280, + 2561280, 2561280, 2561280, 2561280, + 3073536, 3073536, 3073536] + }, + + "sd_ref_512": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.1", + "Platform": "32xDGX-A100", + "BS": 512, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.25e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 2560000, 2560000, 2560000, 2560000, 2560000, + 2560000, 2560000, 2560000, 2560000, 2560000, + 2560000, 2560000, 2560000, 3072000] + }, + + "sd_ref_1024": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.1", + "Platform": "32xDGX-A100", + "BS": 1024, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.25e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 2560000, 2560000, 2560000, 2560000, 2560000, + 3072000, 3072000, 3072000, 3072000, 3072000, + 3072000, 3072000, 2560000] + }, + + "sd_ref_2048": + { + "Benchmark": "stable_diffusion", + "Creator": "NVIDIA", + "When": "Reference RCPs before v3.1", + "Platform": "32xDGX-A100", + "BS": 2048, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.999, + "opt_adamw_epsilon": 1e-08, + "opt_adamw_weight_decay": 0.01, + "opt_base_learning_rate": 1.25e-7, + "opt_learning_rate_warmup_steps": 1000 + }, + "Epochs to converge": [ + 3584000, 3584000, 3584000, 3584000, 4096000, + 4096000, 4096000, 4096000, 4096000, 4096000, + 4096000, 4608000, 4608000] + } + +} diff --git a/mlperf_logging/repo_checker/README.md b/mlperf_logging/repo_checker/README.md index 0c4e4953..3d027834 100644 --- a/mlperf_logging/repo_checker/README.md +++ b/mlperf_logging/repo_checker/README.md @@ -12,7 +12,7 @@ 
review process. python3 -m mlperf_logging.repo_checker FOLDER USAGE RULESET ``` -Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0 and 4.1.0 are supported. +Currently, USAGE in ["training", "hpc"] and RULESETS 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0 and 5.0.0 are supported. The repo checker checks: 1. Whether the repo contains filenames that github does not like, e.g. files with spaces, diff --git a/mlperf_logging/repo_checker/repo_checker.py b/mlperf_logging/repo_checker/repo_checker.py index b838742c..fc02548f 100644 --- a/mlperf_logging/repo_checker/repo_checker.py +++ b/mlperf_logging/repo_checker/repo_checker.py @@ -127,8 +127,8 @@ def get_parser(): parser.add_argument( 'ruleset', type=str, - choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0'], - help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, and 4.1.0 are currently supported.' + choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0'], + help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0 and 5.0.0 are currently supported.' 
) parser.add_argument( '--log_output', diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index e59a58fb..729b28c3 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -84,6 +84,15 @@ columns: llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] default: [" ", " ", " "] + "5.0.0": + bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] + gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"] + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + retinanet: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] + stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] + llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] + gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] + default: [" ", " ", " "] hpc: "2.0.0": diff --git a/scripts/verify_for_v5.0_training.sh b/scripts/verify_for_v5.0_training.sh new file mode 100755 index 00000000..28770e13 --- /dev/null +++ b/scripts/verify_for_v5.0_training.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# rcp_bypass and rcp_bert_train_samples package checker params +# need to be retrieved at package_checker_params file at top-level submission dir. 
+PACKAGE_CHECKER_PARAMS="" +PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params" +if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then + while IFS= read -r line + do + PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line" + done < "$PACKAGE_CHECKER_PARAMS_FILE" +fi + +python3 -m mlperf_logging.package_checker $1 training 5.0.0 $PACKAGE_CHECKER_PARAMS +python3 -m mlperf_logging.result_summarizer $1 training 5.0.0 +python3 -m mlperf_logging.repo_checker $1 training 5.0.0 From 5f7e2257c716a176d007d6ed3e5637b0881f5fce Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Fri, 6 Dec 2024 15:12:10 -0500 Subject: [PATCH 2/4] Remove GPT3 v5.0 files + rename ssd -> retinanet --- mlperf_logging/benchmark_meta.py | 3 +- mlperf_logging/compliance_checker/README.md | 6 +- .../training_5.0.0/closed_common.yaml | 2 +- .../training_5.0.0/closed_gpt3.yaml | 79 ---------------- .../training_5.0.0/open_common.yaml | 2 +- .../training_5.0.0/open_gpt3.yaml | 79 ---------------- .../rcp_checker/training_5.0.0/rcps_gpt3.json | 93 ------------------- .../training_5.0.0/rcps_retinanet.json | 36 +++---- mlperf_logging/result_summarizer/config.yaml | 1 - 9 files changed, 23 insertions(+), 278 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml delete mode 100644 mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml delete mode 100644 mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 42540133..afc0c74b 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -134,8 +134,7 @@ ], '5.0': [ 'bert', - 'dlrm_dcnv2', - 'gpt3', + 'dlrm_dcnv2', 'retinanet', 'stable_diffusion', 'llama2_70b_lora', diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 5fbd4881..10d81c18 100644 --- a/mlperf_logging/compliance_checker/README.md +++ 
b/mlperf_logging/compliance_checker/README.md @@ -25,17 +25,15 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ 5.0.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file 5.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks 5.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks - 5.0.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. + 5.0.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions. 5.0.0/closed_bert.yaml 5.0.0/closed_dlrm_dcnv2.yaml - 5.0.0/closed_gpt3.yaml 5.0.0/closed_gnn.yaml 5.0.0/closed_llama2_70b_lora.yaml 5.0.0/closed_stable_diffusion.yaml - 5.0.0/open_ssd.yaml - Per-benchmark rules, open submissions. + 5.0.0/open_retinanet.yaml - Per-benchmark rules, open submissions. 5.0.0/open_bert.yaml 5.0.0/open_dlrm_dcnv2.yaml - 5.0.0/open_gpt3.yaml 5.0.0/open_gnn.yaml 5.0.0/open_llama2_70b_lora.yaml 5.0.0/open_stable_diffusion.yaml diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml index 501cf1f6..9de408fa 100755 --- a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora'] " + CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'gnn', 'llama2_70b_lora'] " POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml deleted file mode 
100644 index 8007184a..00000000 --- a/mlperf_logging/compliance_checker/training_5.0.0/closed_gpt3.yaml +++ /dev/null @@ -1,79 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - -- KEY: - NAME: max_sequence_length - REQ: EXACTLY_ONE - CHECK: " v['value'] == 2048 " - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adam' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " - -- KEY: - NAME: opt_end_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_schedule - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'cosine with linear warmup' " - -- KEY: - NAME: opt_adam_beta_1 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.9 " - -- KEY: - NAME: opt_adam_beta_2 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.95 " - -- KEY: - NAME: opt_adam_epsilon - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1e-8 " - -- KEY: - NAME: opt_gradient_clip_norm - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1.0 " - -- KEY: - NAME: gradient_accumulation_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0 " - -- KEY: - NAME: eval_samples - REQ: EXACTLY_ONE - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" - -- KEY: - NAME: init_checkpoint_step - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0 " - diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml index 3e174774..3d874f51 100644 --- a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in 
['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] " + CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'gnn', 'llama2_70b_lora'] " POST: " enqueue_config('training_4.1.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml deleted file mode 100644 index 8007184a..00000000 --- a/mlperf_logging/compliance_checker/training_5.0.0/open_gpt3.yaml +++ /dev/null @@ -1,79 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - -- KEY: - NAME: max_sequence_length - REQ: EXACTLY_ONE - CHECK: " v['value'] == 2048 " - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adam' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - CHECK: " v['value'] == 2e-5 or v['value'] == 3e-5 " - -- KEY: - NAME: opt_end_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_schedule - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'cosine with linear warmup' " - -- KEY: - NAME: opt_adam_beta_1 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.9 " - -- KEY: - NAME: opt_adam_beta_2 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.95 " - -- KEY: - NAME: opt_adam_epsilon - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1e-8 " - -- KEY: - NAME: opt_gradient_clip_norm - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1.0 " - -- KEY: - NAME: gradient_accumulation_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0 " - -- KEY: - NAME: eval_samples - REQ: EXACTLY_ONE - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 2.69) and v['value'] > 2.6" - -- KEY: - NAME: init_checkpoint_step - REQ: EXACTLY_ONE - 
CHECK: " v['value'] > 0 " - diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json deleted file mode 100644 index a37bec48..00000000 --- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gpt3.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - - "gpt3_ref_1536": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM", - "BS": 1536, - "Hyperparams": { - "opt_base_learning_rate": 2e-5 - }, - "Epochs to converge": [ - 1157627904, 1157627904, 1157627904, 1258291200, 1207959552, 1258291200 - ] - }, - - "gpt3_ref_2048": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", - "BS": 2048, - "Hyperparams": { - "opt_base_learning_rate": 2e-5 - }, - "Epochs to converge": [ - 1157627904, 1207959552, 1157627904, 1207959552, 1207959552, 1157627904, 1157627904 - ] - }, - - "gpt3_ref_3072": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-1536 / PaxML, 1024 A100-80GB / Megatron-LM", - "BS": 3072, - "Hyperparams": { - "opt_base_learning_rate": 2e-5 - }, - "Epochs to converge": [ - 1258291200, 1207959552, 1207959552, 1207959552, 1207959552, 1207959552, 13790871552 - ] - }, - - "gpt3_ref_4096": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", - "BS": 4096, - "Hyperparams": { - "opt_base_learning_rate": 3e-5 - }, - "Epochs to converge": [ - 1258291200, 1258291200, 1308622848, 1258291200, 1258291200, 1258291200 - ] - }, - - "gpt3_ref_6144": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-2048 / PaxML, 1024 H100-80GB / Megatron-LM", - "BS": 6144, - "Hyperparams": { - 
"opt_base_learning_rate": 3e-5 - }, - "Epochs to converge": [ - 1409286144, 1409286144, 1409286144, 1409286144, 1409286144, 1409286144 - ] - }, - - "gpt3_ref_8192": - { - "Benchmark": "gpt3", - "Creator": "Google & NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "TPU-v4-2048 / PaxML, 1024 A100-80GB / Megatron-LM", - "BS": 8192, - "Hyperparams": { - "opt_base_learning_rate": 3e-5 - }, - "Epochs to converge": [ - 1610612736, 1660944384, 1660944384, 1610612736, 1610612736, 1610612736 - ] - } - -} diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json index 2e19f356..e4158b12 100644 --- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_retinanet.json @@ -1,8 +1,8 @@ { - "ssd_ref_256": + "retinanet_ref_256": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v2.0", "Platform": "1xDGX-A100", @@ -18,9 +18,9 @@ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] }, - "ssd_ref_320": + "retinanet_ref_320": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v3.1", "Platform": "2xDGX-A100", @@ -36,9 +36,9 @@ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5] }, - "ssd_ref_512": + "retinanet_ref_512": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v2.0", "Platform": "8xDGX-A100", @@ -54,9 +54,9 @@ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] }, - "ssd_ref_768": + "retinanet_ref_768": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v3.0", "Platform": "8xDGX-A100", @@ -71,9 +71,9 @@ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5] }, - "ssd_ref_1024": + "retinanet_ref_1024": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v3.0", "Platform": "8xDGX-A100", @@ -88,9 +88,9 @@ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] }, - "ssd_ref_1280": + 
"retinanet_ref_1280": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v2.1", "Platform": "16xDGX-A100", @@ -106,9 +106,9 @@ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7] }, - "ssd_ref_2048": + "retinanet_ref_2048": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v2.1", "Platform": "16xDGX-A100", @@ -124,9 +124,9 @@ 8, 8, 8, 8, 8, 8, 8, 8, 8, 9] }, - "ssd_ref_2560": + "retinanet_ref_2560": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v4.0", "Platform": "20xDGX-H100", @@ -142,9 +142,9 @@ 9, 9, 9] }, - "ssd_ref_4096": + "retinanet_ref_4096": { - "Benchmark": "ssd", + "Benchmark": "retinanet", "Creator": "NVIDIA", "When": "Reference RCPs before v2.0", "Platform": "16xDGX-A100", diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 729b28c3..81f2bb12 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -86,7 +86,6 @@ columns: default: [" ", " ", " "] "5.0.0": bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] - gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"] dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] retinanet: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] From 02de5adc96eaa4f739538782eae036da46b1e776 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 10 Dec 2024 10:17:12 -0500 Subject: [PATCH 3/4] Fix bug: import parse_file_500 correctly --- mlperf_logging/compliance_checker/mlp_parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py 
b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 03e886eb..215853f4 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -8,7 +8,7 @@ from .ruleset_310 import parse_file as parse_file_310 from .ruleset_400 import parse_file as parse_file_400 from .ruleset_410 import parse_file as parse_file_410 -from .ruleset_410 import parse_file as parse_file_500 +from .ruleset_500 import parse_file as parse_file_500 def parse_file(filename, ruleset='0.6.0'): From e6e040b7f040a44ddce305b233b08ca58c8f7353 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Tue, 10 Dec 2024 10:24:01 -0500 Subject: [PATCH 4/4] Rename benchmark: gnn -> rgat --- mlperf_logging/benchmark_meta.py | 6 ++++-- mlperf_logging/compliance_checker/README.md | 4 ++-- .../training_5.0.0/closed_common.yaml | 2 +- .../{closed_gnn.yaml => closed_rgat.yaml} | 0 .../training_5.0.0/open_common.yaml | 2 +- .../{open_gnn.yaml => open_rgat.yaml} | 0 mlperf_logging/mllog/constants.py | 1 + mlperf_logging/rcp_checker/rcp_checker.py | 4 +++- .../{rcps_gnn.json => rcps_rgat.json} | 20 +++++++++---------- mlperf_logging/result_summarizer/config.yaml | 2 +- 10 files changed, 23 insertions(+), 18 deletions(-) rename mlperf_logging/compliance_checker/training_5.0.0/{closed_gnn.yaml => closed_rgat.yaml} (100%) rename mlperf_logging/compliance_checker/training_5.0.0/{open_gnn.yaml => open_rgat.yaml} (100%) rename mlperf_logging/rcp_checker/training_5.0.0/{rcps_gnn.json => rcps_rgat.json} (84%) diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index afc0c74b..54792e87 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -10,12 +10,14 @@ 'minigo': 10, 'resnet': 5, 'ssd': 5, + 'retinanet': 5, 'stable_diffusion': 10, 'transformer': 10, 'ncf': 10, 'rnnt': 10, 'unet3d': 40, - 'gnn' : 10, + 'gnn' : 10, + 'rgat': 10, 'llama2_70b_lora': 10, }, @@ -138,7 +140,7 @@ 'retinanet', 
'stable_diffusion', 'llama2_70b_lora', - 'gnn' + 'rgat' ] }, diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 10d81c18..abd5474f 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -28,13 +28,13 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ 5.0.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions. 5.0.0/closed_bert.yaml 5.0.0/closed_dlrm_dcnv2.yaml - 5.0.0/closed_gnn.yaml + 5.0.0/closed_rgat.yaml 5.0.0/closed_llama2_70b_lora.yaml 5.0.0/closed_stable_diffusion.yaml 5.0.0/open_retinanet.yaml - Per-benchmark rules, open submissions. 5.0.0/open_bert.yaml 5.0.0/open_dlrm_dcnv2.yaml - 5.0.0/open_gnn.yaml + 5.0.0/open_rgat.yaml 5.0.0/open_llama2_70b_lora.yaml 5.0.0/open_stable_diffusion.yaml diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml index 9de408fa..e5cc8073 100755 --- a/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.0.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'gnn', 'llama2_70b_lora'] " + CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora'] " - POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) " + POST: " enqueue_config('training_5.0.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_5.0.0/closed_rgat.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_5.0.0/closed_gnn.yaml rename to mlperf_logging/compliance_checker/training_5.0.0/closed_rgat.yaml diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml
b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml index 3d874f51..5b4f1a35 100644 --- a/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_5.0.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'gnn', 'llama2_70b_lora'] " + CHECK: " v['value'] in ['retinanet', 'stable_diffusion', 'dlrm_dcnv2', 'bert', 'rgat', 'llama2_70b_lora'] " - POST: " enqueue_config('training_4.1.0/open_{}.yaml'.format(v['value'])) " + POST: " enqueue_config('training_5.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml b/mlperf_logging/compliance_checker/training_5.0.0/open_rgat.yaml similarity index 100% rename from mlperf_logging/compliance_checker/training_5.0.0/open_gnn.yaml rename to mlperf_logging/compliance_checker/training_5.0.0/open_rgat.yaml diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index c9e39057..54904bab 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -53,6 +53,7 @@ GPT3 = "gpt3" LLAMA2_70B_LORA = "llama2_70b_lora" GNN = "gnn" +RGAT = "rgat" # Constant values - model info ADAGRAD = "adagrad" diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index c3479122..d54d00ab 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -25,10 +25,12 @@ 'maskrcnn' : 5, 'resnet' : 5, 'ssd' : 5, + 'retinanet': 5, 'unet3d' : 40, 'rnnt': 10, 'stable_diffusion': 10, - 'gnn': 10, + 'gnn': 10, + 'rgat': 10, 'llama2_70b_lora': 10, }, "hpc": { diff --git a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json b/mlperf_logging/rcp_checker/training_5.0.0/rcps_rgat.json similarity index 84% rename from mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json rename to mlperf_logging/rcp_checker/training_5.0.0/rcps_rgat.json index 54aef9c6..810a7058
100644 --- a/mlperf_logging/rcp_checker/training_5.0.0/rcps_gnn.json +++ b/mlperf_logging/rcp_checker/training_5.0.0/rcps_rgat.json @@ -1,8 +1,8 @@ { - "gnn_ref_4096": + "rgat_ref_4096": { - "Benchmark": "gnn", + "Benchmark": "rgat", "Creator": "NVIDIA", "When": "Reference RCPs before v4.0", "Platform": "1xDGX-A100 and 8xDGX-A100", @@ -17,9 +17,9 @@ 0.80,0.85 ] }, - "gnn_ref_16384": + "rgat_ref_16384": { - "Benchmark": "gnn", + "Benchmark": "rgat", "Creator": "NVIDIA", "When": "Reference RCPs before v4.0", "Platform": "8xDGX-A100", @@ -34,9 +34,9 @@ 0.85,0.90 ] }, - "gnn_ref_32768": + "rgat_ref_32768": { - "Benchmark": "gnn", + "Benchmark": "rgat", "Creator": "Intel", "When": "Reference RCPs before v4.0", "Platform": "16xSPR-2S", @@ -51,9 +51,9 @@ 0.90,0.90 ] }, - "gnn_ref_65536": + "rgat_ref_65536": { - "Benchmark": "gnn", + "Benchmark": "rgat", "Creator": "NVIDIA", "When": "Reference RCPs before v4.0", "Platform": "32xDGX-A100", @@ -69,9 +69,9 @@ ] }, - "gnn_ref_262144": + "rgat_ref_262144": { - "Benchmark": "gnn", + "Benchmark": "rgat", "Creator": "NVIDIA", "When": "Reference RCPs before v4.0", "Platform": "128xDGX-H100", diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 81f2bb12..9a070657 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -90,7 +90,7 @@ columns: retinanet: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] - gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] + rgat: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] default: [" ", " ", " "] hpc: