Skip to content

Commit

Permalink
Merge pull request #359 from drcanchi/gnn_logging_update
Browse files (browse the repository at this point in the history)
Add GNN training benchmark
  • Loading branch information
hiwotadese authored Mar 28, 2024
2 parents 99ba37a + c9f3aae commit 8510e2b
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 6 deletions.
12 changes: 12 additions & 0 deletions mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
'ncf': 10,
'rnnt': 10,
'unet3d': 40,
'gnn' : 10,
},

'hpc' : {
Expand Down Expand Up @@ -108,6 +109,17 @@
'rnnt',
'unet3d',
'stable_diffusion'
],
'4.0': [
'bert',
'dlrm_dcnv2',
'gpt3',
'resnet',
'ssd',
'rnnt',
'unet3d',
'stable_diffusion',
'gnn'
]
},

Expand Down
4 changes: 2 additions & 2 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ def parse_file(filename, ruleset='0.6.0'):
elif ruleset == '3.1.0':
return parse_file_310(filename)
elif ruleset == '4.0.0':
return parse_file_400(filename)
return parse_file_400(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
raise Exception(f'Ruleset "{ruleset}" is not supported')
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
POST: " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "
CHECK: " v['value'] > 0 "
21 changes: 21 additions & 0 deletions mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compliance rules for the 'gnn' benchmark (training_4.0.0/closed_gnn.yaml).
# Queued by the submission_benchmark rule through
# enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])).
# NOTE(review): REQ values are read here as occurrence counts for the key
# (EXACTLY_ONE / AT_LEAST_ONE) -- confirm against the compliance checker.

# Global batch size: value must be strictly positive.
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

# Optimizer name: only the literal string 'adam' passes the check.
- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adam' "

# Base learning rate: must be non-negative (0.0 itself passes the check).
- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0.0"

# Evaluation accuracy: the CHECK requires 'epoch_num' in the entry's metadata.
# ATLEAST_ONE_CHECK accepts values in [0.72, 1.0).
# NOTE(review): presumably at least one eval_accuracy entry must satisfy it,
# making 0.72 the convergence target -- confirm against the checker.
- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Rule for eval_accuracy in a newly added 7-line rule file.
# NOTE(review): presumably training_4.0.0/open_gnn.yaml, queued through
# enqueue_config('training_4.0.0/open_{}.yaml'...); the file name is not
# shown in this view -- confirm.
# The CHECK requires 'epoch_num' in the entry's metadata.
# ATLEAST_ONE_CHECK only bounds the value above (< 1.0); unlike the rule in
# closed_gnn.yaml, there is no 0.72 lower bound here.
- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] < 1.0"
1 change: 1 addition & 0 deletions mlperf_logging/mllog/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
UNET3D = "unet3d"
BERT = "bert"
GPT3 = "gpt3"
GNN = "gnn"

# Constant values - model info
ADAGRAD = "adagrad"
Expand Down
1 change: 1 addition & 0 deletions mlperf_logging/rcp_checker/rcp_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
'unet3d' : 40,
'rnnt': 10,
'stable_diffusion': 10,
'gnn': 10,
},
"hpc": {
'cosmoflow': 10,
Expand Down
72 changes: 72 additions & 0 deletions mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{

"gnn_ref_4096":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "1xDGX-A100 and 8xDGX-A100",
"BS": 4096,
"Hyperparams": {
"opt_base_learning_rate": 0.001
},
"Epochs to converge": [
0.85,0.75,0.75,0.80,0.80,0.75,
0.75,0.85,0.75,0.75,0.80,0.80,
0.80,0.75,0.80,0.80,0.80,0.80,
0.80,0.85 ]
},

"gnn_ref_16384":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "8xDGX-A100",
"BS": 16384,
"Hyperparams": {
"opt_base_learning_rate": 0.002
},
"Epochs to converge": [
0.85,0.95,0.85,0.80,0.90,0.75,
0.80,0.90,0.90,0.85,0.90,0.85,
0.85,0.85,0.85,0.90,0.85,0.85,
0.85,0.90 ]
},

"gnn_ref_32768":
{
"Benchmark": "gnn",
"Creator": "Intel",
"When": "Reference RCPs before v4.0",
"Platform": "16xSPR-2S",
"BS": 32768,
"Hyperparams": {
"opt_base_learning_rate": 0.002
},
"Epochs to converge": [
1.00,0.95,0.90,0.95,0.95,1.00,
0.90,0.95,0.95,0.95,1.00,0.90,
0.95,0.95,0.95,0.90,0.95,0.90,
0.90,0.90 ]
},

"gnn_ref_65536":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "32xDGX-A100",
"BS": 65536,
"Hyperparams": {
"opt_base_learning_rate": 0.003
},
"Epochs to converge": [
1.25,1.20,1.25,1.20,1.15,1.15,
1.15,1.20,1.15,1.20,1.25,1.15,
1.20,1.20,1.15,1.25,1.20,1.15,
1.10,1.15
]
}
}

1 change: 1 addition & 0 deletions mlperf_logging/result_summarizer/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ columns:
rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"]
unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"]
stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
default: [" ", " ", " "]

hpc:
Expand Down

0 comments on commit 8510e2b

Please sign in to comment.