Merge pull request #359 from drcanchi/gnn_logging_update

Add GNN training benchmark
mlcommons · Mar 28, 2024 · 8510e2b
2 parents 99ba37a + c9f3aae
commit 8510e2b
Show file tree

Hide file tree

Showing 10 changed files with 121 additions and 6 deletions.
diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
@@ -15,6 +15,7 @@
         'ncf': 10,
         'rnnt': 10,
         'unet3d': 40,
+        'gnn' : 10,   
     },
 
     'hpc' : {
@@ -108,6 +109,17 @@
         'rnnt',
         'unet3d',
         'stable_diffusion'
+    ],
+    '4.0': [
+        'bert',
+        'dlrm_dcnv2',
+        'gpt3',
+        'resnet',
+        'ssd',
+        'rnnt',
+        'unet3d',
+        'stable_diffusion',
+        'gnn'
     ]
     },
 

diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -27,6 +27,6 @@ def parse_file(filename, ruleset='0.6.0'):
     elif ruleset == '3.1.0':
         return parse_file_310(filename)
     elif ruleset == '4.0.0':
-        return parse_file_400(filename)
+        return parse_file_400(filename)    
     else:
-        raise Exception(f'Ruleset "{ruleset}" is not supported')
+        raise Exception(f'Ruleset "{ruleset}" is not supported')
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml
@@ -2,10 +2,10 @@
 - KEY:
     NAME:  submission_benchmark
     REQ:   EXACTLY_ONE
-    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
+    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
     POST:  " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) "
 
 - KEY:
     NAME: gradient_accumulation_steps
     REQ: EXACTLY_ONE
-    CHECK: " v['value'] > 0 "
+    CHECK: " v['value'] > 0 "
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml
@@ -0,0 +1,21 @@
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 'adam' "
+
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0.0"
+
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml
@@ -2,5 +2,5 @@
 - KEY:
     NAME:  submission_benchmark
     REQ:   EXACTLY_ONE
-    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
-    POST:  " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
+    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
+    POST:  " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml
@@ -0,0 +1,7 @@
+
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] < 1.0"
diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py
@@ -50,6 +50,7 @@
 UNET3D = "unet3d"
 BERT = "bert"
 GPT3 = "gpt3"
+GNN = "gnn"
 
 # Constant values - model info
 ADAGRAD = "adagrad"

diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py
@@ -28,6 +28,7 @@
         'unet3d' : 40,
         'rnnt': 10,
         'stable_diffusion': 10,
+        'gnn': 10,  
     },
     "hpc": {
         'cosmoflow': 10,

diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json
@@ -0,0 +1,72 @@
+{
+
+  "gnn_ref_4096":
+  {
+    "Benchmark": "gnn",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before v4.0",
+    "Platform": "1xDGX-A100 and 8xDGX-A100",
+    "BS": 4096,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.001
+    },
+    "Epochs to converge": [
+	  0.85,0.75,0.75,0.80,0.80,0.75,
+	  0.75,0.85,0.75,0.75,0.80,0.80,
+	  0.80,0.75,0.80,0.80,0.80,0.80,
+	  0.80,0.85 ]
+  },
+
+  "gnn_ref_16384":
+  {
+    "Benchmark": "gnn",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before v4.0",
+    "Platform": "8xDGX-A100",
+    "BS": 16384,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.002
+    },
+    "Epochs to converge": [
+	  0.85,0.95,0.85,0.80,0.90,0.75,
+	  0.80,0.90,0.90,0.85,0.90,0.85,
+	  0.85,0.85,0.85,0.90,0.85,0.85,
+	  0.85,0.90 ]
+  },
+
+  "gnn_ref_32768":
+  {
+    "Benchmark": "gnn",
+    "Creator": "Intel",
+    "When": "Reference RCPs before v4.0",
+    "Platform": "16xSPR-2S",
+    "BS": 32768,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.002
+    },
+    "Epochs to converge": [
+	  1.00,0.95,0.90,0.95,0.95,1.00,
+	  0.90,0.95,0.95,0.95,1.00,0.90,
+	  0.95,0.95,0.95,0.90,0.95,0.90,
+	  0.90,0.90 ]
+  },
+
+  "gnn_ref_65536":
+  {
+    "Benchmark": "gnn",
+    "Creator": "NVIDIA",
+    "When": "Reference RCPs before v4.0",
+    "Platform": "32xDGX-A100",
+    "BS": 65536,
+    "Hyperparams": {
+      "opt_base_learning_rate": 0.003
+    },
+    "Epochs to converge": [
+	  1.25,1.20,1.25,1.20,1.15,1.15,
+	  1.15,1.20,1.15,1.20,1.25,1.15,
+	  1.20,1.20,1.15,1.25,1.20,1.15,
+	  1.10,1.15
+    ]	  
+  }
+}
+
diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml
@@ -72,6 +72,7 @@ columns:
       rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"]
       unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"]
       stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
+      gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
       default: [" ", " ", " "]
 
   hpc: