This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Mingshan/Adding resnet50 validation script #478

Open · wants to merge 17 commits into base: master
Changes from 1 commit
89 changes: 89 additions & 0 deletions test/validate_resnet50/datasets_make_deterministic.patch
@@ -0,0 +1,89 @@
diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
index 09b118e..4cf9a12 100644
--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
@@ -34,6 +34,7 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
+import ngraph_bridge

from google.protobuf import text_format

@@ -2479,6 +2480,7 @@ class BenchmarkCNN(object):
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
+
if global_input_producer_op:
global_input_producer_op = tf.group(*global_input_producer_op)
else:
diff --git a/scripts/tf_cnn_benchmarks/data_utils.py b/scripts/tf_cnn_benchmarks/data_utils.py
index 0376d0b..992ee75 100644
--- a/scripts/tf_cnn_benchmarks/data_utils.py
+++ b/scripts/tf_cnn_benchmarks/data_utils.py
@@ -112,7 +112,10 @@ def create_dataset(batch_size,
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
- ds = tf.data.TFRecordDataset.list_files(file_names)
+
+ # ds = tf.data.TFRecordDataset.list_files(file_names)
+ ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False, seed=10)
+
ds = ds.apply(
interleave_ops.parallel_interleave(
tf.data.TFRecordDataset, cycle_length=10))
@@ -122,8 +125,9 @@ def create_dataset(batch_size,
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
- if train:
- ds = ds.shuffle(buffer_size=10000)
+ # Make dataset loader deterministic
+ # if train:
+ # ds = ds.shuffle(buffer_size=10000)
ds = ds.repeat()
ds = ds.apply(
batching.map_and_batch(
diff --git a/scripts/tf_cnn_benchmarks/preprocessing.py b/scripts/tf_cnn_benchmarks/preprocessing.py
index 6a270b0..4e84a1a 100644
--- a/scripts/tf_cnn_benchmarks/preprocessing.py
+++ b/scripts/tf_cnn_benchmarks/preprocessing.py
@@ -335,9 +335,11 @@ def train_image(image_buffer,
else:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
- image = tf.slice(image, bbox_begin, bbox_size)

- distorted_image = tf.image.random_flip_left_right(image)
+ #image = tf.slice(image, bbox_begin, bbox_size)
+
+ #distorted_image = tf.image.random_flip_left_right(image)
+ distorted_image = image

# This resizing operation may distort the images because the aspect
# ratio is not respected.
@@ -361,7 +363,7 @@ def train_image(image_buffer,
distorted_image = distort_color(distorted_image, batch_position,
distort_color_in_yiq=distort_color_in_yiq)

- # Note: This ensures the scaling matches the output of eval_image
+ #Note: This ensures the scaling matches the output of eval_image
distorted_image *= 255

if summary_verbosity >= 3:
@@ -487,10 +489,11 @@ class RecordInputImagePreprocessor(BaseImagePreprocessor):
"""Preprocessing image_buffer as a function of its batch position."""
if self.train:
image = train_image(image_buffer, self.height, self.width, bbox,
- batch_position, self.resize_method, self.distortions,
+ batch_position, self.resize_method, False,
None, summary_verbosity=self.summary_verbosity,
distort_color_in_yiq=self.distort_color_in_yiq,
- fuse_decode_and_crop=self.fuse_decode_and_crop)
+ #fuse_decode_and_crop=self.fuse_decode_and_crop
+ fuse_decode_and_crop=False)
else:
image = tf.image.decode_jpeg(
image_buffer, channels=3, dct_method='INTEGER_FAST')
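
The data-loader changes above come down to listing the TFRecord shards in a fixed order and removing the training-time shuffle. A minimal sketch of the same idea, assuming a placeholder glob pattern and batch size rather than the benchmark's own helper:

import tensorflow as tf

def deterministic_dataset(glob_pattern, batch_size):
    """Input pipeline that yields records in the same order on every run."""
    # Fixed shard order: shuffle=False plus an explicit seed, mirroring the
    # list_files change in the patch above.
    ds = tf.data.Dataset.list_files(glob_pattern, shuffle=False, seed=10)
    # Sequential interleave keeps the read order reproducible; the patch keeps
    # parallel_interleave, whose default (non-sloppy) mode is also deterministic.
    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=10)
    # No ds.shuffle(buffer_size=10000) here -- that is the call the patch
    # comments out for the training path.
    ds = ds.repeat()
    ds = ds.batch(batch_size)
    return ds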
38 changes: 38 additions & 0 deletions test/validate_resnet50/one_encapsulate.patch
@@ -0,0 +1,38 @@
diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
index 09b118e..d5a4e29 100644
--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
@@ -34,6 +34,7 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
+import ngraph_bridge

from google.protobuf import text_format

@@ -726,13 +727,23 @@ def benchmark_one_step(sess,
summary_str = None
start_time = time.time()
if summary_op is None:
- results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
+ # get a new set of fetch operation
+ new_fetches = {}
+ for f in fetches:
+ if f == "average_loss":
+ continue
+ new_fetches[f] = fetches[f]
+
+ results = sess.run(new_fetches, options=run_options, run_metadata=run_metadata)
+ #results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
else:
(results, summary_str) = sess.run(
[fetches, summary_op], options=run_options, run_metadata=run_metadata)

if not params.forward_only:
- lossval = results['average_loss']
+ # the calculation is removed in the operations to be fetched
+ #lossval = results['average_loss']
+ lossval = 0
else:
lossval = 0.
if image_producer is not None:
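
The only bridge-specific line in either patch is "import ngraph_bridge": importing the module is what activates the bridge, so ops supported by nGraph are clustered into encapsulate ops with no other code changes. The rest of one_encapsulate.patch drops 'average_loss' from the fetched ops, which, going by the patch comments and the file name, keeps the fetched training step to a single encapsulated cluster. A minimal usage sketch under that assumption (the toy graph below is illustrative, not part of the benchmark):

import numpy as np
import tensorflow as tf
import ngraph_bridge  # importing the module is enough to enable the bridge

# A trivial TF 1.x graph; ops supported by nGraph are rewritten into
# encapsulate ops when the session runs.
a = tf.placeholder(tf.float32, shape=(2, 3))
b = tf.constant(np.ones((3, 2), dtype=np.float32))
c = tf.matmul(a, b)

with tf.Session() as sess:
    print(sess.run(c, feed_dict={a: np.ones((2, 3), dtype=np.float32)}))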
53 changes: 53 additions & 0 deletions (new benchmark log; file name not shown)
@@ -0,0 +1,53 @@
W0328 15:31:38.178014 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.experimental.parallel_interleave(...)`.
W0328 15:31:38.197099 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
W0328 15:31:41.318058 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2019-03-28 15:31:41.734261: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-03-28 15:31:42.684223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582
pciBusID: 0000:0c:00.0
totalMemory: 11.91GiB freeMemory: 11.75GiB
2019-03-28 15:31:42.684259: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-03-28 15:31:42.996125: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-03-28 15:31:42.996179: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2019-03-28 15:31:42.996185: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2019-03-28 15:31:42.996527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1)
I0328 15:31:43.550181 140647503410944 tf_logging.py:115] Running local_init_op.
I0328 15:31:49.378333 140647503410944 tf_logging.py:115] Done running local_init_op.
TensorFlow: 1.12
Model: resnet50
Dataset: imagenet
Mode: training
SingleSess: False
Batch size: 32 global
32.0 per device
Num batches: 100
Num epochs: 0.00
Devices: ['/gpu:0']
Data format: NCHW
Optimizer: sgd
Variables: parameter_server
==========
Generating model
Running warm up
Done warm up
Step Img/sec total_loss top_1_accuracy top_5_accuracy
1 images/sec: 217.4 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000
10 images/sec: 215.6 +/- 0.4 (jitter = 1.2) 8.027 0.000 0.000
20 images/sec: 215.0 +/- 0.3 (jitter = 1.0) 8.333 0.000 0.031
30 images/sec: 214.9 +/- 0.3 (jitter = 1.0) 8.215 0.000 0.000
40 images/sec: 214.8 +/- 0.2 (jitter = 1.1) 8.189 0.000 0.000
50 images/sec: 214.6 +/- 0.2 (jitter = 1.3) 8.177 0.000 0.000
60 images/sec: 214.3 +/- 0.2 (jitter = 1.4) 8.112 0.000 0.000
70 images/sec: 214.3 +/- 0.2 (jitter = 1.5) 8.185 0.000 0.000
80 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.120 0.000 0.000
90 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.254 0.000 0.000
100 images/sec: 214.3 +/- 0.2 (jitter = 1.6) 8.093 0.000 0.000
----------------------------------------------------------------
total images/sec: 213.97
----------------------------------------------------------------
63 changes: 63 additions & 0 deletions (new benchmark log; file name not shown)
@@ -0,0 +1,63 @@
W0328 15:32:46.495650 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.experimental.parallel_interleave(...)`.
W0328 15:32:46.520555 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
W0328 15:32:49.878959 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2019-03-28 15:32:50.317750: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-03-28 15:32:51.857166: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582
pciBusID: 0000:0c:00.0
totalMemory: 11.91GiB freeMemory: 11.75GiB
2019-03-28 15:32:51.857212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-03-28 15:32:52.273544: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-03-28 15:32:52.273603: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2019-03-28 15:32:52.273610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2019-03-28 15:32:52.273926: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1)
I0328 15:32:52.851403 140049874691840 tf_logging.py:115] Running local_init_op.
I0328 15:32:58.568639 140049874691840 tf_logging.py:115] Done running local_init_op.
TensorFlow: 1.12
Model: resnet50
Dataset: imagenet
Mode: training
SingleSess: False
Batch size: 32 global
32.0 per device
Num batches: 200
Num epochs: 0.00
Devices: ['/gpu:0']
Data format: NCHW
Optimizer: sgd
Variables: parameter_server
==========
Generating model
Running warm up
Done warm up
Step Img/sec total_loss top_1_accuracy top_5_accuracy
1 images/sec: 217.6 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000
10 images/sec: 216.7 +/- 0.4 (jitter = 0.3) 8.027 0.000 0.000
20 images/sec: 216.6 +/- 0.3 (jitter = 0.6) 8.333 0.000 0.031
30 images/sec: 216.3 +/- 0.3 (jitter = 1.0) 8.216 0.000 0.000
40 images/sec: 216.3 +/- 0.2 (jitter = 1.0) 8.188 0.000 0.000
50 images/sec: 216.1 +/- 0.2 (jitter = 1.2) 8.177 0.000 0.000
60 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.107 0.000 0.000
70 images/sec: 216.4 +/- 0.2 (jitter = 1.0) 8.189 0.000 0.000
80 images/sec: 216.4 +/- 0.2 (jitter = 0.9) 8.114 0.000 0.000
90 images/sec: 216.4 +/- 0.2 (jitter = 0.8) 8.246 0.000 0.000
100 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.081 0.000 0.000
110 images/sec: 216.0 +/- 0.2 (jitter = 1.4) 8.363 0.000 0.000
120 images/sec: 215.7 +/- 0.2 (jitter = 1.8) 8.027 0.000 0.000
130 images/sec: 215.5 +/- 0.2 (jitter = 2.1) 8.323 0.000 0.000
140 images/sec: 215.3 +/- 0.2 (jitter = 2.4) 8.440 0.000 0.000
150 images/sec: 215.2 +/- 0.2 (jitter = 2.8) 8.038 0.000 0.000
160 images/sec: 215.0 +/- 0.2 (jitter = 2.9) 8.059 0.000 0.000
170 images/sec: 214.8 +/- 0.2 (jitter = 3.1) 8.318 0.000 0.000
180 images/sec: 214.7 +/- 0.2 (jitter = 3.0) 8.113 0.000 0.000
190 images/sec: 214.6 +/- 0.2 (jitter = 3.0) 8.010 0.031 0.062
200 images/sec: 214.5 +/- 0.2 (jitter = 2.9) 8.379 0.000 0.000
----------------------------------------------------------------
total images/sec: 214.32
----------------------------------------------------------------
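
The two imagenet runs above (100 and 200 batches) report near-identical per-step losses over the overlapping steps, which is what the deterministic-loader patch is meant to achieve. A small script along these lines could compare the logs automatically; the file names and the 0.05 tolerance are assumptions, not part of this PR:

import re

# Log lines look like:
#   "10  images/sec: 215.6 +/- 0.4 (jitter = 1.2)  8.027  0.000  0.000"
# The step number comes first; total_loss is the first number after the
# jitter group.
STEP_RE = re.compile(r"^\s*(\d+)\s+images/sec:.*\(jitter = [\d.]+\)\s+([\d.]+)")

def step_losses(path):
    """Return {step: total_loss} parsed from a tf_cnn_benchmarks log."""
    losses = {}
    with open(path) as f:
        for line in f:
            m = STEP_RE.match(line)
            if m:
                losses[int(m.group(1))] = float(m.group(2))
    return losses

# Hypothetical file names for the two runs shown above.
run_a = step_losses("resnet50_100_batches.log")
run_b = step_losses("resnet50_200_batches.log")

for step in sorted(set(run_a) & set(run_b)):
    diff = abs(run_a[step] - run_b[step])
    flag = "" if diff < 0.05 else "  <-- diverged"
    print("step %3d  %.3f  %.3f  diff=%.3f%s"
          % (step, run_a[step], run_b[step], diff, flag))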
57 changes: 57 additions & 0 deletions (new benchmark log; file name not shown)
@@ -0,0 +1,57 @@
W0328 16:15:34.093370 140329842439936 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2019-03-28 16:15:34.516200: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-03-28 16:15:36.297633: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582
pciBusID: 0000:0c:00.0
totalMemory: 11.91GiB freeMemory: 11.75GiB
2019-03-28 16:15:36.297676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-03-28 16:15:36.626773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-03-28 16:15:36.626811: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2019-03-28 16:15:36.626818: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2019-03-28 16:15:36.627131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1)
I0328 16:15:37.193925 140329842439936 tf_logging.py:115] Running local_init_op.
I0328 16:15:37.224961 140329842439936 tf_logging.py:115] Done running local_init_op.
TensorFlow: 1.12
Model: resnet50
Dataset: imagenet (synthetic)
Mode: training
SingleSess: False
Batch size: 32 global
32.0 per device
Num batches: 200
Num epochs: 0.00
Devices: ['/gpu:0']
Data format: NCHW
Optimizer: sgd
Variables: parameter_server
==========
Generating model
Running warm up
Done warm up
Step Img/sec total_loss top_1_accuracy top_5_accuracy
1 images/sec: 219.2 +/- 0.0 (jitter = 0.0) 8.229 0.000 0.000
10 images/sec: 218.4 +/- 0.3 (jitter = 1.1) 8.305 0.000 0.000
20 images/sec: 218.8 +/- 0.3 (jitter = 1.2) 7.921 0.000 0.000
30 images/sec: 218.7 +/- 0.3 (jitter = 1.4) 8.055 0.000 0.000
40 images/sec: 218.0 +/- 0.3 (jitter = 2.3) 8.293 0.000 0.000
50 images/sec: 217.9 +/- 0.3 (jitter = 1.9) 8.092 0.000 0.000
60 images/sec: 218.0 +/- 0.2 (jitter = 1.8) 8.082 0.000 0.000
70 images/sec: 218.1 +/- 0.2 (jitter = 1.7) 8.270 0.000 0.000
80 images/sec: 218.2 +/- 0.2 (jitter = 1.5) 8.177 0.000 0.000
90 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 7.983 0.031 0.031
100 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 8.488 0.000 0.000
110 images/sec: 218.1 +/- 0.1 (jitter = 1.6) 8.207 0.000 0.000
120 images/sec: 218.1 +/- 0.1 (jitter = 1.5) 7.931 0.000 0.000
130 images/sec: 218.0 +/- 0.1 (jitter = 1.5) 8.370 0.000 0.000
140 images/sec: 218.0 +/- 0.1 (jitter = 1.3) 8.345 0.000 0.000
150 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.192 0.000 0.031
160 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.313 0.031 0.031
170 images/sec: 217.9 +/- 0.1 (jitter = 1.1) 8.381 0.000 0.000
180 images/sec: 217.9 +/- 0.1 (jitter = 1.0) 8.061 0.031 0.031
190 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.239 0.000 0.031
200 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.045 0.000 0.000
----------------------------------------------------------------
total images/sec: 217.64
----------------------------------------------------------------