From 740a50efc55d175bd95e605d7fc1e037ca640b9b Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 25 Jul 2019 17:02:16 -0700 Subject: [PATCH 01/37] initial support for tf2 --- .../mnist/estimator/mnist_estimator_2.0.py | 96 ++++ examples/mnist/keras/mnist_keras_2.0.py | 83 ++++ requirements.txt | 2 +- tensorflowonspark/pipeline.py | 64 ++- test/test_TFCluster.py | 89 ++-- test/test_pipeline.py | 411 +++++++----------- 6 files changed, 382 insertions(+), 363 deletions(-) create mode 100644 examples/mnist/estimator/mnist_estimator_2.0.py create mode 100644 examples/mnist/keras/mnist_keras_2.0.py diff --git a/examples/mnist/estimator/mnist_estimator_2.0.py b/examples/mnist/estimator/mnist_estimator_2.0.py new file mode 100644 index 00000000..750a9d08 --- /dev/null +++ b/examples/mnist/estimator/mnist_estimator_2.0.py @@ -0,0 +1,96 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main(args, ctx): + import tensorflow_datasets as tfds + import tensorflow as tf + + BUFFER_SIZE = args.buffer_size + BATCH_SIZE = args.batch_size + LEARNING_RATE = args.learning_rate + + def input_fn(mode, input_context=None): + datasets, info = tfds.load(name='mnist', + with_info=True, + as_supervised=True) + mnist_dataset = (datasets['train'] if mode == tf.estimator.ModeKeys.TRAIN else + datasets['test']) + + def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + return image, label + + if input_context: + mnist_dataset = mnist_dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + return mnist_dataset.repeat(2).map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE) + + def model_fn(features, labels, mode): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + 
tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + logits = model(features, training=False) + + if mode == tf.estimator.ModeKeys.PREDICT: + predictions = {'logits': logits} + return tf.estimator.EstimatorSpec(labels=labels, predictions=predictions) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate=LEARNING_RATE) + loss = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits) + loss = tf.reduce_sum(loss) * (1. / BATCH_SIZE) + if mode == tf.estimator.ModeKeys.EVAL: + return tf.estimator.EstimatorSpec(mode, loss=loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + loss=loss, + train_op=optimizer.minimize( + loss, tf.compat.v1.train.get_or_create_global_step())) + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + config = tf.estimator.RunConfig(train_distribute=strategy) + + classifier = tf.estimator.Estimator( + model_fn=model_fn, model_dir=args.model_dir, config=config) + + tf.estimator.train_and_evaluate( + classifier, + train_spec=tf.estimator.TrainSpec(input_fn=input_fn), + eval_spec=tf.estimator.EvalSpec(input_fn=input_fn) + ) + + +if __name__ == "__main__": + # tf.app.run() + + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + import argparse + + sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--learning_rate", 
help="learning rate", type=float, default=1e-4) + parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief') + cluster.shutdown() diff --git a/examples/mnist/keras/mnist_keras_2.0.py b/examples/mnist/keras/mnist_keras_2.0.py new file mode 100644 index 00000000..bf2c708c --- /dev/null +++ b/examples/mnist/keras/mnist_keras_2.0.py @@ -0,0 +1,83 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_keras + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main_fun(args, ctx): + import tensorflow_datasets as tfds + import tensorflow as tf + tfds.disable_progress_bar() + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + BUFFER_SIZE = args.buffer_size + BATCH_SIZE = args.batch_size + NUM_WORKERS = args.cluster_size + + # Scaling MNIST data from (0, 255] to (0., 1.] 
+ def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + return image, label + + datasets, info = tfds.load(name='mnist', + with_info=True, + as_supervised=True) + + train_datasets_unbatched = datasets['train'].repeat().map(scale).shuffle(BUFFER_SIZE) + + def build_and_compile_cnn_model(): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile( + loss=tf.keras.losses.sparse_categorical_crossentropy, + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=['accuracy']) + return model + + # single node + # single_worker_model = build_and_compile_cnn_model() + # single_worker_model.fit(x=train_datasets, epochs=3) + + # Here the batch size scales up by number of workers since + # `tf.data.Dataset.batch` expects the global batch size. Previously we used 64, + # and now this becomes 128. 
+ GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS + train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE) + with strategy.scope(): + multi_worker_model = build_and_compile_cnn_model() + + if ctx.job_name == 'chief': + # multi_worker_model.save(args.model_dir, save_format='tf') + tf.keras.experimental.export_saved_model(multi_worker_model, args.model_dir) + + +if __name__ == '__main__': + import argparse + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + + sc = SparkContext(conf=SparkConf().setAppName("mnist_keras")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=5) + parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, master_node='chief') + cluster.shutdown() diff --git a/requirements.txt b/requirements.txt index 1645c6dd..142c3519 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ py4j pyspark scipy sphinx -tensorflow +tensorflow==2.0.0-beta1 diff --git a/tensorflowonspark/pipeline.py b/tensorflowonspark/pipeline.py index 
3fd91833..6ba32336 100755 --- a/tensorflowonspark/pipeline.py +++ b/tensorflowonspark/pipeline.py @@ -23,15 +23,14 @@ from pyspark.sql import Row, SparkSession import tensorflow as tf -from tensorflow.contrib.saved_model.python.saved_model import reader +# from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.python.saved_model import loader -from . import TFCluster, gpu_info, dfutil, util +from tensorflow.python.tools import saved_model_utils +from . import TFCluster, dfutil, util import argparse import copy import logging -import os -import subprocess import sys @@ -112,6 +111,19 @@ def getInputMode(self): return self.getOrDefault(self.input_mode) +class HasMasterNode(Params): + master_node = Param(Params._dummy(), "master_node", "Job name of master/chief worker node", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasMasterNode, self).__init__() + + def setMasterNode(self, value): + return self._set(master_node=value) + + def getMasterNode(self): + return self.getOrDefault(self.master_node) + + class HasModelDir(Params): model_dir = Param(Params._dummy(), "model_dir", "Path to save/load model checkpoints", typeConverter=TypeConverters.toString) @@ -321,7 +333,7 @@ def merge_args_params(self): class TFEstimator(Estimator, TFParams, HasInputMapping, - HasClusterSize, HasNumPS, HasInputMode, HasProtocol, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, + HasClusterSize, HasNumPS, HasInputMode, HasMasterNode, HasProtocol, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, HasBatchSize, HasEpochs, HasReaders, HasSteps): """Spark ML Estimator which launches a TensorFlowOnSpark cluster for distributed training. 
@@ -355,6 +367,7 @@ def __init__(self, train_fn, tf_args, export_fn=None): num_ps=0, driver_ps_nodes=False, input_mode=TFCluster.InputMode.SPARK, + master_node='chief', protocol='grpc', tensorboard=False, model_dir=None, @@ -398,7 +411,7 @@ def _fit(self, dataset): tf_args = self.args.argv if self.args.argv else local_args cluster = TFCluster.run(sc, self.train_fn, tf_args, local_args.cluster_size, local_args.num_ps, - local_args.tensorboard, local_args.input_mode, driver_ps_nodes=local_args.driver_ps_nodes) + local_args.tensorboard, local_args.input_mode, master_node=local_args.master_node, driver_ps_nodes=local_args.driver_ps_nodes) if local_args.input_mode == TFCluster.InputMode.SPARK: # feed data, using a deterministic order for input columns (lexicographic by key) input_cols = sorted(self.getInputMapping()) @@ -493,6 +506,8 @@ def _run_model(iterator, args, tf_args): """ single_node_env(tf_args) + tf.compat.v1.disable_eager_execution() + logging.info("===== input_mapping: {}".format(args.input_mapping)) logging.info("===== output_mapping: {}".format(args.output_mapping)) input_tensor_names = [tensor for col, tensor in sorted(args.input_mapping.items())] @@ -502,7 +517,7 @@ def _run_model(iterator, args, tf_args): if args.signature_def_key: assert args.export_dir, "Inferencing with signature_def_key requires --export_dir argument" logging.info("===== loading meta_graph_def for tag_set ({0}) from saved_model: {1}".format(args.tag_set, args.export_dir)) - meta_graph_def = get_meta_graph_def(args.export_dir, args.tag_set) + meta_graph_def = saved_model_utils.get_meta_graph_def(args.export_dir, args.tag_set) signature = meta_graph_def.signature_def[args.signature_def_key] logging.debug("signature: {}".format(signature)) inputs_tensor_info = signature.inputs @@ -518,29 +533,22 @@ def _run_model(iterator, args, tf_args): sess = global_sess else: # otherwise, create new session and load graph from disk - tf.reset_default_graph() - sess = 
tf.Session(graph=tf.get_default_graph()) + tf.compat.v1.reset_default_graph() + sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph()) if args.export_dir: assert args.tag_set, "Inferencing from a saved_model requires --tag_set" # load graph from a saved_model logging.info("===== restoring from saved_model: {}".format(args.export_dir)) loader.load(sess, args.tag_set.split(','), args.export_dir) - elif args.model_dir: - # load graph from a checkpoint - ckpt = tf.train.latest_checkpoint(args.model_dir) - assert ckpt, "Invalid model checkpoint path: {}".format(args.model_dir) - logging.info("===== restoring from checkpoint: {}".format(ckpt + ".meta")) - saver = tf.train.import_meta_graph(ckpt + ".meta", clear_devices=True) - saver.restore(sess, ckpt) else: - raise Exception("Inferencing requires either --model_dir or --export_dir argument") + raise Exception("Inferencing requires --export_dir argument") global_sess = sess global_args = args # get list of input/output tensors (by name) if args.signature_def_key: input_tensors = [inputs_tensor_info[t].name for t in input_tensor_names] - output_tensors = [outputs_tensor_info[t].name for t in output_tensor_names] + output_tensors = [outputs_tensor_info[output_tensor_names[0]].name] else: input_tensors = [t + ':0' for t in input_tensor_names] output_tensors = [t + ':0' for t in output_tensor_names] @@ -581,26 +589,6 @@ def single_node_env(args): util.single_node_env(num_gpus) -def get_meta_graph_def(saved_model_dir, tag_set): - """Utility function to read a meta_graph_def from disk. - - From `saved_model_cli.py `_ - - Args: - :saved_model_dir: path to saved_model. - :tag_set: list of string tags identifying the TensorFlow graph within the saved_model. - - Returns: - A TensorFlow meta_graph_def, or raises an Exception otherwise. 
- """ - saved_model = reader.read_saved_model(saved_model_dir) - set_of_tags = set(tag_set.split(',')) - for meta_graph_def in saved_model.meta_graphs: - if set(meta_graph_def.meta_info_def.tags) == set_of_tags: - return meta_graph_def - raise RuntimeError("MetaGraphDef associated with tag-set {0} could not be found in SavedModel".format(tag_set)) - - def yield_batch(iterable, batch_size, num_tensors=1): """Generator that yields batches of a DataFrame iterator. diff --git a/test/test_TFCluster.py b/test/test_TFCluster.py index b4c73f19..2540320b 100644 --- a/test/test_TFCluster.py +++ b/test/test_TFCluster.py @@ -19,10 +19,8 @@ def _map_fun(args, ctx): import tensorflow as tf x = tf.constant(args['x']) y = tf.constant(args['y']) - sum = tf.add(x, y) - with tf.Session() as sess: - result = sess.run([sum]) - assert result[0] == 3 + sum = tf.math.add(x, y) + assert sum.numpy() == 3 args = {'x': 1, 'y': 2} cluster = TFCluster.run(self.sc, _map_fun, tf_args=args, num_executors=self.num_workers, num_ps=0) @@ -32,23 +30,14 @@ def test_inputmode_spark(self): """Distributed TF cluster w/ InputMode.SPARK""" def _map_fun(args, ctx): import tensorflow as tf - cluster, server = TFNode.start_cluster_server(ctx) - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - x = tf.placeholder(tf.int32, [None, 1]) - sq = tf.square(x) - init_op = tf.global_variables_initializer() - with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess: - tf_feed = TFNode.DataFeed(ctx.mgr, False) - while not sess.should_stop() and not tf_feed.should_stop(): - batch = tf_feed.next_batch(10) - if len(batch) > 0: - outputs = sess.run([sq], feed_dict={x: batch}) - tf_feed.batch_results(outputs[0]) + + tf_feed = TFNode.DataFeed(ctx.mgr, False) + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(batch_size=10) + print("batch: 
{}".format(batch)) + squares = tf.math.square(batch) + print("squares: {}".format(squares)) + tf_feed.batch_results(squares.numpy()) input = [[x] for x in range(1000)] # set up input as tensors of shape [1] to match placeholder rdd = self.sc.parallelize(input, 10) @@ -62,24 +51,14 @@ def test_inputmode_spark_exception(self): """Distributed TF cluster w/ InputMode.SPARK and exception during feeding""" def _map_fun(args, ctx): import tensorflow as tf - cluster, server = TFNode.start_cluster_server(ctx) - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - x = tf.placeholder(tf.int32, [None, 1]) - sq = tf.square(x) - init_op = tf.global_variables_initializer() - with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess: - tf_feed = TFNode.DataFeed(ctx.mgr, False) - while not sess.should_stop() and not tf_feed.should_stop(): - batch = tf_feed.next_batch(10) - if len(batch) > 0: - outputs = sess.run([sq], feed_dict={x: batch}) - tf_feed.batch_results(outputs[0]) - raise Exception("FAKE exception during feeding") + + tf_feed = TFNode.DataFeed(ctx.mgr, False) + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(10) + if len(batch) > 0: + squares = tf.math.square(batch) + tf_feed.batch_results(squares.numpy()) + raise Exception("FAKE exception during feeding") input = [[x] for x in range(1000)] # set up input as tensors of shape [1] to match placeholder rdd = self.sc.parallelize(input, 10) @@ -92,27 +71,17 @@ def test_inputmode_spark_late_exception(self): """Distributed TF cluster w/ InputMode.SPARK and exception after feeding""" def _map_fun(args, ctx): import tensorflow as tf - cluster, server = TFNode.start_cluster_server(ctx) - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % 
ctx.task_index, - cluster=cluster)): - x = tf.placeholder(tf.int32, [None, 1]) - sq = tf.square(x) - init_op = tf.global_variables_initializer() - with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess: - tf_feed = TFNode.DataFeed(ctx.mgr, False) - while not sess.should_stop() and not tf_feed.should_stop(): - batch = tf_feed.next_batch(10) - if len(batch) > 0: - outputs = sess.run([sq], feed_dict={x: batch}) - tf_feed.batch_results(outputs[0]) - - # simulate post-feed actions that raise an exception - time.sleep(2) - raise Exception("FAKE exception after feeding") + + tf_feed = TFNode.DataFeed(ctx.mgr, False) + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(10) + if len(batch) > 0: + squares = tf.math.square(batch) + tf_feed.batch_results(squares.numpy()) + + # simulate post-feed actions that raise an exception + time.sleep(2) + raise Exception("FAKE exception after feeding") input = [[x] for x in range(1000)] # set up input as tensors of shape [1] to match placeholder rdd = self.sc.parallelize(input, 10) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 0950260c..ed9ff1bd 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -1,4 +1,3 @@ -from datetime import datetime import numpy as np import os import scipy @@ -9,6 +8,8 @@ from tensorflowonspark import TFCluster, dfutil from tensorflowonspark.pipeline import HasBatchSize, HasSteps, Namespace, TFEstimator, TFParams +from tensorflow.keras import Sequential +from tensorflow.keras.layers import Dense class PipelineTest(test.SparkTest): @@ -87,39 +88,11 @@ def __init__(self, args): expected_args = Namespace({'a': 1, 'b': 2, 'batch_size': 10, 'steps': 100}) self.assertEqual(combined_args, expected_args) - def test_spark_checkpoint(self): - """InputMode.SPARK TFEstimator w/ TFModel inferencing directly from model checkpoint""" - - # create a Spark DataFrame of training examples (features, labels) - trainDF = 
self.spark.createDataFrame(self.train_examples, ['col1', 'col2']) - - # train model - args = {} - estimator = TFEstimator(self.get_function('spark/train'), args) \ - .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ - .setModelDir(self.model_dir) \ - .setClusterSize(self.num_workers) \ - .setNumPS(1) \ - .setBatchSize(10) \ - .setEpochs(2) - model = estimator.fit(trainDF) - self.assertTrue(os.path.isdir(self.model_dir)) - - # create a Spark DataFrame of test examples (features, labels) - testDF = self.spark.createDataFrame(self.test_examples, ['c1', 'c2']) - - # test model from checkpoint, referencing tensors directly - model.setInputMapping({'c1': 'x'}) \ - .setOutputMapping({'y': 'cout'}) - preds = model.transform(testDF).head() # take first/only result, e.g. [ Row(cout=[4.758000373840332])] - pred = preds.cout[0] # unpack scalar from tensor - self.assertAlmostEqual(pred, np.sum(self.weights), 5) - def test_spark_saved_model(self): """InputMode.SPARK TFEstimator w/ explicit saved_model export for TFModel inferencing""" # create a Spark DataFrame of training examples (features, labels) - trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']) + trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']).repartition(3) # train and export model args = {} @@ -128,9 +101,9 @@ def test_spark_saved_model(self): .setModelDir(self.model_dir) \ .setExportDir(self.export_dir) \ .setClusterSize(self.num_workers) \ - .setNumPS(1) \ - .setBatchSize(10) \ - .setEpochs(2) + .setNumPS(0) \ + .setBatchSize(1) \ + .setEpochs(1) model = estimator.fit(trainDF) self.assertTrue(os.path.isdir(self.export_dir)) @@ -138,101 +111,99 @@ def test_spark_saved_model(self): testDF = self.spark.createDataFrame(self.test_examples, ['c1', 'c2']) # test saved_model using exported signature - model.setTagSet('test_tag') \ - .setSignatureDefKey('test_key') \ - .setInputMapping({'c1': 'features'}) \ - .setOutputMapping({'prediction': 'cout'}) + 
model.setTagSet('serve') \ + .setSignatureDefKey('serving_default') \ + .setInputMapping({'c1': 'dense_input'}) \ + .setOutputMapping({'dense': 'cout'}) preds = model.transform(testDF).head() # take first/only result pred = preds.cout[0] # unpack scalar from tensor expected = np.sum(self.weights) - self.assertAlmostEqual(pred, expected, 5) + self.assertAlmostEqual(pred, expected, 3) # test saved_model using custom/direct mapping - model.setTagSet('test_tag') \ + model.setTagSet('serve') \ .setSignatureDefKey(None) \ - .setInputMapping({'c1': 'x'}) \ - .setOutputMapping({'y': 'cout1', 'y2': 'cout2'}) + .setInputMapping({'c1': 'dense_input'}) \ + .setOutputMapping({'dense/BiasAdd': 'cout'}) preds = model.transform(testDF).head() # take first/only result - pred = preds.cout1[0] # unpack pred scalar from tensor - squared_pred = preds.cout2[0] # unpack squared pred from tensor - - self.assertAlmostEqual(pred, expected, 5) - self.assertAlmostEqual(squared_pred, expected * expected, 5) - - def test_spark_sparse_tensor(self): - """InputMode.SPARK feeding sparse tensors""" - def sparse_train(args, ctx): - import tensorflow as tf - - # reset graph in case we're re-using a Spark python worker (during tests) - tf.reset_default_graph() - - cluster, server = ctx.start_cluster_server(ctx) - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - y_ = tf.placeholder(tf.float32, name='y_label') - label = tf.identity(y_, name='label') - - row_indices = tf.placeholder(tf.int64, name='x_row_indices') - col_indices = tf.placeholder(tf.int64, name='x_col_indices') - values = tf.placeholder(tf.float32, name='x_values') - indices = tf.stack([row_indices[0], col_indices[0]], axis=1) - data = values[0] - - x = tf.SparseTensor(indices=indices, values=data, dense_shape=[args.batch_size, 10]) - w = tf.Variable(tf.truncated_normal([10, 1]), name='w') 
- y = tf.sparse_tensor_dense_matmul(x, w, name='y') - - global_step = tf.train.get_or_create_global_step() - cost = tf.reduce_mean(tf.square(y_ - y), name='cost') - optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(cost, global_step) - - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(ctx.task_index == 0), - checkpoint_dir=args.model_dir, - save_checkpoint_steps=20) as sess: - tf_feed = ctx.get_data_feed(input_mapping=args.input_mapping) - while not sess.should_stop() and not tf_feed.should_stop(): - batch = tf_feed.next_batch(args.batch_size) - if len(batch['y_label']) > 0: - print("batch: {}".format(batch)) - feed = {y_: batch['y_label'], - row_indices: batch['x_row_indices'], - col_indices: batch['x_col_indices'], - values: batch['x_values']} - _, pred, trained_weights = sess.run([optimizer, y, w], feed_dict=feed) - print("trained_weights: {}".format(trained_weights)) - - # wait for MonitoredTrainingSession to save last checkpoint - time.sleep(10) - - args = {} - estimator = TFEstimator(sparse_train, args) \ - .setInputMapping({'labels': 'y_label', 'row_indices': 'x_row_indices', 'col_indices': 'x_col_indices', 'values': 'x_values'}) \ - .setInputMode(TFCluster.InputMode.SPARK) \ - .setModelDir(self.model_dir) \ - .setClusterSize(self.num_workers) \ - .setNumPS(1) \ - .setBatchSize(1) - - model_weights = np.array([[1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0]]).T - examples = [scipy.sparse.random(1, 10, density=0.5,) for i in range(200)] - rdd = self.sc.parallelize(examples).map(lambda e: ((e * model_weights).tolist()[0][0], e.row.tolist(), e.col.tolist(), e.data.tolist())) - df = rdd.toDF(["labels", "row_indices", "col_indices", "values"]) - df.show(5) - model = estimator.fit(df) - - model.setOutputMapping({'label': 'label', 'y/SparseTensorDenseMatMul': 'predictions'}) - test_examples = [scipy.sparse.random(1, 10, density=0.5,) for i in range(50)] - test_rdd = self.sc.parallelize(test_examples).map(lambda e: ((e * 
model_weights).tolist()[0][0], e.row.tolist(), e.col.tolist(), e.data.tolist())) - test_df = test_rdd.toDF(["labels", "row_indices", "col_indices", "values"]) - test_df.show(5) - preds = model.transform(test_df) - preds.show(5) + pred = preds.cout[0] # unpack pred scalar from tensor + self.assertAlmostEqual(pred, expected, 3) + +# def test_spark_sparse_tensor(self): +# """InputMode.SPARK feeding sparse tensors""" +# def sparse_train(args, ctx): +# import tensorflow as tf +# +# # reset graph in case we're re-using a Spark python worker (during tests) +# tf.compat.v1.reset_default_graph() +# +# cluster, server = ctx.start_cluster_server(ctx) +# if ctx.job_name == "ps": +# server.join() +# elif ctx.job_name == "worker": +# with tf.device(tf.compat.v1.train.replica_device_setter( +# worker_device="/job:worker/task:%d" % ctx.task_index, +# cluster=cluster)): +# y_ = tf.compat.v1.placeholder(tf.float32, name='y_label') +# label = tf.identity(y_, name='label') +# +# row_indices = tf.compat.v1.placeholder(tf.int64, name='x_row_indices') +# col_indices = tf.compat.v1.placeholder(tf.int64, name='x_col_indices') +# values = tf.compat.v1.placeholder(tf.float32, name='x_values') +# indices = tf.stack([row_indices[0], col_indices[0]], axis=1) +# data = values[0] +# +# x = tf.SparseTensor(indices=indices, values=data, dense_shape=[args.batch_size, 10]) +# w = tf.Variable(tf.random.truncated_normal([10, 1]), name='w') +# y = tf.sparse.sparse_dense_matmul(x, w, name='y') +# +# global_step = tf.compat.v1.train.get_or_create_global_step() +# cost = tf.reduce_mean(input_tensor=tf.square(y_ - y), name='cost') +# optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1).minimize(cost, global_step) +# +# with tf.compat.v1.train.MonitoredTrainingSession(master=server.target, +# is_chief=(ctx.task_index == 0), +# checkpoint_dir=args.model_dir, +# save_checkpoint_steps=20) as sess: +# tf_feed = ctx.get_data_feed(input_mapping=args.input_mapping) +# while not sess.should_stop() and not 
tf_feed.should_stop(): +# batch = tf_feed.next_batch(args.batch_size) +# if len(batch) > 0: +# print("batch: {}".format(batch)) +# feed = {y_: batch['y_label'], +# row_indices: batch['x_row_indices'], +# col_indices: batch['x_col_indices'], +# values: batch['x_values']} +# _, pred, trained_weights = sess.run([optimizer, y, w], feed_dict=feed) +# print("trained_weights: {}".format(trained_weights)) +# sess.close() +# +# # wait for MonitoredTrainingSession to save last checkpoint +# time.sleep(10) +# +# args = {} +# estimator = TFEstimator(sparse_train, args) \ +# .setInputMapping({'labels': 'y_label', 'row_indices': 'x_row_indices', 'col_indices': 'x_col_indices', 'values': 'x_values'}) \ +# .setInputMode(TFCluster.InputMode.SPARK) \ +# .setModelDir(self.model_dir) \ +# .setClusterSize(self.num_workers) \ +# .setNumPS(1) \ +# .setBatchSize(1) +# +# model_weights = np.array([[1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0]]).T +# examples = [scipy.sparse.random(1, 10, density=0.5,) for i in range(200)] +# rdd = self.sc.parallelize(examples).map(lambda e: ((e * model_weights).tolist()[0][0], e.row.tolist(), e.col.tolist(), e.data.tolist())) +# df = rdd.toDF(["labels", "row_indices", "col_indices", "values"]) +# df.show(5) +# model = estimator.fit(df) +# +# model.setOutputMapping({'label': 'label', 'y/SparseTensorDenseMatMul': 'predictions'}) +# test_examples = [scipy.sparse.random(1, 10, density=0.5,) for i in range(50)] +# test_rdd = self.sc.parallelize(test_examples).map(lambda e: ((e * model_weights).tolist()[0][0], e.row.tolist(), e.col.tolist(), e.data.tolist())) +# test_df = test_rdd.toDF(["labels", "row_indices", "col_indices", "values"]) +# test_df.show(5) +# preds = model.transform(test_df) +# preds.show(5) def test_tf_column_filter(self): """InputMode.TENSORFLOW TFEstimator saving temporary TFRecords, filtered by input_mapping columns""" @@ -244,26 +215,29 @@ def test_tf_column_filter(self): df = trainDF.withColumn('extra1', trainDF.col1) df = 
df.withColumn('extra2', trainDF.col2) self.assertEqual(len(df.columns), 4) + df.show() - # train model + # train model on selected columns args = {} - estimator = TFEstimator(self.get_function('tf/train'), args, export_fn=self.get_function('tf/export')) \ + estimator = TFEstimator(self.get_function('tf/train'), args) \ .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setModelDir(self.model_dir) \ .setExportDir(self.export_dir) \ .setTFRecordDir(self.tfrecord_dir) \ .setClusterSize(self.num_workers) \ .setNumPS(1) \ .setBatchSize(10) estimator.fit(df) - self.assertTrue(os.path.isdir(self.model_dir)) + self.assertTrue(os.path.isdir(self.export_dir)) self.assertTrue(os.path.isdir(self.tfrecord_dir)) + # verify that temporarily-saved TFRecords have the columns we requested df_tmp = dfutil.loadTFRecords(self.sc, self.tfrecord_dir) + df_tmp.show() + self.assertEqual(df_tmp.columns, ['col1', 'col2']) - def test_tf_checkpoint_with_export_fn(self): + def test_tf_saved_model(self): """InputMode.TENSORFLOW TFEstimator w/ a separate saved_model export function to add placeholders for InputMode.SPARK TFModel inferencing""" # create a Spark DataFrame of training examples (features, labels) @@ -271,30 +245,28 @@ def test_tf_checkpoint_with_export_fn(self): # train model args = {} - estimator = TFEstimator(self.get_function('tf/train'), args, export_fn=self.get_function('tf/export')) \ + estimator = TFEstimator(self.get_function('tf/train'), args) \ .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setModelDir(self.model_dir) \ .setExportDir(self.export_dir) \ .setTFRecordDir(self.tfrecord_dir) \ .setClusterSize(self.num_workers) \ .setNumPS(1) \ .setBatchSize(10) model = estimator.fit(trainDF) - self.assertTrue(os.path.isdir(self.model_dir)) self.assertTrue(os.path.isdir(self.export_dir)) # create a Spark DataFrame of test examples (features, labels) testDF = 
self.spark.createDataFrame(self.test_examples, ['c1', 'c2']) - # test model from checkpoint, referencing tensors directly - model.setTagSet('test_tag') \ - .setInputMapping({'c1': 'x'}) \ - .setOutputMapping({'y': 'cout1', 'y2': 'cout2'}) + # test from saved_model + model.setTagSet('serve') \ + .setSignatureDefKey('serving_default') \ + .setInputMapping({'c1': 'dense_input'}) \ + .setOutputMapping({'dense': 'cout'}) preds = model.transform(testDF).head() # take first/only result, e.g. [ Row(cout=[4.758000373840332])] - pred1, pred2 = preds.cout1[0], preds.cout2[0] + pred1 = preds.cout[0] self.assertAlmostEqual(pred1, np.sum(self.weights), 5) - self.assertAlmostEqual(pred2, np.sum(self.weights) ** 2, 5) def get_function(self, name): """Returns a TF map_function for tests (required to avoid serializing the parent module/class)""" @@ -304,147 +276,58 @@ def _spark_train(args, ctx): import tensorflow as tf from tensorflowonspark import TFNode - class ExportHook(tf.train.SessionRunHook): - def __init__(self, export_dir, input_tensor, output_tensor): - self.export_dir = export_dir - self.input_tensor = input_tensor - self.output_tensor = output_tensor - - def end(self, session): - print("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir)) - signatures = { - "test_key": { - 'inputs': {'features': self.input_tensor}, - 'outputs': {'prediction': self.output_tensor}, - 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME - } - } - TFNode.export_saved_model(session, - self.export_dir, - "test_tag", - signatures) - print("{} ======= Done exporting".format(datetime.now().isoformat())) - - tf.reset_default_graph() # reset graph in case we're re-using a Spark python worker - - cluster, server = TFNode.start_cluster_server(ctx) - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - x = 
tf.placeholder(tf.float32, [None, 2], name='x') - y_ = tf.placeholder(tf.float32, [None, 1], name='y_') - w = tf.Variable(tf.truncated_normal([2, 1]), name='w') - y = tf.matmul(x, w, name='y') - y2 = tf.square(y, name="y2") # extra/optional output for testing multiple output tensors - global_step = tf.train.get_or_create_global_step() - cost = tf.reduce_mean(tf.square(y_ - y), name='cost') - optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step) - - chief_hooks = [ExportHook(ctx.absolute_path(args.export_dir), x, y)] if args.export_dir else [] - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(ctx.task_index == 0), - checkpoint_dir=args.model_dir, - chief_only_hooks=chief_hooks) as sess: - tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) - while not sess.should_stop() and not tf_feed.should_stop(): - batch = tf_feed.next_batch(10) - if args.input_mapping: - if len(batch['x']) > 0: - feed = {x: batch['x'], y_: batch['y_']} - sess.run(optimizer, feed_dict=feed) + model = Sequential() + model.add(Dense(1, activation='linear', input_shape=(2,))) + model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) + model.summary() + + tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(args.batch_size) + if args.input_mapping: + if len(batch['x']) > 0: + model.fit(np.array(batch['x']), np.array(batch['y_'])) + + if ctx.job_name == 'chief': + print("saving checkpoint to: {}".format(args.model_dir)) + tf.saved_model.save(model, args.model_dir) + # model.save_weights(args.model_dir + "/model", overwrite=True, save_format='tf') + + if args.export_dir: + print("exporting model to: {}".format(args.export_dir)) + tf.keras.experimental.export_saved_model(model, args.export_dir) def _tf_train(args, ctx): """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW""" import tensorflow as tf 
- tf.reset_default_graph() # reset graph in case we're re-using a Spark python worker - - cluster, server = ctx.start_cluster_server() - - def _get_examples(batch_size): - """Generate test data (mocking a queue_runner of file inputs)""" - features = tf.random_uniform([batch_size, 2]) # (batch_size x 2) - weights = tf.constant([[3.14], [1.618]]) # (2, 1) - labels = tf.matmul(features, weights) - return features, labels - - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - x, y_ = _get_examples(10) # no input placeholders, TF code reads (or in this case "generates") input - w = tf.Variable(tf.truncated_normal([2, 1]), name='w') - y = tf.matmul(x, w, name='y') - global_step = tf.train.get_or_create_global_step() - - cost = tf.reduce_mean(tf.square(y_ - y), name='cost') - optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step) - - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(ctx.task_index == 0), - checkpoint_dir=args.model_dir) as sess: - step = 0 - while not sess.should_stop() and step < args.steps: - opt, weights, step = sess.run([optimizer, w, global_step]) - if (step % 100 == 0): - print("step: {}, weights: {}".format(step, weights)) - - # synchronize completion (via files) to allow time for all other nodes to complete - done_dir = "{}/done".format(args.model_dir) - tf.gfile.MakeDirs(done_dir) - with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as f: - f.write("done!") - - # wait up to 60s for other nodes to complete - for _ in range(60): - if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']): - time.sleep(1) - else: - break - else: - raise Exception("timeout while waiting for other nodes") - - def _tf_export(args): - """Creates an inference graph w/ placeholder and loads weights from checkpoint""" - import tensorflow as tf - 
from tensorflowonspark import TFNode + def _get_examples(num_rows, batch_size): + """Generate test data""" + for i in range(num_rows): + features = tf.random.uniform([batch_size, 2]) # (batch_size x 2) + weights = tf.constant([[3.14], [1.618]]) # (2, 1) + labels = tf.matmul(features, weights) + yield features, labels + + model = Sequential() + model.add(Dense(1, activation='linear', input_shape=(2,))) + model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) + model.summary() + + model.fit_generator(_get_examples(1000, 10), steps_per_epoch=100, epochs=5) - tf.reset_default_graph() # reset graph in case we're re-using a Spark python worker - x = tf.placeholder(tf.float32, [None, 2], name='x') - w = tf.Variable(tf.truncated_normal([2, 1]), name='w') - y = tf.matmul(x, w, name='y') - y2 = tf.square(y, name="y2") # extra/optional output for testing multiple output tensors - saver = tf.train.Saver() - - with tf.Session() as sess: - # load graph from a checkpoint - ckpt = tf.train.get_checkpoint_state(args.model_dir) - assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir) - saver.restore(sess, ckpt.model_checkpoint_path) - - # exported signatures defined in code - signatures = { - 'test_key': { - 'inputs': {'features': x}, - 'outputs': {'prediction': y, 'pred2': y2}, - 'method_name': 'test' - } - } - TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures) + # export saved_model + if ctx.job_name == 'chief' and args.export_dir: + print("model weights: {}".format(model.get_weights())) + print("exporting model to: {}".format(args.export_dir)) + tf.keras.experimental.export_saved_model(model, args.export_dir) if name == 'spark/train': return _spark_train elif name == 'tf/train': return _tf_train - elif name == 'tf/export': - return _tf_export else: - raise "Unknown function name: {}".format(name) + raise Exception("Unknown function name: 
{}".format(name)) if __name__ == '__main__': From 4f02303b87d3f951a031688b46a1f143c0902a44 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 2 Aug 2019 11:40:43 -0700 Subject: [PATCH 02/37] add missing line --- examples/mnist/keras/mnist_keras_2.0.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/mnist/keras/mnist_keras_2.0.py b/examples/mnist/keras/mnist_keras_2.0.py index bf2c708c..f9dacf81 100644 --- a/examples/mnist/keras/mnist_keras_2.0.py +++ b/examples/mnist/keras/mnist_keras_2.0.py @@ -51,6 +51,7 @@ def build_and_compile_cnn_model(): train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE) with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() + multi_worker_model.fit(x=train_datasets, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch) if ctx.job_name == 'chief': # multi_worker_model.save(args.model_dir, save_format='tf') @@ -71,7 +72,7 @@ def build_and_compile_cnn_model(): parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=5) + parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=3) parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") From 1e04b59129b29d06fcea43cd94ba41522f6f69a3 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 2 Aug 2019 15:28:13 -0700 Subject: [PATCH 03/37] add checkpointing to mnist_keras_2.0.py --- examples/mnist/keras/mnist_keras_2.0.py | 12 ++++++++++-- 1 file 
changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/mnist/keras/mnist_keras_2.0.py b/examples/mnist/keras/mnist_keras_2.0.py index f9dacf81..21aca42b 100644 --- a/examples/mnist/keras/mnist_keras_2.0.py +++ b/examples/mnist/keras/mnist_keras_2.0.py @@ -49,13 +49,20 @@ def build_and_compile_cnn_model(): # and now this becomes 128. GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS train_datasets = train_datasets_unbatched.batch(GLOBAL_BATCH_SIZE) + + # this fails + # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)] + tf.io.gfile.makedirs(args.model_dir) + filepath = args.model_dir + "/weights-{epoch:04d}" + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)] + with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() - multi_worker_model.fit(x=train_datasets, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch) + multi_worker_model.fit(x=train_datasets, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, callbacks=callbacks) if ctx.job_name == 'chief': # multi_worker_model.save(args.model_dir, save_format='tf') - tf.keras.experimental.export_saved_model(multi_worker_model, args.model_dir) + tf.keras.experimental.export_saved_model(multi_worker_model, args.export_dir) if __name__ == '__main__': @@ -74,6 +81,7 @@ def build_and_compile_cnn_model(): parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=3) parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") From 
539b0c37e220504de7227fc0982cdd78f282a9de Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 5 Aug 2019 09:41:22 -0700 Subject: [PATCH 04/37] add checkpoints and saved_models --- .../mnist/estimator/mnist_estimator_2.0.py | 26 ++++++++++++++----- examples/mnist/keras/mnist_keras_2.0.py | 4 +-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/mnist/estimator/mnist_estimator_2.0.py b/examples/mnist/estimator/mnist_estimator_2.0.py index 750a9d08..26881f54 100644 --- a/examples/mnist/estimator/mnist_estimator_2.0.py +++ b/examples/mnist/estimator/mnist_estimator_2.0.py @@ -26,7 +26,12 @@ def scale(image, label): if input_context: mnist_dataset = mnist_dataset.shard(input_context.num_input_pipelines, input_context.input_pipeline_id) - return mnist_dataset.repeat(2).map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE) + return mnist_dataset.repeat(args.epochs).map(scale).shuffle(BUFFER_SIZE).batch(BATCH_SIZE) + + def serving_input_receiver_fn(): + features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features') + receiver_tensors = {'features': features} + return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors) def model_fn(features, labels, mode): model = tf.keras.Sequential([ @@ -40,7 +45,7 @@ def model_fn(features, labels, mode): if mode == tf.estimator.ModeKeys.PREDICT: predictions = {'logits': logits} - return tf.estimator.EstimatorSpec(labels=labels, predictions=predictions) + return tf.estimator.EstimatorSpec(mode, predictions=predictions) optimizer = tf.compat.v1.train.GradientDescentOptimizer( learning_rate=LEARNING_RATE) @@ -62,12 +67,19 @@ def model_fn(features, labels, mode): classifier = tf.estimator.Estimator( model_fn=model_fn, model_dir=args.model_dir, config=config) + # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn) + tf.estimator.train_and_evaluate( classifier, train_spec=tf.estimator.TrainSpec(input_fn=input_fn), 
eval_spec=tf.estimator.EvalSpec(input_fn=input_fn) + # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter) ) + if ctx.job_name == 'chief': + print("========== exporting saved_model to {}".format(args.export_dir)) + classifier.export_saved_model(args.export_dir, serving_input_receiver_fn) + if __name__ == "__main__": # tf.app.run() @@ -85,12 +97,14 @@ def model_fn(features, labels, mode): parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-4) - parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--epochs", help="number of epochs", type=int, default=2) + parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-3) + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) - cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief') - cluster.shutdown() + cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True) + cluster.shutdown(grace_secs=120) diff --git a/examples/mnist/keras/mnist_keras_2.0.py b/examples/mnist/keras/mnist_keras_2.0.py index 21aca42b..2a6df3d1 100644 --- 
a/examples/mnist/keras/mnist_keras_2.0.py +++ b/examples/mnist/keras/mnist_keras_2.0.py @@ -54,7 +54,7 @@ def build_and_compile_cnn_model(): # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)] tf.io.gfile.makedirs(args.model_dir) filepath = args.model_dir + "/weights-{epoch:04d}" - callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)] + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True)] with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() @@ -79,7 +79,7 @@ def build_and_compile_cnn_model(): parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=3) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) From 981e4266d4ea816b08a762193bd52f40cd1a3242 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 7 Aug 2019 13:45:46 -0700 Subject: [PATCH 05/37] remove outdated examples; update mnist_data_setup.py and mnist_inference.py --- examples/mnist/estimator/mnist_estimator.py | 188 ------------------ .../mnist/estimator/mnist_estimator_2.0.py | 2 +- examples/mnist/keras/mnist_inference.py | 58 ++---- examples/mnist/keras/mnist_mlp.py | 175 ---------------- examples/mnist/keras/mnist_mlp_estimator.py | 162 --------------- examples/mnist/mnist_data_setup.py | 168 
++++------------ 6 files changed, 66 insertions(+), 687 deletions(-) delete mode 100644 examples/mnist/estimator/mnist_estimator.py delete mode 100644 examples/mnist/keras/mnist_mlp.py delete mode 100644 examples/mnist/keras/mnist_mlp_estimator.py diff --git a/examples/mnist/estimator/mnist_estimator.py b/examples/mnist/estimator/mnist_estimator.py deleted file mode 100644 index dddd4b9d..00000000 --- a/examples/mnist/estimator/mnist_estimator.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf - -tf.logging.set_verbosity(tf.logging.INFO) - - -def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. 
- # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` 
to the graph. It is used for PREDICT and by the - # `logging_hook`. - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) - - -def main(args, ctx): - # Load training and eval data - mnist = tf.contrib.learn.datasets.mnist.read_data_sets(args.data_dir) - train_data = mnist.train.images # Returns np.array - train_labels = np.asarray(mnist.train.labels, dtype=np.int32) - eval_data = mnist.test.images # Returns np.array - eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) - - # Create the Estimator - mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=args.model) - - # Set up logging for predictions - # Log the values in the "Softmax" tensor with label "probabilities" - tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=50) - - # Train the model - train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": train_data}, - y=train_labels, - batch_size=args.batch_size, - num_epochs=None, - shuffle=True) - # mnist_classifier.train( - # input_fn=train_input_fn, - # steps=1000, - # hooks=[logging_hook]) - - # Evaluate the model and 
print results - eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) - # eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) - # print(eval_results) - - # Using tf.estimator.train_and_evaluate - train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps, hooks=[logging_hook]) - eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) - tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec) - - -if __name__ == "__main__": - # tf.app.run() - - from pyspark.context import SparkContext - from pyspark.conf import SparkConf - from tensorflowonspark import TFCluster - import argparse - - sc = SparkContext(conf=SparkConf().setAppName("mnist_spark")) - executors = sc._conf.get("spark.executor.instances") - num_executors = int(executors) if executors is not None else 1 - - parser = argparse.ArgumentParser() - parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) - parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--data_dir", help="path to MNIST data", default="MNIST-data") - parser.add_argument("--model", help="path to save model/checkpoint", default="mnist_model") - parser.add_argument("--num_ps", help="number of PS nodes in cluster", type=int, default=1) - parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) - parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - - args = parser.parse_args() - print("args:", args) - - cluster = TFCluster.run(sc, main, args, args.cluster_size, args.num_ps, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model, master_node='master') - cluster.shutdown() diff --git a/examples/mnist/estimator/mnist_estimator_2.0.py b/examples/mnist/estimator/mnist_estimator_2.0.py index 
26881f54..75d823c8 100644 --- a/examples/mnist/estimator/mnist_estimator_2.0.py +++ b/examples/mnist/estimator/mnist_estimator_2.0.py @@ -51,7 +51,7 @@ def model_fn(features, labels, mode): learning_rate=LEARNING_RATE) loss = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits) - loss = tf.reduce_sum(loss) * (1. / BATCH_SIZE) + loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE) if mode == tf.estimator.ModeKeys.EVAL: return tf.estimator.EstimatorSpec(mode, loss=loss) diff --git a/examples/mnist/keras/mnist_inference.py b/examples/mnist/keras/mnist_inference.py index 0abb795e..af444d50 100644 --- a/examples/mnist/keras/mnist_inference.py +++ b/examples/mnist/keras/mnist_inference.py @@ -9,7 +9,7 @@ # graph. In these situations, we can use Spark to instantiate a single-node TensorFlow instance on each executor, # where each executor can independently load the model and inference on input data. # -# Note: this particular example demonstrates use of `tf.data.Dataset` to read the input data for inferencing, +# Note: this particular example demonstrates use of `tf.data.Dataset` to read the input data for inferencing, # but it could also be adapted to just use an RDD of TFRecords from Spark. 
from __future__ import absolute_import @@ -20,8 +20,6 @@ import numpy as np import tensorflow as tf -IMAGE_PIXELS = 28 - def inference(it, num_workers, args): from tensorflowonspark import util @@ -34,49 +32,37 @@ def inference(it, num_workers, args): # setup env for single-node TF util.single_node_env() - # load saved_model using default tag and signature - sess = tf.Session() - tf.saved_model.loader.load(sess, ['serve'], args.export) + # load saved_model + saved_model = tf.saved_model.load(args.export_dir, tags='serve') + predict = saved_model.signatures['serving_default'] # parse function for TFRecords def parse_tfr(example_proto): - feature_def = {"label": tf.FixedLenFeature(10, tf.int64), - "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)} - features = tf.parse_single_example(example_proto, feature_def) - norm = tf.constant(255, dtype=tf.float32, shape=(784,)) - image = tf.div(tf.to_float(features['image']), norm) - label = tf.to_float(features['label']) + feature_def = {"label": tf.io.FixedLenFeature(1, tf.int64), + "image": tf.io.FixedLenFeature(784, tf.int64)} + features = tf.io.parse_single_example(serialized=example_proto, features=feature_def) + image = tf.cast(features['image'], dtype=tf.float32) / 255.0 + image = tf.reshape(image, [28, 28, 1]) + label = tf.cast(features['label'], dtype=tf.float32) return (image, label) # define a new tf.data.Dataset (for inferencing) ds = tf.data.Dataset.list_files("{}/part-*".format(args.images_labels)) ds = ds.shard(num_workers, worker_num) - ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1) - ds = ds.map(parse_tfr).batch(10) - iterator = ds.make_one_shot_iterator() - image_label = iterator.get_next(name='inf_image') + ds = ds.interleave(tf.data.TFRecordDataset) + ds = ds.map(parse_tfr) + ds = ds.batch(10) # create an output file per spark worker for the predictions - tf.gfile.MakeDirs(args.output) - output_file = tf.gfile.GFile("{}/part-{:05d}".format(args.output, worker_num), mode='w') - 
- while True: - try: - # get images and labels from tf.data.Dataset - img, lbl = sess.run(['inf_image:0', 'inf_image:1']) - - # inference by feeding these images and labels into the input tensors - # you can view the exported model signatures via: - # saved_model_cli show --dir --all - - # note that we feed directly into the graph tensors (bypassing the exported signatures) - # these tensors will be shown in the "name" field of the signature definitions + tf.io.gfile.makedirs(args.output) + output_file = tf.io.gfile.GFile("{}/part-{:05d}".format(args.output, worker_num), mode='w') - outputs = sess.run(['dense_2/Softmax:0'], feed_dict={'Placeholder:0': img}) - for p in outputs[0]: - output_file.write("{}\n".format(np.argmax(p))) - except tf.errors.OutOfRangeError: - break + for batch in ds: + predictions = predict(conv2d_input=batch[0]) + labels = np.reshape(batch[1], -1).astype(np.int) + preds = np.argmax(predictions['dense_1'], axis=1) + for x in zip(labels, preds): + output_file.write("{} {}\n".format(x[0], x[1])) output_file.close() @@ -92,7 +78,7 @@ def parse_tfr(example_proto): parser = argparse.ArgumentParser() parser.add_argument("--cluster_size", help="number of nodes in the cluster (for S with labelspark Standalone)", type=int, default=num_executors) parser.add_argument('--images_labels', type=str, help='Directory for input images with labels') - parser.add_argument("--export", help="HDFS path to export model", type=str, default="mnist_export") + parser.add_argument("--export_dir", help="HDFS path to export model", type=str, default="mnist_export") parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions") args, _ = parser.parse_known_args() print("args: {}".format(args)) diff --git a/examples/mnist/keras/mnist_mlp.py b/examples/mnist/keras/mnist_mlp.py deleted file mode 100644 index 4e0fd3c6..00000000 --- a/examples/mnist/keras/mnist_mlp.py +++ /dev/null @@ -1,175 +0,0 @@ -'''Trains a simple deep NN on the MNIST 
dataset. -Gets to 98.40% test accuracy after 20 epochs -(there is *a lot* of margin for parameter tuning). -2 seconds per epoch on a K520 GPU. -''' - -from __future__ import print_function - - -def main_fun(args, ctx): - import numpy - import os - import tensorflow as tf - from tensorflow.python import keras - from tensorflow.python.keras import backend as K - from tensorflow.python.keras.datasets import mnist - from tensorflow.python.keras.models import Sequential, load_model, save_model - from tensorflow.python.keras.layers import Dense, Dropout - from tensorflow.python.keras.optimizers import RMSprop - from tensorflow.python.keras.callbacks import LambdaCallback, TensorBoard - from tensorflow.python.saved_model import builder as saved_model_builder - from tensorflow.python.saved_model import tag_constants - from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def - from tensorflowonspark import TFNode - - cluster, server = TFNode.start_cluster_server(ctx) - - if ctx.job_name == "ps": - server.join() - elif ctx.job_name == "worker": - - def generate_rdd_data(tf_feed, batch_size): - print("generate_rdd_data invoked") - while True: - batch = tf_feed.next_batch(batch_size) - imgs = [] - lbls = [] - for item in batch: - imgs.append(item[0]) - lbls.append(item[1]) - images = numpy.array(imgs).astype('float32') / 255 - labels = numpy.array(lbls).astype('float32') - yield (images, labels) - - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % ctx.task_index, - cluster=cluster)): - - IMAGE_PIXELS = 28 - batch_size = 100 - num_classes = 10 - - # the data, shuffled and split between train and test sets - if args.input_mode == 'tf': - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(60000, 784) - x_test = x_test.reshape(10000, 784) - x_train = x_train.astype('float32') / 255 - x_test = x_test.astype('float32') / 255 - - # convert class vectors to binary class matrices - 
y_train = keras.utils.to_categorical(y_train, num_classes) - y_test = keras.utils.to_categorical(y_test, num_classes) - else: # args.mode == 'spark' - x_train = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x_train") - y_train = tf.placeholder(tf.float32, [None, 10], name="y_train") - (_, _), (x_test, y_test) = mnist.load_data() - x_test = x_test.reshape(10000, 784) - y_test = keras.utils.to_categorical(y_test, num_classes) - - model = Sequential() - model.add(Dense(512, activation='relu', input_shape=(784,))) - model.add(Dropout(0.2)) - model.add(Dense(512, activation='relu')) - model.add(Dropout(0.2)) - model.add(Dense(10, activation='softmax')) - - model.summary() - - model.compile(loss='categorical_crossentropy', - optimizer=RMSprop(), - metrics=['accuracy']) - - saver = tf.train.Saver() - - with tf.Session(server.target) as sess: - K.set_session(sess) - - def save_checkpoint(epoch, logs=None): - if epoch == 1: - tf.train.write_graph(sess.graph.as_graph_def(), args.model_dir, 'graph.pbtxt') - saver.save(sess, os.path.join(args.model_dir, 'model.ckpt'), global_step=epoch * args.steps_per_epoch) - - ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint) - tb_callback = TensorBoard(log_dir=args.model_dir, histogram_freq=1, write_graph=True, write_images=True) - - # add callbacks to save model checkpoint and tensorboard events (on worker:0 only) - callbacks = [ckpt_callback, tb_callback] if ctx.task_index == 0 else None - - if args.input_mode == 'tf': - # train & validate on in-memory data - model.fit(x_train, y_train, - batch_size=batch_size, - epochs=args.epochs, - verbose=1, - validation_data=(x_test, y_test), - callbacks=callbacks) - else: # args.input_mode == 'spark': - # train on data read from a generator which is producing data from a Spark RDD - tf_feed = TFNode.DataFeed(ctx.mgr) - model.fit_generator(generator=generate_rdd_data(tf_feed, batch_size), - steps_per_epoch=args.steps_per_epoch, - epochs=args.epochs, - verbose=1, - 
validation_data=(x_test, y_test), - callbacks=callbacks) - - if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0: - # save a local Keras model, so we can reload it with an inferencing learning_phase - save_model(model, "tmp_model") - - # reload the model - K.set_learning_phase(False) - new_model = load_model("tmp_model") - - # export a saved_model for inferencing - builder = saved_model_builder.SavedModelBuilder(args.export_dir) - signature = predict_signature_def(inputs={'images': new_model.input}, - outputs={'scores': new_model.output}) - builder.add_meta_graph_and_variables(sess=sess, - tags=[tag_constants.SERVING], - signature_def_map={'predict': signature}, - clear_devices=True) - builder.save() - - if args.input_mode == 'spark': - tf_feed.terminate() - - -if __name__ == '__main__': - import argparse - from pyspark.context import SparkContext - from pyspark.conf import SparkConf - from tensorflowonspark import TFCluster - - sc = SparkContext(conf=SparkConf().setAppName("mnist_mlp")) - executors = sc._conf.get("spark.executor.instances") - num_executors = int(executors) if executors is not None else 1 - num_ps = 1 - - parser = argparse.ArgumentParser() - parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=20) - parser.add_argument("--export_dir", help="directory to export saved_model") - parser.add_argument("--images", help="HDFS path to MNIST images in parallelized CSV format") - parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf") - parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format") - parser.add_argument("--model_dir", help="directory to write model checkpoints") - parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1) - parser.add_argument("--steps_per_epoch", help="number of steps per epoch", 
type=int, default=300) - parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - - args = parser.parse_args() - print("args:", args) - - if args.input_mode == 'tf': - cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir) - else: # args.input_mode == 'spark': - cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir) - images = sc.textFile(args.images).map(lambda ln: [float(x) for x in ln.split(',')]) - labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) - dataRDD = images.zip(labels) - cluster.train(dataRDD, args.epochs) - - cluster.shutdown() diff --git a/examples/mnist/keras/mnist_mlp_estimator.py b/examples/mnist/keras/mnist_mlp_estimator.py deleted file mode 100644 index 972c6a75..00000000 --- a/examples/mnist/keras/mnist_mlp_estimator.py +++ /dev/null @@ -1,162 +0,0 @@ -import numpy -import tensorflow as tf -import time -from datetime import datetime -from tensorflow.python import keras -from tensorflow.python.keras.models import Sequential -from tensorflow.python.keras.layers import Dense, Dropout -from tensorflowonspark import TFNode - - -class StopFeedHook(tf.train.SessionRunHook): - """SessionRunHook to terminate InputMode.SPARK RDD feeding if the training loop exits before the entire RDD is consumed.""" - - def __init__(self, tf_feed): - self._tf_feed = tf_feed - - def end(self, session): - self._tf_feed.terminate() - self._tf_feed.next_batch(1) - - -def main_fun(args, ctx): - IMAGE_PIXELS = 28 - num_classes = 10 - - # use Keras API to load data - from tensorflow.python.keras.datasets import mnist - (x_train, y_train), (x_test, y_test) = mnist.load_data() - x_train = x_train.reshape(60000, 784) - x_test = x_test.reshape(10000, 784) - x_train = x_train.astype('float32') / 255 - x_test = 
x_test.astype('float32') / 255 - - # convert class vectors to binary class matrices - y_train = keras.utils.to_categorical(y_train, num_classes) - y_test = keras.utils.to_categorical(y_test, num_classes) - - # setup a Keras model - model = Sequential() - model.add(Dense(512, activation='relu', input_shape=(784,))) - model.add(Dropout(0.2)) - model.add(Dense(512, activation='relu')) - model.add(Dropout(0.2)) - model.add(Dense(10, activation='softmax')) - model.compile(loss='categorical_crossentropy', - optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001), - metrics=['accuracy']) - model.summary() - - print("model.inputs: {}".format(model.inputs)) - print("model.outputs: {}".format(model.outputs)) - - # convert Keras model to tf.estimator - estimator = tf.keras.estimator.model_to_estimator(model, model_dir=args.model_dir) - - # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK - if args.input_mode == 'tf': - # For InputMode.TENSORFLOW, just use data in memory - train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"dense_input": x_train}, - y=y_train, - batch_size=128, - num_epochs=args.epochs, - shuffle=True) - - hooks = [] - else: # 'spark' - # For InputMode.SPARK, read data from RDD - tf_feed = TFNode.DataFeed(ctx.mgr) - - def rdd_generator(): - while not tf_feed.should_stop(): - batch = tf_feed.next_batch(1) - if len(batch) > 0: - record = batch[0] - image = numpy.array(record[0]).astype(numpy.float32) / 255.0 - label = numpy.array(record[1]).astype(numpy.float32) - yield (image, label) - else: - return - - def train_input_fn(): - ds = tf.data.Dataset.from_generator(rdd_generator, - (tf.float32, tf.float32), - (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10]))) - ds = ds.batch(args.batch_size) - return ds - - # add a hook to terminate the RDD data feed when the session ends - hooks = [StopFeedHook(tf_feed)] - - # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK can only feed one RDD at a time - 
eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"dense_input": x_test}, - y=y_test, - num_epochs=1, - shuffle=False) - - # setup tf.estimator.train_and_evaluate() w/ FinalExporter - feature_spec = {'dense_input': tf.placeholder(tf.float32, shape=[None, 784])} - exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)) - train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=args.steps, hooks=hooks) - eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, exporters=exporter) - - # train and export model - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - - # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745 - # wait for all other nodes to complete (via done files) - done_dir = "{}/done".format(ctx.absolute_path(args.model_dir)) - print("Writing done file to: {}".format(done_dir)) - tf.gfile.MakeDirs(done_dir) - with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file: - done_file.write("done") - - for i in range(60): - if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']): - print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i)) - time.sleep(1) - else: - print("{} All nodes done".format(datetime.now().isoformat())) - break - - -if __name__ == '__main__': - import argparse - from pyspark.context import SparkContext - from pyspark.conf import SparkConf - from tensorflowonspark import TFCluster - - sc = SparkContext(conf=SparkConf().setAppName("mnist_mlp")) - executors = sc._conf.get("spark.executor.instances") - num_executors = int(executors) if executors is not None else 1 - - parser = argparse.ArgumentParser() - parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) - parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--epochs", 
help="number of epochs of training data", type=int, default=None) - parser.add_argument("--images", help="HDFS path to MNIST images in parallelized CSV format") - parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf") - parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format") - parser.add_argument("--model_dir", help="directory to write model checkpoints") - parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") - parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1) - parser.add_argument("--steps", help="max number of steps to train", type=int, default=2000) - parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - - args = parser.parse_args() - print("args:", args) - - if args.input_mode == 'tf': - # for TENSORFLOW mode, each node will load/train/infer entire dataset in memory per original example - cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='master') - cluster.shutdown() - else: # 'spark' - # for SPARK mode, just use CSV format as an example - images = sc.textFile(args.images).map(lambda ln: [float(x) for x in ln.split(',')]) - labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) - dataRDD = images.zip(labels) - cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='master') - cluster.train(dataRDD, args.epochs) - cluster.shutdown() diff --git a/examples/mnist/mnist_data_setup.py b/examples/mnist/mnist_data_setup.py index 3fd66d0e..45398755 100644 --- a/examples/mnist/mnist_data_setup.py +++ b/examples/mnist/mnist_data_setup.py @@ -6,142 +6,60 @@ from __future__ import division from __future__ import print_function -import numpy -import 
tensorflow as tf -from tensorflow.contrib.learn.python.learn.datasets import mnist - - -def toTFExample(image, label): - """Serializes an image/label as a TFExample byte string""" - example = tf.train.Example( - features=tf.train.Features( - feature={ - 'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label.astype("int64"))), - 'image': tf.train.Feature(int64_list=tf.train.Int64List(value=image.astype("int64"))) - } - ) - ) - return example.SerializeToString() - - -def fromTFExample(bytestr): - """Deserializes a TFExample from a byte string""" - example = tf.train.Example() - example.ParseFromString(bytestr) - return example - - -def toCSV(vec): - """Converts a vector/array into a CSV string""" - return ','.join([str(i) for i in vec]) - - -def fromCSV(s): - """Converts a CSV string to a vector/array""" - return [float(x) for x in s.split(',') if len(s) > 0] - - -def writeMNIST(sc, input_images, input_labels, output, format, num_partitions): - """Writes MNIST image/label vectors into parallelized files on HDFS""" - # load MNIST gzip into memory - with open(input_images, 'rb') as f: - images = numpy.array(mnist.extract_images(f)) - - with open(input_labels, 'rb') as f: - if format == "csv2": - labels = numpy.array(mnist.extract_labels(f, one_hot=False)) - else: - labels = numpy.array(mnist.extract_labels(f, one_hot=True)) - - shape = images.shape - print("images.shape: {0}".format(shape)) # 60000 x 28 x 28 - print("labels.shape: {0}".format(labels.shape)) # 60000 x 10 - - # create RDDs of vectors - imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2]), num_partitions) - labelRDD = sc.parallelize(labels, num_partitions) - - output_images = output + "/images" - output_labels = output + "/labels" - - # save RDDs as specific format - if format == "pickle": - imageRDD.saveAsPickleFile(output_images) - labelRDD.saveAsPickleFile(output_labels) - elif format == "csv": - imageRDD.map(toCSV).saveAsTextFile(output_images) - 
labelRDD.map(toCSV).saveAsTextFile(output_labels) - elif format == "csv2": - imageRDD.map(toCSV).zip(labelRDD).map(lambda x: str(x[1]) + "|" + x[0]).saveAsTextFile(output) - else: # format == "tfr": - tfRDD = imageRDD.zip(labelRDD).map(lambda x: (bytearray(toTFExample(x[0], x[1])), None)) - # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar - tfRDD.saveAsNewAPIHadoopFile(output, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") -# Note: this creates TFRecord files w/o requiring a custom Input/Output format -# else: # format == "tfr": -# def writeTFRecords(index, iter): -# output_path = "{0}/part-{1:05d}".format(output, index) -# writer = tf.python_io.TFRecordWriter(output_path) -# for example in iter: -# writer.write(example) -# return [output_path] -# tfRDD = imageRDD.zip(labelRDD).map(lambda x: toTFExample(x[0], x[1])) -# tfRDD.mapPartitionsWithIndex(writeTFRecords).collect() - - -def readMNIST(sc, output, format): - """Reads/verifies previously created output""" - - output_images = output + "/images" - output_labels = output + "/labels" - imageRDD = None - labelRDD = None - - if format == "pickle": - imageRDD = sc.pickleFile(output_images) - labelRDD = sc.pickleFile(output_labels) - elif format == "csv": - imageRDD = sc.textFile(output_images).map(fromCSV) - labelRDD = sc.textFile(output_labels).map(fromCSV) - else: # format.startswith("tf"): - # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar - tfRDD = sc.newAPIHadoopFile(output, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") - imageRDD = tfRDD.map(lambda x: fromTFExample(bytes(x[0]))) - - num_images = imageRDD.count() - num_labels = labelRDD.count() if labelRDD is not None else num_images - samples = imageRDD.take(10) - print("num_images: ", num_images) - print("num_labels: ", num_labels) - 
print("samples: ", samples) - if __name__ == "__main__": import argparse from pyspark.context import SparkContext from pyspark.conf import SparkConf + import tensorflow as tf + import tensorflow_datasets as tfds parser = argparse.ArgumentParser() - parser.add_argument("--format", help="output format", choices=["csv", "csv2", "pickle", "tf", "tfr"], default="csv") - parser.add_argument("--num-partitions", help="Number of output partitions", type=int, default=10) - parser.add_argument("--output", help="HDFS directory to save examples in parallelized format", default="mnist_data") - parser.add_argument("--read", help="read previously saved examples", action="store_true") - parser.add_argument("--verify", help="verify saved examples after writing", action="store_true") + parser.add_argument("--num_partitions", help="Number of output partitions", type=int, default=10) + parser.add_argument("--output", help="HDFS directory to save examples in parallelized format", default="data/mnist") args = parser.parse_args() print("args:", args) - sc = SparkContext(conf=SparkConf().setAppName("mnist_parallelize")) + sc = SparkContext(conf=SparkConf().setAppName("mnist_data_setup")) + + mnist, info = tfds.load('mnist', with_info=True) + print(info.as_json) + + # convert to numpy, then RDDs + mnist_train = tfds.as_numpy(mnist['train']) + mnist_test = tfds.as_numpy(mnist['test']) - if not args.read: - # Note: these files are inside the mnist.zip file - writeMNIST(sc, "mnist/train-images-idx3-ubyte.gz", "mnist/train-labels-idx1-ubyte.gz", args.output + "/train", args.format, args.num_partitions) - writeMNIST(sc, "mnist/t10k-images-idx3-ubyte.gz", "mnist/t10k-labels-idx1-ubyte.gz", args.output + "/test", args.format, args.num_partitions) + train_rdd = sc.parallelize(mnist_train, args.num_partitions).cache() + test_rdd = sc.parallelize(mnist_test, args.num_partitions).cache() - if args.read or args.verify: - readMNIST(sc, args.output + "/train", args.format) + # save as CSV 
(label,comma-separated-features) + def to_csv(example): + return str(example['label']) + ',' + ','.join([str(i) for i in example['image'].reshape(784)]) + + train_rdd.map(to_csv).saveAsTextFile(args.output + "/csv/train") + test_rdd.map(to_csv).saveAsTextFile(args.output + "/csv/test") + + # save as TFRecords (numpy vs. PNG) + # note: the MNIST tensorflow_dataset is already provided as TFRecords but with a PNG bytes_list + # this export format is less-efficient, but easier to work with later + def to_tfr(example): + ex = tf.train.Example( + features=tf.train.Features( + feature={ + 'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[example['label'].astype("int64")])), + 'image': tf.train.Feature(int64_list=tf.train.Int64List(value=example['image'].reshape(784).astype("int64"))) + } + ) + ) + return (bytearray(ex.SerializeToString()), None) + + train_rdd.map(to_tfr).saveAsNewAPIHadoopFile(args.output + "/tfr/train", + "org.tensorflow.hadoop.io.TFRecordFileOutputFormat", + keyClass="org.apache.hadoop.io.BytesWritable", + valueClass="org.apache.hadoop.io.NullWritable") + test_rdd.map(to_tfr).saveAsNewAPIHadoopFile(args.output + "/tfr/test", + "org.tensorflow.hadoop.io.TFRecordFileOutputFormat", + keyClass="org.apache.hadoop.io.BytesWritable", + valueClass="org.apache.hadoop.io.NullWritable") From fec8c0773bf22696d65c51f687904894a7d78a98 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 7 Aug 2019 15:31:14 -0700 Subject: [PATCH 06/37] remove low-level API examples --- examples/mnist/spark/mnist_dist.py | 149 ------------------ examples/mnist/spark/mnist_spark.py | 72 --------- examples/mnist/tf/mnist_dist.py | 224 --------------------------- examples/mnist/tf/mnist_inference.py | 103 ------------ examples/mnist/tf/mnist_spark.py | 48 ------ tensorflowonspark/TFNode.py | 122 +-------------- 6 files changed, 4 insertions(+), 714 deletions(-) delete mode 100755 examples/mnist/spark/mnist_dist.py delete mode 100755 examples/mnist/spark/mnist_spark.py delete 
mode 100644 examples/mnist/tf/mnist_dist.py delete mode 100644 examples/mnist/tf/mnist_inference.py delete mode 100644 examples/mnist/tf/mnist_spark.py mode change 100755 => 100644 tensorflowonspark/TFNode.py diff --git a/examples/mnist/spark/mnist_dist.py b/examples/mnist/spark/mnist_dist.py deleted file mode 100755 index cdf0a8c2..00000000 --- a/examples/mnist/spark/mnist_dist.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -# Distributed MNIST on grid based on TensorFlow MNIST example - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -def print_log(worker_num, arg): - print("{0}: {1}".format(worker_num, arg)) - - -def map_fun(args, ctx): - from datetime import datetime - import math - import numpy - import tensorflow as tf - import time - - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - # Parameters - IMAGE_PIXELS = 28 - hidden_units = 128 - - # Get TF cluster and server instances - cluster, server = ctx.start_cluster_server(1, args.rdma) - - # Create generator for Spark data feed - tf_feed = ctx.get_data_feed(args.mode == 'train') - - def rdd_generator(): - while not tf_feed.should_stop(): - batch = tf_feed.next_batch(1) - if len(batch) == 0: - return - row = batch[0] - image = numpy.array(row[0]).astype(numpy.float32) / 255.0 - label = numpy.array(row[1]).astype(numpy.int64) - yield (image, label) - - if job_name == "ps": - server.join() - elif job_name == "worker": - # Assigns ops to the local worker by default. 
- with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - # Dataset for input data - ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]), tf.TensorShape([10]))).batch(args.batch_size) - iterator = ds.make_one_shot_iterator() - x, y_ = iterator.get_next() - - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - tf.summary.histogram("softmax_weights", sm_w) - - x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) - tf.summary.image("x_img", x_img) - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - - global_step = tf.train.get_or_create_global_step() - - loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) - tf.summary.scalar("loss", loss) - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - # Test trained model - label = tf.argmax(y_, 1, name="label") - prediction = tf.argmax(y, 1, name="prediction") - correct_prediction = tf.equal(prediction, label) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") - tf.summary.scalar("acc", accuracy) - - saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - # Create a "supervisor", which oversees the training process and stores model state into HDFS - logdir = ctx.absolute_path(args.model) - print("tensorflow model path: {0}".format(logdir)) - 
summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, graph=tf.get_default_graph()) - - hooks = [tf.train.StopAtStepHook(last_step=args.steps)] if args.mode == "train" else [] - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(task_index == 0), - scaffold=tf.train.Scaffold(init_op=init_op, summary_op=summary_op, saver=saver), - checkpoint_dir=logdir, - hooks=hooks) as sess: - print("{} session ready".format(datetime.now().isoformat())) - - # Loop until the session shuts down or feed has no more data - step = 0 - while not sess.should_stop() and not tf_feed.should_stop(): - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. - - if args.mode == "train": - _, summary, step = sess.run([train_op, summary_op, global_step]) - if (step % 100 == 0) and (not sess.should_stop()): - print("{} step: {} accuracy: {}".format(datetime.now().isoformat(), step, sess.run(accuracy))) - if task_index == 0: - summary_writer.add_summary(summary, step) - else: # args.mode == "inference" - labels, preds, acc = sess.run([label, prediction, accuracy]) - results = ["{} Label: {}, Prediction: {}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)] - tf_feed.batch_results(results) - print("acc: {}".format(acc)) - - print("{} stopping MonitoredTrainingSession".format(datetime.now().isoformat())) - - if sess.should_stop() or step >= args.steps: - tf_feed.terminate() - - # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745 - # wait for all other nodes to complete (via done files) - done_dir = "{}/{}/done".format(ctx.absolute_path(args.model), args.mode) - print("Writing done file to: {}".format(done_dir)) - tf.gfile.MakeDirs(done_dir) - with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file: - done_file.write("done") - - for i in range(60): - if len(tf.gfile.ListDirectory(done_dir)) < 
len(ctx.cluster_spec['worker']): - print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i)) - time.sleep(1) - else: - print("{} All nodes done".format(datetime.now().isoformat())) - break diff --git a/examples/mnist/spark/mnist_spark.py b/examples/mnist/spark/mnist_spark.py deleted file mode 100755 index 730f0b66..00000000 --- a/examples/mnist/spark/mnist_spark.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf - -import argparse -import numpy -import tensorflow as tf -from datetime import datetime - -from tensorflowonspark import TFCluster -import mnist_dist - -sc = SparkContext(conf=SparkConf().setAppName("mnist_spark")) -executors = sc._conf.get("spark.executor.instances") -num_executors = int(executors) if executors is not None else 1 -num_ps = 1 - -parser = argparse.ArgumentParser() -parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) -parser.add_argument("--epochs", help="number of epochs", type=int, default=1) -parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv") -parser.add_argument("--images", help="HDFS path to MNIST images in parallelized format") -parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized format") -parser.add_argument("--model", help="HDFS path to save/load model during train/inference", default="mnist_model") -parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) -parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") -parser.add_argument("--readers", help="number of reader/enqueue 
threads", type=int, default=1) -parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) -parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") -parser.add_argument("--mode", help="train|inference", default="train") -parser.add_argument("--rdma", help="use rdma connection", default=False) -args = parser.parse_args() -print("args:", args) - -print("{0} ===== Start".format(datetime.now().isoformat())) - -if args.format == "tfr": - images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") - - def toNumpy(bytestr): - example = tf.train.Example() - example.ParseFromString(bytestr) - features = example.features.feature - image = numpy.array(features['image'].int64_list.value) - label = numpy.array(features['label'].int64_list.value) - return (image, label) - - dataRDD = images.map(lambda x: toNumpy(bytes(x[0]))) -else: # args.format == "csv": - images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) - labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) - print("zipping images and labels") - dataRDD = images.zip(labels) - -cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) -if args.mode == "train": - cluster.train(dataRDD, args.epochs) -else: - labelRDD = cluster.inference(dataRDD) - labelRDD.saveAsTextFile(args.output) -cluster.shutdown() - -print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/examples/mnist/tf/mnist_dist.py b/examples/mnist/tf/mnist_dist.py deleted file mode 100644 index cc95e93d..00000000 --- a/examples/mnist/tf/mnist_dist.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -# Distributed MNIST on grid based on TensorFlow MNIST example - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -def print_log(worker_num, arg): - print("%d: " % worker_num, end=" ") - print(arg) - - -def map_fun(args, ctx): - from datetime import datetime - from tensorflowonspark import TFNode - import math - import os - import tensorflow as tf - import time - - num_workers = len(ctx.cluster_spec['worker']) - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - # Parameters - IMAGE_PIXELS = 28 - hidden_units = 128 - - # Get TF cluster and server instances - cluster, server = ctx.start_cluster_server(1, args.rdma) - - def _parse_csv(ln): - splits = tf.string_split([ln], delimiter='|') - lbl = splits.values[0] - img = splits.values[1] - image_defaults = [[0.0] for col in range(IMAGE_PIXELS * IMAGE_PIXELS)] - image = tf.stack(tf.decode_csv(img, record_defaults=image_defaults)) - norm = tf.constant(255, dtype=tf.float32, shape=(784,)) - normalized_image = tf.div(image, norm) - label_value = tf.string_to_number(lbl, tf.int32) - label = tf.one_hot(label_value, 10) - return (normalized_image, label) - - def _parse_tfr(example_proto): - feature_def = {"label": tf.FixedLenFeature(10, tf.int64), - "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)} - features = tf.parse_single_example(example_proto, feature_def) - norm = tf.constant(255, dtype=tf.float32, shape=(784,)) - image = tf.div(tf.to_float(features['image']), norm) - label = tf.to_float(features['label']) - return (image, label) - - def build_model(graph, x): - with graph.as_default(): - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - 
sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - tf.summary.histogram("softmax_weights", sm_w) - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - prediction = tf.argmax(y, 1, name="prediction") - return y, prediction - - if job_name == "ps": - server.join() - elif job_name == "worker": - # Assigns ops to the local worker by default. - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - # Dataset for input data - image_dir = ctx.absolute_path(args.images_labels) - file_pattern = os.path.join(image_dir, 'part-*') - - ds = tf.data.Dataset.list_files(file_pattern) - ds = ds.shard(num_workers, task_index).repeat(args.epochs).shuffle(args.shuffle_size) - if args.format == 'csv2': - ds = ds.interleave(tf.data.TextLineDataset, cycle_length=args.readers, block_length=1) - parse_fn = _parse_csv - else: # args.format == 'tfr' - ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=args.readers, block_length=1) - parse_fn = _parse_tfr - ds = ds.map(parse_fn).batch(args.batch_size) - iterator = ds.make_one_shot_iterator() - x, y_ = iterator.get_next() - - # Build core model - y, prediction = build_model(tf.get_default_graph(), x) - - # Add training bits - x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) - tf.summary.image("x_img", x_img) - - global_step = tf.train.get_or_create_global_step() - - loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) - tf.summary.scalar("loss", loss) - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - label = tf.argmax(y_, 1, name="label") - correct_prediction = tf.equal(prediction, label) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") - tf.summary.scalar("acc", accuracy) - - 
saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - # Create a "supervisor", which oversees the training process and stores model state into HDFS - model_dir = ctx.absolute_path(args.model) - export_dir = ctx.absolute_path(args.export) - print("tensorflow model path: {0}".format(model_dir)) - print("tensorflow export path: {0}".format(export_dir)) - summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, graph=tf.get_default_graph()) - - if args.mode == 'inference': - output_dir = ctx.absolute_path(args.output) - print("output_dir: {}".format(output_dir)) - tf.gfile.MkDir(output_dir) - output_file = tf.gfile.Open("{}/part-{:05d}".format(output_dir, task_index), mode='w') - - with tf.train.MonitoredTrainingSession(master=server.target, - is_chief=(task_index == 0), - scaffold=tf.train.Scaffold(init_op=init_op, summary_op=summary_op, saver=saver), - checkpoint_dir=model_dir, - hooks=[tf.train.StopAtStepHook(last_step=args.steps)]) as sess: - print("{} session ready".format(datetime.now().isoformat())) - - # Loop until the session shuts down - step = 0 - count = 0 - while not sess.should_stop(): - - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. 
- - if args.mode == "train": - if (step % 100 == 0): - print("{} step: {} accuracy: {}".format(datetime.now().isoformat(), step, sess.run(accuracy))) - _, summary, step = sess.run([train_op, summary_op, global_step]) - if task_index == 0: - summary_writer.add_summary(summary, step) - else: # args.mode == "inference" - labels, pred, acc = sess.run([label, prediction, accuracy]) - # print("label: {0}, pred: {1}".format(labels, pred)) - print("acc: {}".format(acc)) - for i in range(len(labels)): - count += 1 - output_file.write("{} {}\n".format(labels[i], pred[i])) - print("count: {}".format(count)) - - if args.mode == 'inference': - output_file.close() - - print("{} stopping MonitoredTrainingSession".format(datetime.now().isoformat())) - - # export model (on chief worker only) - if args.mode == "train" and task_index == 0: - tf.reset_default_graph() - - # add placeholders for input images (and optional labels) - x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name='x') - y_ = tf.placeholder(tf.float32, [None, 10], name='y_') - label = tf.argmax(y_, 1, name="label") - - # add core model - y, prediction = build_model(tf.get_default_graph(), x) - - # restore from last checkpoint - saver = tf.train.Saver() - with tf.Session() as sess: - ckpt = tf.train.get_checkpoint_state(model_dir) - print("ckpt: {}".format(ckpt)) - assert ckpt, "Invalid model checkpoint path: {}".format(model_dir) - saver.restore(sess, ckpt.model_checkpoint_path) - - print("Exporting saved_model to: {}".format(export_dir)) - # exported signatures defined in code - signatures = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { - 'inputs': { 'image': x }, - 'outputs': { 'prediction': prediction }, - 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME - } - } - TFNode.export_saved_model(sess, - export_dir, - tf.saved_model.tag_constants.SERVING, - signatures) - print("Exported saved_model") - - # WORKAROUND for 
https://github.com/tensorflow/tensorflow/issues/21745 - # wait for all other nodes to complete (via done files) - done_dir = "{}/{}/done".format(ctx.absolute_path(args.model), args.mode) - print("Writing done file to: {}".format(done_dir)) - tf.gfile.MakeDirs(done_dir) - with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file: - done_file.write("done") - - for i in range(60): - if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']): - print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i)) - time.sleep(1) - else: - print("{} All nodes done".format(datetime.now().isoformat())) - break diff --git a/examples/mnist/tf/mnist_inference.py b/examples/mnist/tf/mnist_inference.py deleted file mode 100644 index 13e92e30..00000000 --- a/examples/mnist/tf/mnist_inference.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2018 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -# This example demonstrates how to leverage Spark for parallel inferencing from a SavedModel. -# -# Normally, you can use TensorFlowOnSpark to just form a TensorFlow cluster for training and inferencing. -# However, in some situations, you may have a SavedModel without the original code for defining the inferencing -# graph. In these situations, we can use Spark to instantiate a single-node TensorFlow instance on each executor, -# where each executor can independently load the model and inference on input data. -# -# Note: this particular example demonstrates use of `tf.data.Dataset` to read the input data for inferencing, -# but it could also be adapted to just use an RDD of TFRecords from Spark. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import tensorflow as tf - -IMAGE_PIXELS = 28 - - -def inference(it, num_workers, args): - from tensorflowonspark import util - - # consume worker number from RDD partition iterator - for i in it: - worker_num = i - print("worker_num: {}".format(i)) - - # setup env for single-node TF - util.single_node_env() - - # load saved_model using default tag and signature - sess = tf.Session() - tf.saved_model.loader.load(sess, ['serve'], args.export) - - # parse function for TFRecords - def parse_tfr(example_proto): - feature_def = {"label": tf.FixedLenFeature(10, tf.int64), - "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)} - features = tf.parse_single_example(example_proto, feature_def) - norm = tf.constant(255, dtype=tf.float32, shape=(784,)) - image = tf.div(tf.to_float(features['image']), norm) - label = tf.to_float(features['label']) - return (image, label) - - # define a new tf.data.Dataset (for inferencing) - ds = tf.data.Dataset.list_files("{}/part-*".format(args.images_labels)) - ds = ds.shard(num_workers, worker_num) - ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1) - ds = ds.map(parse_tfr).batch(10) - iterator = ds.make_one_shot_iterator() - image_label = iterator.get_next(name='inf_image') - - # create an output file per spark worker for the predictions - tf.gfile.MakeDirs(args.output) - output_file = tf.gfile.GFile("{}/part-{:05d}".format(args.output, worker_num), mode='w') - - while True: - try: - # get images and labels from tf.data.Dataset - img, lbl = sess.run(['inf_image:0', 'inf_image:1']) - - # inference by feeding these images and labels into the input tensors - # you can view the exported model signatures via: - # saved_model_cli show --dir --all - - # note that we feed directly into the graph tensors (bypassing the exported signatures) - # these tensors will be shown in the "name" field of 
the signature definitions - # also note that we can feed/fetch tensors that were not explicitly exported, e.g. `y_` and `label:0` - - labels, preds = sess.run(['label:0', 'prediction:0'], feed_dict={'x:0': img, 'y_:0': lbl}) - for i in range(len(labels)): - output_file.write("{} {}\n".format(labels[i], preds[i])) - except tf.errors.OutOfRangeError: - break - - output_file.close() - - -if __name__ == '__main__': - from pyspark.context import SparkContext - from pyspark.conf import SparkConf - - sc = SparkContext(conf=SparkConf().setAppName("mnist_inference")) - executors = sc._conf.get("spark.executor.instances") - num_executors = int(executors) if executors is not None else 1 - - parser = argparse.ArgumentParser() - parser.add_argument("--cluster_size", help="number of nodes in the cluster (for S with labelspark Standalone)", type=int, default=num_executors) - parser.add_argument('--images_labels', type=str, help='Directory for input images with labels') - parser.add_argument("--export", help="HDFS path to export model", type=str, default="mnist_export") - parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions") - args, _ = parser.parse_known_args() - print("args: {}".format(args)) - - # Not using TFCluster... just running single-node TF instances on each executor - nodes = list(range(args.cluster_size)) - nodeRDD = sc.parallelize(list(range(args.cluster_size)), args.cluster_size) - nodeRDD.foreachPartition(lambda worker_num: inference(worker_num, args.cluster_size, args)) diff --git a/examples/mnist/tf/mnist_spark.py b/examples/mnist/tf/mnist_spark.py deleted file mode 100644 index 83ce77fe..00000000 --- a/examples/mnist/tf/mnist_spark.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf - -import argparse -from datetime import datetime - -from tensorflowonspark import TFCluster -import mnist_dist - -sc = SparkContext(conf=SparkConf().setAppName("mnist_tf")) -executors = sc._conf.get("spark.executor.instances") -num_executors = int(executors) if executors is not None else 1 -num_ps = 1 - -parser = argparse.ArgumentParser() -parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) -parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors) -parser.add_argument("--driver_ps_nodes", help="""run tensorflow PS node on driver locally. - You will need to set cluster_size = num_executors + num_ps""", default=False) -parser.add_argument("--epochs", help="number of epochs", type=int, default=1) -parser.add_argument("--export", help="HDFS path to export model", type=str, default="mnist_export") -parser.add_argument("--format", help="example format: (csv2|tfr)", choices=["csv2", "tfr"], default="tfr") -parser.add_argument("--images_labels", help="HDFS path to MNIST image_label files in parallelized format") -parser.add_argument("--mode", help="train|inference", default="train") -parser.add_argument("--model", help="HDFS path to save/load model during train/test", default="mnist_model") -parser.add_argument("--num_ps", help="number of ps nodes", default=1) -parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") -parser.add_argument("--rdma", help="use rdma connection", default=False) -parser.add_argument("--readers", help="number of reader/enqueue threads per worker", type=int, default=10) -parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000) -parser.add_argument("--steps", 
help="maximum number of steps", type=int, default=1000) -parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") -args = parser.parse_args() -print("args:", args) - -print("{0} ===== Start".format(datetime.now().isoformat())) -cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, args.num_ps, args.tensorboard, - TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes) -cluster.shutdown() -print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/tensorflowonspark/TFNode.py b/tensorflowonspark/TFNode.py old mode 100755 new mode 100644 index 93094d7a..82754f7b --- a/tensorflowonspark/TFNode.py +++ b/tensorflowonspark/TFNode.py @@ -16,8 +16,6 @@ import getpass import logging -import os -import time from six.moves.queue import Empty from . import marker @@ -61,79 +59,8 @@ def hdfs_path(ctx, path): def start_cluster_server(ctx, num_gpus=1, rdma=False): - """Function that wraps the creation of TensorFlow ``tf.train.Server`` for a node in a distributed TensorFlow cluster. - - This is intended to be invoked from within the TF ``map_fun``, replacing explicit code to instantiate ``tf.train.ClusterSpec`` - and ``tf.train.Server`` objects. - - Args: - :ctx: TFNodeContext containing the metadata specific to this node in the cluster. - :num_gpu: number of GPUs desired - :rdma: boolean indicating if RDMA 'iverbs' should be used for cluster communications. - - Returns: - A tuple of (cluster_spec, server) - """ - import tensorflow as tf - from . 
import gpu_info - - logging.info("{0}: ======== {1}:{2} ========".format(ctx.worker_num, ctx.job_name, ctx.task_index)) - cluster_spec = ctx.cluster_spec - logging.info("{0}: Cluster spec: {1}".format(ctx.worker_num, cluster_spec)) - - if tf.test.is_built_with_cuda() and num_gpus > 0: - # compute my index relative to other nodes placed on the same host (for GPU allocation) - my_addr = cluster_spec[ctx.job_name][ctx.task_index] - my_host = my_addr.split(':')[0] - flattened = [v for sublist in cluster_spec.values() for v in sublist] - local_peers = [p for p in flattened if p.startswith(my_host)] - my_index = local_peers.index(my_addr) - - # GPU - gpu_initialized = False - retries = 3 - while not gpu_initialized and retries > 0: - try: - # override PS jobs to only reserve one GPU - if ctx.job_name == 'ps': - num_gpus = 1 - - # Find a free gpu(s) to use - gpus_to_use = gpu_info.get_gpus(num_gpus, my_index) - gpu_prompt = "GPU" if num_gpus == 1 else "GPUs" - logging.info("{0}: Using {1}: {2}".format(ctx.worker_num, gpu_prompt, gpus_to_use)) - - # Set GPU device to use for TensorFlow - os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use - - # Create a cluster from the parameter server and worker hosts. - cluster = tf.train.ClusterSpec(cluster_spec) - - # Create and start a server for the local task. - if rdma: - server = tf.train.Server(cluster, ctx.job_name, ctx.task_index, protocol="grpc+verbs") - else: - server = tf.train.Server(cluster, ctx.job_name, ctx.task_index) - gpu_initialized = True - except Exception as e: - print(e) - logging.error("{0}: Failed to allocate GPU, trying again...".format(ctx.worker_num)) - retries -= 1 - time.sleep(10) - if not gpu_initialized: - raise Exception("Failed to allocate GPU") - else: - # CPU - os.environ['CUDA_VISIBLE_DEVICES'] = '' - logging.info("{0}: Using CPU".format(ctx.worker_num)) - - # Create a cluster from the parameter server and worker hosts. 
- cluster = tf.train.ClusterSpec(cluster_spec) - - # Create and start a server for the local task. - server = tf.train.Server(cluster, ctx.job_name, ctx.task_index) - - return (cluster, server) + """*DEPRECATED*. Use higher-level APIs like `tf.keras` or `tf.estimator`""" + raise Exception("DEPRECATED: Use higher-level APIs like `tf.keras` or `tf.estimator`") def next_batch(mgr, batch_size, qname='input'): @@ -142,49 +69,8 @@ def next_batch(mgr, batch_size, qname='input'): def export_saved_model(sess, export_dir, tag_set, signatures): - """Convenience function to export a saved_model using provided arguments - - The caller specifies the saved_model signatures in a simplified python dictionary form, as follows:: - - signatures = { - 'signature_def_key': { - 'inputs': { 'input_tensor_alias': input_tensor_name }, - 'outputs': { 'output_tensor_alias': output_tensor_name }, - 'method_name': 'method' - } - } - - And this function will generate the `signature_def_map` and export the saved_model. - - Args: - :sess: a tf.Session instance - :export_dir: path to save exported saved_model - :tag_set: string tag_set to identify the exported graph - :signatures: simplified dictionary representation of a TensorFlow signature_def_map - - Returns: - A saved_model exported to disk at ``export_dir``. 
- """ - import tensorflow as tf - g = sess.graph - g._unsafe_unfinalize() # https://github.com/tensorflow/serving/issues/363 - builder = tf.saved_model.builder.SavedModelBuilder(export_dir) - - logging.info("===== signatures: {}".format(signatures)) - signature_def_map = {} - for key, sig in signatures.items(): - signature_def_map[key] = tf.saved_model.signature_def_utils.build_signature_def( - inputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['inputs'].items()}, - outputs={name: tf.saved_model.utils.build_tensor_info(tensor) for name, tensor in sig['outputs'].items()}, - method_name=sig['method_name'] if 'method_name' in sig else key) - logging.info("===== signature_def_map: {}".format(signature_def_map)) - builder.add_meta_graph_and_variables( - sess, - tag_set.split(','), - signature_def_map=signature_def_map, - clear_devices=True) - g.finalize() - builder.save() + """*DEPRECATED*. Use TF provided APIs instead.""" + raise Exception("DEPRECATED: Use TF provided APIs instead.") def batch_results(mgr, results, qname='output'): From c8b47fa1fd2690824307ae74517e69e641712443 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 8 Aug 2019 10:01:03 -0700 Subject: [PATCH 07/37] add InputMode.SPARK version of keras --- examples/mnist/keras/mnist_spark.py | 99 +++++++++++++++++++ .../keras/{mnist_keras_2.0.py => mnist_tf.py} | 0 2 files changed, 99 insertions(+) create mode 100644 examples/mnist/keras/mnist_spark.py rename examples/mnist/keras/{mnist_keras_2.0.py => mnist_tf.py} (100%) diff --git a/examples/mnist/keras/mnist_spark.py b/examples/mnist/keras/mnist_spark.py new file mode 100644 index 00000000..09a7fc0f --- /dev/null +++ b/examples/mnist/keras/mnist_spark.py @@ -0,0 +1,99 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_keras + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main_fun(args, ctx): + import numpy as np + import tensorflow as tf 
+ from tensorflowonspark import TFNode + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + def build_and_compile_cnn_model(): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile( + loss=tf.keras.losses.sparse_categorical_crossentropy, + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=['accuracy']) + return model + + # single node + # single_worker_model = build_and_compile_cnn_model() + # single_worker_model.fit(x=train_datasets, epochs=3) + + tf_feed = TFNode.DataFeed(ctx.mgr, False) + + def rdd_generator(): + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(1) + if len(batch) > 0: + example = batch[0] + image = np.array(example[0]).astype(np.float32) / 255.0 + image = np.reshape(image, (28, 28, 1)) + label = np.array(example[1]).astype(np.float32) + label = np.reshape(label, (1,)) + yield (image, label) + else: + return + + ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([28, 28, 1]), tf.TensorShape([1]))) + ds = ds.batch(args.batch_size) + + # this fails + # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)] + tf.io.gfile.makedirs(args.model_dir) + filepath = args.model_dir + "/weights-{epoch:04d}" + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True)] + + with strategy.scope(): + multi_worker_model = build_and_compile_cnn_model() + multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, callbacks=callbacks) + + tf_feed.terminate() + + if ctx.job_name == 'chief': + # multi_worker_model.save(args.model_dir, save_format='tf') + tf.keras.experimental.export_saved_model(multi_worker_model, 
args.export_dir) + + +if __name__ == '__main__': + import argparse + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + + sc = SparkContext(conf=SparkConf().setAppName("mnist_keras")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") + parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") + parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + # create RDD of input data + def parse(ln): + vec = [int(x) for x in ln.split(',')] + return (vec[1:], vec[0]) + images_labels = sc.textFile(args.images_labels).map(parse) + + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, master_node='chief') + # Note: need to feed extra data to ensure that each worker receives sufficient data to complete epochs + # to compensate for variability in partition sizes and spark scheduling + cluster.train(images_labels, args.epochs + 1) + cluster.shutdown() diff --git a/examples/mnist/keras/mnist_keras_2.0.py b/examples/mnist/keras/mnist_tf.py similarity index 100% rename from 
examples/mnist/keras/mnist_keras_2.0.py rename to examples/mnist/keras/mnist_tf.py From 383bc65adcf93ed39da9ac6d3003cf0c2174bd88 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 8 Aug 2019 15:22:58 -0700 Subject: [PATCH 08/37] update mnist keras readme --- examples/mnist/keras/README.md | 70 ++++++++++++++++------------- examples/mnist/keras/mnist_spark.py | 10 +++-- examples/mnist/keras/mnist_tf.py | 4 +- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index 366dab24..319a4c3f 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -1,12 +1,11 @@ # MNIST using Keras -Original Source: https://github.com/fchollet/keras/blob/master/examples/mnist_mlp.py +Original Source: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_keras -This is the MNIST Multi Layer Perceptron example from the [Keras examples](https://github.com/fchollet/keras/blob/master/examples), adapted for the `tf.estimator` API and TensorFlowOnSpark. +This is the [Multi-worker Training with Keras](https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_keras) example, adapted for TensorFlowOnSpark. Notes: - This example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed. -- InputMode.SPARK only supports feeding data from a single RDD, so the validation dataset/code is disabled in the corresponding example. #### Launch the Spark Standalone cluster @@ -18,74 +17,81 @@ Notes: ${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} -#### Run MNIST MLP using InputMode.TENSORFLOW +#### Run using InputMode.TENSORFLOW In this mode, each worker will load the entire MNIST dataset into memory (automatically downloading the dataset if needed). 
# remove any old artifacts rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export # train and validate ${SPARK_HOME}/bin/spark-submit \ --master ${MASTER} \ --conf spark.cores.max=${TOTAL_CORES} \ --conf spark.task.cpus=${CORES_PER_WORKER} \ - --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ - ${TFoS_HOME}/examples/mnist/keras/mnist_mlp_estimator.py \ + ${TFoS_HOME}/examples/mnist/keras/mnist_tf.py \ --cluster_size ${SPARK_WORKER_INSTANCES} \ - --input_mode tf \ --model_dir ${TFoS_HOME}/mnist_model \ - --epochs 5 \ - --tensorboard + --export_dir ${TFoS_HOME}/mnist_export -#### Run MNIST MLP using InputMode.SPARK -In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the two workers will see roughly half of the dataset per epoch. Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used. +#### Run using InputMode.SPARK + +In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the workers will see only a portion of the dataset per epoch. Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used. 
# Convert the MNIST zip files into CSV (if not already done) cd ${TFoS_HOME} ${SPARK_HOME}/bin/spark-submit \ --master ${MASTER} \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ ${TFoS_HOME}/examples/mnist/mnist_data_setup.py \ - --output ${TFoS_HOME}/mnist/csv \ - --format csv + --output ${TFoS_HOME}/data/mnist # confirm that data was generated - ls -lR ${TFoS_HOME}/mnist/csv + ls -lR ${TFoS_HOME}/data/mnist/csv # remove any old artifacts rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export # train ${SPARK_HOME}/bin/spark-submit \ --master ${MASTER} \ --conf spark.cores.max=${TOTAL_CORES} \ --conf spark.task.cpus=${CORES_PER_WORKER} \ - --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ - ${TFoS_HOME}/examples/mnist/keras/mnist_mlp_estimator.py \ + ${TFoS_HOME}/examples/mnist/keras/mnist_spark.py \ --cluster_size ${SPARK_WORKER_INSTANCES} \ - --input_mode spark \ - --images ${TFoS_HOME}/mnist/csv/train/images \ - --labels ${TFoS_HOME}/mnist/csv/train/labels \ - --epochs 5 \ + --images_labels ${TFoS_HOME}/data/mnist/csv/train \ --model_dir ${TFoS_HOME}/mnist_model \ - --tensorboard + --export_dir ${TFoS_HOME}/mnist_export #### Inference via saved_model_cli The training code will automatically export a TensorFlow SavedModel, which can be used with the `saved_model_cli` from the command line, as follows: # path to the SavedModel export - export SAVED_MODEL=${TFoS_HOME}/mnist_model/export/serving/* + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} # use a CSV formatted test example - IMG=$(head -n 1 $TFoS_HOME/examples/mnist/csv/test/images/part-00000) + # converting from a flat list of 784 digits to a json array (28, 28, 1) + cat <<EOF >reshape.py + import sys + import numpy as np + vec = [int(x) for x in next(sys.stdin).split(',')] + img = np.reshape(vec[1:], (28, 28, 1)) + print(np.array2string(img).replace('\n ', ',')) + EOF + + 
IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python reshape.py) # introspect model saved_model_cli show --dir $SAVED_MODEL --all # inference via saved_model_cli - saved_model_cli run --dir $SAVED_MODEL --tag_set serve --signature_def serving_default --input_exp "dense_input=[[$IMG]]" + saved_model_cli run --dir $SAVED_MODEL --tag_set serve --signature_def serving_default --input_exp "conv2d_input=[$IMG]" #### Inference via TF-Serving @@ -94,7 +100,7 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct # Start the TF-Serving instance in a docker container docker pull tensorflow/serving - docker run -t --rm -p 8501:8501 -v "${TFoS_HOME}/mnist_model/export/serving:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving & + docker run -t --rm -p 8501:8501 -v "${MODEL_BASE}:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving & # GET model status curl http://localhost:8501/v1/models/mnist @@ -103,7 +109,7 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct curl http://localhost:8501/v1/models/mnist/metadata # POST example for inferencing - curl -v -d "{\"instances\": [ {\"dense_input\": [$IMG] } ]}" -X POST http://localhost:8501/v1/models/mnist:predict + curl -v -d "{\"instances\": [ {\"conv2d_input\": $IMG } ]}" -X POST http://localhost:8501/v1/models/mnist:predict # Stop the TF-Serving container docker stop $(docker ps -q) @@ -117,12 +123,16 @@ For batch inferencing use cases, you can use Spark to run multiple single-node T # inference ${SPARK_HOME}/bin/spark-submit \ - --master $MASTER ${TFoS_HOME}/examples/mnist/keras/mnist_inference.py \ - --cluster_size 3 \ - --images_labels ${TFoS_HOME}/mnist/tfr/test \ - --export ${TFoS_HOME}/mnist_model/export/serving/* \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + ${TFoS_HOME}/examples/mnist/keras/mnist_inference.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels 
${TFoS_HOME}/data/mnist/tfr/test \ + --export_dir ${TFoS_HOME}/mnist_export \ --output ${TFoS_HOME}/predictions + #### Shutdown the Spark Standalone cluster ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh diff --git a/examples/mnist/keras/mnist_spark.py b/examples/mnist/keras/mnist_spark.py index 09a7fc0f..071939ab 100644 --- a/examples/mnist/keras/mnist_spark.py +++ b/examples/mnist/keras/mnist_spark.py @@ -56,11 +56,13 @@ def rdd_generator(): multi_worker_model = build_and_compile_cnn_model() multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, callbacks=callbacks) - tf_feed.terminate() - if ctx.job_name == 'chief': + from tensorflow_estimator.python.estimator.export import export_lib + export_dir = export_lib.get_timestamped_export_dir(args.export_dir) + tf.keras.experimental.export_saved_model(multi_worker_model, export_dir) # multi_worker_model.save(args.model_dir, save_format='tf') - tf.keras.experimental.export_saved_model(multi_worker_model, args.export_dir) + + tf_feed.terminate() if __name__ == '__main__': @@ -78,7 +80,7 @@ def rdd_generator(): parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) parser.add_argument("--epochs", help="number of epochs", type=int, default=3) parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") - parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") diff --git a/examples/mnist/keras/mnist_tf.py b/examples/mnist/keras/mnist_tf.py index 2a6df3d1..ecfc9d8a 
100644 --- a/examples/mnist/keras/mnist_tf.py +++ b/examples/mnist/keras/mnist_tf.py @@ -61,8 +61,10 @@ def build_and_compile_cnn_model(): multi_worker_model.fit(x=train_datasets, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, callbacks=callbacks) if ctx.job_name == 'chief': + from tensorflow_estimator.python.estimator.export import export_lib + export_dir = export_lib.get_timestamped_export_dir(args.export_dir) + tf.keras.experimental.export_saved_model(multi_worker_model, export_dir) # multi_worker_model.save(args.model_dir, save_format='tf') - tf.keras.experimental.export_saved_model(multi_worker_model, args.export_dir) if __name__ == '__main__': From 9ff6a8d09c09e831ed300cec7928b809ac37880f Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 14 Aug 2019 13:59:36 -0700 Subject: [PATCH 09/37] add support for InputMode.SPARK; add TFNodeContext.num_workers --- examples/mnist/estimator/README.md | 120 ++++++++++++-- examples/mnist/estimator/mnist_spark.py | 155 ++++++++++++++++++ .../{mnist_estimator_2.0.py => mnist_tf.py} | 6 +- examples/mnist/keras/README.md | 3 +- examples/mnist/keras/mnist_spark.py | 9 +- tensorflowonspark/TFSparkNode.py | 1 + 6 files changed, 274 insertions(+), 20 deletions(-) create mode 100644 examples/mnist/estimator/mnist_spark.py rename examples/mnist/estimator/{mnist_estimator_2.0.py => mnist_tf.py} (97%) diff --git a/examples/mnist/estimator/README.md b/examples/mnist/estimator/README.md index a35a7051..f9a8386b 100644 --- a/examples/mnist/estimator/README.md +++ b/examples/mnist/estimator/README.md @@ -1,12 +1,10 @@ -# MNIST using tf.estimator with tf.layers +# MNIST using Estimator -Original Source: https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/layers/cnn_mnist.py +Original Source: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator -This is the `tf.estimator` version of MNIST from TensorFlow's [tutorial on layers and 
estimators](https://www.tensorflow.org/versions/master/tutorials/layers), adapted for TensorFlowOnSpark. +This is the [Multi-worker Training with Estimator](https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator) example, adapted for TensorFlowOnSpark. -Notes: -- This example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed. -- To minimize code changes, this example uses InputMode.TENSORFLOW. +Note: this example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed. #### Launch the Spark Standalone cluster @@ -14,29 +12,125 @@ Notes: export SPARK_WORKER_INSTANCES=3 export CORES_PER_WORKER=1 export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) - export TFoS_HOME= + export TFoS_HOME= ${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} -#### Run MNIST using InputMode.TENSORFLOW +#### Run using InputMode.TENSORFLOW In this mode, each worker will load the entire MNIST dataset into memory (automatically downloading the dataset if needed). # remove any old artifacts rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export # train and validate ${SPARK_HOME}/bin/spark-submit \ --master ${MASTER} \ --conf spark.cores.max=${TOTAL_CORES} \ --conf spark.task.cpus=${CORES_PER_WORKER} \ - --conf spark.task.maxFailures=1 \ - --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ - ${TFoS_HOME}/examples/mnist/estimator/mnist_estimator.py \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_tf.py \ --cluster_size ${SPARK_WORKER_INSTANCES} \ - --model ${TFoS_HOME}/mnist_model + --model_dir ${TFoS_HOME}/mnist_model \ + --export_dir ${TFoS_HOME}/mnist_export + +#### Run using InputMode.SPARK + +In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the workers will see only a portion of the dataset per epoch. 
Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used. + + # Convert the MNIST zip files into CSV (if not already done) + cd ${TFoS_HOME} + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/mnist_data_setup.py \ + --output ${TFoS_HOME}/data/mnist + + # confirm that data was generated + ls -lR ${TFoS_HOME}/data/mnist/csv + + # remove any old artifacts + rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export + + # train + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_spark.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/csv/train \ + --model_dir ${TFoS_HOME}/mnist_model \ + --export_dir ${TFoS_HOME}/mnist_export + +#### Inference via saved_model_cli + +The training code will automatically export a TensorFlow SavedModel, which can be used with the `saved_model_cli` from the command line, as follows: + + # path to the SavedModel export + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} + + # use a CSV formatted test example + # converting from a flat list of 784 digits to a json array (28, 28, 1) + cat <<EOF >reshape.py + import sys + import numpy as np + vec = [int(x) for x in next(sys.stdin).split(',')] + img = np.reshape(vec[1:], (28, 28, 1)) + print(np.array2string(img).replace('\n ', ',')) + EOF + + IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python reshape.py) + + # introspect model + saved_model_cli show --dir $SAVED_MODEL --all + + # inference via saved_model_cli + saved_model_cli run --dir $SAVED_MODEL --tag_set serve --signature_def serving_default --input_exp "features=[$IMG]" + +####
Inference via TF-Serving + +For online inferencing use cases, you can serve the SavedModel via a TensorFlow Serving instance as follows. Note that TF-Serving provides both GRPC and REST APIs, but we will only +demonstrate the use of the REST API. Also, [per the TensorFlow Serving instructions](https://www.tensorflow.org/tfx/serving/docker), we will run the serving instance inside a Docker container. + + # Start the TF-Serving instance in a docker container + docker pull tensorflow/serving + docker run -t --rm -p 8501:8501 -v "${MODEL_BASE}:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving & + + # GET model status + curl http://localhost:8501/v1/models/mnist + + # GET model metadata + curl http://localhost:8501/v1/models/mnist/metadata + + # POST example for inferencing + curl -v -d "{\"instances\": [ {\"features\": $IMG } ]}" -X POST http://localhost:8501/v1/models/mnist:predict + + # Stop the TF-Serving container + docker stop $(docker ps -q) + +#### Run Parallel Inferencing via Spark + +For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor. 
+ + # remove any old artifacts + rm -Rf ${TFoS_HOME}/predictions + + # inference + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_inference.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/tfr/test \ + --export_dir ${SAVED_MODEL} \ + --output ${TFoS_HOME}/predictions + #### Shutdown the Spark Standalone cluster ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh - diff --git a/examples/mnist/estimator/mnist_spark.py b/examples/mnist/estimator/mnist_spark.py new file mode 100644 index 00000000..f75bf2ec --- /dev/null +++ b/examples/mnist/estimator/mnist_spark.py @@ -0,0 +1,155 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main(args, ctx): + import numpy as np + import tensorflow as tf + import tensorflow_datasets as tfds + from tensorflowonspark import TFNode + + tfds.disable_progress_bar() + + class StopFeedHook(tf.estimator.SessionRunHook): + """SessionRunHook to terminate InputMode.SPARK RDD feeding if the training loop exits before the entire RDD is consumed.""" + + def __init__(self, feed): + self.feed = feed + + def end(self, session): + self.feed.terminate() + self.feed.next_batch(1) + + BATCH_SIZE = args.batch_size + LEARNING_RATE = args.learning_rate + + tf_feed = TFNode.DataFeed(ctx.mgr) + + def rdd_generator(): + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(1) + if len(batch) > 0: + example = batch[0] + image = np.array(example[0]).astype(np.float32) / 255.0 + image = np.reshape(image, (28, 28, 1)) + label = np.array(example[1]).astype(np.float32) + label = np.reshape(label, (1,)) + yield (image, label) + else: + return + + def input_fn(mode, input_context=None): + if mode == 
tf.estimator.ModeKeys.TRAIN: + # Note: Spark is responsible for sharding/repeating/shuffling the data via RDD + ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([28, 28, 1]), tf.TensorShape([1]))) + return ds.batch(BATCH_SIZE) + else: + raise Exception("I'm evaluating: mode={}, input_context={}".format(mode, input_context)) + + def scale(image, label): + image = tf.cast(image, tf.float32) / 255.0 + return image, label + + mnist = tfds.load(name='mnist', with_info=True, as_supervised=True) + ds = mnist['test'] + if input_context: + ds = ds.shard(input_context.num_input_pipelines, input_context.input_pipeline_id) + return ds.map(scale).batch(BATCH_SIZE) + + def serving_input_receiver_fn(): + features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features') + receiver_tensors = {'features': features} + return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors) + + def model_fn(features, labels, mode): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + logits = model(features, training=False) + + if mode == tf.estimator.ModeKeys.PREDICT: + predictions = {'logits': logits} + return tf.estimator.EstimatorSpec(mode, predictions=predictions) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate=LEARNING_RATE) + loss = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits) + loss = tf.reduce_sum(input_tensor=loss) * (1. 
/ BATCH_SIZE) + if mode == tf.estimator.ModeKeys.EVAL: + return tf.estimator.EstimatorSpec(mode, loss=loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + loss=loss, + train_op=optimizer.minimize( + loss, tf.compat.v1.train.get_or_create_global_step())) + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=100) + + classifier = tf.estimator.Estimator( + model_fn=model_fn, model_dir=args.model_dir, config=config) + + # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn) + + # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous, + # so we need to ensure that all workers complete training before any of them run out of data from the RDD. + # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers, + # we'll just stop training at 90% of the total expected number of steps. 
+ steps = 60000 * args.epochs / args.batch_size + steps_per_worker = steps / ctx.num_workers + max_steps_per_worker = steps_per_worker * 0.9 + + tf.estimator.train_and_evaluate( + classifier, + train_spec=tf.estimator.TrainSpec(input_fn=input_fn, max_steps=max_steps_per_worker, hooks=[StopFeedHook(tf_feed)]), + eval_spec=tf.estimator.EvalSpec(input_fn=input_fn) + # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter) + ) + + if ctx.job_name == 'chief': + print("Exporting saved_model to {}".format(args.export_dir)) + classifier.export_saved_model(args.export_dir, serving_input_receiver_fn) + + +if __name__ == "__main__": + + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + import argparse + + sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") + parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-3) + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + # create RDD of input data + def parse(ln): + vec = [int(x) for 
x in ln.split(',')] + return (vec[1:], vec[0]) + + images_labels = sc.textFile(args.images_labels).map(parse) + + cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') + cluster.train(images_labels, args.epochs) + cluster.shutdown(grace_secs=120) # allow time for the chief to export model after data feeding diff --git a/examples/mnist/estimator/mnist_estimator_2.0.py b/examples/mnist/estimator/mnist_tf.py similarity index 97% rename from examples/mnist/estimator/mnist_estimator_2.0.py rename to examples/mnist/estimator/mnist_tf.py index 75d823c8..a827ba02 100644 --- a/examples/mnist/estimator/mnist_estimator_2.0.py +++ b/examples/mnist/estimator/mnist_tf.py @@ -62,7 +62,7 @@ def model_fn(features, labels, mode): loss, tf.compat.v1.train.get_or_create_global_step())) strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - config = tf.estimator.RunConfig(train_distribute=strategy) + config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=100) classifier = tf.estimator.Estimator( model_fn=model_fn, model_dir=args.model_dir, config=config) @@ -97,8 +97,8 @@ def model_fn(features, labels, mode): parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--epochs", help="number of epochs", type=int, default=2) - parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-3) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-4) parser.add_argument("--model_dir", help="path to save checkpoint", 
default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index 319a4c3f..5dbf59cc 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -13,7 +13,7 @@ Notes: export SPARK_WORKER_INSTANCES=3 export CORES_PER_WORKER=1 export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) - export TFoS_HOME=<path to TensorFlowOnSpark> + export TFoS_HOME=<path/to/TensorFlowOnSpark> ${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} @@ -35,7 +35,6 @@ In this mode, each worker will load the entire MNIST dataset into memory (automa --model_dir ${TFoS_HOME}/mnist_model \ --export_dir ${TFoS_HOME}/mnist_export - #### Run using InputMode.SPARK In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the workers will see only a portion of the dataset per epoch. Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used.
diff --git a/examples/mnist/keras/mnist_spark.py b/examples/mnist/keras/mnist_spark.py index 071939ab..1514d7ac 100644 --- a/examples/mnist/keras/mnist_spark.py +++ b/examples/mnist/keras/mnist_spark.py @@ -54,7 +54,11 @@ def rdd_generator(): with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() - multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, callbacks=callbacks) + + steps_per_epoch = 60000 / args.batch_size + steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers + + multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch_per_worker, callbacks=callbacks) if ctx.job_name == 'chief': from tensorflow_estimator.python.estimator.export import export_lib @@ -62,6 +66,7 @@ def rdd_generator(): tf.keras.experimental.export_saved_model(multi_worker_model, export_dir) # multi_worker_model.save(args.model_dir, save_format='tf') + # terminating feed tells spark to skip processing further partitions tf_feed.terminate() @@ -82,7 +87,6 @@ def rdd_generator(): parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") - parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() @@ -92,6 +96,7 @@ def rdd_generator(): def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) + images_labels = sc.textFile(args.images_labels).map(parse) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, master_node='chief') diff --git a/tensorflowonspark/TFSparkNode.py b/tensorflowonspark/TFSparkNode.py index 
8ae22ea6..948ad95e 100755 --- a/tensorflowonspark/TFSparkNode.py +++ b/tensorflowonspark/TFSparkNode.py @@ -50,6 +50,7 @@ def __init__(self, executor_id, job_name, task_index, cluster_spec, defaultFS, w self.job_name = job_name self.task_index = task_index self.cluster_spec = cluster_spec + self.num_workers = sum([len(v) for k, v in cluster_spec.items() if k == 'master' or k == 'chief' or k == 'worker']) self.defaultFS = defaultFS self.working_dir = working_dir self.mgr = mgr From d15fc4c01df3acbdebc6b88c48bea6a0c5efcab6 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 14 Aug 2019 14:15:23 -0700 Subject: [PATCH 10/37] remove more low-level API examples --- examples/mnist/spark/mnist_dist_pipeline.py | 176 ------------------ examples/mnist/spark/mnist_spark_pipeline.py | 135 -------------- examples/mnist/tf/README.md | 73 -------- examples/mnist/tf/mnist_dist_pipeline.py | 185 ------------------- examples/mnist/tf/mnist_spark_pipeline.py | 133 ------------- 5 files changed, 702 deletions(-) delete mode 100755 examples/mnist/spark/mnist_dist_pipeline.py delete mode 100644 examples/mnist/spark/mnist_spark_pipeline.py delete mode 100644 examples/mnist/tf/README.md delete mode 100644 examples/mnist/tf/mnist_dist_pipeline.py delete mode 100644 examples/mnist/tf/mnist_spark_pipeline.py diff --git a/examples/mnist/spark/mnist_dist_pipeline.py b/examples/mnist/spark/mnist_dist_pipeline.py deleted file mode 100755 index 9ab5f5db..00000000 --- a/examples/mnist/spark/mnist_dist_pipeline.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -# Distributed MNIST on grid based on TensorFlow MNIST example - -from __future__ import absolute_import -from __future__ import division -from __future__ import nested_scopes -from __future__ import print_function - - -def print_log(worker_num, arg): - print("{0}: {1}".format(worker_num, arg)) - - -def map_fun(args, ctx): - from tensorflowonspark import TFNode - from datetime import datetime - import math - import numpy - import tensorflow as tf - import time - - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - IMAGE_PIXELS = 28 - - # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) - if job_name == "ps": - time.sleep((worker_num + 1) * 5) - - # Parameters - hidden_units = 128 - batch_size = args.batch_size - - # Get TF cluster and server instances - cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma') - - def feed_dict(batch): - # Convert from dict of named arrays to two numpy arrays of the proper type - images = batch['image'] - labels = batch['label'] - xs = numpy.array(images) - xs = xs.astype(numpy.float32) - xs = xs / 255.0 - ys = numpy.array(labels) - ys = ys.astype(numpy.uint8) - return (xs, ys) - - if job_name == "ps": - server.join() - elif job_name == "worker": - - # Assigns ops to the local worker by default. 
- with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - tf.summary.histogram("softmax_weights", sm_w) - - # Placeholders or QueueRunner/Readers for input data - x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x") - y_ = tf.placeholder(tf.float32, [None, 10], name="y_") - - x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) - tf.summary.image("x_img", x_img) - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - - global_step = tf.Variable(0) - - loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) - tf.summary.scalar("loss", loss) - - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - # Test trained model - label = tf.argmax(y_, 1, name="label") - prediction = tf.argmax(y, 1, name="prediction") - correct_prediction = tf.equal(prediction, label) - - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") - tf.summary.scalar("acc", accuracy) - - saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - # Create a "supervisor", which oversees the training process and stores model state into HDFS - logdir = TFNode.hdfs_path(ctx, args.model_dir) - print("tensorflow model path: {0}".format(logdir)) - summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), 
graph=tf.get_default_graph()) - - sv = tf.train.Supervisor(is_chief=(task_index == 0), - logdir=logdir, - init_op=init_op, - summary_op=None, - saver=saver, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=10) - - # The supervisor takes care of session initialization, restoring from - # a checkpoint, and closing when done or an error occurs. - with sv.managed_session(server.target) as sess: - print("{0} session ready".format(datetime.now().isoformat())) - - # Loop until the supervisor shuts down or 1000000 steps have completed. - step = 0 - tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) - while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. - - # using feed_dict - batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) - feed = {x: batch_xs, y_: batch_ys} - - if len(batch_xs) > 0: - _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) - # print accuracy and save model checkpoint to HDFS every 100 steps - if (step % 100 == 0): - print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys}))) - - if sv.is_chief: - summary_writer.add_summary(summary, step) - - if sv.should_stop() or step >= args.steps: - tf_feed.terminate() - - if sv.is_chief and args.export_dir: - print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir)) - # exported signatures defined in code - signatures = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { - 'inputs': {'image': x}, - 'outputs': {'prediction': prediction}, - 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME - }, - 'featurize': { - 'inputs': {'image': x}, - 'outputs': {'features': hid}, - 'method_name': 'featurize' - } - } - 
TFNode.export_saved_model(sess, - args.export_dir, - tf.saved_model.tag_constants.SERVING, - signatures) - else: - # non-chief workers should wait for chief - while not sv.should_stop(): - print("Waiting for chief") - time.sleep(5) - - # Ask for all the services to stop. - print("{0} stopping supervisor".format(datetime.now().isoformat())) - sv.stop() diff --git a/examples/mnist/spark/mnist_spark_pipeline.py b/examples/mnist/spark/mnist_spark_pipeline.py deleted file mode 100644 index 2c3ad0a4..00000000 --- a/examples/mnist/spark/mnist_spark_pipeline.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.conf import SparkConf -from pyspark.context import SparkContext -from pyspark.sql import SparkSession - -import argparse -import sys -import tensorflow as tf -from datetime import datetime - -from tensorflowonspark import dfutil -from tensorflowonspark.pipeline import TFEstimator, TFModel -import mnist_dist_pipeline - -sc = SparkContext(conf=SparkConf().setAppName("mnist_spark")) -spark = SparkSession(sc) - -executors = sc._conf.get("spark.executor.instances") -num_executors = int(executors) if executors is not None else 1 -num_ps = 1 - -parser = argparse.ArgumentParser() - -# TFoS/cluster -parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) -parser.add_argument("--epochs", help="number of epochs", type=int, default=1) -parser.add_argument("--model_dir", help="HDFS path to save/load model during train/inference", type=str) -parser.add_argument("--export_dir", help="HDFS path to export model", type=str) -parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) -parser.add_argument("--num_ps", help="number of PS nodes in 
cluster", type=int, default=1) -parser.add_argument("--protocol", help="Tensorflow network protocol (grpc|rdma)", default="grpc") -parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) -parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - -# Spark input/output -parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv") -parser.add_argument("--images", help="HDFS path to MNIST images in parallelized format") -parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized format") -parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") - -# Execution Modes -parser.add_argument("--train", help="train a model using Estimator", action="store_true") -parser.add_argument("--inference_mode", help="type of inferencing (none|signature|direct|checkpoint)", choices=["none", "signature", "direct", "checkpoint"], default="none") -parser.add_argument("--inference_output", help="output of inferencing (predictions|features)", choices=["predictions", "features"], default="predictions") - -args = parser.parse_args() -print("args:", args) - -print("{0} ===== Start".format(datetime.now().isoformat())) - -if args.format == "tfr": - df = dfutil.loadTFRecords(sc, args.images) -elif args.format == "csv": - images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) - labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) - dataRDD = images.zip(labels) - df = spark.createDataFrame(dataRDD, ['image', 'label']) -else: - raise Exception("Unsupported format: {}".format(args.format)) - -# Pipeline API - -if args.train: - # train a model using Spark Estimator fitted to a DataFrame - print("{0} ===== Estimator.fit()".format(datetime.now().isoformat())) - # dummy tf args (from imagenet/inception example) - tf_args = {'initial_learning_rate': 0.045, 
'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94} - estimator = TFEstimator(mnist_dist_pipeline.map_fun, tf_args) \ - .setInputMapping({'image': 'image', 'label': 'label'}) \ - .setModelDir(args.model_dir) \ - .setExportDir(args.export_dir) \ - .setClusterSize(args.cluster_size) \ - .setNumPS(args.num_ps) \ - .setProtocol(args.protocol) \ - .setTensorboard(args.tensorboard) \ - .setEpochs(args.epochs) \ - .setBatchSize(args.batch_size) \ - .setSteps(args.steps) - model = estimator.fit(df) -else: - # use a previously trained/exported model - model = TFModel(args) \ - .setExportDir(args.export_dir) \ - .setBatchSize(args.batch_size) - -# NO INFERENCING -if args.inference_mode == 'none': - sys.exit(0) - -# INFER FROM TENSORFLOW CHECKPOINT -elif args.inference_mode == 'checkpoint': - model.setModelDir(args.model_dir) # load model from checkpoint at args.model_dir - model.setExportDir(None) - model.setInputMapping({'image': 'x'}) # map DataFrame 'image' column to the 'x' input tensor - if args.inference_output == 'predictions': - model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor to output DataFrame 'col_out' column - else: # args.inference_output == 'features': - model.setOutputMapping({'prediction': 'col_out', 'Relu': 'col_out2'}) # add 'Relu' output tensor to output DataFrame 'col_out2' column - -# INFER USING TENSORFLOW SAVED_MODEL WITH EXPORTED SIGNATURES -elif args.inference_mode == 'signature': - model.setModelDir(None) - model.setExportDir(args.export_dir) # load saved_model from args.export_dir - model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset - model.setInputMapping({'image': 'image'}) # map DataFrame 'image' column to the 'image' input tensor alias of signature - if args.inference_output == 'predictions': - model.setSignatureDefKey(tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY) # default signature def key, i.e. 
'predict' - model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor alias to output DataFrame 'col_out' column - else: # args.inference_output == 'features' - model.setSignatureDefKey('featurize') # custom signature def key - model.setOutputMapping({'features': 'col_out'}) # map 'features' output tensor alias to output DataFrame 'col_out' column - -# INFER USING TENSORFLOW SAVED_MODEL, IGNORING EXPORTED SIGNATURES -else: # args.inference_mode == 'direct': - model.setModelDir(None) - model.setExportDir(args.export_dir) # load saved_model from args.export_dir - model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset - model.setInputMapping({'image': 'x'}) # map DataFrame 'image' column to the 'x' input tensor - if args.inference_output == 'predictions': - model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor to output DataFrame 'col_out' column - else: # args.inference_output == 'features' - model.setOutputMapping({'prediction': 'col_out', 'Relu': 'col_out2'}) # add 'Relu' output tensor to output DataFrame 'col_out2' column - -print("{0} ===== Model.transform()".format(datetime.now().isoformat())) -preds = model.transform(df) -preds.write.json(args.output) - -print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/examples/mnist/tf/README.md b/examples/mnist/tf/README.md deleted file mode 100644 index 9f325bb4..00000000 --- a/examples/mnist/tf/README.md +++ /dev/null @@ -1,73 +0,0 @@ -## Running distributed MNIST training / inference - -### _using Dataset_ -```bash -# for CPU mode: -# export QUEUE=default -# remove references to $LIB_CUDA - -# hdfs dfs -rm -r mnist_model -# hdfs dfs -rm -r predictions - -${SPARK_HOME}/bin/spark-submit \ ---master yarn \ ---deploy-mode cluster \ ---queue ${QUEUE} \ ---num-executors 4 \ ---executor-memory 27G \ ---py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist.py \ ---conf 
spark.dynamicAllocation.enabled=false \ ---conf spark.yarn.maxAppAttempts=1 \ ---archives hdfs:///user/${USER}/Python.zip#Python \ ---conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \ ---driver-library-path=$LIB_CUDA \ -TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \ ---images_labels mnist/csv2/train \ ---format csv2 \ ---mode train \ ---model mnist_model - -# to use inference mode, change `--mode train` to `--mode inference` and add `--output predictions` -# one item in csv2 format is `image | label`, to use input data in TFRecord format, change `--format csv` to `--format tfr` -# to use infiniband, add `--rdma` -``` - -### _using Spark ML Pipeline_ -```bash -# for CPU mode: -# export QUEUE=default -# remove references to $LIB_CUDA - -# hdfs dfs -rm -r mnist_model -# hdfs dfs -rm -r mnist_export -# hdfs dfs -rm -r tfrecords -# hdfs dfs -rm -r predictions - -${SPARK_HOME}/bin/spark-submit \ ---master yarn \ ---deploy-mode cluster \ ---queue ${QUEUE} \ ---num-executors 4 \ ---executor-memory 27G \ ---jars hdfs:///user/${USER}/tensorflow-hadoop-1.0-SNAPSHOT.jar \ ---py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist_pipeline.py \ ---conf spark.dynamicAllocation.enabled=false \ ---conf spark.yarn.maxAppAttempts=1 \ ---archives hdfs:///user/${USER}/Python.zip#Python \ ---conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \ ---driver-library-path=$LIB_CUDA \ -TensorFlowOnSpark/examples/mnist/tf/mnist_spark_pipeline.py \ ---images mnist/csv/train/images \ ---labels mnist/csv/train/labels \ ---tfrecord_dir tfrecords \ ---format csv \ ---model_dir mnist_model \ ---export_dir mnist_export \ ---train \ ---inference_mode signature \ ---inference_output predictions - -# to use input data in TFRecord format, change `--format csv` to `--format tfr` -# tensorflow-hadoop-1.0-SNAPSHOT.jar is needed for transforming csv input to TFRecord -# `--tfrecord_dir` is needed for temporarily saving dataframe to 
TFRecord on hdfs -``` diff --git a/examples/mnist/tf/mnist_dist_pipeline.py b/examples/mnist/tf/mnist_dist_pipeline.py deleted file mode 100644 index 4145f717..00000000 --- a/examples/mnist/tf/mnist_dist_pipeline.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -# Distributed MNIST on grid based on TensorFlow MNIST example - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflowonspark import TFNode -from datetime import datetime -import logging -import math -import os -import tensorflow as tf - -# Parameters -hidden_units = 128 -batch_size = 100 -IMAGE_PIXELS = 28 - - -def map_fun(args, ctx): - num_workers = args.cluster_size if args.driver_ps_nodes else args.cluster_size - args.num_ps - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - # Get TF cluster and server instances - cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma') - - def _parse_tfr(example_proto): - feature_def = {"label": tf.FixedLenFeature(10, tf.int64), - "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)} - features = tf.parse_single_example(example_proto, feature_def) - norm = tf.constant(255, dtype=tf.float32, shape=(784,)) - image = tf.div(tf.to_float(features['image']), norm) - label = tf.to_float(features['label']) - return (image, label) - - if job_name == "ps": - server.join() - elif job_name == "worker": - # Assigns ops to the local worker by default. 
- with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - tf.summary.histogram("softmax_weights", sm_w) - - # Read from saved tf records - images = TFNode.hdfs_path(ctx, args.tfrecord_dir) - tf_record_pattern = os.path.join(images, 'part-*') - ds = tf.data.Dataset.list_files(tf_record_pattern) - ds = ds.shard(num_workers, task_index).repeat(args.epochs).shuffle(args.shuffle_size) - ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=args.readers, block_length=1) - ds = ds.map(_parse_tfr).batch(args.batch_size) - iterator = ds.make_initializable_iterator() - x, y_ = iterator.get_next() - - x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) - tf.summary.image("x_img", x_img) - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - - global_step = tf.Variable(0) - - loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) - tf.summary.scalar("loss", loss) - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - # Test trained model - label = tf.argmax(y_, 1, name="label") - prediction = tf.argmax(y, 1, name="prediction") - correct_prediction = tf.equal(prediction, label) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") - tf.summary.scalar("acc", accuracy) - - saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = 
tf.global_variables_initializer() - - # Create a "supervisor", which oversees the training process and stores model state into HDFS - logdir = TFNode.hdfs_path(ctx, args.model_dir) - print("tensorflow model path: {0}".format(logdir)) - summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph()) - - sv = tf.train.Supervisor(is_chief=(task_index == 0), - logdir=logdir, - init_op=init_op, - summary_op=None, - saver=saver, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=10) - - # The supervisor takes care of session initialization, restoring from - # a checkpoint, and closing when done or an error occurs. - with sv.managed_session(server.target) as sess: - print("{0} session ready".format(datetime.now().isoformat())) - sess.run(iterator.initializer) - - # Loop until the supervisor shuts down or 1000000 steps have completed. - step = 0 - while not sv.should_stop() and step < args.steps: - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. - - if (step % 100 == 0): - print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy))) - _, summary, step = sess.run([train_op, summary_op, global_step]) - if sv.is_chief: - summary_writer.add_summary(summary, step) - - # Ask for all the services to stop. 
- print("{0} stopping supervisor".format(datetime.now().isoformat())) - sv.stop() - - -def export_fun(args): - """Define/export a single-node TF graph for inferencing""" - # Input placeholder for inferencing - x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x") - - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - prediction = tf.argmax(y, 1, name="prediction") - - saver = tf.train.Saver() - - with tf.Session() as sess: - # load graph from a checkpoint - logging.info("model path: {}".format(args.model_dir)) - ckpt = tf.train.get_checkpoint_state(args.model_dir) - logging.info("ckpt: {}".format(ckpt)) - assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir) - saver.restore(sess, ckpt.model_checkpoint_path) - - logging.info("Exporting saved_model to: {}".format(args.export_dir)) - # exported signatures defined in code - signatures = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { - 'inputs': {'image': x}, - 'outputs': {'prediction': prediction}, - 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME - }, - 'featurize': { - 'inputs': {'image': x}, - 'outputs': {'features': hid}, - 'method_name': 'featurize' - } - } - TFNode.export_saved_model(sess, - args.export_dir, - tf.saved_model.tag_constants.SERVING, - signatures) - logging.info("Exported saved_model") diff --git 
a/examples/mnist/tf/mnist_spark_pipeline.py b/examples/mnist/tf/mnist_spark_pipeline.py deleted file mode 100644 index 92ef417c..00000000 --- a/examples/mnist/tf/mnist_spark_pipeline.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from pyspark.sql import SparkSession - -import argparse -import sys -import tensorflow as tf -from datetime import datetime - -from tensorflowonspark import TFCluster, dfutil -from tensorflowonspark.pipeline import TFEstimator, TFModel -import mnist_dist_pipeline - -sc = SparkContext(conf=SparkConf().setAppName("mnist_tf")) -spark = SparkSession(sc) - -executors = sc._conf.get("spark.executor.instances") -num_executors = int(executors) if executors is not None else 1 -num_ps = 1 - -parser = argparse.ArgumentParser() - -# TFoS/cluster -parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) -parser.add_argument("--epochs", help="number of epochs", type=int, default=1) -parser.add_argument("--model_dir", help="HDFS path to save/load model during train/inference", type=str) -parser.add_argument("--export_dir", help="HDFS path to export model", type=str) -parser.add_argument("--tfrecord_dir", help="HDFS path to temporarily save DataFrame to disk", type=str) -parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) -parser.add_argument("--num_ps", help="number of PS nodes in cluster", type=int, default=1) -parser.add_argument("-p", "--driver_ps_nodes", help="""run tensorflow PS node on driver locally. 
- You will need to set cluster_size = num_executors + num_ps""", default=False) -parser.add_argument("--protocol", help="Tensorflow network protocol (grpc|rdma)", default="grpc") -parser.add_argument("--readers", help="number of reader/enqueue threads per worker", type=int, default=10) -parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) -parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") -parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000) - -# Spark input/output -parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv") -parser.add_argument("--images", help="HDFS path to MNIST images in parallelized format") -parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized format") -parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") - -# Execution Modes -parser.add_argument("--train", help="train a model using Estimator", action="store_true") -parser.add_argument("--inference_mode", help="type of inferencing (none|signature|direct)", choices=["none", "signature", "direct"], default="none") -parser.add_argument("--inference_output", help="output of inferencing (predictions|features)", choices=["predictions", "features"], default="predictions") - -args = parser.parse_args() -print("args:", args) - -print("{0} ===== Start".format(datetime.now().isoformat())) - -if args.format == "tfr": - df = dfutil.loadTFRecords(sc, args.images) -elif args.format == "csv": - images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) - labels = sc.textFile(args.labels).map(lambda ln: [int(float(x)) for x in ln.split(',')]) - dataRDD = images.zip(labels) - df = spark.createDataFrame(dataRDD, ['image', 'label']) -else: - raise Exception("Unsupported format: {}".format(args.format)) - -# Pipeline API - -if args.train: - # train a model 
using Spark Estimator fitted to a DataFrame - print("{0} ===== Estimator.fit()".format(datetime.now().isoformat())) - # dummy tf args (from imagenet/inception example) - tf_args = {'initial_learning_rate': 0.045, 'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94} - estimator = TFEstimator(mnist_dist_pipeline.map_fun, args, export_fn=mnist_dist_pipeline.export_fun) \ - .setModelDir(args.model_dir) \ - .setExportDir(args.export_dir) \ - .setClusterSize(args.cluster_size) \ - .setNumPS(args.num_ps) \ - .setDriverPSNodes(args.driver_ps_nodes) \ - .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setTFRecordDir(args.tfrecord_dir) \ - .setProtocol(args.protocol) \ - .setReaders(args.readers) \ - .setTensorboard(args.tensorboard) \ - .setEpochs(args.epochs) \ - .setBatchSize(args.batch_size) \ - .setSteps(args.steps) - model = estimator.fit(df) -else: - # use a previously trained/exported model - model = TFModel(args) \ - .setExportDir(args.export_dir) \ - .setBatchSize(args.batch_size) - -# NO INFERENCING -if args.inference_mode == 'none': - sys.exit(0) - -# INFER USING EXPORTED SIGNATURES OF TENSORFLOW SAVED_MODEL -elif args.inference_mode == 'signature': - model.setModelDir(None) - model.setExportDir(args.export_dir) # load saved_model from args.export_dir - model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset - model.setInputMapping({'image': 'image'}) # map DataFrame 'image' column to the 'image' input tensor alias of signature - if args.inference_output == 'predictions': - model.setSignatureDefKey(tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY) # default signature def key, i.e. 
'predict' - model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor alias to output DataFrame 'col_out' column - else: # args.inference_output == 'features' - model.setSignatureDefKey('featurize') # custom signature def key - model.setOutputMapping({'features': 'col_out'}) # map 'features' output tensor alias to output DataFrame 'col_out' column - -# INFER USING TENSORFLOW SAVED_MODEL, IGNORING EXPORTED SIGNATURES -else: # args.inference_mode == 'direct': - model.setModelDir(None) - model.setExportDir(args.export_dir) # load saved_model from args.export_dir - model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset - model.setInputMapping({'image': 'x'}) # map DataFrame 'image' column to the 'x' input tensor - if args.inference_output == 'predictions': - model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor to output DataFrame 'col_out' column - else: # args.inference_output == 'features' - model.setOutputMapping({'prediction': 'col_out', 'Relu': 'col_out2'}) # add 'Relu' output tensor to output DataFrame 'col_out2' column - -print("{0} ===== Model.transform()".format(datetime.now().isoformat())) -preds = model.transform(df) -preds.write.json(args.output) - -print("{0} ===== Stop".format(datetime.now().isoformat())) From 3d1d3e488e084d94187b5c997ae26b585dfa4005 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 14 Aug 2019 14:17:22 -0700 Subject: [PATCH 11/37] remove low-level API notebooks --- examples/mnist/mnist_pipeline.ipynb | 411 ---------------------------- examples/mnist/mnist_spark.ipynb | 360 ------------------------ examples/mnist/mnist_tf.ipynb | 328 ---------------------- 3 files changed, 1099 deletions(-) delete mode 100644 examples/mnist/mnist_pipeline.ipynb delete mode 100644 examples/mnist/mnist_spark.ipynb delete mode 100644 examples/mnist/mnist_tf.ipynb diff --git a/examples/mnist/mnist_pipeline.ipynb b/examples/mnist/mnist_pipeline.ipynb deleted file mode 100644 
index 7bb70c3e..00000000 --- a/examples/mnist/mnist_pipeline.ipynb +++ /dev/null @@ -1,411 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TensorFlowOnSpark with Spark ML Pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Spark ML Pipelines](https://spark.apache.org/docs/latest/ml-pipeline.html) provide high-level APIs (inspired by [scikit-learn](http://scikit-learn.org)) for Spark-based machine learning algorithms.\n", - "\n", - "This notebook demonstrates support for these APIs within TensorFlowOnSpark via the introduction of a new [pipeline](https://github.com/yahoo/TensorFlowOnSpark/blob/master/tensorflowonspark/pipeline.py) module consisting of two main classes: \n", - "\n", - "1. [TFEstimator](https://yahoo.github.io/TensorFlowOnSpark/tensorflowonspark.pipeline.html#tensorflowonspark.pipeline.TFEstimator) - A Spark ML Estimator which wraps a distributed TensorFlowOnSpark cluster for training.\n", - "2. [TFModel](https://yahoo.github.io/TensorFlowOnSpark/tensorflowonspark.pipeline.html#tensorflowonspark.pipeline.TFModel) - A Spark ML Model which represents a TensorFlow model checkpoint or [saved_model](https://www.tensorflow.org/programmers_guide/saved_model#apis_to_build_and_load_a_savedmodel) on disk. **Note**: due to architectural limitations, transform/inferencing is conducted on the executors as parallel instances of a single-node TensorFlow application (vs. 
a distributed TensorFlow cluster), so the model must fit in the memory of a single executor.\n", - "\n", - "In addition, there is a new [dfutil](https://yahoo.github.io/TensorFlowOnSpark/tensorflowonspark.dfutil.html) module which provides helper functions to convert from TensorFlow TFRecords to Spark DataFrames and vice versa.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start a Spark Standalone Cluster\n", - "\n", - "First, in a terminal/shell window, start a single-machine Spark Standalone Cluster with three workers:\n", - "```\n", - "export MASTER=spark://$(hostname):7077\n", - "export SPARK_WORKER_INSTANCES=3\n", - "export CORES_PER_WORKER=1\n", - "export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) \n", - "${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Launch the Spark Jupyter Notebook\n", - "\n", - "Now, in the same window, launch a Pyspark Jupyter notebook:\n", - "```\n", - "# export TFoS_HOME=\n", - "cd ${TFoS_HOME}/examples/mnist\n", - "PYSPARK_DRIVER_PYTHON=\"jupyter\" \\\n", - "PYSPARK_DRIVER_PYTHON_OPTS=\"notebook\" \\\n", - "pyspark --master ${MASTER} \\\n", - "--conf spark.cores.max=${TOTAL_CORES} \\\n", - "--conf spark.task.cpus=${CORES_PER_WORKER} \\\n", - "--py-files ${TFoS_HOME}/examples/mnist/spark/mnist_dist_pipeline.py \\\n", - "--conf spark.executorEnv.JAVA_HOME=\"$JAVA_HOME\"\n", - "```\n", - "\n", - "This should open a Jupyter browser pointing to the directory where this notebook is hosted.\n", - "Click on this notebook and begin executing the steps of the notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NOTE: the `SparkContext` should be available as the `sc` variable. You can use it to navigate to the Spark UI's \"Executors\" tab, where you will find the logs for each Spark executor. 
For TensorFlowOnSpark, each executor will correspond to a specific TensorFlow instance in the cluster, and the TensorFlow logs will be reported in each executor's `stderr` logs. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "from pyspark.conf import SparkConf\n", - "from pyspark.context import SparkContext\n", - "from pyspark.sql import SparkSession\n", - "\n", - "import argparse\n", - "import os\n", - "import subprocess\n", - "import sys\n", - "from datetime import datetime\n", - "\n", - "import tensorflow as tf\n", - "from tensorflowonspark import dfutil\n", - "from tensorflowonspark.pipeline import TFEstimator, TFModel\n", - "\n", - "# main TensorFlow code for this example\n", - "import mnist_dist_pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "num_executors = sc.defaultParallelism\n", - "num_executors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: for a Spark Standalone cluster on a single machine, the executors will operate from different working directories, so relative paths won't work across the cluster. This code just maps relative paths to the absolute path of this notebook's current working directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cwd = os.getcwd()\n", - "model_dir = os.sep.join([cwd, \"mnist_model\"]) # path to TensorFlow model/checkpoint\n", - "export_dir = os.sep.join([cwd, \"mnist_export\"]) # path to TensorFlow saved_model export\n", - "output = os.sep.join([cwd, \"predictions\"]) # path to output of inferencing\n", - "\n", - "print(model_dir)\n", - "print(export_dir)\n", - "print(output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up any prior artifacts\n", - "subprocess.call([\"rm\", \"-rf\", model_dir])\n", - "subprocess.call([\"rm\", \"-rf\", export_dir])\n", - "subprocess.call([\"rm\", \"-rf\", output])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set up parser for command-line options\n", - "parser = argparse.ArgumentParser()\n", - "\n", - "## TFoS/cluster\n", - "parser.add_argument(\"--batch_size\", help=\"number of records per batch\", type=int, default=100)\n", - "parser.add_argument(\"--epochs\", help=\"number of epochs\", type=int, default=1)\n", - "parser.add_argument(\"--model_dir\", help=\"HDFS path to save/load model during train/inference\", type=str)\n", - "parser.add_argument(\"--export_dir\", help=\"HDFS path to export saved_model\", type=str)\n", - "parser.add_argument(\"--cluster_size\", help=\"number of nodes in the cluster\", type=int, default=num_executors)\n", - "parser.add_argument(\"--num_ps\", help=\"number of PS nodes in cluster\", type=int, default=1)\n", - "parser.add_argument(\"--protocol\", help=\"Tensorflow network protocol (grpc|rdma)\", default=\"grpc\")\n", - "parser.add_argument(\"--steps\", help=\"maximum number of steps\", type=int, default=1000)\n", - "parser.add_argument(\"--tensorboard\", help=\"launch tensorboard process\", action=\"store_true\")\n", - "\n", - "# Spark 
input/output\n", - "parser.add_argument(\"--format\", help=\"example format: (csv|tfr)\", choices=[\"csv\",\"tfr\"], default=\"csv\")\n", - "parser.add_argument(\"--images\", help=\"HDFS path to MNIST images in parallelized format\")\n", - "parser.add_argument(\"--labels\", help=\"HDFS path to MNIST labels in parallelized format\")\n", - "parser.add_argument(\"--output\", help=\"HDFS path to save test/inference output\", default=\"predictions\")\n", - "\n", - "# Execution Modes\n", - "parser.add_argument(\"--train\", help=\"train a model using Estimator\", action=\"store_true\")\n", - "parser.add_argument(\"--inference_mode\", help=\"type of inferencing (none|checkpoint|signature|direct)\", choices=[\"none\",\"signature\",\"direct\",\"checkpoint\"], default=\"none\")\n", - "parser.add_argument(\"--inference_output\", help=\"output type for inferencing (predictions|features)\", choices=[\"predictions\",\"features\"], default=\"predictions\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Declare arguments for this session\n", - "args = parser.parse_args([\"--model_dir\", model_dir, \\\n", - " \"--export_dir\", export_dir, \\\n", - " \"--output\", output, \\\n", - " \"--images\", \"csv/train/images\", \\\n", - " \"--labels\", \"csv/train/labels\", \\\n", - " \"--train\", \\\n", - " \"--inference_mode\", \"checkpoint\", \\\n", - " \"--inference_output\", \"predictions\"])\n", - "print(args)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following code supports reading the MNIST data as both TFRecords and CSV files. It is assumed that you've already converted the MNIST binary data to either of these formats. 
If not, you can refer to the [Spark Standalone example](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_Standalone#convert-the-mnist-zip-files-using-spark) for instructions.\n", - "\n", - "For TFRecords, this leverages the `dfutil` module to load the TFRecords at `args.images` as a Spark DataFrame. This conversion assumes a flat TFRecord structure, i.e. a simple list of features consisting of standard types, that can be easily mapped to DataFrame columns. Deeply nested structures and variable schemas are not currently supported, so for those datasets, you may need to write a custom loader/converter.\n", - "\n", - "For CSV, this just uses traditional Spark RDD APIs to read/transform the text files, zip the images with the labels, and then convert the resulting RDD into a DataFrame. Note: this uses a trivial CSV parser to keep the code simple." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if args.format == \"tfr\":\n", - " df = dfutil.loadTFRecords(sc, args.images)\n", - "elif args.format == \"csv\":\n", - " images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])\n", - " labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])\n", - " dataRDD = images.zip(labels)\n", - " df = spark.createDataFrame(dataRDD, ['image', 'label'])\n", - "else:\n", - " raise Exception(\"Unsupported format: {}\".format(args.format))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, train the model using a `TFEstimator`. This class supports ML Params for arguments and hyper-parameters that are common across TensorFlow applications. The constructor accepts the TensorFlow \"map_fun\" (i.e. the \"main\" function converted to the expected TensorFlowOnSpark API signature) along with an optional dictionary of application-specific hyper-parameters. 
Note: this shows application-specific hyper-parameters for the Inception network only as an example, since the MNIST network does not have specific hyper-parameters.\n", - "\n", - "When `TFEstimator.fit()` is invoked, it will launch a TensorFlowOnSpark cluster for distributed training, with the model checkpoint persisted on disk. If an `--export_dir` is supplied above, this TensorFlow application will also export a saved_model to that directory. At the end of training, the TensorFlowOnSpark cluster will be automatically shut down.\n", - "\n", - "If the `--train` argument is not supplied above, this code will skip training and just construct a `TFModel` instance using the same arguments to represent a model checkpoint and/or saved_model already available on disk." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if args.train:\n", - " # train a model using Spark Estimator fitted to a DataFrame\n", - " # dummy tf args (from imagenet/inception example)\n", - " tf_args = { 'initial_learning_rate': 0.045, 'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94 }\n", - " estimator = TFEstimator(mnist_dist_pipeline.map_fun, tf_args) \\\n", - " .setInputMapping({'image':'image', 'label':'label'}) \\\n", - " .setModelDir(args.model_dir) \\\n", - " .setExportDir(args.export_dir) \\\n", - " .setClusterSize(args.cluster_size) \\\n", - " .setNumPS(args.num_ps) \\\n", - " .setProtocol(args.protocol) \\\n", - " .setTensorboard(args.tensorboard) \\\n", - " .setEpochs(args.epochs) \\\n", - " .setBatchSize(args.batch_size) \\\n", - " .setSteps(args.steps)\n", - " model = estimator.fit(df)\n", - "else:\n", - " # use a previously trained/exported model\n", - " model = TFModel(args) \\\n", - " .setExportDir(args.export_dir) \\\n", - " .setBatchSize(args.batch_size)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check the model 
checkpoint\n", - "print(subprocess.check_output([\"ls\", \"-l\", model_dir]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check the exported saved_model\n", - "print(subprocess.check_output([\"ls\", \"-lR\", export_dir]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point, the model checkpoint and the exported saved_model are persisted on disk. The following code demonstrates several different ways to load and use these models for inferencing. The different modes, described below, are specified by the `--inference_mode` command-line argument:\n", - "\n", - "1. **none** - skip inferencing entirely (useful when debugging training step).\n", - "2. **checkpoint** - load the model directly from the TensorFlow checkpoint, and map the DataFrame columns to specific tensors.\n", - "3. **signature** - load the model from the saved_model export and use the exported input/output signatures. Note: these signatures provide a level of indirection between the signature's tensor \"aliases\" and the actual tensors. This is intended to provide stable gRPC signatures for TensorFlow-Serving calls, even when the underlying model changes. In this mode, the DataFrame columns are mapped to these provided \"aliases\".\n", - "4. **direct** - load the model from the saved_model export, but ignore the exported signatures and tensor \"aliases\". In this mode, DataFrame columns are mapped directly to the underlying tensors. This can be useful if the user has a previously trained/exported saved_model but wants to access tensors that weren't originally mapped to a published signature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# NO INFERENCING\n", - "if args.inference_mode == 'none':\n", - " sys.exit(0)\n", - " \n", - "# INFER FROM TENSORFLOW CHECKPOINT\n", - "elif args.inference_mode == 'checkpoint':\n", - " model.setModelDir(args.model_dir) # load model from checkpoint at args.model_dir\n", - " model.setExportDir(None) # don't use a saved_model\n", - " model.setInputMapping({'image':'x'}) # map DataFrame 'image' column to the 'x' input tensor\n", - " if args.inference_output == 'predictions':\n", - " model.setOutputMapping({'prediction':'col_out'}) # map 'prediction' output tensor to output DataFrame 'col_out' column\n", - " else: # args.inference_output == 'features':\n", - " model.setOutputMapping({'prediction':'col_out', 'Relu':'col_out2'}) # add 'Relu' output tensor to output DataFrame 'col_out2' column\n", - "\n", - "# INFER USING TENSORFLOW SAVED_MODEL WITH EXPORTED SIGNATURES\n", - "elif args.inference_mode == 'signature':\n", - " model.setModelDir(None) # don't use the model checkpoint\n", - " model.setExportDir(args.export_dir) # load saved_model from args.export_dir\n", - " model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset\n", - " model.setInputMapping({'image':'image'}) # map DataFrame 'image' column to the 'image' input tensor alias of signature\n", - " if args.inference_output == 'predictions':\n", - " model.setSignatureDefKey(tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY) # default signature def key, i.e. 
'predict'\n", - " model.setOutputMapping({'prediction':'col_out'}) # map 'prediction' output tensor alias to output DataFrame 'col_out' column\n", - " else: # args.inference_output == 'features'\n", - " model.setSignatureDefKey('featurize') # custom signature def key\n", - " model.setOutputMapping({'features':'col_out'}) # map 'features' output tensor alias to output DataFrame 'col_out' column\n", - "\n", - "# INFER USING TENSORFLOW SAVED_MODEL, IGNORING EXPORTED SIGNATURES\n", - "else: # args.inference_mode == 'direct':\n", - " model.setModelDir(None) # don't use the model checkpoint\n", - " model.setExportDir(args.export_dir) # load saved_model from args.export_dir\n", - " model.setTagSet(tf.saved_model.tag_constants.SERVING) # using default SERVING tagset\n", - " model.setInputMapping({'image':'x'}) # map DataFrame 'image' column to the 'x' input tensor\n", - " if args.inference_output == 'predictions':\n", - " model.setOutputMapping({'prediction': 'col_out'}) # map 'prediction' output tensor to output DataFrame 'col_out' column\n", - " else: # args.inference_output == 'features'\n", - " model.setOutputMapping({'prediction': 'col_out', 'Relu': 'col_out2'}) # add 'Relu' output tensor to output DataFrame 'col_out2' column\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, invoke the `TFModel.transform()` method and save the output DataFrame. **Note**: Spark \"transformations\" are \"lazy\" by design, so no actual inferencing will occur until an \"action\" is invoked on the output DataFrame `preds`, which in this case is the `write.json` call below to save the output to disk." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"{0} ===== Model.transform()\".format(datetime.now().isoformat()))\n", - "preds = model.transform(df)\n", - "preds.write.json(args.output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-l\", output]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shutdown\n", - "\n", - "In your terminal/shell window, you can type `` to exit the Notebook server.\n", - "\n", - "Then, stop the Standalone Cluster via:\n", - "```\n", - "${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/mnist/mnist_spark.ipynb b/examples/mnist/mnist_spark.ipynb deleted file mode 100644 index 7a597aa5..00000000 --- a/examples/mnist/mnist_spark.ipynb +++ /dev/null @@ -1,360 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TensorFlowOnSpark with InputMode.SPARK" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook demonstrates TensorFlowOnSpark using `InputMode.SPARK`, which feeds a Spark RDD to a distributed TensorFlow cluster via TensorFlow's `feed_dict()` mechanism.\n", - "\n", - "This mode allows existing Spark users a simple way to feed Spark RDDs into TensorFlow. 
Note that there is a performance penalty for transferring the RDD partitions from disk through Spark to the TensorFlow processes, so if I/O performance is a concern, you should use `InputMode.TENSORFLOW`, where the TensorFlow processes will read directly from disk.\n", - "\n", - "Note: this notebook assumes that you have already followed the [instructions to download and convert the MNIST data](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_Standalone#download-mnist-data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start a Spark Standalone Cluster\n", - "\n", - "First, in a terminal/shell window, start a single-machine Spark Standalone Cluster with three workers:\n", - "```\n", - "export MASTER=spark://$(hostname):7077\n", - "export SPARK_WORKER_INSTANCES=3\n", - "export CORES_PER_WORKER=1\n", - "export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) \n", - "${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Launch the Spark Jupyter Notebook\n", - "\n", - "Now, in the same terminal window, launch a Pyspark Jupyter notebook:\n", - "```\n", - "# export TFoS_HOME=\n", - "cd ${TFoS_HOME}/examples/mnist\n", - "PYSPARK_DRIVER_PYTHON=\"jupyter\" \\\n", - "PYSPARK_DRIVER_PYTHON_OPTS=\"notebook\" \\\n", - "pyspark --master ${MASTER} \\\n", - "--conf spark.cores.max=${TOTAL_CORES} \\\n", - "--conf spark.task.cpus=${CORES_PER_WORKER} \\\n", - "--py-files ${TFoS_HOME}/examples/mnist/spark/mnist_dist.py \\\n", - "--conf spark.executorEnv.JAVA_HOME=\"$JAVA_HOME\"\n", - "```\n", - "\n", - "This should open a Jupyter browser pointing to the directory where this notebook is hosted.\n", - "Click on this notebook and begin executing the steps of the notebook." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NOTE: the `SparkContext` should be available as the `sc` variable. You can use it to navigate to the Spark UI's \"Executors\" tab, where you will find the logs for each Spark executor. For TensorFlowOnSpark, each executor will correspond to a specific TensorFlow instance in the cluster, and the TensorFlow logs will be reported in each executor's `stderr` logs. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "import argparse\n", - "import subprocess\n", - "from tensorflowonspark import TFCluster\n", - "\n", - "# main TensorFlow code for this example\n", - "import mnist_dist" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parser = argparse.ArgumentParser()\n", - "parser.add_argument(\"--batch_size\", help=\"number of examples per batch\", type=int, default=100)\n", - "parser.add_argument(\"--epochs\", help=\"number of epochs\", type=int, default=1)\n", - "parser.add_argument(\"--format\", help=\"example format\", choices=[\"csv\",\"pickle\",\"tfr\"], default=\"csv\")\n", - "parser.add_argument(\"--images\", help=\"HDFS path to MNIST images in parallelized format\")\n", - "parser.add_argument(\"--labels\", help=\"HDFS path to MNIST labels in parallelized format\")\n", - "parser.add_argument(\"--mode\", help=\"train|inference\", default=\"train\")\n", - "parser.add_argument(\"--model\", help=\"HDFS path to save/load model during train/test\", default=\"mnist_model\")\n", - "parser.add_argument(\"--output\", help=\"HDFS path to save test/inference output\", default=\"predictions\")\n", - 
"parser.add_argument(\"--readers\", help=\"number of reader/enqueue threads\", type=int, default=1)\n", - "parser.add_argument(\"--rdma\", help=\"use rdma connection\", default=False)\n", - "parser.add_argument(\"--steps\", help=\"maximum number of steps\", type=int, default=1000)\n", - "parser.add_argument(\"--tensorboard\", help=\"launch tensorboard process\", action=\"store_true\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "num_executors = sc.defaultParallelism\n", - "num_executors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Distributed Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# verify training images\n", - "train_images_files = \"csv/train/images\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", train_images_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# verify training labels\n", - "train_labels_files = \"csv/train/labels\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", train_labels_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# parse arguments for training\n", - "args = parser.parse_args(['--mode', 'train', \n", - " '--steps', '600', \n", - " '--epochs', '1',\n", - " '--images', train_images_files, \n", - " '--labels', train_labels_files])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove any existing models\n", - "subprocess.call([\"rm\", \"-rf\", args.model])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# start the cluster for training\n", - "cluster = TFCluster.run(sc, mnist_dist.map_fun, 
args, num_executors, 1, args.tensorboard, TFCluster.InputMode.SPARK)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# feed data via Spark RDD\n", - "images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])\n", - "labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])\n", - "dataRDD = images.zip(labels)\n", - "cluster.train(dataRDD, args.epochs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# shutdown the cluster. \n", - "# NOTE: this will block until all RDD data has been fed via the previous step\n", - "cluster.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-l\", args.model]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Distributed Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_images_files = \"csv/test/images\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", test_images_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_labels_files = \"csv/test/labels\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", test_labels_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Parse arguments for inference\n", - "args = parser.parse_args(['--mode', 'inference',\n", - " '--images', test_images_files, \n", - " '--labels', test_labels_files])\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#remove existing output if any\n", - 
"subprocess.call([\"rm\", \"-rf\", args.output])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Start the cluster for inference\n", - "cluster = TFCluster.run(sc, mnist_dist.map_fun, args, num_executors, 1, False, TFCluster.InputMode.SPARK)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#feed data for inference\n", - "images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])\n", - "labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])\n", - "dataRDD = images.zip(labels)\n", - "predictions = cluster.inference(dataRDD)\n", - "predictions.saveAsTextFile(args.output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cluster.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-l\", args.output]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shutdown\n", - "\n", - "In your terminal/shell window, you can type `` to exit the Notebook server.\n", - "\n", - "Then, stop the Standalone Cluster via:\n", - "```\n", - "${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git 
a/examples/mnist/mnist_tf.ipynb b/examples/mnist/mnist_tf.ipynb deleted file mode 100644 index 4aa213c6..00000000 --- a/examples/mnist/mnist_tf.ipynb +++ /dev/null @@ -1,328 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TensorFlowOnSpark with InputMode.TENSORFLOW" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook demonstrates TensorFlowOnSpark using `InputMode.TENSORFLOW`, which launches a distributed TensorFlow cluster on the Spark executors, where each TensorFlow process reads directly from disk." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start a Spark Standalone Cluster\n", - "\n", - "First, in a terminal/shell window, start a single-machine Spark Standalone Cluster with three workers:\n", - "```\n", - "export MASTER=spark://$(hostname):7077\n", - "export SPARK_WORKER_INSTANCES=3\n", - "export CORES_PER_WORKER=1\n", - "export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) \n", - "${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Convert the MNIST zip files using Spark\n", - "\n", - "This notebook assumes that you have already [downloaded the MNIST dataset](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_Standalone#download-mnist-data). 
If so, you can convert it to TFRecord format as follows:\n", - "```\n", - "export TFoS_HOME=\n", - "cd ${TFoS_HOME}\n", - "# rm -rf examples/mnist/tfr\n", - "${SPARK_HOME}/bin/spark-submit \\\n", - "--master ${MASTER} \\\n", - "${TFoS_HOME}/examples/mnist/mnist_data_setup.py \\\n", - "--output examples/mnist/tfr \\\n", - "--format tfr\n", - "ls -lR examples/mnist/tfr\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Launch the Spark Jupyter Notebook\n", - "\n", - "Now, in the same terminal window, launch a Pyspark Jupyter notebook:\n", - "```\n", - "# export TFoS_HOME=\n", - "cd ${TFoS_HOME}/examples/mnist\n", - "PYSPARK_DRIVER_PYTHON=\"jupyter\" \\\n", - "PYSPARK_DRIVER_PYTHON_OPTS=\"notebook\" \\\n", - "pyspark --master ${MASTER} \\\n", - "--conf spark.cores.max=${TOTAL_CORES} \\\n", - "--conf spark.task.cpus=${CORES_PER_WORKER} \\\n", - "--py-files ${TFoS_HOME}/examples/mnist/tf/mnist_dist.py \\\n", - "--conf spark.executorEnv.JAVA_HOME=\"$JAVA_HOME\"\n", - "```\n", - "\n", - "This should open a Jupyter browser pointing to the directory where this notebook is hosted.\n", - "Click on this notebook and begin executing the steps of the notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NOTE: the `SparkContext` should be available as the `sc` variable. You can use it to navigate to the Spark UI's \"Executors\" tab, where you will find the logs for each Spark executor. For TensorFlowOnSpark, each executor will correspond to a specific TensorFlow instance in the cluster, and the TensorFlow logs will be reported in each executor's `stderr` logs. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "import argparse\n", - "import subprocess\n", - "from tensorflowonspark import TFCluster\n", - "\n", - "# main TensorFlow code for this example\n", - "import mnist_dist" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parser = argparse.ArgumentParser()\n", - "parser.add_argument(\"--batch_size\", help=\"number of records per batch\", type=int, default=100)\n", - "parser.add_argument(\"--epochs\", help=\"number of epochs\", type=int, default=1)\n", - "parser.add_argument(\"--export\", help=\"HDFS path to export model\", type=str, default=\"mnist_export\")\n", - "parser.add_argument(\"--format\", help=\"example format: (csv2|tfr)\", choices=[\"csv2\", \"tfr\"], default=\"tfr\")\n", - "parser.add_argument(\"--images_labels\", help=\"HDFS path to MNIST image_label files in parallelized format\")\n", - "parser.add_argument(\"--mode\", help=\"train|inference\", default=\"train\")\n", - "parser.add_argument(\"--model\", help=\"HDFS path to save/load model during train/test\", default=\"mnist_model\")\n", - "parser.add_argument(\"--output\", help=\"HDFS path to save test/inference output\", default=\"predictions\")\n", - "parser.add_argument(\"--rdma\", help=\"use rdma connection\", default=False)\n", - "parser.add_argument(\"--readers\", help=\"number of reader/enqueue threads per worker\", type=int, default=10)\n", - "parser.add_argument(\"--shuffle_size\", help=\"size of shuffle buffer\", type=int, default=1000)\n", - "parser.add_argument(\"--steps\", help=\"maximum number of steps\", type=int, default=1000)\n", - 
"parser.add_argument(\"--tensorboard\", help=\"launch tensorboard process\", action=\"store_true\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "num_executors = sc.defaultParallelism\n", - "num_executors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Distributed Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# verify training images and labels\n", - "train_images_files = \"tfr/train\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", train_images_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# parse arguments for training\n", - "args = parser.parse_args(['--mode', 'train', \n", - " '--steps', '600', \n", - " '--epochs', '1',\n", - " '--images_labels', train_images_files])\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove any existing models\n", - "subprocess.call([\"rm\", \"-rf\", args.model, args.export])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# start the cluster for training\n", - "cluster = TFCluster.run(sc, mnist_dist.map_fun, args, num_executors, 1, args.tensorboard, TFCluster.InputMode.TENSORFLOW)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# shutdown the cluster. 
\n", - "# NOTE: this will block until all TensorFlow nodes have completed\n", - "cluster.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-l\", args.model]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-lR\", args.export]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Distributed Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_images_files = \"tfr/test\"\n", - "print(subprocess.check_output([\"ls\", \"-l\", test_images_files]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Parse arguments for inference\n", - "args = parser.parse_args(['--mode', 'inference',\n", - " '--images_labels', test_images_files])\n", - "args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#remove existing output if any\n", - "subprocess.call([\"rm\", \"-rf\", args.output])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Start the cluster for inference\n", - "cluster = TFCluster.run(sc, mnist_dist.map_fun, args, num_executors, 1, False, TFCluster.InputMode.SPARK)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cluster.shutdown()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(subprocess.check_output([\"ls\", \"-l\", args.output]).decode(\"utf-8\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shutdown\n", - "\n", - 
"In your terminal/shell window, you can type `` to exit the Notebook server.\n", - "\n", - "Then, stop the Standalone Cluster via:\n", - "```\n", - "${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 123be55f64e5c40abef7f9ccc59bb665c4abd517 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Wed, 14 Aug 2019 16:48:29 -0700 Subject: [PATCH 12/37] add parallel inferencing example for estimator --- examples/mnist/estimator/mnist_inference.py | 89 +++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/mnist/estimator/mnist_inference.py diff --git a/examples/mnist/estimator/mnist_inference.py b/examples/mnist/estimator/mnist_inference.py new file mode 100644 index 00000000..d1a05fdd --- /dev/null +++ b/examples/mnist/estimator/mnist_inference.py @@ -0,0 +1,89 @@ +# Copyright 2018 Yahoo Inc. +# Licensed under the terms of the Apache 2.0 license. +# Please see LICENSE file in the project root for terms. + +# This example demonstrates how to leverage Spark for parallel inferencing from a SavedModel. +# +# Normally, you can use TensorFlowOnSpark to just form a TensorFlow cluster for training and inferencing. +# However, in some situations, you may have a SavedModel without the original code for defining the inferencing +# graph. In these situations, we can use Spark to instantiate a single-node TensorFlow instance on each executor, +# where each executor can independently load the model and inference on input data. 
def inference(it, num_workers, args):
    """Run single-node TensorFlow inference over one shard of the input files.

    Called once per Spark partition (see the ``foreachPartition`` call in the
    ``__main__`` section).  The SavedModel is loaded from ``args.export_dir``,
    the ``part-*`` files under ``args.images_labels`` are sharded across
    ``num_workers``, and one ``"<label> <prediction>"`` line per record is
    written to ``<args.output>/part-<worker_num>``.

    Args:
        it: iterator over the elements of one RDD partition; the last element
            is used as this worker's shard index.
        num_workers: total number of shards the input file list is split into.
        args: parsed command-line arguments; ``export_dir``, ``images_labels``
            and ``output`` are read.
    """
    from tensorflowonspark import util

    # consume worker number from RDD partition iterator
    worker_num = None
    for i in it:
        worker_num = i
    if worker_num is None:
        # An empty partition carries no shard index; without this guard the
        # code below would fail with an unbound `worker_num`.
        print("worker_num: none assigned (empty partition), skipping")
        return
    print("worker_num: {}".format(worker_num))

    # setup env for single-node TF
    util.single_node_env()

    # load saved_model
    saved_model = tf.saved_model.load(args.export_dir, tags='serve')
    predict = saved_model.signatures['serving_default']

    # parse function for TFRecords
    def parse_tfr(example_proto):
        feature_def = {"label": tf.io.FixedLenFeature(1, tf.int64),
                       "image": tf.io.FixedLenFeature(784, tf.int64)}
        features = tf.io.parse_single_example(serialized=example_proto, features=feature_def)
        image = tf.cast(features['image'], dtype=tf.float32) / 255.0
        image = tf.reshape(image, [28, 28, 1])
        label = tf.cast(features['label'], dtype=tf.float32)
        return (image, label)

    # define a new tf.data.Dataset (for inferencing)
    ds = tf.data.Dataset.list_files("{}/part-*".format(args.images_labels))
    ds = ds.shard(num_workers, worker_num)
    ds = ds.interleave(tf.data.TFRecordDataset)
    ds = ds.map(parse_tfr)
    ds = ds.batch(10)

    # create an output file per spark worker for the predictions
    tf.io.gfile.makedirs(args.output)
    output_path = "{}/part-{:05d}".format(args.output, worker_num)

    # `with` guarantees the file is flushed and closed even if inference fails.
    with tf.io.gfile.GFile(output_path, mode='w') as output_file:
        for batch in ds:
            predictions = predict(features=batch[0])
            # np.int was removed in NumPy 1.24; use an explicit integer dtype.
            labels = np.reshape(batch[1], -1).astype(np.int64)
            preds = np.argmax(predictions['dense_1'], axis=1)
            for label, pred in zip(labels, preds):
                output_file.write("{} {}\n".format(label, pred))


if __name__ == '__main__':
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf

    sc = SparkContext(conf=SparkConf().setAppName("mnist_inference"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
    parser.add_argument('--images_labels', type=str, help='Directory for input images with labels')
    parser.add_argument("--export_dir", help="HDFS path to export model", type=str, default="mnist_export")
    parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions")
    args, _ = parser.parse_known_args()
    print("args: {}".format(args))

    # Not using TFCluster... just running single-node TF instances on each executor.
    # One partition per node, each holding exactly one element: its worker number.
    nodes = list(range(args.cluster_size))
    nodeRDD = sc.parallelize(nodes, args.cluster_size)
    nodeRDD.foreachPartition(lambda worker_num: inference(worker_num, args.cluster_size, args))
b/examples/mnist/estimator/README.md @@ -16,7 +16,7 @@ Note: this example assumes that Spark, TensorFlow, and TensorFlowOnSpark are alr ${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} -#### Run using InputMode.TENSORFLOW +#### Train via InputMode.TENSORFLOW In this mode, each worker will load the entire MNIST dataset into memory (automatically downloading the dataset if needed). @@ -34,7 +34,7 @@ In this mode, each worker will load the entire MNIST dataset into memory (automa --model_dir ${TFoS_HOME}/mnist_model \ --export_dir ${TFoS_HOME}/mnist_export -#### Run using InputMode.SPARK +#### Train via InputMode.SPARK In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the workers will see only a portion of the dataset per epoch. Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used. @@ -64,6 +64,69 @@ In this mode, Spark will distribute the MNIST dataset (as CSV) across the worker --model_dir ${TFoS_HOME}/mnist_model \ --export_dir ${TFoS_HOME}/mnist_export +#### Train via InputMode.SPARK with Spark Streaming + +Spark also includes a streaming mode, which allows you to feed data to your Spark applications in an online/streaming mode vs. reading a static list of files from disk. In this mode, Spark watches a location on disk (or listens on a network port) for new data to arrive and batches the incoming data into a sequence of RDDs for your application. + +This example is essentially the same as the one above, except it replaces the Spark RDD with a Spark Streaming RDD (DStream). Note that there is no final export to saved_model, since training is essentially ongoing. 
+ + # Convert the MNIST zip files into CSV (if not already done) + cd ${TFoS_HOME} + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/mnist_data_setup.py \ + --output ${TFoS_HOME}/data/mnist + + # confirm that data was generated + ls -lR ${TFoS_HOME}/data/mnist/csv + + # create a folder for new streaming data to arrive + export STREAM_DATA=${TFoS_HOME}/data/stream + mkdir -p ${STREAM_DATA} + + # create a temp folder to stage streaming data + export TEMP_DATA=${TFoS_HOME}/data/tmp + mkdir -p ${TEMP_DATA} + + # remove any old artifacts + rm -rf ${TFoS_HOME}/mnist_model + rm ${STREAM_DATA}/* + rm ${TEMP_DATA}/* + + # train + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_spark_streaming.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/stream \ + --model_dir ${TFoS_HOME}/mnist_model + + # in another shell window + export TFoS_HOME= + export STREAM_DATA=${TFoS_HOME}/data/stream + export TEMP_DATA=${TFoS_HOME}/data/tmp + + # wait for spark job to be RUNNING, then simulate arrival of NEW data in stream by: + # 1. making a copy of the data (to get a recent timestamp). + # 2. moving it into the stream folder atomically (to avoid spark picking up a partial file). + # for more info, see: http://spark.apache.org/docs/latest/streaming-programming-guide.html#basic-sources + # monitor the spark streaming logs after each command to view behavior. 
+ + COUNT=0 + for f in ${TFoS_HOME}/data/mnist/csv/train/part-*; do cp $f ${TEMP_DATA}/$(basename $f | sed -e "s/[0-9][0-9]*/$COUNT/"); COUNT=$((COUNT + 1)); done; mv ${TEMP_DATA}/* ${STREAM_DATA} + for f in ${TFoS_HOME}/data/mnist/csv/train/part-*; do cp $f ${TEMP_DATA}/$(basename $f | sed -e "s/[0-9][0-9]*/$COUNT/"); COUNT=$((COUNT + 1)); done; mv ${TEMP_DATA}/* ${STREAM_DATA} + for f in ${TFoS_HOME}/data/mnist/csv/train/part-*; do cp $f ${TEMP_DATA}/$(basename $f | sed -e "s/[0-9][0-9]*/$COUNT/"); COUNT=$((COUNT + 1)); done; mv ${TEMP_DATA}/* ${STREAM_DATA} + + # shutdown job via or `yarn application -kill ` + + # for a "graceful" shutdown, we provide the following tool to signal the SparkStreamingContext to stop. + # Note: the host and port of the reservation server will be in the driver logs, e.g. + # "listening for reservations at ('127.0.0.1', 38254)" + python ${TFoS_HOME}/examples/utils/stop_streaming.py + #### Inference via saved_model_cli The training code will automatically export a TensorFlow SavedModel, which can be used with the `saved_model_cli` from the command line, as follows: @@ -73,17 +136,8 @@ The training code will automatically export a TensorFlow SavedModel, which can b export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} - # use a CSV formatted test example - # converting from a flat list of 784 digits to a json array (28, 28, 1) - cat <reshape.py - import sys - import numpy as np - vec = [int(x) for x in next(sys.stdin).split(',')] - img = np.reshape(vec[1:], (28, 28, 1)) - print(np.array2string(img).replace('\n ', ',')) - EOF - - IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python reshape.py) + # use a CSV formatted test example (reshaping from [784] to [28, 28, 1]) + IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python ${TFoS_HOME}/examples/utils/mnist_reshape.py) # introspect model saved_model_cli show --dir $SAVED_MODEL --all @@ -112,7 +166,7 @@ 
def main(args, ctx):
    """Train the MNIST estimator on records fed by Spark through a ``TFNode.DataFeed``.

    Builds a small Keras CNN inside an Estimator ``model_fn``, trains it with
    ``tf.estimator.train_and_evaluate`` under ``ParameterServerStrategy``, and
    finally exports a SavedModel from the node whose job name is ``'chief'``.

    Args:
        args: parsed command-line arguments; ``batch_size``, ``learning_rate``,
            ``model_dir`` and ``export_dir`` are read.
        ctx: TensorFlowOnSpark node context; ``ctx.mgr`` backs the data feed
            and ``ctx.job_name`` selects the exporting node.
    """
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        """Yield one (image, label) pair per record pulled from the Spark feed."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                # An empty batch ends the generator.
                return
            example = batch[0]
            image = np.array(example[0]).astype(np.float32) / 255.0
            image = np.reshape(image, (28, 28, 1))
            label = np.array(example[1]).astype(np.float32)
            label = np.reshape(label, (1,))
            yield (image, label)

    def input_fn(mode, input_context=None):
        """Return the training dataset; any other mode raises (see note below)."""
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Note: Spark is responsible for feeding data via streaming RDD
            ds = tf.data.Dataset.from_generator(
                rdd_generator,
                (tf.float32, tf.float32),
                (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
            return ds.batch(BATCH_SIZE)

        # NOTE(review): only TRAIN input is wired up here.  The original code
        # had a tfds-based eval pipeline placed *after* this raise, where it
        # could never execute; it has been dropped as dead code.
        raise Exception("I'm evaluating: mode={}, input_context={}".format(mode, input_context))

    def serving_input_receiver_fn():
        """Describe the serving signature: a float batch named 'features'."""
        features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features')
        receiver_tensors = {'features': features}
        return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        """Build the EstimatorSpec for PREDICT, EVAL or TRAIN."""
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            # No softmax here: the loss below is built with from_logits=True and
            # the prediction is published under the key 'logits', so the final
            # layer must emit raw logits (a softmax here would be applied twice).
            tf.keras.layers.Dense(10)
        ])
        logits = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits)
        # Per-example losses are summed and scaled by the batch size.
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    # Note: the original example used MultiWorkerMirroredStrategy which is a synchronous training strategy.
    # Since streaming data arrives irregularly, we must use the asynchronous ParameterServerStrategy
    # to allow data to be processed as it arrives and to avoid deadlocks.
    strategy = tf.distribute.experimental.ParameterServerStrategy()
    config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=args.model_dir, config=config)

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn)
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir, serving_input_receiver_fn)
default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") + parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-3) + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + # create RDD of input data + def parse(ln): + vec = [int(x) for x in ln.split(',')] + return (vec[1:], vec[0]) + + stream = ssc.textFileStream(args.images_labels) + images_labels = stream.map(parse) + + cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=1, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') + cluster.train(images_labels, feed_timeout=86400) # extend feed timeout to 24hrs for streaming data to arrive + ssc.start() + cluster.shutdown(ssc) diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index 5dbf59cc..9f7049f5 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -17,7 +17,7 @@ Notes: ${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} -#### Run using InputMode.TENSORFLOW +#### Train via InputMode.TENSORFLOW In this mode, each worker will load the entire MNIST dataset into memory (automatically downloading the dataset if needed). 
@@ -35,7 +35,7 @@ In this mode, each worker will load the entire MNIST dataset into memory (automa --model_dir ${TFoS_HOME}/mnist_model \ --export_dir ${TFoS_HOME}/mnist_export -#### Run using InputMode.SPARK +#### Train via InputMode.SPARK In this mode, Spark will distribute the MNIST dataset (as CSV) across the workers, so each of the workers will see only a portion of the dataset per epoch. Also note that InputMode.SPARK currently only supports a single input RDD, so the validation/test data is not used. @@ -113,7 +113,7 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct # Stop the TF-Serving container docker stop $(docker ps -q) -#### Run Parallel Inferencing via Spark +#### Parallel Inferencing via Spark For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor. diff --git a/examples/utils/mnist_reshape.py b/examples/utils/mnist_reshape.py new file mode 100644 index 00000000..fc7cfcc2 --- /dev/null +++ b/examples/utils/mnist_reshape.py @@ -0,0 +1,5 @@ +import sys +import numpy as np +vec = [int(x) for x in next(sys.stdin).split(',')] +img = np.reshape(vec[1:], (28, 28, 1)) +print(np.array2string(img).replace('\n ', ',')) diff --git a/tensorflowonspark/reservation_client.py b/examples/utils/stop_streaming.py similarity index 92% rename from tensorflowonspark/reservation_client.py rename to examples/utils/stop_streaming.py index d32bbcef..5d7f7e27 100644 --- a/tensorflowonspark/reservation_client.py +++ b/examples/utils/stop_streaming.py @@ -6,7 +6,7 @@ Note: use the reservation server address (host, port) reported in the driver logs. 
""" -import reservation +from tensorflowonspark import reservation import sys if __name__ == "__main__": From 497feb2762db84b53bf66f607a595b7c785ef3a0 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 15 Aug 2019 10:16:19 -0700 Subject: [PATCH 14/37] use examples/utils in keras --- examples/mnist/keras/README.md | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index 9f7049f5..d93a987e 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -74,17 +74,8 @@ The training code will automatically export a TensorFlow SavedModel, which can b export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} - # use a CSV formatted test example - # converting from a flat list of 784 digits to a json array (28, 28, 1) - cat <reshape.py - import sys - import numpy as np - vec = [int(x) for x in next(sys.stdin).split(',')] - img = np.reshape(vec[1:], (28, 28, 1)) - print(np.array2string(img).replace('\n ', ',')) - EOF - - IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python reshape.py) + # use a CSV formatted test example (reshaping from [784] to [28, 28, 1]) + IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python ${TFoS_HOME}/examples/utils/mnist_reshape.py) # introspect model saved_model_cli show --dir $SAVED_MODEL --all From d57c3c0907787036cd2bb6485f9cf7fe9fd76000 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 15 Aug 2019 13:36:43 -0700 Subject: [PATCH 15/37] move model_export.py into examples/util --- examples/utils/mnist_reshape.py | 4 ++++ examples/{ => utils}/model_export.py | 0 2 files changed, 4 insertions(+) rename examples/{ => utils}/model_export.py (100%) diff --git a/examples/utils/mnist_reshape.py b/examples/utils/mnist_reshape.py index fc7cfcc2..b761edeb 100644 --- a/examples/utils/mnist_reshape.py +++ b/examples/utils/mnist_reshape.py @@ -1,3 +1,7 @@ +# 
Copyright 2019 Yahoo Inc. +# Licensed under the terms of the Apache 2.0 license. +# Please see LICENSE file in the project root for terms. + import sys import numpy as np vec = [int(x) for x in next(sys.stdin).split(',')] diff --git a/examples/model_export.py b/examples/utils/model_export.py similarity index 100% rename from examples/model_export.py rename to examples/utils/model_export.py From e3caa24a93210c2fd6bf68ae8a9d510e3c58a59d Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 16 Aug 2019 16:33:46 -0700 Subject: [PATCH 16/37] remove InputMode.TENSORFLOW and checkpoint inferencing from ML pipeline API; add ML pipeline examples; remove model_export.py --- examples/mnist/estimator/README.md | 62 ++++++ examples/mnist/estimator/mnist_inference.py | 2 +- examples/mnist/estimator/mnist_pipeline.py | 196 ++++++++++++++++++ examples/mnist/estimator/mnist_spark.py | 5 +- .../mnist/estimator/mnist_spark_streaming.py | 5 +- examples/mnist/estimator/mnist_tf.py | 4 +- examples/mnist/keras/README.md | 64 ++++++ examples/mnist/keras/mnist_pipeline.py | 150 ++++++++++++++ examples/mnist/keras/mnist_spark.py | 9 +- examples/utils/model_export.py | 68 ------ tensorflowonspark/pipeline.py | 164 ++++++--------- test/test_pipeline.py | 166 +++------------ 12 files changed, 586 insertions(+), 309 deletions(-) create mode 100644 examples/mnist/estimator/mnist_pipeline.py create mode 100644 examples/mnist/keras/mnist_pipeline.py delete mode 100755 examples/utils/model_export.py diff --git a/examples/mnist/estimator/README.md b/examples/mnist/estimator/README.md index 92927a28..256f3265 100644 --- a/examples/mnist/estimator/README.md +++ b/examples/mnist/estimator/README.md @@ -150,6 +150,9 @@ The training code will automatically export a TensorFlow SavedModel, which can b For online inferencing use cases, you can serve the SavedModel via a TensorFlow Serving instance as follows. 
Note that TF-Serving provides both GRPC and REST APIs, but we will only demonstrate the use of the REST API. Also, [per the TensorFlow Serving instructions](https://www.tensorflow.org/tfx/serving/docker), we will run the serving instance inside a Docker container. + # path to the SavedModel export + export MODEL_BASE=${TFoS_HOME}/mnist_export + # Start the TF-Serving instance in a docker container docker pull tensorflow/serving docker run -t --rm -p 8501:8501 -v "${MODEL_BASE}:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving & @@ -170,6 +173,11 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor. + # path to the SavedModel export + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} + # remove any old artifacts rm -Rf ${TFoS_HOME}/predictions @@ -184,6 +192,60 @@ For batch inferencing use cases, you can use Spark to run multiple single-node T --export_dir ${SAVED_MODEL} \ --output ${TFoS_HOME}/predictions +#### Train and Inference via Spark ML Pipeline API + +Spark also includes an [ML Pipelines API](https://spark.apache.org/docs/latest/ml-pipeline.html), built on Spark DataFrames and intended for ML applications. Since this API is targeted towards building ML pipelines in Spark, only InputMode.SPARK is supported for this API. However, a `dfutil` library is provided to read simple TFRecords into a Spark DataFrame. Note that complex TFRecords are not supported, since they cannot be easily represented in Spark DataFrames. 
+ + # remove any old artifacts + rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export + + # train w/ CSV + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_pipeline.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/csv/train \ + --format csv \ + --mode train \ + --model_dir ${TFoS_HOME}/mnist_model \ + --export_dir ${TFoS_HOME}/mnist_export + + # train with TFRecords + # --images_labels ${TFoS_HOME}/data/mnist/tfr/train \ + # --format tfr \ + + # inference w/ CSV using exported saved_model + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} + + # remove any old artifacts + rm -rf ${TFoS_HOME}/predictions + + # inference with CSV + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/estimator/mnist_pipeline.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/csv/test \ + --format csv \ + --mode inference \ + --export_dir ${SAVED_MODEL} \ + --output ${TFoS_HOME}/predictions + + # inference with TFRecords + # --images_labels ${TFoS_HOME}/data/mnist/tfr/test \ + # --format tfr \ + #### Shutdown the Spark Standalone cluster ${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh diff --git a/examples/mnist/estimator/mnist_inference.py b/examples/mnist/estimator/mnist_inference.py index d1a05fdd..2b2c9cb3 100644 --- a/examples/mnist/estimator/mnist_inference.py +++ 
b/examples/mnist/estimator/mnist_inference.py @@ -60,7 +60,7 @@ def parse_tfr(example_proto): for batch in ds: predictions = predict(features=batch[0]) labels = np.reshape(batch[1], -1).astype(np.int) - preds = np.argmax(predictions['dense_1'], axis=1) + preds = np.argmax(predictions['logits'], axis=1) for x in zip(labels, preds): output_file.write("{} {}\n".format(x[0], x[1])) diff --git a/examples/mnist/estimator/mnist_pipeline.py b/examples/mnist/estimator/mnist_pipeline.py new file mode 100644 index 00000000..362b5aef --- /dev/null +++ b/examples/mnist/estimator/mnist_pipeline.py @@ -0,0 +1,196 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main_fun(args, ctx): + import numpy as np + import tensorflow as tf + import tensorflow_datasets as tfds + from tensorflowonspark import TFNode + + tfds.disable_progress_bar() + + class StopFeedHook(tf.estimator.SessionRunHook): + """SessionRunHook to terminate InputMode.SPARK RDD feeding if the training loop exits before the entire RDD is consumed.""" + + def __init__(self, feed): + self.feed = feed + + def end(self, session): + self.feed.terminate() + self.feed.next_batch(1) + + BATCH_SIZE = args.batch_size + LEARNING_RATE = args.learning_rate + + tf_feed = TFNode.DataFeed(ctx.mgr) + + def rdd_generator(): + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(1) + if len(batch) > 0: + example = batch[0] + image = np.array(example[0]).astype(np.float32) / 255.0 + image = np.reshape(image, (28, 28, 1)) + label = np.array(example[1]).astype(np.float32) + label = np.reshape(label, (1,)) + yield (image, label) + else: + return + + def input_fn(mode, input_context=None): + if mode == tf.estimator.ModeKeys.TRAIN: + # Note: Spark is responsible for sharding/repeating/shuffling the data via RDD + ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), 
(tf.TensorShape([28, 28, 1]), tf.TensorShape([1]))) + return ds.batch(BATCH_SIZE) + else: + raise Exception("I'm evaluating: mode={}, input_context={}".format(mode, input_context)) + + def scale(image, label): + image = tf.cast(image, tf.float32) / 255.0 + return image, label + + mnist = tfds.load(name='mnist', with_info=True, as_supervised=True) + ds = mnist['test'] + if input_context: + ds = ds.shard(input_context.num_input_pipelines, input_context.input_pipeline_id) + return ds.map(scale).batch(BATCH_SIZE) + + def serving_input_receiver_fn(): + features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features') + receiver_tensors = {'features': features} + return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors) + + def model_fn(features, labels, mode): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + logits = model(features, training=False) + + if mode == tf.estimator.ModeKeys.PREDICT: + predictions = {'logits': logits} + return tf.estimator.EstimatorSpec(mode, predictions=predictions) + + optimizer = tf.compat.v1.train.GradientDescentOptimizer( + learning_rate=LEARNING_RATE) + loss = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits) + loss = tf.reduce_sum(input_tensor=loss) * (1. 
/ BATCH_SIZE) + if mode == tf.estimator.ModeKeys.EVAL: + return tf.estimator.EstimatorSpec(mode, loss=loss) + + return tf.estimator.EstimatorSpec( + mode=mode, + loss=loss, + train_op=optimizer.minimize( + loss, tf.compat.v1.train.get_or_create_global_step())) + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=100) + + classifier = tf.estimator.Estimator( + model_fn=model_fn, model_dir=args.model_dir, config=config) + + # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn) + + # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous, + # so we need to ensure that all workers complete training before any of them run out of data from the RDD. + # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers, + # we'll just stop training at 90% of the total expected number of steps. 
+ steps = 60000 * args.epochs / args.batch_size + steps_per_worker = steps / ctx.num_workers + max_steps_per_worker = steps_per_worker * 0.9 + + tf.estimator.train_and_evaluate( + classifier, + train_spec=tf.estimator.TrainSpec(input_fn=input_fn, max_steps=max_steps_per_worker, hooks=[StopFeedHook(tf_feed)]), + eval_spec=tf.estimator.EvalSpec(input_fn=input_fn) + # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter) + ) + + if ctx.job_name == 'chief': + print("Exporting saved_model to {}".format(args.export_dir)) + classifier.export_saved_model(args.export_dir, serving_input_receiver_fn) + + +if __name__ == "__main__": + + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from pyspark.sql import SparkSession + from pyspark.sql.functions import udf + from pyspark.sql.types import IntegerType + from tensorflowonspark import TFCluster, dfutil + from tensorflowonspark.pipeline import TFEstimator, TFModel + import argparse + + sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator")) + spark = SparkSession(sc) + + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv") + parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") + parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-3) + parser.add_argument("--mode", help="train|inference", choices=["train", 
"inference"], default="train") + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") + parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + if args.format == 'tfr': + # load TFRecords as a DataFrame + df = dfutil.loadTFRecords(sc, args.images_labels) + else: # args.format == 'csv': + # create RDD of input data + def parse(ln): + vec = [int(x) for x in ln.split(',')] + return (vec[1:], vec[0]) + + images_labels = sc.textFile(args.images_labels).map(parse) + df = spark.createDataFrame(images_labels, ['image', 'label']) + + df.show() + + if args.mode == 'train': + estimator = TFEstimator(main_fun, args) \ + .setInputMapping({'image': 'image', 'label': 'label'}) \ + .setModelDir(args.model_dir) \ + .setExportDir(args.export_dir) \ + .setClusterSize(args.cluster_size) \ + .setInputMode(TFCluster.InputMode.SPARK) \ + .setTensorboard(args.tensorboard) \ + .setEpochs(args.epochs) \ + .setBatchSize(args.batch_size) \ + .setGraceSecs(60) + model = estimator.fit(df) + else: # args.mode == 'inference': + # using a trained/exported model + model = TFModel(args) \ + .setInputMapping({'image': 'features'}) \ + .setOutputMapping({'logits': 'prediction'}) \ + .setExportDir(args.export_dir) \ + .setBatchSize(args.batch_size) + + def argmax_fn(l): + return max(range(len(l)), key=lambda i: l[i]) + + argmax = udf(argmax_fn, IntegerType()) + + preds = model.transform(df).withColumn('argmax', argmax('prediction')) + preds.show() + preds.write.json(args.output) diff --git a/examples/mnist/estimator/mnist_spark.py b/examples/mnist/estimator/mnist_spark.py index f75bf2ec..3433e2fc 100644 --- a/examples/mnist/estimator/mnist_spark.py +++ 
b/examples/mnist/estimator/mnist_spark.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -def main(args, ctx): +def main_fun(args, ctx): import numpy as np import tensorflow as tf import tensorflow_datasets as tfds @@ -21,6 +21,7 @@ def end(self, session): self.feed.terminate() self.feed.next_batch(1) + BUFFER_SIZE = args.buffer_size BATCH_SIZE = args.batch_size LEARNING_RATE = args.learning_rate @@ -150,6 +151,6 @@ def parse(ln): images_labels = sc.textFile(args.images_labels).map(parse) - cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') cluster.train(images_labels, args.epochs) cluster.shutdown(grace_secs=120) # allow time for the chief to export model after data feeding diff --git a/examples/mnist/estimator/mnist_spark_streaming.py b/examples/mnist/estimator/mnist_spark_streaming.py index 651157d1..3f492893 100644 --- a/examples/mnist/estimator/mnist_spark_streaming.py +++ b/examples/mnist/estimator/mnist_spark_streaming.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -def main(args, ctx): +def main_fun(args, ctx): import numpy as np import tensorflow as tf import tensorflow_datasets as tfds @@ -11,6 +11,7 @@ def main(args, ctx): tfds.disable_progress_bar() + BUFFER_SIZE = args.buffer_size BATCH_SIZE = args.batch_size LEARNING_RATE = args.learning_rate @@ -137,7 +138,7 @@ def parse(ln): stream = ssc.textFileStream(args.images_labels) images_labels = stream.map(parse) - cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=1, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, 
master_node='chief') + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=1, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') cluster.train(images_labels, feed_timeout=86400) # extend feed timeout to 24hrs for streaming data to arrive ssc.start() cluster.shutdown(ssc) diff --git a/examples/mnist/estimator/mnist_tf.py b/examples/mnist/estimator/mnist_tf.py index a827ba02..db788c3d 100644 --- a/examples/mnist/estimator/mnist_tf.py +++ b/examples/mnist/estimator/mnist_tf.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals -def main(args, ctx): +def main_fun(args, ctx): import tensorflow_datasets as tfds import tensorflow as tf @@ -106,5 +106,5 @@ def model_fn(features, labels, mode): args = parser.parse_args() print("args:", args) - cluster = TFCluster.run(sc, main, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True) + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True) cluster.shutdown(grace_secs=120) diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index d93a987e..ba816230 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -88,6 +88,9 @@ The training code will automatically export a TensorFlow SavedModel, which can b For online inferencing use cases, you can serve the SavedModel via a TensorFlow Serving instance as follows. Note that TF-Serving provides both GRPC and REST APIs, but we will only demonstrate the use of the REST API. Also, [per the TensorFlow Serving instructions](https://www.tensorflow.org/serving/), we will run the serving instance inside a Docker container. 
+ # path to the SavedModel export + export MODEL_BASE=${TFoS_HOME}/mnist_export + # Start the TF-Serving instance in a docker container docker pull tensorflow/serving docker run -t --rm -p 8501:8501 -v "${MODEL_BASE}:/models/mnist" -e MODEL_NAME=mnist tensorflow/serving & @@ -98,6 +101,9 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct # GET model metadata curl http://localhost:8501/v1/models/mnist/metadata + # use a CSV formatted test example (reshaping from [784] to [28, 28, 1]) + IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python ${TFoS_HOME}/examples/utils/mnist_reshape.py) + # POST example for inferencing curl -v -d "{\"instances\": [ {\"conv2d_input\": $IMG } ]}" -X POST http://localhost:8501/v1/models/mnist:predict @@ -108,6 +114,11 @@ demonstrate the use of the REST API. Also, [per the TensorFlow Serving instruct For batch inferencing use cases, you can use Spark to run multiple single-node TensorFlow instances in parallel (on the Spark executors). Each executor/instance will operate independently on a shard of the dataset. Note that this requires that the model fits in the memory of each executor. + # path to the SavedModel export + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} + # remove any old artifacts rm -Rf ${TFoS_HOME}/predictions @@ -122,6 +133,59 @@ For batch inferencing use cases, you can use Spark to run multiple single-node T --export_dir ${TFoS_HOME}/mnist_export \ --output ${TFoS_HOME}/predictions +#### Train and Inference via Spark ML Pipeline API + +Spark also includes an [ML Pipelines API](https://spark.apache.org/docs/latest/ml-pipeline.html), built on Spark DataFrames and intended for ML applications. Since this API is targeted towards building ML pipelines in Spark, only InputMode.SPARK is supported for this API. 
However, a `dfutil` library is provided to read simple TFRecords into a Spark DataFrame. Note that complex TFRecords are not supported, since they cannot be easily represented in Spark DataFrames. + + # remove any old artifacts + rm -rf ${TFoS_HOME}/mnist_model + rm -rf ${TFoS_HOME}/mnist_export + + # train w/ CSV + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/keras/mnist_pipeline.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/csv/train \ + --format csv \ + --mode train \ + --model_dir ${TFoS_HOME}/mnist_model \ + --export_dir ${TFoS_HOME}/mnist_export + + # train with TFRecords + # --images_labels ${TFoS_HOME}/data/mnist/tfr/train \ + # --format tfr \ + + # inference w/ CSV using exported saved_model + export MODEL_BASE=${TFoS_HOME}/mnist_export + export MODEL_VERSION=$(ls ${MODEL_BASE} | sort -n | tail -n 1) + export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} + + # remove any old artifacts + rm -rf ${TFoS_HOME}/predictions + + # inference with CSV + ${SPARK_HOME}/bin/spark-submit \ + --master ${MASTER} \ + --conf spark.cores.max=${TOTAL_CORES} \ + --conf spark.task.cpus=${CORES_PER_WORKER} \ + --conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ + --jars ${TFoS_HOME}/lib/tensorflow-hadoop-1.0-SNAPSHOT.jar \ + ${TFoS_HOME}/examples/mnist/keras/mnist_pipeline.py \ + --cluster_size ${SPARK_WORKER_INSTANCES} \ + --images_labels ${TFoS_HOME}/data/mnist/csv/test \ + --format csv \ + --mode inference \ + --export_dir ${SAVED_MODEL} \ + --output ${TFoS_HOME}/predictions + + # inference with TFRecords + # --images_labels ${TFoS_HOME}/data/mnist/tfr/test \ + # --format tfr \ #### Shutdown the Spark Standalone cluster diff --git a/examples/mnist/keras/mnist_pipeline.py 
b/examples/mnist/keras/mnist_pipeline.py new file mode 100644 index 00000000..89620efe --- /dev/null +++ b/examples/mnist/keras/mnist_pipeline.py @@ -0,0 +1,150 @@ +# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_keras + +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main_fun(args, ctx): + import numpy as np + import tensorflow as tf + from tensorflowonspark import TFNode + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + def build_and_compile_cnn_model(): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10, activation='softmax') + ]) + model.compile( + loss=tf.keras.losses.sparse_categorical_crossentropy, + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=['accuracy']) + return model + + # single node + # single_worker_model = build_and_compile_cnn_model() + # single_worker_model.fit(x=train_datasets, epochs=3) + + tf_feed = TFNode.DataFeed(ctx.mgr, False) + + def rdd_generator(): + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(1) + if len(batch) > 0: + example = batch[0] + image = np.array(example[0]).astype(np.float32) / 255.0 + image = np.reshape(image, (28, 28, 1)) + label = np.array(example[1]).astype(np.float32) + label = np.reshape(label, (1,)) + yield (image, label) + else: + return + + ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([28, 28, 1]), tf.TensorShape([1]))) + ds = ds.batch(args.batch_size) + + # this fails + # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)] + tf.io.gfile.makedirs(args.model_dir) + filepath = args.model_dir + "/weights-{epoch:04d}" + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, 
load_weights_on_restart=True, save_weights_only=True)] + + with strategy.scope(): + multi_worker_model = build_and_compile_cnn_model() + + # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous, + # so we need to ensure that all workers complete training before any of them run out of data from the RDD. + # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers, + # we'll just stop training at 90% of the total expected number of steps. + steps_per_epoch = 60000 / args.batch_size + steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers + max_steps_per_worker = steps_per_epoch_per_worker * 0.9 + + multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=max_steps_per_worker, callbacks=callbacks) + + if ctx.job_name == 'chief': + from tensorflow_estimator.python.estimator.export import export_lib + export_dir = export_lib.get_timestamped_export_dir(args.export_dir) + tf.keras.experimental.export_saved_model(multi_worker_model, export_dir) + # multi_worker_model.save(args.model_dir, save_format='tf') + + # terminating feed tells spark to skip processing further partitions + tf_feed.terminate() + + +if __name__ == '__main__': + import argparse + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from pyspark.sql import SparkSession + from pyspark.sql.functions import udf + from pyspark.sql.types import IntegerType + from tensorflowonspark import TFCluster, dfutil + from tensorflowonspark.pipeline import TFEstimator, TFModel + + sc = SparkContext(conf=SparkConf().setAppName("mnist_keras")) + spark = SparkSession(sc) + + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, 
default=num_executors) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--format", help="example format: (csv|tfr)", choices=["csv", "tfr"], default="csv") + parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format") + parser.add_argument("--mode", help="train|inference", choices=["train", "inference"], default="train") + parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") + parser.add_argument("--output", help="HDFS path to save predictions", type=str, default="predictions") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + if args.format == 'tfr': + # load TFRecords as a DataFrame + df = dfutil.loadTFRecords(sc, args.images_labels) + else: # args.format == 'csv': + # create RDD of input data + def parse(ln): + vec = [int(x) for x in ln.split(',')] + return (vec[1:], vec[0]) + + images_labels = sc.textFile(args.images_labels).map(parse) + df = spark.createDataFrame(images_labels, ['image', 'label']) + + df.show() + + if args.mode == 'train': + estimator = TFEstimator(main_fun, args) \ + .setInputMapping({'image': 'image', 'label': 'label'}) \ + .setModelDir(args.model_dir) \ + .setExportDir(args.export_dir) \ + .setClusterSize(args.cluster_size) \ + .setInputMode(TFCluster.InputMode.SPARK) \ + .setTensorboard(args.tensorboard) \ + .setEpochs(args.epochs) \ + .setBatchSize(args.batch_size) \ + .setGraceSecs(60) + model = estimator.fit(df) + else: # args.mode == 'inference': + # using a trained/exported model + model = TFModel(args) \ + .setInputMapping({'image': 'conv2d_input'}) \ + .setOutputMapping({'dense_1': 'prediction'}) \ + .setExportDir(args.export_dir) \ + .setBatchSize(args.batch_size) + + def argmax_fn(l): + return 
max(range(len(l)), key=lambda i: l[i]) + + argmax = udf(argmax_fn, IntegerType()) + + preds = model.transform(df).withColumn('argmax', argmax('prediction')) + preds.show() + preds.write.json(args.output) diff --git a/examples/mnist/keras/mnist_spark.py b/examples/mnist/keras/mnist_spark.py index 1514d7ac..8e3c6a15 100644 --- a/examples/mnist/keras/mnist_spark.py +++ b/examples/mnist/keras/mnist_spark.py @@ -55,10 +55,15 @@ def rdd_generator(): with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() + # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous, + # so we need to ensure that all workers complete training before any of them run out of data from the RDD. + # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers, + # we'll just stop training at 90% of the total expected number of steps. steps_per_epoch = 60000 / args.batch_size steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers + max_steps_per_worker = steps_per_epoch_per_worker * 0.9 - multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch_per_worker, callbacks=callbacks) + multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=max_steps_per_worker, callbacks=callbacks) if ctx.job_name == 'chief': from tensorflow_estimator.python.estimator.export import export_lib @@ -102,5 +107,5 @@ def parse(ln): cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, master_node='chief') # Note: need to feed extra data to ensure that each worker receives sufficient data to complete epochs # to compensate for variability in partition sizes and spark scheduling - cluster.train(images_labels, args.epochs + 1) + cluster.train(images_labels, args.epochs) cluster.shutdown() diff --git a/examples/utils/model_export.py b/examples/utils/model_export.py deleted file mode 100755 index 3602754c..00000000 --- 
a/examples/utils/model_export.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2018 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -from __future__ import absolute_import -from __future__ import division -from __future__ import nested_scopes -from __future__ import print_function - -import argparse -import json -import sys -import tensorflow as tf -from tensorflowonspark import TFNode - -# -# Utility to load a TensorFlow checkpoint and export it as a saved_model, -# given a user-supplied signature definition in JSON format supplied as -# a command-line argument or as a file. -# -def main(_): - # restore graph/session from checkpoint - sess = tf.Session(graph=tf.get_default_graph()) - ckpt = tf.train.latest_checkpoint(FLAGS.model_dir) - saver = tf.train.import_meta_graph(ckpt + '.meta', clear_devices=True) - saver.restore(sess, ckpt) - g = sess.graph - - # if --show, dump out all operations in this graph - if FLAGS.show: - for o in g.get_operations(): - print("{:>64}\t{}".format(o.name, o.type)) - - if FLAGS.export_dir and FLAGS.signatures: - # load/parse JSON signatures - if ':' in FLAGS.signatures: - # assume JSON string, since unix filenames shouldn't contain colons - signatures = json.loads(FLAGS.signatures) - else: - # assume JSON file - with open(FLAGS.signatures) as f: - signatures = json.load(f) - - # convert string input/output values with actual tensors from graph - for name, sig in signatures.items(): - for k, v in sig['inputs'].items(): - tensor_name = v if v.endswith(':0') else v + ':0' - sig['inputs'][k] = g.get_tensor_by_name(tensor_name) - for k, v in sig['outputs'].items(): - tensor_name = v if v.endswith(':0') else v + ':0' - sig['outputs'][k] = g.get_tensor_by_name(tensor_name) - - # export a saved model - TFNode.export_saved_model(sess, - FLAGS.export_dir, - tf.saved_model.tag_constants.SERVING, - signatures) - - -if __name__ == "__main__": - parser = 
argparse.ArgumentParser() - parser.add_argument('--model_dir', type=str, help='Path to trained model checkpoint', required=True) - parser.add_argument('--export_dir', type=str, help='Path to export saved_model') - parser.add_argument('--signatures', type=str, help='JSON file or string representing list of signatures (inputs, outputs) to export') - parser.add_argument('--show', help='Print all graph operations', action="store_true") - FLAGS, _ = parser.parse_known_args() - tf.app.run(main=main, argv=sys.argv) - diff --git a/tensorflowonspark/pipeline.py b/tensorflowonspark/pipeline.py index 6ba32336..000b6a9f 100755 --- a/tensorflowonspark/pipeline.py +++ b/tensorflowonspark/pipeline.py @@ -5,12 +5,10 @@ It provides a TFEstimator class to fit a TFModel using TensorFlow. The TFEstimator will actually spawn a TensorFlowOnSpark cluster to conduct distributed training, but due to architectural limitations, the TFModel will only run single-node TensorFlow instances -when inferencing on the executors. The executors will run in parallel, but the TensorFlow model must fit in the memory +when inferencing on the executors. The executors will run in parallel, so the TensorFlow model must fit in the memory of each executor. -There is also an option to provide a separate "export" function, which allows users to export a different graph for inferencing vs. training. -This is useful when the training graph uses InputMode.TENSORFLOW with queue_runners, but the inferencing graph needs placeholders. -And this is especially useful for exporting saved_models for TensorFlow Serving. + """ from __future__ import absolute_import @@ -26,7 +24,7 @@ # from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.python.saved_model import loader from tensorflow.python.tools import saved_model_utils -from . import TFCluster, dfutil, util +from . 
import TFCluster, util import argparse import copy @@ -85,6 +83,19 @@ def getEpochs(self): return self.getOrDefault(self.epochs) +class HasGraceSecs(Params): + grace_secs = Param(Params._dummy(), "grace_secs", "Number of seconds to wait after feeding data (for final tasks like exporting a saved_model)", typeConverter=TypeConverters.toInt) + + def __init__(self): + super(HasGraceSecs, self).__init__() + + def setGraceSecs(self, value): + return self._set(grace_secs=value) + + def getGraceSecs(self): + return self.getOrDefault(self.grace_secs) + + class HasInputMapping(Params): input_mapping = Param(Params._dummy(), "input_mapping", "Mapping of input DataFrame column to input tensor", typeConverter=TFTypeConverters.toDict) @@ -333,21 +344,18 @@ def merge_args_params(self): class TFEstimator(Estimator, TFParams, HasInputMapping, - HasClusterSize, HasNumPS, HasInputMode, HasMasterNode, HasProtocol, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, + HasClusterSize, HasNumPS, HasInputMode, HasMasterNode, HasProtocol, HasGraceSecs, + HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, HasBatchSize, HasEpochs, HasReaders, HasSteps): """Spark ML Estimator which launches a TensorFlowOnSpark cluster for distributed training. The columns of the DataFrame passed to the ``fit()`` method will be mapped to TensorFlow tensors according to the ``setInputMapping()`` method. + Since the Spark ML Estimator API inherently relies on DataFrames/DataSets, InputMode.TENSORFLOW is not supported. If an ``export_fn`` was provided to the constructor, it will be run on a single executor immediately after the distributed training has completed. This allows users to export a TensorFlow saved_model with a different execution graph for inferencing, e.g. replacing an input graph of TFReaders and QueueRunners with Placeholders. - For InputMode.TENSORFLOW, the input DataFrame will be exported as TFRecords to a temporary location specified by the ``tfrecord_dir``. 
- The TensorFlow application will then be expected to read directly from this location during training. However, if the input DataFrame was - produced by the ``dfutil.loadTFRecords()`` method, i.e. originated from TFRecords on disk, then the `tfrecord_dir` will be set to the - original source location of the TFRecords with the additional export step. - Args: :train_fn: TensorFlow "main" function for training. :tf_args: Arguments specific to the TensorFlow "main" function. @@ -357,10 +365,9 @@ class TFEstimator(Estimator, TFParams, HasInputMapping, train_fn = None export_fn = None - def __init__(self, train_fn, tf_args, export_fn=None): + def __init__(self, train_fn, tf_args): super(TFEstimator, self).__init__() self.train_fn = train_fn - self.export_fn = export_fn self.args = Namespace(tf_args) self._setDefault(input_mapping={}, cluster_size=1, @@ -376,7 +383,8 @@ def __init__(self, train_fn, tf_args, export_fn=None): batch_size=100, epochs=1, readers=1, - steps=1000) + steps=1000, + grace_secs=30) def _fit(self, dataset): """Trains a TensorFlow model and returns a TFModel instance with the same args/params pointing to a checkpoint or saved_model on disk. @@ -395,19 +403,7 @@ def _fit(self, dataset): logging.info("===== 3. train args + params: {0}".format(local_args)) if local_args.input_mode == TFCluster.InputMode.TENSORFLOW: - if dfutil.isLoadedDF(dataset): - # if just a DataFrame loaded from tfrecords, just point to original source path - logging.info("Loaded DataFrame of TFRecord.") - local_args.tfrecord_dir = dfutil.loadedDF[dataset] - else: - # otherwise, save as tfrecords and point to save path - assert local_args.tfrecord_dir, "Please specify --tfrecord_dir to export DataFrame to TFRecord." 
- if self.getInputMapping(): - # if input mapping provided, filter only required columns before exporting - dataset = dataset.select(list(self.getInputMapping())) - logging.info("Exporting DataFrame {} as TFRecord to: {}".format(dataset.dtypes, local_args.tfrecord_dir)) - dfutil.saveAsTFRecords(dataset, local_args.tfrecord_dir) - logging.info("Done saving") + raise Exception("InputMode.TENSORFLOW is not supported.") tf_args = self.args.argv if self.args.argv else local_args cluster = TFCluster.run(sc, self.train_fn, tf_args, local_args.cluster_size, local_args.num_ps, @@ -416,19 +412,7 @@ def _fit(self, dataset): # feed data, using a deterministic order for input columns (lexicographic by key) input_cols = sorted(self.getInputMapping()) cluster.train(dataset.select(input_cols).rdd, local_args.epochs) - cluster.shutdown(grace_secs=30) - - # Run export function, if provided - if self.export_fn: - assert local_args.export_dir, "Export function requires --export_dir to be set" - logging.info("Exporting saved_model (via export_fn) to: {}".format(local_args.export_dir)) - - def _export(iterator, fn, args): - single_node_env(args) - fn(args) - - # Run on a single exeucutor - sc.parallelize([1], 1).foreachPartition(lambda it: _export(it, self.export_fn, tf_args)) + cluster.shutdown(grace_secs=self.getGraceSecs()) return self._copyValues(TFModel(self.args)) @@ -455,8 +439,8 @@ def __init__(self, tf_args): batch_size=100, model_dir=None, export_dir=None, - signature_def_key=None, - tag_set=None) + signature_def_key='serving_default', + tag_set='serve') def _transform(self, dataset): """Transforms the input DataFrame by applying the _run_model() mapPartitions function. @@ -488,13 +472,13 @@ def _transform(self, dataset): return spark.createDataFrame(rows_out, output_cols) -# global to each python worker process on the executors -global_sess = None # tf.Session cache -global_args = None # args provided to the _run_model() method. 
Any change will invalidate the global_sess cache. +# global on each python worker process on the executors +pred_fn = None # saved_model prediction function/signature. +pred_args = None # args provided to the _run_model() method. Any change will invalidate the pred_fn. def _run_model(iterator, args, tf_args): - """mapPartitions function to run single-node inferencing from a checkpoint/saved_model, using the model's input/output mappings. + """mapPartitions function to run single-node inferencing from a saved_model, using input/output mappings. Args: :iterator: input RDD partition iterator. @@ -506,68 +490,58 @@ def _run_model(iterator, args, tf_args): """ single_node_env(tf_args) - tf.compat.v1.disable_eager_execution() - logging.info("===== input_mapping: {}".format(args.input_mapping)) logging.info("===== output_mapping: {}".format(args.output_mapping)) input_tensor_names = [tensor for col, tensor in sorted(args.input_mapping.items())] output_tensor_names = [tensor for tensor, col in sorted(args.output_mapping.items())] - # if using a signature_def_key, get input/output tensor info from the requested signature - if args.signature_def_key: - assert args.export_dir, "Inferencing with signature_def_key requires --export_dir argument" - logging.info("===== loading meta_graph_def for tag_set ({0}) from saved_model: {1}".format(args.tag_set, args.export_dir)) - meta_graph_def = saved_model_utils.get_meta_graph_def(args.export_dir, args.tag_set) - signature = meta_graph_def.signature_def[args.signature_def_key] - logging.debug("signature: {}".format(signature)) - inputs_tensor_info = signature.inputs - logging.debug("inputs_tensor_info: {0}".format(inputs_tensor_info)) - outputs_tensor_info = signature.outputs - logging.debug("outputs_tensor_info: {0}".format(outputs_tensor_info)) + global pred_fn, pred_args - result = [] + # cache saved_model pred_fn to avoid reloading the model for each partition + if not pred_fn or args != pred_args: + assert args.export_dir, 
"Inferencing requires --export_dir argument" + logging.info("===== loading saved_model from: {}".format(args.export_dir)) + saved_model = tf.saved_model.load(args.export_dir, tags=args.tag_set) + logging.info("===== signature_def_key: {}".format(args.signature_def_key)) + pred_fn = saved_model.signatures[args.signature_def_key] + pred_args = args - global global_sess, global_args - if global_sess and global_args == args: - # if graph/session already loaded/started (and using same args), just reuse it - sess = global_sess - else: - # otherwise, create new session and load graph from disk - tf.compat.v1.reset_default_graph() - sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph()) - if args.export_dir: - assert args.tag_set, "Inferencing from a saved_model requires --tag_set" - # load graph from a saved_model - logging.info("===== restoring from saved_model: {}".format(args.export_dir)) - loader.load(sess, args.tag_set.split(','), args.export_dir) - else: - raise Exception("Inferencing requires --export_dir argument") - global_sess = sess - global_args = args + inputs_tensor_info = {i.name: i for i in pred_fn.inputs} + logging.info("===== inputs_tensor_info: {0}".format(inputs_tensor_info)) + outputs_tensor_info = pred_fn.outputs + logging.info("===== outputs_tensor_info: {0}".format(outputs_tensor_info)) - # get list of input/output tensors (by name) - if args.signature_def_key: - input_tensors = [inputs_tensor_info[t].name for t in input_tensor_names] - output_tensors = [outputs_tensor_info[output_tensor_names[0]].name] - else: - input_tensors = [t + ':0' for t in input_tensor_names] - output_tensors = [t + ':0' for t in output_tensor_names] - - logging.info("input_tensors: {0}".format(input_tensors)) - logging.info("output_tensors: {0}".format(output_tensors)) + result = [] # feed data in batches and return output tensors for tensors in yield_batch(iterator, args.batch_size, len(input_tensor_names)): - inputs_feed_dict = {} - for i in 
range(len(input_tensors)): - inputs_feed_dict[input_tensors[i]] = tensors[i] + inputs = {} + for i in range(len(input_tensor_names)): + name = input_tensor_names[i] + t = inputs_tensor_info[name + ":0"] + tensor = tf.constant(tensors[i], dtype=t.dtype) + # coerce shape if needed, since Spark only supports flat arrays + # and since saved_models don't encode tf.data operations + expected_shape = list(t.shape) + expected_shape[0] = tensor.shape[0] + if tensor.shape != expected_shape: + tensor = tf.reshape(tensor, expected_shape) + inputs[name] = tensor + + predictions = pred_fn(**inputs) + outputs = {k: v for k, v in predictions.items() if k in output_tensor_names} + + # validate that all output sizes match input size + output_sizes = [len(v) for k, v in outputs.items()] - outputs = sess.run(output_tensors, feed_dict=inputs_feed_dict) - lengths = [len(output) for output in outputs] input_size = len(tensors[0]) - assert all([length == input_size for length in lengths]), "Output array sizes {} must match input size: {}".format(lengths, input_size) - python_outputs = [output.tolist() for output in outputs] # convert from numpy to standard python types - result.extend(zip(*python_outputs)) # convert to an array of tuples of "output columns" + assert all([osize == input_size for osize in output_sizes]), "Output array sizes {} must match input size: {}".format(output_sizes, input_size) + + # convert to standard python types + python_outputs = [v.numpy().tolist() for k, v in outputs.items()] + + # convert to an array of tuples of "output columns" + result.extend(zip(*python_outputs)) return result diff --git a/test/test_pipeline.py b/test/test_pipeline.py index ed9ff1bd..ba88d7ab 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -1,12 +1,9 @@ import numpy as np import os -import scipy import shutil import test -import time import unittest -from tensorflowonspark import TFCluster, dfutil from tensorflowonspark.pipeline import HasBatchSize, HasSteps, Namespace, 
TFEstimator, TFParams from tensorflow.keras import Sequential from tensorflow.keras.layers import Dense @@ -91,12 +88,40 @@ def __init__(self, args): def test_spark_saved_model(self): """InputMode.SPARK TFEstimator w/ explicit saved_model export for TFModel inferencing""" + def _spark_train(args, ctx): + """Basic linear regression in a distributed TF cluster using InputMode.SPARK""" + import tensorflow as tf + from tensorflowonspark import TFNode + + tf.compat.v1.reset_default_graph() + + model = Sequential() + model.add(Dense(1, activation='linear', input_shape=(2,))) + model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) + model.summary() + + tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(args.batch_size) + if args.input_mapping: + if len(batch['x']) > 0: + model.fit(np.array(batch['x']), np.array(batch['y_'])) + + if ctx.job_name == 'chief': + print("saving checkpoint to: {}".format(args.model_dir)) + tf.saved_model.save(model, args.model_dir) + # model.save_weights(args.model_dir + "/model", overwrite=True, save_format='tf') + + if args.export_dir: + print("exporting model to: {}".format(args.export_dir)) + tf.keras.experimental.export_saved_model(model, args.export_dir) + # create a Spark DataFrame of training examples (features, labels) trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']).repartition(3) # train and export model args = {} - estimator = TFEstimator(self.get_function('spark/train'), args) \ + estimator = TFEstimator(_spark_train, args) \ .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ .setModelDir(self.model_dir) \ .setExportDir(self.export_dir) \ @@ -120,15 +145,6 @@ def test_spark_saved_model(self): expected = np.sum(self.weights) self.assertAlmostEqual(pred, expected, 3) - # test saved_model using custom/direct mapping - model.setTagSet('serve') \ - .setSignatureDefKey(None) \ - .setInputMapping({'c1': 
'dense_input'}) \ - .setOutputMapping({'dense/BiasAdd': 'cout'}) - preds = model.transform(testDF).head() # take first/only result - pred = preds.cout[0] # unpack pred scalar from tensor - self.assertAlmostEqual(pred, expected, 3) - # def test_spark_sparse_tensor(self): # """InputMode.SPARK feeding sparse tensors""" # def sparse_train(args, ctx): @@ -205,130 +221,6 @@ def test_spark_saved_model(self): # preds = model.transform(test_df) # preds.show(5) - def test_tf_column_filter(self): - """InputMode.TENSORFLOW TFEstimator saving temporary TFRecords, filtered by input_mapping columns""" - - # create a Spark DataFrame of training examples (features, labels) - trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']) - - # and add some extra columns - df = trainDF.withColumn('extra1', trainDF.col1) - df = df.withColumn('extra2', trainDF.col2) - self.assertEqual(len(df.columns), 4) - df.show() - - # train model on selected columns - args = {} - estimator = TFEstimator(self.get_function('tf/train'), args) \ - .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ - .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setExportDir(self.export_dir) \ - .setTFRecordDir(self.tfrecord_dir) \ - .setClusterSize(self.num_workers) \ - .setNumPS(1) \ - .setBatchSize(10) - estimator.fit(df) - self.assertTrue(os.path.isdir(self.export_dir)) - self.assertTrue(os.path.isdir(self.tfrecord_dir)) - - # verify that temporarily-saved TFRecords have the columns we requested - df_tmp = dfutil.loadTFRecords(self.sc, self.tfrecord_dir) - df_tmp.show() - - self.assertEqual(df_tmp.columns, ['col1', 'col2']) - - def test_tf_saved_model(self): - """InputMode.TENSORFLOW TFEstimator w/ a separate saved_model export function to add placeholders for InputMode.SPARK TFModel inferencing""" - - # create a Spark DataFrame of training examples (features, labels) - trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']) - - # train model - args = {} - estimator = 
TFEstimator(self.get_function('tf/train'), args) \ - .setInputMapping({'col1': 'x', 'col2': 'y_'}) \ - .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setExportDir(self.export_dir) \ - .setTFRecordDir(self.tfrecord_dir) \ - .setClusterSize(self.num_workers) \ - .setNumPS(1) \ - .setBatchSize(10) - model = estimator.fit(trainDF) - self.assertTrue(os.path.isdir(self.export_dir)) - - # create a Spark DataFrame of test examples (features, labels) - testDF = self.spark.createDataFrame(self.test_examples, ['c1', 'c2']) - - # test from saved_model - model.setTagSet('serve') \ - .setSignatureDefKey('serving_default') \ - .setInputMapping({'c1': 'dense_input'}) \ - .setOutputMapping({'dense': 'cout'}) - preds = model.transform(testDF).head() # take first/only result, e.g. [ Row(cout=[4.758000373840332])] - pred1 = preds.cout[0] - self.assertAlmostEqual(pred1, np.sum(self.weights), 5) - - def get_function(self, name): - """Returns a TF map_function for tests (required to avoid serializing the parent module/class)""" - - def _spark_train(args, ctx): - """Basic linear regression in a distributed TF cluster using InputMode.SPARK""" - import tensorflow as tf - from tensorflowonspark import TFNode - - model = Sequential() - model.add(Dense(1, activation='linear', input_shape=(2,))) - model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) - model.summary() - - tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) - while not tf_feed.should_stop(): - batch = tf_feed.next_batch(args.batch_size) - if args.input_mapping: - if len(batch['x']) > 0: - model.fit(np.array(batch['x']), np.array(batch['y_'])) - - if ctx.job_name == 'chief': - print("saving checkpoint to: {}".format(args.model_dir)) - tf.saved_model.save(model, args.model_dir) - # model.save_weights(args.model_dir + "/model", overwrite=True, save_format='tf') - - if args.export_dir: - print("exporting model to: {}".format(args.export_dir)) - 
tf.keras.experimental.export_saved_model(model, args.export_dir) - - def _tf_train(args, ctx): - """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW""" - import tensorflow as tf - - def _get_examples(num_rows, batch_size): - """Generate test data""" - for i in range(num_rows): - features = tf.random.uniform([batch_size, 2]) # (batch_size x 2) - weights = tf.constant([[3.14], [1.618]]) # (2, 1) - labels = tf.matmul(features, weights) - yield features, labels - - model = Sequential() - model.add(Dense(1, activation='linear', input_shape=(2,))) - model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) - model.summary() - - model.fit_generator(_get_examples(1000, 10), steps_per_epoch=100, epochs=5) - - # export saved_model - if ctx.job_name == 'chief' and args.export_dir: - print("model weights: {}".format(model.get_weights())) - print("exporting model to: {}".format(args.export_dir)) - tf.keras.experimental.export_saved_model(model, args.export_dir) - - if name == 'spark/train': - return _spark_train - elif name == 'tf/train': - return _tf_train - else: - raise Exception("Unknown function name: {}".format(name)) - if __name__ == '__main__': unittest.main() From b3c9b90f1067508d936ea6fb55db19235ca6b23b Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 19 Aug 2019 09:50:11 -0700 Subject: [PATCH 17/37] remove input_mode param --- tensorflowonspark/pipeline.py | 36 +++++------------------------------ test/test_pipeline.py | 2 +- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/tensorflowonspark/pipeline.py b/tensorflowonspark/pipeline.py index 000b6a9f..c06dfaea 100755 --- a/tensorflowonspark/pipeline.py +++ b/tensorflowonspark/pipeline.py @@ -21,9 +21,6 @@ from pyspark.sql import Row, SparkSession import tensorflow as tf -# from tensorflow.contrib.saved_model.python.saved_model import reader -from tensorflow.python.saved_model import loader -from tensorflow.python.tools import saved_model_utils 
from . import TFCluster, util import argparse @@ -109,19 +106,6 @@ def getInputMapping(self): return self.getOrDefault(self.input_mapping) -class HasInputMode(Params): - input_mode = Param(Params._dummy(), "input_mode", "Input data feeding mode (0=TENSORFLOW, 1=SPARK)", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasInputMode, self).__init__() - - def setInputMode(self, value): - return self._set(input_mode=value) - - def getInputMode(self): - return self.getOrDefault(self.input_mode) - - class HasMasterNode(Params): master_node = Param(Params._dummy(), "master_node", "Job name of master/chief worker node", typeConverter=TypeConverters.toString) @@ -344,7 +328,7 @@ def merge_args_params(self): class TFEstimator(Estimator, TFParams, HasInputMapping, - HasClusterSize, HasNumPS, HasInputMode, HasMasterNode, HasProtocol, HasGraceSecs, + HasClusterSize, HasNumPS, HasMasterNode, HasProtocol, HasGraceSecs, HasTensorboard, HasModelDir, HasExportDir, HasTFRecordDir, HasBatchSize, HasEpochs, HasReaders, HasSteps): """Spark ML Estimator which launches a TensorFlowOnSpark cluster for distributed training. @@ -352,14 +336,9 @@ class TFEstimator(Estimator, TFParams, HasInputMapping, The columns of the DataFrame passed to the ``fit()`` method will be mapped to TensorFlow tensors according to the ``setInputMapping()`` method. Since the Spark ML Estimator API inherently relies on DataFrames/DataSets, InputMode.TENSORFLOW is not supported. - If an ``export_fn`` was provided to the constructor, it will be run on a single executor immediately after the distributed training has completed. - This allows users to export a TensorFlow saved_model with a different execution graph for inferencing, e.g. replacing an input graph of - TFReaders and QueueRunners with Placeholders. - Args: :train_fn: TensorFlow "main" function for training. :tf_args: Arguments specific to the TensorFlow "main" function. - :export_fn: TensorFlow function for exporting a saved_model. 
""" train_fn = None @@ -373,7 +352,6 @@ def __init__(self, train_fn, tf_args): cluster_size=1, num_ps=0, driver_ps_nodes=False, - input_mode=TFCluster.InputMode.SPARK, master_node='chief', protocol='grpc', tensorboard=False, @@ -402,16 +380,12 @@ def _fit(self, dataset): local_args = self.merge_args_params() logging.info("===== 3. train args + params: {0}".format(local_args)) - if local_args.input_mode == TFCluster.InputMode.TENSORFLOW: - raise Exception("InputMode.TENSORFLOW is not supported.") - tf_args = self.args.argv if self.args.argv else local_args cluster = TFCluster.run(sc, self.train_fn, tf_args, local_args.cluster_size, local_args.num_ps, - local_args.tensorboard, local_args.input_mode, master_node=local_args.master_node, driver_ps_nodes=local_args.driver_ps_nodes) - if local_args.input_mode == TFCluster.InputMode.SPARK: - # feed data, using a deterministic order for input columns (lexicographic by key) - input_cols = sorted(self.getInputMapping()) - cluster.train(dataset.select(input_cols).rdd, local_args.epochs) + local_args.tensorboard, TFCluster.InputMode.SPARK, master_node=local_args.master_node, driver_ps_nodes=local_args.driver_ps_nodes) + # feed data, using a deterministic order for input columns (lexicographic by key) + input_cols = sorted(self.getInputMapping()) + cluster.train(dataset.select(input_cols).rdd, local_args.epochs) cluster.shutdown(grace_secs=self.getGraceSecs()) return self._copyValues(TFModel(self.args)) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index ba88d7ab..7831fd70 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -143,7 +143,7 @@ def _spark_train(args, ctx): preds = model.transform(testDF).head() # take first/only result pred = preds.cout[0] # unpack scalar from tensor expected = np.sum(self.weights) - self.assertAlmostEqual(pred, expected, 3) + self.assertAlmostEqual(pred, expected, 2) # def test_spark_sparse_tensor(self): # """InputMode.SPARK feeding sparse tensors""" From 
9c096beeed9f34a4b6ce631e33d8d2df15c78491 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 19 Aug 2019 10:28:25 -0700 Subject: [PATCH 18/37] remove references to InputMode param --- examples/mnist/estimator/mnist_pipeline.py | 3 +-- examples/mnist/keras/mnist_pipeline.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/mnist/estimator/mnist_pipeline.py b/examples/mnist/estimator/mnist_pipeline.py index 362b5aef..b05d1c7c 100644 --- a/examples/mnist/estimator/mnist_pipeline.py +++ b/examples/mnist/estimator/mnist_pipeline.py @@ -125,7 +125,7 @@ def model_fn(features, labels, mode): from pyspark.sql import SparkSession from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType - from tensorflowonspark import TFCluster, dfutil + from tensorflowonspark import dfutil from tensorflowonspark.pipeline import TFEstimator, TFModel import argparse @@ -172,7 +172,6 @@ def parse(ln): .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ - .setInputMode(TFCluster.InputMode.SPARK) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) \ .setBatchSize(args.batch_size) \ diff --git a/examples/mnist/keras/mnist_pipeline.py b/examples/mnist/keras/mnist_pipeline.py index 89620efe..5b6a6f57 100644 --- a/examples/mnist/keras/mnist_pipeline.py +++ b/examples/mnist/keras/mnist_pipeline.py @@ -82,7 +82,7 @@ def rdd_generator(): from pyspark.sql import SparkSession from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType - from tensorflowonspark import TFCluster, dfutil + from tensorflowonspark import dfutil from tensorflowonspark.pipeline import TFEstimator, TFModel sc = SparkContext(conf=SparkConf().setAppName("mnist_keras")) @@ -126,7 +126,6 @@ def parse(ln): .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ - .setInputMode(TFCluster.InputMode.SPARK) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) 
\ .setBatchSize(args.batch_size) \ From 9049b127c3bfbeaf3d9f222c06e5290ee074f758 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 26 Aug 2019 16:52:29 -0700 Subject: [PATCH 19/37] add image segmentation example --- examples/segmentation/README.md | 102 ++++++++++ examples/segmentation/segmentation.py | 155 +++++++++++++++ examples/segmentation/segmentation_dist.py | 163 ++++++++++++++++ examples/segmentation/segmentation_spark.py | 199 ++++++++++++++++++++ 4 files changed, 619 insertions(+) create mode 100644 examples/segmentation/README.md create mode 100644 examples/segmentation/segmentation.py create mode 100644 examples/segmentation/segmentation_dist.py create mode 100644 examples/segmentation/segmentation_spark.py diff --git a/examples/segmentation/README.md b/examples/segmentation/README.md new file mode 100644 index 00000000..1faebe68 --- /dev/null +++ b/examples/segmentation/README.md @@ -0,0 +1,102 @@ +# Image Segmentation + +Original Source: https://www.tensorflow.org/beta/tutorials/images/segmentation + +This code is based on the [Image Segmentation](https://www.tensorflow.org/beta/tutorials/images/segmentation) notebook example, converted to a single-node TensorFlow python app, then converted into a distributed TensorFlow app using the `MultiWorkerMirroredStrategy`, and then finally adapted for TensorFlowOnSpark. Compare the different versions to see the conversion steps involved at each stage. + +Notes: +- this example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed. + +#### Train via Single-Node + +The [segmentation.py](segmentation.py) file contains the bulk of the code from the example notebook, minus any code for interactively visualizing the images and masks, since the end goal will be a non-interactive job in Spark. + +Run the single-node example to ensure that your environment is set up correctly. For brevity, this example only trains a single epoch (vs. 
the original 20 epochs), but you can modify the source to run more epochs, if desired. +``` +# train +python ${TFoS_HOME}/examples/segmentation/segmentation.py +``` + +This will save the model weights as `keras_weights.*` files, which you can re-use in the original notebook as follows: +``` +# create a new empty model +model = unet_model(OUTPUT_CHANNELS) +model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) +show_predictions() + +# load the weights +model.load_weights("/path/to/keras_weights") +show_predictions() +``` + +#### Train via Distributed TensorFlow + +Next, the [segmentation_dist.py](segmentation_dist.py) file adds a `MultiWorkerMirroredStrategy` to enable distributed training. For simplicity, we can simulate two different machines by using separate shell windows. If you have multiple nodes available, you can run these commands on the separate machines (using the cluster host names instead of `localhost`). +``` +# on one node/shell +export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 0}}' +python ${TFoS_HOME}/examples/segmentation/segmentation_dist.py + +# on another node/shell +export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 1}}' +python ${TFoS_HOME}/examples/segmentation/segmentation_dist.py +``` + +Note that training will not start until all nodes are running and connected to the cluster. Also note that the `MultiWorkerMirroredStrategy` is a synchronous training strategy, so each node will train a batch of data and update the model weights in lock-step with each of the other nodes. This has implications that are beyond the scope of this tutorial. For more information, you can read the [TensorFlow distributed training documentation](https://www.tensorflow.org/beta/tutorials/distribute/keras). 
Notably, you should shard the data across the workers and adjust the per-worker batch_size to account for additional nodes in the cluster. However, in order to minimize code changes here, this is left as an exercise for the reader. + +#### Train via TensorFlowOnSpark + +Next, we convert the `segmentation_dist.py` file to TensorFlowOnSpark, resulting in the [segmentation_spark.py](segmentation_spark.py) file. Then, run in a local Spark standalone cluster as follows: +``` +# Start a local standalone Spark cluster +export MASTER=spark://$(hostname):7077 +export SPARK_WORKER_INSTANCES=3 +export CORES_PER_WORKER=1 +export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) +export TFoS_HOME= + +${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} + +# remove any old artifacts +rm -Rf ${TFoS_HOME}/segmentation_model.h5 ${TFoS_HOME}/segmentation_model ${TFoS_HOME}/segmentation_export + +# train +${SPARK_HOME}/bin/spark-submit \ +--master ${MASTER} \ +--conf spark.cores.max=${TOTAL_CORES} \ +--conf spark.task.cpus=${CORES_PER_WORKER} \ +${TFoS_HOME}/examples/segmentation/segmentation_spark.py \ +--cluster_size ${SPARK_WORKER_INSTANCES} \ +--model_dir ${TFoS_HOME}/segmentation_model \ +--export_dir ${TFoS_HOME}/segmentation_export \ +--epochs 1 + +# Shutdown the Spark Standalone cluster +${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh +``` + +Once again, this only trains a single epoch and doesn't adjust for the increased cluster size. Feel free to experiment on your own. 
+ +This example will save the model in several different formats: +- TensorFlow/Keras checkpoint (`segmentation_model`) +- Keras HDF5 file (`segmentation_model.h5`) +- TensorFlow saved_model (`segmentation_export`) + +You can re-load these into the original notebook example (for visualization of the segmentation masks) with the following code: +``` +# segmentation_model +model = unet_model(OUTPUT_CHANNELS) +model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) +model.load_weights("/path/to/segmentation_model/weights-0001") +show_predictions(test_dataset) + +# segmentation_model.h5 +model = tf.keras.models.load_model("/path/to/segmentation_model.h5") +show_predictions(test_dataset) + +# segmentation_export +model = tf.keras.experimental.load_from_saved_model("/path/to/segmentation_export") +show_predictions(test_dataset) +``` diff --git a/examples/segmentation/segmentation.py b/examples/segmentation/segmentation.py new file mode 100644 index 00000000..b3b7c502 --- /dev/null +++ b/examples/segmentation/segmentation.py @@ -0,0 +1,155 @@ +# Copyright 2019 The TensorFlow Authors. +# Licensed under the Apache License, Version 2.0 (the "License"); +# +#@title Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +from tensorflow_examples.models.pix2pix import pix2pix +import tensorflow_datasets as tfds +import tensorflow as tf + +dataset, info = tfds.load('oxford_iiit_pet:3.0.0', with_info=True) + + +def normalize(input_image, input_mask): + input_image = tf.cast(input_image, tf.float32)/128.0 - 1 + input_mask -= 1 + return input_image, input_mask + + +@tf.function +def load_image_train(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + + if tf.random.uniform(()) > 0.5: + input_image = tf.image.flip_left_right(input_image) + input_mask = tf.image.flip_left_right(input_mask) + + input_image, input_mask = normalize(input_image, input_mask) + + return input_image, input_mask + + +def load_image_test(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + input_image, input_mask = normalize(input_image, input_mask) + return input_image, input_mask + + +TRAIN_LENGTH = info.splits['train'].num_examples +BATCH_SIZE = 64 +BUFFER_SIZE = 1000 +STEPS_PER_EPOCH = TRAIN_LENGTH // BATCH_SIZE + +train = dataset['train'].map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE) +test = dataset['test'].map(load_image_test) + +train_dataset = train.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat() +train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) +test_dataset = test.batch(BATCH_SIZE) + +OUTPUT_CHANNELS = 3 + +base_model = tf.keras.applications.MobileNetV2(input_shape=[128, 128, 3], include_top=False) + +# Use the activations of these layers +layer_names = [ + 'block_1_expand_relu', # 64x64 + 'block_3_expand_relu', # 32x32 + 'block_6_expand_relu', # 16x16 + 'block_13_expand_relu', # 8x8 + 'block_16_project', # 4x4 +] +layers = 
[base_model.get_layer(name).output for name in layer_names] + +# Create the feature extraction model +down_stack = tf.keras.Model(inputs=base_model.input, outputs=layers) + +down_stack.trainable = False + +up_stack = [ + pix2pix.upsample(512, 3), # 4x4 -> 8x8 + pix2pix.upsample(256, 3), # 8x8 -> 16x16 + pix2pix.upsample(128, 3), # 16x16 -> 32x32 + pix2pix.upsample(64, 3), # 32x32 -> 64x64 +] + + +def unet_model(output_channels): + + # This is the last layer of the model + last = tf.keras.layers.Conv2DTranspose( + output_channels, 3, strides=2, + padding='same', activation='softmax') # 64x64 -> 128x128 + + inputs = tf.keras.layers.Input(shape=[128, 128, 3]) + x = inputs + + # Downsampling through the model + skips = down_stack(x) + x = skips[-1] + skips = reversed(skips[:-1]) + + # Upsampling and establishing the skip connections + for up, skip in zip(up_stack, skips): + x = up(x) + concat = tf.keras.layers.Concatenate() + x = concat([x, skip]) + + x = last(x) + + return tf.keras.Model(inputs=inputs, outputs=x) + + +model = unet_model(OUTPUT_CHANNELS) +model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + +# Training only (since we're using command-line) +# def create_mask(pred_mask): +# pred_mask = tf.argmax(pred_mask, axis=-1) +# pred_mask = pred_mask[..., tf.newaxis] +# return pred_mask[0] +# +# +# def show_predictions(dataset=None, num=1): +# if dataset: +# for image, mask in dataset.take(num): +# pred_mask = model.predict(image) +# display([image[0], mask[0], create_mask(pred_mask)]) +# else: +# display([sample_image, sample_mask, +# create_mask(model.predict(sample_image[tf.newaxis, ...]))]) +# +# +# class DisplayCallback(tf.keras.callbacks.Callback): +# def on_epoch_end(self, epoch, logs=None): +# clear_output(wait=True) +# show_predictions() +# print ('\nSample Prediction after epoch {}\n'.format(epoch+1)) +# + +# EPOCHS = 20 +EPOCHS = 1 +VAL_SUBSPLITS = 5 +VALIDATION_STEPS = 
info.splits['test'].num_examples//BATCH_SIZE//VAL_SUBSPLITS + +model_history = model.fit(train_dataset, epochs=EPOCHS, + steps_per_epoch=STEPS_PER_EPOCH, + validation_steps=VALIDATION_STEPS, + validation_data=test_dataset) + +model.save_weights("keras_weights") diff --git a/examples/segmentation/segmentation_dist.py b/examples/segmentation/segmentation_dist.py new file mode 100644 index 00000000..85065ecb --- /dev/null +++ b/examples/segmentation/segmentation_dist.py @@ -0,0 +1,163 @@ +# Copyright 2019 The TensorFlow Authors. +# Licensed under the Apache License, Version 2.0 (the "License"); +# +#@title Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + +from tensorflow_examples.models.pix2pix import pix2pix +import json +import os +import tensorflow_datasets as tfds +import tensorflow as tf + +strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + +tf_config = json.loads(os.environ.get('TF_CONFIG')) +print("tf_config = ", tf_config) +print("I'm {}:{}".format(tf_config['task']['type'], tf_config['task']['index'])) + +dataset, info = tfds.load('oxford_iiit_pet:3.0.0', with_info=True) + + +def normalize(input_image, input_mask): + input_image = tf.cast(input_image, tf.float32)/128.0 - 1 + input_mask -= 1 + return input_image, input_mask + + +@tf.function +def load_image_train(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + + if tf.random.uniform(()) > 0.5: + input_image = tf.image.flip_left_right(input_image) + input_mask = tf.image.flip_left_right(input_mask) + + input_image, input_mask = normalize(input_image, input_mask) + + return input_image, input_mask + + +def load_image_test(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + input_image, input_mask = normalize(input_image, input_mask) + return input_image, input_mask + + +TRAIN_LENGTH = info.splits['train'].num_examples +BATCH_SIZE = 64 +BUFFER_SIZE = 1000 +STEPS_PER_EPOCH = TRAIN_LENGTH // BATCH_SIZE + +train = dataset['train'].map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE) +test = dataset['test'].map(load_image_test) + +train_dataset = train.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat() +train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) +test_dataset = test.batch(BATCH_SIZE) + +OUTPUT_CHANNELS = 3 + +with strategy.scope(): + base_model = 
tf.keras.applications.MobileNetV2(input_shape=[128, 128, 3], include_top=False) + + # Use the activations of these layers + layer_names = [ + 'block_1_expand_relu', # 64x64 + 'block_3_expand_relu', # 32x32 + 'block_6_expand_relu', # 16x16 + 'block_13_expand_relu', # 8x8 + 'block_16_project', # 4x4 + ] + layers = [base_model.get_layer(name).output for name in layer_names] + + # Create the feature extraction model + down_stack = tf.keras.Model(inputs=base_model.input, outputs=layers) + + down_stack.trainable = False + + up_stack = [ + pix2pix.upsample(512, 3), # 4x4 -> 8x8 + pix2pix.upsample(256, 3), # 8x8 -> 16x16 + pix2pix.upsample(128, 3), # 16x16 -> 32x32 + pix2pix.upsample(64, 3), # 32x32 -> 64x64 + ] + + def unet_model(output_channels): + + # This is the last layer of the model + last = tf.keras.layers.Conv2DTranspose( + output_channels, 3, strides=2, + padding='same', activation='softmax') # 64x64 -> 128x128 + + inputs = tf.keras.layers.Input(shape=[128, 128, 3]) + x = inputs + + # Downsampling through the model + skips = down_stack(x) + x = skips[-1] + skips = reversed(skips[:-1]) + + # Upsampling and establishing the skip connections + for up, skip in zip(up_stack, skips): + x = up(x) + concat = tf.keras.layers.Concatenate() + x = concat([x, skip]) + + x = last(x) + + return tf.keras.Model(inputs=inputs, outputs=x) + + model = unet_model(OUTPUT_CHANNELS) + model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + +# Training only (since we're using command-line) +# def create_mask(pred_mask): +# pred_mask = tf.argmax(pred_mask, axis=-1) +# pred_mask = pred_mask[..., tf.newaxis] +# return pred_mask[0] +# +# +# def show_predictions(dataset=None, num=1): +# if dataset: +# for image, mask in dataset.take(num): +# pred_mask = model.predict(image) +# display([image[0], mask[0], create_mask(pred_mask)]) +# else: +# display([sample_image, sample_mask, +# create_mask(model.predict(sample_image[tf.newaxis, ...]))]) +# +# +# 
class DisplayCallback(tf.keras.callbacks.Callback): +# def on_epoch_end(self, epoch, logs=None): +# clear_output(wait=True) +# show_predictions() +# print ('\nSample Prediction after epoch {}\n'.format(epoch+1)) +# + +# EPOCHS = 20 +EPOCHS = 1 +VAL_SUBSPLITS = 5 +VALIDATION_STEPS = info.splits['test'].num_examples//BATCH_SIZE//VAL_SUBSPLITS + +model_history = model.fit(train_dataset, epochs=EPOCHS, + steps_per_epoch=STEPS_PER_EPOCH, + validation_steps=VALIDATION_STEPS, + validation_data=test_dataset) + +if tf_config['task']['index'] == 0: + model.save_weights("keras_weights", save_format='h5') diff --git a/examples/segmentation/segmentation_spark.py b/examples/segmentation/segmentation_spark.py new file mode 100644 index 00000000..2111b0ea --- /dev/null +++ b/examples/segmentation/segmentation_spark.py @@ -0,0 +1,199 @@ +# Copyright 2019 The TensorFlow Authors. +# Licensed under the Apache License, Version 2.0 (the "License"); +# +#@title Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function, unicode_literals + + +def main_fun(args, ctx): + from tensorflow_examples.models.pix2pix import pix2pix + import json + import os + import tensorflow_datasets as tfds + import tensorflow as tf + import time + + print("TensorFlow version: ", tf.__version__) + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + dataset, info = tfds.load('oxford_iiit_pet:3.0.0', with_info=True) + + def normalize(input_image, input_mask): + input_image = tf.cast(input_image, tf.float32)/128.0 - 1 + input_mask -= 1 + return input_image, input_mask + + @tf.function + def load_image_train(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + + if tf.random.uniform(()) > 0.5: + input_image = tf.image.flip_left_right(input_image) + input_mask = tf.image.flip_left_right(input_mask) + + input_image, input_mask = normalize(input_image, input_mask) + + return input_image, input_mask + + def load_image_test(datapoint): + input_image = tf.image.resize(datapoint['image'], (128, 128)) + input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128)) + input_image, input_mask = normalize(input_image, input_mask) + return input_image, input_mask + + TRAIN_LENGTH = info.splits['train'].num_examples + BATCH_SIZE = args.batch_size + BUFFER_SIZE = args.buffer_size + STEPS_PER_EPOCH = TRAIN_LENGTH // BATCH_SIZE + + train = dataset['train'].map(load_image_train, num_parallel_calls=tf.data.experimental.AUTOTUNE) + test = dataset['test'].map(load_image_test) + + train_dataset = train.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat() + train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + test_dataset = test.batch(BATCH_SIZE) + + OUTPUT_CHANNELS = 3 + + with strategy.scope(): + base_model = tf.keras.applications.MobileNetV2(input_shape=[128, 128, 3], include_top=False) + + # Use 
the activations of these layers + layer_names = [ + 'block_1_expand_relu', # 64x64 + 'block_3_expand_relu', # 32x32 + 'block_6_expand_relu', # 16x16 + 'block_13_expand_relu', # 8x8 + 'block_16_project', # 4x4 + ] + layers = [base_model.get_layer(name).output for name in layer_names] + + # Create the feature extraction model + down_stack = tf.keras.Model(inputs=base_model.input, outputs=layers) + + down_stack.trainable = False + + up_stack = [ + pix2pix.upsample(512, 3), # 4x4 -> 8x8 + pix2pix.upsample(256, 3), # 8x8 -> 16x16 + pix2pix.upsample(128, 3), # 16x16 -> 32x32 + pix2pix.upsample(64, 3), # 32x32 -> 64x64 + ] + + def unet_model(output_channels): + + # This is the last layer of the model + last = tf.keras.layers.Conv2DTranspose( + output_channels, 3, strides=2, + padding='same', activation='softmax') # 64x64 -> 128x128 + + inputs = tf.keras.layers.Input(shape=[128, 128, 3]) + x = inputs + + # Downsampling through the model + skips = down_stack(x) + x = skips[-1] + skips = reversed(skips[:-1]) + + # Upsampling and establishing the skip connections + for up, skip in zip(up_stack, skips): + x = up(x) + concat = tf.keras.layers.Concatenate() + x = concat([x, skip]) + + x = last(x) + + return tf.keras.Model(inputs=inputs, outputs=x) + + model = unet_model(OUTPUT_CHANNELS) + model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + +# Training only (since we're using command-line) +# def create_mask(pred_mask): +# pred_mask = tf.argmax(pred_mask, axis=-1) +# pred_mask = pred_mask[..., tf.newaxis] +# return pred_mask[0] +# +# +# def show_predictions(dataset=None, num=1): +# if dataset: +# for image, mask in dataset.take(num): +# pred_mask = model.predict(image) +# display([image[0], mask[0], create_mask(pred_mask)]) +# else: +# display([sample_image, sample_mask, +# create_mask(model.predict(sample_image[tf.newaxis, ...]))]) +# +# +# class DisplayCallback(tf.keras.callbacks.Callback): +# def on_epoch_end(self, epoch, 
logs=None): +# clear_output(wait=True) +# show_predictions() +# print ('\nSample Prediction after epoch {}\n'.format(epoch+1)) +# + + EPOCHS = args.epochs + VAL_SUBSPLITS = 5 + VALIDATION_STEPS = info.splits['test'].num_examples//BATCH_SIZE//VAL_SUBSPLITS + + tf.io.gfile.makedirs(args.model_dir) + filepath = args.model_dir + "/weights-{epoch:04d}" + ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True) + + model_history = model.fit(train_dataset, epochs=EPOCHS, + steps_per_epoch=STEPS_PER_EPOCH, + callbacks=[ckpt_callback], + validation_steps=VALIDATION_STEPS, + validation_data=test_dataset) + + if ctx.job_name == 'chief': + # Workaround for: https://github.com/tensorflow/tensorflow/issues/30251 + print("===== saving h5py model") + model.save(args.model_dir + ".h5") + print("===== re-loading model w/o DistributionStrategy") + new_model = tf.keras.models.load_model(args.model_dir + ".h5") + print("===== exporting saved_model") + tf.keras.experimental.export_saved_model(new_model, args.export_dir) + print("===== done exporting") + else: + print("===== sleeping") + time.sleep(90) + + +if __name__ == '__main__': + import argparse + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + + sc = SparkContext(conf=SparkConf().setAppName("segmentation")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64) + parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=1000) + parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) + parser.add_argument("--epochs", help="number of epochs", type=int, default=3) + parser.add_argument("--model_dir", 
help="path to save model/checkpoint", default="segmentation_model") + parser.add_argument("--export_dir", help="path to export saved_model", default="segmentation_export") + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + + args = parser.parse_args() + print("args:", args) + + cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, master_node='chief') + cluster.shutdown(grace_secs=30) From 1f86976155e20eaa102090cf896829987a53b4d6 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 27 Aug 2019 09:03:51 -0700 Subject: [PATCH 20/37] remove cifar10 and imagenet examples --- examples/cifar10/BUILD | 87 - examples/cifar10/README.md | 69 - examples/cifar10/README_orig.md | 10 - examples/cifar10/__init__.py | 22 - examples/cifar10/cifar10.py | 399 - examples/cifar10/cifar10_eval.py | 169 - examples/cifar10/cifar10_input.py | 257 - examples/cifar10/cifar10_input_test.py | 66 - examples/cifar10/cifar10_multi_gpu_train.py | 282 - examples/cifar10/cifar10_train.py | 129 - examples/imagenet/README.md | 78 - examples/imagenet/inception/BUILD | 198 - examples/imagenet/inception/__init__.py | 0 examples/imagenet/inception/data/__init__.py | 0 .../inception/data/build_image_data.py | 431 - .../inception/data/build_imagenet_data.py | 704 - .../data/download_and_preprocess_flowers.sh | 96 - .../download_and_preprocess_flowers_mac.sh | 96 - .../data/download_and_preprocess_imagenet.sh | 101 - .../inception/data/download_imagenet.sh | 106 - ...imagenet_2012_validation_synset_labels.txt | 50000 ---------------- .../data/imagenet_lsvrc_2015_synsets.txt | 1000 - .../inception/data/imagenet_metadata.txt | 21842 ------- .../preprocess_imagenet_validation_data.py | 82 - .../inception/data/process_bounding_boxes.py | 254 - examples/imagenet/inception/dataset.py | 103 - examples/imagenet/inception/flowers_data.py | 52 - 
examples/imagenet/inception/flowers_eval.py | 40 - examples/imagenet/inception/flowers_train.py | 41 - .../imagenet/inception/image_processing.py | 513 - examples/imagenet/inception/imagenet_data.py | 59 - .../inception/imagenet_distributed_train.py | 97 - .../imagenet_distributed_train_pipeline.py | 107 - examples/imagenet/inception/imagenet_eval.py | 62 - examples/imagenet/inception/imagenet_train.py | 41 - .../inception/inception_distributed_train.py | 360 - examples/imagenet/inception/inception_eval.py | 166 - .../imagenet/inception/inception_export.py | 115 - .../imagenet/inception/inception_model.py | 157 - .../imagenet/inception/inception_train.py | 357 - examples/imagenet/inception/slim/BUILD | 112 - examples/imagenet/inception/slim/README.md | 631 - examples/imagenet/inception/slim/__init__.py | 0 .../inception/slim/collections_test.py | 181 - .../inception/slim/inception_model.py | 356 - .../imagenet/inception/slim/inception_test.py | 134 - examples/imagenet/inception/slim/losses.py | 174 - .../imagenet/inception/slim/losses_test.py | 177 - examples/imagenet/inception/slim/ops.py | 473 - examples/imagenet/inception/slim/ops_test.py | 692 - examples/imagenet/inception/slim/scopes.py | 170 - .../imagenet/inception/slim/scopes_test.py | 162 - examples/imagenet/inception/slim/slim.py | 24 - examples/imagenet/inception/slim/variables.py | 289 - .../imagenet/inception/slim/variables_test.py | 392 - 55 files changed, 82715 deletions(-) delete mode 100644 examples/cifar10/BUILD delete mode 100644 examples/cifar10/README.md delete mode 100644 examples/cifar10/README_orig.md delete mode 100644 examples/cifar10/__init__.py delete mode 100644 examples/cifar10/cifar10.py delete mode 100644 examples/cifar10/cifar10_eval.py delete mode 100644 examples/cifar10/cifar10_input.py delete mode 100644 examples/cifar10/cifar10_input_test.py delete mode 100644 examples/cifar10/cifar10_multi_gpu_train.py delete mode 100644 examples/cifar10/cifar10_train.py delete mode 100644 
examples/imagenet/README.md delete mode 100644 examples/imagenet/inception/BUILD delete mode 100644 examples/imagenet/inception/__init__.py delete mode 100644 examples/imagenet/inception/data/__init__.py delete mode 100644 examples/imagenet/inception/data/build_image_data.py delete mode 100644 examples/imagenet/inception/data/build_imagenet_data.py delete mode 100755 examples/imagenet/inception/data/download_and_preprocess_flowers.sh delete mode 100644 examples/imagenet/inception/data/download_and_preprocess_flowers_mac.sh delete mode 100755 examples/imagenet/inception/data/download_and_preprocess_imagenet.sh delete mode 100755 examples/imagenet/inception/data/download_imagenet.sh delete mode 100644 examples/imagenet/inception/data/imagenet_2012_validation_synset_labels.txt delete mode 100644 examples/imagenet/inception/data/imagenet_lsvrc_2015_synsets.txt delete mode 100644 examples/imagenet/inception/data/imagenet_metadata.txt delete mode 100755 examples/imagenet/inception/data/preprocess_imagenet_validation_data.py delete mode 100755 examples/imagenet/inception/data/process_bounding_boxes.py delete mode 100644 examples/imagenet/inception/dataset.py delete mode 100644 examples/imagenet/inception/flowers_data.py delete mode 100644 examples/imagenet/inception/flowers_eval.py delete mode 100644 examples/imagenet/inception/flowers_train.py delete mode 100644 examples/imagenet/inception/image_processing.py delete mode 100644 examples/imagenet/inception/imagenet_data.py delete mode 100644 examples/imagenet/inception/imagenet_distributed_train.py delete mode 100644 examples/imagenet/inception/imagenet_distributed_train_pipeline.py delete mode 100644 examples/imagenet/inception/imagenet_eval.py delete mode 100644 examples/imagenet/inception/imagenet_train.py delete mode 100644 examples/imagenet/inception/inception_distributed_train.py delete mode 100644 examples/imagenet/inception/inception_eval.py delete mode 100644 examples/imagenet/inception/inception_export.py delete 
mode 100644 examples/imagenet/inception/inception_model.py delete mode 100644 examples/imagenet/inception/inception_train.py delete mode 100644 examples/imagenet/inception/slim/BUILD delete mode 100644 examples/imagenet/inception/slim/README.md delete mode 100644 examples/imagenet/inception/slim/__init__.py delete mode 100644 examples/imagenet/inception/slim/collections_test.py delete mode 100644 examples/imagenet/inception/slim/inception_model.py delete mode 100644 examples/imagenet/inception/slim/inception_test.py delete mode 100644 examples/imagenet/inception/slim/losses.py delete mode 100644 examples/imagenet/inception/slim/losses_test.py delete mode 100644 examples/imagenet/inception/slim/ops.py delete mode 100644 examples/imagenet/inception/slim/ops_test.py delete mode 100644 examples/imagenet/inception/slim/scopes.py delete mode 100644 examples/imagenet/inception/slim/scopes_test.py delete mode 100644 examples/imagenet/inception/slim/slim.py delete mode 100644 examples/imagenet/inception/slim/variables.py delete mode 100644 examples/imagenet/inception/slim/variables_test.py diff --git a/examples/cifar10/BUILD b/examples/cifar10/BUILD deleted file mode 100644 index 9cf574f6..00000000 --- a/examples/cifar10/BUILD +++ /dev/null @@ -1,87 +0,0 @@ -# Description: -# Example TensorFlow models for CIFAR-10 - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_library( - name = "cifar10_input", - srcs = ["cifar10_input.py"], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -py_test( - name = "cifar10_input_test", - size = "small", - srcs = ["cifar10_input_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":cifar10_input", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -py_library( - name = "cifar10", - srcs = ["cifar10.py"], - srcs_version = "PY2AND3", - deps = [ - ":cifar10_input", - 
"//tensorflow:tensorflow_py", - ], -) - -py_binary( - name = "cifar10_eval", - srcs = [ - "cifar10_eval.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - -py_binary( - name = "cifar10_train", - srcs = [ - "cifar10_train.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - -py_binary( - name = "cifar10_multi_gpu_train", - srcs = [ - "cifar10_multi_gpu_train.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - ":cifar10", - ], -) - -filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], -) diff --git a/examples/cifar10/README.md b/examples/cifar10/README.md deleted file mode 100644 index 370f5848..00000000 --- a/examples/cifar10/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# CIFAR-10 Multi-GPU CNN - -Original Source: https://github.com/tensorflow/tensorflow/blob/eaceadc3c421bb41cfbf607ca832b3b9b2ad2507/tensorflow/g3doc/tutorials/deep_cnn/index.md - -The following is the Multi-GPU CNN Tutorial, adapted for TensorFlowOnSpark. This example demonstrates how to use multiple GPU cards on a single node. Note: since YARN currently cannot allocate GPU resources directly, we currently use RAM as a proxy, so in our case, 1GPU == 27GB. You may need to adjust this for your grid. - -Please ensure that you have followed [these instructions](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARN) first. - -Also, you will need to download the CIFAR-10 dataset per the [original example](https://github.com/tensorflow/tensorflow/blob/eaceadc3c421bb41cfbf607ca832b3b9b2ad2507/tensorflow/g3doc/tutorials/deep_cnn/index.md). 
- -#### Package the code as a Python zip/module - - export TFoS_HOME= - pushd ${TFoS_HOME}/examples/cifar10; zip -r ~/cifar10.zip .; popd - -#### Run Multi-GPU CNN on Spark - - # set environment variables (if not already done) - export PYTHON_ROOT=~/Python - export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python - export PATH=${PYTHON_ROOT}/bin/:$PATH - export QUEUE=gpu - export CIFAR10_DATA= - - # for CPU mode: - # export QUEUE=default - # --conf spark.executorEnv.LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server" \ - # remove --driver-library-path - - # hadoop fs -rm -r cifar10_train - export NUM_GPU=2 - export MEMORY=$((NUM_GPU * 27)) - ${SPARK_HOME}/bin/spark-submit \ - --master yarn \ - --deploy-mode cluster \ - --queue ${QUEUE} \ - --num-executors 1 \ - --executor-memory ${MEMORY}G \ - --py-files ${TFoS_HOME}/tfspark.zip,cifar10.zip \ - --conf spark.dynamicAllocation.enabled=false \ - --conf spark.yarn.maxAppAttempts=1 \ - --archives hdfs:///user/${USER}/Python.zip#Python \ - --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \ - --driver-library-path="/usr/local/cuda-7.5/lib64" \ - ${TFoS_HOME}/examples/cifar10/cifar10_multi_gpu_train.py \ - --data_dir ${CIFAR10_DATA} \ - --train_dir hdfs://default/user/${USER}/cifar10_train \ - --max_steps 1000 \ - --num_gpus ${NUM_GPU} - -### Run evaluation on Spark - - ${SPARK_HOME}/bin/spark-submit \ - --master yarn \ - --deploy-mode cluster \ - --queue ${QUEUE} \ - --num-executors 1 \ - --executor-memory 27G \ - --py-files ${TFoS_HOME}/tfspark.zip,cifar10.zip \ - --conf spark.dynamicAllocation.enabled=false \ - --conf spark.yarn.maxAppAttempts=1 \ - --archives hdfs:///user/${USER}/Python.zip#Python \ - --conf spark.executorEnv.LD_LIBRARY_PATH="lib64:/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \ - --driver-library-path="lib64:/usr/local/cuda-7.5/lib64" \ - ${TFoS_HOME}/examples/cifar10/cifar10_eval.py \ - --data_dir ${CIFAR10_DATA} \ - --checkpoint_dir 
hdfs://default/user/${USER}/cifar10_train \ - --eval_dir hdfs://default/user/${USER}/cifar10_eval \ - --run_once diff --git a/examples/cifar10/README_orig.md b/examples/cifar10/README_orig.md deleted file mode 100644 index 67877aed..00000000 --- a/examples/cifar10/README_orig.md +++ /dev/null @@ -1,10 +0,0 @@ -CIFAR-10 is a common benchmark in machine learning for image recognition. - -http://www.cs.toronto.edu/~kriz/cifar.html - -Code in this directory demonstrates how to use TensorFlow to train and evaluate a convolutional neural network (CNN) on both CPU and GPU. We also demonstrate how to train a CNN over multiple GPUs. - -Detailed instructions on how to get started available at: - -http://tensorflow.org/tutorials/deep_cnn/ - diff --git a/examples/cifar10/__init__.py b/examples/cifar10/__init__.py deleted file mode 100644 index 6b2729e7..00000000 --- a/examples/cifar10/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Makes helper libraries available in the cifar10 package.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import cifar10 -import cifar10_input diff --git a/examples/cifar10/cifar10.py b/examples/cifar10/cifar10.py deleted file mode 100644 index 7909b772..00000000 --- a/examples/cifar10/cifar10.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Builds the CIFAR-10 network. - -Summary of available functions: - - # Compute input images and labels for training. If you would like to run - # evaluations, use inputs() instead. - inputs, labels = distorted_inputs() - - # Compute inference on the model inputs to make a prediction. - predictions = inference(inputs) - - # Compute the total loss of the prediction with respect to the labels. - loss = loss(predictions, labels) - - # Create a graph to run one step of training with respect to the loss. 
- train_op = train(loss, global_step) -""" -# pylint: disable=missing-docstring -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import re -import sys -import tarfile - -from six.moves import urllib -import tensorflow as tf - -import cifar10_input - -FLAGS = tf.app.flags.FLAGS - -# Basic model parameters. -tf.app.flags.DEFINE_integer('batch_size', 128, - """Number of images to process in a batch.""") -tf.app.flags.DEFINE_string('data_dir', '/tmp/cifar10_data', - """Path to the CIFAR-10 data directory.""") -tf.app.flags.DEFINE_boolean('use_fp16', False, - """Train the model using fp16.""") - -# Global constants describing the CIFAR-10 data set. -IMAGE_SIZE = cifar10_input.IMAGE_SIZE -NUM_CLASSES = cifar10_input.NUM_CLASSES -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN -NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL - - -# Constants describing the training process. -MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. -NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays. -LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor. -INITIAL_LEARNING_RATE = 0.1 # Initial learning rate. - -# If a model is trained with multiple GPUs, prefix all Op names with tower_name -# to differentiate the operations. Note that this prefix is removed from the -# names of the summaries when visualizing a model. -TOWER_NAME = 'tower' - -DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz' - - -def _activation_summary(x): - """Helper to create summaries for activations. - - Creates a summary that provides a histogram of activations. - Creates a summary that measures the sparsity of activations. - - Args: - x: Tensor - Returns: - nothing - """ - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. 
- tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name) - tf.summary.histogram(tensor_name + '/activations', x) - tf.summary.scalar(tensor_name + '/sparsity', - tf.nn.zero_fraction(x)) - - -def _variable_on_cpu(name, shape, initializer): - """Helper to create a Variable stored on CPU memory. - - Args: - name: name of the variable - shape: list of ints - initializer: initializer for Variable - - Returns: - Variable Tensor - """ - with tf.device('/cpu:0'): - dtype = tf.float16 if FLAGS.use_fp16 else tf.float32 - var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype) - return var - - -def _variable_with_weight_decay(name, shape, stddev, wd): - """Helper to create an initialized Variable with weight decay. - - Note that the Variable is initialized with a truncated normal distribution. - A weight decay is added only if one is specified. - - Args: - name: name of the variable - shape: list of ints - stddev: standard deviation of a truncated Gaussian - wd: add L2Loss weight decay multiplied by this float. If None, weight - decay is not added for this Variable. - - Returns: - Variable Tensor - """ - dtype = tf.float16 if FLAGS.use_fp16 else tf.float32 - var = _variable_on_cpu( - name, - shape, - tf.truncated_normal_initializer(stddev=stddev, dtype=dtype)) - if wd is not None: - weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - return var - - -def distorted_inputs(): - """Construct distorted input for CIFAR training using the Reader ops. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. 
- - Raises: - ValueError: If no data_dir - """ - if not FLAGS.data_dir: - raise ValueError('Please supply a data_dir') - data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin') - images, labels = cifar10_input.distorted_inputs(data_dir=data_dir, - batch_size=FLAGS.batch_size) - if FLAGS.use_fp16: - images = tf.cast(images, tf.float16) - labels = tf.cast(labels, tf.float16) - return images, labels - - -def inputs(eval_data): - """Construct input for CIFAR evaluation using the Reader ops. - - Args: - eval_data: bool, indicating if one should use the train or eval data set. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - - Raises: - ValueError: If no data_dir - """ - if not FLAGS.data_dir: - raise ValueError('Please supply a data_dir') - data_dir = os.path.join(FLAGS.data_dir, 'cifar-10-batches-bin') - images, labels = cifar10_input.inputs(eval_data=eval_data, - data_dir=data_dir, - batch_size=FLAGS.batch_size) - if FLAGS.use_fp16: - images = tf.cast(images, tf.float16) - labels = tf.cast(labels, tf.float16) - return images, labels - - -def inference(images): - """Build the CIFAR-10 model. - - Args: - images: Images returned from distorted_inputs() or inputs(). - - Returns: - Logits. - """ - # We instantiate all variables using tf.get_variable() instead of - # tf.Variable() in order to share variables across multiple GPU training runs. - # If we only ran this model on a single GPU, we could simplify this function - # by replacing all instances of tf.get_variable() with tf.Variable(). 
- # - # conv1 - with tf.variable_scope('conv1') as scope: - kernel = _variable_with_weight_decay('weights', - shape=[5, 5, 3, 64], - stddev=5e-2, - wd=0.0) - conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') - biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0)) - pre_activation = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(pre_activation, name=scope.name) - _activation_summary(conv1) - - # pool1 - pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], - padding='SAME', name='pool1') - # norm1 - norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, - name='norm1') - - # conv2 - with tf.variable_scope('conv2') as scope: - kernel = _variable_with_weight_decay('weights', - shape=[5, 5, 64, 64], - stddev=5e-2, - wd=0.0) - conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME') - biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1)) - pre_activation = tf.nn.bias_add(conv, biases) - conv2 = tf.nn.relu(pre_activation, name=scope.name) - _activation_summary(conv2) - - # norm2 - norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, - name='norm2') - # pool2 - pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], - strides=[1, 2, 2, 1], padding='SAME', name='pool2') - - # local3 - with tf.variable_scope('local3') as scope: - # Move everything into depth so we can perform a single matrix multiply. 
- reshape = tf.reshape(pool2, [FLAGS.batch_size, -1]) - dim = reshape.get_shape()[1].value - weights = _variable_with_weight_decay('weights', shape=[dim, 384], - stddev=0.04, wd=0.004) - biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1)) - local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name) - _activation_summary(local3) - - # local4 - with tf.variable_scope('local4') as scope: - weights = _variable_with_weight_decay('weights', shape=[384, 192], - stddev=0.04, wd=0.004) - biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1)) - local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name) - _activation_summary(local4) - - # linear layer(WX + b), - # We don't apply softmax here because - # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits - # and performs the softmax internally for efficiency. - with tf.variable_scope('softmax_linear') as scope: - weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES], - stddev=1/192.0, wd=0.0) - biases = _variable_on_cpu('biases', [NUM_CLASSES], - tf.constant_initializer(0.0)) - softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name) - _activation_summary(softmax_linear) - - return softmax_linear - - -def loss(logits, labels): - """Add L2Loss to all the trainable variables. - - Add summary for "Loss" and "Loss/avg". - Args: - logits: Logits from inference(). - labels: Labels from distorted_inputs or inputs(). 1-D tensor - of shape [batch_size] - - Returns: - Loss tensor of type float. - """ - # Calculate the average cross entropy loss across the batch. 
- labels = tf.cast(labels, tf.int64) - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=labels, logits=logits, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - - # The total loss is defined as the cross entropy loss plus all of the weight - # decay terms (L2 loss). - return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def _add_loss_summaries(total_loss): - """Add summaries for losses in CIFAR-10 model. - - Generates moving average for all losses and associated summaries for - visualizing the performance of the network. - - Args: - total_loss: Total loss from loss(). - Returns: - loss_averages_op: op for generating moving averages of losses. - """ - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - losses = tf.get_collection('losses') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.summary.scalar(l.op.name + ' (raw)', l) - tf.summary.scalar(l.op.name, loss_averages.average(l)) - - return loss_averages_op - - -def train(total_loss, global_step): - """Train CIFAR-10 model. - - Create an optimizer and apply to all trainable variables. Add moving - average for all trainable variables. - - Args: - total_loss: Total loss from loss(). - global_step: Integer Variable counting the number of training steps - processed. - Returns: - train_op: op for training. - """ - # Variables that affect learning rate. 
- num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - tf.summary.scalar('learning_rate', lr) - - # Generate moving averages of all losses and associated summaries. - loss_averages_op = _add_loss_summaries(total_loss) - - # Compute gradients. - with tf.control_dependencies([loss_averages_op]): - opt = tf.train.GradientDescentOptimizer(lr) - grads = opt.compute_gradients(total_loss) - - # Apply gradients. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Add histograms for trainable variables. - for var in tf.trainable_variables(): - tf.summary.histogram(var.op.name, var) - - # Add histograms for gradients. - for grad, var in grads: - if grad is not None: - tf.summary.histogram(var.op.name + '/gradients', grad) - - # Track the moving averages of all trainable variables. 
- variable_averages = tf.train.ExponentialMovingAverage( - MOVING_AVERAGE_DECAY, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - with tf.control_dependencies([apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - - return train_op - - -def maybe_download_and_extract(): - """Download and extract the tarball from Alex's website.""" - dest_directory = FLAGS.data_dir - if not os.path.exists(dest_directory): - os.makedirs(dest_directory) - filename = DATA_URL.split('/')[-1] - filepath = os.path.join(dest_directory, filename) - if not os.path.exists(filepath): - def _progress(count, block_size, total_size): - sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, - float(count * block_size) / float(total_size) * 100.0)) - sys.stdout.flush() - filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress) - print() - statinfo = os.stat(filepath) - print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') - extracted_dir_path = os.path.join(dest_directory, 'cifar-10-batches-bin') - if not os.path.exists(extracted_dir_path): - tarfile.open(filepath, 'r:gz').extractall(dest_directory) diff --git a/examples/cifar10/cifar10_eval.py b/examples/cifar10/cifar10_eval.py deleted file mode 100644 index e83c5c6b..00000000 --- a/examples/cifar10/cifar10_eval.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Evaluation for CIFAR-10. - -Accuracy: -cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs -of data) as judged by cifar10_eval.py. - -Speed: -On a single Tesla K40, cifar10_train.py processes a single batch of 128 images -in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86% -accuracy after 100K steps in 8 hours of training time. - -Usage: -Please see the tutorial and website for how to download the CIFAR-10 -data set, compile the program and train the model. - -http://tensorflow.org/tutorials/deep_cnn/ -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from tensorflowonspark import TFCluster, TFNode -import sys - -from datetime import datetime -import math -import time - -import numpy as np - -def main_fun(argv, ctx): - - import tensorflow as tf - import cifar10 - - sys.argv = argv - FLAGS = tf.app.flags.FLAGS - tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval', - """Directory where to write event logs.""") - tf.app.flags.DEFINE_string('eval_data', 'test', - """Either 'test' or 'train_eval'.""") - tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train', - """Directory where to read model checkpoints.""") - tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5, - """How often to run the eval.""") - tf.app.flags.DEFINE_integer('num_examples', 10000, - """Number of examples to run.""") - tf.app.flags.DEFINE_boolean('run_once', False, - """Whether to run eval only once.""") - tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") - - cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma) - - def eval_once(saver, summary_writer, top_k_op, 
summary_op): - """Run Eval once. - - Args: - saver: Saver. - summary_writer: Summary writer. - top_k_op: Top K op. - summary_op: Summary op. - """ - with tf.Session() as sess: - ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) - if ckpt and ckpt.model_checkpoint_path: - # Restores from checkpoint - saver.restore(sess, ckpt.model_checkpoint_path) - # Assuming model_checkpoint_path looks something like: - # /my-favorite-path/cifar10_train/model.ckpt-0, - # extract global_step from it. - global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] - else: - print('No checkpoint file found') - return - - # Start the queue runners. - coord = tf.train.Coordinator() - try: - threads = [] - for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): - threads.extend(qr.create_threads(sess, coord=coord, daemon=True, - start=True)) - - num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) - true_count = 0 # Counts the number of correct predictions. - total_sample_count = num_iter * FLAGS.batch_size - step = 0 - while step < num_iter and not coord.should_stop(): - predictions = sess.run([top_k_op]) - true_count += np.sum(predictions) - step += 1 - - # Compute precision @ 1. - precision = true_count / total_sample_count - print('%s: precision @ 1 = %.3f' % (datetime.now(), precision)) - - summary = tf.Summary() - summary.ParseFromString(sess.run(summary_op)) - summary.value.add(tag='Precision @ 1', simple_value=precision) - summary_writer.add_summary(summary, global_step) - except Exception as e: # pylint: disable=broad-except - coord.request_stop(e) - - coord.request_stop() - coord.join(threads, stop_grace_period_secs=10) - - - def evaluate(): - """Eval CIFAR-10 for a number of steps.""" - with tf.Graph().as_default() as g: - # Get images and labels for CIFAR-10. - eval_data = FLAGS.eval_data == 'test' - images, labels = cifar10.inputs(eval_data=eval_data) - - # Build a Graph that computes the logits predictions from the - # inference model. 
- logits = cifar10.inference(images) - - # Calculate predictions. - top_k_op = tf.nn.in_top_k(logits, labels, 1) - - # Restore the moving average version of the learned variables for eval. - variable_averages = tf.train.ExponentialMovingAverage( - cifar10.MOVING_AVERAGE_DECAY) - variables_to_restore = variable_averages.variables_to_restore() - saver = tf.train.Saver(variables_to_restore) - - # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.summary.merge_all() - - summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) - - while True: - eval_once(saver, summary_writer, top_k_op, summary_op) - if FLAGS.run_once: - break - time.sleep(FLAGS.eval_interval_secs) - - #cifar10.maybe_download_and_extract() - if tf.gfile.Exists(FLAGS.eval_dir): - tf.gfile.DeleteRecursively(FLAGS.eval_dir) - tf.gfile.MakeDirs(FLAGS.eval_dir) - evaluate() - - -if __name__ == '__main__': - sc = SparkContext(conf=SparkConf().setAppName("cifar10_eval")) - num_executors = int(sc._conf.get("spark.executor.instances")) - num_ps = 0 - - cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) - cluster.shutdown() diff --git a/examples/cifar10/cifar10_input.py b/examples/cifar10/cifar10_input.py deleted file mode 100644 index 10c77623..00000000 --- a/examples/cifar10/cifar10_input.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Routine for decoding the CIFAR-10 binary file format.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -# Process images of this size. Note that this differs from the original CIFAR -# image size of 32 x 32. If one alters this number, then the entire model -# architecture will change and any model would need to be retrained. -IMAGE_SIZE = 24 - -# Global constants describing the CIFAR-10 data set. -NUM_CLASSES = 10 -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 -NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 - - -def read_cifar10(filename_queue): - """Reads and parses examples from CIFAR10 data files. - - Recommendation: if you want N-way read parallelism, call this function - N times. This will give you N independent Readers reading different - files & positions within those files, which will give better mixing of - examples. - - Args: - filename_queue: A queue of strings with the filenames to read from. - - Returns: - An object representing a single example, with the following fields: - height: number of rows in the result (32) - width: number of columns in the result (32) - depth: number of color channels in the result (3) - key: a scalar string Tensor describing the filename & record number - for this example. - label: an int32 Tensor with the label in the range 0..9. - uint8image: a [height, width, depth] uint8 Tensor with the image data - """ - - class CIFAR10Record(object): - pass - result = CIFAR10Record() - - # Dimensions of the images in the CIFAR-10 dataset. - # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the - # input format. 
- label_bytes = 1 # 2 for CIFAR-100 - result.height = 32 - result.width = 32 - result.depth = 3 - image_bytes = result.height * result.width * result.depth - # Every record consists of a label followed by the image, with a - # fixed number of bytes for each. - record_bytes = label_bytes + image_bytes - - # Read a record, getting filenames from the filename_queue. No - # header or footer in the CIFAR-10 format, so we leave header_bytes - # and footer_bytes at their default of 0. - reader = tf.FixedLengthRecordReader(record_bytes=record_bytes) - result.key, value = reader.read(filename_queue) - - # Convert from a string to a vector of uint8 that is record_bytes long. - record_bytes = tf.decode_raw(value, tf.uint8) - - # The first bytes represent the label, which we convert from uint8->int32. - result.label = tf.cast( - tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32) - - # The remaining bytes after the label represent the image, which we reshape - # from [depth * height * width] to [depth, height, width]. - depth_major = tf.reshape( - tf.strided_slice(record_bytes, [label_bytes], - [label_bytes + image_bytes]), - [result.depth, result.height, result.width]) - # Convert from [depth, height, width] to [height, width, depth]. - result.uint8image = tf.transpose(depth_major, [1, 2, 0]) - - return result - - -def _generate_image_and_label_batch(image, label, min_queue_examples, - batch_size, shuffle): - """Construct a queued batch of images and labels. - - Args: - image: 3-D Tensor of [height, width, 3] of type.float32. - label: 1-D Tensor of type.int32 - min_queue_examples: int32, minimum number of samples to retain - in the queue that provides of batches of examples. - batch_size: Number of images per batch. - shuffle: boolean indicating whether to use a shuffling queue. - - Returns: - images: Images. 4D tensor of [batch_size, height, width, 3] size. - labels: Labels. 1D tensor of [batch_size] size. 
- """ - # Create a queue that shuffles the examples, and then - # read 'batch_size' images + labels from the example queue. - num_preprocess_threads = 16 - if shuffle: - images, label_batch = tf.train.shuffle_batch( - [image, label], - batch_size=batch_size, - num_threads=num_preprocess_threads, - capacity=min_queue_examples + 3 * batch_size, - min_after_dequeue=min_queue_examples) - else: - images, label_batch = tf.train.batch( - [image, label], - batch_size=batch_size, - num_threads=num_preprocess_threads, - capacity=min_queue_examples + 3 * batch_size) - - # Display the training images in the visualizer. - tf.summary.image('images', images) - - return images, tf.reshape(label_batch, [batch_size]) - - -def distorted_inputs(data_dir, batch_size): - """Construct distorted input for CIFAR training using the Reader ops. - - Args: - data_dir: Path to the CIFAR-10 data directory. - batch_size: Number of images per batch. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. - """ - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] - for f in filenames: - if not tf.gfile.Exists(f): - raise ValueError('Failed to find file: ' + f) - - # Create a queue that produces the filenames to read. - filename_queue = tf.train.string_input_producer(filenames) - - # Read examples from files in the filename queue. - read_input = read_cifar10(filename_queue) - reshaped_image = tf.cast(read_input.uint8image, tf.float32) - - height = IMAGE_SIZE - width = IMAGE_SIZE - - # Image processing for training the network. Note the many random - # distortions applied to the image. - - # Randomly crop a [height, width] section of the image. - distorted_image = tf.random_crop(reshaped_image, [height, width, 3]) - - # Randomly flip the image horizontally. 
- distorted_image = tf.image.random_flip_left_right(distorted_image) - - # Because these operations are not commutative, consider randomizing - # the order their operation. - distorted_image = tf.image.random_brightness(distorted_image, - max_delta=63) - distorted_image = tf.image.random_contrast(distorted_image, - lower=0.2, upper=1.8) - - # Subtract off the mean and divide by the variance of the pixels. - float_image = tf.image.per_image_standardization(distorted_image) - - # Set the shapes of tensors. - float_image.set_shape([height, width, 3]) - read_input.label.set_shape([1]) - - # Ensure that the random shuffling has good mixing properties. - min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * - min_fraction_of_examples_in_queue) - print ('Filling queue with %d CIFAR images before starting to train. ' - 'This will take a few minutes.' % min_queue_examples) - - # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=True) - - -def inputs(eval_data, data_dir, batch_size): - """Construct input for CIFAR evaluation using the Reader ops. - - Args: - eval_data: bool, indicating if one should use the train or eval data set. - data_dir: Path to the CIFAR-10 data directory. - batch_size: Number of images per batch. - - Returns: - images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. - labels: Labels. 1D tensor of [batch_size] size. 
- """ - if not eval_data: - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] - num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN - else: - filenames = [os.path.join(data_dir, 'test_batch.bin')] - num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL - - for f in filenames: - if not tf.gfile.Exists(f): - raise ValueError('Failed to find file: ' + f) - - # Create a queue that produces the filenames to read. - filename_queue = tf.train.string_input_producer(filenames) - - # Read examples from files in the filename queue. - read_input = read_cifar10(filename_queue) - reshaped_image = tf.cast(read_input.uint8image, tf.float32) - - height = IMAGE_SIZE - width = IMAGE_SIZE - - # Image processing for evaluation. - # Crop the central [height, width] of the image. - resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, - height, width) - - # Subtract off the mean and divide by the variance of the pixels. - float_image = tf.image.per_image_standardization(resized_image) - - # Set the shapes of tensors. - float_image.set_shape([height, width, 3]) - read_input.label.set_shape([1]) - - # Ensure that the random shuffling has good mixing properties. - min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(num_examples_per_epoch * - min_fraction_of_examples_in_queue) - - # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=False) diff --git a/examples/cifar10/cifar10_input_test.py b/examples/cifar10/cifar10_input_test.py deleted file mode 100644 index dbae1cab..00000000 --- a/examples/cifar10/cifar10_input_test.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Tests for cifar10 input.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import tensorflow as tf - -import cifar10_input - - -class CIFAR10InputTest(tf.test.TestCase): - - def _record(self, label, red, green, blue): - image_size = 32 * 32 - record = bytes(bytearray([label] + [red] * image_size + - [green] * image_size + [blue] * image_size)) - expected = [[[red, green, blue]] * 32] * 32 - return record, expected - - def testSimple(self): - labels = [9, 3, 0] - records = [self._record(labels[0], 0, 128, 255), - self._record(labels[1], 255, 0, 1), - self._record(labels[2], 254, 255, 0)] - contents = b"".join([record for record, _ in records]) - expected = [expected for _, expected in records] - filename = os.path.join(self.get_temp_dir(), "cifar") - open(filename, "wb").write(contents) - - with self.test_session() as sess: - q = tf.FIFOQueue(99, [tf.string], shapes=()) - q.enqueue([filename]).run() - q.close().run() - result = cifar10_input.read_cifar10(q) - - for i in range(3): - key, label, uint8image = sess.run([ - result.key, result.label, result.uint8image]) - self.assertEqual("%s:%d" % (filename, i), tf.compat.as_text(key)) - self.assertEqual(labels[i], label) - self.assertAllEqual(expected[i], uint8image) - - with self.assertRaises(tf.errors.OutOfRangeError): - sess.run([result.key, result.uint8image]) - - -if __name__ == "__main__": - tf.test.main() diff --git 
a/examples/cifar10/cifar10_multi_gpu_train.py b/examples/cifar10/cifar10_multi_gpu_train.py deleted file mode 100644 index 71f52eab..00000000 --- a/examples/cifar10/cifar10_multi_gpu_train.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""A binary to train CIFAR-10 using multiple GPU's with synchronous updates. - -Accuracy: -cifar10_multi_gpu_train.py achieves ~86% accuracy after 100K steps (256 -epochs of data) as judged by cifar10_eval.py. - -Speed: With batch_size 128. - -System | Step Time (sec/batch) | Accuracy --------------------------------------------------------------------- -1 Tesla K20m | 0.35-0.60 | ~86% at 60K steps (5 hours) -1 Tesla K40m | 0.25-0.35 | ~86% at 100K steps (4 hours) -2 Tesla K20m | 0.13-0.20 | ~84% at 30K steps (2.5 hours) -3 Tesla K20m | 0.13-0.18 | ~84% at 30K steps -4 Tesla K20m | ~0.10 | ~84% at 30K steps - -Usage: -Please see the tutorial and website for how to download the CIFAR-10 -data set, compile the program and train the model. 
- -http://tensorflow.org/tutorials/deep_cnn/ -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from tensorflowonspark import TFCluster, TFNode -from datetime import datetime - -import os.path -import re -import sys -import time - -import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin - -def main_fun(argv, ctx): - import tensorflow as tf - import cifar10 - - sys.argv = argv - FLAGS = tf.app.flags.FLAGS - tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train', - """Directory where to write event logs """ - """and checkpoint.""") - tf.app.flags.DEFINE_integer('max_steps', 1000000, - """Number of batches to run.""") - tf.app.flags.DEFINE_integer('num_gpus', 1, - """How many GPUs to use.""") - tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") - cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma) - - def tower_loss(scope): - """Calculate the total loss on a single tower running the CIFAR model. - - Args: - scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' - - Returns: - Tensor of shape [] containing the total loss for a batch of data - """ - # Get images and labels for CIFAR-10. - images, labels = cifar10.distorted_inputs() - - # Build inference Graph. - logits = cifar10.inference(images) - - # Build the portion of the Graph calculating the losses. Note that we will - # assemble the total_loss using a custom function below. - _ = cifar10.loss(logits, labels) - - # Assemble all of the losses for the current tower only. - losses = tf.get_collection('losses', scope) - - # Calculate the total loss for the current tower. 
- total_loss = tf.add_n(losses, name='total_loss') - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. - loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) - tf.summary.scalar(loss_name, l) - - return total_loss - - - def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. - - Note that this function provides a synchronization point across all towers. - - Args: - tower_grads: List of lists of (gradient, variable) tuples. The outer list - is over individual gradients. The inner list is over the gradient - calculation for each tower. - Returns: - List of pairs of (gradient, variable) where the gradient has been averaged - across all towers. - """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - - def train(): - """Train CIFAR-10 for a number of steps.""" - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. 
This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE, - global_step, - decay_steps, - cifar10.LEARNING_RATE_DECAY_FACTOR, - staircase=True) - - # Create an optimizer that performs gradient descent. - opt = tf.train.GradientDescentOptimizer(lr) - - # Calculate the gradients for each model tower. - tower_grads = [] - with tf.variable_scope(tf.get_variable_scope()): - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the CIFAR model. This function - # constructs the entire CIFAR model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. - summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this CIFAR tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Add a summary to track the learning rate. - summaries.append(tf.summary.scalar('learning_rate', lr)) - - # Add histograms for gradients. 
- for grad, var in grads: - if grad is not None: - summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad)) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Add histograms for trainable variables. - for var in tf.trainable_variables(): - summaries.append(tf.summary.histogram(var.op.name, var)) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage( - cifar10.MOVING_AVERAGE_DECAY, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op, variables_averages_op) - - # Create a saver. - saver = tf.train.Saver(tf.global_variables()) - - # Build the summary operation from the last tower summaries. - summary_op = tf.summary.merge(summaries) - - # Build an initialization operation to run below. - init = tf.global_variables_initializer() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - # Start the queue runners. 
- tf.train.start_queue_runners(sess=sess) - - summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) - - for step in xrange(FLAGS.max_steps): - start_time = time.time() - _, loss_value = sess.run([train_op, loss]) - duration = time.time() - start_time - - assert not np.isnan(loss_value), 'Model diverged with loss = NaN' - - if step % 10 == 0: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - sec_per_batch = duration / FLAGS.num_gpus - - format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch)') - print (format_str % (datetime.now(), step, loss_value, - examples_per_sec, sec_per_batch)) - - if step % 100 == 0: - summary_str = sess.run(summary_op) - summary_writer.add_summary(summary_str, step) - - # Save the model checkpoint periodically. - if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: - checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') - saver.save(sess, checkpoint_path, global_step=step) - - # cifar10.maybe_download_and_extract() - if tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.DeleteRecursively(FLAGS.train_dir) - tf.gfile.MakeDirs(FLAGS.train_dir) - train() - - -if __name__ == '__main__': - sc = SparkContext(conf=SparkConf().setAppName("cifar10_multi_gpu_train")) - num_executors = int(sc._conf.get("spark.executor.instances")) - num_ps = 0 - - cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) - cluster.shutdown() diff --git a/examples/cifar10/cifar10_train.py b/examples/cifar10/cifar10_train.py deleted file mode 100644 index 51549741..00000000 --- a/examples/cifar10/cifar10_train.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""A binary to train CIFAR-10 using a single GPU. - -Accuracy: -cifar10_train.py achieves ~86% accuracy after 100K steps (256 epochs of -data) as judged by cifar10_eval.py. - -Speed: With batch_size 128. - -System | Step Time (sec/batch) | Accuracy ------------------------------------------------------------------- -1 Tesla K20m | 0.35-0.60 | ~86% at 60K steps (5 hours) -1 Tesla K40m | 0.25-0.35 | ~86% at 100K steps (4 hours) - -Usage: -Please see the tutorial and website for how to download the CIFAR-10 -data set, compile the program and train the model. 
- -http://tensorflow.org/tutorials/deep_cnn/ -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from tensorflowonspark import TFCluster, TFNode -from datetime import datetime - -import os.path -import sys -import time - -def main_fun(argv, ctx): - import tensorflow as tf - import cifar10 - - sys.argv = argv - FLAGS = tf.app.flags.FLAGS - tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train', - """Directory where to write event logs """ - """and checkpoint.""") - tf.app.flags.DEFINE_integer('max_steps', 1000000, - """Number of batches to run.""") - tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") - - # cifar10.maybe_download_and_extract() - if tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.DeleteRecursively(FLAGS.train_dir) - tf.gfile.MakeDirs(FLAGS.train_dir) - - cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma) - - # Train CIFAR-10 for a number of steps. - with tf.Graph().as_default(): - global_step = tf.contrib.framework.get_or_create_global_step() - - # Get images and labels for CIFAR-10. - images, labels = cifar10.distorted_inputs() - - # Build a Graph that computes the logits predictions from the - # inference model. - logits = cifar10.inference(images) - - # Calculate loss. - loss = cifar10.loss(logits, labels) - - # Build a Graph that trains the model with one batch of examples and - # updates the model parameters. - train_op = cifar10.train(loss, global_step) - - class _LoggerHook(tf.train.SessionRunHook): - """Logs loss and runtime.""" - - def begin(self): - self._step = -1 - - def before_run(self, run_context): - self._step += 1 - self._start_time = time.time() - return tf.train.SessionRunArgs(loss) # Asks for loss value. 
- - def after_run(self, run_context, run_values): - duration = time.time() - self._start_time - loss_value = run_values.results - if self._step % 10 == 0: - num_examples_per_step = FLAGS.batch_size - examples_per_sec = num_examples_per_step / duration - sec_per_batch = float(duration) - - format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch)') - print (format_str % (datetime.now(), self._step, loss_value, - examples_per_sec, sec_per_batch)) - - with tf.train.MonitoredTrainingSession( - checkpoint_dir=FLAGS.train_dir, - hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), - tf.train.NanTensorHook(loss), - _LoggerHook()], - config=tf.ConfigProto( - log_device_placement=FLAGS.log_device_placement)) as mon_sess: - while not mon_sess.should_stop(): - mon_sess.run(train_op) - - -if __name__ == '__main__': - sc = SparkContext(conf=SparkConf().setAppName("cifar10_train")) - num_executors = int(sc._conf.get("spark.executor.instances")) - num_ps = 0 - - cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) - cluster.shutdown() diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md deleted file mode 100644 index 58acf72d..00000000 --- a/examples/imagenet/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# Inception V3 CNN - -Original Source: https://github.com/tensorflow/models/tree/master/inception - -In this example, we leave the code largely untouched, leveraging TensorFlowOnSpark to launch the cluster in the Hadoop grid. -To view the differences, you can compare the original `imagenet_distributed_train.py` with the version here. - -These instructions are intended for a Spark/YARN grid, so please ensure that you have followed [these instructions](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARN) first. - -Also, you will need to [download the Imagenet dataset per the original example](https://github.com/tensorflow/models/tree/master/inception#getting-started). 
- -#### Package the inception code as a Python zip/module - - export TFoS_HOME= - pushd ${TFoS_HOME}/examples/imagenet; zip -r ~/inception.zip inception; popd - -#### Run distributed CNN on Spark - - # set environment variables (if not already done) - export PYTHON_ROOT=~/Python - export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python - export PATH=${PYTHON_ROOT}/bin/:$PATH - export QUEUE=gpu - export IMAGENET_DATA= - - # for CPU mode: - # export QUEUE=default - # --conf spark.executorEnv.LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server" \ - # remove --driver-library-path - - # hadoop fs -rm -r imagenet_train - ${SPARK_HOME}/bin/spark-submit \ - --master yarn \ - --deploy-mode cluster \ - --queue ${QUEUE} \ - --num-executors 4 \ - --executor-memory 27G \ - --py-files ${TFoS_HOME}/tfspark.zip,inception.zip \ - --conf spark.dynamicAllocation.enabled=false \ - --conf spark.yarn.maxAppAttempts=1 \ - --archives hdfs:///user/${USER}/Python.zip#Python \ - --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \ - --driver-library-path="/usr/local/cuda-7.5/lib64" \ - ${TFoS_HOME}/examples/imagenet/inception/imagenet_distributed_train.py \ - --data_dir ${IMAGENET_DATA} \ - --train_dir hdfs://default/user/${USER}/imagenet_train \ - --max_steps 1000 \ - --subset train - # to use infiniband, replace the last line with --subset train --rdma - -#### Run evaluation job on Spark - -To evaluate the model, run the following job after the training has completed. This will calculate the "precision @ 1" metric for the trained model. Note: since we only trained for 1000 steps, the reported metric will be very poor. So, to train a better model, you can increase the `--max_steps` above, and then run the evaluation job in parallel by removing the `--run_once` argument. This will periodically calculate the metric while training is in progress. 
You can terminate training and/or eval at any time using the standard `yarn application -kill ` command, and the latest model will be stored in your `imagenet_train` HDFS directory. - - # for CPU mode: - # export QUEUE=default - # --conf spark.executorEnv.LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server" \ - # remove --driver-library-path - - # hadoop fs -rm -r imagenet_eval - ${SPARK_HOME}/bin/spark-submit \ - --master yarn \ - --deploy-mode cluster \ - --queue ${QUEUE} \ - --num-executors 1 \ - --executor-memory 27G \ - --py-files ${TFoS_HOME}/tfspark.zip,inception.zip \ - --conf spark.dynamicAllocation.enabled=false \ - --conf spark.yarn.maxAppAttempts=1 \ - --archives hdfs:///user/${USER}/Python.zip#Python \ - --conf spark.executorEnv.LD_LIBRARY_PATH="/usr/local/cuda-7.5/lib64:$JAVA_HOME/jre/lib/amd64/server" \ - --driver-library-path="/usr/local/cuda-7.5/lib64" \ - ${TFoS_HOME}/examples/imagenet/inception/imagenet_eval.py \ - --data_dir ${IMAGENET_DATA} \ - --checkpoint_dir hdfs://default/user/${USER}/imagenet_train \ - --eval_dir hdfs://default/user/${USER}/imagenet_eval \ - --subset validation \ - --run_once diff --git a/examples/imagenet/inception/BUILD b/examples/imagenet/inception/BUILD deleted file mode 100644 index 21fc27aa..00000000 --- a/examples/imagenet/inception/BUILD +++ /dev/null @@ -1,198 +0,0 @@ -# Description: -# Example TensorFlow models for ImageNet. 
- -package(default_visibility = [":internal"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -package_group( - name = "internal", - packages = ["//inception/..."], -) - -py_library( - name = "dataset", - srcs = [ - "dataset.py", - ], -) - -py_library( - name = "imagenet_data", - srcs = [ - "imagenet_data.py", - ], - deps = [ - ":dataset", - ], -) - -py_library( - name = "flowers_data", - srcs = [ - "flowers_data.py", - ], - deps = [ - ":dataset", - ], -) - -py_library( - name = "image_processing", - srcs = [ - "image_processing.py", - ], -) - -py_library( - name = "inception", - srcs = [ - "inception_model.py", - ], - visibility = ["//visibility:public"], - deps = [ - ":dataset", - "//inception/slim", - ], -) - -py_binary( - name = "imagenet_eval", - srcs = [ - "imagenet_eval.py", - ], - deps = [ - ":imagenet_data", - ":inception_eval", - ], -) - -py_binary( - name = "flowers_eval", - srcs = [ - "flowers_eval.py", - ], - deps = [ - ":flowers_data", - ":inception_eval", - ], -) - -py_library( - name = "inception_eval", - srcs = [ - "inception_eval.py", - ], - deps = [ - ":image_processing", - ":inception", - ], -) - -py_binary( - name = "imagenet_train", - srcs = [ - "imagenet_train.py", - ], - deps = [ - ":imagenet_data", - ":inception_train", - ], -) - -py_binary( - name = "imagenet_distributed_train", - srcs = [ - "imagenet_distributed_train.py", - ], - deps = [ - ":imagenet_data", - ":inception_distributed_train", - ], -) - -py_binary( - name = "flowers_train", - srcs = [ - "flowers_train.py", - ], - deps = [ - ":flowers_data", - ":inception_train", - ], -) - -py_library( - name = "inception_train", - srcs = [ - "inception_train.py", - ], - deps = [ - ":image_processing", - ":inception", - ], -) - -py_library( - name = "inception_distributed_train", - srcs = [ - "inception_distributed_train.py", - ], - deps = [ - ":image_processing", - ":inception", - ], -) - -py_binary( - name = "build_image_data", - srcs = ["data/build_image_data.py"], 
-) - -sh_binary( - name = "download_and_preprocess_flowers", - srcs = ["data/download_and_preprocess_flowers.sh"], - data = [ - ":build_image_data", - ], -) - -sh_binary( - name = "download_and_preprocess_imagenet", - srcs = ["data/download_and_preprocess_imagenet.sh"], - data = [ - "data/download_imagenet.sh", - "data/imagenet_2012_validation_synset_labels.txt", - "data/imagenet_lsvrc_2015_synsets.txt", - "data/imagenet_metadata.txt", - "data/preprocess_imagenet_validation_data.py", - "data/process_bounding_boxes.py", - ":build_imagenet_data", - ], -) - -py_binary( - name = "build_imagenet_data", - srcs = ["data/build_imagenet_data.py"], -) - -filegroup( - name = "srcs", - srcs = glob( - [ - "**/*.py", - "BUILD", - ], - ), -) - -filegroup( - name = "imagenet_metadata", - srcs = [ - "data/imagenet_lsvrc_2015_synsets.txt", - "data/imagenet_metadata.txt", - ], - visibility = ["//visibility:public"], -) diff --git a/examples/imagenet/inception/__init__.py b/examples/imagenet/inception/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/imagenet/inception/data/__init__.py b/examples/imagenet/inception/data/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/imagenet/inception/data/build_image_data.py b/examples/imagenet/inception/data/build_image_data.py deleted file mode 100644 index 8727e3cd..00000000 --- a/examples/imagenet/inception/data/build_image_data.py +++ /dev/null @@ -1,431 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Converts image data to TFRecords file format with Example protos. - -The image data set is expected to reside in JPEG files located in the -following directory structure. - - data_dir/label_0/image0.jpeg - data_dir/label_0/image1.jpg - ... - data_dir/label_1/weird-image.jpeg - data_dir/label_1/my-image.jpeg - ... - -where the sub-directory is the unique label associated with these images. - -This TensorFlow script converts the training and evaluation data into -a sharded data set consisting of TFRecord files - - train_directory/train-00000-of-01024 - train_directory/train-00001-of-01024 - ... - train_directory/train-00127-of-01024 - -and - - validation_directory/validation-00000-of-00128 - validation_directory/validation-00001-of-00128 - ... - validation_directory/validation-00127-of-00128 - -where we have selected 1024 and 128 shards for each data set. Each record -within the TFRecord file is a serialized Example proto. The Example proto -contains the following fields: - - image/encoded: string containing JPEG encoded image in RGB colorspace - image/height: integer, image height in pixels - image/width: integer, image width in pixels - image/colorspace: string, specifying the colorspace, always 'RGB' - image/channels: integer, specifying the number of channels, always 3 - image/format: string, specifying the format, always'JPEG' - - image/filename: string containing the basename of the image file - e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG' - image/class/label: integer specifying the index in a classification layer. - The label ranges from [0, num_labels] where 0 is unused and left as - the background class. - image/class/text: string specifying the human-readable version of the label - e.g. 
'dog' - -If you data set involves bounding boxes, please look at build_imagenet_data.py. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from datetime import datetime -import os -import random -import sys -import threading - - -import numpy as np -import tensorflow as tf - -tf.app.flags.DEFINE_string('train_directory', '/tmp/', - 'Training data directory') -tf.app.flags.DEFINE_string('validation_directory', '/tmp/', - 'Validation data directory') -tf.app.flags.DEFINE_string('output_directory', '/tmp/', - 'Output data directory') - -tf.app.flags.DEFINE_integer('train_shards', 2, - 'Number of shards in training TFRecord files.') -tf.app.flags.DEFINE_integer('validation_shards', 2, - 'Number of shards in validation TFRecord files.') - -tf.app.flags.DEFINE_integer('num_threads', 2, - 'Number of threads to preprocess the images.') - -# The labels file contains a list of valid labels are held in this file. -# Assumes that the file contains entries as such: -# dog -# cat -# flower -# where each line corresponds to a label. We map each label contained in -# the file to an integer corresponding to the line number starting from 0. -tf.app.flags.DEFINE_string('labels_file', '', 'Labels file') - - -FLAGS = tf.app.flags.FLAGS - - -def _int64_feature(value): - """Wrapper for inserting int64 features into Example proto.""" - if not isinstance(value, list): - value = [value] - return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) - - -def _bytes_feature(value): - """Wrapper for inserting bytes features into Example proto.""" - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - -def _convert_to_example(filename, image_buffer, label, text, height, width): - """Build an Example proto for an example. 
- - Args: - filename: string, path to an image file, e.g., '/path/to/example.JPG' - image_buffer: string, JPEG encoding of RGB image - label: integer, identifier for the ground truth for the network - text: string, unique human-readable, e.g. 'dog' - height: integer, image height in pixels - width: integer, image width in pixels - Returns: - Example proto - """ - - colorspace = 'RGB' - channels = 3 - image_format = 'JPEG' - - example = tf.train.Example(features=tf.train.Features(feature={ - 'image/height': _int64_feature(height), - 'image/width': _int64_feature(width), - 'image/colorspace': _bytes_feature(tf.compat.as_bytes(colorspace)), - 'image/channels': _int64_feature(channels), - 'image/class/label': _int64_feature(label), - 'image/class/text': _bytes_feature(tf.compat.as_bytes(text)), - 'image/format': _bytes_feature(tf.compat.as_bytes(image_format)), - 'image/filename': _bytes_feature(tf.compat.as_bytes(os.path.basename(filename))), - 'image/encoded': _bytes_feature(tf.compat.as_bytes(image_buffer))})) - return example - - -class ImageCoder(object): - """Helper class that provides TensorFlow image coding utilities.""" - - def __init__(self): - # Create a single Session to run all image coding calls. - self._sess = tf.Session() - - # Initializes function that converts PNG to JPEG data. - self._png_data = tf.placeholder(dtype=tf.string) - image = tf.image.decode_png(self._png_data, channels=3) - self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100) - - # Initializes function that decodes RGB JPEG data. 
- self._decode_jpeg_data = tf.placeholder(dtype=tf.string) - self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) - - def png_to_jpeg(self, image_data): - return self._sess.run(self._png_to_jpeg, - feed_dict={self._png_data: image_data}) - - def decode_jpeg(self, image_data): - image = self._sess.run(self._decode_jpeg, - feed_dict={self._decode_jpeg_data: image_data}) - assert len(image.shape) == 3 - assert image.shape[2] == 3 - return image - - -def _is_png(filename): - """Determine if a file contains a PNG format image. - - Args: - filename: string, path of the image file. - - Returns: - boolean indicating if the image is a PNG. - """ - return '.png' in filename - - -def _process_image(filename, coder): - """Process a single image file. - - Args: - filename: string, path to an image file e.g., '/path/to/example.JPG'. - coder: instance of ImageCoder to provide TensorFlow image coding utils. - Returns: - image_buffer: string, JPEG encoding of RGB image. - height: integer, image height in pixels. - width: integer, image width in pixels. - """ - # Read the image file. - with tf.gfile.FastGFile(filename, 'r') as f: - image_data = f.read() - - # Convert any PNG to JPEG's for consistency. - if _is_png(filename): - print('Converting PNG to JPEG for %s' % filename) - image_data = coder.png_to_jpeg(image_data) - - # Decode the RGB JPEG. - image = coder.decode_jpeg(image_data) - - # Check that image converted to RGB - assert len(image.shape) == 3 - height = image.shape[0] - width = image.shape[1] - assert image.shape[2] == 3 - - return image_data, height, width - - -def _process_image_files_batch(coder, thread_index, ranges, name, filenames, - texts, labels, num_shards): - """Processes and saves list of images as TFRecord in 1 thread. - - Args: - coder: instance of ImageCoder to provide TensorFlow image coding utils. - thread_index: integer, unique batch to run index is within [0, len(ranges)). 
- ranges: list of pairs of integers specifying ranges of each batches to - analyze in parallel. - name: string, unique identifier specifying the data set - filenames: list of strings; each string is a path to an image file - texts: list of strings; each string is human readable, e.g. 'dog' - labels: list of integer; each integer identifies the ground truth - num_shards: integer number of shards for this data set. - """ - # Each thread produces N shards where N = int(num_shards / num_threads). - # For instance, if num_shards = 128, and the num_threads = 2, then the first - # thread would produce shards [0, 64). - num_threads = len(ranges) - assert not num_shards % num_threads - num_shards_per_batch = int(num_shards / num_threads) - - shard_ranges = np.linspace(ranges[thread_index][0], - ranges[thread_index][1], - num_shards_per_batch + 1).astype(int) - num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] - - counter = 0 - for s in range(num_shards_per_batch): - # Generate a sharded version of the file name, e.g. 'train-00002-of-00010' - shard = thread_index * num_shards_per_batch + s - output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) - output_file = os.path.join(FLAGS.output_directory, output_filename) - writer = tf.python_io.TFRecordWriter(output_file) - - shard_counter = 0 - files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) - for i in files_in_shard: - filename = filenames[i] - label = labels[i] - text = texts[i] - - image_buffer, height, width = _process_image(filename, coder) - - example = _convert_to_example(filename, image_buffer, label, - text, height, width) - writer.write(example.SerializeToString()) - shard_counter += 1 - counter += 1 - - if not counter % 1000: - print('%s [thread %d]: Processed %d of %d images in thread batch.' 
% - (datetime.now(), thread_index, counter, num_files_in_thread)) - sys.stdout.flush() - - writer.close() - print('%s [thread %d]: Wrote %d images to %s' % - (datetime.now(), thread_index, shard_counter, output_file)) - sys.stdout.flush() - shard_counter = 0 - print('%s [thread %d]: Wrote %d images to %d shards.' % - (datetime.now(), thread_index, counter, num_files_in_thread)) - sys.stdout.flush() - - -def _process_image_files(name, filenames, texts, labels, num_shards): - """Process and save list of images as TFRecord of Example protos. - - Args: - name: string, unique identifier specifying the data set - filenames: list of strings; each string is a path to an image file - texts: list of strings; each string is human readable, e.g. 'dog' - labels: list of integer; each integer identifies the ground truth - num_shards: integer number of shards for this data set. - """ - assert len(filenames) == len(texts) - assert len(filenames) == len(labels) - - # Break all images into batches with a [ranges[i][0], ranges[i][1]]. - spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int) - ranges = [] - for i in range(len(spacing) - 1): - ranges.append([spacing[i], spacing[i+1]]) - - # Launch a thread for each batch. - print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) - sys.stdout.flush() - - # Create a mechanism for monitoring when all threads are finished. - coord = tf.train.Coordinator() - - # Create a generic TensorFlow-based utility for converting all image codings. - coder = ImageCoder() - - threads = [] - for thread_index in range(len(ranges)): - args = (coder, thread_index, ranges, name, filenames, - texts, labels, num_shards) - t = threading.Thread(target=_process_image_files_batch, args=args) - t.start() - threads.append(t) - - # Wait for all the threads to terminate. - coord.join(threads) - print('%s: Finished writing all %d images in data set.' 
% - (datetime.now(), len(filenames))) - sys.stdout.flush() - - -def _find_image_files(data_dir, labels_file): - """Build a list of all images files and labels in the data set. - - Args: - data_dir: string, path to the root directory of images. - - Assumes that the image data set resides in JPEG files located in - the following directory structure. - - data_dir/dog/another-image.JPEG - data_dir/dog/my-image.jpg - - where 'dog' is the label associated with these images. - - labels_file: string, path to the labels file. - - The list of valid labels are held in this file. Assumes that the file - contains entries as such: - dog - cat - flower - where each line corresponds to a label. We map each label contained in - the file to an integer starting with the integer 0 corresponding to the - label contained in the first line. - - Returns: - filenames: list of strings; each string is a path to an image file. - texts: list of strings; each string is the class, e.g. 'dog' - labels: list of integer; each integer identifies the ground truth. - """ - print('Determining list of input files and labels from %s.' % data_dir) - unique_labels = [l.strip() for l in tf.gfile.FastGFile( - labels_file, 'r').readlines()] - - labels = [] - filenames = [] - texts = [] - - # Leave label index 0 empty as a background class. - label_index = 1 - - # Construct the list of JPEG files and labels. - for text in unique_labels: - jpeg_file_path = '%s/%s/*' % (data_dir, text) - matching_files = tf.gfile.Glob(jpeg_file_path) - - labels.extend([label_index] * len(matching_files)) - texts.extend([text] * len(matching_files)) - filenames.extend(matching_files) - - if not label_index % 100: - print('Finished finding files in %d of %d classes.' % ( - label_index, len(labels))) - label_index += 1 - - # Shuffle the ordering of all image files in order to guarantee - # random ordering of the images with respect to label in the - # saved TFRecord files. Make the randomization repeatable. 
- shuffled_index = list(range(len(filenames))) - random.seed(12345) - random.shuffle(shuffled_index) - - filenames = [filenames[i] for i in shuffled_index] - texts = [texts[i] for i in shuffled_index] - labels = [labels[i] for i in shuffled_index] - - print('Found %d JPEG files across %d labels inside %s.' % - (len(filenames), len(unique_labels), data_dir)) - return filenames, texts, labels - - -def _process_dataset(name, directory, num_shards, labels_file): - """Process a complete data set and save it as a TFRecord. - - Args: - name: string, unique identifier specifying the data set. - directory: string, root path to the data set. - num_shards: integer number of shards for this data set. - labels_file: string, path to the labels file. - """ - filenames, texts, labels = _find_image_files(directory, labels_file) - _process_image_files(name, filenames, texts, labels, num_shards) - - -def main(unused_argv): - assert not FLAGS.train_shards % FLAGS.num_threads, ( - 'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards') - assert not FLAGS.validation_shards % FLAGS.num_threads, ( - 'Please make the FLAGS.num_threads commensurate with ' - 'FLAGS.validation_shards') - print('Saving results to %s' % FLAGS.output_directory) - - # Run it! - _process_dataset('validation', FLAGS.validation_directory, - FLAGS.validation_shards, FLAGS.labels_file) - _process_dataset('train', FLAGS.train_directory, - FLAGS.train_shards, FLAGS.labels_file) - - -if __name__ == '__main__': - tf.app.run() diff --git a/examples/imagenet/inception/data/build_imagenet_data.py b/examples/imagenet/inception/data/build_imagenet_data.py deleted file mode 100644 index a830f97b..00000000 --- a/examples/imagenet/inception/data/build_imagenet_data.py +++ /dev/null @@ -1,704 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Converts ImageNet data to TFRecords file format with Example protos. - -The raw ImageNet data set is expected to reside in JPEG files located in the -following directory structure. - - data_dir/n01440764/ILSVRC2012_val_00000293.JPEG - data_dir/n01440764/ILSVRC2012_val_00000543.JPEG - ... - -where 'n01440764' is the unique synset label associated with -these images. - -The training data set consists of 1000 sub-directories (i.e. labels) -each containing 1200 JPEG images for a total of 1.2M JPEG images. - -The evaluation data set consists of 1000 sub-directories (i.e. labels) -each containing 50 JPEG images for a total of 50K JPEG images. - -This TensorFlow script converts the training and evaluation data into -a sharded data set consisting of 1024 and 128 TFRecord files, respectively. - - train_directory/train-00000-of-01024 - train_directory/train-00001-of-01024 - ... - train_directory/train-00127-of-01024 - -and - - validation_directory/validation-00000-of-00128 - validation_directory/validation-00001-of-00128 - ... - validation_directory/validation-00127-of-00128 - -Each validation TFRecord file contains ~390 records. Each training TFREcord -file contains ~1250 records. Each record within the TFRecord file is a -serialized Example proto. 
The Example proto contains the following fields: - - image/encoded: string containing JPEG encoded image in RGB colorspace - image/height: integer, image height in pixels - image/width: integer, image width in pixels - image/colorspace: string, specifying the colorspace, always 'RGB' - image/channels: integer, specifying the number of channels, always 3 - image/format: string, specifying the format, always'JPEG' - - image/filename: string containing the basename of the image file - e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG' - image/class/label: integer specifying the index in a classification layer. - The label ranges from [1, 1000] where 0 is not used. - image/class/synset: string specifying the unique ID of the label, - e.g. 'n01440764' - image/class/text: string specifying the human-readable version of the label - e.g. 'red fox, Vulpes vulpes' - - image/object/bbox/xmin: list of integers specifying the 0+ human annotated - bounding boxes - image/object/bbox/xmax: list of integers specifying the 0+ human annotated - bounding boxes - image/object/bbox/ymin: list of integers specifying the 0+ human annotated - bounding boxes - image/object/bbox/ymax: list of integers specifying the 0+ human annotated - bounding boxes - image/object/bbox/label: integer specifying the index in a classification - layer. The label ranges from [1, 1000] where 0 is not used. Note this is - always identical to the image label. - -Note that the length of xmin is identical to the length of xmax, ymin and ymax -for each example. - -Running this script using 16 threads may take around ~2.5 hours on a HP Z420. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from datetime import datetime -import os -import random -import sys -import threading - - -import numpy as np -import tensorflow as tf - -tf.app.flags.DEFINE_string('train_directory', '/tmp/', - 'Training data directory') -tf.app.flags.DEFINE_string('validation_directory', '/tmp/', - 'Validation data directory') -tf.app.flags.DEFINE_string('output_directory', '/tmp/', - 'Output data directory') - -tf.app.flags.DEFINE_integer('train_shards', 1024, - 'Number of shards in training TFRecord files.') -tf.app.flags.DEFINE_integer('validation_shards', 128, - 'Number of shards in validation TFRecord files.') - -tf.app.flags.DEFINE_integer('num_threads', 8, - 'Number of threads to preprocess the images.') - -# The labels file contains a list of valid labels are held in this file. -# Assumes that the file contains entries as such: -# n01440764 -# n01443537 -# n01484850 -# where each line corresponds to a label expressed as a synset. We map -# each synset contained in the file to an integer (based on the alphabetical -# ordering). See below for details. -tf.app.flags.DEFINE_string('labels_file', - 'imagenet_lsvrc_2015_synsets.txt', - 'Labels file') - -# This file containing mapping from synset to human-readable label. -# Assumes each line of the file looks like: -# -# n02119247 black fox -# n02119359 silver fox -# n02119477 red fox, Vulpes fulva -# -# where each line corresponds to a unique mapping. Note that each line is -# formatted as \t. -tf.app.flags.DEFINE_string('imagenet_metadata_file', - 'imagenet_metadata.txt', - 'ImageNet metadata file') - -# This file is the output of process_bounding_box.py -# Assumes each line of the file looks like: -# -# n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940 -# -# where each line corresponds to one bounding box annotation associated -# with an image. 
Each line can be parsed as: -# -# , , , , -# -# Note that there might exist mulitple bounding box annotations associated -# with an image file. -tf.app.flags.DEFINE_string('bounding_box_file', - './imagenet_2012_bounding_boxes.csv', - 'Bounding box file') - -FLAGS = tf.app.flags.FLAGS - - -def _int64_feature(value): - """Wrapper for inserting int64 features into Example proto.""" - if not isinstance(value, list): - value = [value] - return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) - - -def _float_feature(value): - """Wrapper for inserting float features into Example proto.""" - if not isinstance(value, list): - value = [value] - return tf.train.Feature(float_list=tf.train.FloatList(value=value)) - - -def _bytes_feature(value): - """Wrapper for inserting bytes features into Example proto.""" - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - -def _convert_to_example(filename, image_buffer, label, synset, human, bbox, - height, width): - """Build an Example proto for an example. - - Args: - filename: string, path to an image file, e.g., '/path/to/example.JPG' - image_buffer: string, JPEG encoding of RGB image - label: integer, identifier for the ground truth for the network - synset: string, unique WordNet ID specifying the label, e.g., 'n02323233' - human: string, human-readable label, e.g., 'red fox, Vulpes vulpes' - bbox: list of bounding boxes; each box is a list of integers - specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to - the same label as the image label. 
- height: integer, image height in pixels - width: integer, image width in pixels - Returns: - Example proto - """ - xmin = [] - ymin = [] - xmax = [] - ymax = [] - for b in bbox: - assert len(b) == 4 - # pylint: disable=expression-not-assigned - [l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)] - # pylint: enable=expression-not-assigned - - colorspace = 'RGB' - channels = 3 - image_format = 'JPEG' - - example = tf.train.Example(features=tf.train.Features(feature={ - 'image/height': _int64_feature(height), - 'image/width': _int64_feature(width), - 'image/colorspace': _bytes_feature(colorspace), - 'image/channels': _int64_feature(channels), - 'image/class/label': _int64_feature(label), - 'image/class/synset': _bytes_feature(synset), - 'image/class/text': _bytes_feature(human), - 'image/object/bbox/xmin': _float_feature(xmin), - 'image/object/bbox/xmax': _float_feature(xmax), - 'image/object/bbox/ymin': _float_feature(ymin), - 'image/object/bbox/ymax': _float_feature(ymax), - 'image/object/bbox/label': _int64_feature([label] * len(xmin)), - 'image/format': _bytes_feature(image_format), - 'image/filename': _bytes_feature(os.path.basename(filename)), - 'image/encoded': _bytes_feature(image_buffer)})) - return example - - -class ImageCoder(object): - """Helper class that provides TensorFlow image coding utilities.""" - - def __init__(self): - # Create a single Session to run all image coding calls. - self._sess = tf.Session() - - # Initializes function that converts PNG to JPEG data. - self._png_data = tf.placeholder(dtype=tf.string) - image = tf.image.decode_png(self._png_data, channels=3) - self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100) - - # Initializes function that converts CMYK JPEG data to RGB JPEG data. 
- self._cmyk_data = tf.placeholder(dtype=tf.string) - image = tf.image.decode_jpeg(self._cmyk_data, channels=0) - self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100) - - # Initializes function that decodes RGB JPEG data. - self._decode_jpeg_data = tf.placeholder(dtype=tf.string) - self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) - - def png_to_jpeg(self, image_data): - return self._sess.run(self._png_to_jpeg, - feed_dict={self._png_data: image_data}) - - def cmyk_to_rgb(self, image_data): - return self._sess.run(self._cmyk_to_rgb, - feed_dict={self._cmyk_data: image_data}) - - def decode_jpeg(self, image_data): - image = self._sess.run(self._decode_jpeg, - feed_dict={self._decode_jpeg_data: image_data}) - assert len(image.shape) == 3 - assert image.shape[2] == 3 - return image - - -def _is_png(filename): - """Determine if a file contains a PNG format image. - - Args: - filename: string, path of the image file. - - Returns: - boolean indicating if the image is a PNG. - """ - # File list from: - # https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU - return 'n02105855_2933.JPEG' in filename - - -def _is_cmyk(filename): - """Determine if file contains a CMYK JPEG format image. - - Args: - filename: string, path of the image file. - - Returns: - boolean indicating if the image is a JPEG encoded with CMYK color space. 
- """ - # File list from: - # https://github.com/cytsai/ilsvrc-cmyk-image-list - blacklist = ['n01739381_1309.JPEG', 'n02077923_14822.JPEG', - 'n02447366_23489.JPEG', 'n02492035_15739.JPEG', - 'n02747177_10752.JPEG', 'n03018349_4028.JPEG', - 'n03062245_4620.JPEG', 'n03347037_9675.JPEG', - 'n03467068_12171.JPEG', 'n03529860_11437.JPEG', - 'n03544143_17228.JPEG', 'n03633091_5218.JPEG', - 'n03710637_5125.JPEG', 'n03961711_5286.JPEG', - 'n04033995_2932.JPEG', 'n04258138_17003.JPEG', - 'n04264628_27969.JPEG', 'n04336792_7448.JPEG', - 'n04371774_5854.JPEG', 'n04596742_4225.JPEG', - 'n07583066_647.JPEG', 'n13037406_4650.JPEG'] - return filename.split('/')[-1] in blacklist - - -def _process_image(filename, coder): - """Process a single image file. - - Args: - filename: string, path to an image file e.g., '/path/to/example.JPG'. - coder: instance of ImageCoder to provide TensorFlow image coding utils. - Returns: - image_buffer: string, JPEG encoding of RGB image. - height: integer, image height in pixels. - width: integer, image width in pixels. - """ - # Read the image file. - with tf.gfile.FastGFile(filename, 'r') as f: - image_data = f.read() - - # Clean the dirty data. - if _is_png(filename): - # 1 image is a PNG. - print('Converting PNG to JPEG for %s' % filename) - image_data = coder.png_to_jpeg(image_data) - elif _is_cmyk(filename): - # 22 JPEG images are in CMYK colorspace. - print('Converting CMYK to RGB for %s' % filename) - image_data = coder.cmyk_to_rgb(image_data) - - # Decode the RGB JPEG. - image = coder.decode_jpeg(image_data) - - # Check that image converted to RGB - assert len(image.shape) == 3 - height = image.shape[0] - width = image.shape[1] - assert image.shape[2] == 3 - - return image_data, height, width - - -def _process_image_files_batch(coder, thread_index, ranges, name, filenames, - synsets, labels, humans, bboxes, num_shards): - """Processes and saves list of images as TFRecord in 1 thread. 
- - Args: - coder: instance of ImageCoder to provide TensorFlow image coding utils. - thread_index: integer, unique batch to run index is within [0, len(ranges)). - ranges: list of pairs of integers specifying ranges of each batches to - analyze in parallel. - name: string, unique identifier specifying the data set - filenames: list of strings; each string is a path to an image file - synsets: list of strings; each string is a unique WordNet ID - labels: list of integer; each integer identifies the ground truth - humans: list of strings; each string is a human-readable label - bboxes: list of bounding boxes for each image. Note that each entry in this - list might contain from 0+ entries corresponding to the number of bounding - box annotations for the image. - num_shards: integer number of shards for this data set. - """ - # Each thread produces N shards where N = int(num_shards / num_threads). - # For instance, if num_shards = 128, and the num_threads = 2, then the first - # thread would produce shards [0, 64). - num_threads = len(ranges) - assert not num_shards % num_threads - num_shards_per_batch = int(num_shards / num_threads) - - shard_ranges = np.linspace(ranges[thread_index][0], - ranges[thread_index][1], - num_shards_per_batch + 1).astype(int) - num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] - - counter = 0 - for s in range(num_shards_per_batch): - # Generate a sharded version of the file name, e.g. 
'train-00002-of-00010' - shard = thread_index * num_shards_per_batch + s - output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) - output_file = os.path.join(FLAGS.output_directory, output_filename) - writer = tf.python_io.TFRecordWriter(output_file) - - shard_counter = 0 - files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) - for i in files_in_shard: - filename = filenames[i] - label = labels[i] - synset = synsets[i] - human = humans[i] - bbox = bboxes[i] - - image_buffer, height, width = _process_image(filename, coder) - - example = _convert_to_example(filename, image_buffer, label, - synset, human, bbox, - height, width) - writer.write(example.SerializeToString()) - shard_counter += 1 - counter += 1 - - if not counter % 1000: - print('%s [thread %d]: Processed %d of %d images in thread batch.' % - (datetime.now(), thread_index, counter, num_files_in_thread)) - sys.stdout.flush() - - writer.close() - print('%s [thread %d]: Wrote %d images to %s' % - (datetime.now(), thread_index, shard_counter, output_file)) - sys.stdout.flush() - shard_counter = 0 - print('%s [thread %d]: Wrote %d images to %d shards.' % - (datetime.now(), thread_index, counter, num_files_in_thread)) - sys.stdout.flush() - - -def _process_image_files(name, filenames, synsets, labels, humans, - bboxes, num_shards): - """Process and save list of images as TFRecord of Example protos. - - Args: - name: string, unique identifier specifying the data set - filenames: list of strings; each string is a path to an image file - synsets: list of strings; each string is a unique WordNet ID - labels: list of integer; each integer identifies the ground truth - humans: list of strings; each string is a human-readable label - bboxes: list of bounding boxes for each image. Note that each entry in this - list might contain from 0+ entries corresponding to the number of bounding - box annotations for the image. - num_shards: integer number of shards for this data set. 
- """ - assert len(filenames) == len(synsets) - assert len(filenames) == len(labels) - assert len(filenames) == len(humans) - assert len(filenames) == len(bboxes) - - # Break all images into batches with a [ranges[i][0], ranges[i][1]]. - spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(np.int) - ranges = [] - threads = [] - for i in range(len(spacing) - 1): - ranges.append([spacing[i], spacing[i+1]]) - - # Launch a thread for each batch. - print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) - sys.stdout.flush() - - # Create a mechanism for monitoring when all threads are finished. - coord = tf.train.Coordinator() - - # Create a generic TensorFlow-based utility for converting all image codings. - coder = ImageCoder() - - threads = [] - for thread_index in range(len(ranges)): - args = (coder, thread_index, ranges, name, filenames, - synsets, labels, humans, bboxes, num_shards) - t = threading.Thread(target=_process_image_files_batch, args=args) - t.start() - threads.append(t) - - # Wait for all the threads to terminate. - coord.join(threads) - print('%s: Finished writing all %d images in data set.' % - (datetime.now(), len(filenames))) - sys.stdout.flush() - - -def _find_image_files(data_dir, labels_file): - """Build a list of all images files and labels in the data set. - - Args: - data_dir: string, path to the root directory of images. - - Assumes that the ImageNet data set resides in JPEG files located in - the following directory structure. - - data_dir/n01440764/ILSVRC2012_val_00000293.JPEG - data_dir/n01440764/ILSVRC2012_val_00000543.JPEG - - where 'n01440764' is the unique synset label associated with these images. - - labels_file: string, path to the labels file. - - The list of valid labels are held in this file. Assumes that the file - contains entries as such: - n01440764 - n01443537 - n01484850 - where each line corresponds to a label expressed as a synset. 
We map - each synset contained in the file to an integer (based on the alphabetical - ordering) starting with the integer 1 corresponding to the synset - contained in the first line. - - The reason we start the integer labels at 1 is to reserve label 0 as an - unused background class. - - Returns: - filenames: list of strings; each string is a path to an image file. - synsets: list of strings; each string is a unique WordNet ID. - labels: list of integer; each integer identifies the ground truth. - """ - print('Determining list of input files and labels from %s.' % data_dir) - challenge_synsets = [l.strip() for l in - tf.gfile.FastGFile(labels_file, 'r').readlines()] - - labels = [] - filenames = [] - synsets = [] - - # Leave label index 0 empty as a background class. - label_index = 1 - - # Construct the list of JPEG files and labels. - for synset in challenge_synsets: - jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset) - matching_files = tf.gfile.Glob(jpeg_file_path) - - labels.extend([label_index] * len(matching_files)) - synsets.extend([synset] * len(matching_files)) - filenames.extend(matching_files) - - if not label_index % 100: - print('Finished finding files in %d of %d classes.' % ( - label_index, len(challenge_synsets))) - label_index += 1 - - # Shuffle the ordering of all image files in order to guarantee - # random ordering of the images with respect to label in the - # saved TFRecord files. Make the randomization repeatable. - shuffled_index = list(range(len(filenames))) - random.seed(12345) - random.shuffle(shuffled_index) - - filenames = [filenames[i] for i in shuffled_index] - synsets = [synsets[i] for i in shuffled_index] - labels = [labels[i] for i in shuffled_index] - - print('Found %d JPEG files across %d labels inside %s.' % - (len(filenames), len(challenge_synsets), data_dir)) - return filenames, synsets, labels - - -def _find_human_readable_labels(synsets, synset_to_human): - """Build a list of human-readable labels. 
- - Args: - synsets: list of strings; each string is a unique WordNet ID. - synset_to_human: dict of synset to human labels, e.g., - 'n02119022' --> 'red fox, Vulpes vulpes' - - Returns: - List of human-readable strings corresponding to each synset. - """ - humans = [] - for s in synsets: - assert s in synset_to_human, ('Failed to find: %s' % s) - humans.append(synset_to_human[s]) - return humans - - -def _find_image_bounding_boxes(filenames, image_to_bboxes): - """Find the bounding boxes for a given image file. - - Args: - filenames: list of strings; each string is a path to an image file. - image_to_bboxes: dictionary mapping image file names to a list of - bounding boxes. This list contains 0+ bounding boxes. - Returns: - List of bounding boxes for each image. Note that each entry in this - list might contain from 0+ entries corresponding to the number of bounding - box annotations for the image. - """ - num_image_bbox = 0 - bboxes = [] - for f in filenames: - basename = os.path.basename(f) - if basename in image_to_bboxes: - bboxes.append(image_to_bboxes[basename]) - num_image_bbox += 1 - else: - bboxes.append([]) - print('Found %d images with bboxes out of %d images' % ( - num_image_bbox, len(filenames))) - return bboxes - - -def _process_dataset(name, directory, num_shards, synset_to_human, - image_to_bboxes): - """Process a complete data set and save it as a TFRecord. - - Args: - name: string, unique identifier specifying the data set. - directory: string, root path to the data set. - num_shards: integer number of shards for this data set. - synset_to_human: dict of synset to human labels, e.g., - 'n02119022' --> 'red fox, Vulpes vulpes' - image_to_bboxes: dictionary mapping image file names to a list of - bounding boxes. This list contains 0+ bounding boxes. 
- """ - filenames, synsets, labels = _find_image_files(directory, FLAGS.labels_file) - humans = _find_human_readable_labels(synsets, synset_to_human) - bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes) - _process_image_files(name, filenames, synsets, labels, - humans, bboxes, num_shards) - - -def _build_synset_lookup(imagenet_metadata_file): - """Build lookup for synset to human-readable label. - - Args: - imagenet_metadata_file: string, path to file containing mapping from - synset to human-readable label. - - Assumes each line of the file looks like: - - n02119247 black fox - n02119359 silver fox - n02119477 red fox, Vulpes fulva - - where each line corresponds to a unique mapping. Note that each line is - formatted as \t. - - Returns: - Dictionary of synset to human labels, such as: - 'n02119022' --> 'red fox, Vulpes vulpes' - """ - lines = tf.gfile.FastGFile(imagenet_metadata_file, 'r').readlines() - synset_to_human = {} - for l in lines: - if l: - parts = l.strip().split('\t') - assert len(parts) == 2 - synset = parts[0] - human = parts[1] - synset_to_human[synset] = human - return synset_to_human - - -def _build_bounding_box_lookup(bounding_box_file): - """Build a lookup from image file to bounding boxes. - - Args: - bounding_box_file: string, path to file with bounding boxes annotations. - - Assumes each line of the file looks like: - - n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940 - - where each line corresponds to one bounding box annotation associated - with an image. Each line can be parsed as: - - , , , , - - Note that there might exist mulitple bounding box annotations associated - with an image file. This file is the output of process_bounding_boxes.py. - - Returns: - Dictionary mapping image file names to a list of bounding boxes. This list - contains 0+ bounding boxes. 
- """ - lines = tf.gfile.FastGFile(bounding_box_file, 'r').readlines() - images_to_bboxes = {} - num_bbox = 0 - num_image = 0 - for l in lines: - if l: - parts = l.split(',') - assert len(parts) == 5, ('Failed to parse: %s' % l) - filename = parts[0] - xmin = float(parts[1]) - ymin = float(parts[2]) - xmax = float(parts[3]) - ymax = float(parts[4]) - box = [xmin, ymin, xmax, ymax] - - if filename not in images_to_bboxes: - images_to_bboxes[filename] = [] - num_image += 1 - images_to_bboxes[filename].append(box) - num_bbox += 1 - - print('Successfully read %d bounding boxes ' - 'across %d images.' % (num_bbox, num_image)) - return images_to_bboxes - - -def main(unused_argv): - assert not FLAGS.train_shards % FLAGS.num_threads, ( - 'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards') - assert not FLAGS.validation_shards % FLAGS.num_threads, ( - 'Please make the FLAGS.num_threads commensurate with ' - 'FLAGS.validation_shards') - print('Saving results to %s' % FLAGS.output_directory) - - # Build a map from synset to human-readable label. - synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file) - image_to_bboxes = _build_bounding_box_lookup(FLAGS.bounding_box_file) - - # Run it! - _process_dataset('validation', FLAGS.validation_directory, - FLAGS.validation_shards, synset_to_human, image_to_bboxes) - _process_dataset('train', FLAGS.train_directory, FLAGS.train_shards, - synset_to_human, image_to_bboxes) - - -if __name__ == '__main__': - tf.app.run() diff --git a/examples/imagenet/inception/data/download_and_preprocess_flowers.sh b/examples/imagenet/inception/data/download_and_preprocess_flowers.sh deleted file mode 100755 index 37ff5fa5..00000000 --- a/examples/imagenet/inception/data/download_and_preprocess_flowers.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# Copyright 2016 Google Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Script to download and preprocess the flowers data set. This data set -# provides a demonstration for how to perform fine-tuning (i.e. tranfer -# learning) from one model to a new data set. -# -# This script provides a demonstration for how to prepare an arbitrary -# data set for training an Inception v3 model. -# -# We demonstrate this with the flowers data set which consists of images -# of labeled flower images from 5 classes: -# -# daisy, dandelion, roses, sunflowers, tulips -# -# The final output of this script are sharded TFRecord files containing -# serialized Example protocol buffers. See build_image_data.py for -# details of how the Example protocol buffer contains image data. -# -# usage: -# ./download_and_preprocess_flowers.sh [data-dir] -set -e - -if [ -z "$1" ]; then - echo "usage download_and_preprocess_flowers.sh [data dir]" - exit -fi - -# Create the output and temporary directories. -DATA_DIR="${1%/}" -SCRATCH_DIR="${DATA_DIR}/raw-data/" -mkdir -p "${DATA_DIR}" -mkdir -p "${SCRATCH_DIR}" -WORK_DIR="$0.runfiles/inception/inception" - -# Download the flowers data. -DATA_URL="http://download.tensorflow.org/example_images/flower_photos.tgz" -CURRENT_DIR=$(pwd) -cd "${DATA_DIR}" -TARBALL="flower_photos.tgz" -if [ ! -f ${TARBALL} ]; then - echo "Downloading flower data set." 
- wget -O ${TARBALL} "${DATA_URL}" -else - echo "Skipping download of flower data." -fi - -# Note the locations of the train and validation data. -TRAIN_DIRECTORY="${SCRATCH_DIR}train/" -VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/" - -# Expands the data into the flower_photos/ directory and rename it as the -# train directory. -tar xf flower_photos.tgz -rm -rf "${TRAIN_DIRECTORY}" "${VALIDATION_DIRECTORY}" -mv flower_photos "${TRAIN_DIRECTORY}" - -# Generate a list of 5 labels: daisy, dandelion, roses, sunflowers, tulips -LABELS_FILE="${SCRATCH_DIR}/labels.txt" -ls -1 "${TRAIN_DIRECTORY}" | grep -v 'LICENSE' | sed 's/\///' | sort > "${LABELS_FILE}" - -# Generate the validation data set. -while read LABEL; do - VALIDATION_DIR_FOR_LABEL="${VALIDATION_DIRECTORY}${LABEL}" - TRAIN_DIR_FOR_LABEL="${TRAIN_DIRECTORY}${LABEL}" - - # Move the first randomly selected 100 images to the validation set. - mkdir -p "${VALIDATION_DIR_FOR_LABEL}" - VALIDATION_IMAGES=$(ls -1 "${TRAIN_DIR_FOR_LABEL}" | shuf | head -100) - for IMAGE in ${VALIDATION_IMAGES}; do - mv -f "${TRAIN_DIRECTORY}${LABEL}/${IMAGE}" "${VALIDATION_DIR_FOR_LABEL}" - done -done < "${LABELS_FILE}" - -# Build the TFRecords version of the image data. -cd "${CURRENT_DIR}" -BUILD_SCRIPT="${WORK_DIR}/build_image_data" -OUTPUT_DIRECTORY="${DATA_DIR}" -"${BUILD_SCRIPT}" \ - --train_directory="${TRAIN_DIRECTORY}" \ - --validation_directory="${VALIDATION_DIRECTORY}" \ - --output_directory="${OUTPUT_DIRECTORY}" \ - --labels_file="${LABELS_FILE}" diff --git a/examples/imagenet/inception/data/download_and_preprocess_flowers_mac.sh b/examples/imagenet/inception/data/download_and_preprocess_flowers_mac.sh deleted file mode 100644 index 794301b4..00000000 --- a/examples/imagenet/inception/data/download_and_preprocess_flowers_mac.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# Copyright 2016 Google Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Script to download and preprocess the flowers data set. This data set -# provides a demonstration for how to perform fine-tuning (i.e. tranfer -# learning) from one model to a new data set. -# -# This script provides a demonstration for how to prepare an arbitrary -# data set for training an Inception v3 model. -# -# We demonstrate this with the flowers data set which consists of images -# of labeled flower images from 5 classes: -# -# daisy, dandelion, roses, sunflowers, tulips -# -# The final output of this script are sharded TFRecord files containing -# serialized Example protocol buffers. See build_image_data.py for -# details of how the Example protocol buffer contains image data. -# -# usage: -# ./download_and_preprocess_flowers.sh [data-dir] -set -e - -if [ -z "$1" ]; then - echo "usage download_and_preprocess_flowers.sh [data dir]" - exit -fi - -# Create the output and temporary directories. -DATA_DIR="${1%/}" -SCRATCH_DIR="${DATA_DIR}/raw-data/" -mkdir -p "${DATA_DIR}" -mkdir -p "${SCRATCH_DIR}" -WORK_DIR="$0.runfiles/inception/inception" - -# Download the flowers data. -DATA_URL="http://download.tensorflow.org/example_images/flower_photos.tgz" -CURRENT_DIR=$(pwd) -cd "${DATA_DIR}" -TARBALL="flower_photos.tgz" -if [ ! -f ${TARBALL} ]; then - echo "Downloading flower data set." 
- wget -O ${TARBALL} "${DATA_URL}" -else - echo "Skipping download of flower data." -fi - -# Note the locations of the train and validation data. -TRAIN_DIRECTORY="${SCRATCH_DIR}train/" -VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/" - -# Expands the data into the flower_photos/ directory and rename it as the -# train directory. -tar xf flower_photos.tgz -rm -rf "${TRAIN_DIRECTORY}" "${VALIDATION_DIRECTORY}" -mv flower_photos "${TRAIN_DIRECTORY}" - -# Generate a list of 5 labels: daisy, dandelion, roses, sunflowers, tulips -LABELS_FILE="${SCRATCH_DIR}/labels.txt" -ls -1 "${TRAIN_DIRECTORY}" | grep -v 'LICENSE' | sed 's/\///' | sort > "${LABELS_FILE}" - -# Generate the validation data set. -while read LABEL; do - VALIDATION_DIR_FOR_LABEL="${VALIDATION_DIRECTORY}${LABEL}" - TRAIN_DIR_FOR_LABEL="${TRAIN_DIRECTORY}${LABEL}" - - # Move the first randomly selected 100 images to the validation set. - mkdir -p "${VALIDATION_DIR_FOR_LABEL}" - VALIDATION_IMAGES=$(ls -1 "${TRAIN_DIR_FOR_LABEL}" | gshuf | head -100) - for IMAGE in ${VALIDATION_IMAGES}; do - mv -f "${TRAIN_DIRECTORY}${LABEL}/${IMAGE}" "${VALIDATION_DIR_FOR_LABEL}" - done -done < "${LABELS_FILE}" - -# Build the TFRecords version of the image data. -cd "${CURRENT_DIR}" -BUILD_SCRIPT="${WORK_DIR}/build_image_data" -OUTPUT_DIRECTORY="${DATA_DIR}" -"${BUILD_SCRIPT}" \ - --train_directory="${TRAIN_DIRECTORY}" \ - --validation_directory="${VALIDATION_DIRECTORY}" \ - --output_directory="${OUTPUT_DIRECTORY}" \ - --labels_file="${LABELS_FILE}" diff --git a/examples/imagenet/inception/data/download_and_preprocess_imagenet.sh b/examples/imagenet/inception/data/download_and_preprocess_imagenet.sh deleted file mode 100755 index 682ade70..00000000 --- a/examples/imagenet/inception/data/download_and_preprocess_imagenet.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -# Copyright 2016 Google Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Script to download and preprocess ImageNet Challenge 2012 -# training and validation data set. -# -# The final output of this script are sharded TFRecord files containing -# serialized Example protocol buffers. See build_imagenet_data.py for -# details of how the Example protocol buffers contain the ImageNet data. -# -# The final output of this script appears as such: -# -# data_dir/train-00000-of-01024 -# data_dir/train-00001-of-01024 -# ... -# data_dir/train-00127-of-01024 -# -# and -# -# data_dir/validation-00000-of-00128 -# data_dir/validation-00001-of-00128 -# ... -# data_dir/validation-00127-of-00128 -# -# Note that this script may take several hours to run to completion. The -# conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending -# on the speed of your machine. Please be patient. -# -# **IMPORTANT** -# To download the raw images, the user must create an account with image-net.org -# and generate a username and access_key. The latter two are required for -# downloading the raw images. -# -# usage: -# ./download_and_preprocess_imagenet.sh [data-dir] -set -e - -if [ -z "$1" ]; then - echo "usage download_and_preprocess_imagenet.sh [data dir]" - exit -fi - -# Create the output and temporary directories. 
-DATA_DIR="${1%/}" -SCRATCH_DIR="${DATA_DIR}/raw-data/" -mkdir -p "${DATA_DIR}" -mkdir -p "${SCRATCH_DIR}" -WORK_DIR="$0.runfiles/inception/inception" - -# Download the ImageNet data. -LABELS_FILE="${WORK_DIR}/data/imagenet_lsvrc_2015_synsets.txt" -DOWNLOAD_SCRIPT="${WORK_DIR}/data/download_imagenet.sh" -"${DOWNLOAD_SCRIPT}" "${SCRATCH_DIR}" "${LABELS_FILE}" - -# Note the locations of the train and validation data. -TRAIN_DIRECTORY="${SCRATCH_DIR}train/" -VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/" - -# Preprocess the validation data by moving the images into the appropriate -# sub-directory based on the label (synset) of the image. -echo "Organizing the validation data into sub-directories." -PREPROCESS_VAL_SCRIPT="${WORK_DIR}/data/preprocess_imagenet_validation_data.py" -VAL_LABELS_FILE="${WORK_DIR}/data/imagenet_2012_validation_synset_labels.txt" - -"${PREPROCESS_VAL_SCRIPT}" "${VALIDATION_DIRECTORY}" "${VAL_LABELS_FILE}" - -# Convert the XML files for bounding box annotations into a single CSV. -echo "Extracting bounding box information from XML." -BOUNDING_BOX_SCRIPT="${WORK_DIR}/data/process_bounding_boxes.py" -BOUNDING_BOX_FILE="${SCRATCH_DIR}/imagenet_2012_bounding_boxes.csv" -BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/" - -"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \ - | sort >"${BOUNDING_BOX_FILE}" -echo "Finished downloading and preprocessing the ImageNet data." - -# Build the TFRecords version of the ImageNet data. 
-BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data" -OUTPUT_DIRECTORY="${DATA_DIR}" -IMAGENET_METADATA_FILE="${WORK_DIR}/data/imagenet_metadata.txt" - -"${BUILD_SCRIPT}" \ - --train_directory="${TRAIN_DIRECTORY}" \ - --validation_directory="${VALIDATION_DIRECTORY}" \ - --output_directory="${OUTPUT_DIRECTORY}" \ - --imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \ - --labels_file="${LABELS_FILE}" \ - --bounding_box_file="${BOUNDING_BOX_FILE}" diff --git a/examples/imagenet/inception/data/download_imagenet.sh b/examples/imagenet/inception/data/download_imagenet.sh deleted file mode 100755 index 2611c538..00000000 --- a/examples/imagenet/inception/data/download_imagenet.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Script to download ImageNet Challenge 2012 training and validation data set. -# -# Downloads and decompresses raw images and bounding boxes. -# -# **IMPORTANT** -# To download the raw images, the user must create an account with image-net.org -# and generate a username and access_key. The latter two are required for -# downloading the raw images. 
-# -# usage: -# ./download_imagenet.sh [dirname] -set -e - -if [ "x$IMAGENET_ACCESS_KEY" == x -o "x$IMAGENET_USERNAME" == x ]; then - cat < ') - sys.exit(-1) - data_dir = sys.argv[1] - validation_labels_file = sys.argv[2] - - # Read in the 50000 synsets associated with the validation data set. - labels = [l.strip() for l in open(validation_labels_file).readlines()] - unique_labels = set(labels) - - # Make all sub-directories in the validation data dir. - for label in unique_labels: - labeled_data_dir = os.path.join(data_dir, label) - os.makedirs(labeled_data_dir) - - # Move all of the image to the appropriate sub-directory. - for i in range(len(labels)): - basename = 'ILSVRC2012_val_000%.5d.JPEG' % (i + 1) - original_filename = os.path.join(data_dir, basename) - if not os.path.exists(original_filename): - print('Failed to find: ' % original_filename) - sys.exit(-1) - new_filename = os.path.join(data_dir, labels[i], basename) - os.rename(original_filename, new_filename) diff --git a/examples/imagenet/inception/data/process_bounding_boxes.py b/examples/imagenet/inception/data/process_bounding_boxes.py deleted file mode 100755 index 5e9fd786..00000000 --- a/examples/imagenet/inception/data/process_bounding_boxes.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/python -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Process the ImageNet Challenge bounding boxes for TensorFlow model training. - -This script is called as - -process_bounding_boxes.py [synsets-file] - -Where is a directory containing the downloaded and unpacked bounding box -data. If [synsets-file] is supplied, then only the bounding boxes whose -synstes are contained within this file are returned. Note that the -[synsets-file] file contains synset ids, one per line. - -The script dumps out a CSV text file in which each line contains an entry. - n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940 - -The entry can be read as: - , , , , - -The bounding box for contains two points (xmin, ymin) and -(xmax, ymax) specifying the lower-left corner and upper-right corner of a -bounding box in *relative* coordinates. - -The user supplies a directory where the XML files reside. The directory -structure in the directory is assumed to look like this: - -/nXXXXXXXX/nXXXXXXXX_YYYY.xml - -Each XML file contains a bounding box annotation. The script: - - (1) Parses the XML file and extracts the filename, label and bounding box info. - - (2) The bounding box is specified in the XML files as integer (xmin, ymin) and - (xmax, ymax) *relative* to image size displayed to the human annotator. The - size of the image displayed to the human annotator is stored in the XML file - as integer (height, width). - - Note that the displayed size will differ from the actual size of the image - downloaded from image-net.org. To make the bounding box annotation useable, - we convert bounding box to floating point numbers relative to displayed - height and width of the image. - - Note that each XML file might contain N bounding box annotations. - - Note that the points are all clamped at a range of [0.0, 1.0] because some - human annotations extend outside the range of the supplied image. 
- - See details here: http://image-net.org/download-bboxes - -(3) By default, the script outputs all valid bounding boxes. If a - [synsets-file] is supplied, only the subset of bounding boxes associated - with those synsets are outputted. Importantly, one can supply a list of - synsets in the ImageNet Challenge and output the list of bounding boxes - associated with the training images of the ILSVRC. - - We use these bounding boxes to inform the random distortion of images - supplied to the network. - -If you run this script successfully, you will see the following output -to stderr: -> Finished processing 544546 XML files. -> Skipped 0 XML files not in ImageNet Challenge. -> Skipped 0 bounding boxes not in ImageNet Challenge. -> Wrote 615299 bounding boxes from 544546 annotated images. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import glob -import os.path -import sys -import xml.etree.ElementTree as ET - - -class BoundingBox(object): - pass - - -def GetItem(name, root, index=0): - count = 0 - for item in root.iter(name): - if count == index: - return item.text - count += 1 - # Failed to find "index" occurrence of item. - return -1 - - -def GetInt(name, root, index=0): - # In some XML annotation files, the point values are not integers, but floats. - # So we add a float function to avoid ValueError. 
- return int(float(GetItem(name, root, index))) - - -def FindNumberBoundingBoxes(root): - index = 0 - while True: - if GetInt('xmin', root, index) == -1: - break - index += 1 - return index - - -def ProcessXMLAnnotation(xml_file): - """Process a single XML file containing a bounding box.""" - # pylint: disable=broad-except - try: - tree = ET.parse(xml_file) - except Exception: - print('Failed to parse: ' + xml_file, file=sys.stderr) - return None - # pylint: enable=broad-except - root = tree.getroot() - - num_boxes = FindNumberBoundingBoxes(root) - boxes = [] - - for index in range(num_boxes): - box = BoundingBox() - # Grab the 'index' annotation. - box.xmin = GetInt('xmin', root, index) - box.ymin = GetInt('ymin', root, index) - box.xmax = GetInt('xmax', root, index) - box.ymax = GetInt('ymax', root, index) - - box.width = GetInt('width', root) - box.height = GetInt('height', root) - box.filename = GetItem('filename', root) + '.JPEG' - box.label = GetItem('name', root) - - xmin = float(box.xmin) / float(box.width) - xmax = float(box.xmax) / float(box.width) - ymin = float(box.ymin) / float(box.height) - ymax = float(box.ymax) / float(box.height) - - # Some images contain bounding box annotations that - # extend outside of the supplied image. See, e.g. - # n03127925/n03127925_147.xml - # Additionally, for some bounding boxes, the min > max - # or the box is entirely outside of the image. 
- min_x = min(xmin, xmax) - max_x = max(xmin, xmax) - box.xmin_scaled = min(max(min_x, 0.0), 1.0) - box.xmax_scaled = min(max(max_x, 0.0), 1.0) - - min_y = min(ymin, ymax) - max_y = max(ymin, ymax) - box.ymin_scaled = min(max(min_y, 0.0), 1.0) - box.ymax_scaled = min(max(max_y, 0.0), 1.0) - - boxes.append(box) - - return boxes - -if __name__ == '__main__': - if len(sys.argv) < 2 or len(sys.argv) > 3: - print('Invalid usage\n' - 'usage: process_bounding_boxes.py [synsets-file]', - file=sys.stderr) - sys.exit(-1) - - xml_files = glob.glob(sys.argv[1] + '/*/*.xml') - print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]), - file=sys.stderr) - - if len(sys.argv) == 3: - labels = set([l.strip() for l in open(sys.argv[2]).readlines()]) - print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]), - file=sys.stderr) - else: - labels = None - - skipped_boxes = 0 - skipped_files = 0 - saved_boxes = 0 - saved_files = 0 - for file_index, one_file in enumerate(xml_files): - # Example: <...>/n06470073/n00141669_6790.xml - label = os.path.basename(os.path.dirname(one_file)) - - # Determine if the annotation is from an ImageNet Challenge label. - if labels is not None and label not in labels: - skipped_files += 1 - continue - - bboxes = ProcessXMLAnnotation(one_file) - assert bboxes is not None, 'No bounding boxes found in ' + one_file - - found_box = False - for bbox in bboxes: - if labels is not None: - if bbox.label != label: - # Note: There is a slight bug in the bounding box annotation data. - # Many of the dog labels have the human label 'Scottish_deerhound' - # instead of the synset ID 'n02092002' in the bbox.label field. As a - # simple hack to overcome this issue, we only exclude bbox labels - # *which are synset ID's* that do not match original synset label for - # the XML file. - if bbox.label in labels: - skipped_boxes += 1 - continue - - # Guard against improperly specified boxes. 
- if (bbox.xmin_scaled >= bbox.xmax_scaled or - bbox.ymin_scaled >= bbox.ymax_scaled): - skipped_boxes += 1 - continue - - # Note bbox.filename occasionally contains '%s' in the name. This is - # data set noise that is fixed by just using the basename of the XML file. - image_filename = os.path.splitext(os.path.basename(one_file))[0] - print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' % - (image_filename, - bbox.xmin_scaled, bbox.ymin_scaled, - bbox.xmax_scaled, bbox.ymax_scaled)) - - saved_boxes += 1 - found_box = True - if found_box: - saved_files += 1 - else: - skipped_files += 1 - - if not file_index % 5000: - print('--> processed %d of %d XML files.' % - (file_index + 1, len(xml_files)), - file=sys.stderr) - print('--> skipped %d boxes and %d XML files.' % - (skipped_boxes, skipped_files), file=sys.stderr) - - print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr) - print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files, - file=sys.stderr) - print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes, - file=sys.stderr) - print('Wrote %d bounding boxes from %d annotated images.' % - (saved_boxes, saved_files), - file=sys.stderr) - print('Finished.', file=sys.stderr) diff --git a/examples/imagenet/inception/dataset.py b/examples/imagenet/inception/dataset.py deleted file mode 100644 index 752c97e0..00000000 --- a/examples/imagenet/inception/dataset.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Small library that points to a data set. - -Methods of Data class: - data_files: Returns a python list of all (sharded) data set files. - num_examples_per_epoch: Returns the number of examples in the data set. - num_classes: Returns the number of classes in the data set. - reader: Return a reader for a single entry from the data set. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from abc import ABCMeta -from abc import abstractmethod -import os - - -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -# Basic model parameters. -tf.app.flags.DEFINE_string('data_dir', '/tmp/mydata', - """Path to the processed data, i.e. """ - """TFRecord of Example protos.""") - - -class Dataset(object): - """A simple class for handling data sets.""" - __metaclass__ = ABCMeta - - def __init__(self, name, subset): - """Initialize dataset using a subset and the path to the data.""" - assert subset in self.available_subsets(), self.available_subsets() - self.name = name - self.subset = subset - - @abstractmethod - def num_classes(self): - """Returns the number of classes in the data set.""" - pass - # return 10 - - @abstractmethod - def num_examples_per_epoch(self): - """Returns the number of examples in the data subset.""" - pass - # if self.subset == 'train': - # return 10000 - # if self.subset == 'validation': - # return 1000 - - @abstractmethod - def download_message(self): - """Prints a download message for the Dataset.""" - pass - - def available_subsets(self): - """Returns the list of available subsets.""" - return ['train', 'validation'] - - def data_files(self): - """Returns a python list of all (sharded) data subset files. - - Returns: - python list of all (sharded) data set files. 
- Raises: - ValueError: if there are not data_files matching the subset. - """ - tf_record_pattern = os.path.join(FLAGS.data_dir, '%s-*' % self.subset) - data_files = tf.gfile.Glob(tf_record_pattern) - if not data_files: - print('No files found for dataset %s/%s at %s' % (self.name, - self.subset, - FLAGS.data_dir)) - - self.download_message() - exit(-1) - return data_files - - def reader(self): - """Return a reader for a single entry from the data set. - - See io_ops.py for details of Reader class. - - Returns: - Reader object that reads the data set. - """ - return tf.TFRecordReader() diff --git a/examples/imagenet/inception/flowers_data.py b/examples/imagenet/inception/flowers_data.py deleted file mode 100644 index 022b5234..00000000 --- a/examples/imagenet/inception/flowers_data.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Small library that points to the flowers data set. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - - -from inception.dataset import Dataset - - -class FlowersData(Dataset): - """Flowers data set.""" - - def __init__(self, subset): - super(FlowersData, self).__init__('Flowers', subset) - - def num_classes(self): - """Returns the number of classes in the data set.""" - return 5 - - def num_examples_per_epoch(self): - """Returns the number of examples in the data subset.""" - if self.subset == 'train': - return 3170 - if self.subset == 'validation': - return 500 - - def download_message(self): - """Instruction to download and extract the tarball from Flowers website.""" - - print('Failed to find any Flowers %s files'% self.subset) - print('') - print('If you have already downloaded and processed the data, then make ' - 'sure to set --data_dir to point to the directory containing the ' - 'location of the sharded TFRecords.\n') - print('Please see README.md for instructions on how to build ' - 'the flowers dataset using download_and_preprocess_flowers.\n') diff --git a/examples/imagenet/inception/flowers_eval.py b/examples/imagenet/inception/flowers_eval.py deleted file mode 100644 index ae3e9dc1..00000000 --- a/examples/imagenet/inception/flowers_eval.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A binary to evaluate Inception on the flowers data set. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -import tensorflow as tf - -from inception import inception_eval -from inception.flowers_data import FlowersData - -FLAGS = tf.app.flags.FLAGS - - -def main(unused_argv=None): - dataset = FlowersData(subset=FLAGS.subset) - assert dataset.data_files() - if tf.gfile.Exists(FLAGS.eval_dir): - tf.gfile.DeleteRecursively(FLAGS.eval_dir) - tf.gfile.MakeDirs(FLAGS.eval_dir) - inception_eval.evaluate(dataset) - - -if __name__ == '__main__': - tf.app.run() diff --git a/examples/imagenet/inception/flowers_train.py b/examples/imagenet/inception/flowers_train.py deleted file mode 100644 index 1f044a53..00000000 --- a/examples/imagenet/inception/flowers_train.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A binary to train Inception on the flowers data set. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - - -import tensorflow as tf - -from inception import inception_train -from inception.flowers_data import FlowersData - -FLAGS = tf.app.flags.FLAGS - - -def main(_): - dataset = FlowersData(subset=FLAGS.subset) - assert dataset.data_files() - if tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.DeleteRecursively(FLAGS.train_dir) - tf.gfile.MakeDirs(FLAGS.train_dir) - inception_train.train(dataset) - - -if __name__ == '__main__': - tf.app.run() diff --git a/examples/imagenet/inception/image_processing.py b/examples/imagenet/inception/image_processing.py deleted file mode 100644 index df168120..00000000 --- a/examples/imagenet/inception/image_processing.py +++ /dev/null @@ -1,513 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Read and preprocess image data. - - Image processing occurs on a single image at a time. Image are read and - preprocessed in parallel across multiple threads. The resulting images - are concatenated together to form a single batch for training or evaluation. - - -- Provide processed image data for a network: - inputs: Construct batches of evaluation examples of images. - distorted_inputs: Construct batches of training examples of images. 
- batch_inputs: Construct batches of training or evaluation examples of images. - - -- Data processing: - parse_example_proto: Parses an Example proto containing a training example - of an image. - - -- Image decoding: - decode_jpeg: Decode a JPEG encoded string into a 3-D float32 Tensor. - - -- Image preprocessing: - image_preprocessing: Decode and preprocess one image for evaluation or training - distort_image: Distort one image for training a network. - eval_image: Prepare one image for evaluation. - distort_color: Distort the color in one image for training. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 32, - """Number of images to process in a batch.""") -tf.app.flags.DEFINE_integer('image_size', 299, - """Provide square images of this size.""") -tf.app.flags.DEFINE_integer('num_preprocess_threads', 4, - """Number of preprocessing threads per tower. """ - """Please make this a multiple of 4.""") -tf.app.flags.DEFINE_integer('num_readers', 4, - """Number of parallel readers during train.""") - -# Images are preprocessed asynchronously using multiple threads specified by -# --num_preprocss_threads and the resulting processed images are stored in a -# random shuffling queue. The shuffling queue dequeues --batch_size images -# for processing on a given Inception tower. A larger shuffling queue guarantees -# better mixing across examples within a batch and results in slightly higher -# predictive performance in a trained model. Empirically, -# --input_queue_memory_factor=16 works well. A value of 16 implies a queue size -# of 1024*16 images. Assuming RGB 299x299 images, this implies a queue size of -# 16GB. If the machine is memory limited, then decrease this factor to -# decrease the CPU memory footprint, accordingly. 
-tf.app.flags.DEFINE_integer('input_queue_memory_factor', 16, - """Size of the queue of preprocessed images. """ - """Default is ideal but try smaller values, e.g. """ - """4, 2 or 1, if host memory is constrained. See """ - """comments in code for more details.""") - - -def inputs(dataset, batch_size=None, num_preprocess_threads=None): - """Generate batches of ImageNet images for evaluation. - - Use this function as the inputs for evaluating a network. - - Note that some (minimal) image preprocessing occurs during evaluation - including central cropping and resizing of the image to fit the network. - - Args: - dataset: instance of Dataset class specifying the dataset. - batch_size: integer, number of examples in batch - num_preprocess_threads: integer, total number of preprocessing threads but - None defaults to FLAGS.num_preprocess_threads. - - Returns: - images: Images. 4D tensor of size [batch_size, FLAGS.image_size, - image_size, 3]. - labels: 1-D integer Tensor of [FLAGS.batch_size]. - """ - if not batch_size: - batch_size = FLAGS.batch_size - - # Force all input processing onto CPU in order to reserve the GPU for - # the forward inference and back-propagation. - with tf.device('/cpu:0'): - images, labels = batch_inputs( - dataset, batch_size, train=False, - num_preprocess_threads=num_preprocess_threads, - num_readers=1) - - return images, labels - - -def distorted_inputs(dataset, batch_size=None, num_preprocess_threads=None): - """Generate batches of distorted versions of ImageNet images. - - Use this function as the inputs for training a network. - - Distorting images provides a useful technique for augmenting the data - set during training in order to make the network invariant to aspects - of the image that do not effect the label. - - Args: - dataset: instance of Dataset class specifying the dataset. 
- batch_size: integer, number of examples in batch - num_preprocess_threads: integer, total number of preprocessing threads but - None defaults to FLAGS.num_preprocess_threads. - - Returns: - images: Images. 4D tensor of size [batch_size, FLAGS.image_size, - FLAGS.image_size, 3]. - labels: 1-D integer Tensor of [batch_size]. - """ - if not batch_size: - batch_size = FLAGS.batch_size - - # Force all input processing onto CPU in order to reserve the GPU for - # the forward inference and back-propagation. - with tf.device('/cpu:0'): - images, labels = batch_inputs( - dataset, batch_size, train=True, - num_preprocess_threads=num_preprocess_threads, - num_readers=FLAGS.num_readers) - return images, labels - - -def decode_jpeg(image_buffer, scope=None): - """Decode a JPEG string into one 3-D float image Tensor. - - Args: - image_buffer: scalar string Tensor. - scope: Optional scope for name_scope. - Returns: - 3-D float Tensor with values ranging from [0, 1). - """ - with tf.name_scope(values=[image_buffer], name=scope, - default_name='decode_jpeg'): - # Decode the string as an RGB JPEG. - # Note that the resulting image contains an unknown height and width - # that is set dynamically by decode_jpeg. In other words, the height - # and width of image is unknown at compile-time. - image = tf.image.decode_jpeg(image_buffer, channels=3) - - # After this point, all image pixels reside in [0,1) - # until the very end, when they're rescaled to (-1, 1). The various - # adjust_* ops all require this range for dtype float. - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - return image - - -def distort_color(image, thread_id=0, scope=None): - """Distort the color of the image. - - Each color distortion is non-commutative and thus ordering of the color ops - matters. Ideally we would randomly permute the ordering of the color ops. - Rather then adding that level of complication, we select a distinct ordering - of color ops for each preprocessing thread. 
- - Args: - image: Tensor containing single image. - thread_id: preprocessing thread ID. - scope: Optional scope for name_scope. - Returns: - color-distorted image - """ - with tf.name_scope(values=[image], name=scope, default_name='distort_color'): - color_ordering = thread_id % 2 - - if color_ordering == 0: - image = tf.image.random_brightness(image, max_delta=32. / 255.) - image = tf.image.random_saturation(image, lower=0.5, upper=1.5) - image = tf.image.random_hue(image, max_delta=0.2) - image = tf.image.random_contrast(image, lower=0.5, upper=1.5) - elif color_ordering == 1: - image = tf.image.random_brightness(image, max_delta=32. / 255.) - image = tf.image.random_contrast(image, lower=0.5, upper=1.5) - image = tf.image.random_saturation(image, lower=0.5, upper=1.5) - image = tf.image.random_hue(image, max_delta=0.2) - - # The random_* ops do not necessarily clamp. - image = tf.clip_by_value(image, 0.0, 1.0) - return image - - -def distort_image(image, height, width, bbox, thread_id=0, scope=None): - """Distort one image for training a network. - - Distorting images provides a useful technique for augmenting the data - set during training in order to make the network invariant to aspects - of the image that do not effect the label. - - Args: - image: 3-D float Tensor of image - height: integer - width: integer - bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] - where each coordinate is [0, 1) and the coordinates are arranged - as [ymin, xmin, ymax, xmax]. - thread_id: integer indicating the preprocessing thread. - scope: Optional scope for name_scope. - Returns: - 3-D float Tensor of distorted image used for training. - """ - with tf.name_scope(values=[image, height, width, bbox], name=scope, - default_name='distort_image'): - # Each bounding box has shape [1, num_boxes, box coords] and - # the coordinates are ordered [ymin, xmin, ymax, xmax]. - - # Display the bounding box in the first thread only. 
- if not thread_id: - image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), - bbox) - tf.summary.image('image_with_bounding_boxes', image_with_box) - - # A large fraction of image datasets contain a human-annotated bounding - # box delineating the region of the image containing the object of interest. - # We choose to create a new bounding box for the object which is a randomly - # distorted version of the human-annotated bounding box that obeys an allowed - # range of aspect ratios, sizes and overlap with the human-annotated - # bounding box. If no box is supplied, then we assume the bounding box is - # the entire image. - sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( - tf.shape(image), - bounding_boxes=bbox, - min_object_covered=0.1, - aspect_ratio_range=[0.75, 1.33], - area_range=[0.05, 1.0], - max_attempts=100, - use_image_if_no_bounding_boxes=True) - bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box - if not thread_id: - image_with_distorted_box = tf.image.draw_bounding_boxes( - tf.expand_dims(image, 0), distort_bbox) - tf.summary.image('images_with_distorted_bounding_box', - image_with_distorted_box) - - # Crop the image to the specified bounding box. - distorted_image = tf.slice(image, bbox_begin, bbox_size) - - # This resizing operation may distort the images because the aspect - # ratio is not respected. We select a resize method in a round robin - # fashion based on the thread number. - # Note that ResizeMethod contains 4 enumerated resizing methods. - resize_method = thread_id % 4 - distorted_image = tf.image.resize_images(distorted_image, [height, width], - method=resize_method) - # Restore the shape since the dynamic slice based upon the bbox_size loses - # the third dimension. - distorted_image.set_shape([height, width, 3]) - if not thread_id: - tf.summary.image('cropped_resized_image', - tf.expand_dims(distorted_image, 0)) - - # Randomly flip the image horizontally. 
- distorted_image = tf.image.random_flip_left_right(distorted_image) - - # Randomly distort the colors. - distorted_image = distort_color(distorted_image, thread_id) - - if not thread_id: - tf.summary.image('final_distorted_image', - tf.expand_dims(distorted_image, 0)) - return distorted_image - - -def eval_image(image, height, width, scope=None): - """Prepare one image for evaluation. - - Args: - image: 3-D float Tensor - height: integer - width: integer - scope: Optional scope for name_scope. - Returns: - 3-D float Tensor of prepared image. - """ - with tf.name_scope(values=[image, height, width], name=scope, - default_name='eval_image'): - # Crop the central region of the image with an area containing 87.5% of - # the original image. - image = tf.image.central_crop(image, central_fraction=0.875) - - # Resize the image to the original height and width. - image = tf.expand_dims(image, 0) - image = tf.image.resize_bilinear(image, [height, width], - align_corners=False) - image = tf.squeeze(image, [0]) - return image - - -def image_preprocessing(image_buffer, bbox, train, thread_id=0): - """Decode and preprocess one image for evaluation or training. - - Args: - image_buffer: JPEG encoded string Tensor - bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] - where each coordinate is [0, 1) and the coordinates are arranged as - [ymin, xmin, ymax, xmax]. 
- train: boolean - thread_id: integer indicating preprocessing thread - - Returns: - 3-D float Tensor containing an appropriately scaled image - - Raises: - ValueError: if user does not provide bounding box - """ - if bbox is None: - raise ValueError('Please supply a bounding box.') - - image = decode_jpeg(image_buffer) - height = FLAGS.image_size - width = FLAGS.image_size - - if train: - image = distort_image(image, height, width, bbox, thread_id) - else: - image = eval_image(image, height, width) - - # Finally, rescale to [-1,1] instead of [0, 1) - image = tf.subtract(image, 0.5) - image = tf.multiply(image, 2.0) - return image - - -def parse_example_proto(example_serialized): - """Parses an Example proto containing a training example of an image. - - The output of the build_image_data.py image preprocessing script is a dataset - containing serialized Example protocol buffers. Each Example proto contains - the following fields: - - image/height: 462 - image/width: 581 - image/colorspace: 'RGB' - image/channels: 3 - image/class/label: 615 - image/class/synset: 'n03623198' - image/class/text: 'knee pad' - image/object/bbox/xmin: 0.1 - image/object/bbox/xmax: 0.9 - image/object/bbox/ymin: 0.2 - image/object/bbox/ymax: 0.6 - image/object/bbox/label: 615 - image/format: 'JPEG' - image/filename: 'ILSVRC2012_val_00041207.JPEG' - image/encoded: - - Args: - example_serialized: scalar Tensor tf.string containing a serialized - Example protocol buffer. - - Returns: - image_buffer: Tensor tf.string containing the contents of a JPEG file. - label: Tensor tf.int32 containing the label. - bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] - where each coordinate is [0, 1) and the coordinates are arranged as - [ymin, xmin, ymax, xmax]. - text: Tensor tf.string containing the human-readable label. - """ - # Dense features in Example proto. 
- feature_map = { - 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, - default_value=''), - 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, - default_value=-1), - 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, - default_value=''), - } - sparse_float32 = tf.VarLenFeature(dtype=tf.float32) - # Sparse features in Example proto. - feature_map.update( - {k: sparse_float32 for k in ['image/object/bbox/xmin', - 'image/object/bbox/ymin', - 'image/object/bbox/xmax', - 'image/object/bbox/ymax']}) - - features = tf.parse_single_example(example_serialized, feature_map) - label = tf.cast(features['image/class/label'], dtype=tf.int32) - - xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) - ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) - xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) - ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) - - # Note that we impose an ordering of (y, x) just to make life difficult. - bbox = tf.concat(axis=0, values=[ymin, xmin, ymax, xmax]) - - # Force the variable number of bounding boxes into the shape - # [1, num_boxes, coords]. - bbox = tf.expand_dims(bbox, 0) - bbox = tf.transpose(bbox, [0, 2, 1]) - - return features['image/encoded'], label, bbox, features['image/class/text'] - - -def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None, - num_readers=1): - """Contruct batches of training or evaluation examples from the image dataset. - - Args: - dataset: instance of Dataset class specifying the dataset. - See dataset.py for details. - batch_size: integer - train: boolean - num_preprocess_threads: integer, total number of preprocessing threads - num_readers: integer, number of parallel readers - - Returns: - images: 4-D float Tensor of a batch of images - labels: 1-D integer Tensor of [batch_size]. 
- - Raises: - ValueError: if data is not found - """ - with tf.name_scope('batch_processing'): - data_files = dataset.data_files() - if data_files is None: - raise ValueError('No data files found for this dataset') - - # Create filename_queue - if train: - filename_queue = tf.train.string_input_producer(data_files, - shuffle=True, - capacity=16) - else: - filename_queue = tf.train.string_input_producer(data_files, - shuffle=False, - capacity=1) - if num_preprocess_threads is None: - num_preprocess_threads = FLAGS.num_preprocess_threads - - if num_preprocess_threads % 4: - raise ValueError('Please make num_preprocess_threads a multiple ' - 'of 4 (%d % 4 != 0).', num_preprocess_threads) - - if num_readers is None: - num_readers = FLAGS.num_readers - - if num_readers < 1: - raise ValueError('Please make num_readers at least 1') - - # Approximate number of examples per shard. - examples_per_shard = 1024 - # Size the random shuffle queue to balance between good global - # mixing (more examples) and memory use (fewer examples). - # 1 image uses 299*299*3*4 bytes = 1MB - # The default input_queue_memory_factor is 16 implying a shuffling queue - # size: examples_per_shard * 16 * 1MB = 17.6GB - min_queue_examples = examples_per_shard * FLAGS.input_queue_memory_factor - if train: - examples_queue = tf.RandomShuffleQueue( - capacity=min_queue_examples + 3 * batch_size, - min_after_dequeue=min_queue_examples, - dtypes=[tf.string]) - else: - examples_queue = tf.FIFOQueue( - capacity=examples_per_shard + 3 * batch_size, - dtypes=[tf.string]) - - # Create multiple readers to populate the queue of examples. 
- if num_readers > 1: - enqueue_ops = [] - for _ in range(num_readers): - reader = dataset.reader() - _, value = reader.read(filename_queue) - enqueue_ops.append(examples_queue.enqueue([value])) - - tf.train.queue_runner.add_queue_runner( - tf.train.queue_runner.QueueRunner(examples_queue, enqueue_ops)) - example_serialized = examples_queue.dequeue() - else: - reader = dataset.reader() - _, example_serialized = reader.read(filename_queue) - - images_and_labels = [] - for thread_id in range(num_preprocess_threads): - # Parse a serialized Example proto to extract the image and metadata. - image_buffer, label_index, bbox, _ = parse_example_proto( - example_serialized) - image = image_preprocessing(image_buffer, bbox, train, thread_id) - images_and_labels.append([image, label_index]) - - images, label_index_batch = tf.train.batch_join( - images_and_labels, - batch_size=batch_size, - capacity=2 * num_preprocess_threads * batch_size) - - # Reshape images into these desired dimensions. - height = FLAGS.image_size - width = FLAGS.image_size - depth = 3 - - images = tf.cast(images, tf.float32) - images = tf.reshape(images, shape=[batch_size, height, width, depth]) - - # Display the training images in the visualizer. - tf.summary.image('images', images) - - return images, tf.reshape(label_index_batch, [batch_size]) diff --git a/examples/imagenet/inception/imagenet_data.py b/examples/imagenet/inception/imagenet_data.py deleted file mode 100644 index 0a6d22e1..00000000 --- a/examples/imagenet/inception/imagenet_data.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Small library that points to the ImageNet data set. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - - -from inception.dataset import Dataset - - -class ImagenetData(Dataset): - """ImageNet data set.""" - - def __init__(self, subset): - super(ImagenetData, self).__init__('ImageNet', subset) - - def num_classes(self): - """Returns the number of classes in the data set.""" - return 1000 - - def num_examples_per_epoch(self): - """Returns the number of examples in the data set.""" - # Bounding box data consists of 615299 bounding boxes for 544546 images. - if self.subset == 'train': - return 1281167 - if self.subset == 'validation': - return 50000 - - def download_message(self): - """Instruction to download and extract the tarball from Flowers website.""" - - print('Failed to find any ImageNet %s files'% self.subset) - print('') - print('If you have already downloaded and processed the data, then make ' - 'sure to set --data_dir to point to the directory containing the ' - 'location of the sharded TFRecords.\n') - print('If you have not downloaded and prepared the ImageNet data in the ' - 'TFRecord format, you will need to do this at least once. 
This ' - 'process could take several hours depending on the speed of your ' - 'computer and network connection\n') - print('Please see README.md for instructions on how to build ' - 'the ImageNet dataset using download_and_preprocess_imagenet.\n') - print('Note that the raw data size is 300 GB and the processed data size ' - 'is 150 GB. Please ensure you have at least 500GB disk space.') diff --git a/examples/imagenet/inception/imagenet_distributed_train.py b/examples/imagenet/inception/imagenet_distributed_train.py deleted file mode 100644 index 560c6617..00000000 --- a/examples/imagenet/inception/imagenet_distributed_train.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# pylint: disable=line-too-long -"""A binary to train Inception in a distributed manner using multiple systems. - -Please see accompanying README.md for details and instructions. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from tensorflowonspark import TFCluster, TFNode -from datetime import datetime - -import os -import sys -import tensorflow as tf -import time - -def main_fun(argv, ctx): - - # extract node metadata from ctx - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - assert job_name in ['ps', 'worker'], 'job_name must be ps or worker' - - from inception import inception_distributed_train - from inception.imagenet_data import ImagenetData - import tensorflow as tf - - # instantiate FLAGS on workers using argv from driver and add job_name and task_id - print("argv:", argv) - sys.argv = argv - - FLAGS = tf.app.flags.FLAGS - FLAGS.job_name = job_name - FLAGS.task_id = task_index - print("FLAGS:", FLAGS.__dict__['__flags']) - - # Get TF cluster and server instances - cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma) - - if FLAGS.job_name == 'ps': - # `ps` jobs wait for incoming connections from the workers. - server.join() - else: - # `worker` jobs will actually do the work. - dataset = ImagenetData(subset=FLAGS.subset) - assert dataset.data_files() - # Only the chief checks for or creates train_dir. 
- if FLAGS.task_id == 0: - if not tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.MakeDirs(FLAGS.train_dir) - inception_distributed_train.train(server.target, dataset, cluster_spec, ctx) - -if __name__ == '__main__': - # parse arguments needed by the Spark driver - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--epochs", help="number of epochs", type=int, default=0) - parser.add_argument("--input_data", help="HDFS path to input dataset") - parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark","tf"], default="tf") - parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - - (args,rem) = parser.parse_known_args() - - input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW - - print("{0} ===== Start".format(datetime.now().isoformat())) - sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train')) - num_executors = int(sc._conf.get("spark.executor.instances")) - num_ps = 1 - - cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode) - if input_mode == TFCluster.InputMode.SPARK: - dataRDD = sc.newAPIHadoopFile(args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", - keyClass="org.apache.hadoop.io.BytesWritable", - valueClass="org.apache.hadoop.io.NullWritable") - cluster.train(dataRDD, args.epochs) - cluster.shutdown() - print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/examples/imagenet/inception/imagenet_distributed_train_pipeline.py b/examples/imagenet/inception/imagenet_distributed_train_pipeline.py deleted file mode 100644 index 9f0adb9b..00000000 --- a/examples/imagenet/inception/imagenet_distributed_train_pipeline.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from pyspark.sql import SparkSession -from tensorflowonspark import TFCluster, TFNode, dfutil -from tensorflowonspark.pipeline import TFEstimator -from datetime import datetime - -from inception import inception_export - -import sys -import tensorflow as tf - -def main_fun(argv, ctx): - # extract node metadata from ctx - job_name = ctx.job_name - task_index = ctx.task_index - - assert job_name in ['ps', 'worker'], 'job_name must be ps or worker' - - from inception import inception_distributed_train - from inception.imagenet_data import ImagenetData - import tensorflow as tf - - # instantiate FLAGS on workers using argv from driver and add job_name and task_id - print("argv:", argv) - sys.argv = argv - - FLAGS = tf.app.flags.FLAGS - FLAGS.job_name = job_name - FLAGS.task_id = task_index - print("FLAGS:", FLAGS.__dict__['__flags']) - - # Get TF cluster and server instances - cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma) - - if FLAGS.job_name == 'ps': - # `ps` jobs wait for incoming connections from the workers. - server.join() - else: - # `worker` jobs will actually do the work. - dataset = ImagenetData(subset=FLAGS.subset) - assert dataset.data_files() - # Only the chief checks for or creates train_dir. - if FLAGS.task_id == 0: - if not tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.MakeDirs(FLAGS.train_dir) - inception_distributed_train.train(server.target, dataset, cluster_spec, ctx) - - -if __name__ == '__main__': - # parse arguments needed by the Spark driver - import argparse - - sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train')) - spark = SparkSession.builder.getOrCreate() - num_executors = int(sc._conf.get("spark.executor.instances")) - - # Note: these arguments are for TFoS only... 
since the Inception code uses tf.app.FLAGS, for which we need to pass the argv - parser = argparse.ArgumentParser() - parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("--export_dir", help="HDFS path to export model", type=str) - parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark","tf"], default="tf") - parser.add_argument("--num_ps", help="number of PS nodes in cluster", type=int, default=1) - parser.add_argument("--output", help="HDFS path to save output predictions", type=str) - parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") - parser.add_argument("--train_dir", help="HDFS path to save/load model during train/inference", type=str) - parser.add_argument("--tfrecord_dir", help="HDFS path to temporarily save DataFrame to disk", type=str) - parser.add_argument("--train_data", help="HDFS path to training data", type=str) - parser.add_argument("--validation_data", help="HDFS path to validation data", type=str) - - (args,rem) = parser.parse_known_args() - - input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW - - print("{0} ===== Start".format(datetime.now().isoformat())) - - df = dfutil.loadTFRecords(sc, args.train_data, binary_features=['image/encoded']) - estimator = TFEstimator(main_fun, sys.argv, export_fn=inception_export.export) \ - .setModelDir(args.train_dir) \ - .setExportDir(args.export_dir) \ - .setTFRecordDir(args.tfrecord_dir) \ - .setClusterSize(args.cluster_size) \ - .setNumPS(args.num_ps) \ - .setInputMode(TFCluster.InputMode.TENSORFLOW) \ - .setTensorboard(args.tensorboard) \ - - print("{0} ===== Train".format(datetime.now().isoformat())) - model = estimator.fit(df) - - print("{0} ===== Inference".format(datetime.now().isoformat())) - df = dfutil.loadTFRecords(sc, args.validation_data, binary_features=['image/encoded']) - preds = 
model.setTagSet(tf.saved_model.tag_constants.SERVING) \ - .setSignatureDefKey(tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY) \ - .setInputMapping({'image/encoded': 'jpegs', 'image/class/label': 'labels'}) \ - .setOutputMapping({'top_5_acc': 'output'}) \ - .transform(df) - preds.write.json(args.output) - - print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/examples/imagenet/inception/imagenet_eval.py b/examples/imagenet/inception/imagenet_eval.py deleted file mode 100644 index 13cf58cb..00000000 --- a/examples/imagenet/inception/imagenet_eval.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A binary to evaluate Inception on the flowers data set. - -Note that using the supplied pre-trained inception checkpoint, the eval should -achieve: - precision @ 1 = 0.7874 recall @ 5 = 0.9436 [50000 examples] - -See the README.md for more details. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from tensorflowonspark import TFCluster, TFNode -import sys - -def main_fun(argv, ctx): - import tensorflow as tf - from inception import inception_eval - from inception.imagenet_data import ImagenetData - - print("argv:", argv) - sys.argv = argv - - FLAGS = tf.app.flags.FLAGS - FLAGS._parse_flags() - print("FLAGS:", FLAGS.__dict__['__flags']) - - dataset = ImagenetData(subset=FLAGS.subset) - assert dataset.data_files() - if tf.gfile.Exists(FLAGS.eval_dir): - tf.gfile.DeleteRecursively(FLAGS.eval_dir) - tf.gfile.MakeDirs(FLAGS.eval_dir) - - cluster_spec, server = TFNode.start_cluster_server(ctx) - - inception_eval.evaluate(dataset) - - -if __name__ == '__main__': - sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval")) - num_executors = int(sc._conf.get("spark.executor.instances")) - num_ps = 0 - - cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) - cluster.shutdown() diff --git a/examples/imagenet/inception/imagenet_train.py b/examples/imagenet/inception/imagenet_train.py deleted file mode 100644 index 3ffb55ee..00000000 --- a/examples/imagenet/inception/imagenet_train.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A binary to train Inception on the ImageNet data set. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - - -import tensorflow as tf - -from inception import inception_train -from inception.imagenet_data import ImagenetData - -FLAGS = tf.app.flags.FLAGS - - -def main(_): - dataset = ImagenetData(subset=FLAGS.subset) - assert dataset.data_files() - if tf.gfile.Exists(FLAGS.train_dir): - tf.gfile.DeleteRecursively(FLAGS.train_dir) - tf.gfile.MakeDirs(FLAGS.train_dir) - inception_train.train(dataset) - - -if __name__ == '__main__': - tf.app.run() diff --git a/examples/imagenet/inception/inception_distributed_train.py b/examples/imagenet/inception/inception_distributed_train.py deleted file mode 100644 index 24378af0..00000000 --- a/examples/imagenet/inception/inception_distributed_train.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A library to train Inception using multiple replicas with synchronous update. - -Please see accompanying README.md for details and instructions. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from datetime import datetime -import os.path -import time - -import numpy as np -import tensorflow as tf - -from inception import image_processing -from inception import inception_model as inception -from inception.slim import slim - -from tensorflowonspark import TFNode - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_string('input_mode', 'tf', 'data ingestion mode (spark|tf)') -tf.app.flags.DEFINE_integer('num_gpus', 1, 'Number of GPUs per node.') - -tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"') -tf.app.flags.DEFINE_string('ps_hosts', '', - """Comma-separated list of hostname:port for the """ - """parameter server jobs. e.g. """ - """'machine1:2222,machine2:1111,machine2:2222'""") -tf.app.flags.DEFINE_string('worker_hosts', '', - """Comma-separated list of hostname:port for the """ - """worker jobs. e.g. """ - """'machine1:2222,machine2:1111,machine2:2222'""") - -tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train', - """Directory where to write event logs """ - """and checkpoint.""") -tf.app.flags.DEFINE_integer('max_steps', 1000000, 'Number of batches to run.') -tf.app.flags.DEFINE_string('subset', 'train', 'Either "train" or "validation".') -tf.app.flags.DEFINE_boolean('log_device_placement', False, - 'Whether to log device placement.') -tf.app.flags.DEFINE_boolean('rdma', False, - """Whether to use rdma.""") -# Task ID is used to select the chief and also to access the local_step for -# each replica to check staleness of the gradients in sync_replicas_optimizer. 
-tf.app.flags.DEFINE_integer( - 'task_id', 0, 'Task ID of the worker/replica running the training.') - -# More details can be found in the sync_replicas_optimizer class: -# tensorflow/python/training/sync_replicas_optimizer.py -tf.app.flags.DEFINE_integer('num_replicas_to_aggregate', -1, - """Number of gradients to collect before """ - """updating the parameters.""") -tf.app.flags.DEFINE_integer('save_interval_secs', 10 * 60, - 'Save interval seconds.') -tf.app.flags.DEFINE_integer('save_summaries_secs', 180, - 'Save summaries interval seconds.') - -# **IMPORTANT** -# Please note that this learning rate schedule is heavily dependent on the -# hardware architecture, batch size and any changes to the model architecture -# specification. Selecting a finely tuned learning rate schedule is an -# empirical process that requires some experimentation. Please see README.md -# more guidance and discussion. -# -# Learning rate decay factor selected from https://arxiv.org/abs/1604.00981 -tf.app.flags.DEFINE_float('initial_learning_rate', 0.045, - 'Initial learning rate.') -tf.app.flags.DEFINE_float('num_epochs_per_decay', 2.0, - 'Epochs after which learning rate decays.') -tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.94, - 'Learning rate decay factor.') - -# Constants dictating the learning rate schedule. -RMSPROP_DECAY = 0.9 # Decay term for RMSProp. -RMSPROP_MOMENTUM = 0.9 # Momentum in RMSProp. -RMSPROP_EPSILON = 1.0 # Epsilon term for RMSProp. - - -def train(target, dataset, cluster_spec, ctx): - """Train Inception on a dataset for a number of steps.""" - # Number of workers and parameter servers are infered from the workers and ps - # hosts string. - num_workers = len(cluster_spec.as_dict()['worker']) - num_parameter_servers = len(cluster_spec.as_dict()['ps']) - # If no value is given, num_replicas_to_aggregate defaults to be the number of - # workers. 
- if FLAGS.num_replicas_to_aggregate == -1: - num_replicas_to_aggregate = num_workers - else: - num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate - - # Both should be greater than 0 in a distributed training. - assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and ' - 'num_parameter_servers' - ' must be > 0.') - - # Choose worker 0 as the chief. Note that any worker could be the chief - # but there should be only one chief. - is_chief = (FLAGS.task_id == 0) - - # Ops are assigned to worker by default. - with tf.device('/job:worker/task:%d' % FLAGS.task_id): - # Variables and its related init/assign ops are assigned to ps. - with slim.scopes.arg_scope( - [slim.variables.variable, slim.variables.global_step], - device=slim.variables.VariableDeviceChooser(num_parameter_servers)): - # Create a variable to count the number of train() calls. This equals the - # number of updates applied to the variables. - global_step = slim.variables.global_step() - - # Calculate the learning rate schedule. - num_batches_per_epoch = (dataset.num_examples_per_epoch() / - FLAGS.batch_size) - # Decay steps need to be divided by the number of replicas to aggregate. - decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay / - num_replicas_to_aggregate) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(FLAGS.initial_learning_rate, - global_step, - decay_steps, - FLAGS.learning_rate_decay_factor, - staircase=True) - # Add a summary to track the learning rate. - tf.summary.scalar('learning_rate', lr) - - # Create an optimizer that performs gradient descent. 
- opt = tf.train.RMSPropOptimizer(lr, - RMSPROP_DECAY, - momentum=RMSPROP_MOMENTUM, - epsilon=RMSPROP_EPSILON) - - if FLAGS.input_mode == 'spark': - def feed_dict(feed_batch): - # extract TFRecords, since feed_batch is [(TFRecord, None)] - tfrecords = [] - for elem in feed_batch: - tfrecords.append(str(elem[0])) - return tfrecords - - batch = tf.placeholder(tf.string, [FLAGS.batch_size / FLAGS.num_preprocess_threads]) - - # The following is adapted from image_processing.py to remove Readers/QueueRunners. - # Note: this removes the RandomShuffledQueue, so the incoming data is not shuffled. - # Presumably, this could be done on the Spark side or done in additional TF code. - examples = tf.unstack(batch) - images, labels = [], [] - for example_serialized in examples: - for thread_id in range(FLAGS.num_preprocess_threads): - # Parse a serialized Example proto to extract the image and metadata. - image_buffer, label_index, bbox, _ = image_processing.parse_example_proto(example_serialized) - image = image_processing.image_preprocessing(image_buffer, bbox, train, thread_id) - images.append(image) - labels.append(label_index) - height = FLAGS.image_size - width = FLAGS.image_size - depth = 3 - images = tf.cast(images, tf.float32) - images = tf.reshape(images, shape=[FLAGS.batch_size, height, width, depth]) - tf.summary.image('images', images) - labels = tf.reshape(labels, [FLAGS.batch_size]) - else: - images, labels = image_processing.distorted_inputs( - dataset, - batch_size=FLAGS.batch_size, - num_preprocess_threads=FLAGS.num_preprocess_threads) - - # Number of classes in the Dataset label set plus 1. - # Label 0 is reserved for an (unused) background class. - num_classes = dataset.num_classes() + 1 - logits = inception.inference(images, num_classes, for_training=True) - # Add classification loss. - inception.loss(logits, labels) - - # Gather all of the losses including regularization losses. 
- losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) - losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - - total_loss = tf.add_n(losses, name='total_loss') - - if is_chief: - # Compute the moving average of all individual losses and the - # total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summmary to all individual losses and the total loss; - # do the same for the averaged version of the losses. - for l in losses + [total_loss]: - loss_name = l.op.name - # Name each loss as '(raw)' and name the moving average version of the - # loss as the original loss name. - tf.summary.scalar(loss_name + ' (raw)', l) - tf.summary.scalar(loss_name, loss_averages.average(l)) - - # Add dependency to compute loss_averages. - with tf.control_dependencies([loss_averages_op]): - total_loss = tf.identity(total_loss) - - # Track the moving averages of all trainable variables. - # Note that we maintain a 'double-average' of the BatchNormalization - # global statistics. - # This is not needed when the number of replicas are small but important - # for synchronous distributed training with tens of workers/replicas. - exp_moving_averager = tf.train.ExponentialMovingAverage( - inception.MOVING_AVERAGE_DECAY, global_step) - - variables_to_average = ( - tf.trainable_variables() + tf.moving_average_variables()) - - # Add histograms for model variables. - for var in variables_to_average: - tf.summary.histogram(var.op.name, var) - - # Create synchronous replica optimizer. 
- opt = tf.train.SyncReplicasOptimizer( - opt, - replicas_to_aggregate=num_replicas_to_aggregate, - total_num_replicas=num_workers, - variable_averages=exp_moving_averager, - variables_to_average=variables_to_average) - - batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION) - assert batchnorm_updates, 'Batchnorm updates are missing' - batchnorm_updates_op = tf.group(*batchnorm_updates) - # Add dependency to compute batchnorm_updates. - with tf.control_dependencies([batchnorm_updates_op]): - total_loss = tf.identity(total_loss) - - # Compute gradients with respect to the loss. - grads = opt.compute_gradients(total_loss) - - # Add histograms for gradients. - for grad, var in grads: - if grad is not None: - tf.summary.histogram(var.op.name + '/gradients', grad) - - apply_gradients_op = opt.apply_gradients(grads, global_step=global_step) - - with tf.control_dependencies([apply_gradients_op]): - train_op = tf.identity(total_loss, name='train_op') - - # Get chief queue_runners, init_tokens and clean_up_op, which is used to - # synchronize replicas. - # More details can be found in sync_replicas_optimizer. - chief_queue_runners = [opt.get_chief_queue_runner()] - init_tokens_op = opt.get_init_tokens_op() - - # Create a saver. - saver = tf.train.Saver() - - # Build the summary operation based on the TF collection of Summaries. - summary_op = tf.summary.merge_all() - - # Build an initialization operation to run below. - init_op = tf.global_variables_initializer() - - # We run the summaries in the same thread as the training operations by - # passing in None for summary_op to avoid a summary_thread being started. - # Running summaries and training operations in parallel could run out of - # GPU memory. 
- summary_writer = tf.summary.FileWriter("tensorboard_%d" % ctx.worker_num, graph=tf.get_default_graph()) - sv = tf.train.Supervisor(is_chief=is_chief, - logdir=FLAGS.train_dir, - init_op=init_op, - summary_op=None, - global_step=global_step, - summary_writer=summary_writer, - saver=saver, - save_model_secs=FLAGS.save_interval_secs) - - tf.logging.info('%s Supervisor' % datetime.now()) - - sess_config = tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement) - - # Get a session. - sess = sv.prepare_or_wait_for_session(target, config=sess_config) - - # Start the queue runners. - queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS) - sv.start_queue_runners(sess, queue_runners) - tf.logging.info('Started %d queues for processing input data.', - len(queue_runners)) - - if is_chief: - sv.start_queue_runners(sess, chief_queue_runners) - sess.run(init_tokens_op) - - # Train, checking for Nans. Concurrently run the summary operation at a - # specified interval. Note that the summary_op and train_op never run - # simultaneously in order to prevent running out of GPU memory. 
- next_summary_time = time.time() + FLAGS.save_summaries_secs - tf_feed = TFNode.DataFeed(ctx.mgr) - while not sv.should_stop(): - try: - start_time = time.time() - if FLAGS.input_mode == 'spark': - tmp = feed_dict(tf_feed.next_batch(FLAGS.batch_size / FLAGS.num_preprocess_threads)) - feed = {batch: tmp} - loss_value, step = sess.run([train_op, global_step], feed_dict=feed) - else: - loss_value, step = sess.run([train_op, global_step]) - assert not np.isnan(loss_value), 'Model diverged with loss = NaN' - if step > FLAGS.max_steps: - break - duration = time.time() - start_time - - if step % 30 == 0: - examples_per_sec = FLAGS.batch_size / float(duration) - format_str = ('Worker %d: %s: step %d, loss = %.2f' - '(%.1f examples/sec; %.3f sec/batch)') - tf.logging.info(format_str % - (FLAGS.task_id, datetime.now(), step, loss_value, - examples_per_sec, duration)) - - # Determine if the summary_op should be run on the chief worker. - if FLAGS.input_mode == 'tf' and is_chief and next_summary_time < time.time(): - tf.logging.info('Running Summary operation on the chief.') - summary_str = sess.run(summary_op) - sv.summary_computed(sess, summary_str) - tf.logging.info('Finished running Summary operation.') - - # Determine the next time for running the summary. - next_summary_time += FLAGS.save_summaries_secs - except: - if is_chief: - tf.logging.info('About to execute sync_clean_up_op!') - raise - - # Stop the TFNode data feed - if FLAGS.input_mode == 'spark': - tf_feed.terminate() - - # Stop the supervisor. This also waits for service threads to finish. - sv.stop() - - # Save after the training ends. - if is_chief: - saver.save(sess, - os.path.join(FLAGS.train_dir, 'model.ckpt'), - global_step=global_step) diff --git a/examples/imagenet/inception/inception_eval.py b/examples/imagenet/inception/inception_eval.py deleted file mode 100644 index 667e739a..00000000 --- a/examples/imagenet/inception/inception_eval.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2016 Google Inc. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A library to evaluate Inception on a single GPU. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from datetime import datetime -import math -import os.path -import time - - -import numpy as np -import tensorflow as tf - -from inception import image_processing -from inception import inception_model as inception - - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_string('eval_dir', '/tmp/imagenet_eval', - """Directory where to write event logs.""") -tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/imagenet_train', - """Directory where to read model checkpoints.""") - -# Flags governing the frequency of the eval. -tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5, - """How often to run the eval.""") -tf.app.flags.DEFINE_boolean('run_once', False, - """Whether to run eval only once.""") - -# Flags governing the data used for the eval. -tf.app.flags.DEFINE_integer('num_examples', 50000, - """Number of examples to run. Note that the eval """ - """ImageNet dataset contains 50000 examples.""") -tf.app.flags.DEFINE_string('subset', 'validation', - """Either 'validation' or 'train'.""") - - -def _eval_once(saver, summary_writer, top_1_op, top_5_op, summary_op): - """Runs Eval once. - - Args: - saver: Saver. - summary_writer: Summary writer. 
- top_1_op: Top 1 op. - top_5_op: Top 5 op. - summary_op: Summary op. - """ - with tf.Session() as sess: - ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) - if ckpt and ckpt.model_checkpoint_path: - print("ckpt.model_checkpoint_path: {0}".format(ckpt.model_checkpoint_path)) - saver.restore(sess, ckpt.model_checkpoint_path) - - # Assuming model_checkpoint_path looks something like: - # /my-favorite-path/imagenet_train/model.ckpt-0, - # extract global_step from it. - global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] - print('Successfully loaded model from %s at step=%s.' % - (ckpt.model_checkpoint_path, global_step)) - else: - print('No checkpoint file found') - return - - # Start the queue runners. - coord = tf.train.Coordinator() - try: - threads = [] - for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): - threads.extend(qr.create_threads(sess, coord=coord, daemon=True, - start=True)) - - num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) - # Counts the number of correct predictions. - count_top_1 = 0.0 - count_top_5 = 0.0 - total_sample_count = num_iter * FLAGS.batch_size - step = 0 - - print('%s: starting evaluation on (%s).' % (datetime.now(), FLAGS.subset)) - start_time = time.time() - while step < num_iter and not coord.should_stop(): - top_1, top_5 = sess.run([top_1_op, top_5_op]) - count_top_1 += np.sum(top_1) - count_top_5 += np.sum(top_5) - step += 1 - if step % 20 == 0: - duration = time.time() - start_time - sec_per_batch = duration / 20.0 - examples_per_sec = FLAGS.batch_size / sec_per_batch - print('%s: [%d batches out of %d] (%.1f examples/sec; %.3f' - 'sec/batch)' % (datetime.now(), step, num_iter, - examples_per_sec, sec_per_batch)) - start_time = time.time() - - # Compute precision @ 1. 
- precision_at_1 = count_top_1 / total_sample_count - recall_at_5 = count_top_5 / total_sample_count - print('%s: precision @ 1 = %.4f recall @ 5 = %.4f [%d examples]' % - (datetime.now(), precision_at_1, recall_at_5, total_sample_count)) - - summary = tf.Summary() - summary.ParseFromString(sess.run(summary_op)) - summary.value.add(tag='Precision @ 1', simple_value=precision_at_1) - summary.value.add(tag='Recall @ 5', simple_value=recall_at_5) - summary_writer.add_summary(summary, global_step) - - except Exception as e: # pylint: disable=broad-except - coord.request_stop(e) - - coord.request_stop() - coord.join(threads, stop_grace_period_secs=10) - - -def evaluate(dataset): - """Evaluate model on Dataset for a number of steps.""" - with tf.Graph().as_default(): - # Get images and labels from the dataset. - images, labels = image_processing.inputs(dataset) - - # Number of classes in the Dataset label set plus 1. - # Label 0 is reserved for an (unused) background class. - num_classes = dataset.num_classes() + 1 - - # Build a Graph that computes the logits predictions from the - # inference model. - logits, _ = inception.inference(images, num_classes) - - # Calculate predictions. - top_1_op = tf.nn.in_top_k(logits, labels, 1) - top_5_op = tf.nn.in_top_k(logits, labels, 5) - - # Restore the moving average version of the learned variables for eval. - variable_averages = tf.train.ExponentialMovingAverage( - inception.MOVING_AVERAGE_DECAY) - variables_to_restore = variable_averages.variables_to_restore() - saver = tf.train.Saver(variables_to_restore) - - # Build the summary operation based on the TF collection of Summaries. 
- summary_op = tf.summary.merge_all() - - graph_def = tf.get_default_graph().as_graph_def() - summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, - graph_def=graph_def) - - while True: - _eval_once(saver, summary_writer, top_1_op, top_5_op, summary_op) - if FLAGS.run_once: - break - time.sleep(FLAGS.eval_interval_secs) diff --git a/examples/imagenet/inception/inception_export.py b/examples/imagenet/inception/inception_export.py deleted file mode 100644 index fcca80b2..00000000 --- a/examples/imagenet/inception/inception_export.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -# Based on https://github.com/tensorflow/serving/blob/c6ace3fed3a0ec7cec6b7267cd86b8ed3a034a50/tensorflow_serving/example/inception_saved_model.py - -"""A library to export an Inception saved_model -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflowonspark import TFNode - -from inception import image_processing # for FLAGS.image_size -from inception import inception_model as inception -from inception.imagenet_data import ImagenetData - - -tf.app.flags.DEFINE_string('export_dir', '/tmp/imagenet_export', - """Directory where to write saved_model.""") -tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train', - """Directory where to read model checkpoints.""") - -# Flags governing the data used for the eval. -tf.app.flags.DEFINE_string('subset', 'validation', - """Either 'validation' or 'train'.""") - -def export(_): - FLAGS = tf.app.flags.FLAGS - - """Evaluate model on Dataset for a number of steps.""" - #with tf.Graph().as_default(): - tf.reset_default_graph() - - def preprocess_image(image_buffer): - """Preprocess JPEG encoded bytes to 3D float Tensor.""" - - # Decode the string as an RGB JPEG. 
- # Note that the resulting image contains an unknown height and width - # that is set dynamically by decode_jpeg. In other words, the height - # and width of image is unknown at compile-time. - image = tf.image.decode_jpeg(image_buffer, channels=3) - # After this point, all image pixels reside in [0,1) - # until the very end, when they're rescaled to (-1, 1). The various - # adjust_* ops all require this range for dtype float. - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - # Crop the central region of the image with an area containing 87.5% of - # the original image. - image = tf.image.central_crop(image, central_fraction=0.875) - # Resize the image to the original height and width. - image = tf.expand_dims(image, 0) - image = tf.image.resize_bilinear( - image, [FLAGS.image_size, FLAGS.image_size], align_corners=False) - image = tf.squeeze(image, [0]) - # Finally, rescale to [-1,1] instead of [0, 1) - image = tf.subtract(image, 0.5) - image = tf.multiply(image, 2.0) - return image - - # Get images and labels from the dataset. - jpegs = tf.placeholder(tf.string, [None], name='jpegs') - images = tf.map_fn(preprocess_image, jpegs, dtype=tf.float32) - labels = tf.placeholder(tf.int32, [None], name='labels') - - # Number of classes in the Dataset label set plus 1. - # Label 0 is reserved for an (unused) background class. - dataset = ImagenetData(subset=FLAGS.subset) - - num_classes = dataset.num_classes() + 1 - - # Build a Graph that computes the logits predictions from the - # inference model. - logits, _ = inception.inference(images, num_classes) - - # Calculate predictions. - top_1_op = tf.nn.in_top_k(logits, labels, 1) - top_5_op = tf.nn.in_top_k(logits, labels, 5) - - # Restore the moving average version of the learned variables for eval. 
- variable_averages = tf.train.ExponentialMovingAverage( - inception.MOVING_AVERAGE_DECAY) - variables_to_restore = variable_averages.variables_to_restore() - saver = tf.train.Saver(variables_to_restore) - - with tf.Session() as sess: - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if not ckpt or not ckpt.model_checkpoint_path: - raise Exception("No checkpoint file found at: {}".format(FLAGS.train_dir)) - print("ckpt.model_checkpoint_path: {0}".format(ckpt.model_checkpoint_path)) - - saver.restore(sess, ckpt.model_checkpoint_path) - - # Assuming model_checkpoint_path looks something like: - # /my-favorite-path/imagenet_train/model.ckpt-0, - # extract global_step from it. - global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] - print('Successfully loaded model from %s at step=%s.' % - (ckpt.model_checkpoint_path, global_step)) - - print("Exporting saved_model to: {}".format(FLAGS.export_dir)) - # exported signatures defined in code - signatures = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { - 'inputs': { 'jpegs': jpegs, 'labels': labels }, - 'outputs': { 'top_5_acc': top_5_op }, - 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME - } - } - TFNode.export_saved_model(sess, - FLAGS.export_dir, - tf.saved_model.tag_constants.SERVING, - signatures) - print("Exported saved_model") diff --git a/examples/imagenet/inception/inception_model.py b/examples/imagenet/inception/inception_model.py deleted file mode 100644 index fedae13a..00000000 --- a/examples/imagenet/inception/inception_model.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Build the Inception v3 network on ImageNet data set. - -The Inception v3 architecture is described in http://arxiv.org/abs/1512.00567 - -Summary of available functions: - inference: Compute inference on the model inputs to make a prediction - loss: Compute the loss of the prediction with respect to the labels -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re - -import tensorflow as tf - -from inception.slim import slim - -FLAGS = tf.app.flags.FLAGS - -# If a model is trained using multiple GPUs, prefix all Op names with tower_name -# to differentiate the operations. Note that this prefix is removed from the -# names of the summaries when visualizing a model. -TOWER_NAME = 'tower' - -# Batch normalization. Constant governing the exponential moving average of -# the 'global' mean and variance for all activations. -BATCHNORM_MOVING_AVERAGE_DECAY = 0.9997 - -# The decay to use for the moving average. -MOVING_AVERAGE_DECAY = 0.9999 - - -def inference(images, num_classes, for_training=False, restore_logits=True, - scope=None): - """Build Inception v3 model architecture. - - See here for reference: http://arxiv.org/abs/1512.00567 - - Args: - images: Images returned from inputs() or distorted_inputs(). - num_classes: number of classes - for_training: If set to `True`, build the inference model for training. - Kernels that operate differently for inference during training - e.g. 
dropout, are appropriately configured. - restore_logits: whether or not the logits layers should be restored. - Useful for fine-tuning a model with different num_classes. - scope: optional prefix string identifying the ImageNet tower. - - Returns: - Logits. 2-D float Tensor. - Auxiliary Logits. 2-D float Tensor of side-head. Used for training only. - """ - # Parameters for BatchNorm. - batch_norm_params = { - # Decay for the moving averages. - 'decay': BATCHNORM_MOVING_AVERAGE_DECAY, - # epsilon to prevent 0s in variance. - 'epsilon': 0.001, - } - # Set weight_decay for weights in Conv and FC layers. - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], weight_decay=0.00004): - with slim.arg_scope([slim.ops.conv2d], - stddev=0.1, - activation=tf.nn.relu, - batch_norm_params=batch_norm_params): - logits, endpoints = slim.inception.inception_v3( - images, - dropout_keep_prob=0.8, - num_classes=num_classes, - is_training=for_training, - restore_logits=restore_logits, - scope=scope) - - # Add summaries for viewing model statistics on TensorBoard. - _activation_summaries(endpoints) - - # Grab the logits associated with the side head. Employed during training. - auxiliary_logits = endpoints['aux_logits'] - - return logits, auxiliary_logits - - -def loss(logits, labels, batch_size=None): - """Adds all losses for the model. - - Note the final loss is not returned. Instead, the list of losses are collected - by slim.losses. The losses are accumulated in tower_loss() and summed to - calculate the total loss. - - Args: - logits: List of logits from inference(). Each entry is a 2-D float Tensor. - labels: Labels from distorted_inputs or inputs(). 1-D tensor - of shape [batch_size] - batch_size: integer - """ - if not batch_size: - batch_size = FLAGS.batch_size - - # Reshape the labels into a dense Tensor of - # shape [FLAGS.batch_size, num_classes]. 
- sparse_labels = tf.reshape(labels, [batch_size, 1]) - indices = tf.reshape(tf.range(batch_size), [batch_size, 1]) - concated = tf.concat(axis=1, values=[indices, sparse_labels]) - num_classes = logits[0].get_shape()[-1].value - dense_labels = tf.sparse_to_dense(concated, - [batch_size, num_classes], - 1.0, 0.0) - - # Cross entropy loss for the main softmax prediction. - slim.losses.cross_entropy_loss(logits[0], - dense_labels, - label_smoothing=0.1, - weight=1.0) - - # Cross entropy loss for the auxiliary softmax head. - slim.losses.cross_entropy_loss(logits[1], - dense_labels, - label_smoothing=0.1, - weight=0.4, - scope='aux_loss') - - -def _activation_summary(x): - """Helper to create summaries for activations. - - Creates a summary that provides a histogram of activations. - Creates a summary that measure the sparsity of activations. - - Args: - x: Tensor - """ - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. - tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name) - tf.summary.histogram(tensor_name + '/activations', x) - tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x)) - - -def _activation_summaries(endpoints): - with tf.name_scope('summaries'): - for act in endpoints.values(): - _activation_summary(act) diff --git a/examples/imagenet/inception/inception_train.py b/examples/imagenet/inception/inception_train.py deleted file mode 100644 index 32c959df..00000000 --- a/examples/imagenet/inception/inception_train.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A library to train Inception using multiple GPU's with synchronous updates. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -from datetime import datetime -import os.path -import re -import time - -import numpy as np -import tensorflow as tf - -from inception import image_processing -from inception import inception_model as inception -from inception.slim import slim - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train', - """Directory where to write event logs """ - """and checkpoint.""") -tf.app.flags.DEFINE_integer('max_steps', 10000000, - """Number of batches to run.""") -tf.app.flags.DEFINE_string('subset', 'train', - """Either 'train' or 'validation'.""") - -# Flags governing the hardware employed for running TensorFlow. -tf.app.flags.DEFINE_integer('num_gpus', 1, - """How many GPUs to use.""") -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -# Flags governing the type of training. 
-tf.app.flags.DEFINE_boolean('fine_tune', False, - """If set, randomly initialize the final layer """ - """of weights in order to train the network on a """ - """new task.""") -tf.app.flags.DEFINE_string('pretrained_model_checkpoint_path', '', - """If specified, restore this pretrained model """ - """before beginning any training.""") - -# **IMPORTANT** -# Please note that this learning rate schedule is heavily dependent on the -# hardware architecture, batch size and any changes to the model architecture -# specification. Selecting a finely tuned learning rate schedule is an -# empirical process that requires some experimentation. Please see README.md -# more guidance and discussion. -# -# With 8 Tesla K40's and a batch size = 256, the following setup achieves -# precision@1 = 73.5% after 100 hours and 100K steps (20 epochs). -# Learning rate decay factor selected from http://arxiv.org/abs/1404.5997. -tf.app.flags.DEFINE_float('initial_learning_rate', 0.1, - """Initial learning rate.""") -tf.app.flags.DEFINE_float('num_epochs_per_decay', 30.0, - """Epochs after which learning rate decays.""") -tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.16, - """Learning rate decay factor.""") - -# Constants dictating the learning rate schedule. -RMSPROP_DECAY = 0.9 # Decay term for RMSProp. -RMSPROP_MOMENTUM = 0.9 # Momentum in RMSProp. -RMSPROP_EPSILON = 1.0 # Epsilon term for RMSProp. - - -def _tower_loss(images, labels, num_classes, scope, reuse_variables=None): - """Calculate the total loss on a single tower running the ImageNet model. - - We perform 'batch splitting'. This means that we cut up a batch across - multiple GPU's. For instance, if the batch size = 32 and num_gpus = 2, - then each tower will operate on an batch of 16 images. - - Args: - images: Images. 4D tensor of size [batch_size, FLAGS.image_size, - FLAGS.image_size, 3]. - labels: 1-D integer Tensor of [batch_size]. 
- num_classes: number of classes - scope: unique prefix string identifying the ImageNet tower, e.g. - 'tower_0'. - - Returns: - Tensor of shape [] containing the total loss for a batch of data - """ - # When fine-tuning a model, we do not restore the logits but instead we - # randomly initialize the logits. The number of classes in the output of the - # logit is the number of classes in specified Dataset. - restore_logits = not FLAGS.fine_tune - - # Build inference Graph. - with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables): - logits = inception.inference(images, num_classes, for_training=True, - restore_logits=restore_logits, - scope=scope) - - # Build the portion of the Graph calculating the losses. Note that we will - # assemble the total_loss using a custom function below. - split_batch_size = images.get_shape().as_list()[0] - inception.loss(logits, labels, batch_size=split_batch_size) - - # Assemble all of the losses for the current tower only. - losses = tf.get_collection(slim.losses.LOSSES_COLLECTION, scope) - - # Calculate the total loss for the current tower. - regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - total_loss = tf.add_n(losses + regularization_losses, name='total_loss') - - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summmary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on TensorBoard. - loss_name = re.sub('%s_[0-9]*/' % inception.TOWER_NAME, '', l.op.name) - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. 
- tf.summary.scalar(loss_name +' (raw)', l) - tf.summary.scalar(loss_name, loss_averages.average(l)) - - with tf.control_dependencies([loss_averages_op]): - total_loss = tf.identity(total_loss) - return total_loss - - -def _average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. - - Note that this function provides a synchronization point across all towers. - - Args: - tower_grads: List of lists of (gradient, variable) tuples. The outer list - is over individual gradients. The inner list is over the gradient - calculation for each tower. - Returns: - List of pairs of (gradient, variable) where the gradient has been averaged - across all towers. - """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(axis=0, values=grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -def train(dataset): - """Train on dataset for a number of steps.""" - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), trainable=False) - - # Calculate the learning rate schedule. 
- num_batches_per_epoch = (dataset.num_examples_per_epoch() / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(FLAGS.initial_learning_rate, - global_step, - decay_steps, - FLAGS.learning_rate_decay_factor, - staircase=True) - - # Create an optimizer that performs gradient descent. - opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, - momentum=RMSPROP_MOMENTUM, - epsilon=RMSPROP_EPSILON) - - # Get images and labels for ImageNet and split the batch across GPUs. - assert FLAGS.batch_size % FLAGS.num_gpus == 0, ( - 'Batch size must be divisible by number of GPUs') - split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus) - - # Override the number of preprocessing threads to account for the increased - # number of GPU towers. - num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus - images, labels = image_processing.distorted_inputs( - dataset, - num_preprocess_threads=num_preprocess_threads) - - input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES)) - - # Number of classes in the Dataset label set plus 1. - # Label 0 is reserved for an (unused) background class. - num_classes = dataset.num_classes() + 1 - - # Split the batch of images and labels for towers. - images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images) - labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels) - - # Calculate the gradients for each model tower. - tower_grads = [] - reuse_variables = None - for i in range(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope: - # Force all Variables to reside on the CPU. - with slim.arg_scope([slim.variables.variable], device='/cpu:0'): - # Calculate the loss for one tower of the ImageNet model. 
This - # function constructs the entire ImageNet model but shares the - # variables across all towers. - loss = _tower_loss(images_splits[i], labels_splits[i], num_classes, - scope, reuse_variables) - - # Reuse variables for the next tower. - reuse_variables = True - - # Retain the summaries from the final tower. - summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Retain the Batch Normalization updates operations only from the - # final tower. Ideally, we should grab the updates from all towers - # but these stats accumulate extremely fast so we can ignore the - # other stats from the other towers without significant detriment. - batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION, - scope) - - # Calculate the gradients for the batch of data on this ImageNet - # tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = _average_gradients(tower_grads) - - # Add a summaries for the input processing and global_step. - summaries.extend(input_summaries) - - # Add a summary to track the learning rate. - summaries.append(tf.summary.scalar('learning_rate', lr)) - - # Add histograms for gradients. - for grad, var in grads: - if grad is not None: - summaries.append( - tf.summary.histogram(var.op.name + '/gradients', grad)) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Add histograms for trainable variables. - for var in tf.trainable_variables(): - summaries.append(tf.summary.histogram(var.op.name, var)) - - # Track the moving averages of all trainable variables. - # Note that we maintain a "double-average" of the BatchNormalization - # global statistics. 
This is more complicated then need be but we employ - # this for backward-compatibility with our previous models. - variable_averages = tf.train.ExponentialMovingAverage( - inception.MOVING_AVERAGE_DECAY, global_step) - - # Another possibility is to use tf.slim.get_variables(). - variables_to_average = (tf.trainable_variables() + - tf.moving_average_variables()) - variables_averages_op = variable_averages.apply(variables_to_average) - - # Group all updates to into a single train op. - batchnorm_updates_op = tf.group(*batchnorm_updates) - train_op = tf.group(apply_gradient_op, variables_averages_op, - batchnorm_updates_op) - - # Create a saver. - saver = tf.train.Saver(tf.global_variables()) - - # Build the summary operation from the last tower summaries. - summary_op = tf.summary.merge(summaries) - - # Build an initialization operation to run below. - init = tf.global_variables_initializer() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - if FLAGS.pretrained_model_checkpoint_path: - assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path) - variables_to_restore = tf.get_collection( - slim.variables.VARIABLES_TO_RESTORE) - restorer = tf.train.Saver(variables_to_restore) - restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path) - print('%s: Pre-trained model restored from %s' % - (datetime.now(), FLAGS.pretrained_model_checkpoint_path)) - - # Start the queue runners. 
- tf.train.start_queue_runners(sess=sess) - - summary_writer = tf.summary.FileWriter( - FLAGS.train_dir, - graph=sess.graph) - - for step in range(FLAGS.max_steps): - start_time = time.time() - _, loss_value = sess.run([train_op, loss]) - duration = time.time() - start_time - - assert not np.isnan(loss_value), 'Model diverged with loss = NaN' - - if step % 10 == 0: - examples_per_sec = FLAGS.batch_size / float(duration) - format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch)') - print(format_str % (datetime.now(), step, loss_value, - examples_per_sec, duration)) - - if step % 100 == 0: - summary_str = sess.run(summary_op) - summary_writer.add_summary(summary_str, step) - - # Save the model checkpoint periodically. - if step % 5000 == 0 or (step + 1) == FLAGS.max_steps: - checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') - saver.save(sess, checkpoint_path, global_step=step) diff --git a/examples/imagenet/inception/slim/BUILD b/examples/imagenet/inception/slim/BUILD deleted file mode 100644 index 174e77d5..00000000 --- a/examples/imagenet/inception/slim/BUILD +++ /dev/null @@ -1,112 +0,0 @@ -# Description: -# Contains the operations and nets for building TensorFlow-Slim models. 
- -package(default_visibility = ["//inception:internal"]) - -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -py_library( - name = "scopes", - srcs = ["scopes.py"], -) - -py_test( - name = "scopes_test", - size = "small", - srcs = ["scopes_test.py"], - deps = [ - ":scopes", - ], -) - -py_library( - name = "variables", - srcs = ["variables.py"], - deps = [ - ":scopes", - ], -) - -py_test( - name = "variables_test", - size = "small", - srcs = ["variables_test.py"], - deps = [ - ":variables", - ], -) - -py_library( - name = "losses", - srcs = ["losses.py"], -) - -py_test( - name = "losses_test", - size = "small", - srcs = ["losses_test.py"], - deps = [ - ":losses", - ], -) - -py_library( - name = "ops", - srcs = ["ops.py"], - deps = [ - ":losses", - ":scopes", - ":variables", - ], -) - -py_test( - name = "ops_test", - size = "small", - srcs = ["ops_test.py"], - deps = [ - ":ops", - ":variables", - ], -) - -py_library( - name = "inception", - srcs = ["inception_model.py"], - deps = [ - ":ops", - ":scopes", - ], -) - -py_test( - name = "inception_test", - size = "medium", - srcs = ["inception_test.py"], - deps = [ - ":inception", - ], -) - -py_library( - name = "slim", - srcs = ["slim.py"], - deps = [ - ":inception", - ":losses", - ":ops", - ":scopes", - ":variables", - ], -) - -py_test( - name = "collections_test", - size = "small", - srcs = ["collections_test.py"], - deps = [ - ":slim", - ], -) diff --git a/examples/imagenet/inception/slim/README.md b/examples/imagenet/inception/slim/README.md deleted file mode 100644 index 0d7a5a82..00000000 --- a/examples/imagenet/inception/slim/README.md +++ /dev/null @@ -1,631 +0,0 @@ -# TensorFlow-Slim - -TF-Slim is a lightweight library for defining, training and evaluating models in -TensorFlow. It enables defining complex networks quickly and concisely while -keeping a model's architecture transparent and its hyperparameters explicit. 
- -[TOC] - -## Teaser - -As a demonstration of the simplicity of using TF-Slim, compare the simplicity of -the code necessary for defining the entire [VGG] -(http://www.robots.ox.ac.uk/~vgg/research/very_deep/) network using TF-Slim to -the lengthy and verbose nature of defining just the first three layers (out of -16) using native tensorflow: - -```python{.good} -# VGG16 in TF-Slim. -def vgg16(inputs): - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], stddev=0.01, weight_decay=0.0005): - net = slim.ops.repeat_op(2, inputs, slim.ops.conv2d, 64, [3, 3], scope='conv1') - net = slim.ops.max_pool(net, [2, 2], scope='pool1') - net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 128, [3, 3], scope='conv2') - net = slim.ops.max_pool(net, [2, 2], scope='pool2') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 256, [3, 3], scope='conv3') - net = slim.ops.max_pool(net, [2, 2], scope='pool3') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv4') - net = slim.ops.max_pool(net, [2, 2], scope='pool4') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv5') - net = slim.ops.max_pool(net, [2, 2], scope='pool5') - net = slim.ops.flatten(net, scope='flatten5') - net = slim.ops.fc(net, 4096, scope='fc6') - net = slim.ops.dropout(net, 0.5, scope='dropout6') - net = slim.ops.fc(net, 4096, scope='fc7') - net = slim.ops.dropout(net, 0.5, scope='dropout7') - net = slim.ops.fc(net, 1000, activation=None, scope='fc8') - return net -``` - -```python{.bad} -# Layers 1-3 (out of 16) of VGG16 in native tensorflow. 
-def vgg16(inputs): - with tf.name_scope('conv1_1') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 3, 64], dtype=tf.float32, stddev=1e-1), name='weights') - conv = tf.nn.conv2d(inputs, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32), trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(bias, name=scope) - with tf.name_scope('conv1_2') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 64, 64], dtype=tf.float32, stddev=1e-1), name='weights') - conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32), trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(bias, name=scope) - with tf.name_scope('pool1') - pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1') -``` - -## Why TF-Slim? - -TF-Slim offers several advantages over just the built-in tensorflow libraries: - -* Allows one to define models much more compactly by eliminating boilerplate - code. This is accomplished through the use of [argument scoping](scopes.py) - and numerous high level [operations](ops.py). These tools increase - readability and maintainability, reduce the likelihood of an error from - copy-and-pasting hyperparameter values and simplifies hyperparameter tuning. -* Makes developing models simple by providing commonly used [loss functions] - (losses.py) -* Provides a concise [definition](inception_model.py) of [Inception v3] - (http://arxiv.org/abs/1512.00567) network architecture ready to be used - out-of-the-box or subsumed into new models. - -Additionally TF-Slim was designed with several principles in mind: - -* The various modules of TF-Slim (scopes, variables, ops, losses) are - independent. This flexibility allows users to pick and choose components of - TF-Slim completely à la carte. 
-* TF-Slim is written using a Functional Programming style. That means it's - super-lightweight and can be used right alongside any of TensorFlow's native - operations. -* Makes re-using network architectures easy. This allows users to build new - networks on top of existing ones as well as fine-tuning pre-trained models - on new tasks. - -## What are the various components of TF-Slim? - -TF-Slim is composed of several parts which were designed to exist independently. -These include: - -* [scopes.py](./scopes.py): provides a new scope named `arg_scope` that allows - a user to define default arguments for specific operations within that - scope. -* [variables.py](./variables.py): provides convenience wrappers for variable - creation and manipulation. -* [ops.py](./ops.py): provides high level operations for building models using - tensorflow. -* [losses.py](./losses.py): contains commonly used loss functions. - -## Defining Models - -Models can be succinctly defined using TF-Slim by combining its variables, -operations and scopes. Each of these elements are defined below. - -### Variables - -Creating [`Variables`](https://www.tensorflow.org/how_tos/variables/index.html) -in native tensorflow requires either a predefined value or an initialization -mechanism (random, normally distributed). Furthermore, if a variable needs to be -created on a specific device, such as a GPU, the specification must be [made -explicit](https://www.tensorflow.org/how_tos/using_gpu/index.html). To alleviate -the code required for variable creation, TF-Slim provides a set of thin wrapper -functions in [variables.py](./variables.py) which allow callers to easily define -variables. 
- -For example, to create a `weight` variable, initialize it using a truncated -normal distribution, regularize it with an `l2_loss` and place it on the `CPU`, -one need only declare the following: - -```python -weights = variables.variable('weights', - shape=[10, 10, 3 , 3], - initializer=tf.truncated_normal_initializer(stddev=0.1), - regularizer=lambda t: losses.l2_loss(t, weight=0.05), - device='/cpu:0') -``` - -In addition to the functionality provided by `tf.Variable`, `slim.variables` -keeps track of the variables created by `slim.ops` to define a model, which -allows one to distinguish variables that belong to the model versus other -variables. - -```python -# Get all the variables defined by the model. -model_variables = slim.variables.get_variables() - -# Get all the variables with the same given name, i.e. 'weights', 'biases'. -weights = slim.variables.get_variables_by_name('weights') -biases = slim.variables.get_variables_by_name('biases') - -# Get all the variables in VARIABLES_TO_RESTORE collection. -variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE) - - -weights = variables.variable('weights', - shape=[10, 10, 3 , 3], - initializer=tf.truncated_normal_initializer(stddev=0.1), - regularizer=lambda t: losses.l2_loss(t, weight=0.05), - device='/cpu:0') -``` - -### Operations (Layers) - -While the set of TensorFlow operations is quite extensive, builders of neural -networks typically think of models in terms of "layers". A layer, such as a -Convolutional Layer, a Fully Connected Layer or a BatchNorm Layer are more -abstract than a single TensorFlow operation and typically involve many such -operations. For example, a Convolutional Layer in a neural network is built -using several steps: - -1. Creating the weight variables -2. Creating the bias variables -3. Convolving the weights with the input from the previous layer -4. Adding the biases to the result of the convolution. 
- -In python code this can be rather laborious: - -```python -input = ... -with tf.name_scope('conv1_1') as scope: - kernel = tf.Variable(tf.truncated_normal([3, 3, 64, 128], dtype=tf.float32, - stddev=1e-1), name='weights') - conv = tf.nn.conv2d(input, kernel, [1, 1, 1, 1], padding='SAME') - biases = tf.Variable(tf.constant(0.0, shape=[128], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.nn.bias_add(conv, biases) - conv1 = tf.nn.relu(bias, name=scope) -``` - -To alleviate the need to duplicate this code repeatedly, TF-Slim provides a -number of convenient operations defined at the (more abstract) level of neural -network layers. For example, compare the code above to an invocation of the -TF-Slim code: - -```python -input = ... -net = slim.ops.conv2d(input, [3, 3], 128, scope='conv1_1') -``` - -TF-Slim provides numerous operations used in building neural networks which -roughly correspond to such layers. These include: - -Layer | TF-Slim Op ---------------------- | ------------------------ -Convolutional Layer | [ops.conv2d](ops.py) -Fully Connected Layer | [ops.fc](ops.py) -BatchNorm layer | [ops.batch_norm](ops.py) -Max Pooling Layer | [ops.max_pool](ops.py) -Avg Pooling Layer | [ops.avg_pool](ops.py) -Dropout Layer | [ops.dropout](ops.py) - -[ops.py](./ops.py) also includes operations that are not really "layers" per se, -but are often used to manipulate hidden unit representations during inference: - -Operation | TF-Slim Op ---------- | --------------------- -Flatten | [ops.flatten](ops.py) - -TF-Slim also provides a meta-operation called `repeat_op` that allows one to -repeatedly perform the same operation. Consider the following snippet from the -[VGG](https://www.robots.ox.ac.uk/~vgg/research/very_deep/) network whose layers -perform several convolutions in a row between pooling layers: - -```python -net = ... 
-net = slim.ops.conv2d(net, 256, [3, 3], scope='conv3_1') -net = slim.ops.conv2d(net, 256, [3, 3], scope='conv3_2') -net = slim.ops.conv2d(net, 256, [3, 3], scope='conv3_3') -net = slim.ops.max_pool(net, [2, 2], scope='pool3') -``` - -This clear duplication of code can be removed via a standard loop: - -```python -net = ... -for i in range(3): - net = slim.ops.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1)) -net = slim.ops.max_pool(net, [2, 2], scope='pool3') -``` - -While this does reduce the amount of duplication, it can be made even cleaner by -using the `RepeatOp`: - -```python -net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 256, [3, 3], scope='conv3') -net = slim.ops.max_pool(net, [2, 2], scope='pool2') -``` - -Notice that the RepeatOp not only applies the same argument in-line, it also is -smart enough to unroll the scopes such that the scopes assigned to each -subsequent call of `ops.conv2d` is appended with an underscore and iteration -number. More concretely, the scopes in the example above would be 'conv3_1', -'conv3_2' and 'conv3_3'. - -### Scopes - -In addition to the types of scope mechanisms in TensorFlow ([name_scope] -(https://www.tensorflow.org/api_docs/python/framework.html#name_scope), -[variable_scope] -(https://www.tensorflow.org/api_docs/python/state_ops.html#variable_scope), -TF-Slim adds a new scoping mechanism called "argument scope" or [arg_scope] -(scopes.py). This new scope allows a user to specify one or more operations and -a set of arguments which will be passed to each of the operations defined in the -`arg_scope`. This functionality is best illustrated by example. 
Consider the -following code snippet: - -```python -net = slim.ops.conv2d(inputs, 64, [11, 11], 4, padding='SAME', stddev=0.01, weight_decay=0.0005, scope='conv1') -net = slim.ops.conv2d(net, 128, [11, 11], padding='VALID', stddev=0.01, weight_decay=0.0005, scope='conv2') -net = slim.ops.conv2d(net, 256, [11, 11], padding='SAME', stddev=0.01, weight_decay=0.0005, scope='conv3') -``` - -It should be clear that these three Convolution layers share many of the same -hyperparameters. Two have the same padding, all three have the same weight_decay -and standard deviation of its weights. Not only do the duplicated values make -the code more difficult to read, it also adds the addition burder to the writer -of needing to doublecheck that all of the values are identical in each step. One -solution would be to specify default values using variables: - -```python -padding='SAME' -stddev=0.01 -weight_decay=0.0005 -net = slim.ops.conv2d(inputs, 64, [11, 11], 4, padding=padding, stddev=stddev, weight_decay=weight_decay, scope='conv1') -net = slim.ops.conv2d(net, 128, [11, 11], padding='VALID', stddev=stddev, weight_decay=weight_decay, scope='conv2') -net = slim.ops.conv2d(net, 256, [11, 11], padding=padding, stddev=stddev, weight_decay=weight_decay, scope='conv3') - -``` - -This solution ensures that all three convolutions share the exact same variable -values but doesn't reduce the code clutter. By using an `arg_scope`, we can both -ensure that each layer uses the same values and simplify the code: - -```python - with slim.arg_scope([slim.ops.conv2d], padding='SAME', stddev=0.01, weight_decay=0.0005): - net = slim.ops.conv2d(inputs, 64, [11, 11], scope='conv1') - net = slim.ops.conv2d(net, 128, [11, 11], padding='VALID', scope='conv2') - net = slim.ops.conv2d(net, 256, [11, 11], scope='conv3') -``` - -As the example illustrates, the use of arg_scope makes the code cleaner, simpler -and easier to maintain. 
Notice that while argument values are specifed in the -arg_scope, they can be overwritten locally. In particular, while the padding -argument has been set to 'SAME', the second convolution overrides it with the -value of 'VALID'. - -One can also nest `arg_scope`s and use multiple operations in the same scope. -For example: - -```python -with arg_scope([slim.ops.conv2d, slim.ops.fc], stddev=0.01, weight_decay=0.0005): - with arg_scope([slim.ops.conv2d], padding='SAME'), slim.arg_scope([slim.ops.fc], bias=1.0): - net = slim.ops.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1') - net = slim.ops.conv2d(net, 256, [5, 5], stddev=0.03, scope='conv2') - net = slim.ops.flatten(net) - net = slim.ops.fc(net, 1000, activation=None, scope='fc') -``` - -In this example, the first `arg_scope` applies the same `stddev` and -`weight_decay` arguments to the `conv2d` and `fc` ops in its scope. In the -second `arg_scope`, additional default arguments to `conv2d` only are specified. - -In addition to `arg_scope`, TF-Slim provides several decorators that wrap the -use of tensorflow arg scopes. These include `@AddArgScope`, `@AddNameScope`, -`@AddVariableScope`, `@AddOpScope` and `@AddVariableOpScope`. To illustrate -their use, consider the following example. - -```python -def MyNewOp(inputs): - varA = ... - varB = ... - outputs = tf.mul(varA, inputs) + varB - return outputs - -``` - -In this example, the user has created a new op which creates two variables. To -ensure that these variables exist within a certain variable scope (to avoid -collisions with variables with the same name), in standard TF, the op must be -called within a variable scope: - -```python -inputs = ... -with tf.variable_scope('layer1'): - outputs = MyNewOp(inputs) -``` - -As an alternative, one can use TF-Slim's decorators to decorate the function and -simplify the call: - -```python -@AddVariableScope -def MyNewOp(inputs): - ... - return outputs - - -inputs = ... 
-outputs = MyNewOp('layer1') -``` - -The `@AddVariableScope` decorater simply applies the `tf.variable_scope` scoping -to the called function taking "layer1" as its argument. This allows the code to -be written more concisely. - -### Losses - -The loss function defines a quantity that we want to minimize. For -classification problems, this is typically the cross entropy between the true -(one-hot) distribution and the predicted probability distribution across -classes. For regression problems, this is often the sum-of-squares differences -between the predicted and true values. - -Certain models, such as multi-task learning models, require the use of multiple -loss functions simultaneously. In other words, the loss function ultimatey being -minimized is the sum of various other loss functions. For example, consider a -model that predicts both the type of scene in an image as well as the depth from -the camera of each pixel. This model's loss function would be the sum of the -classification loss and depth prediction loss. - -TF-Slim provides an easy-to-use mechanism for defining and keeping track of loss -functions via the [losses.py](./losses.py) module. Consider the simple case -where we want to train the VGG network: - -```python -# Load the images and labels. -images, labels = ... - -# Create the model. -predictions = ... - -# Define the loss functions and get the total loss. -loss = losses.cross_entropy_loss(predictions, labels) -``` - -In this example, we start by creating the model (using TF-Slim's VGG -implementation), and add the standard classification loss. Now, lets turn to the -case where we have a multi-task model that produces multiple outputs: - -```python -# Load the images and labels. -images, scene_labels, depth_labels = ... - -# Create the model. -scene_predictions, depth_predictions = CreateMultiTaskModel(images) - -# Define the loss functions and get the total loss. 
-classification_loss = slim.losses.cross_entropy_loss(scene_predictions, scene_labels) -sum_of_squares_loss = slim.losses.l2loss(depth_predictions - depth_labels) - -# The following two lines have the same effect: -total_loss1 = classification_loss + sum_of_squares_loss -total_loss2 = tf.get_collection(slim.losses.LOSSES_COLLECTION) -``` - -In this example, we have two losses which we add by calling -`losses.cross_entropy_loss` and `losses.l2loss`. We can obtain the -total loss by adding them together (`total_loss1`) or by calling -`losses.GetTotalLoss()`. How did this work? When you create a loss function via -TF-Slim, TF-Slim adds the loss to a special TensorFlow collection of loss -functions. This enables you to either manage the total loss manually, or allow -TF-Slim to manage them for you. - -What if you want to let TF-Slim manage the losses for you but have a custom loss -function? [losses.py](./losses.py) also has a function that adds this loss to -TF-Slims collection. For example: - -```python -# Load the images and labels. -images, scene_labels, depth_labels, pose_labels = ... - -# Create the model. -scene_predictions, depth_predictions, pose_predictions = CreateMultiTaskModel(images) - -# Define the loss functions and get the total loss. -classification_loss = slim.losses.cross_entropy_loss(scene_predictions, scene_labels) -sum_of_squares_loss = slim.losses.l2loss(depth_predictions - depth_labels) -pose_loss = MyCustomLossFunction(pose_predictions, pose_labels) -tf.add_to_collection(slim.losses.LOSSES_COLLECTION, pose_loss) # Letting TF-Slim know about the additional loss. - -# The following two lines have the same effect: -total_loss1 = classification_loss + sum_of_squares_loss + pose_loss -total_loss2 = losses.GetTotalLoss() -``` - -In this example, we can again either produce the total loss function manually or -let TF-Slim know about the additional loss and let TF-Slim handle the losses. 
- -## Putting the Pieces Together - -By combining TF-Slim Variables, Operations and scopes, we can write a normally -very complex network with very few lines of code. For example, the entire [VGG] -(https://www.robots.ox.ac.uk/~vgg/research/very_deep/) architecture can be -defined with just the following snippet: - -```python -with arg_scope([slim.ops.conv2d, slim.ops.fc], stddev=0.01, weight_decay=0.0005): - net = slim.ops.repeat_op(2, inputs, slim.ops.conv2d, 64, [3, 3], scope='conv1') - net = slim.ops.max_pool(net, [2, 2], scope='pool1') - net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 128, [3, 3], scope='conv2') - net = slim.ops.max_pool(net, [2, 2], scope='pool2') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 256, [3, 3], scope='conv3') - net = slim.ops.max_pool(net, [2, 2], scope='pool3') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv4') - net = slim.ops.max_pool(net, [2, 2], scope='pool4') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv5') - net = slim.ops.max_pool(net, [2, 2], scope='pool5') - net = slim.ops.flatten(net, scope='flatten5') - net = slim.ops.fc(net, 4096, scope='fc6') - net = slim.ops.dropout(net, 0.5, scope='dropout6') - net = slim.ops.fc(net, 4096, scope='fc7') - net = slim.ops.dropout(net, 0.5, scope='dropout7') - net = slim.ops.fc(net, 1000, activation=None, scope='fc8') -return net -``` - -## Re-using previously defined network architectures and pre-trained models. - -### Brief Recap on Restoring Variables from a Checkpoint - -After a model has been trained, it can be restored using `tf.train.Saver()` -which restores `Variables` from a given checkpoint. For many cases, -`tf.train.Saver()` provides a simple mechanism to restore all or just a few -variables. - -```python -# Create some variables. -v1 = tf.Variable(..., name="v1") -v2 = tf.Variable(..., name="v2") -... -# Add ops to restore all the variables. 
-restorer = tf.train.Saver() - -# Add ops to restore some variables. -restorer = tf.train.Saver([v1, v2]) - -# Later, launch the model, use the saver to restore variables from disk, and -# do some work with the model. -with tf.Session() as sess: - # Restore variables from disk. - restorer.restore(sess, "/tmp/model.ckpt") - print("Model restored.") - # Do some work with the model - ... -``` - -See [Restoring Variables] -(https://www.tensorflow.org/versions/r0.7/how_tos/variables/index.html#restoring-variables) -and [Choosing which Variables to Save and Restore] -(https://www.tensorflow.org/versions/r0.7/how_tos/variables/index.html#choosing-which-variables-to-save-and-restore) -sections of the [Variables] -(https://www.tensorflow.org/versions/r0.7/how_tos/variables/index.html) page for -more details. - -### Using slim.variables to Track which Variables need to be Restored - -It is often desirable to fine-tune a pre-trained model on an entirely new -dataset or even a new task. In these situations, one must specify which layers -of the model should be reused (and consequently loaded from a checkpoint) and -which layers are new. Indicating which variables or layers should be restored is -a process that quickly becomes cumbersome when done manually. - -To help keep track of which variables to restore, `slim.variables` provides a -`restore` argument when creating each Variable. By default, all variables are -marked as `restore=True`, which results in all variables defined by the model -being restored. - -```python -# Create some variables. -v1 = slim.variables.variable(name="v1", ..., restore=False) -v2 = slim.variables.variable(name="v2", ...) # By default restore=True -... -# Get list of variables to restore (which contains only 'v2') -variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE) -restorer = tf.train.Saver(variables_to_restore) -with tf.Session() as sess: - # Restore variables from disk. 
- restorer.restore(sess, "/tmp/model.ckpt") - print("Model restored.") - # Do some work with the model - ... -``` - -Additionally, every layer in `slim.ops` that creates slim.variables (such as -`slim.ops.conv2d`, `slim.ops.fc`, `slim.ops.batch_norm`) also has a `restore` -argument which controls whether the variables created by that layer should be -restored or not. - -```python -# Create a small network. -net = slim.ops.conv2d(images, 32, [7, 7], stride=2, scope='conv1') -net = slim.ops.conv2d(net, 64, [3, 3], scope='conv2') -net = slim.ops.conv2d(net, 128, [3, 3], scope='conv3') -net = slim.ops.max_pool(net, [3, 3], stride=2, scope='pool3') -net = slim.ops.flatten(net) -net = slim.ops.fc(net, 10, scope='logits', restore=False) -... - -# VARIABLES_TO_RESTORE would contain the 'weights' and 'bias' defined by 'conv1' -# 'conv2' and 'conv3' but not the ones defined by 'logits' -variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE) - -# Create a restorer that would restore only the needed variables. -restorer = tf.train.Saver(variables_to_restore) - -# Create a saver that would save all the variables (including 'logits'). -saver = tf.train.Saver() -with tf.Session() as sess: - # Restore variables from disk. - restorer.restore(sess, "/tmp/model.ckpt") - print("Model restored.") - - # Do some work with the model - ... - saver.save(sess, "/tmp/new_model.ckpt") -``` - -Note: When restoring variables from a checkpoint, the `Saver` locates the -variable names in a checkpoint file and maps them to variables in the current -graph. Above, we created a saver by passing to it a list of variables. In this -case, the names of the variables to locate in the checkpoint file were -implicitly obtained from each provided variable's `var.op.name`. - -This works well when the variable names in the checkpoint file match those in -the graph. 
However, sometimes, we want to restore a model from a checkpoint -whose variables have different names those in the current graph. In this case, -we must provide the `Saver` a dictionary that maps from each checkpoint variable -name to each graph variable. Consider the following example where the checkpoint -variables names are obtained via a simple function: - -```python -# Assuming that 'conv1/weights' should be restored from 'vgg16/conv1/weights' -def name_in_checkpoint(var): - return 'vgg16/' + var.op.name - -# Assuming that 'conv1/weights' and 'conv1/bias' should be restored from 'conv1/params1' and 'conv1/params2' -def name_in_checkpoint(var): - if "weights" in var.op.name: - return var.op.name.replace("weights", "params1") - if "bias" in var.op.name: - return var.op.name.replace("bias", "params2") - -variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE) -variables_to_restore = {name_in_checkpoint(var):var for var in variables_to_restore} -restorer = tf.train.Saver(variables_to_restore) -with tf.Session() as sess: - # Restore variables from disk. - restorer.restore(sess, "/tmp/model.ckpt") -``` - -### Reusing the VGG16 network defined in TF-Slim on a different task, i.e. PASCAL-VOC. - -Assuming one have already a pre-trained VGG16 model, one just need to replace -the last layer `fc8` with a new layer `fc8_pascal` and use `restore=False`. 
- -```python -def vgg16_pascal(inputs): - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], stddev=0.01, weight_decay=0.0005): - net = slim.ops.repeat_op(2, inputs, slim.ops.conv2d, 64, [3, 3], scope='conv1') - net = slim.ops.max_pool(net, [2, 2], scope='pool1') - net = slim.ops.repeat_op(2, net, slim.ops.conv2d, 128, [3, 3], scope='conv2') - net = slim.ops.max_pool(net, [2, 2], scope='pool2') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 256, [3, 3], scope='conv3') - net = slim.ops.max_pool(net, [2, 2], scope='pool3') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv4') - net = slim.ops.max_pool(net, [2, 2], scope='pool4') - net = slim.ops.repeat_op(3, net, slim.ops.conv2d, 512, [3, 3], scope='conv5') - net = slim.ops.max_pool(net, [2, 2], scope='pool5') - net = slim.ops.flatten(net, scope='flatten5') - net = slim.ops.fc(net, 4096, scope='fc6') - net = slim.ops.dropout(net, 0.5, scope='dropout6') - net = slim.ops.fc(net, 4096, scope='fc7') - net = slim.ops.dropout(net, 0.5, scope='dropout7') - # To reuse vgg16 on PASCAL-VOC, just change the last layer. - net = slim.ops.fc(net, 21, activation=None, scope='fc8_pascal', restore=False) - return net -``` - -## Authors - -Sergio Guadarrama and Nathan Silberman diff --git a/examples/imagenet/inception/slim/__init__.py b/examples/imagenet/inception/slim/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/imagenet/inception/slim/collections_test.py b/examples/imagenet/inception/slim/collections_test.py deleted file mode 100644 index 2a1f170e..00000000 --- a/examples/imagenet/inception/slim/collections_test.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for inception.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from inception.slim import slim - - -def get_variables(scope=None): - return slim.variables.get_variables(scope) - - -def get_variables_by_name(name): - return slim.variables.get_variables_by_name(name) - - -class CollectionsTest(tf.test.TestCase): - - def testVariables(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d], - batch_norm_params={'decay': 0.9997}): - slim.inception.inception_v3(inputs) - self.assertEqual(len(get_variables()), 388) - self.assertEqual(len(get_variables_by_name('weights')), 98) - self.assertEqual(len(get_variables_by_name('biases')), 2) - self.assertEqual(len(get_variables_by_name('beta')), 96) - self.assertEqual(len(get_variables_by_name('gamma')), 0) - self.assertEqual(len(get_variables_by_name('moving_mean')), 96) - self.assertEqual(len(get_variables_by_name('moving_variance')), 96) - - def testVariablesWithoutBatchNorm(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d], - batch_norm_params=None): - slim.inception.inception_v3(inputs) - self.assertEqual(len(get_variables()), 196) - 
self.assertEqual(len(get_variables_by_name('weights')), 98) - self.assertEqual(len(get_variables_by_name('biases')), 98) - self.assertEqual(len(get_variables_by_name('beta')), 0) - self.assertEqual(len(get_variables_by_name('gamma')), 0) - self.assertEqual(len(get_variables_by_name('moving_mean')), 0) - self.assertEqual(len(get_variables_by_name('moving_variance')), 0) - - def testVariablesByLayer(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d], - batch_norm_params={'decay': 0.9997}): - slim.inception.inception_v3(inputs) - self.assertEqual(len(get_variables()), 388) - self.assertEqual(len(get_variables('conv0')), 4) - self.assertEqual(len(get_variables('conv1')), 4) - self.assertEqual(len(get_variables('conv2')), 4) - self.assertEqual(len(get_variables('conv3')), 4) - self.assertEqual(len(get_variables('conv4')), 4) - self.assertEqual(len(get_variables('mixed_35x35x256a')), 28) - self.assertEqual(len(get_variables('mixed_35x35x288a')), 28) - self.assertEqual(len(get_variables('mixed_35x35x288b')), 28) - self.assertEqual(len(get_variables('mixed_17x17x768a')), 16) - self.assertEqual(len(get_variables('mixed_17x17x768b')), 40) - self.assertEqual(len(get_variables('mixed_17x17x768c')), 40) - self.assertEqual(len(get_variables('mixed_17x17x768d')), 40) - self.assertEqual(len(get_variables('mixed_17x17x768e')), 40) - self.assertEqual(len(get_variables('mixed_8x8x2048a')), 36) - self.assertEqual(len(get_variables('mixed_8x8x2048b')), 36) - self.assertEqual(len(get_variables('logits')), 2) - self.assertEqual(len(get_variables('aux_logits')), 10) - - def testVariablesToRestore(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d], - batch_norm_params={'decay': 0.9997}): - slim.inception.inception_v3(inputs) - 
variables_to_restore = tf.get_collection( - slim.variables.VARIABLES_TO_RESTORE) - self.assertEqual(len(variables_to_restore), 388) - self.assertListEqual(variables_to_restore, get_variables()) - - def testVariablesToRestoreWithoutLogits(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d], - batch_norm_params={'decay': 0.9997}): - slim.inception.inception_v3(inputs, restore_logits=False) - variables_to_restore = tf.get_collection( - slim.variables.VARIABLES_TO_RESTORE) - self.assertEqual(len(variables_to_restore), 384) - - def testRegularizationLosses(self): - batch_size = 5 - height, width = 299, 299 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], weight_decay=0.00004): - slim.inception.inception_v3(inputs) - losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(losses), len(get_variables_by_name('weights'))) - - def testTotalLossWithoutRegularization(self): - batch_size = 5 - height, width = 299, 299 - num_classes = 1001 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - dense_labels = tf.random_uniform((batch_size, num_classes)) - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], weight_decay=0): - logits, end_points = slim.inception.inception_v3( - inputs, - num_classes=num_classes) - # Cross entropy loss for the main softmax prediction. - slim.losses.cross_entropy_loss(logits, - dense_labels, - label_smoothing=0.1, - weight=1.0) - # Cross entropy loss for the auxiliary softmax head. 
- slim.losses.cross_entropy_loss(end_points['aux_logits'], - dense_labels, - label_smoothing=0.1, - weight=0.4, - scope='aux_loss') - losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) - self.assertEqual(len(losses), 2) - - def testTotalLossWithRegularization(self): - batch_size = 5 - height, width = 299, 299 - num_classes = 1000 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - dense_labels = tf.random_uniform((batch_size, num_classes)) - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], weight_decay=0.00004): - logits, end_points = slim.inception.inception_v3(inputs, num_classes) - # Cross entropy loss for the main softmax prediction. - slim.losses.cross_entropy_loss(logits, - dense_labels, - label_smoothing=0.1, - weight=1.0) - # Cross entropy loss for the auxiliary softmax head. - slim.losses.cross_entropy_loss(end_points['aux_logits'], - dense_labels, - label_smoothing=0.1, - weight=0.4, - scope='aux_loss') - losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) - self.assertEqual(len(losses), 2) - reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - self.assertEqual(len(reg_losses), 98) - - -if __name__ == '__main__': - tf.test.main() diff --git a/examples/imagenet/inception/slim/inception_model.py b/examples/imagenet/inception/slim/inception_model.py deleted file mode 100644 index 6136ab1b..00000000 --- a/examples/imagenet/inception/slim/inception_model.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Inception-v3 expressed in TensorFlow-Slim. - - Usage: - - # Parameters for BatchNorm. - batch_norm_params = { - # Decay for the batch_norm moving averages. - 'decay': BATCHNORM_MOVING_AVERAGE_DECAY, - # epsilon to prevent 0s in variance. - 'epsilon': 0.001, - } - # Set weight_decay for weights in Conv and FC layers. - with slim.arg_scope([slim.ops.conv2d, slim.ops.fc], weight_decay=0.00004): - with slim.arg_scope([slim.ops.conv2d], - stddev=0.1, - activation=tf.nn.relu, - batch_norm_params=batch_norm_params): - # Force all Variables to reside on the CPU. - with slim.arg_scope([slim.variables.variable], device='/cpu:0'): - logits, endpoints = slim.inception.inception_v3( - images, - dropout_keep_prob=0.8, - num_classes=num_classes, - is_training=for_training, - restore_logits=restore_logits, - scope=scope) -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from inception.slim import ops -from inception.slim import scopes - - -def inception_v3(inputs, - dropout_keep_prob=0.8, - num_classes=1000, - is_training=True, - restore_logits=True, - scope=''): - """Latest Inception from http://arxiv.org/abs/1512.00567. - - "Rethinking the Inception Architecture for Computer Vision" - - Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, - Zbigniew Wojna - - Args: - inputs: a tensor of size [batch_size, height, width, channels]. - dropout_keep_prob: dropout keep_prob. - num_classes: number of predicted classes. - is_training: whether is training or not. - restore_logits: whether or not the logits layers should be restored. - Useful for fine-tuning a model with different num_classes. - scope: Optional scope for name_scope. 
- - Returns: - a list containing 'logits', 'aux_logits' Tensors. - """ - # end_points will collect relevant activations for external use, for example - # summaries or losses. - end_points = {} - with tf.name_scope(scope, 'inception_v3', [inputs]): - with scopes.arg_scope([ops.conv2d, ops.fc, ops.batch_norm, ops.dropout], - is_training=is_training): - with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool], - stride=1, padding='VALID'): - # 299 x 299 x 3 - end_points['conv0'] = ops.conv2d(inputs, 32, [3, 3], stride=2, - scope='conv0') - # 149 x 149 x 32 - end_points['conv1'] = ops.conv2d(end_points['conv0'], 32, [3, 3], - scope='conv1') - # 147 x 147 x 32 - end_points['conv2'] = ops.conv2d(end_points['conv1'], 64, [3, 3], - padding='SAME', scope='conv2') - # 147 x 147 x 64 - end_points['pool1'] = ops.max_pool(end_points['conv2'], [3, 3], - stride=2, scope='pool1') - # 73 x 73 x 64 - end_points['conv3'] = ops.conv2d(end_points['pool1'], 80, [1, 1], - scope='conv3') - # 73 x 73 x 80. - end_points['conv4'] = ops.conv2d(end_points['conv3'], 192, [3, 3], - scope='conv4') - # 71 x 71 x 192. - end_points['pool2'] = ops.max_pool(end_points['conv4'], [3, 3], - stride=2, scope='pool2') - # 35 x 35 x 192. - net = end_points['pool2'] - # Inception blocks - with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool], - stride=1, padding='SAME'): - # mixed: 35 x 35 x 256. 
- with tf.variable_scope('mixed_35x35x256a'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 64, [1, 1]) - with tf.variable_scope('branch5x5'): - branch5x5 = ops.conv2d(net, 48, [1, 1]) - branch5x5 = ops.conv2d(branch5x5, 64, [5, 5]) - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 64, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 32, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) - end_points['mixed_35x35x256a'] = net - # mixed_1: 35 x 35 x 288. - with tf.variable_scope('mixed_35x35x288a'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 64, [1, 1]) - with tf.variable_scope('branch5x5'): - branch5x5 = ops.conv2d(net, 48, [1, 1]) - branch5x5 = ops.conv2d(branch5x5, 64, [5, 5]) - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 64, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 64, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) - end_points['mixed_35x35x288a'] = net - # mixed_2: 35 x 35 x 288. 
- with tf.variable_scope('mixed_35x35x288b'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 64, [1, 1]) - with tf.variable_scope('branch5x5'): - branch5x5 = ops.conv2d(net, 48, [1, 1]) - branch5x5 = ops.conv2d(branch5x5, 64, [5, 5]) - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 64, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 64, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch5x5, branch3x3dbl, branch_pool]) - end_points['mixed_35x35x288b'] = net - # mixed_3: 17 x 17 x 768. - with tf.variable_scope('mixed_17x17x768a'): - with tf.variable_scope('branch3x3'): - branch3x3 = ops.conv2d(net, 384, [3, 3], stride=2, padding='VALID') - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 64, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3], - stride=2, padding='VALID') - with tf.variable_scope('branch_pool'): - branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID') - net = tf.concat(axis=3, values=[branch3x3, branch3x3dbl, branch_pool]) - end_points['mixed_17x17x768a'] = net - # mixed4: 17 x 17 x 768. 
- with tf.variable_scope('mixed_17x17x768b'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 192, [1, 1]) - with tf.variable_scope('branch7x7'): - branch7x7 = ops.conv2d(net, 128, [1, 1]) - branch7x7 = ops.conv2d(branch7x7, 128, [1, 7]) - branch7x7 = ops.conv2d(branch7x7, 192, [7, 1]) - with tf.variable_scope('branch7x7dbl'): - branch7x7dbl = ops.conv2d(net, 128, [1, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [1, 7]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) - end_points['mixed_17x17x768b'] = net - # mixed_5: 17 x 17 x 768. - with tf.variable_scope('mixed_17x17x768c'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 192, [1, 1]) - with tf.variable_scope('branch7x7'): - branch7x7 = ops.conv2d(net, 160, [1, 1]) - branch7x7 = ops.conv2d(branch7x7, 160, [1, 7]) - branch7x7 = ops.conv2d(branch7x7, 192, [7, 1]) - with tf.variable_scope('branch7x7dbl'): - branch7x7dbl = ops.conv2d(net, 160, [1, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) - end_points['mixed_17x17x768c'] = net - # mixed_6: 17 x 17 x 768. 
- with tf.variable_scope('mixed_17x17x768d'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 192, [1, 1]) - with tf.variable_scope('branch7x7'): - branch7x7 = ops.conv2d(net, 160, [1, 1]) - branch7x7 = ops.conv2d(branch7x7, 160, [1, 7]) - branch7x7 = ops.conv2d(branch7x7, 192, [7, 1]) - with tf.variable_scope('branch7x7dbl'): - branch7x7dbl = ops.conv2d(net, 160, [1, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) - end_points['mixed_17x17x768d'] = net - # mixed_7: 17 x 17 x 768. - with tf.variable_scope('mixed_17x17x768e'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 192, [1, 1]) - with tf.variable_scope('branch7x7'): - branch7x7 = ops.conv2d(net, 192, [1, 1]) - branch7x7 = ops.conv2d(branch7x7, 192, [1, 7]) - branch7x7 = ops.conv2d(branch7x7, 192, [7, 1]) - with tf.variable_scope('branch7x7dbl'): - branch7x7dbl = ops.conv2d(net, 192, [1, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1]) - branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch7x7, branch7x7dbl, branch_pool]) - end_points['mixed_17x17x768e'] = net - # Auxiliary Head logits - aux_logits = tf.identity(end_points['mixed_17x17x768e']) - with tf.variable_scope('aux_logits'): - aux_logits = ops.avg_pool(aux_logits, [5, 5], stride=3, - padding='VALID') - aux_logits = 
ops.conv2d(aux_logits, 128, [1, 1], scope='proj') - # Shape of feature map before the final layer. - shape = aux_logits.get_shape() - aux_logits = ops.conv2d(aux_logits, 768, shape[1:3], stddev=0.01, - padding='VALID') - aux_logits = ops.flatten(aux_logits) - aux_logits = ops.fc(aux_logits, num_classes, activation=None, - stddev=0.001, restore=restore_logits) - end_points['aux_logits'] = aux_logits - # mixed_8: 8 x 8 x 1280. - # Note that the scope below is not changed to not void previous - # checkpoints. - # (TODO) Fix the scope when appropriate. - with tf.variable_scope('mixed_17x17x1280a'): - with tf.variable_scope('branch3x3'): - branch3x3 = ops.conv2d(net, 192, [1, 1]) - branch3x3 = ops.conv2d(branch3x3, 320, [3, 3], stride=2, - padding='VALID') - with tf.variable_scope('branch7x7x3'): - branch7x7x3 = ops.conv2d(net, 192, [1, 1]) - branch7x7x3 = ops.conv2d(branch7x7x3, 192, [1, 7]) - branch7x7x3 = ops.conv2d(branch7x7x3, 192, [7, 1]) - branch7x7x3 = ops.conv2d(branch7x7x3, 192, [3, 3], - stride=2, padding='VALID') - with tf.variable_scope('branch_pool'): - branch_pool = ops.max_pool(net, [3, 3], stride=2, padding='VALID') - net = tf.concat(axis=3, values=[branch3x3, branch7x7x3, branch_pool]) - end_points['mixed_17x17x1280a'] = net - # mixed_9: 8 x 8 x 2048. 
- with tf.variable_scope('mixed_8x8x2048a'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 320, [1, 1]) - with tf.variable_scope('branch3x3'): - branch3x3 = ops.conv2d(net, 384, [1, 1]) - branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]), - ops.conv2d(branch3x3, 384, [3, 1])]) - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 448, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3]) - branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]), - ops.conv2d(branch3x3dbl, 384, [3, 1])]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool]) - end_points['mixed_8x8x2048a'] = net - # mixed_10: 8 x 8 x 2048. - with tf.variable_scope('mixed_8x8x2048b'): - with tf.variable_scope('branch1x1'): - branch1x1 = ops.conv2d(net, 320, [1, 1]) - with tf.variable_scope('branch3x3'): - branch3x3 = ops.conv2d(net, 384, [1, 1]) - branch3x3 = tf.concat(axis=3, values=[ops.conv2d(branch3x3, 384, [1, 3]), - ops.conv2d(branch3x3, 384, [3, 1])]) - with tf.variable_scope('branch3x3dbl'): - branch3x3dbl = ops.conv2d(net, 448, [1, 1]) - branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3]) - branch3x3dbl = tf.concat(axis=3, values=[ops.conv2d(branch3x3dbl, 384, [1, 3]), - ops.conv2d(branch3x3dbl, 384, [3, 1])]) - with tf.variable_scope('branch_pool'): - branch_pool = ops.avg_pool(net, [3, 3]) - branch_pool = ops.conv2d(branch_pool, 192, [1, 1]) - net = tf.concat(axis=3, values=[branch1x1, branch3x3, branch3x3dbl, branch_pool]) - end_points['mixed_8x8x2048b'] = net - # Final pooling and prediction - with tf.variable_scope('logits'): - shape = net.get_shape() - net = ops.avg_pool(net, shape[1:3], padding='VALID', scope='pool') - # 1 x 1 x 2048 - net = ops.dropout(net, dropout_keep_prob, scope='dropout') - net = ops.flatten(net, 
scope='flatten') - # 2048 - logits = ops.fc(net, num_classes, activation=None, scope='logits', - restore=restore_logits) - # 1000 - end_points['logits'] = logits - end_points['predictions'] = tf.nn.softmax(logits, name='predictions') - return logits, end_points - - -def inception_v3_parameters(weight_decay=0.00004, stddev=0.1, - batch_norm_decay=0.9997, batch_norm_epsilon=0.001): - """Yields the scope with the default parameters for inception_v3. - - Args: - weight_decay: the weight decay for weights variables. - stddev: standard deviation of the truncated guassian weight distribution. - batch_norm_decay: decay for the moving average of batch_norm momentums. - batch_norm_epsilon: small float added to variance to avoid dividing by zero. - - Yields: - a arg_scope with the parameters needed for inception_v3. - """ - # Set weight_decay for weights in Conv and FC layers. - with scopes.arg_scope([ops.conv2d, ops.fc], - weight_decay=weight_decay): - # Set stddev, activation and parameters for batch_norm. - with scopes.arg_scope([ops.conv2d], - stddev=stddev, - activation=tf.nn.relu, - batch_norm_params={ - 'decay': batch_norm_decay, - 'epsilon': batch_norm_epsilon}) as arg_scope: - yield arg_scope diff --git a/examples/imagenet/inception/slim/inception_test.py b/examples/imagenet/inception/slim/inception_test.py deleted file mode 100644 index 231dea29..00000000 --- a/examples/imagenet/inception/slim/inception_test.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for slim.inception.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from inception.slim import inception_model as inception - - -class InceptionTest(tf.test.TestCase): - - def testBuildLogits(self): - batch_size = 5 - height, width = 299, 299 - num_classes = 1000 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - logits, _ = inception.inception_v3(inputs, num_classes) - self.assertTrue(logits.op.name.startswith('logits')) - self.assertListEqual(logits.get_shape().as_list(), - [batch_size, num_classes]) - - def testBuildEndPoints(self): - batch_size = 5 - height, width = 299, 299 - num_classes = 1000 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - _, end_points = inception.inception_v3(inputs, num_classes) - self.assertTrue('logits' in end_points) - logits = end_points['logits'] - self.assertListEqual(logits.get_shape().as_list(), - [batch_size, num_classes]) - self.assertTrue('aux_logits' in end_points) - aux_logits = end_points['aux_logits'] - self.assertListEqual(aux_logits.get_shape().as_list(), - [batch_size, num_classes]) - pre_pool = end_points['mixed_8x8x2048b'] - self.assertListEqual(pre_pool.get_shape().as_list(), - [batch_size, 8, 8, 2048]) - - def testVariablesSetDevice(self): - batch_size = 5 - height, width = 299, 299 - num_classes = 1000 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - # Force all Variables to reside on the device. 
- with tf.variable_scope('on_cpu'), tf.device('/cpu:0'): - inception.inception_v3(inputs, num_classes) - with tf.variable_scope('on_gpu'), tf.device('/gpu:0'): - inception.inception_v3(inputs, num_classes) - for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'): - self.assertDeviceEqual(v.device, '/cpu:0') - for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'): - self.assertDeviceEqual(v.device, '/gpu:0') - - def testHalfSizeImages(self): - batch_size = 5 - height, width = 150, 150 - num_classes = 1000 - with self.test_session(): - inputs = tf.random_uniform((batch_size, height, width, 3)) - logits, end_points = inception.inception_v3(inputs, num_classes) - self.assertTrue(logits.op.name.startswith('logits')) - self.assertListEqual(logits.get_shape().as_list(), - [batch_size, num_classes]) - pre_pool = end_points['mixed_8x8x2048b'] - self.assertListEqual(pre_pool.get_shape().as_list(), - [batch_size, 3, 3, 2048]) - - def testUnknowBatchSize(self): - batch_size = 1 - height, width = 299, 299 - num_classes = 1000 - with self.test_session() as sess: - inputs = tf.placeholder(tf.float32, (None, height, width, 3)) - logits, _ = inception.inception_v3(inputs, num_classes) - self.assertTrue(logits.op.name.startswith('logits')) - self.assertListEqual(logits.get_shape().as_list(), - [None, num_classes]) - images = tf.random_uniform((batch_size, height, width, 3)) - sess.run(tf.global_variables_initializer()) - output = sess.run(logits, {inputs: images.eval()}) - self.assertEquals(output.shape, (batch_size, num_classes)) - - def testEvaluation(self): - batch_size = 2 - height, width = 299, 299 - num_classes = 1000 - with self.test_session() as sess: - eval_inputs = tf.random_uniform((batch_size, height, width, 3)) - logits, _ = inception.inception_v3(eval_inputs, num_classes, - is_training=False) - predictions = tf.argmax(logits, 1) - sess.run(tf.global_variables_initializer()) - output = sess.run(predictions) - 
self.assertEquals(output.shape, (batch_size,)) - - def testTrainEvalWithReuse(self): - train_batch_size = 5 - eval_batch_size = 2 - height, width = 150, 150 - num_classes = 1000 - with self.test_session() as sess: - train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) - inception.inception_v3(train_inputs, num_classes) - tf.get_variable_scope().reuse_variables() - eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) - logits, _ = inception.inception_v3(eval_inputs, num_classes, - is_training=False) - predictions = tf.argmax(logits, 1) - sess.run(tf.global_variables_initializer()) - output = sess.run(predictions) - self.assertEquals(output.shape, (eval_batch_size,)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/examples/imagenet/inception/slim/losses.py b/examples/imagenet/inception/slim/losses.py deleted file mode 100644 index 78298d09..00000000 --- a/examples/imagenet/inception/slim/losses.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Contains convenience wrappers for various Neural Network TensorFlow losses. - - All the losses defined here add themselves to the LOSSES_COLLECTION - collection. - - l1_loss: Define a L1 Loss, useful for regularization, i.e. lasso. - l2_loss: Define a L2 Loss, useful for regularization, i.e. weight decay. 
- cross_entropy_loss: Define a cross entropy loss using - softmax_cross_entropy_with_logits. Useful for classification. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -# In order to gather all losses in a network, the user should use this -# key for get_collection, i.e: -# losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) -LOSSES_COLLECTION = '_losses' - - -def l1_regularizer(weight=1.0, scope=None): - """Define a L1 regularizer. - - Args: - weight: scale the loss by this factor. - scope: Optional scope for name_scope. - - Returns: - a regularizer function. - """ - def regularizer(tensor): - with tf.name_scope(scope, 'L1Regularizer', [tensor]): - l1_weight = tf.convert_to_tensor(weight, - dtype=tensor.dtype.base_dtype, - name='weight') - return tf.multiply(l1_weight, tf.reduce_sum(tf.abs(tensor)), name='value') - return regularizer - - -def l2_regularizer(weight=1.0, scope=None): - """Define a L2 regularizer. - - Args: - weight: scale the loss by this factor. - scope: Optional scope for name_scope. - - Returns: - a regularizer function. - """ - def regularizer(tensor): - with tf.name_scope(scope, 'L2Regularizer', [tensor]): - l2_weight = tf.convert_to_tensor(weight, - dtype=tensor.dtype.base_dtype, - name='weight') - return tf.multiply(l2_weight, tf.nn.l2_loss(tensor), name='value') - return regularizer - - -def l1_l2_regularizer(weight_l1=1.0, weight_l2=1.0, scope=None): - """Define a L1L2 regularizer. - - Args: - weight_l1: scale the L1 loss by this factor. - weight_l2: scale the L2 loss by this factor. - scope: Optional scope for name_scope. - - Returns: - a regularizer function. 
- """ - def regularizer(tensor): - with tf.name_scope(scope, 'L1L2Regularizer', [tensor]): - weight_l1_t = tf.convert_to_tensor(weight_l1, - dtype=tensor.dtype.base_dtype, - name='weight_l1') - weight_l2_t = tf.convert_to_tensor(weight_l2, - dtype=tensor.dtype.base_dtype, - name='weight_l2') - reg_l1 = tf.multiply(weight_l1_t, tf.reduce_sum(tf.abs(tensor)), - name='value_l1') - reg_l2 = tf.multiply(weight_l2_t, tf.nn.l2_loss(tensor), - name='value_l2') - return tf.add(reg_l1, reg_l2, name='value') - return regularizer - - -def l1_loss(tensor, weight=1.0, scope=None): - """Define a L1Loss, useful for regularize, i.e. lasso. - - Args: - tensor: tensor to regularize. - weight: scale the loss by this factor. - scope: Optional scope for name_scope. - - Returns: - the L1 loss op. - """ - with tf.name_scope(scope, 'L1Loss', [tensor]): - weight = tf.convert_to_tensor(weight, - dtype=tensor.dtype.base_dtype, - name='loss_weight') - loss = tf.multiply(weight, tf.reduce_sum(tf.abs(tensor)), name='value') - tf.add_to_collection(LOSSES_COLLECTION, loss) - return loss - - -def l2_loss(tensor, weight=1.0, scope=None): - """Define a L2Loss, useful for regularize, i.e. weight decay. - - Args: - tensor: tensor to regularize. - weight: an optional weight to modulate the loss. - scope: Optional scope for name_scope. - - Returns: - the L2 loss op. - """ - with tf.name_scope(scope, 'L2Loss', [tensor]): - weight = tf.convert_to_tensor(weight, - dtype=tensor.dtype.base_dtype, - name='loss_weight') - loss = tf.multiply(weight, tf.nn.l2_loss(tensor), name='value') - tf.add_to_collection(LOSSES_COLLECTION, loss) - return loss - - -def cross_entropy_loss(logits, one_hot_labels, label_smoothing=0, - weight=1.0, scope=None): - """Define a Cross Entropy loss using softmax_cross_entropy_with_logits. - - It can scale the loss by weight factor, and smooth the labels. - - Args: - logits: [batch_size, num_classes] logits outputs of the network . 
- one_hot_labels: [batch_size, num_classes] target one_hot_encoded labels. - label_smoothing: if greater than 0 then smooth the labels. - weight: scale the loss by this factor. - scope: Optional scope for name_scope. - - Returns: - A tensor with the softmax_cross_entropy loss. - """ - logits.get_shape().assert_is_compatible_with(one_hot_labels.get_shape()) - with tf.name_scope(scope, 'CrossEntropyLoss', [logits, one_hot_labels]): - num_classes = one_hot_labels.get_shape()[-1].value - one_hot_labels = tf.cast(one_hot_labels, logits.dtype) - if label_smoothing > 0: - smooth_positives = 1.0 - label_smoothing - smooth_negatives = label_smoothing / num_classes - one_hot_labels = one_hot_labels * smooth_positives + smooth_negatives - cross_entropy = tf.contrib.nn.deprecated_flipped_softmax_cross_entropy_with_logits( - logits, one_hot_labels, name='xentropy') - - weight = tf.convert_to_tensor(weight, - dtype=logits.dtype.base_dtype, - name='loss_weight') - loss = tf.multiply(weight, tf.reduce_mean(cross_entropy), name='value') - tf.add_to_collection(LOSSES_COLLECTION, loss) - return loss diff --git a/examples/imagenet/inception/slim/losses_test.py b/examples/imagenet/inception/slim/losses_test.py deleted file mode 100644 index e267f652..00000000 --- a/examples/imagenet/inception/slim/losses_test.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for slim.losses.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -import tensorflow as tf - -from inception.slim import losses - - -class LossesTest(tf.test.TestCase): - - def testL1Loss(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - weights = tf.constant(1.0, shape=shape) - wd = 0.01 - loss = losses.l1_loss(weights, wd) - self.assertEquals(loss.op.name, 'L1Loss/value') - self.assertAlmostEqual(loss.eval(), num_elem * wd, 5) - - def testL2Loss(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - weights = tf.constant(1.0, shape=shape) - wd = 0.01 - loss = losses.l2_loss(weights, wd) - self.assertEquals(loss.op.name, 'L2Loss/value') - self.assertAlmostEqual(loss.eval(), num_elem * wd / 2, 5) - - -class RegularizersTest(tf.test.TestCase): - - def testL1Regularizer(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l1_regularizer()(tensor) - self.assertEquals(loss.op.name, 'L1Regularizer/value') - self.assertAlmostEqual(loss.eval(), num_elem, 5) - - def testL1RegularizerWithScope(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l1_regularizer(scope='L1')(tensor) - self.assertEquals(loss.op.name, 'L1/value') - self.assertAlmostEqual(loss.eval(), num_elem, 5) - - def testL1RegularizerWithWeight(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - weight = 0.01 - loss = losses.l1_regularizer(weight)(tensor) - self.assertEquals(loss.op.name, 'L1Regularizer/value') - self.assertAlmostEqual(loss.eval(), num_elem * weight, 5) - - def testL2Regularizer(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 
5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l2_regularizer()(tensor) - self.assertEquals(loss.op.name, 'L2Regularizer/value') - self.assertAlmostEqual(loss.eval(), num_elem / 2, 5) - - def testL2RegularizerWithScope(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l2_regularizer(scope='L2')(tensor) - self.assertEquals(loss.op.name, 'L2/value') - self.assertAlmostEqual(loss.eval(), num_elem / 2, 5) - - def testL2RegularizerWithWeight(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - weight = 0.01 - loss = losses.l2_regularizer(weight)(tensor) - self.assertEquals(loss.op.name, 'L2Regularizer/value') - self.assertAlmostEqual(loss.eval(), num_elem * weight / 2, 5) - - def testL1L2Regularizer(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l1_l2_regularizer()(tensor) - self.assertEquals(loss.op.name, 'L1L2Regularizer/value') - self.assertAlmostEqual(loss.eval(), num_elem + num_elem / 2, 5) - - def testL1L2RegularizerWithScope(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - loss = losses.l1_l2_regularizer(scope='L1L2')(tensor) - self.assertEquals(loss.op.name, 'L1L2/value') - self.assertAlmostEqual(loss.eval(), num_elem + num_elem / 2, 5) - - def testL1L2RegularizerWithWeights(self): - with self.test_session(): - shape = [5, 5, 5] - num_elem = 5 * 5 * 5 - tensor = tf.constant(1.0, shape=shape) - weight_l1 = 0.01 - weight_l2 = 0.05 - loss = losses.l1_l2_regularizer(weight_l1, weight_l2)(tensor) - self.assertEquals(loss.op.name, 'L1L2Regularizer/value') - self.assertAlmostEqual(loss.eval(), - num_elem * weight_l1 + num_elem * weight_l2 / 2, 5) - - -class CrossEntropyLossTest(tf.test.TestCase): - - def testCrossEntropyLossAllCorrect(self): - 
with self.test_session(): - logits = tf.constant([[10.0, 0.0, 0.0], - [0.0, 10.0, 0.0], - [0.0, 0.0, 10.0]]) - labels = tf.constant([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]]) - loss = losses.cross_entropy_loss(logits, labels) - self.assertEquals(loss.op.name, 'CrossEntropyLoss/value') - self.assertAlmostEqual(loss.eval(), 0.0, 3) - - def testCrossEntropyLossAllWrong(self): - with self.test_session(): - logits = tf.constant([[10.0, 0.0, 0.0], - [0.0, 10.0, 0.0], - [0.0, 0.0, 10.0]]) - labels = tf.constant([[0, 0, 1], - [1, 0, 0], - [0, 1, 0]]) - loss = losses.cross_entropy_loss(logits, labels) - self.assertEquals(loss.op.name, 'CrossEntropyLoss/value') - self.assertAlmostEqual(loss.eval(), 10.0, 3) - - def testCrossEntropyLossAllWrongWithWeight(self): - with self.test_session(): - logits = tf.constant([[10.0, 0.0, 0.0], - [0.0, 10.0, 0.0], - [0.0, 0.0, 10.0]]) - labels = tf.constant([[0, 0, 1], - [1, 0, 0], - [0, 1, 0]]) - loss = losses.cross_entropy_loss(logits, labels, weight=0.5) - self.assertEquals(loss.op.name, 'CrossEntropyLoss/value') - self.assertAlmostEqual(loss.eval(), 5.0, 3) - -if __name__ == '__main__': - tf.test.main() diff --git a/examples/imagenet/inception/slim/ops.py b/examples/imagenet/inception/slim/ops.py deleted file mode 100644 index 54fda4eb..00000000 --- a/examples/imagenet/inception/slim/ops.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Contains convenience wrappers for typical Neural Network TensorFlow layers. - - Additionally it maintains a collection with update_ops that need to be - updated after the ops have been computed, for example to update moving means - and moving variances of batch_norm. - - Ops that have different behavior during training or eval have an is_training - parameter. Additionally Ops that contain variables.variable have a trainable - parameter, which control if the ops variables are trainable or not. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -import tensorflow as tf - -from tensorflow.python.training import moving_averages - -from inception.slim import losses -from inception.slim import scopes -from inception.slim import variables - -# Used to keep the update ops done by batch_norm. -UPDATE_OPS_COLLECTION = '_update_ops_' - - -@scopes.add_arg_scope -def batch_norm(inputs, - decay=0.999, - center=True, - scale=False, - epsilon=0.001, - moving_vars='moving_vars', - activation=None, - is_training=True, - trainable=True, - restore=True, - scope=None, - reuse=None): - """Adds a Batch Normalization layer. - - Args: - inputs: a tensor of size [batch_size, height, width, channels] - or [batch_size, channels]. - decay: decay for the moving average. - center: If True, subtract beta. If False, beta is not created and ignored. - scale: If True, multiply by gamma. If False, gamma is - not used. When the next layer is linear (also e.g. ReLU), this can be - disabled since the scaling can be done by the next layer. - epsilon: small float added to variance to avoid dividing by zero. - moving_vars: collection to store the moving_mean and moving_variance. - activation: activation function. - is_training: whether or not the model is in training mode. - trainable: whether or not the variables should be trainable or not. 
- restore: whether or not the variables should be marked for restore. - scope: Optional scope for variable_scope. - reuse: whether or not the layer and its variables should be reused. To be - able to reuse the layer scope must be given. - - Returns: - a tensor representing the output of the operation. - - """ - inputs_shape = inputs.get_shape() - with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse): - axis = list(range(len(inputs_shape) - 1)) - params_shape = inputs_shape[-1:] - # Allocate parameters for the beta and gamma of the normalization. - beta, gamma = None, None - if center: - beta = variables.variable('beta', - params_shape, - initializer=tf.zeros_initializer(), - trainable=trainable, - restore=restore) - if scale: - gamma = variables.variable('gamma', - params_shape, - initializer=tf.ones_initializer(), - trainable=trainable, - restore=restore) - # Create moving_mean and moving_variance add them to - # GraphKeys.MOVING_AVERAGE_VARIABLES collections. - moving_collections = [moving_vars, tf.GraphKeys.MOVING_AVERAGE_VARIABLES] - moving_mean = variables.variable('moving_mean', - params_shape, - initializer=tf.zeros_initializer(), - trainable=False, - restore=restore, - collections=moving_collections) - moving_variance = variables.variable('moving_variance', - params_shape, - initializer=tf.ones_initializer(), - trainable=False, - restore=restore, - collections=moving_collections) - if is_training: - # Calculate the moments based on the individual batch. - mean, variance = tf.nn.moments(inputs, axis) - - update_moving_mean = moving_averages.assign_moving_average( - moving_mean, mean, decay) - tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean) - update_moving_variance = moving_averages.assign_moving_average( - moving_variance, variance, decay) - tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance) - else: - # Just use the moving_mean and moving_variance. 
- mean = moving_mean - variance = moving_variance - # Normalize the activations. - outputs = tf.nn.batch_normalization( - inputs, mean, variance, beta, gamma, epsilon) - outputs.set_shape(inputs.get_shape()) - if activation: - outputs = activation(outputs) - return outputs - - -def _two_element_tuple(int_or_tuple): - """Converts `int_or_tuple` to height, width. - - Several of the functions that follow accept arguments as either - a tuple of 2 integers or a single integer. A single integer - indicates that the 2 values of the tuple are the same. - - This functions normalizes the input value by always returning a tuple. - - Args: - int_or_tuple: A list of 2 ints, a single int or a tf.TensorShape. - - Returns: - A tuple with 2 values. - - Raises: - ValueError: If `int_or_tuple` it not well formed. - """ - if isinstance(int_or_tuple, (list, tuple)): - if len(int_or_tuple) != 2: - raise ValueError('Must be a list with 2 elements: %s' % int_or_tuple) - return int(int_or_tuple[0]), int(int_or_tuple[1]) - if isinstance(int_or_tuple, int): - return int(int_or_tuple), int(int_or_tuple) - if isinstance(int_or_tuple, tf.TensorShape): - if len(int_or_tuple) == 2: - return int_or_tuple[0], int_or_tuple[1] - raise ValueError('Must be an int, a list with 2 elements or a TensorShape of ' - 'length 2') - - -@scopes.add_arg_scope -def conv2d(inputs, - num_filters_out, - kernel_size, - stride=1, - padding='SAME', - activation=tf.nn.relu, - stddev=0.01, - bias=0.0, - weight_decay=0, - batch_norm_params=None, - is_training=True, - trainable=True, - restore=True, - scope=None, - reuse=None): - """Adds a 2D convolution followed by an optional batch_norm layer. - - conv2d creates a variable called 'weights', representing the convolutional - kernel, that is convolved with the input. If `batch_norm_params` is None, a - second variable called 'biases' is added to the result of the convolution - operation. - - Args: - inputs: a tensor of size [batch_size, height, width, channels]. 
- num_filters_out: the number of output filters. - kernel_size: a list of length 2: [kernel_height, kernel_width] of - of the filters. Can be an int if both values are the same. - stride: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently - both strides must have the same value. - padding: one of 'VALID' or 'SAME'. - activation: activation function. - stddev: standard deviation of the truncated guassian weight distribution. - bias: the initial value of the biases. - weight_decay: the weight decay. - batch_norm_params: parameters for the batch_norm. If is None don't use it. - is_training: whether or not the model is in training mode. - trainable: whether or not the variables should be trainable or not. - restore: whether or not the variables should be marked for restore. - scope: Optional scope for variable_scope. - reuse: whether or not the layer and its variables should be reused. To be - able to reuse the layer scope must be given. - Returns: - a tensor representing the output of the operation. 
- - """ - with tf.variable_scope(scope, 'Conv', [inputs], reuse=reuse): - kernel_h, kernel_w = _two_element_tuple(kernel_size) - stride_h, stride_w = _two_element_tuple(stride) - num_filters_in = inputs.get_shape()[-1] - weights_shape = [kernel_h, kernel_w, - num_filters_in, num_filters_out] - weights_initializer = tf.truncated_normal_initializer(stddev=stddev) - l2_regularizer = None - if weight_decay and weight_decay > 0: - l2_regularizer = losses.l2_regularizer(weight_decay) - weights = variables.variable('weights', - shape=weights_shape, - initializer=weights_initializer, - regularizer=l2_regularizer, - trainable=trainable, - restore=restore) - conv = tf.nn.conv2d(inputs, weights, [1, stride_h, stride_w, 1], - padding=padding) - if batch_norm_params is not None: - with scopes.arg_scope([batch_norm], is_training=is_training, - trainable=trainable, restore=restore): - outputs = batch_norm(conv, **batch_norm_params) - else: - bias_shape = [num_filters_out,] - bias_initializer = tf.constant_initializer(bias) - biases = variables.variable('biases', - shape=bias_shape, - initializer=bias_initializer, - trainable=trainable, - restore=restore) - outputs = tf.nn.bias_add(conv, biases) - if activation: - outputs = activation(outputs) - return outputs - - -@scopes.add_arg_scope -def fc(inputs, - num_units_out, - activation=tf.nn.relu, - stddev=0.01, - bias=0.0, - weight_decay=0, - batch_norm_params=None, - is_training=True, - trainable=True, - restore=True, - scope=None, - reuse=None): - """Adds a fully connected layer followed by an optional batch_norm layer. - - FC creates a variable called 'weights', representing the fully connected - weight matrix, that is multiplied by the input. If `batch_norm` is None, a - second variable called 'biases' is added to the result of the initial - vector-matrix multiplication. - - Args: - inputs: a [B x N] tensor where B is the batch size and N is the number of - input units in the layer. 
- num_units_out: the number of output units in the layer. - activation: activation function. - stddev: the standard deviation for the weights. - bias: the initial value of the biases. - weight_decay: the weight decay. - batch_norm_params: parameters for the batch_norm. If is None don't use it. - is_training: whether or not the model is in training mode. - trainable: whether or not the variables should be trainable or not. - restore: whether or not the variables should be marked for restore. - scope: Optional scope for variable_scope. - reuse: whether or not the layer and its variables should be reused. To be - able to reuse the layer scope must be given. - - Returns: - the tensor variable representing the result of the series of operations. - """ - with tf.variable_scope(scope, 'FC', [inputs], reuse=reuse): - num_units_in = inputs.get_shape()[1] - weights_shape = [num_units_in, num_units_out] - weights_initializer = tf.truncated_normal_initializer(stddev=stddev) - l2_regularizer = None - if weight_decay and weight_decay > 0: - l2_regularizer = losses.l2_regularizer(weight_decay) - weights = variables.variable('weights', - shape=weights_shape, - initializer=weights_initializer, - regularizer=l2_regularizer, - trainable=trainable, - restore=restore) - if batch_norm_params is not None: - outputs = tf.matmul(inputs, weights) - with scopes.arg_scope([batch_norm], is_training=is_training, - trainable=trainable, restore=restore): - outputs = batch_norm(outputs, **batch_norm_params) - else: - bias_shape = [num_units_out,] - bias_initializer = tf.constant_initializer(bias) - biases = variables.variable('biases', - shape=bias_shape, - initializer=bias_initializer, - trainable=trainable, - restore=restore) - outputs = tf.nn.xw_plus_b(inputs, weights, biases) - if activation: - outputs = activation(outputs) - return outputs - - -def one_hot_encoding(labels, num_classes, scope=None): - """Transform numeric labels into onehot_labels. 
- - Args: - labels: [batch_size] target labels. - num_classes: total number of classes. - scope: Optional scope for name_scope. - Returns: - one hot encoding of the labels. - """ - with tf.name_scope(scope, 'OneHotEncoding', [labels]): - batch_size = labels.get_shape()[0] - indices = tf.expand_dims(tf.range(0, batch_size), 1) - labels = tf.cast(tf.expand_dims(labels, 1), indices.dtype) - concated = tf.concat(axis=1, values=[indices, labels]) - onehot_labels = tf.sparse_to_dense( - concated, tf.stack([batch_size, num_classes]), 1.0, 0.0) - onehot_labels.set_shape([batch_size, num_classes]) - return onehot_labels - - -@scopes.add_arg_scope -def max_pool(inputs, kernel_size, stride=2, padding='VALID', scope=None): - """Adds a Max Pooling layer. - - It is assumed by the wrapper that the pooling is only done per image and not - in depth or batch. - - Args: - inputs: a tensor of size [batch_size, height, width, depth]. - kernel_size: a list of length 2: [kernel_height, kernel_width] of the - pooling kernel over which the op is computed. Can be an int if both - values are the same. - stride: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently - both strides must have the same value. - padding: the padding method, either 'VALID' or 'SAME'. - scope: Optional scope for name_scope. - - Returns: - a tensor representing the results of the pooling operation. - Raises: - ValueError: if 'kernel_size' is not a 2-D list - """ - with tf.name_scope(scope, 'MaxPool', [inputs]): - kernel_h, kernel_w = _two_element_tuple(kernel_size) - stride_h, stride_w = _two_element_tuple(stride) - return tf.nn.max_pool(inputs, - ksize=[1, kernel_h, kernel_w, 1], - strides=[1, stride_h, stride_w, 1], - padding=padding) - - -@scopes.add_arg_scope -def avg_pool(inputs, kernel_size, stride=2, padding='VALID', scope=None): - """Adds a Avg Pooling layer. 
- - It is assumed by the wrapper that the pooling is only done per image and not - in depth or batch. - - Args: - inputs: a tensor of size [batch_size, height, width, depth]. - kernel_size: a list of length 2: [kernel_height, kernel_width] of the - pooling kernel over which the op is computed. Can be an int if both - values are the same. - stride: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently - both strides must have the same value. - padding: the padding method, either 'VALID' or 'SAME'. - scope: Optional scope for name_scope. - - Returns: - a tensor representing the results of the pooling operation. - """ - with tf.name_scope(scope, 'AvgPool', [inputs]): - kernel_h, kernel_w = _two_element_tuple(kernel_size) - stride_h, stride_w = _two_element_tuple(stride) - return tf.nn.avg_pool(inputs, - ksize=[1, kernel_h, kernel_w, 1], - strides=[1, stride_h, stride_w, 1], - padding=padding) - - -@scopes.add_arg_scope -def dropout(inputs, keep_prob=0.5, is_training=True, scope=None): - """Returns a dropout layer applied to the input. - - Args: - inputs: the tensor to pass to the Dropout layer. - keep_prob: the probability of keeping each input unit. - is_training: whether or not the model is in training mode. If so, dropout is - applied and values scaled. Otherwise, inputs is returned. - scope: Optional scope for name_scope. - - Returns: - a tensor representing the output of the operation. - """ - if is_training and keep_prob > 0: - with tf.name_scope(scope, 'Dropout', [inputs]): - return tf.nn.dropout(inputs, keep_prob) - else: - return inputs - - -def flatten(inputs, scope=None): - """Flattens the input while maintaining the batch_size. - - Assumes that the first dimension represents the batch. - - Args: - inputs: a tensor of size [batch_size, ...]. - scope: Optional scope for name_scope. - - Returns: - a flattened tensor with shape [batch_size, k]. - Raises: - ValueError: if inputs.shape is wrong. 
- """ - if len(inputs.get_shape()) < 2: - raise ValueError('Inputs must be have a least 2 dimensions') - dims = inputs.get_shape()[1:] - k = dims.num_elements() - with tf.name_scope(scope, 'Flatten', [inputs]): - return tf.reshape(inputs, [-1, k]) - - -def repeat_op(repetitions, inputs, op, *args, **kwargs): - """Build a sequential Tower starting from inputs by using an op repeatedly. - - It creates new scopes for each operation by increasing the counter. - Example: given repeat_op(3, _, ops.conv2d, 64, [3, 3], scope='conv1') - it will repeat the given op under the following variable_scopes: - conv1/Conv - conv1/Conv_1 - conv1/Conv_2 - - Args: - repetitions: number or repetitions. - inputs: a tensor of size [batch_size, height, width, channels]. - op: an operation. - *args: args for the op. - **kwargs: kwargs for the op. - - Returns: - a tensor result of applying the operation op, num times. - Raises: - ValueError: if the op is unknown or wrong. - """ - scope = kwargs.pop('scope', None) - with tf.variable_scope(scope, 'RepeatOp', [inputs]): - tower = inputs - for _ in range(repetitions): - tower = op(tower, *args, **kwargs) - return tower diff --git a/examples/imagenet/inception/slim/ops_test.py b/examples/imagenet/inception/slim/ops_test.py deleted file mode 100644 index 0978e0ef..00000000 --- a/examples/imagenet/inception/slim/ops_test.py +++ /dev/null @@ -1,692 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for slim.ops.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -import numpy as np -import tensorflow as tf - -from tensorflow.python.ops import control_flow_ops - -from inception.slim import ops -from inception.slim import scopes -from inception.slim import variables - - -class ConvTest(tf.test.TestCase): - - def testCreateConv(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 3]) - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32]) - - def testCreateSquareConv(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, 3) - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32]) - - def testCreateConvWithTensorShape(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, images.get_shape()[1:3]) - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32]) - - def testCreateFullyConv(self): - height, width = 6, 6 - with self.test_session(): - images = tf.random_uniform((5, height, width, 32), seed=1) - output = ops.conv2d(images, 64, images.get_shape()[1:3], padding='VALID') - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 64]) - - def testCreateVerticalConv(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 1]) - self.assertEquals(output.op.name, 
'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height, width, 32]) - - def testCreateHorizontalConv(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [1, 3]) - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height, width, 32]) - - def testCreateConvWithStride(self): - height, width = 6, 6 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 3], stride=2) - self.assertEquals(output.op.name, 'Conv/Relu') - self.assertListEqual(output.get_shape().as_list(), - [5, height/2, width/2, 32]) - - def testCreateConvCreatesWeightsAndBiasesVars(self): - height, width = 3, 3 - images = tf.random_uniform((5, height, width, 3), seed=1) - with self.test_session(): - self.assertFalse(variables.get_variables('conv1/weights')) - self.assertFalse(variables.get_variables('conv1/biases')) - ops.conv2d(images, 32, [3, 3], scope='conv1') - self.assertTrue(variables.get_variables('conv1/weights')) - self.assertTrue(variables.get_variables('conv1/biases')) - - def testCreateConvWithScope(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 3], scope='conv1') - self.assertEquals(output.op.name, 'conv1/Relu') - - def testCreateConvWithoutActivation(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 3], activation=None) - self.assertEquals(output.op.name, 'Conv/BiasAdd') - - def testCreateConvValid(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.conv2d(images, 32, [3, 3], padding='VALID') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 
32]) - - def testCreateConvWithWD(self): - height, width = 3, 3 - with self.test_session() as sess: - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.conv2d(images, 32, [3, 3], weight_decay=0.01) - wd = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[0] - self.assertEquals(wd.op.name, - 'Conv/weights/Regularizer/L2Regularizer/value') - sess.run(tf.global_variables_initializer()) - self.assertTrue(sess.run(wd) <= 0.01) - - def testCreateConvWithoutWD(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.conv2d(images, 32, [3, 3], weight_decay=0) - self.assertEquals( - tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES), []) - - def testReuseVars(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.conv2d(images, 32, [3, 3], scope='conv1') - self.assertEquals(len(variables.get_variables()), 2) - ops.conv2d(images, 32, [3, 3], scope='conv1', reuse=True) - self.assertEquals(len(variables.get_variables()), 2) - - def testNonReuseVars(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.conv2d(images, 32, [3, 3]) - self.assertEquals(len(variables.get_variables()), 2) - ops.conv2d(images, 32, [3, 3]) - self.assertEquals(len(variables.get_variables()), 4) - - def testReuseConvWithWD(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.conv2d(images, 32, [3, 3], weight_decay=0.01, scope='conv1') - self.assertEquals(len(variables.get_variables()), 2) - self.assertEquals( - len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)), 1) - ops.conv2d(images, 32, [3, 3], weight_decay=0.01, scope='conv1', - reuse=True) - self.assertEquals(len(variables.get_variables()), 2) - self.assertEquals( - len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)), 1) - - def 
testConvWithBatchNorm(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 32), seed=1) - with scopes.arg_scope([ops.conv2d], batch_norm_params={'decay': 0.9}): - net = ops.conv2d(images, 32, [3, 3]) - net = ops.conv2d(net, 32, [3, 3]) - self.assertEquals(len(variables.get_variables()), 8) - self.assertEquals(len(variables.get_variables('Conv/BatchNorm')), 3) - self.assertEquals(len(variables.get_variables('Conv_1/BatchNorm')), 3) - - def testReuseConvWithBatchNorm(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 32), seed=1) - with scopes.arg_scope([ops.conv2d], batch_norm_params={'decay': 0.9}): - net = ops.conv2d(images, 32, [3, 3], scope='Conv') - net = ops.conv2d(net, 32, [3, 3], scope='Conv', reuse=True) - self.assertEquals(len(variables.get_variables()), 4) - self.assertEquals(len(variables.get_variables('Conv/BatchNorm')), 3) - self.assertEquals(len(variables.get_variables('Conv_1/BatchNorm')), 0) - - -class FCTest(tf.test.TestCase): - - def testCreateFC(self): - height, width = 3, 3 - with self.test_session(): - inputs = tf.random_uniform((5, height * width * 3), seed=1) - output = ops.fc(inputs, 32) - self.assertEquals(output.op.name, 'FC/Relu') - self.assertListEqual(output.get_shape().as_list(), [5, 32]) - - def testCreateFCWithScope(self): - height, width = 3, 3 - with self.test_session(): - inputs = tf.random_uniform((5, height * width * 3), seed=1) - output = ops.fc(inputs, 32, scope='fc1') - self.assertEquals(output.op.name, 'fc1/Relu') - - def testCreateFcCreatesWeightsAndBiasesVars(self): - height, width = 3, 3 - inputs = tf.random_uniform((5, height * width * 3), seed=1) - with self.test_session(): - self.assertFalse(variables.get_variables('fc1/weights')) - self.assertFalse(variables.get_variables('fc1/biases')) - ops.fc(inputs, 32, scope='fc1') - self.assertTrue(variables.get_variables('fc1/weights')) - 
self.assertTrue(variables.get_variables('fc1/biases')) - - def testReuseVars(self): - height, width = 3, 3 - inputs = tf.random_uniform((5, height * width * 3), seed=1) - with self.test_session(): - ops.fc(inputs, 32, scope='fc1') - self.assertEquals(len(variables.get_variables('fc1')), 2) - ops.fc(inputs, 32, scope='fc1', reuse=True) - self.assertEquals(len(variables.get_variables('fc1')), 2) - - def testNonReuseVars(self): - height, width = 3, 3 - inputs = tf.random_uniform((5, height * width * 3), seed=1) - with self.test_session(): - ops.fc(inputs, 32) - self.assertEquals(len(variables.get_variables('FC')), 2) - ops.fc(inputs, 32) - self.assertEquals(len(variables.get_variables('FC')), 4) - - def testCreateFCWithoutActivation(self): - height, width = 3, 3 - with self.test_session(): - inputs = tf.random_uniform((5, height * width * 3), seed=1) - output = ops.fc(inputs, 32, activation=None) - self.assertEquals(output.op.name, 'FC/xw_plus_b') - - def testCreateFCWithWD(self): - height, width = 3, 3 - with self.test_session() as sess: - inputs = tf.random_uniform((5, height * width * 3), seed=1) - ops.fc(inputs, 32, weight_decay=0.01) - wd = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)[0] - self.assertEquals(wd.op.name, - 'FC/weights/Regularizer/L2Regularizer/value') - sess.run(tf.global_variables_initializer()) - self.assertTrue(sess.run(wd) <= 0.01) - - def testCreateFCWithoutWD(self): - height, width = 3, 3 - with self.test_session(): - inputs = tf.random_uniform((5, height * width * 3), seed=1) - ops.fc(inputs, 32, weight_decay=0) - self.assertEquals( - tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES), []) - - def testReuseFCWithWD(self): - height, width = 3, 3 - with self.test_session(): - inputs = tf.random_uniform((5, height * width * 3), seed=1) - ops.fc(inputs, 32, weight_decay=0.01, scope='fc') - self.assertEquals(len(variables.get_variables()), 2) - self.assertEquals( - len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)), 1) - 
ops.fc(inputs, 32, weight_decay=0.01, scope='fc', reuse=True) - self.assertEquals(len(variables.get_variables()), 2) - self.assertEquals( - len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)), 1) - - def testFCWithBatchNorm(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height * width * 3), seed=1) - with scopes.arg_scope([ops.fc], batch_norm_params={}): - net = ops.fc(images, 27) - net = ops.fc(net, 27) - self.assertEquals(len(variables.get_variables()), 8) - self.assertEquals(len(variables.get_variables('FC/BatchNorm')), 3) - self.assertEquals(len(variables.get_variables('FC_1/BatchNorm')), 3) - - def testReuseFCWithBatchNorm(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height * width * 3), seed=1) - with scopes.arg_scope([ops.fc], batch_norm_params={'decay': 0.9}): - net = ops.fc(images, 27, scope='fc1') - net = ops.fc(net, 27, scope='fc1', reuse=True) - self.assertEquals(len(variables.get_variables()), 4) - self.assertEquals(len(variables.get_variables('fc1/BatchNorm')), 3) - - -class MaxPoolTest(tf.test.TestCase): - - def testCreateMaxPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, [3, 3]) - self.assertEquals(output.op.name, 'MaxPool/MaxPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - def testCreateSquareMaxPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, 3) - self.assertEquals(output.op.name, 'MaxPool/MaxPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - def testCreateMaxPoolWithScope(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, [3, 3], scope='pool1') - self.assertEquals(output.op.name, 
'pool1/MaxPool') - - def testCreateMaxPoolSAME(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, [3, 3], padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) - - def testCreateMaxPoolStrideSAME(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, [3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) - - def testGlobalMaxPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.max_pool(images, images.get_shape()[1:3], stride=1) - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - -class AvgPoolTest(tf.test.TestCase): - - def testCreateAvgPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, [3, 3]) - self.assertEquals(output.op.name, 'AvgPool/AvgPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - def testCreateSquareAvgPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, 3) - self.assertEquals(output.op.name, 'AvgPool/AvgPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - def testCreateAvgPoolWithScope(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, [3, 3], scope='pool1') - self.assertEquals(output.op.name, 'pool1/AvgPool') - - def testCreateAvgPoolSAME(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, [3, 3], padding='SAME') - 
self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) - - def testCreateAvgPoolStrideSAME(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, [3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) - - def testGlobalAvgPool(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.avg_pool(images, images.get_shape()[1:3], stride=1) - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) - - -class OneHotEncodingTest(tf.test.TestCase): - - def testOneHotEncodingCreate(self): - with self.test_session(): - labels = tf.constant([0, 1, 2]) - output = ops.one_hot_encoding(labels, num_classes=3) - self.assertEquals(output.op.name, 'OneHotEncoding/SparseToDense') - self.assertListEqual(output.get_shape().as_list(), [3, 3]) - - def testOneHotEncoding(self): - with self.test_session(): - labels = tf.constant([0, 1, 2]) - one_hot_labels = tf.constant([[1, 0, 0], - [0, 1, 0], - [0, 0, 1]]) - output = ops.one_hot_encoding(labels, num_classes=3) - self.assertAllClose(output.eval(), one_hot_labels.eval()) - - -class DropoutTest(tf.test.TestCase): - - def testCreateDropout(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.dropout(images) - self.assertEquals(output.op.name, 'Dropout/dropout/mul_1') - output.get_shape().assert_is_compatible_with(images.get_shape()) - - def testCreateDropoutNoTraining(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1, name='images') - output = ops.dropout(images, is_training=False) - self.assertEquals(output, images) - - -class FlattenTest(tf.test.TestCase): - - def testFlatten4D(self): - height, width = 3, 3 - with self.test_session(): - images = 
tf.random_uniform((5, height, width, 3), seed=1, name='images') - output = ops.flatten(images) - self.assertEquals(output.get_shape().num_elements(), - images.get_shape().num_elements()) - self.assertEqual(output.get_shape()[0], images.get_shape()[0]) - - def testFlatten3D(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width), seed=1, name='images') - output = ops.flatten(images) - self.assertEquals(output.get_shape().num_elements(), - images.get_shape().num_elements()) - self.assertEqual(output.get_shape()[0], images.get_shape()[0]) - - def testFlattenBatchSize(self): - height, width = 3, 3 - with self.test_session() as sess: - images = tf.random_uniform((5, height, width, 3), seed=1, name='images') - inputs = tf.placeholder(tf.int32, (None, height, width, 3)) - output = ops.flatten(inputs) - self.assertEquals(output.get_shape().as_list(), - [None, height * width * 3]) - output = sess.run(output, {inputs: images.eval()}) - self.assertEquals(output.size, - images.get_shape().num_elements()) - self.assertEqual(output.shape[0], images.get_shape()[0]) - - -class BatchNormTest(tf.test.TestCase): - - def testCreateOp(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = ops.batch_norm(images) - self.assertTrue(output.op.name.startswith('BatchNorm/batchnorm')) - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) - - def testCreateVariables(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images) - beta = variables.get_variables_by_name('beta')[0] - self.assertEquals(beta.op.name, 'BatchNorm/beta') - gamma = variables.get_variables_by_name('gamma') - self.assertEquals(gamma, []) - moving_mean = tf.moving_average_variables()[0] - moving_variance = tf.moving_average_variables()[1] - self.assertEquals(moving_mean.op.name, 
'BatchNorm/moving_mean') - self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance') - - def testCreateVariablesWithScale(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, scale=True) - beta = variables.get_variables_by_name('beta')[0] - gamma = variables.get_variables_by_name('gamma')[0] - self.assertEquals(beta.op.name, 'BatchNorm/beta') - self.assertEquals(gamma.op.name, 'BatchNorm/gamma') - moving_mean = tf.moving_average_variables()[0] - moving_variance = tf.moving_average_variables()[1] - self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean') - self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance') - - def testCreateVariablesWithoutCenterWithScale(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, center=False, scale=True) - beta = variables.get_variables_by_name('beta') - self.assertEquals(beta, []) - gamma = variables.get_variables_by_name('gamma')[0] - self.assertEquals(gamma.op.name, 'BatchNorm/gamma') - moving_mean = tf.moving_average_variables()[0] - moving_variance = tf.moving_average_variables()[1] - self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean') - self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance') - - def testCreateVariablesWithoutCenterWithoutScale(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, center=False, scale=False) - beta = variables.get_variables_by_name('beta') - self.assertEquals(beta, []) - gamma = variables.get_variables_by_name('gamma') - self.assertEquals(gamma, []) - moving_mean = tf.moving_average_variables()[0] - moving_variance = tf.moving_average_variables()[1] - self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean') - self.assertEquals(moving_variance.op.name, 
'BatchNorm/moving_variance') - - def testMovingAverageVariables(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, scale=True) - moving_mean = tf.moving_average_variables()[0] - moving_variance = tf.moving_average_variables()[1] - self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean') - self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance') - - def testUpdateOps(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images) - update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION) - update_moving_mean = update_ops[0] - update_moving_variance = update_ops[1] - self.assertEquals(update_moving_mean.op.name, - 'BatchNorm/AssignMovingAvg') - self.assertEquals(update_moving_variance.op.name, - 'BatchNorm/AssignMovingAvg_1') - - def testReuseVariables(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, scale=True, scope='bn') - ops.batch_norm(images, scale=True, scope='bn', reuse=True) - beta = variables.get_variables_by_name('beta') - gamma = variables.get_variables_by_name('gamma') - self.assertEquals(len(beta), 1) - self.assertEquals(len(gamma), 1) - moving_vars = tf.get_collection('moving_vars') - self.assertEquals(len(moving_vars), 2) - - def testReuseUpdateOps(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - ops.batch_norm(images, scope='bn') - self.assertEquals(len(tf.get_collection(ops.UPDATE_OPS_COLLECTION)), 2) - ops.batch_norm(images, scope='bn', reuse=True) - self.assertEquals(len(tf.get_collection(ops.UPDATE_OPS_COLLECTION)), 4) - - def testCreateMovingVars(self): - height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - _ = ops.batch_norm(images, 
moving_vars='moving_vars') - moving_mean = tf.get_collection('moving_vars', - 'BatchNorm/moving_mean') - self.assertEquals(len(moving_mean), 1) - self.assertEquals(moving_mean[0].op.name, 'BatchNorm/moving_mean') - moving_variance = tf.get_collection('moving_vars', - 'BatchNorm/moving_variance') - self.assertEquals(len(moving_variance), 1) - self.assertEquals(moving_variance[0].op.name, 'BatchNorm/moving_variance') - - def testComputeMovingVars(self): - height, width = 3, 3 - with self.test_session() as sess: - image_shape = (10, height, width, 3) - image_values = np.random.rand(*image_shape) - expected_mean = np.mean(image_values, axis=(0, 1, 2)) - expected_var = np.var(image_values, axis=(0, 1, 2)) - images = tf.constant(image_values, shape=image_shape, dtype=tf.float32) - output = ops.batch_norm(images, decay=0.1) - update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION) - with tf.control_dependencies(update_ops): - barrier = tf.no_op(name='gradient_barrier') - output = control_flow_ops.with_dependencies([barrier], output) - # Initialize all variables - sess.run(tf.global_variables_initializer()) - moving_mean = variables.get_variables('BatchNorm/moving_mean')[0] - moving_variance = variables.get_variables('BatchNorm/moving_variance')[0] - mean, variance = sess.run([moving_mean, moving_variance]) - # After initialization moving_mean == 0 and moving_variance == 1. - self.assertAllClose(mean, [0] * 3) - self.assertAllClose(variance, [1] * 3) - for _ in range(10): - sess.run([output]) - mean = moving_mean.eval() - variance = moving_variance.eval() - # After 10 updates with decay 0.1 moving_mean == expected_mean and - # moving_variance == expected_var. 
- self.assertAllClose(mean, expected_mean) - self.assertAllClose(variance, expected_var) - - def testEvalMovingVars(self): - height, width = 3, 3 - with self.test_session() as sess: - image_shape = (10, height, width, 3) - image_values = np.random.rand(*image_shape) - expected_mean = np.mean(image_values, axis=(0, 1, 2)) - expected_var = np.var(image_values, axis=(0, 1, 2)) - images = tf.constant(image_values, shape=image_shape, dtype=tf.float32) - output = ops.batch_norm(images, decay=0.1, is_training=False) - update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION) - with tf.control_dependencies(update_ops): - barrier = tf.no_op(name='gradient_barrier') - output = control_flow_ops.with_dependencies([barrier], output) - # Initialize all variables - sess.run(tf.global_variables_initializer()) - moving_mean = variables.get_variables('BatchNorm/moving_mean')[0] - moving_variance = variables.get_variables('BatchNorm/moving_variance')[0] - mean, variance = sess.run([moving_mean, moving_variance]) - # After initialization moving_mean == 0 and moving_variance == 1. - self.assertAllClose(mean, [0] * 3) - self.assertAllClose(variance, [1] * 3) - # Simulate assigment from saver restore. - init_assigns = [tf.assign(moving_mean, expected_mean), - tf.assign(moving_variance, expected_var)] - sess.run(init_assigns) - for _ in range(10): - sess.run([output], {images: np.random.rand(*image_shape)}) - mean = moving_mean.eval() - variance = moving_variance.eval() - # Although we feed different images, the moving_mean and moving_variance - # shouldn't change. 
- self.assertAllClose(mean, expected_mean) - self.assertAllClose(variance, expected_var) - - def testReuseVars(self): - height, width = 3, 3 - with self.test_session() as sess: - image_shape = (10, height, width, 3) - image_values = np.random.rand(*image_shape) - expected_mean = np.mean(image_values, axis=(0, 1, 2)) - expected_var = np.var(image_values, axis=(0, 1, 2)) - images = tf.constant(image_values, shape=image_shape, dtype=tf.float32) - output = ops.batch_norm(images, decay=0.1, is_training=False) - update_ops = tf.get_collection(ops.UPDATE_OPS_COLLECTION) - with tf.control_dependencies(update_ops): - barrier = tf.no_op(name='gradient_barrier') - output = control_flow_ops.with_dependencies([barrier], output) - # Initialize all variables - sess.run(tf.global_variables_initializer()) - moving_mean = variables.get_variables('BatchNorm/moving_mean')[0] - moving_variance = variables.get_variables('BatchNorm/moving_variance')[0] - mean, variance = sess.run([moving_mean, moving_variance]) - # After initialization moving_mean == 0 and moving_variance == 1. - self.assertAllClose(mean, [0] * 3) - self.assertAllClose(variance, [1] * 3) - # Simulate assigment from saver restore. - init_assigns = [tf.assign(moving_mean, expected_mean), - tf.assign(moving_variance, expected_var)] - sess.run(init_assigns) - for _ in range(10): - sess.run([output], {images: np.random.rand(*image_shape)}) - mean = moving_mean.eval() - variance = moving_variance.eval() - # Although we feed different images, the moving_mean and moving_variance - # shouldn't change. - self.assertAllClose(mean, expected_mean) - self.assertAllClose(variance, expected_var) - -if __name__ == '__main__': - tf.test.main() diff --git a/examples/imagenet/inception/slim/scopes.py b/examples/imagenet/inception/slim/scopes.py deleted file mode 100644 index 2c2fb0a2..00000000 --- a/examples/imagenet/inception/slim/scopes.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Contains the new arg_scope used for TF-Slim ops. - - Allows one to define models much more compactly by eliminating boilerplate - code. This is accomplished through the use of argument scoping (arg_scope). - - Example of how to use scopes.arg_scope: - - with scopes.arg_scope(ops.conv2d, padding='SAME', - stddev=0.01, weight_decay=0.0005): - net = ops.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1') - net = ops.conv2d(net, 256, [5, 5], scope='conv2') - - The first call to conv2d will overwrite padding: - ops.conv2d(inputs, 64, [11, 11], 4, padding='VALID', - stddev=0.01, weight_decay=0.0005, scope='conv1') - - The second call to Conv will use predefined args: - ops.conv2d(inputs, 256, [5, 5], padding='SAME', - stddev=0.01, weight_decay=0.0005, scope='conv2') - - Example of how to reuse an arg_scope: - with scopes.arg_scope(ops.conv2d, padding='SAME', - stddev=0.01, weight_decay=0.0005) as conv2d_arg_scope: - net = ops.conv2d(net, 256, [5, 5], scope='conv1') - .... 
- - with scopes.arg_scope(conv2d_arg_scope): - net = ops.conv2d(net, 256, [5, 5], scope='conv2') - - Example of how to use scopes.add_arg_scope: - - @scopes.add_arg_scope - def conv2d(*args, **kwargs) -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import contextlib -import functools - -from tensorflow.python.framework import ops - -_ARGSTACK_KEY = ("__arg_stack",) - -_DECORATED_OPS = set() - - -def _get_arg_stack(): - stack = ops.get_collection(_ARGSTACK_KEY) - if stack: - return stack[0] - else: - stack = [{}] - ops.add_to_collection(_ARGSTACK_KEY, stack) - return stack - - -def _current_arg_scope(): - stack = _get_arg_stack() - return stack[-1] - - -def _add_op(op): - key_op = (op.__module__, op.__name__) - if key_op not in _DECORATED_OPS: - _DECORATED_OPS.add(key_op) - - -@contextlib.contextmanager -def arg_scope(list_ops_or_scope, **kwargs): - """Stores the default arguments for the given set of list_ops. - - For usage, please see examples at top of the file. - - Args: - list_ops_or_scope: List or tuple of operations to set argument scope for or - a dictionary containg the current scope. When list_ops_or_scope is a dict, - kwargs must be empty. When list_ops_or_scope is a list or tuple, then - every op in it need to be decorated with @add_arg_scope to work. - **kwargs: keyword=value that will define the defaults for each op in - list_ops. All the ops need to accept the given set of arguments. - - Yields: - the current_scope, which is a dictionary of {op: {arg: value}} - Raises: - TypeError: if list_ops is not a list or a tuple. - ValueError: if any op in list_ops has not be decorated with @add_arg_scope. - """ - if isinstance(list_ops_or_scope, dict): - # Assumes that list_ops_or_scope is a scope that is being reused. 
- if kwargs: - raise ValueError("When attempting to re-use a scope by suppling a" - "dictionary, kwargs must be empty.") - current_scope = list_ops_or_scope.copy() - try: - _get_arg_stack().append(current_scope) - yield current_scope - finally: - _get_arg_stack().pop() - else: - # Assumes that list_ops_or_scope is a list/tuple of ops with kwargs. - if not isinstance(list_ops_or_scope, (list, tuple)): - raise TypeError("list_ops_or_scope must either be a list/tuple or reused" - "scope (i.e. dict)") - try: - current_scope = _current_arg_scope().copy() - for op in list_ops_or_scope: - key_op = (op.__module__, op.__name__) - if not has_arg_scope(op): - raise ValueError("%s is not decorated with @add_arg_scope", key_op) - if key_op in current_scope: - current_kwargs = current_scope[key_op].copy() - current_kwargs.update(kwargs) - current_scope[key_op] = current_kwargs - else: - current_scope[key_op] = kwargs.copy() - _get_arg_stack().append(current_scope) - yield current_scope - finally: - _get_arg_stack().pop() - - -def add_arg_scope(func): - """Decorates a function with args so it can be used within an arg_scope. - - Args: - func: function to decorate. - - Returns: - A tuple with the decorated function func_with_args(). - """ - @functools.wraps(func) - def func_with_args(*args, **kwargs): - current_scope = _current_arg_scope() - current_args = kwargs - key_func = (func.__module__, func.__name__) - if key_func in current_scope: - current_args = current_scope[key_func].copy() - current_args.update(kwargs) - return func(*args, **current_args) - _add_op(func) - return func_with_args - - -def has_arg_scope(func): - """Checks whether a func has been decorated with @add_arg_scope or not. - - Args: - func: function to check. - - Returns: - a boolean. 
- """ - key_op = (func.__module__, func.__name__) - return key_op in _DECORATED_OPS diff --git a/examples/imagenet/inception/slim/scopes_test.py b/examples/imagenet/inception/slim/scopes_test.py deleted file mode 100644 index cd349399..00000000 --- a/examples/imagenet/inception/slim/scopes_test.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests slim.scopes.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -import tensorflow as tf -from inception.slim import scopes - - -@scopes.add_arg_scope -def func1(*args, **kwargs): - return (args, kwargs) - - -@scopes.add_arg_scope -def func2(*args, **kwargs): - return (args, kwargs) - - -class ArgScopeTest(tf.test.TestCase): - - def testEmptyArgScope(self): - with self.test_session(): - self.assertEqual(scopes._current_arg_scope(), {}) - - def testCurrentArgScope(self): - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - key_op = (func1.__module__, func1.__name__) - current_scope = {key_op: func1_kwargs.copy()} - with self.test_session(): - with scopes.arg_scope([func1], a=1, b=None, c=[1]) as scope: - self.assertDictEqual(scope, current_scope) - - def testCurrentArgScopeNested(self): - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - func2_kwargs = {'b': 2, 'd': [2]} - key = lambda f: 
(f.__module__, f.__name__) - current_scope = {key(func1): func1_kwargs.copy(), - key(func2): func2_kwargs.copy()} - with self.test_session(): - with scopes.arg_scope([func1], a=1, b=None, c=[1]): - with scopes.arg_scope([func2], b=2, d=[2]) as scope: - self.assertDictEqual(scope, current_scope) - - def testReuseArgScope(self): - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - key_op = (func1.__module__, func1.__name__) - current_scope = {key_op: func1_kwargs.copy()} - with self.test_session(): - with scopes.arg_scope([func1], a=1, b=None, c=[1]) as scope1: - pass - with scopes.arg_scope(scope1) as scope: - self.assertDictEqual(scope, current_scope) - - def testReuseArgScopeNested(self): - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - func2_kwargs = {'b': 2, 'd': [2]} - key = lambda f: (f.__module__, f.__name__) - current_scope1 = {key(func1): func1_kwargs.copy()} - current_scope2 = {key(func1): func1_kwargs.copy(), - key(func2): func2_kwargs.copy()} - with self.test_session(): - with scopes.arg_scope([func1], a=1, b=None, c=[1]) as scope1: - with scopes.arg_scope([func2], b=2, d=[2]) as scope2: - pass - with scopes.arg_scope(scope1): - self.assertDictEqual(scopes._current_arg_scope(), current_scope1) - with scopes.arg_scope(scope2): - self.assertDictEqual(scopes._current_arg_scope(), current_scope2) - - def testSimpleArgScope(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - with self.test_session(): - with scopes.arg_scope([func1], a=1, b=None, c=[1]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testSimpleArgScopeWithTuple(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - with self.test_session(): - with scopes.arg_scope((func1,), a=1, b=None, c=[1]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testOverwriteArgScope(self): - func1_args = (0,) - func1_kwargs = 
{'a': 1, 'b': 2, 'c': [1]} - with scopes.arg_scope([func1], a=1, b=None, c=[1]): - args, kwargs = func1(0, b=2) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testNestedArgScope(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - with scopes.arg_scope([func1], a=1, b=None, c=[1]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - func1_kwargs['b'] = 2 - with scopes.arg_scope([func1], b=2): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testSharedArgScope(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - with scopes.arg_scope([func1, func2], a=1, b=None, c=[1]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - args, kwargs = func2(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testSharedArgScopeTuple(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - with scopes.arg_scope((func1, func2), a=1, b=None, c=[1]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - args, kwargs = func2(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - - def testPartiallySharedArgScope(self): - func1_args = (0,) - func1_kwargs = {'a': 1, 'b': None, 'c': [1]} - func2_args = (1,) - func2_kwargs = {'a': 1, 'b': None, 'd': [2]} - with scopes.arg_scope([func1, func2], a=1, b=None): - with scopes.arg_scope([func1], c=[1]), scopes.arg_scope([func2], d=[2]): - args, kwargs = func1(0) - self.assertTupleEqual(args, func1_args) - self.assertDictEqual(kwargs, func1_kwargs) - args, kwargs = func2(1) - self.assertTupleEqual(args, func2_args) - self.assertDictEqual(kwargs, func2_kwargs) - -if __name__ == '__main__': - 
tf.test.main() diff --git a/examples/imagenet/inception/slim/slim.py b/examples/imagenet/inception/slim/slim.py deleted file mode 100644 index b7a5c0f8..00000000 --- a/examples/imagenet/inception/slim/slim.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""TF-Slim grouped API. Please see README.md for details and usage.""" -# pylint: disable=unused-import - -# Collapse tf-slim into a single namespace. -from inception.slim import inception_model as inception -from inception.slim import losses -from inception.slim import ops -from inception.slim import scopes -from inception.slim import variables -from inception.slim.scopes import arg_scope diff --git a/examples/imagenet/inception/slim/variables.py b/examples/imagenet/inception/slim/variables.py deleted file mode 100644 index 1d967b79..00000000 --- a/examples/imagenet/inception/slim/variables.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Contains convenience wrappers for creating variables in TF-Slim. - -The variables module is typically used for defining model variables from the -ops routines (see slim.ops). Such variables are used for training, evaluation -and inference of models. - -All the variables created through this module would be added to the -MODEL_VARIABLES collection, if you create a model variable outside slim, it can -be added with slim.variables.add_variable(external_variable, reuse). - -Usage: - weights_initializer = tf.truncated_normal_initializer(stddev=0.01) - l2_regularizer = lambda t: losses.l2_loss(t, weight=0.0005) - weights = variables.variable('weights', - shape=[100, 100], - initializer=weights_initializer, - regularizer=l2_regularizer, - device='/cpu:0') - - biases = variables.variable('biases', - shape=[100], - initializer=tf.zeros_initializer(), - device='/cpu:0') - - # More complex example. - - net = slim.ops.conv2d(input, 32, [3, 3], scope='conv1') - net = slim.ops.conv2d(net, 64, [3, 3], scope='conv2') - with slim.arg_scope([variables.variable], restore=False): - net = slim.ops.conv2d(net, 64, [3, 3], scope='conv3') - - # Get all model variables from all the layers. - model_variables = slim.variables.get_variables() - - # Get all model variables from a specific the layer, i.e 'conv1'. - conv1_variables = slim.variables.get_variables('conv1') - - # Get all weights from all the layers. 
- weights = slim.variables.get_variables_by_name('weights') - - # Get all bias from all the layers. - biases = slim.variables.get_variables_by_name('biases') - - # Get all variables to restore. - # (i.e. only those created by 'conv1' and 'conv2') - variables_to_restore = slim.variables.get_variables_to_restore() - -************************************************ -* Initializing model variables from a checkpoint -************************************************ - -# Create some variables. -v1 = slim.variables.variable(name="v1", ..., restore=False) -v2 = slim.variables.variable(name="v2", ...) # By default restore=True -... -# The list of variables to restore should only contain 'v2'. -variables_to_restore = slim.variables.get_variables_to_restore() -restorer = tf.train.Saver(variables_to_restore) -with tf.Session() as sess: - # Restore variables from disk. - restorer.restore(sess, "/tmp/model.ckpt") - print("Model restored.") - # Do some work with the model - ... - -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from inception.slim import scopes - -# Collection containing all the variables created using slim.variables -MODEL_VARIABLES = '_model_variables_' - -# Collection containing the slim.variables that are created with restore=True. -VARIABLES_TO_RESTORE = '_variables_to_restore_' - - -def add_variable(var, restore=True): - """Adds a variable to the MODEL_VARIABLES collection. - - Optionally it will add the variable to the VARIABLES_TO_RESTORE collection. - Args: - var: a variable. - restore: whether the variable should be added to the - VARIABLES_TO_RESTORE collection. 
- - """ - collections = [MODEL_VARIABLES] - if restore: - collections.append(VARIABLES_TO_RESTORE) - for collection in collections: - if var not in tf.get_collection(collection): - tf.add_to_collection(collection, var) - - -def get_variables(scope=None, suffix=None): - """Gets the list of variables, filtered by scope and/or suffix. - - Args: - scope: an optional scope for filtering the variables to return. - suffix: an optional suffix for filtering the variables to return. - - Returns: - a copied list of variables with scope and suffix. - """ - candidates = tf.get_collection(MODEL_VARIABLES, scope)[:] - if suffix is not None: - candidates = [var for var in candidates if var.op.name.endswith(suffix)] - return candidates - - -def get_variables_to_restore(): - """Gets the list of variables to restore. - - Returns: - a copied list of variables. - """ - return tf.get_collection(VARIABLES_TO_RESTORE)[:] - - -def get_variables_by_name(given_name, scope=None): - """Gets the list of variables that were given that name. - - Args: - given_name: name given to the variable without scope. - scope: an optional scope for filtering the variables to return. - - Returns: - a copied list of variables with the given name and prefix. - """ - return get_variables(scope=scope, suffix=given_name) - - -def get_unique_variable(name): - """Gets the variable uniquely identified by that name. - - Args: - name: a name that uniquely identifies the variable. - - Returns: - a tensorflow variable. - - Raises: - ValueError: if no variable uniquely identified by the name exists. - """ - candidates = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, name) - if not candidates: - raise ValueError('Couldnt find variable %s' % name) - - for candidate in candidates: - if candidate.op.name == name: - return candidate - raise ValueError('Variable %s does not uniquely identify a variable', name) - - -class VariableDeviceChooser(object): - """Slim device chooser for variables. 
- - When using a parameter server it will assign them in a round-robin fashion. - When not using a parameter server it allows GPU:0 placement otherwise CPU:0. - """ - - def __init__(self, - num_parameter_servers=0, - ps_device='/job:ps', - placement='CPU:0'): - """Initialize VariableDeviceChooser. - - Args: - num_parameter_servers: number of parameter servers. - ps_device: string representing the parameter server device. - placement: string representing the placement of the variable either CPU:0 - or GPU:0. When using parameter servers forced to CPU:0. - """ - self._num_ps = num_parameter_servers - self._ps_device = ps_device - self._placement = placement if num_parameter_servers == 0 else 'CPU:0' - self._next_task_id = 0 - - def __call__(self, op): - device_string = '' - if self._num_ps > 0: - task_id = self._next_task_id - self._next_task_id = (self._next_task_id + 1) % self._num_ps - device_string = '%s/task:%d' % (self._ps_device, task_id) - device_string += '/%s' % self._placement - return device_string - - -# TODO(sguada) Remove once get_variable is able to colocate op.devices. -def variable_device(device, name): - """Fix the variable device to colocate its ops.""" - if callable(device): - var_name = tf.get_variable_scope().name + '/' + name - var_def = tf.NodeDef(name=var_name, op='Variable') - device = device(var_def) - if device is None: - device = '' - return device - - -@scopes.add_arg_scope -def global_step(device=''): - """Returns the global step variable. - - Args: - device: Optional device to place the variable. It can be an string or a - function that is called to get the device for the variable. - - Returns: - the tensor representing the global step variable. - """ - global_step_ref = tf.get_collection(tf.GraphKeys.GLOBAL_STEP) - if global_step_ref: - return global_step_ref[0] - else: - collections = [ - VARIABLES_TO_RESTORE, - tf.GraphKeys.GLOBAL_VARIABLES, - tf.GraphKeys.GLOBAL_STEP, - ] - # Get the device for the variable. 
- with tf.device(variable_device(device, 'global_step')): - return tf.get_variable('global_step', shape=[], dtype=tf.int64, - initializer=tf.zeros_initializer(), - trainable=False, collections=collections) - - -@scopes.add_arg_scope -def variable(name, shape=None, dtype=tf.float32, initializer=None, - regularizer=None, trainable=True, collections=None, device='', - restore=True): - """Gets an existing variable with these parameters or creates a new one. - - It also add itself to a group with its name. - - Args: - name: the name of the new or existing variable. - shape: shape of the new or existing variable. - dtype: type of the new or existing variable (defaults to `DT_FLOAT`). - initializer: initializer for the variable if one is created. - regularizer: a (Tensor -> Tensor or None) function; the result of - applying it on a newly created variable will be added to the collection - GraphKeys.REGULARIZATION_LOSSES and can be used for regularization. - trainable: If `True` also add the variable to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). - collections: A list of collection names to which the Variable will be added. - Note that the variable is always also added to the tf.GraphKeys.GLOBAL_VARIABLES - and MODEL_VARIABLES collections. - device: Optional device to place the variable. It can be an string or a - function that is called to get the device for the variable. - restore: whether the variable should be added to the - VARIABLES_TO_RESTORE collection. - - Returns: - The created or existing variable. - """ - collections = list(collections or []) - - # Make sure variables are added to tf.GraphKeys.GLOBAL_VARIABLES and MODEL_VARIABLES - collections += [tf.GraphKeys.GLOBAL_VARIABLES, MODEL_VARIABLES] - # Add to VARIABLES_TO_RESTORE if necessary - if restore: - collections.append(VARIABLES_TO_RESTORE) - # Remove duplicates - collections = set(collections) - # Get the device for the variable. 
- with tf.device(variable_device(device, name)): - return tf.get_variable(name, shape=shape, dtype=dtype, - initializer=initializer, regularizer=regularizer, - trainable=trainable, collections=collections) diff --git a/examples/imagenet/inception/slim/variables_test.py b/examples/imagenet/inception/slim/variables_test.py deleted file mode 100644 index b8c1944d..00000000 --- a/examples/imagenet/inception/slim/variables_test.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright 2016 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for slim.variables.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from inception.slim import scopes -from inception.slim import variables - - -class VariablesTest(tf.test.TestCase): - - def testCreateVariable(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - self.assertEquals(a.op.name, 'A/a') - self.assertListEqual(a.get_shape().as_list(), [5]) - - def testGetVariables(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - with tf.variable_scope('B'): - b = variables.variable('a', [5]) - self.assertEquals([a, b], variables.get_variables()) - self.assertEquals([a], variables.get_variables('A')) - self.assertEquals([b], variables.get_variables('B')) - - def testGetVariablesSuffix(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - with tf.variable_scope('A'): - b = variables.variable('b', [5]) - self.assertEquals([a], variables.get_variables(suffix='a')) - self.assertEquals([b], variables.get_variables(suffix='b')) - - def testGetVariableWithSingleVar(self): - with self.test_session(): - with tf.variable_scope('parent'): - a = variables.variable('child', [5]) - self.assertEquals(a, variables.get_unique_variable('parent/child')) - - def testGetVariableWithDistractors(self): - with self.test_session(): - with tf.variable_scope('parent'): - a = variables.variable('child', [5]) - with tf.variable_scope('child'): - variables.variable('grandchild1', [7]) - variables.variable('grandchild2', [9]) - self.assertEquals(a, variables.get_unique_variable('parent/child')) - - def testGetVariableThrowsExceptionWithNoMatch(self): - var_name = 'cant_find_me' - with self.test_session(): - with self.assertRaises(ValueError): - 
variables.get_unique_variable(var_name) - - def testGetThrowsExceptionWithChildrenButNoMatch(self): - var_name = 'parent/child' - with self.test_session(): - with tf.variable_scope(var_name): - variables.variable('grandchild1', [7]) - variables.variable('grandchild2', [9]) - with self.assertRaises(ValueError): - variables.get_unique_variable(var_name) - - def testGetVariablesToRestore(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - with tf.variable_scope('B'): - b = variables.variable('a', [5]) - self.assertEquals([a, b], variables.get_variables_to_restore()) - - def testNoneGetVariablesToRestore(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5], restore=False) - with tf.variable_scope('B'): - b = variables.variable('a', [5], restore=False) - self.assertEquals([], variables.get_variables_to_restore()) - self.assertEquals([a, b], variables.get_variables()) - - def testGetMixedVariablesToRestore(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - b = variables.variable('b', [5], restore=False) - with tf.variable_scope('B'): - c = variables.variable('c', [5]) - d = variables.variable('d', [5], restore=False) - self.assertEquals([a, b, c, d], variables.get_variables()) - self.assertEquals([a, c], variables.get_variables_to_restore()) - - def testReuseVariable(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', []) - with tf.variable_scope('A', reuse=True): - b = variables.variable('a', []) - self.assertEquals(a, b) - self.assertListEqual([a], variables.get_variables()) - - def testVariableWithDevice(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [], device='cpu:0') - b = variables.variable('b', [], device='cpu:1') - self.assertDeviceEqual(a.device, 'cpu:0') - self.assertDeviceEqual(b.device, 'cpu:1') - - def 
testVariableWithDeviceFromScope(self): - with self.test_session(): - with tf.device('/cpu:0'): - a = variables.variable('a', []) - b = variables.variable('b', [], device='cpu:1') - self.assertDeviceEqual(a.device, 'cpu:0') - self.assertDeviceEqual(b.device, 'cpu:1') - - def testVariableWithDeviceFunction(self): - class DevFn(object): - - def __init__(self): - self.counter = -1 - - def __call__(self, op): - self.counter += 1 - return 'cpu:%d' % self.counter - - with self.test_session(): - with scopes.arg_scope([variables.variable], device=DevFn()): - a = variables.variable('a', []) - b = variables.variable('b', []) - c = variables.variable('c', [], device='cpu:12') - d = variables.variable('d', []) - with tf.device('cpu:99'): - e_init = tf.constant(12) - e = variables.variable('e', initializer=e_init) - self.assertDeviceEqual(a.device, 'cpu:0') - self.assertDeviceEqual(a.initial_value.device, 'cpu:0') - self.assertDeviceEqual(b.device, 'cpu:1') - self.assertDeviceEqual(b.initial_value.device, 'cpu:1') - self.assertDeviceEqual(c.device, 'cpu:12') - self.assertDeviceEqual(c.initial_value.device, 'cpu:12') - self.assertDeviceEqual(d.device, 'cpu:2') - self.assertDeviceEqual(d.initial_value.device, 'cpu:2') - self.assertDeviceEqual(e.device, 'cpu:3') - self.assertDeviceEqual(e.initial_value.device, 'cpu:99') - - def testVariableWithReplicaDeviceSetter(self): - with self.test_session(): - with tf.device(tf.train.replica_device_setter(ps_tasks=2)): - a = variables.variable('a', []) - b = variables.variable('b', []) - c = variables.variable('c', [], device='cpu:12') - d = variables.variable('d', []) - with tf.device('cpu:99'): - e_init = tf.constant(12) - e = variables.variable('e', initializer=e_init) - # The values below highlight how the replica_device_setter puts initial - # values on the worker job, and how it merges explicit devices. 
- self.assertDeviceEqual(a.device, '/job:ps/task:0/cpu:0') - self.assertDeviceEqual(a.initial_value.device, '/job:worker/cpu:0') - self.assertDeviceEqual(b.device, '/job:ps/task:1/cpu:0') - self.assertDeviceEqual(b.initial_value.device, '/job:worker/cpu:0') - self.assertDeviceEqual(c.device, '/job:ps/task:0/cpu:12') - self.assertDeviceEqual(c.initial_value.device, '/job:worker/cpu:12') - self.assertDeviceEqual(d.device, '/job:ps/task:1/cpu:0') - self.assertDeviceEqual(d.initial_value.device, '/job:worker/cpu:0') - self.assertDeviceEqual(e.device, '/job:ps/task:0/cpu:0') - self.assertDeviceEqual(e.initial_value.device, '/job:worker/cpu:99') - - def testVariableWithVariableDeviceChooser(self): - - with tf.Graph().as_default(): - device_fn = variables.VariableDeviceChooser(num_parameter_servers=2) - with scopes.arg_scope([variables.variable], device=device_fn): - a = variables.variable('a', []) - b = variables.variable('b', []) - c = variables.variable('c', [], device='cpu:12') - d = variables.variable('d', []) - with tf.device('cpu:99'): - e_init = tf.constant(12) - e = variables.variable('e', initializer=e_init) - # The values below highlight how the VariableDeviceChooser puts initial - # values on the same device as the variable job. 
- self.assertDeviceEqual(a.device, '/job:ps/task:0/cpu:0') - self.assertDeviceEqual(a.initial_value.device, a.device) - self.assertDeviceEqual(b.device, '/job:ps/task:1/cpu:0') - self.assertDeviceEqual(b.initial_value.device, b.device) - self.assertDeviceEqual(c.device, '/cpu:12') - self.assertDeviceEqual(c.initial_value.device, c.device) - self.assertDeviceEqual(d.device, '/job:ps/task:0/cpu:0') - self.assertDeviceEqual(d.initial_value.device, d.device) - self.assertDeviceEqual(e.device, '/job:ps/task:1/cpu:0') - self.assertDeviceEqual(e.initial_value.device, '/cpu:99') - - def testVariableGPUPlacement(self): - - with tf.Graph().as_default(): - device_fn = variables.VariableDeviceChooser(placement='gpu:0') - with scopes.arg_scope([variables.variable], device=device_fn): - a = variables.variable('a', []) - b = variables.variable('b', []) - c = variables.variable('c', [], device='cpu:12') - d = variables.variable('d', []) - with tf.device('cpu:99'): - e_init = tf.constant(12) - e = variables.variable('e', initializer=e_init) - # The values below highlight how the VariableDeviceChooser puts initial - # values on the same device as the variable job. 
- self.assertDeviceEqual(a.device, '/gpu:0') - self.assertDeviceEqual(a.initial_value.device, a.device) - self.assertDeviceEqual(b.device, '/gpu:0') - self.assertDeviceEqual(b.initial_value.device, b.device) - self.assertDeviceEqual(c.device, '/cpu:12') - self.assertDeviceEqual(c.initial_value.device, c.device) - self.assertDeviceEqual(d.device, '/gpu:0') - self.assertDeviceEqual(d.initial_value.device, d.device) - self.assertDeviceEqual(e.device, '/gpu:0') - self.assertDeviceEqual(e.initial_value.device, '/cpu:99') - - def testVariableCollection(self): - with self.test_session(): - a = variables.variable('a', [], collections='A') - b = variables.variable('b', [], collections='B') - self.assertEquals(a, tf.get_collection('A')[0]) - self.assertEquals(b, tf.get_collection('B')[0]) - - def testVariableCollections(self): - with self.test_session(): - a = variables.variable('a', [], collections=['A', 'C']) - b = variables.variable('b', [], collections=['B', 'C']) - self.assertEquals(a, tf.get_collection('A')[0]) - self.assertEquals(b, tf.get_collection('B')[0]) - - def testVariableCollectionsWithArgScope(self): - with self.test_session(): - with scopes.arg_scope([variables.variable], collections='A'): - a = variables.variable('a', []) - b = variables.variable('b', []) - self.assertListEqual([a, b], tf.get_collection('A')) - - def testVariableCollectionsWithArgScopeNested(self): - with self.test_session(): - with scopes.arg_scope([variables.variable], collections='A'): - a = variables.variable('a', []) - with scopes.arg_scope([variables.variable], collections='B'): - b = variables.variable('b', []) - self.assertEquals(a, tf.get_collection('A')[0]) - self.assertEquals(b, tf.get_collection('B')[0]) - - def testVariableCollectionsWithArgScopeNonNested(self): - with self.test_session(): - with scopes.arg_scope([variables.variable], collections='A'): - a = variables.variable('a', []) - with scopes.arg_scope([variables.variable], collections='B'): - b = variables.variable('b', 
[]) - variables.variable('c', []) - self.assertListEqual([a], tf.get_collection('A')) - self.assertListEqual([b], tf.get_collection('B')) - - def testVariableRestoreWithArgScopeNested(self): - with self.test_session(): - with scopes.arg_scope([variables.variable], restore=True): - a = variables.variable('a', []) - with scopes.arg_scope([variables.variable], - trainable=False, - collections=['A', 'B']): - b = variables.variable('b', []) - c = variables.variable('c', []) - self.assertListEqual([a, b, c], variables.get_variables_to_restore()) - self.assertListEqual([a, c], tf.trainable_variables()) - self.assertListEqual([b], tf.get_collection('A')) - self.assertListEqual([b], tf.get_collection('B')) - - -class GetVariablesByNameTest(tf.test.TestCase): - - def testGetVariableGivenNameScoped(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - b = variables.variable('b', [5]) - self.assertEquals([a], variables.get_variables_by_name('a')) - self.assertEquals([b], variables.get_variables_by_name('b')) - - def testGetVariablesByNameReturnsByValueWithScope(self): - with self.test_session(): - with tf.variable_scope('A'): - a = variables.variable('a', [5]) - matched_variables = variables.get_variables_by_name('a') - - # If variables.get_variables_by_name returns the list by reference, the - # following append should persist, and be returned, in subsequent calls - # to variables.get_variables_by_name('a'). - matched_variables.append(4) - - matched_variables = variables.get_variables_by_name('a') - self.assertEquals([a], matched_variables) - - def testGetVariablesByNameReturnsByValueWithoutScope(self): - with self.test_session(): - a = variables.variable('a', [5]) - matched_variables = variables.get_variables_by_name('a') - - # If variables.get_variables_by_name returns the list by reference, the - # following append should persist, and be returned, in subsequent calls - # to variables.get_variables_by_name('a'). 
- matched_variables.append(4) - - matched_variables = variables.get_variables_by_name('a') - self.assertEquals([a], matched_variables) - - -class GlobalStepTest(tf.test.TestCase): - - def testStable(self): - with tf.Graph().as_default(): - gs = variables.global_step() - gs2 = variables.global_step() - self.assertTrue(gs is gs2) - - def testDevice(self): - with tf.Graph().as_default(): - with scopes.arg_scope([variables.global_step], device='/gpu:0'): - gs = variables.global_step() - self.assertDeviceEqual(gs.device, '/gpu:0') - - def testDeviceFn(self): - class DevFn(object): - - def __init__(self): - self.counter = -1 - - def __call__(self, op): - self.counter += 1 - return '/cpu:%d' % self.counter - - with tf.Graph().as_default(): - with scopes.arg_scope([variables.global_step], device=DevFn()): - gs = variables.global_step() - gs2 = variables.global_step() - self.assertDeviceEqual(gs.device, '/cpu:0') - self.assertEquals(gs, gs2) - self.assertDeviceEqual(gs2.device, '/cpu:0') - - def testReplicaDeviceSetter(self): - device_fn = tf.train.replica_device_setter(2) - with tf.Graph().as_default(): - with scopes.arg_scope([variables.global_step], device=device_fn): - gs = variables.global_step() - gs2 = variables.global_step() - self.assertEquals(gs, gs2) - self.assertDeviceEqual(gs.device, '/job:ps/task:0') - self.assertDeviceEqual(gs.initial_value.device, '/job:ps/task:0') - self.assertDeviceEqual(gs2.device, '/job:ps/task:0') - self.assertDeviceEqual(gs2.initial_value.device, '/job:ps/task:0') - - def testVariableWithVariableDeviceChooser(self): - - with tf.Graph().as_default(): - device_fn = variables.VariableDeviceChooser() - with scopes.arg_scope([variables.global_step], device=device_fn): - gs = variables.global_step() - gs2 = variables.global_step() - self.assertEquals(gs, gs2) - self.assertDeviceEqual(gs.device, 'cpu:0') - self.assertDeviceEqual(gs.initial_value.device, gs.device) - self.assertDeviceEqual(gs2.device, 'cpu:0') - 
self.assertDeviceEqual(gs2.initial_value.device, gs2.device) - - -if __name__ == '__main__': - tf.test.main() From b6d454e4dc7651c5c8fc83d54bad62acb221563d Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 27 Aug 2019 15:21:33 -0700 Subject: [PATCH 21/37] initial code from tensorflow/models/official --- examples/resnet/README.md | 100 ++++ examples/resnet/__init__.py | 0 examples/resnet/cifar_preprocessing.py | 164 +++++++ examples/resnet/common.py | 407 ++++++++++++++++ examples/resnet/common_test.py | 109 +++++ examples/resnet/imagenet_preprocessing.py | 537 ++++++++++++++++++++++ examples/resnet/resnet_cifar_main.py | 241 ++++++++++ examples/resnet/resnet_cifar_model.py | 262 +++++++++++ examples/resnet/resnet_cifar_test.py | 187 ++++++++ examples/resnet/resnet_imagenet_main.py | 285 ++++++++++++ examples/resnet/resnet_imagenet_test.py | 282 ++++++++++++ examples/resnet/resnet_model.py | 389 ++++++++++++++++ 12 files changed, 2963 insertions(+) create mode 100644 examples/resnet/README.md create mode 100644 examples/resnet/__init__.py create mode 100644 examples/resnet/cifar_preprocessing.py create mode 100644 examples/resnet/common.py create mode 100644 examples/resnet/common_test.py create mode 100644 examples/resnet/imagenet_preprocessing.py create mode 100644 examples/resnet/resnet_cifar_main.py create mode 100644 examples/resnet/resnet_cifar_model.py create mode 100644 examples/resnet/resnet_cifar_test.py create mode 100644 examples/resnet/resnet_imagenet_main.py create mode 100644 examples/resnet/resnet_imagenet_test.py create mode 100644 examples/resnet/resnet_model.py diff --git a/examples/resnet/README.md b/examples/resnet/README.md new file mode 100644 index 00000000..72260396 --- /dev/null +++ b/examples/resnet/README.md @@ -0,0 +1,100 @@ +This folder contains the Keras implementation of the ResNet models. For more +information about the models, please refer to this [README file](../../README.md). 
+ +Similar to the [estimator implementation](../../r1/resnet), the Keras +implementation has code for both CIFAR-10 data and ImageNet data. The CIFAR-10 +version uses a ResNet56 model implemented in +[`resnet_cifar_model.py`](./resnet_cifar_model.py), and the ImageNet version +uses a ResNet50 model implemented in [`resnet_model.py`](./resnet_model.py). + +To use +either dataset, make sure that you have the latest version of TensorFlow +installed and +[add the models folder to your Python path](/official/#running-the-models), +otherwise you may encounter an error like `ImportError: No module named +official.resnet`. + +## CIFAR-10 + +Download and extract the CIFAR-10 data. You can use the following script: +```bash +python ../../r1/resnet/cifar10_download_and_extract.py +``` + +After you download the data, you can run the program by: + +```bash +python resnet_cifar_main.py +``` + +If you did not use the default directory to download the data, specify the +location with the `--data_dir` flag, like: + +```bash +python resnet_cifar_main.py --data_dir=/path/to/cifar +``` + +## ImageNet + +Download the ImageNet dataset and convert it to TFRecord format. +The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py) +and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy) +provide a few options. + +Once your dataset is ready, you can begin training the model as follows: + +```bash +python resnet_imagenet_main.py +``` + +Again, if you did not download the data to the default directory, specify the +location with the `--data_dir` flag: + +```bash +python resnet_imagenet_main.py --data_dir=/path/to/imagenet +``` + +There are more flag options you can specify. 
Here are some examples: + +- `--use_synthetic_data`: when set to true, synthetic data, rather than real +data, are used; +- `--batch_size`: the batch size used for the model; +- `--model_dir`: the directory to save the model checkpoint; +- `--train_epochs`: number of epochs to run for training the model; +- `--train_steps`: number of steps to run for training the model. We now only +support a number that is smaller than the number of batches in an epoch. +- `--skip_eval`: when set to true, evaluation as well as validation during +training is skipped + +For example, this is a typical command line to run with ImageNet data with +batch size 128 per GPU: + +```bash +python -m resnet_imagenet_main \ + --model_dir=/tmp/model_dir/something \ + --num_gpus=2 \ + --batch_size=128 \ + --train_epochs=90 \ + --train_steps=10 \ + --use_synthetic_data=false +``` + +See [`common.py`](common.py) for the full list of options. + +## Using multiple GPUs +You can train these models on multiple GPUs using `tf.distribute.Strategy` API. +You can read more about them in this +[guide](https://www.tensorflow.org/guide/distribute_strategy). + +In this example, we have made it easier to use with just a command line flag +`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA, +and 0 otherwise. + +- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device. +- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device. +- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous +distributed training across the GPUs. + +If you wish to run without `tf.distribute.Strategy`, you can do so by setting +`--distribution_strategy=off`.
+ diff --git a/examples/resnet/__init__.py b/examples/resnet/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/resnet/cifar_preprocessing.py b/examples/resnet/cifar_preprocessing.py new file mode 100644 index 00000000..dcf174b9 --- /dev/null +++ b/examples/resnet/cifar_preprocessing.py @@ -0,0 +1,164 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Provides utilities to Cifar-10 dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from absl import logging +import tensorflow as tf + +from official.vision.image_classification import imagenet_preprocessing + +HEIGHT = 32 +WIDTH = 32 +NUM_CHANNELS = 3 +_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS +# The record is the image plus a one-byte label +_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1 + +# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits. +NUM_IMAGES = { + 'train': 50000, + 'validation': 10000, +} +_NUM_DATA_FILES = 5 +NUM_CLASSES = 10 + + +def parse_record(raw_record, is_training, dtype): + """Parses a record containing a training example of an image. + + The input record is parsed into a label and image, and the image is passed + through preprocessing steps (cropping, flipping, and so on). 
+ + This method converts the label to one hot to fit the loss function. + + Args: + raw_record: scalar Tensor tf.string containing a serialized + Example protocol buffer. + is_training: A boolean denoting whether the input is for training. + dtype: Data type to use for input images. + + Returns: + Tuple with processed image tensor and one-hot-encoded label tensor. + """ + # Convert bytes to a vector of uint8 that is record_bytes long. + record_vector = tf.io.decode_raw(raw_record, tf.uint8) + + # The first byte represents the label, which we convert from uint8 to int32 + # and then to one-hot. + label = tf.cast(record_vector[0], tf.int32) + + # The remaining bytes after the label represent the image, which we reshape + # from [depth * height * width] to [depth, height, width]. + depth_major = tf.reshape(record_vector[1:_RECORD_BYTES], + [NUM_CHANNELS, HEIGHT, WIDTH]) + + # Convert from [depth, height, width] to [height, width, depth], and cast as + # float32. + image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32) + + image = preprocess_image(image, is_training) + image = tf.cast(image, dtype) + + # TODO(haoyuzhang,hongkuny,tobyboyd): Remove or replace the use of V1 API + label = tf.compat.v1.sparse_to_dense(label, (NUM_CLASSES,), 1) + return image, label + + +def preprocess_image(image, is_training): + """Preprocess a single image of layout [height, width, depth].""" + if is_training: + # Resize the image to add four extra pixels on each side. + image = tf.image.resize_with_crop_or_pad( + image, HEIGHT + 8, WIDTH + 8) + + # Randomly crop a [HEIGHT, WIDTH] section of the image. + image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS]) + + # Randomly flip the image horizontally. + image = tf.image.random_flip_left_right(image) + + # Subtract off the mean and divide by the variance of the pixels. 
+ image = tf.image.per_image_standardization(image) + return image + + +def get_filenames(is_training, data_dir): + """Returns a list of filenames.""" + assert tf.io.gfile.exists(data_dir), ( + 'Run cifar10_download_and_extract.py first to download and extract the ' + 'CIFAR-10 data.') + + if is_training: + return [ + os.path.join(data_dir, 'data_batch_%d.bin' % i) + for i in range(1, _NUM_DATA_FILES + 1) + ] + else: + return [os.path.join(data_dir, 'test_batch.bin')] + + +def input_fn(is_training, + data_dir, + batch_size, + num_epochs=1, + dtype=tf.float32, + datasets_num_private_threads=None, + parse_record_fn=parse_record, + input_context=None, + drop_remainder=False): + """Input function which provides batches for train or eval. + + Args: + is_training: A boolean denoting whether the input is for training. + data_dir: The directory containing the input data. + batch_size: The number of samples per batch. + num_epochs: The number of epochs to repeat the dataset. + dtype: Data type to use for images/features + datasets_num_private_threads: Number of private threads for tf.data. + parse_record_fn: Function to use for parsing the records. + input_context: A `tf.distribute.InputContext` object passed in by + `tf.distribute.Strategy`. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + + Returns: + A dataset that can be used for iteration. 
+ """ + filenames = get_filenames(is_training, data_dir) + dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES) + + if input_context: + logging.info( + 'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d', + input_context.input_pipeline_id, input_context.num_input_pipelines) + dataset = dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + return imagenet_preprocessing.process_record_dataset( + dataset=dataset, + is_training=is_training, + batch_size=batch_size, + shuffle_buffer=NUM_IMAGES['train'], + parse_record_fn=parse_record_fn, + num_epochs=num_epochs, + dtype=dtype, + datasets_num_private_threads=datasets_num_private_threads, + drop_remainder=drop_remainder + ) diff --git a/examples/resnet/common.py b/examples/resnet/common.py new file mode 100644 index 00000000..434a59d5 --- /dev/null +++ b/examples/resnet/common.py @@ -0,0 +1,407 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Common util functions and classes used by both keras cifar and imagenet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import multiprocessing +import os + +from absl import flags +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2 +from official.utils.flags import core as flags_core +from official.utils.misc import keras_utils + +FLAGS = flags.FLAGS +BASE_LEARNING_RATE = 0.1 # This matches Jing's version. +TRAIN_TOP_1 = 'training_accuracy_top_1' + + +class LearningRateBatchScheduler(tf.keras.callbacks.Callback): + """Callback to update learning rate on every batch (not epoch boundaries). + + N.B. Only support Keras optimizers, not TF optimizers. + + Args: + schedule: a function that takes an epoch index and a batch index as input + (both integer, indexed from 0) and returns a new learning rate as + output (float). 
+ """ + + def __init__(self, schedule, batch_size, num_images): + super(LearningRateBatchScheduler, self).__init__() + self.schedule = schedule + self.batches_per_epoch = num_images / batch_size + self.batch_size = batch_size + self.epochs = -1 + self.prev_lr = -1 + + def on_epoch_begin(self, epoch, logs=None): + if not hasattr(self.model.optimizer, 'learning_rate'): + raise ValueError('Optimizer must have a "learning_rate" attribute.') + self.epochs += 1 + + def on_batch_begin(self, batch, logs=None): + """Executes before step begins.""" + lr = self.schedule(self.epochs, + batch, + self.batches_per_epoch, + self.batch_size) + if not isinstance(lr, (float, np.float32, np.float64)): + raise ValueError('The output of the "schedule" function should be float.') + if lr != self.prev_lr: + self.model.optimizer.learning_rate = lr # lr should be a float here + self.prev_lr = lr + tf.compat.v1.logging.debug( + 'Epoch %05d Batch %05d: LearningRateBatchScheduler ' + 'change learning rate to %s.', self.epochs, batch, lr) + + +class PiecewiseConstantDecayWithWarmup( + tf.keras.optimizers.schedules.LearningRateSchedule): + """Piecewise constant decay with warmup schedule.""" + + def __init__(self, batch_size, epoch_size, warmup_epochs, boundaries, + multipliers, compute_lr_on_cpu=True, name=None): + super(PiecewiseConstantDecayWithWarmup, self).__init__() + if len(boundaries) != len(multipliers) - 1: + raise ValueError('The length of boundaries must be 1 less than the ' + 'length of multipliers') + + base_lr_batch_size = 256 + num_batches_per_epoch = epoch_size // batch_size + + self.rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size + self.step_boundaries = [float(num_batches_per_epoch) * x + for x in boundaries] + self.lr_values = [self.rescaled_lr * m for m in multipliers] + self.warmup_steps = warmup_epochs * num_batches_per_epoch + self.compute_lr_on_cpu = compute_lr_on_cpu + self.name = name + + self.learning_rate_ops_cache = {} + + def __call__(self, 
step): + if tf.executing_eagerly(): + return self._get_learning_rate(step) + + # In an eager function or graph, the current implementation of optimizer + # repeatedly call and thus create ops for the learning rate schedule. To + # avoid this, we cache the ops if not executing eagerly. + graph = tf.compat.v1.get_default_graph() + if graph not in self.learning_rate_ops_cache: + if self.compute_lr_on_cpu: + with tf.device('/device:CPU:0'): + self.learning_rate_ops_cache[graph] = self._get_learning_rate(step) + else: + self.learning_rate_ops_cache[graph] = self._get_learning_rate(step) + return self.learning_rate_ops_cache[graph] + + def _get_learning_rate(self, step): + """Compute learning rate at given step.""" + with tf.compat.v1.name_scope(self.name, 'PiecewiseConstantDecayWithWarmup', + [self.rescaled_lr, self.step_boundaries, + self.lr_values, self.warmup_steps, + self.compute_lr_on_cpu]): + def warmup_lr(step): + return self.rescaled_lr * ( + tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32)) + def piecewise_lr(step): + return tf.compat.v1.train.piecewise_constant( + step, self.step_boundaries, self.lr_values) + return tf.cond(step < self.warmup_steps, + lambda: warmup_lr(step), + lambda: piecewise_lr(step)) + + def get_config(self): + return { + 'rescaled_lr': self.rescaled_lr, + 'step_boundaries': self.step_boundaries, + 'lr_values': self.lr_values, + 'warmup_steps': self.warmup_steps, + 'compute_lr_on_cpu': self.compute_lr_on_cpu, + 'name': self.name + } + + +def set_gpu_thread_mode_and_count(flags_obj): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + tf.compat.v1.logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = flags_obj.per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = 
str(per_gpu_thread_count) + tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s', + os.environ['TF_GPU_THREAD_COUNT']) + tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s', + os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. + total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus + num_runtime_threads = flags_obj.num_gpus + if not flags_obj.datasets_num_private_threads: + flags_obj.datasets_num_private_threads = min( + cpu_count - total_gpu_thread_count - num_runtime_threads, + flags_obj.num_gpus * 8) + tf.compat.v1.logging.info('Set datasets_num_private_threads to %s', + flags_obj.datasets_num_private_threads) + + +def get_optimizer(learning_rate=0.1): + """Returns optimizer to use.""" + # The learning_rate is overwritten at the beginning of each step by callback. + return gradient_descent_v2.SGD(learning_rate=learning_rate, momentum=0.9) + + +def get_callbacks(learning_rate_schedule_fn, num_images): + """Returns common callbacks.""" + time_callback = keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps) + callbacks = [time_callback] + + if not FLAGS.use_tensor_lr: + lr_callback = LearningRateBatchScheduler( + learning_rate_schedule_fn, + batch_size=FLAGS.batch_size, + num_images=num_images) + callbacks.append(lr_callback) + + if FLAGS.enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir=FLAGS.model_dir) + callbacks.append(tensorboard_callback) + + if FLAGS.profile_steps: + profiler_callback = keras_utils.get_profiler_callback( + FLAGS.model_dir, + FLAGS.profile_steps, + FLAGS.enable_tensorboard) + callbacks.append(profiler_callback) + + return callbacks + + +def build_stats(history, eval_output, callbacks): + """Normalizes and returns dictionary of stats. + + Args: + history: Results of the training step. Supports both categorical_accuracy + and sparse_categorical_accuracy. 
+ eval_output: Output of the eval step. Assumes first value is eval_loss and + second value is accuracy_top_1. + callbacks: a list of callbacks which might include a time history callback + used during keras.fit. + + Returns: + Dictionary of normalized results. + """ + stats = {} + if eval_output: + stats['accuracy_top_1'] = eval_output[1].item() + stats['eval_loss'] = eval_output[0].item() + + if history and history.history: + train_hist = history.history + # Gets final loss from training. + stats['loss'] = train_hist['loss'][-1].item() + # Gets top_1 training accuracy. + if 'categorical_accuracy' in train_hist: + stats[TRAIN_TOP_1] = train_hist['categorical_accuracy'][-1].item() + elif 'sparse_categorical_accuracy' in train_hist: + stats[TRAIN_TOP_1] = train_hist['sparse_categorical_accuracy'][-1].item() + + if not callbacks: + return stats + + # Look for the time history callback which was used during keras.fit + for callback in callbacks: + if isinstance(callback, keras_utils.TimeHistory): + timestamp_log = callback.timestamp_log + stats['step_timestamp_log'] = timestamp_log + stats['train_finish_time'] = callback.train_finish_time + if len(timestamp_log) > 1: + stats['avg_exp_per_second'] = ( + callback.batch_size * callback.log_steps * + (len(callback.timestamp_log)-1) / + (timestamp_log[-1].timestamp - timestamp_log[0].timestamp)) + return stats + + +def define_keras_flags(dynamic_loss_scale=True): + """Define flags for Keras models.""" + flags_core.define_base(run_eagerly=True) + flags_core.define_performance(num_parallel_calls=False, + synthetic_data=True, + dtype=True, + all_reduce_alg=True, + num_packs=True, + tf_gpu_thread_mode=True, + datasets_num_private_threads=True, + dynamic_loss_scale=dynamic_loss_scale, + loss_scale=True, + tf_data_experimental_slack=True, + enable_xla=True, + force_v2_in_keras_compile=True) + flags_core.define_image() + flags_core.define_benchmark() + flags_core.define_distribution() + flags.adopt_module_key_flags(flags_core) + 
+ flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?') + flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?') + # TODO(b/135607288): Remove this flag once we understand the root cause of + # slowdown when setting the learning phase in Keras backend. + flags.DEFINE_boolean( + name='set_learning_phase_to_train', default=True, + help='If skip eval, also set Keras learning phase to 1 (training).') + flags.DEFINE_boolean( + name='explicit_gpu_placement', default=False, + help='If not using distribution strategy, explicitly set device scope ' + 'for the Keras training loop.') + flags.DEFINE_boolean(name='use_trivial_model', default=False, + help='Whether to use a trivial Keras model.') + flags.DEFINE_boolean(name='report_accuracy_metrics', default=True, + help='Report metrics during training and evaluation.') + flags.DEFINE_boolean(name='use_tensor_lr', default=False, + help='Use learning rate tensor instead of a callback.') + flags.DEFINE_boolean( + name='enable_tensorboard', default=False, + help='Whether to enable Tensorboard callback.') + flags.DEFINE_integer( + name='train_steps', default=None, + help='The number of steps to run for training. If it is larger than ' + '# batches per epoch, then use # batches per epoch. When this flag is ' + 'set, only one epoch is going to run for training.') + flags.DEFINE_string( + name='profile_steps', default=None, + help='Save profiling data to model dir at given range of steps. The ' + 'value must be a comma separated pair of positive integers, specifying ' + 'the first and last step to profile. For example, "--profile_steps=2,4" ' + 'triggers the profiler to process 3 steps, starting from the 2nd step. 
' + 'Note that profiler has a non-trivial performance overhead, and the ' + 'output file can be gigantic if profiling many steps.') + flags.DEFINE_boolean( + name='data_delay_prefetch', default=False, + help='Add a small delay in tf.data prefetch to prioritize memory copy of ' + 'other tensors over the data minibatch for the (T+1)th step. It should ' + 'help improve performance using EagerIterator and function. The codepath ' + 'when enabling this feature is experimental and will be removed once the ' + 'corresponding performance features are fully supported in TensorFlow.') + flags.DEFINE_boolean( + name='batchnorm_spatial_persistent', default=True, + help='Enable the spacial persistent mode for CuDNN batch norm kernel.') + flags.DEFINE_boolean( + name='enable_get_next_as_optional', default=False, + help='Enable get_next_as_optional behavior in DistributedIterator.') + +def get_synth_input_fn(height, width, num_channels, num_classes, + dtype=tf.float32, drop_remainder=True): + """Returns an input function that returns a dataset with random data. + + This input_fn returns a data set that iterates over a set of random data and + bypasses all preprocessing, e.g. jpeg decode and copy. The host to device + copy is still included. This used to find the upper throughput bound when + tuning the full input pipeline. + + Args: + height: Integer height that will be used to create a fake image tensor. + width: Integer width that will be used to create a fake image tensor. + num_channels: Integer depth that will be used to create a fake image tensor. + num_classes: Number of classes that should be represented in the fake labels + tensor + dtype: Data type for features/images. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + + Returns: + An input_fn that can be used in place of a real one to return a dataset + that can be used for iteration. 
+ """ + # pylint: disable=unused-argument + def input_fn(is_training, data_dir, batch_size, *args, **kwargs): + """Returns dataset filled with random data.""" + # Synthetic input should be within [0, 255]. + inputs = tf.random.truncated_normal([height, width, num_channels], + dtype=dtype, + mean=127, + stddev=60, + name='synthetic_inputs') + + labels = tf.random.uniform([1], + minval=0, + maxval=num_classes - 1, + dtype=tf.int32, + name='synthetic_labels') + # Cast to float32 for Keras model. + labels = tf.cast(labels, dtype=tf.float32) + + data = tf.data.Dataset.from_tensors((inputs, labels)).repeat() + + # `drop_remainder` will make dataset produce outputs with known shapes. + data = data.batch(batch_size, drop_remainder=drop_remainder) + data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + return data + + return input_fn + + +def data_delay_prefetch(): + """Use unstable code for perf tuning purposes.""" + if not FLAGS.use_synthetic_data: + _monkey_patch_org_create_device_dataset() + + +def set_cudnn_batchnorm_mode(): + """Set CuDNN batchnorm mode for better performance. + + Note: Spatial Persistent mode may lead to accuracy losses for certain + models. + """ + if FLAGS.batchnorm_spatial_persistent: + os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' + else: + os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None) + + +# TODO(haoyuzhang): remove this monkey patch when the "prefetch with slack" +# feature is available in tf.data. +def _monkey_patch_org_create_device_dataset(): + """Monkey-patch `_create_device_dataset` method with delayed prefetch.""" + + import ast # pylint: disable=g-import-not-at-top + import inspect # pylint: disable=g-import-not-at-top + from tensorflow.python.data.ops import multi_device_iterator_ops # pylint: disable=g-import-not-at-top + + tf.compat.v1.logging.info( + 'Using monkey-patched version of MultiDeviceIterator. 
It should be ' + 'removed when the prefetch with slack feature is implemented in tf.data.') + cls_multi_device_iterator = ast.parse( + inspect.getsource(multi_device_iterator_ops.MultiDeviceIterator)) + org_create_device_dataset_code = inspect.getsource( + multi_device_iterator_ops.MultiDeviceIterator._create_device_dataset) # pylint: disable=protected-access + code_lines = org_create_device_dataset_code.split('\n') + # Insert in reverse order to avoid line number shift by previous insertions + code_lines.insert(5, ' ds = ds.apply(sleep_ops.sleep(11000))') # 11ms + code_lines.insert(2, ' from tensorflow.python.data.experimental.ops import sleep as sleep_ops') # pylint: disable=line-too-long + patched_code = '\n'.join(line[2:] for line in code_lines) + cls_multi_device_iterator.body[0].body[2] = ast.parse(patched_code).body[0] + exec(compile(cls_multi_device_iterator, '', 'exec'), # pylint: disable=exec-used + multi_device_iterator_ops.__dict__) diff --git a/examples/resnet/common_test.py b/examples/resnet/common_test.py new file mode 100644 index 00000000..d7cd5310 --- /dev/null +++ b/examples/resnet/common_test.py @@ -0,0 +1,109 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the common module.""" +from __future__ import absolute_import +from __future__ import print_function + +from mock import Mock +import numpy as np +import tensorflow as tf + +from tensorflow.python.platform import googletest +from official.utils.misc import keras_utils +from official.vision.image_classification import common + + +class KerasCommonTests(tf.test.TestCase): + """Tests for common.""" + + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(KerasCommonTests, cls).setUpClass() + + def test_build_stats(self): + + history = self._build_history(1.145, cat_accuracy=.99988) + eval_output = self._build_eval_output(.56432111, 5.990) + th = keras_utils.TimeHistory(128, 100) + + th.timestamp_log = [keras_utils.BatchTimestamp(0, 1), + keras_utils.BatchTimestamp(1, 2), + keras_utils.BatchTimestamp(2, 3)] + th.train_finish_time = 12345 + stats = common.build_stats(history, eval_output, [th]) + + self.assertEqual(1.145, stats['loss']) + self.assertEqual(.99988, stats['training_accuracy_top_1']) + + self.assertEqual(.56432111, stats['accuracy_top_1']) + self.assertEqual(5.990, stats['eval_loss']) + + self.assertEqual(3, stats['step_timestamp_log'][2].timestamp) + self.assertEqual(12345, stats['train_finish_time']) + + def test_build_stats_sparse(self): + + history = self._build_history(1.145, cat_accuracy_sparse=.99988) + eval_output = self._build_eval_output(.928, 1.9844) + stats = common.build_stats(history, eval_output, None) + + self.assertEqual(1.145, stats['loss']) + self.assertEqual(.99988, stats['training_accuracy_top_1']) + + self.assertEqual(.928, stats['accuracy_top_1']) + self.assertEqual(1.9844, stats['eval_loss']) + + def test_time_history(self): + th = keras_utils.TimeHistory(batch_size=128, log_steps=3) + + th.on_train_begin() + th.on_batch_begin(0) + th.on_batch_end(0) + th.on_batch_begin(1) + th.on_batch_end(1) + th.on_batch_begin(2) + 
th.on_batch_end(2) + th.on_batch_begin(3) + th.on_batch_end(3) + th.on_batch_begin(4) + th.on_batch_end(4) + th.on_batch_begin(5) + th.on_batch_end(5) + th.on_batch_begin(6) + th.on_batch_end(6) + th.on_train_end() + + self.assertEqual(3, len(th.timestamp_log)) + + def _build_history(self, loss, cat_accuracy=None, + cat_accuracy_sparse=None): + history_p = Mock() + history = {} + history_p.history = history + history['loss'] = [np.float64(loss)] + if cat_accuracy: + history['categorical_accuracy'] = [np.float64(cat_accuracy)] + if cat_accuracy_sparse: + history['sparse_categorical_accuracy'] = [np.float64(cat_accuracy_sparse)] + + return history_p + + def _build_eval_output(self, top_1, eval_loss): + eval_output = [np.float64(eval_loss), np.float64(top_1)] + return eval_output + +if __name__ == '__main__': + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + googletest.main() diff --git a/examples/resnet/imagenet_preprocessing.py b/examples/resnet/imagenet_preprocessing.py new file mode 100644 index 00000000..eb5bb0d4 --- /dev/null +++ b/examples/resnet/imagenet_preprocessing.py @@ -0,0 +1,537 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Provides utilities to preprocess images. + +Training images are sampled using the provided bounding boxes, and subsequently +cropped to the sampled bounding box. 
Images are additionally flipped randomly, +then resized to the target output size (without aspect-ratio preservation). + +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. + +All images undergo mean color subtraction. + +Note that these steps are colloquially referred to as "ResNet preprocessing," +and they differ from "VGG preprocessing," which does not use bounding boxes +and instead does an aspect-preserving resize followed by random crop during +training. (These both differ from "Inception preprocessing," which introduces +color distortion steps.) + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from absl import logging +import tensorflow as tf + +DEFAULT_IMAGE_SIZE = 224 +NUM_CHANNELS = 3 +NUM_CLASSES = 1001 + +NUM_IMAGES = { + 'train': 1281167, + 'validation': 50000, +} + +_NUM_TRAIN_FILES = 1024 +_SHUFFLE_BUFFER = 10000 + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + +# The lower bound for the smallest side of the image for aspect-preserving +# resizing. For example, if an image is 500 x 1000, it will be resized to +# _RESIZE_MIN x (_RESIZE_MIN * 2). +_RESIZE_MIN = 256 + + +def process_record_dataset(dataset, + is_training, + batch_size, + shuffle_buffer, + parse_record_fn, + num_epochs=1, + dtype=tf.float32, + datasets_num_private_threads=None, + drop_remainder=False, + tf_data_experimental_slack=False): + """Given a Dataset with raw records, return an iterator over the records. + + Args: + dataset: A Dataset representing raw records + is_training: A boolean denoting whether the input is for training. + batch_size: The number of samples per batch. + shuffle_buffer: The buffer size to use when shuffling records. A larger + value results in better randomness, but smaller values reduce startup + time and use less memory. 
+ parse_record_fn: A function that takes a raw record and returns the + corresponding (image, label) pair. + num_epochs: The number of epochs to repeat the dataset. + dtype: Data type to use for images/features. + datasets_num_private_threads: Number of threads for a private + threadpool created for all datasets computation. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + + Returns: + Dataset of (image, label) pairs ready for iteration. + """ + # Defines a specific size thread pool for tf.data operations. + if datasets_num_private_threads: + options = tf.data.Options() + options.experimental_threading.private_threadpool_size = ( + datasets_num_private_threads) + dataset = dataset.with_options(options) + logging.info( + 'datasets_num_private_threads: %s', datasets_num_private_threads) + + # Disable intra-op parallelism to optimize for throughput instead of latency. + options = tf.data.Options() + options.experimental_threading.max_intra_op_parallelism = 1 + dataset = dataset.with_options(options) + + # Prefetches a batch at a time to smooth out the time taken to load input + # files for shuffling and processing. + dataset = dataset.prefetch(buffer_size=batch_size) + if is_training: + # Shuffles records before repeating to respect epoch boundaries. + dataset = dataset.shuffle(buffer_size=shuffle_buffer) + + # Repeats the dataset for the number of epochs to train. + dataset = dataset.repeat(num_epochs) + + # Parses the raw records into images and labels. + dataset = dataset.map( + lambda value: parse_record_fn(value, is_training, dtype), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + + # Operations between the final prefetch and the get_next call to the iterator + # will happen synchronously during run time. 
We prefetch here again to + # background all of the above processing work and keep it out of the + # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE + # allows DistributionStrategies to adjust how many batches to fetch based + # on how many devices are present. + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + if tf_data_experimental_slack: + options = tf.data.Options() + options.experimental_slack = True + dataset = dataset.with_options(options) + + return dataset + + +def get_filenames(is_training, data_dir): + """Return filenames for dataset.""" + if is_training: + return [ + os.path.join(data_dir, 'train-%05d-of-01024' % i) + for i in range(_NUM_TRAIN_FILES)] + else: + return [ + os.path.join(data_dir, 'validation-%05d-of-00128' % i) + for i in range(128)] + + +def _parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields (values are included as examples): + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized + Example protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax].
+ """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, + default_value=-1), + 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in [ + 'image/object/bbox/xmin', 'image/object/bbox/ymin', + 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, + features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) + + return features['image/encoded'], label, bbox + + +def parse_record(raw_record, is_training, dtype): + """Parses a record containing a training example of an image. + + The input record is parsed into a label and image, and the image is passed + through preprocessing steps (cropping, flipping, and so on). + + Args: + raw_record: scalar Tensor tf.string containing a serialized + Example protocol buffer. + is_training: A boolean denoting whether the input is for training. + dtype: data type to use for images/features. + + Returns: + Tuple with processed image tensor and float32 label tensor of shape [1].
+ """ + image_buffer, label, bbox = _parse_example_proto(raw_record) + + image = preprocess_image( + image_buffer=image_buffer, + bbox=bbox, + output_height=DEFAULT_IMAGE_SIZE, + output_width=DEFAULT_IMAGE_SIZE, + num_channels=NUM_CHANNELS, + is_training=is_training) + image = tf.cast(image, dtype) + + # Subtract one so that labels are in [0, 1000), and cast to float32 for + # Keras model. + label = tf.cast(tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, + dtype=tf.float32) + return image, label + + +def input_fn(is_training, + data_dir, + batch_size, + num_epochs=1, + dtype=tf.float32, + datasets_num_private_threads=None, + parse_record_fn=parse_record, + input_context=None, + drop_remainder=False, + tf_data_experimental_slack=False): + """Input function which provides batches for train or eval. + + Args: + is_training: A boolean denoting whether the input is for training. + data_dir: The directory containing the input data. + batch_size: The number of samples per batch. + num_epochs: The number of epochs to repeat the dataset. + dtype: Data type to use for images/features + datasets_num_private_threads: Number of private threads for tf.data. + parse_record_fn: Function to use for parsing the records. + input_context: A `tf.distribute.InputContext` object passed in by + `tf.distribute.Strategy`. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's + `experimental_slack` option. + + Returns: + A dataset that can be used for iteration. 
+ """ + filenames = get_filenames(is_training, data_dir) + dataset = tf.data.Dataset.from_tensor_slices(filenames) + + if input_context: + logging.info( + 'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d', + input_context.input_pipeline_id, input_context.num_input_pipelines) + dataset = dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + if is_training: + # Shuffle the input files + dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES) + + # Convert to individual records. + # cycle_length = 10 means that up to 10 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=10, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return process_record_dataset( + dataset=dataset, + is_training=is_training, + batch_size=batch_size, + shuffle_buffer=_SHUFFLE_BUFFER, + parse_record_fn=parse_record_fn, + num_epochs=num_epochs, + dtype=dtype, + datasets_num_private_threads=datasets_num_private_threads, + drop_remainder=drop_remainder, + tf_data_experimental_slack=tf_data_experimental_slack, + ) + + +def _decode_crop_and_flip(image_buffer, bbox, num_channels): + """Crops the given image to a random part of the image, and randomly flips. + + We use the fused decode_and_crop op, which performs better than the two ops + used separately in series, but note that this requires that the image be + passed in as an un-decoded string Tensor. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + num_channels: Integer depth of the image buffer for decoding. + + Returns: + 3-D tensor with cropped image. 
+ + """ + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. If no box is supplied, then we assume the bounding box is + # the entire image. + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.image.extract_jpeg_shape(image_buffer), + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.75, 1.33], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Reassemble the bounding box in the format the crop op requires. + offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + + # Use the fused decode and crop op here, which is faster than each in series. + cropped = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=num_channels) + + # Flip to add a little more random distortion in. + cropped = tf.image.random_flip_left_right(cropped) + return cropped + + +def _central_crop(image, crop_height, crop_width): + """Performs central crops of the given image list. + + Args: + image: a 3-D image tensor + crop_height: the height of the image following the crop. + crop_width: the width of the image following the crop. + + Returns: + 3-D tensor with cropped image. 
+ """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + amount_to_be_cropped_h = (height - crop_height) + crop_top = amount_to_be_cropped_h // 2 + amount_to_be_cropped_w = (width - crop_width) + crop_left = amount_to_be_cropped_w // 2 + return tf.slice( + image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) + + +def _mean_image_subtraction(image, means, num_channels): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means, num_channels=3) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + num_channels: number of color channels in the image that will be distorted. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if `num_channels` doesn't match the + number of values in `means`. + """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + # We have a 1-D tensor of means; convert to 3-D. + # Note(b/130245863): we explicitly call `tf.broadcast_to` instead of simply + # expanding dimensions for better performance. + means = tf.broadcast_to(means, tf.shape(image)) + + return image - means + + +def _smallest_size_at_least(height, width, resize_min): + """Computes new shape with the smallest side equal to `resize_min`. + + Computes new shape with the smallest side equal to `resize_min` while + preserving the original aspect ratio. + + Args: + height: an int32 scalar tensor indicating the current height. + width: an int32 scalar tensor indicating the current width. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize.
+ + Returns: + new_height: an int32 scalar tensor indicating the new height. + new_width: an int32 scalar tensor indicating the new width. + """ + resize_min = tf.cast(resize_min, tf.float32) + + # Convert to floats to make subsequent calculations go smoothly. + height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) + + smaller_dim = tf.minimum(height, width) + scale_ratio = resize_min / smaller_dim + + # Convert back to ints to make heights and widths that TF ops will accept. + new_height = tf.cast(height * scale_ratio, tf.int32) + new_width = tf.cast(width * scale_ratio, tf.int32) + + return new_height, new_width + + +def _aspect_preserving_resize(image, resize_min): + """Resize images preserving the original aspect ratio. + + Args: + image: A 3-D image `Tensor`. + resize_min: A python integer or scalar `Tensor` indicating the size of + the smallest side after resize. + + Returns: + resized_image: A 3-D tensor containing the resized image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + new_height, new_width = _smallest_size_at_least(height, width, resize_min) + + return _resize_image(image, new_height, new_width) + + +def _resize_image(image, height, width): + """Simple wrapper around tf.compat.v1.image.resize. + + This is primarily to make sure we use the same `ResizeMethod` and other + details each time. + + Args: + image: A 3-D image `Tensor`. + height: The target height for the resized image. + width: The target width for the resized image. + + Returns: + resized_image: A 3-D tensor containing the resized image. The first two + dimensions have the shape [height, width]. + """ + return tf.compat.v1.image.resize( + image, [height, width], method=tf.image.ResizeMethod.BILINEAR, + align_corners=False) + + +def preprocess_image(image_buffer, bbox, output_height, output_width, + num_channels, is_training=False): + """Preprocesses the given image.
+ + Preprocessing includes decoding, cropping, and resizing for both training + and eval images. Training preprocessing, however, introduces some random + distortion of the image to improve accuracy. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + num_channels: Integer depth of the image buffer for decoding. + is_training: `True` if we're preprocessing the image for training and + `False` otherwise. + + Returns: + A preprocessed image. + """ + if is_training: + # For training, we want to randomize some of the distortions. + image = _decode_crop_and_flip(image_buffer, bbox, num_channels) + image = _resize_image(image, output_height, output_width) + else: + # For validation, we want to decode, resize, then just crop the middle. + image = tf.image.decode_jpeg(image_buffer, channels=num_channels) + image = _aspect_preserving_resize(image, _RESIZE_MIN) + image = _central_crop(image, output_height, output_width) + + image.set_shape([output_height, output_width, num_channels]) + + return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels) diff --git a/examples/resnet/resnet_cifar_main.py b/examples/resnet/resnet_cifar_main.py new file mode 100644 index 00000000..bfc411a4 --- /dev/null +++ b/examples/resnet/resnet_cifar_main.py @@ -0,0 +1,241 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Runs a ResNet model on the Cifar-10 dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import app as absl_app +from absl import flags +import tensorflow as tf + +from official.utils.flags import core as flags_core +from official.utils.logs import logger +from official.utils.misc import distribution_utils +from official.utils.misc import keras_utils +from official.vision.image_classification import cifar_preprocessing +from official.vision.image_classification import common +from official.vision.image_classification import resnet_cifar_model + + +LR_SCHEDULE = [ # (multiplier, epoch to start) tuples + (0.1, 91), (0.01, 136), (0.001, 182) +] + + +def learning_rate_schedule(current_epoch, + current_batch, + batches_per_epoch, + batch_size): + """Handles linear scaling rule and LR decay. + + Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the + provided scaling factor. + + Args: + current_epoch: integer, current epoch indexed from 0. + current_batch: integer, current batch in the current epoch, indexed from 0. + batches_per_epoch: integer, number of steps in an epoch. + batch_size: integer, total batch size. + + Returns: + Adjusted learning rate.
+ """ + del current_batch, batches_per_epoch # not used + initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128 + learning_rate = initial_learning_rate + for mult, start_epoch in LR_SCHEDULE: + if current_epoch >= start_epoch: + learning_rate = initial_learning_rate * mult + else: + break + return learning_rate + + +def run(flags_obj): + """Run ResNet Cifar-10 training and eval loop using native Keras APIs. + + Args: + flags_obj: An object containing parsed flag values. + + Raises: + ValueError: If fp16 is passed as it is not currently supported. + + Returns: + Dictionary of training and eval stats. + """ + keras_utils.set_session_config( + enable_eager=flags_obj.enable_eager, + enable_xla=flags_obj.enable_xla) + + # Execute flag override logic for better model performance + if flags_obj.tf_gpu_thread_mode: + common.set_gpu_thread_mode_and_count(flags_obj) + common.set_cudnn_batchnorm_mode() + + dtype = flags_core.get_tf_dtype(flags_obj) + if dtype == 'fp16': + raise ValueError('dtype fp16 is not supported in Keras. Use the default ' + 'value(fp32).') + + data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + strategy = distribution_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + num_workers=distribution_utils.configure_cluster(), + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs) + + if strategy: + # flags_obj.enable_get_next_as_optional controls whether enabling + # get_next_as_optional behavior in DistributedIterator. If true, last + # partial batch can be supported. 
+ strategy.extended.experimental_enable_get_next_as_optional = ( + flags_obj.enable_get_next_as_optional + ) + + strategy_scope = distribution_utils.get_strategy_scope(strategy) + + if flags_obj.use_synthetic_data: + distribution_utils.set_up_synthetic_data() + input_fn = common.get_synth_input_fn( + height=cifar_preprocessing.HEIGHT, + width=cifar_preprocessing.WIDTH, + num_channels=cifar_preprocessing.NUM_CHANNELS, + num_classes=cifar_preprocessing.NUM_CLASSES, + dtype=flags_core.get_tf_dtype(flags_obj), + drop_remainder=True) + else: + distribution_utils.undo_set_up_synthetic_data() + input_fn = cifar_preprocessing.input_fn + + train_input_dataset = input_fn( + is_training=True, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + num_epochs=flags_obj.train_epochs, + parse_record_fn=cifar_preprocessing.parse_record, + datasets_num_private_threads=flags_obj.datasets_num_private_threads, + dtype=dtype, + # Setting drop_remainder to avoid the partial batch logic in normalization + # layer, which triggers tf.where and leads to extra memory copy of input + # sizes between host and GPU. + drop_remainder=(not flags_obj.enable_get_next_as_optional)) + + eval_input_dataset = None + if not flags_obj.skip_eval: + eval_input_dataset = input_fn( + is_training=False, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + num_epochs=flags_obj.train_epochs, + parse_record_fn=cifar_preprocessing.parse_record) + + with strategy_scope: + optimizer = common.get_optimizer() + model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES) + + # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer + # a valid arg for this model. Also remove as a valid flag.
+ if flags_obj.force_v2_in_keras_compile is not None: + model.compile( + loss='categorical_crossentropy', + optimizer=optimizer, + metrics=(['categorical_accuracy'] + if flags_obj.report_accuracy_metrics else None), + run_eagerly=flags_obj.run_eagerly, + experimental_run_tf_function=flags_obj.force_v2_in_keras_compile) + else: + model.compile( + loss='categorical_crossentropy', + optimizer=optimizer, + metrics=(['categorical_accuracy'] + if flags_obj.report_accuracy_metrics else None), + run_eagerly=flags_obj.run_eagerly) + + callbacks = common.get_callbacks( + learning_rate_schedule, cifar_preprocessing.NUM_IMAGES['train']) + + train_steps = cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size + train_epochs = flags_obj.train_epochs + + if flags_obj.train_steps: + train_steps = min(flags_obj.train_steps, train_steps) + train_epochs = 1 + + num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // + flags_obj.batch_size) + + validation_data = eval_input_dataset + if flags_obj.skip_eval: + if flags_obj.set_learning_phase_to_train: + # TODO(haoyuzhang): Understand slowdown of setting learning phase when + # not using distribution strategy. + tf.keras.backend.set_learning_phase(1) + num_eval_steps = None + validation_data = None + + if not strategy and flags_obj.explicit_gpu_placement: + # TODO(b/135607227): Add device scope automatically in Keras training loop + # when not using distribution strategy.
+ no_dist_strat_device = tf.device('/device:GPU:0') + no_dist_strat_device.__enter__() + + history = model.fit(train_input_dataset, + epochs=train_epochs, + steps_per_epoch=train_steps, + callbacks=callbacks, + validation_steps=num_eval_steps, + validation_data=validation_data, + validation_freq=flags_obj.epochs_between_evals, + verbose=2) + eval_output = None + if not flags_obj.skip_eval: + eval_output = model.evaluate(eval_input_dataset, + steps=num_eval_steps, + verbose=2) + + if not strategy and flags_obj.explicit_gpu_placement: + no_dist_strat_device.__exit__() + + stats = common.build_stats(history, eval_output, callbacks) + return stats + + +def define_cifar_flags(): + common.define_keras_flags(dynamic_loss_scale=False) + + flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin', + model_dir='/tmp/cifar10_model', + train_epochs=182, + epochs_between_evals=10, + batch_size=128) + + +def main(_): + with logger.benchmark_context(flags.FLAGS): + return run(flags.FLAGS) + + +if __name__ == '__main__': + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + define_cifar_flags() + absl_app.run(main) diff --git a/examples/resnet/resnet_cifar_model.py b/examples/resnet/resnet_cifar_model.py new file mode 100644 index 00000000..1b507381 --- /dev/null +++ b/examples/resnet/resnet_cifar_model.py @@ -0,0 +1,262 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50. + +# Reference: +- [Deep Residual Learning for Image Recognition]( + https://arxiv.org/abs/1512.03385) +Adapted from code contributed by BigMoyan. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import tensorflow as tf +from tensorflow.python.keras import backend +from tensorflow.python.keras import initializers +from tensorflow.python.keras import layers +from tensorflow.python.keras import regularizers + + +BATCH_NORM_DECAY = 0.997 +BATCH_NORM_EPSILON = 1e-5 +L2_WEIGHT_DECAY = 2e-4 + + +def identity_building_block(input_tensor, + kernel_size, + filters, + stage, + block, + training=None): + """The identity block is the block that has no conv layer at shortcut. + + Arguments: + input_tensor: input tensor + kernel_size: the kernel size of the + conv layers at main path + filters: list of 2 integers, the filters of the 2 conv layers at main path + stage: integer, current stage label, used for generating layer names + block: current block label, used for generating layer names + training: Only used if training keras model with Estimator. In other + scenarios it is handled automatically. + + Returns: + Output tensor for the block.
+ """ + filters1, filters2 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D(filters1, kernel_size, + padding='same', use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name=conv_name_base + '2a')(input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')(x, training=training) + x = layers.Activation('relu')(x) + + x = layers.Conv2D(filters2, kernel_size, + padding='same', use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name=conv_name_base + '2b')(x) + x = layers.BatchNormalization( + axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')(x, training=training) + + x = layers.add([x, input_tensor]) + x = layers.Activation('relu')(x) + return x + + +def conv_building_block(input_tensor, + kernel_size, + filters, + stage, + block, + strides=(2, 2), + training=None): + """A block that has a conv layer at shortcut. + + Arguments: + input_tensor: input tensor + kernel_size: default 3, the kernel size of + middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: current block label, used for generating layer names + strides: Strides for the first conv layer in the block. + training: Only used if training keras model with Estimator. In other + scenarios it is handled automatically. + + Returns: + Output tensor for the block. 
+ + Note that from stage 3, + the first conv layer at main path is with strides=(2, 2) + And the shortcut should have strides=(2, 2) as well + """ + filters1, filters2 = filters + if tf.keras.backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D(filters1, kernel_size, strides=strides, + padding='same', use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name=conv_name_base + '2a')(input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')(x, training=training) + x = layers.Activation('relu')(x) + + x = layers.Conv2D(filters2, kernel_size, padding='same', use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name=conv_name_base + '2b')(x) + x = layers.BatchNormalization( + axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')(x, training=training) + + shortcut = layers.Conv2D(filters2, (1, 1), strides=strides, use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name=conv_name_base + '1')(input_tensor) + shortcut = layers.BatchNormalization( + axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '1')(shortcut, training=training) + + x = layers.add([x, shortcut]) + x = layers.Activation('relu')(x) + return x + + +def resnet_block(input_tensor, + size, + kernel_size, + filters, + stage, + conv_strides=(2, 2), + training=None): + """A block which applies conv followed by multiple identity blocks. + + Arguments: + input_tensor: input tensor + size: integer, number of constituent conv/identity building blocks. + A conv block is applied once, followed by (size - 1) identity blocks. 
+ kernel_size: default 3, the kernel size of + middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + conv_strides: Strides for the first conv layer in the block. + training: Only used if training keras model with Estimator. In other + scenarios it is handled automatically. + + Returns: + Output tensor after applying conv and identity blocks. + """ + + x = conv_building_block(input_tensor, kernel_size, filters, stage=stage, + strides=conv_strides, block='block_0', + training=training) + for i in range(size - 1): + x = identity_building_block(x, kernel_size, filters, stage=stage, + block='block_%d' % (i + 1), training=training) + return x + + +def resnet(num_blocks, classes=10, training=None): + """Instantiates the ResNet architecture. + + Arguments: + num_blocks: integer, the number of conv/identity blocks in each block. + The ResNet contains 3 blocks with each block containing one conv block + followed by (layers_per_block - 1) number of idenity blocks. Each + conv/idenity block has 2 convolutional layers. With the input + convolutional layer and the pooling layer towards the end, this brings + the total size of the network to (6*num_blocks + 2) + classes: optional number of classes to classify images into + training: Only used if training keras model with Estimator. In other + scenarios it is handled automatically. + + Returns: + A Keras model instance. 
+ """ + + input_shape = (32, 32, 3) + img_input = layers.Input(shape=input_shape) + + if backend.image_data_format() == 'channels_first': + x = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)), + name='transpose')(img_input) + bn_axis = 1 + else: # channel_last + x = img_input + bn_axis = 3 + + x = layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(x) + x = layers.Conv2D(16, (3, 3), + strides=(1, 1), + padding='valid', use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name='conv1')(x) + x = layers.BatchNormalization(axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name='bn_conv1',)(x, training=training) + x = layers.Activation('relu')(x) + + x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[16, 16], + stage=2, conv_strides=(1, 1), training=training) + + x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[32, 32], + stage=3, conv_strides=(2, 2), training=training) + + x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[64, 64], + stage=4, conv_strides=(2, 2), training=training) + + rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3] + x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x) + x = layers.Dense(classes, + activation='softmax', + kernel_initializer=initializers.RandomNormal(stddev=0.01), + kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY), + name='fc10')(x) + + inputs = img_input + # Create model. 
+ model = tf.keras.models.Model(inputs, x, name='resnet56') + + return model + + +resnet20 = functools.partial(resnet, num_blocks=3) +resnet32 = functools.partial(resnet, num_blocks=5) +resnet56 = functools.partial(resnet, num_blocks=9) +resnet10 = functools.partial(resnet, num_blocks=110) diff --git a/examples/resnet/resnet_cifar_test.py b/examples/resnet/resnet_cifar_test.py new file mode 100644 index 00000000..b1480697 --- /dev/null +++ b/examples/resnet/resnet_cifar_test.py @@ -0,0 +1,187 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test the keras ResNet model with Cifar data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tempfile + +import tensorflow as tf + +from tensorflow.python.eager import context +from tensorflow.python.platform import googletest +from official.utils.misc import keras_utils +from official.utils.testing import integration +from official.vision.image_classification import cifar_preprocessing +from official.vision.image_classification import resnet_cifar_main + + +class KerasCifarTest(googletest.TestCase): + """Unit tests for Keras ResNet with Cifar.""" + + _extra_flags = [ + "-batch_size", "4", + "-train_steps", "1", + "-use_synthetic_data", "true" + ] + _tempdir = None + + def get_temp_dir(self): + if not self._tempdir: + self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir()) + return self._tempdir + + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(KerasCifarTest, cls).setUpClass() + resnet_cifar_main.define_cifar_flags() + + def setUp(self): + super(KerasCifarTest, self).setUp() + cifar_preprocessing.NUM_IMAGES["validation"] = 4 + + def tearDown(self): + super(KerasCifarTest, self).tearDown() + tf.io.gfile.rmtree(self.get_temp_dir()) + + def test_end_to_end_no_dist_strat(self): + """Test Keras model with 1 GPU, no distribution strategy.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + extra_flags = [ + "-distribution_strategy", "off", + "-model_dir", "keras_cifar_no_dist_strat", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_no_dist_strat(self): + """Test Keras model in legacy graph mode with 1 GPU, no dist strat.""" + 
extra_flags = [ + "-enable_eager", "false", + "-distribution_strategy", "off", + "-model_dir", "keras_cifar_graph_no_dist_strat", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_1_gpu(self): + """Test Keras model with 1 GPU.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 1: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(1, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "1", + "-distribution_strategy", "default", + "-model_dir", "keras_cifar_1_gpu", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_1_gpu(self): + """Test Keras model in legacy graph mode with 1 GPU.""" + if context.num_gpus() < 1: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(1, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "1", + "-noenable_eager", + "-distribution_strategy", "default", + "-model_dir", "keras_cifar_graph_1_gpu", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_2_gpu(self): + """Test Keras model with 2 GPUs.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". 
+ format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-distribution_strategy", "default", + "-model_dir", "keras_cifar_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_2_gpu(self): + """Test Keras model in legacy graph mode with 2 GPUs.""" + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-enable_eager", "false", + "-distribution_strategy", "default", + "-model_dir", "keras_cifar_graph_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_cifar_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + +if __name__ == "__main__": + googletest.main() diff --git a/examples/resnet/resnet_imagenet_main.py b/examples/resnet/resnet_imagenet_main.py new file mode 100644 index 00000000..6de458c1 --- /dev/null +++ b/examples/resnet/resnet_imagenet_main.py @@ -0,0 +1,285 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import app as absl_app
from absl import flags
from absl import logging
import tensorflow as tf

from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
from official.vision.image_classification import common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_model
from official.benchmark.models import trivial_model


LR_SCHEDULE = [    # (multiplier, epoch to start) tuples
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]


def learning_rate_schedule(current_epoch,
                           current_batch,
                           batches_per_epoch,
                           batch_size):
  """Handles linear scaling rule, gradual warmup, and LR decay.

  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
  provided scaling factor. Before the first boundary (the warmup end) the
  rate grows linearly from zero with the fractional epoch.

  Args:
    current_epoch: integer, current epoch indexed from 0.
    current_batch: integer, current batch in the current epoch, indexed from 0.
    batches_per_epoch: integer, number of steps in an epoch.
    batch_size: integer, total batch size.

  Returns:
    Adjusted learning rate.
  """
  # Linear scaling rule: the base rate is defined for a batch size of 256.
  initial_lr = common.BASE_LEARNING_RATE * batch_size / 256
  epoch = current_epoch + float(current_batch) / batches_per_epoch
  warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
  if epoch < warmup_end_epoch:
    # Learning rate increases linearly per step.
    return initial_lr * warmup_lr_multiplier * epoch / warmup_end_epoch
  # Past warmup the first entry always matches, so `learning_rate` is bound;
  # the last entry whose start epoch has been reached wins.
  for mult, start_epoch in LR_SCHEDULE:
    if epoch >= start_epoch:
      learning_rate = initial_lr * mult
    else:
      break
  return learning_rate


def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using native Keras APIs.

  Configures the session, precision policy, data format and distribution
  strategy from `flags_obj`, builds the input datasets and the model, then
  calls `model.fit` and, unless `skip_eval` is set, `model.evaluate`.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    common.set_gpu_thread_mode_and_count(flags_obj)
  if flags_obj.data_delay_prefetch:
    common.data_delay_prefetch()
  common.set_cudnn_batchnorm_mode()

  dtype = flags_core.get_tf_dtype(flags_obj)
  if dtype == 'float16':
    policy = tf.keras.mixed_precision.experimental.Policy('infer_float32_vars')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=num_workers,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  if strategy:
    # flags_obj.enable_get_next_as_optional controls whether enabling
    # get_next_as_optional behavior in DistributedIterator. If true, last
    # partial batch can be supported.
    strategy.extended.experimental_enable_get_next_as_optional = (
        flags_obj.enable_get_next_as_optional
    )

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  # pylint: disable=protected-access
  if flags_obj.use_synthetic_data:
    distribution_utils.set_up_synthetic_data()
    input_fn = common.get_synth_input_fn(
        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        num_channels=imagenet_preprocessing.NUM_CHANNELS,
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        dtype=dtype,
        drop_remainder=True)
  else:
    distribution_utils.undo_set_up_synthetic_data()
    input_fn = imagenet_preprocessing.input_fn

  # When `enable_xla` is True, we always drop the remainder of the batches
  # in the dataset, as XLA-GPU doesn't support dynamic shapes.
  drop_remainder = flags_obj.enable_xla

  train_input_dataset = input_fn(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=flags_obj.batch_size,
      num_epochs=flags_obj.train_epochs,
      parse_record_fn=imagenet_preprocessing.parse_record,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=dtype,
      drop_remainder=drop_remainder,
      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
  )

  eval_input_dataset = None
  if not flags_obj.skip_eval:
    eval_input_dataset = input_fn(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=imagenet_preprocessing.parse_record,
        dtype=dtype,
        drop_remainder=drop_remainder)

  # Constant rate by default; with `use_tensor_lr` the schedule is built from
  # LR_SCHEDULE instead.
  lr_schedule = 0.1
  if flags_obj.use_tensor_lr:
    lr_schedule = common.PiecewiseConstantDecayWithWarmup(
        batch_size=flags_obj.batch_size,
        epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
        warmup_epochs=LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in LR_SCHEDULE),
        compute_lr_on_cpu=True)

  with strategy_scope:
    optimizer = common.get_optimizer(lr_schedule)
    if dtype == 'float16':
      # TODO(reedwm): Remove manually wrapping optimizer once mixed precision
      # can be enabled with a single line of code.
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          optimizer, loss_scale=flags_core.get_loss_scale(flags_obj,
                                                          default_for_fp16=128))

    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
    if flags_obj.use_trivial_model:
      model = trivial_model.trivial_model(
          imagenet_preprocessing.NUM_CLASSES, dtype)
    else:
      model = resnet_model.resnet50(
          num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.
    # The argument is only forwarded when the flag was set, so an unset flag
    # leaves Keras's own default in place.
    compile_kwargs = {}
    if flags_obj.force_v2_in_keras_compile is not None:
      compile_kwargs['experimental_run_tf_function'] = (
          flags_obj.force_v2_in_keras_compile)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=(['sparse_categorical_accuracy']
                 if flags_obj.report_accuracy_metrics else None),
        run_eagerly=flags_obj.run_eagerly,
        **compile_kwargs)

  callbacks = common.get_callbacks(
      learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train'])

  train_steps = (
      imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
  train_epochs = flags_obj.train_epochs

  if flags_obj.train_steps:
    # An explicit step budget caps one epoch and disables multi-epoch runs.
    train_steps = min(flags_obj.train_steps, train_steps)
    train_epochs = 1

  num_eval_steps = (
      imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)

  validation_data = eval_input_dataset
  if flags_obj.skip_eval:
    # Only build the training graph. This reduces memory usage introduced by
    # control flow ops in layers that have different implementations for
    # training and inference (e.g., batch norm).
    if flags_obj.set_learning_phase_to_train:
      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
      # not using distribution strategy.
      tf.keras.backend.set_learning_phase(1)
    num_eval_steps = None
    validation_data = None

  def _fit_and_evaluate():
    """Runs model.fit and, unless eval is skipped, a final model.evaluate."""
    fit_history = model.fit(train_input_dataset,
                            epochs=train_epochs,
                            steps_per_epoch=train_steps,
                            callbacks=callbacks,
                            validation_steps=num_eval_steps,
                            validation_data=validation_data,
                            validation_freq=flags_obj.epochs_between_evals,
                            verbose=2)
    final_eval = None
    if not flags_obj.skip_eval:
      final_eval = model.evaluate(eval_input_dataset,
                                  steps=num_eval_steps,
                                  verbose=2)
    return fit_history, final_eval

  if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
    # A `with` block replaces the manual __enter__()/__exit__() pair: the
    # scope is now released if fit/evaluate raises, and __exit__ receives the
    # exception arguments the context-manager protocol requires.
    with tf.device('/device:GPU:0'):
      history, eval_output = _fit_and_evaluate()
  else:
    history, eval_output = _fit_and_evaluate()

  stats = common.build_stats(history, eval_output, callbacks)
  return stats


def define_imagenet_keras_flags():
  """Defines the Keras flags and sets the ImageNet default of 90 epochs."""
  common.define_keras_flags()
  flags_core.set_defaults(train_epochs=90)
  flags.adopt_module_key_flags(common)


def main(_):
  """absl entry point: runs training inside a benchmark context, logs stats."""
  model_helpers.apply_clean(flags.FLAGS)
  with logger.benchmark_context(flags.FLAGS):
    stats = run(flags.FLAGS)
  logging.info('Run stats:\n%s', stats)


if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  define_imagenet_keras_flags()
  absl_app.run(main)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test the keras ResNet model with ImageNet data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tempfile + +import tensorflow as tf + +from tensorflow.python.eager import context +from tensorflow.python.platform import googletest +from official.utils.misc import keras_utils +from official.utils.testing import integration +from official.vision.image_classification import imagenet_preprocessing +from official.vision.image_classification import resnet_imagenet_main + + +class KerasImagenetTest(googletest.TestCase): + """Unit tests for Keras ResNet with ImageNet.""" + + _extra_flags = [ + "-batch_size", "4", + "-train_steps", "1", + "-use_synthetic_data", "true" + ] + _tempdir = None + + def get_temp_dir(self): + if not self._tempdir: + self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir()) + return self._tempdir + + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(KerasImagenetTest, cls).setUpClass() + resnet_imagenet_main.define_imagenet_keras_flags() + + def setUp(self): + super(KerasImagenetTest, self).setUp() + imagenet_preprocessing.NUM_IMAGES["validation"] = 4 + + def tearDown(self): + super(KerasImagenetTest, self).tearDown() + tf.io.gfile.rmtree(self.get_temp_dir()) + + def test_end_to_end_no_dist_strat(self): + """Test 
Keras model with 1 GPU, no distribution strategy.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + extra_flags = [ + "-distribution_strategy", "off", + "-model_dir", "keras_imagenet_no_dist_strat", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_no_dist_strat(self): + """Test Keras model in legacy graph mode with 1 GPU, no dist strat.""" + extra_flags = [ + "-enable_eager", "false", + "-distribution_strategy", "off", + "-model_dir", "keras_imagenet_graph_no_dist_strat", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_1_gpu(self): + """Test Keras model with 1 GPU.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 1: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(1, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "1", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_1_gpu", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_1_gpu(self): + """Test Keras model in legacy graph mode with 1 GPU.""" + if context.num_gpus() < 1: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". 
+ format(1, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "1", + "-enable_eager", "false", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_graph_1_gpu", + "-data_format", "channels_last", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_2_gpu(self): + """Test Keras model with 2 GPUs.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_xla_2_gpu(self): + """Test Keras model with XLA and 2 GPUs.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-enable_xla", "true", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_xla_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_2_gpu_fp16(self): + """Test Keras model with 2 GPUs and fp16.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". 
+ format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-dtype", "fp16", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_2_gpu_fp16", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_xla_2_gpu_fp16(self): + """Test Keras model with XLA, 2 GPUs and fp16.""" + config = keras_utils.get_config_proto_v1() + tf.compat.v1.enable_eager_execution(config=config) + + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-dtype", "fp16", + "-enable_xla", "true", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_xla_2_gpu_fp16", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_2_gpu(self): + """Test Keras model in legacy graph mode with 2 GPUs.""" + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-enable_eager", "false", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_graph_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + def test_end_to_end_graph_xla_2_gpu(self): + """Test Keras model in legacy graph mode with XLA and 2 GPUs.""" + if context.num_gpus() < 2: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". 
+ format(2, context.num_gpus())) + + extra_flags = [ + "-num_gpus", "2", + "-enable_eager", "false", + "-enable_xla", "true", + "-distribution_strategy", "default", + "-model_dir", "keras_imagenet_graph_xla_2_gpu", + ] + extra_flags = extra_flags + self._extra_flags + + integration.run_synthetic( + main=resnet_imagenet_main.run, + tmp_root=self.get_temp_dir(), + extra_flags=extra_flags + ) + + +if __name__ == "__main__": + googletest.main() diff --git a/examples/resnet/resnet_model.py b/examples/resnet/resnet_model.py new file mode 100644 index 00000000..fb014027 --- /dev/null +++ b/examples/resnet/resnet_model.py @@ -0,0 +1,389 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""ResNet50 model for Keras. + +Adapted from tf.keras.applications.resnet50.ResNet50(). +This is ResNet model version 1.5. 
+ +Related papers/blogs: +- https://arxiv.org/abs/1512.03385 +- https://arxiv.org/pdf/1603.05027v2.pdf +- http://torch.ch/blog/2016/02/04/resnets.html + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras import backend +from tensorflow.python.keras import initializers +from tensorflow.python.keras import layers +from tensorflow.python.keras import models +from tensorflow.python.keras import regularizers + + +L2_WEIGHT_DECAY = 1e-4 +BATCH_NORM_DECAY = 0.9 +BATCH_NORM_EPSILON = 1e-5 + + +def _gen_l2_regularizer(use_l2_regularizer=True): + return regularizers.l2(L2_WEIGHT_DECAY) if use_l2_regularizer else None + + +def identity_block(input_tensor, + kernel_size, + filters, + stage, + block, + use_l2_regularizer=True): + """The identity block is the block that has no conv layer at shortcut. + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2c')( + x) + + x = layers.add([x, input_tensor]) + x = layers.Activation('relu')(x) + return x + + +def conv_block(input_tensor, + kernel_size, + filters, + stage, + block, + strides=(2, 2), + use_l2_regularizer=True): + """A block that has a conv layer at shortcut. 
+ + Note that from stage 3, + the second conv layer at main path is with strides=(2, 2) + And the shortcut should have strides=(2, 2) as well + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + strides: Strides for the second conv layer in the block. + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + + Returns: + Output tensor for the block. + """ + filters1, filters2, filters3 = filters + if backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + strides=strides, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + 
name=bn_name_base + '2c')( + x) + + shortcut = layers.Conv2D( + filters3, (1, 1), + strides=strides, + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '1')( + input_tensor) + shortcut = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name=bn_name_base + '1')( + shortcut) + + x = layers.add([x, shortcut]) + x = layers.Activation('relu')(x) + return x + + +def resnet50(num_classes, + dtype='float32', + batch_size=None, + use_l2_regularizer=True): + """Instantiates the ResNet50 architecture. + + Args: + num_classes: `int` number of classes for image classification. + dtype: dtype to use float32 or float16 are most common. + batch_size: Size of the batches for each step. + use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer. + + Returns: + A Keras model instance. + """ + input_shape = (224, 224, 3) + img_input = layers.Input( + shape=input_shape, dtype=dtype, batch_size=batch_size) + + if backend.image_data_format() == 'channels_first': + x = layers.Lambda( + lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)), + name='transpose')( + img_input) + bn_axis = 1 + else: # channels_last + x = img_input + bn_axis = 3 + + x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x) + x = layers.Conv2D( + 64, (7, 7), + strides=(2, 2), + padding='valid', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='conv1')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=BATCH_NORM_DECAY, + epsilon=BATCH_NORM_EPSILON, + name='bn_conv1')( + x) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) + + x = conv_block( + x, + 3, [64, 64, 256], + stage=2, + block='a', + strides=(1, 1), + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='b', 
+ use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [64, 64, 256], + stage=2, + block='c', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [128, 128, 512], + stage=3, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [128, 128, 512], + stage=3, + block='d', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [256, 256, 1024], + stage=4, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='c', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='d', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='e', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [256, 256, 1024], + stage=4, + block='f', + use_l2_regularizer=use_l2_regularizer) + + x = conv_block( + x, + 3, [512, 512, 2048], + stage=5, + block='a', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='b', + use_l2_regularizer=use_l2_regularizer) + x = identity_block( + x, + 3, [512, 512, 2048], + stage=5, + block='c', + use_l2_regularizer=use_l2_regularizer) + + rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3] + x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x) + x = layers.Dense( + num_classes, + kernel_initializer=initializers.RandomNormal(stddev=0.01), + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + 
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='fc1000')( + x) + + # TODO(reedwm): Remove manual casts once mixed precision can be enabled with a + # single line of code. + x = backend.cast(x, 'float32') + x = layers.Activation('softmax')(x) + + # Create model. + return models.Model(img_input, x, name='resnet50') From 036a5c55d9cc0d8b4f6e3e63cb7d328f12499c63 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Thu, 29 Aug 2019 14:13:22 -0700 Subject: [PATCH 22/37] add resnet/cifar example --- examples/resnet/README.md | 151 +++++++--------- examples/resnet/README_orig.md | 100 +++++++++++ examples/resnet/resnet_cifar_dist.py | 240 ++++++++++++++++++++++++++ examples/resnet/resnet_cifar_spark.py | 22 +++ 4 files changed, 427 insertions(+), 86 deletions(-) create mode 100644 examples/resnet/README_orig.md create mode 100644 examples/resnet/resnet_cifar_dist.py create mode 100644 examples/resnet/resnet_cifar_spark.py diff --git a/examples/resnet/README.md b/examples/resnet/README.md index 72260396..98d7f597 100644 --- a/examples/resnet/README.md +++ b/examples/resnet/README.md @@ -1,100 +1,79 @@ -This folder contains the Keras implementation of the ResNet models. For more -information about the models, please refer to this [README file](../../README.md). - -Similar to the [estimator implementation](../../r1/resnet), the Keras -implementation has code for both CIFAR-10 data and ImageNet data. The CIFAR-10 -version uses a ResNet56 model implemented in -[`resnet_cifar_model.py`](./resnet_cifar_model.py), and the ImageNet version -uses a ResNet50 model implemented in [`resnet_model.py`](./resnet_model.py). - -To use -either dataset, make sure that you have the latest version of TensorFlow -installed and -[add the models folder to your Python path](/official/#running-the-models), -otherwise you may encounter an error like `ImportError: No module named -official.resnet`. - -## CIFAR-10 - -Download and extract the CIFAR-10 data. 
You can use the following script: -```bash -python ../../r1/resnet/cifar10_download_and_extract.py -``` - -After you download the data, you can run the program by: +# ResNet Image Classification -```bash -python resnet_cifar_main.py -``` +Original Source: https://github.com/tensorflow/models/tree/master/official/vision/image_classification -If you did not use the default directory to download the data, specify the -location with the `--data_dir` flag, like: +This code is based on the Image Classification model from the official [TensorFlow Models](https://github.com/tensorflow/models) repository. This example already supports different forms of distribution via the `DistributionStrategy` API, so there isn't much additional work to convert it to TensorFlowOnSpark. -```bash -python resnet_cifar_main.py --data_dir=/path/to/cifar -``` +Notes: +- This example assumes that Spark, TensorFlow, and TensorFlowOnSpark are already installed. +- For simplicity, this just uses a single-node Spark Standalone installation. -## ImageNet +#### Run the Single-Node Application -Download the ImageNet dataset and convert it to TFRecord format. -The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py) -and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy) -provide a few options. - -Once your dataset is ready, you can begin training the model as follows: - -```bash -python resnet_imagenet_main.py +First, make sure that you can run the example per the [original instructions](https://github.com/tensorflow/models/tree/68c3c65596b8fc624be15aef6eac3dc8952cbf23/official/vision/image_classification). For now, we'll just use the CIFAR-10 dataset. 
After cloning the `tensorflow/models` repository and downloading the dataset, you should be able to run the training as follows: ``` - -Again, if you did not download the data to the default directory, specify the -location with the `--data_dir` flag: - -```bash -python resnet_imagenet_main.py --data_dir=/path/to/imagenet +export TENSORFLOW_MODELS=/path/to/tensorflow/models +export CIFAR_DATA=/path/to/cifar +export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS} +python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --train_epochs=1 ``` -There are more flag options you can specify. Here are some examples: - -- `--use_synthetic_data`: when set to true, synthetic data, rather than real -data, are used; -- `--batch_size`: the batch size used for the model; -- `--model_dir`: the directory to save the model checkpoint; -- `--train_epochs`: number of epoches to run for training the model; -- `--train_steps`: number of steps to run for training the model. We now only -support a number that is smaller than the number of batches in an epoch. -- `--skip_eval`: when set to true, evaluation as well as validation during -training is skipped - -For example, this is a typical command line to run with ImageNet data with -batch size 128 per GPU: - -```bash -python -m resnet_imagenet_main \ - --model_dir=/tmp/model_dir/something \ - --num_gpus=2 \ - --batch_size=128 \ - --train_epochs=90 \ - --train_steps=10 \ - --use_synthetic_data=false -``` +If you have GPUs available, just set `--num_gpus` to the number of GPUs on your machine. Note: by default, `--train_epochs=182`, which runs for a long time on a CPU machine, so for brevity, we'll just run a single epoch in these examples. -See [`common.py`](common.py) for full list of options. +#### Run as a Distributed TensorFlow Application -## Using multiple GPUs -You can train these models on multiple GPUs using `tf.distribute.Strategy` API. 
-You can read more about them in this -[guide](https://www.tensorflow.org/guide/distribute_strategy). +Next, confirm that this application is capable of being distributed. We can test this on a single CPU machine by using two different terminal/shell sessions, as follows: +``` +# in one shell/window +export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS} +export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 0}}' +python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --ds=multi_worker_mirrored --train_epochs=1 + +# in another shell/window +export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS} +export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 1}}' +python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --ds=multi_worker_mirrored --train_epochs=1 +``` -In this example, we have made it easier to use is with just a command line flag -`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA, -and 0 otherwise. +Note that we now configure the code to use the `MultiWorkerMirroredStrategy`. Also note that training will not begin until both nodes have started. -- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device. -- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device. -- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous -distributed training across the GPUs. +### Run as a TensorFlowOnSpark Application -If you wish to run without `tf.distribute.Strategy`, you can do so by setting -`--distribution_strategy=off`.
+Finally, we can run the converted application as follows: +``` +export TFoS_HOME=/path/to/TensorFlowOnSpark +export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS} +export MASTER=spark://$(hostname):7077 +export SPARK_WORKER_INSTANCES=2 +export CORES_PER_WORKER=1 +export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) + +# start spark cluster +${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} + +# train and evaluate +${SPARK_HOME}/bin/spark-submit \ +--master ${MASTER} \ +--conf spark.cores.max=${TOTAL_CORES} \ +--conf spark.task.cpus=${CORES_PER_WORKER} \ +--py-files ${TFoS_HOME}/examples/resnet/resnet_cifar_dist.py \ +${TFoS_HOME}/examples/resnet/resnet_cifar_spark.py \ +--cluster_size ${SPARK_WORKER_INSTANCES} \ +--epochs 1 \ +--data_dir ${CIFAR_DATA} \ +--num_gpus=0 \ +--ds=multi_worker_mirrored \ +--train_epochs 1 + +# shutdown spark +${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh +``` +Notes: +- Most of the original TensorFlow code from `resnet_cifar_main.py` has been copied into `resnet_cifar_dist.py`, so you can diff the changes. +- The `def main(_)` function was changed to `def main_fun(argv, ctx)`. +- The `absl_app.run(main)` invocation was replaced by the Spark "main" function in `resnet_cifar_spark.py`. This file mostly contains the Spark application boilerplate along with the TensorFlowOnSpark calls to set up the TensorFlow cluster. Note that having the separate Spark and TensorFlow files can help isolate code and avoid Spark serialization issues. +- The Spark "main" function uses `argparse` to parse TensorFlowOnSpark-specific command line arguments, but it passes the remaining arguments (in the `rem` variable) to the TensorFlow `main_fun`, which then parses those arguments via `define_cifar_flags()` and `flags.FLAGS(argv)`.
+- In a truly distributed environment, you would need: + - A distributed file system to store the dataset, so that each executor/node is able to read the data. + - The dependencies from the `tensorflow/models` to be available on the executors, either installed locally or bundled with the Spark application. diff --git a/examples/resnet/README_orig.md b/examples/resnet/README_orig.md new file mode 100644 index 00000000..72260396 --- /dev/null +++ b/examples/resnet/README_orig.md @@ -0,0 +1,100 @@ +This folder contains the Keras implementation of the ResNet models. For more +information about the models, please refer to this [README file](../../README.md). + +Similar to the [estimator implementation](../../r1/resnet), the Keras +implementation has code for both CIFAR-10 data and ImageNet data. The CIFAR-10 +version uses a ResNet56 model implemented in +[`resnet_cifar_model.py`](./resnet_cifar_model.py), and the ImageNet version +uses a ResNet50 model implemented in [`resnet_model.py`](./resnet_model.py). + +To use +either dataset, make sure that you have the latest version of TensorFlow +installed and +[add the models folder to your Python path](/official/#running-the-models), +otherwise you may encounter an error like `ImportError: No module named +official.resnet`. + +## CIFAR-10 + +Download and extract the CIFAR-10 data. You can use the following script: +```bash +python ../../r1/resnet/cifar10_download_and_extract.py +``` + +After you download the data, you can run the program by: + +```bash +python resnet_cifar_main.py +``` + +If you did not use the default directory to download the data, specify the +location with the `--data_dir` flag, like: + +```bash +python resnet_cifar_main.py --data_dir=/path/to/cifar +``` + +## ImageNet + +Download the ImageNet dataset and convert it to TFRecord format. 
+The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py) +and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy) +provide a few options. + +Once your dataset is ready, you can begin training the model as follows: + +```bash +python resnet_imagenet_main.py +``` + +Again, if you did not download the data to the default directory, specify the +location with the `--data_dir` flag: + +```bash +python resnet_imagenet_main.py --data_dir=/path/to/imagenet +``` + +There are more flag options you can specify. Here are some examples: + +- `--use_synthetic_data`: when set to true, synthetic data, rather than real +data, are used; +- `--batch_size`: the batch size used for the model; +- `--model_dir`: the directory to save the model checkpoint; +- `--train_epochs`: number of epoches to run for training the model; +- `--train_steps`: number of steps to run for training the model. We now only +support a number that is smaller than the number of batches in an epoch. +- `--skip_eval`: when set to true, evaluation as well as validation during +training is skipped + +For example, this is a typical command line to run with ImageNet data with +batch size 128 per GPU: + +```bash +python -m resnet_imagenet_main \ + --model_dir=/tmp/model_dir/something \ + --num_gpus=2 \ + --batch_size=128 \ + --train_epochs=90 \ + --train_steps=10 \ + --use_synthetic_data=false +``` + +See [`common.py`](common.py) for full list of options. + +## Using multiple GPUs +You can train these models on multiple GPUs using `tf.distribute.Strategy` API. +You can read more about them in this +[guide](https://www.tensorflow.org/guide/distribute_strategy). + +In this example, we have made it easier to use is with just a command line flag +`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA, +and 0 otherwise. + +- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device. 
+- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device. +- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous +distributed training across the GPUs. + +If you wish to run without `tf.distribute.Strategy`, you can do so by setting +`--distribution_strategy=off`. + diff --git a/examples/resnet/resnet_cifar_dist.py b/examples/resnet/resnet_cifar_dist.py new file mode 100644 index 00000000..55022c48 --- /dev/null +++ b/examples/resnet/resnet_cifar_dist.py @@ -0,0 +1,240 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Runs a ResNet model on the Cifar-10 dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# from absl import app as absl_app +from absl import flags +import tensorflow as tf + +from official.utils.flags import core as flags_core +from official.utils.logs import logger +from official.utils.misc import distribution_utils +from official.utils.misc import keras_utils +from official.vision.image_classification import cifar_preprocessing +from official.vision.image_classification import common +from official.vision.image_classification import resnet_cifar_model + + +LR_SCHEDULE = [ # (multiplier, epoch to start) tuples + (0.1, 91), (0.01, 136), (0.001, 182) +] + + +def learning_rate_schedule(current_epoch, + current_batch, + batches_per_epoch, + batch_size): + """Handles linear scaling rule and LR decay. + + Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the + provided scaling factor. + + Args: + current_epoch: integer, current epoch indexed from 0. + current_batch: integer, current batch in the current epoch, indexed from 0. + batches_per_epoch: integer, number of steps in an epoch. + batch_size: integer, total batch sized. + + Returns: + Adjusted learning rate. + """ + del current_batch, batches_per_epoch # not used + initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128 + learning_rate = initial_learning_rate + for mult, start_epoch in LR_SCHEDULE: + if current_epoch >= start_epoch: + learning_rate = initial_learning_rate * mult + else: + break + return learning_rate + + +def run(flags_obj): + """Run ResNet Cifar-10 training and eval loop using native Keras APIs. + + Args: + flags_obj: An object containing parsed flag values. + + Raises: + ValueError: If fp16 is passed as it is not currently supported. + + Returns: + Dictionary of training and eval stats. 
+ """ + keras_utils.set_session_config( + enable_eager=flags_obj.enable_eager, + enable_xla=flags_obj.enable_xla) + + # Execute flag override logic for better model performance + if flags_obj.tf_gpu_thread_mode: + common.set_gpu_thread_mode_and_count(flags_obj) + common.set_cudnn_batchnorm_mode() + + dtype = flags_core.get_tf_dtype(flags_obj) + if dtype == 'fp16': + raise ValueError('dtype fp16 is not supported in Keras. Use the default ' + 'value(fp32).') + + data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + strategy = distribution_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + num_workers=distribution_utils.configure_cluster(), + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs) + + if strategy: + # flags_obj.enable_get_next_as_optional controls whether enabling + # get_next_as_optional behavior in DistributedIterator. If true, last + # partial batch can be supported. 
+ strategy.extended.experimental_enable_get_next_as_optional = ( + flags_obj.enable_get_next_as_optional + ) + + strategy_scope = distribution_utils.get_strategy_scope(strategy) + + if flags_obj.use_synthetic_data: + distribution_utils.set_up_synthetic_data() + input_fn = common.get_synth_input_fn( + height=cifar_preprocessing.HEIGHT, + width=cifar_preprocessing.WIDTH, + num_channels=cifar_preprocessing.NUM_CHANNELS, + num_classes=cifar_preprocessing.NUM_CLASSES, + dtype=flags_core.get_tf_dtype(flags_obj), + drop_remainder=True) + else: + distribution_utils.undo_set_up_synthetic_data() + input_fn = cifar_preprocessing.input_fn + + train_input_dataset = input_fn( + is_training=True, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + num_epochs=flags_obj.train_epochs, + parse_record_fn=cifar_preprocessing.parse_record, + datasets_num_private_threads=flags_obj.datasets_num_private_threads, + dtype=dtype, + # Setting drop_remainder to avoid the partial batch logic in normalization + # layer, which triggers tf.where and leads to extra memory copy of input + # sizes between host and GPU. + drop_remainder=(not flags_obj.enable_get_next_as_optional)) + + eval_input_dataset = None + if not flags_obj.skip_eval: + eval_input_dataset = input_fn( + is_training=False, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + num_epochs=flags_obj.train_epochs, + parse_record_fn=cifar_preprocessing.parse_record) + + with strategy_scope: + optimizer = common.get_optimizer() + model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES) + + # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer + # a valid arg for this model. Also remove as a valid flag. 
+ if flags_obj.force_v2_in_keras_compile is not None: + model.compile( + loss='categorical_crossentropy', + optimizer=optimizer, + metrics=(['categorical_accuracy'] + if flags_obj.report_accuracy_metrics else None), + run_eagerly=flags_obj.run_eagerly, + experimental_run_tf_function=flags_obj.force_v2_in_keras_compile) + else: + model.compile( + loss='categorical_crossentropy', + optimizer=optimizer, + metrics=(['categorical_accuracy'] + if flags_obj.report_accuracy_metrics else None), + run_eagerly=flags_obj.run_eagerly) + + callbacks = common.get_callbacks( + learning_rate_schedule, cifar_preprocessing.NUM_IMAGES['train']) + + train_steps = cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size + train_epochs = flags_obj.train_epochs + + if flags_obj.train_steps: + train_steps = min(flags_obj.train_steps, train_steps) + train_epochs = 1 + + num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // + flags_obj.batch_size) + + validation_data = eval_input_dataset + if flags_obj.skip_eval: + if flags_obj.set_learning_phase_to_train: + # TODO(haoyuzhang): Understand slowdown of setting learning phase when + # not using distribution strategy. + tf.keras.backend.set_learning_phase(1) + num_eval_steps = None + validation_data = None + + if not strategy and flags_obj.explicit_gpu_placement: + # TODO(b/135607227): Add device scope automatically in Keras training loop + # when not using distribition strategy. 
+ no_dist_strat_device = tf.device('/device:GPU:0') + no_dist_strat_device.__enter__() + + history = model.fit(train_input_dataset, + epochs=train_epochs, + steps_per_epoch=train_steps, + callbacks=callbacks, + validation_steps=num_eval_steps, + validation_data=validation_data, + validation_freq=flags_obj.epochs_between_evals, + verbose=2) + eval_output = None + if not flags_obj.skip_eval: + eval_output = model.evaluate(eval_input_dataset, + steps=num_eval_steps, + verbose=2) + + if not strategy and flags_obj.explicit_gpu_placement: + no_dist_strat_device.__exit__() + + stats = common.build_stats(history, eval_output, callbacks) + return stats + + +def define_cifar_flags(): + common.define_keras_flags(dynamic_loss_scale=False) + + flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin', + model_dir='/tmp/cifar10_model', + train_epochs=182, + epochs_between_evals=10, + batch_size=128) + + +def main_fun(argv, ctx): + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + define_cifar_flags() + flags.FLAGS(argv) + + print("====== FLAGS: {}".format(flags.FLAGS)) + with logger.benchmark_context(flags.FLAGS): + return run(flags.FLAGS) diff --git a/examples/resnet/resnet_cifar_spark.py b/examples/resnet/resnet_cifar_spark.py new file mode 100644 index 00000000..ac5a15ee --- /dev/null +++ b/examples/resnet/resnet_cifar_spark.py @@ -0,0 +1,22 @@ +import resnet_cifar_dist + +if __name__ == '__main__': + # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + # absl_app.run(main) + from pyspark.context import SparkContext + from pyspark.conf import SparkConf + from tensorflowonspark import TFCluster + import argparse + + sc = SparkContext(conf=SparkConf().setAppName("resnet_cifar")) + executors = sc._conf.get("spark.executor.instances") + num_executors = int(executors) if executors is not None else 1 + + parser = argparse.ArgumentParser() + parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", 
type=int, default=num_executors) + parser.add_argument("--num_ps", help="number of parameter servers", type=int, default=1) + parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") + args, rem = parser.parse_known_args() + + cluster = TFCluster.run(sc, resnet_cifar_dist.main_fun, rem, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW) + cluster.shutdown() From dcb30c02c93e44aa77919d1376746879fcfcff2f Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 30 Aug 2019 10:05:22 -0700 Subject: [PATCH 23/37] remove more TF1.x examples --- examples/criteo/README.md | 117 --------- examples/criteo/spark/__init__.py | 0 examples/criteo/spark/criteo_dist.py | 294 ----------------------- examples/criteo/spark/criteo_spark.py | 66 ----- examples/criteo/spark/requirements.txt | 5 - examples/wide_deep/README.md | 76 ------ examples/wide_deep/census_dataset.py | 204 ---------------- examples/wide_deep/census_main.py | 159 ------------ examples/wide_deep/wide_deep_run_loop.py | 131 ---------- 9 files changed, 1052 deletions(-) delete mode 100644 examples/criteo/README.md delete mode 100644 examples/criteo/spark/__init__.py delete mode 100644 examples/criteo/spark/criteo_dist.py delete mode 100644 examples/criteo/spark/criteo_spark.py delete mode 100644 examples/criteo/spark/requirements.txt delete mode 100644 examples/wide_deep/README.md delete mode 100644 examples/wide_deep/census_dataset.py delete mode 100644 examples/wide_deep/census_main.py delete mode 100644 examples/wide_deep/wide_deep_run_loop.py diff --git a/examples/criteo/README.md b/examples/criteo/README.md deleted file mode 100644 index 39f3c177..00000000 --- a/examples/criteo/README.md +++ /dev/null @@ -1,117 +0,0 @@ -# Learning Click-Through Rate at Scale with Tensorflow on Spark - -## Introduction -This project consists of learning a click-throughrate model at scale using TensorflowOnSpark technology. 
-Criteo released a 1TB dataset: http://labs.criteo.com/2013/12/download-terabyte-click-logs/ -In order to promote Google cloud technology, Google published a solution to train a model at scale using there -proprietary platform : https://cloud.google.com/blog/big-data/2017/02/using-google-cloud-machine-learning-to-predict-clicks-at-scale - -Instead, we propose a solution based on open source technology that can be leveraged on any cloud, -or private cluster relying on spark. - -We demonstrate how Tensorflow on Spark (https://github.com/yahoo/TensorFlowOnSpark) can be used to reach the state of the art when it comes to predicting the proba of click at scale. -Notice that the goal here is not to produce the best pCTR predictor, but rather establish a open method that still reaches the best performance published so far on this dataset. -Hence, our solutions remains very simple, and rely solely on basic feature extraction, cross-features and hashing, the all trained on logistic regression. - -## Install and test TF on spark -Before making use of this code, please make sure you can install TF on spark on your cluster and -run the mnist example as illustrated here: -https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARN -By so doing, you should make sure that did set up the following variables correctly: - -``` -export JAVA_HOME= -export HADOOP_HOME= -export SPARK_HOME= -export HADOOP_HDFS_HOME= -export SPARK_HOME= -export PYTHON_ROOT=./Python -export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin:${HADOOP_HDFS_HOME}/bin:${SPARK_HOME}/bin:${PYTHON_ROOT}/bin -export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python -export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=/usr/bin/python" -export QUEUE=default -export LIB_HDFS= -export LIB_JVM= -``` - -## Data set - -The raw data can be accessed here: http://labs.criteo.com/2013/12/download-terabyte-click-logs/ - -### Download the data set -``` -for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23; do - curl -O 
http://azuremlsampleexperiments.blob.core.windows.net/criteo/day_${i}.gz - aws s3 mv day_${i}.gz s3://criteo-display-ctr-dataset/released/ -done -``` - -### Upload training data on your AWS s3 using Pig - -``` -%declare awskey yourkey -%declare awssecretkey yoursecretkey -SET mapred.output.compress 'true'; -SET mapred.output.compression.codec 'org.apache.hadoop.io.compress.BZip2Codec'; -train_data = load 's3n://${awskey}:${awssecretkey}@criteo-display-ctr-dataset/released/day_{[0-9],1[0-9],2[0-2]}.gz '; -train_data = FOREACH (GROUP train_data BY ROUND(10000* RANDOM()) PARALLEL 10000) GENERATE FLATTEN(train_data); -store train_data into 's3n://${awskey}:${awssecretkey}@criteo-display-ctr-dataset/data/training/' using PigStorage(); -``` -We here divide the training data in 10000 chunks, which will allow TFonSpark to reduce its memory usage. - -### Upload validation data on your AWS s3 using Pig -``` -%declare awskey yourkey -%declare awssecretkey yoursecretkey -SET mapred.output.compress 'true'; -SET mapred.output.compression.codec 'org.apache.hadoop.io.compress.BZip2Codec'; -train_data = load 's3n://${awskey}:${awssecretkey}@criteo-display-ctr-dataset/released/day_23.gz'; -train_data = FOREACH (GROUP train_data BY ROUND(100* RANDOM()) PARALLEL 100) GENERATE FLATTEN(train_data); -store train_data into 's3n://${awskey}:${awssecretkey}@criteo-display-ctr-dataset/data/validation' using PigStorage(); -``` - - - - - - -## Running the example - -Set up task variables -``` -export TRAINING_DATA=hdfs_path_to_training_data_directory -export VALIDATION_DATA=hdfs_path_to_validation_data_directory -export MODEL_OUTPUT=hdfs://default/tmp/criteo_ctr_prediction -``` -Run command: - -``` -${SPARK_HOME}/bin/spark-submit \ ---master yarn \ ---deploy-mode cluster \ ---queue ${QUEUE} \ ---num-executors 12 \ ---executor-memory 27G \ ---py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/criteo/spark/criteo_dist.py \ ---conf spark.dynamicAllocation.enabled=false \ ---conf 
spark.yarn.maxAppAttempts=1 \ ---archives hdfs:///user/${USER}/Python.zip#Python \ ---conf spark.executorEnv.LD_LIBRARY_PATH="$LIB_HDFS:$LIB_JVM" \ ---conf spark.executorEnv.HADOOP_HDFS_HOME="$HADOOP_HDFS_HOME" \ ---conf spark.executorEnv.CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath --glob):${CLASSPATH}" \ -TensorFlowOnSpark/examples/criteo/spark/criteo_spark.py \ ---mode train \ ---data ${TRAINING_DATA} \ ---validation ${VALIDATION_DATA} \ ---steps 1000000 \ ---model ${MODEL_OUTPUT} --tensorboard \ ---tensorboardlogdir ${MODEL_OUTPUT} -``` -## Tensorboard tracking: - -By connecting to the Web UI tracker of your application, -you be able to retrieve the tensorboard URL in the stdout of the driver, e.g.: -``` - TensorBoard running at: http://10.4.112.234:36911 -``` - diff --git a/examples/criteo/spark/__init__.py b/examples/criteo/spark/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/criteo/spark/criteo_dist.py b/examples/criteo/spark/criteo_dist.py deleted file mode 100644 index f482d706..00000000 --- a/examples/criteo/spark/criteo_dist.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright 2018 Criteo -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
-# Distributed Criteo Display CTR prediction on grid based on TensorFlow on Spark -# https://github.com/yahoo/TensorFlowOnSpark - -from __future__ import absolute_import -from __future__ import division -from __future__ import nested_scopes -from __future__ import print_function - -validation_file = None - - -def print_log(worker_num, arg): - print("{0}: {1}".format(worker_num, arg)) - - -def map_fun(args, ctx): - from datetime import datetime - import math - import tensorflow as tf - import numpy as np - import time - from sklearn.metrics import roc_auc_score - import mmh3 - - class CircularFile(object): - def __init__(self, filename): - self.filename = filename - self.file = None - - def readline(self): - if (self.file is None): - self.file = tf.gfile.GFile(self.filename, "r") - - p_line = self.file.readline() - - if p_line == "": - self.file.close() - self.file = tf.gfile.GFile(self.filename, "r") - p_line = self.file.readline() - return p_line - - def close(self): - self.file.close() - self.file = None - - - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - - # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) - if job_name == "ps": - time.sleep((worker_num + 1) * 5) - - vocabulary_size = 39 - # Feature indexes as defined in input file - INDEX_CAT_FEATURES = 13 - - # These parameters values have been selected for illustration purpose and have not been tuned. 
- learning_rate = 0.0005 - droupout_rate = 0.4 - NB_OF_HASHES_CAT = 2 ** 15 - NB_OF_HASHES_CROSS = 2 ** 15 - NB_BUCKETS = 40 - - boundaries_bucket = [1.5 ** j - 0.51 for j in range(NB_BUCKETS)] - # Same as in: - # [https://github.com/GoogleCloudPlatform/cloudml-samples/blob/c272e9f3bf670404fb1570698d8808ab62f0fc9a/criteo_tft/trainer/task.py#L163] - - nb_input_features = ((INDEX_CAT_FEATURES) * NB_BUCKETS) + ( - (vocabulary_size - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT) + NB_OF_HASHES_CROSS - - - batch_size = args.batch_size - - # Get TF cluster and server instances - cluster, server = ctx.start_cluster_server(1, args.rdma) - - - def get_index_bucket(feature_value): - """ - maps the input feature to a one hot encoding index - :param feature_value: the value of the feature - :return: the index of the one hot encoding that activates for the input value - """ - for index, boundary_value in enumerate(boundaries_bucket): - if feature_value < boundary_value: - return index - return index - - - def get_batch_validation(batch_size): - """ - :param batch_size: - :return: a list of read lines, each lines being a list of the features as read from the input file - """ - global validation_file - if validation_file is None: - validation_file = CircularFile(args.validation) - return [validation_file.readline().split('\t') for _ in range(batch_size)] - - def get_cross_feature_name(index, features): - if index < INDEX_CAT_FEATURES: - index_str = str(index) + "_" + str(get_index_bucket(int(features[index]))) - else: - index_str = str(index) + "_" + features[index] - - return index_str - - def get_next_batch(batch): - """ - maps the batch read from the input file to a data array, and a label array that are fed to - the tf placeholders - :param batch: - :return: - """ - data = np.zeros((batch_size, nb_input_features)) - labels = np.zeros(batch_size) - - index = 0 - while True: - - features = batch[index][1:] - - if len(features) != vocabulary_size: - continue - - # BUCKETIZE CONTINIOUS 
FEATURES - for f_index in range(0, INDEX_CAT_FEATURES ): - if features[f_index]: - bucket_index = get_index_bucket(int(features[f_index])) - bucket_number_index = f_index * NB_BUCKETS - bucket_index_offset = bucket_index + bucket_number_index - data[index, bucket_index_offset] = 1 - - # BUCKETIZE CATEGORY FEATURES - offset = INDEX_CAT_FEATURES * NB_BUCKETS - for f_index in range(INDEX_CAT_FEATURES, vocabulary_size): - if features[f_index]: - hash_index = mmh3.hash(features[f_index]) % NB_OF_HASHES_CAT - hash_number_index = (f_index - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT + offset - hash_index_offset = hash_index + hash_number_index - data[index, hash_index_offset] = 1 - - # BUCKETIZE CROSS CATEGORY AND CONTINIOUS - offset = INDEX_CAT_FEATURES * NB_BUCKETS + (vocabulary_size - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT - - for index_i in range(0, vocabulary_size-1): - for index_j in range(index_i + 1, vocabulary_size): - if features[index_i].rstrip() == '' or features[index_j].rstrip() == '': - continue - - index_str_i = get_cross_feature_name(index_i,features) - index_str_j = get_cross_feature_name(index_j,features) - - hash_index = mmh3.hash(index_str_i + "_" + index_str_j) % NB_OF_HASHES_CROSS + offset - data[index, hash_index] = 1 - - labels[index] = batch[index][0] - index += 1 - if index == batch_size: - break - - return data.astype(int), labels.astype(int) - - - - if job_name == "ps": - server.join() - elif job_name == "worker": - is_chiefing = (task_index == 0) - with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - def lineartf(x, droupout_rate, is_training, name=None, reuse=None, dropout=None): - """ - Apply a simple lineartf transformation A*x+b to the input - """ - n_output = 1 - if len(x.get_shape()) != 2: - x = tf.contrib.layers.flatten(x) - - n_input = x.get_shape().as_list()[1] - - with tf.variable_scope(name, reuse=reuse): - W = tf.get_variable( - name='W', - shape=[n_input, 
n_output], - dtype=tf.float32, - initializer=tf.contrib.layers.xavier_initializer()) - - b = tf.get_variable( - name='b', - shape=[n_output], - dtype=tf.float32, - initializer=tf.constant_initializer(0.0)) - - h = tf.nn.bias_add( - name='h', - value=tf.matmul(x, W), - bias=b) - - if dropout: - h = tf.cond(is_training, lambda: tf.layers.dropout(h, rate=droupout_rate, training=True), - lambda: tf.layers.dropout(h, rate=0.0, training=True)) - - return h, W - - is_training = tf.placeholder(tf.bool, shape=()) - input_features = tf.placeholder(tf.float32, [None, nb_input_features], name="input_features") - input_features_lineartf, _ = lineartf(input_features, droupout_rate=droupout_rate, - name='linear_layer', - is_training=is_training, - dropout=None) - - y_true = tf.placeholder(tf.float32, shape=None) - y_prediction = input_features_lineartf - pCTR = tf.nn.sigmoid(y_prediction, name="pCTR") - global_step = tf.Variable(0) - cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_prediction)) - tf.summary.scalar('cross_entropy', cross_entropy) - adam_train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy, - global_step=global_step) - - saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - logdir = ctx.absolute_path(args.model) - print("Tensorflow model path: {0}".format(logdir)) - - if job_name == "worker" and is_chiefing: - summary_writer = tf.summary.FileWriter(logdir + "/train", graph=tf.get_default_graph()) - summary_val_writer = tf.summary.FileWriter(logdir + "/validation", graph=tf.get_default_graph()) - - options = dict(is_chief=is_chiefing, - logdir=logdir, - summary_op=None, - saver=saver, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=0) - - if args.mode == "train": - options['save_model_secs'] = 120 - options['init_op'] = init_op - options['summary_writer'] = None - - sv = tf.train.Supervisor(**options) - - 
with sv.managed_session(server.target) as sess: - - print("{0} session ready".format(datetime.now().isoformat())) - - tf_feed = ctx.get_data_feed(args.mode == "train") - step = 0 - while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: - batch_data, batch_labels = get_next_batch(tf_feed.next_batch(batch_size)) - - if len(batch_data) > 0: - - if args.mode == "train": - - if sv.is_chief: - # Evaluate current state of the model on next batch of validation - batch_val = get_batch_validation(batch_size) - batch_data, batch_labels = get_next_batch(batch_val) - feed = {input_features: batch_data, y_true: batch_labels, is_training: False} - logloss, summary, step = sess.run([cross_entropy, summary_op, global_step], feed_dict=feed) - summary_val_writer.add_summary(summary, step) - print("validation loss: {0}".format(logloss)) - - feed = {input_features: batch_data, y_true: batch_labels, is_training: True} - _, logloss, summary, step = sess.run([adam_train_step, cross_entropy, summary_op, global_step], - feed_dict=feed) - - else: - feed = {input_features: batch_data, y_true: batch_labels, is_training: False} - yscore = sess.run(pCTR, feed_dict=feed) - tf_feed.batch_results(yscore) - - if sv.should_stop() or step >= args.steps: - tf_feed.terminate() - if is_chiefing: - summary_writer.close() - summary_val_writer.close() - - print("{0} stopping supervisor".format(datetime.now().isoformat())) - sv.stop() diff --git a/examples/criteo/spark/criteo_spark.py b/examples/criteo/spark/criteo_spark.py deleted file mode 100644 index aad2bd66..00000000 --- a/examples/criteo/spark/criteo_spark.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2018 Criteo -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -# Distributed Criteo Display CTR prediction on grid based on TensorFlow on Spark -# https://github.com/yahoo/TensorFlowOnSpark - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf - -import argparse -from datetime import datetime - - - -from tensorflowonspark import TFCluster - - -import criteo_dist - - -if __name__ == "__main__": - sc = SparkContext(conf=SparkConf().setAppName("criteo_spark")) - executors = sc._conf.get("spark.executor.instances") - if executors is None: - raise Exception("Could not retrieve the number of executors from the SparkContext") - num_executors = int(executors) - num_ps = 1 - - parser = argparse.ArgumentParser() - parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100) - parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1) - parser.add_argument("-i", "--data", help="HDFS path to data in parallelized format") - parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="criteo_model") - parser.add_argument("-v", "--validation", help="HDFS path to validation data") - - parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) - parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") - parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) - parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) - parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") - parser.add_argument("-X", "--mode", help="train|inference", default="train") - parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) - parser.add_argument("-tbld", 
"--tensorboardlogdir", - help="Tensorboard log directory. It should on hdfs. Thus, it must be prefixed with hdfs://default") - - args = parser.parse_args() - print("args:", args) - - print("{0} ===== Start".format(datetime.now().isoformat())) - - dataRDD = sc.textFile(args.data).map(lambda ln: [x for x in ln.split('\t')]) - - cluster = TFCluster.run(sc, criteo_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, - TFCluster.InputMode.SPARK, log_dir=args.model) - if args.mode == "train": - cluster.train(dataRDD, args.epochs) - else: - labelRDD = cluster.inference(dataRDD) - labelRDD.saveAsTextFile(args.output) - cluster.shutdown() - print("{0} ===== Stop".format(datetime.now().isoformat())) \ No newline at end of file diff --git a/examples/criteo/spark/requirements.txt b/examples/criteo/spark/requirements.txt deleted file mode 100644 index 7ae0e482..00000000 --- a/examples/criteo/spark/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -mmh3 -tensorflow -numpy -scipy -scikit-learn diff --git a/examples/wide_deep/README.md b/examples/wide_deep/README.md deleted file mode 100644 index 6b756635..00000000 --- a/examples/wide_deep/README.md +++ /dev/null @@ -1,76 +0,0 @@ -# Wide & Deep Model - -Original Source: https://github.com/tensorflow/models/tree/master/official/wide_deep - -In this example, we use TensorFlowOnSpark, along with the [tf.estimator.train_and_evaluate](https://www.tensorflow.org/api_docs/python/tf/estimator/train_and_evaluate) API, to convert a single-node TensorFlow application into a distributed one. - - -## How to run - -For simplicity, we'll use Spark Standalone on a single node. If you haven't already done so, you should try the [Getting Started on Spark Standalone](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_Standalone) instructions. 
- -#### Clone this repository (if not already done) - -```bash -git clone https://github.com/yahoo/TensorFlowOnSpark.git -cd TensorFlowOnSpark -export TFoS_HOME=$(pwd) -``` - -#### Clone the TensorFlow Models repository - -This example depends on code in the [TensorFlow Models](https://github.com/tensorflow/models) repository, so you will have to clone the repo: -```bash -git clone https://github.com/tensorflow/models.git -cd models -pip install -r official/requirements.txt -export TF_MODELS=$(pwd) -export PYTHONPATH=$PYTHONPATH:$(pwd) -``` - -#### Start Spark Standalone Cluster - -```bash -export MASTER=spark://$(hostname):7077 -export SPARK_WORKER_INSTANCES=3 -export CORES_PER_WORKER=1 -export TOTAL_CORES=$((${CORES_PER_WORKER}*${SPARK_WORKER_INSTANCES})) - -${SPARK_HOME}/sbin/start-master.sh; ${SPARK_HOME}/sbin/start-slave.sh -c $CORES_PER_WORKER -m 3G ${MASTER} -``` - -### Download the UCI Census Income Dataset - -```bash -cd ${TFoS_HOME}/examples/wide_deep - -python census_dataset.py -``` - -### Run Distributed Wide & Deep - -```bash -# rm -Rf /tmp/census_model; \ -${SPARK_HOME}/bin/spark-submit \ ---master ${MASTER} \ ---py-files census_dataset.py,wide_deep_run_loop.py \ ---conf spark.cores.max=${TOTAL_CORES} \ ---conf spark.task.cpus=${CORES_PER_WORKER} \ ---conf spark.executorEnv.JAVA_HOME="$JAVA_HOME" \ ---conf spark.task.maxFailures=1 \ ---conf spark.stage.maxConsecutiveAttempts=1 \ -census_main.py \ ---cluster_size 3 -``` - -The TensorFlow logs for each node will be available in `stderr` link of each executor in the Spark UI. 
For example, in the log of the `master` node, you should see something like the following: -``` -I0124 09:33:27.728477 4486518208 tf_logging.py:115] Finished evaluation at 2019-01-24-17:33:27 -I0124 09:33:27.729230 4486518208 tf_logging.py:115] Saving dict for global step 1729: accuracy = 0.82875, accuracy_baseline = 0.76325, auc = 0.8827834, auc_precision_recall = 0.7127151, average_loss = 0.3687935, global_step = 1729, label/mean = 0.23675, loss = 14.7517395, precision = 0.7119741, prediction/mean = 0.261756, recall = 0.46462512 -``` - -#### Shutdown Standalone Cluster - -```bash -${SPARK_HOME}/sbin/stop-slave.sh; ${SPARK_HOME}/sbin/stop-master.sh -``` diff --git a/examples/wide_deep/census_dataset.py b/examples/wide_deep/census_dataset.py deleted file mode 100644 index 4cf66f82..00000000 --- a/examples/wide_deep/census_dataset.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Download and clean the Census Income Dataset.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys - -# pylint: disable=wrong-import-order -from absl import app as absl_app -from absl import flags -from six.moves import urllib -import tensorflow as tf -# pylint: enable=wrong-import-order - -from official.utils.flags import core as flags_core - - -DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult' -TRAINING_FILE = 'adult.data' -TRAINING_URL = '%s/%s' % (DATA_URL, TRAINING_FILE) -EVAL_FILE = 'adult.test' -EVAL_URL = '%s/%s' % (DATA_URL, EVAL_FILE) - - -_CSV_COLUMNS = [ - 'age', 'workclass', 'fnlwgt', 'education', 'education_num', - 'marital_status', 'occupation', 'relationship', 'race', 'gender', - 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', - 'income_bracket' -] - -_CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''], - [0], [0], [0], [''], ['']] - -_HASH_BUCKET_SIZE = 1000 - -_NUM_EXAMPLES = { - 'train': 32561, - 'validation': 16281, -} - - -def _download_and_clean_file(filename, url): - """Downloads data from url, and makes changes to match the CSV format.""" - temp_file, _ = urllib.request.urlretrieve(url) - with tf.gfile.Open(temp_file, 'r') as temp_eval_file: - with tf.gfile.Open(filename, 'w') as eval_file: - for line in temp_eval_file: - line = line.strip() - line = line.replace(', ', ',') - if not line or ',' not in line: - continue - if line[-1] == '.': - line = line[:-1] - line += '\n' - eval_file.write(line) - tf.gfile.Remove(temp_file) - - -def download(data_dir): - """Download census data if it is not already present.""" - tf.gfile.MakeDirs(data_dir) - - training_file_path = os.path.join(data_dir, TRAINING_FILE) - if not tf.gfile.Exists(training_file_path): - _download_and_clean_file(training_file_path, 
TRAINING_URL) - - eval_file_path = os.path.join(data_dir, EVAL_FILE) - if not tf.gfile.Exists(eval_file_path): - _download_and_clean_file(eval_file_path, EVAL_URL) - - -def build_model_columns(): - """Builds a set of wide and deep feature columns.""" - # Continuous variable columns - age = tf.feature_column.numeric_column('age') - education_num = tf.feature_column.numeric_column('education_num') - capital_gain = tf.feature_column.numeric_column('capital_gain') - capital_loss = tf.feature_column.numeric_column('capital_loss') - hours_per_week = tf.feature_column.numeric_column('hours_per_week') - - education = tf.feature_column.categorical_column_with_vocabulary_list( - 'education', [ - 'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', - 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', - '5th-6th', '10th', '1st-4th', 'Preschool', '12th']) - - marital_status = tf.feature_column.categorical_column_with_vocabulary_list( - 'marital_status', [ - 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', - 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed']) - - relationship = tf.feature_column.categorical_column_with_vocabulary_list( - 'relationship', [ - 'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', - 'Other-relative']) - - workclass = tf.feature_column.categorical_column_with_vocabulary_list( - 'workclass', [ - 'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', - 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked']) - - # To show an example of hashing: - occupation = tf.feature_column.categorical_column_with_hash_bucket( - 'occupation', hash_bucket_size=_HASH_BUCKET_SIZE) - - # Transformations. - age_buckets = tf.feature_column.bucketized_column( - age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) - - # Wide columns and deep columns. 
- base_columns = [ - education, marital_status, relationship, workclass, occupation, - age_buckets, - ] - - crossed_columns = [ - tf.feature_column.crossed_column( - ['education', 'occupation'], hash_bucket_size=_HASH_BUCKET_SIZE), - tf.feature_column.crossed_column( - [age_buckets, 'education', 'occupation'], - hash_bucket_size=_HASH_BUCKET_SIZE), - ] - - wide_columns = base_columns + crossed_columns - - deep_columns = [ - age, - education_num, - capital_gain, - capital_loss, - hours_per_week, - tf.feature_column.indicator_column(workclass), - tf.feature_column.indicator_column(education), - tf.feature_column.indicator_column(marital_status), - tf.feature_column.indicator_column(relationship), - # To show an example of embedding - tf.feature_column.embedding_column(occupation, dimension=8), - ] - - return wide_columns, deep_columns - - -def input_fn(data_file, num_epochs, shuffle, batch_size): - """Generate an input function for the Estimator.""" - assert tf.gfile.Exists(data_file), ( - '%s not found. Please make sure you have run census_dataset.py and ' - 'set the --data_dir argument to the correct path.' % data_file) - - def parse_csv(value): - tf.logging.info('Parsing {}'.format(data_file)) - columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS) - features = dict(zip(_CSV_COLUMNS, columns)) - labels = features.pop('income_bracket') - classes = tf.equal(labels, '>50K') # binary classification - return features, classes - - # Extract lines from input files using the Dataset API. - dataset = tf.data.TextLineDataset(data_file) - - if shuffle: - dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train']) - - dataset = dataset.map(parse_csv, num_parallel_calls=5) - - # We call repeat after shuffling, rather than before, to prevent separate - # epochs from blending together. 
- dataset = dataset.repeat(num_epochs) - dataset = dataset.batch(batch_size) - return dataset - - -def define_data_download_flags(): - """Add flags specifying data download arguments.""" - flags.DEFINE_string( - name="data_dir", default="/tmp/census_data/", - help=flags_core.help_wrap( - "Directory to download and extract data.")) - - -def main(_): - download(flags.FLAGS.data_dir) - - -if __name__ == '__main__': - tf.logging.set_verbosity(tf.logging.INFO) - define_data_download_flags() - absl_app.run(main) diff --git a/examples/wide_deep/census_main.py b/examples/wide_deep/census_main.py deleted file mode 100644 index dfbde486..00000000 --- a/examples/wide_deep/census_main.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Train DNN on census income dataset.""" - -import os - -# from absl import app as absl_app -from absl import flags -import tensorflow as tf - -from official.utils.flags import core as flags_core -from official.utils.logs import logger -from official.utils.logs import hooks_helper -from official.utils.misc import model_helpers -import census_dataset -import wide_deep_run_loop - - -LOSS_PREFIX = {'wide': 'linear/', 'deep': 'dnn/'} - - -def define_census_flags(): - wide_deep_run_loop.define_wide_deep_flags() - flags.adopt_module_key_flags(wide_deep_run_loop) - flags_core.set_defaults(data_dir='/tmp/census_data', - model_dir='/tmp/census_model', - train_epochs=40, - epochs_between_evals=2, - inter_op_parallelism_threads=0, - intra_op_parallelism_threads=0, - batch_size=40) - - -def build_estimator(model_dir, model_type, model_column_fn, inter_op, intra_op, ctx): - """Build an estimator appropriate for the given model type.""" - wide_columns, deep_columns = model_column_fn() - hidden_units = [100, 75, 50, 25] - - # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which - # trains faster than GPU for this model. 
- # Note: adding device_filter to fix: https://github.com/tensorflow/tensorflow/issues/21745 - run_config = tf.estimator.RunConfig().replace( - session_config=tf.ConfigProto(device_count={'GPU': 0}, - device_filters=['/job:ps', '/job:%s/task:%d' % (ctx.job_name, ctx.task_index)], - inter_op_parallelism_threads=inter_op, - intra_op_parallelism_threads=intra_op)) - - if model_type == 'wide': - return tf.estimator.LinearClassifier( - model_dir=model_dir, - feature_columns=wide_columns, - config=run_config) - elif model_type == 'deep': - return tf.estimator.DNNClassifier( - model_dir=model_dir, - feature_columns=deep_columns, - hidden_units=hidden_units, - config=run_config) - else: - return tf.estimator.DNNLinearCombinedClassifier( - model_dir=model_dir, - linear_feature_columns=wide_columns, - dnn_feature_columns=deep_columns, - dnn_hidden_units=hidden_units, - config=run_config) - - -def run_census(flags_obj, ctx): - """Construct all necessary functions and call run_loop. - - Args: - flags_obj: Object containing user specified flags. - """ - train_file = os.path.join(flags_obj.data_dir, census_dataset.TRAINING_FILE) - test_file = os.path.join(flags_obj.data_dir, census_dataset.EVAL_FILE) - - # Train and evaluate the model every `flags.epochs_between_evals` epochs. 
- def train_input_fn(): - return census_dataset.input_fn( - train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size) - - def eval_input_fn(): - return census_dataset.input_fn(test_file, 1, False, flags_obj.batch_size) - - tensors_to_log = { - 'average_loss': '{loss_prefix}head/truediv', - 'loss': '{loss_prefix}head/weighted_loss/Sum' - } - - # Removing run_loop, since we can only invoke train_and_evaluate once - model_helpers.apply_clean(flags.FLAGS) - model = build_estimator( - model_dir=flags_obj.model_dir, model_type=flags_obj.model_type, - model_column_fn=census_dataset.build_model_columns, - inter_op=flags_obj.inter_op_parallelism_threads, - intra_op=flags_obj.intra_op_parallelism_threads, - ctx=ctx) - - loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') - tensors_to_log = {k: v.format(loss_prefix=loss_prefix) - for k, v in tensors_to_log.items()} - train_hooks = hooks_helper.get_train_hooks( - flags_obj.hooks, model_dir=flags_obj.model_dir, - batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log) - - # Note: this will only be invoked once, so `--epochs_between_evals` is now effectively `--train_epochs` - # and evaluation will only execute once. 
- train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=train_hooks) - eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) - tf.estimator.train_and_evaluate(model, train_spec, eval_spec) - - -def main_fun(argv, ctx): - sys.argv = argv - define_census_flags() - flags.FLAGS(sys.argv) - tf.logging.set_verbosity(tf.logging.INFO) - - with logger.benchmark_context(flags.FLAGS): - run_census(flags.FLAGS, ctx) - - -if __name__ == '__main__': - import argparse - import sys - from pyspark import SparkConf, SparkContext - from tensorflowonspark import TFCluster - - sc = SparkContext(conf=SparkConf().setAppName('wide_deep')) - executors = int(sc._conf.get("spark.executor.instances", "1")) - - # arguments for Spark and TFoS - parser = argparse.ArgumentParser() - parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=executors) - parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1) - (args, remainder) = parser.parse_known_args() - - # construct an ARGV (with script name as first element) from remaining args and pass it to the TF processes on executors - remainder.insert(0, __file__) - print("spark args:", args) - print("tf args:", remainder) - - num_workers = args.cluster_size - args.num_ps - print("===== num_executors={}, num_workers={}, num_ps={}".format(args.cluster_size, num_workers, args.num_ps)) - - cluster = TFCluster.run(sc, main_fun, remainder, args.cluster_size, args.num_ps, False, TFCluster.InputMode.TENSORFLOW, master_node='master') - cluster.shutdown() diff --git a/examples/wide_deep/wide_deep_run_loop.py b/examples/wide_deep/wide_deep_run_loop.py deleted file mode 100644 index 7bc4c555..00000000 --- a/examples/wide_deep/wide_deep_run_loop.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Core run logic for TensorFlow Wide & Deep Tutorial using tf.estimator API.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import shutil - -from absl import app as absl_app -from absl import flags -import tensorflow as tf # pylint: disable=g-bad-import-order - -from official.utils.flags import core as flags_core -from official.utils.logs import hooks_helper -from official.utils.logs import logger -from official.utils.misc import model_helpers - - -LOSS_PREFIX = {'wide': 'linear/', 'deep': 'dnn/'} - - -def define_wide_deep_flags(): - """Add supervised learning flags, as well as wide-deep model type.""" - flags_core.define_base() - flags_core.define_benchmark() - flags_core.define_performance( - num_parallel_calls=False, inter_op=True, intra_op=True, - synthetic_data=False, max_train_steps=False, dtype=False, - all_reduce_alg=False) - - flags.adopt_module_key_flags(flags_core) - - flags.DEFINE_enum( - name="model_type", short_name="mt", default="wide_deep", - enum_values=['wide', 'deep', 'wide_deep'], - help="Select model topology.") - flags.DEFINE_boolean( - name="download_if_missing", default=True, help=flags_core.help_wrap( - "Download data to data_dir if it is not already present.")) - - -def export_model(model, model_type, export_dir, model_column_fn): - """Export to SavedModel format. - - Args: - model: Estimator object - model_type: string indicating model type. 
"wide", "deep" or "wide_deep" - export_dir: directory to export the model. - model_column_fn: Function to generate model feature columns. - """ - wide_columns, deep_columns = model_column_fn() - if model_type == 'wide': - columns = wide_columns - elif model_type == 'deep': - columns = deep_columns - else: - columns = wide_columns + deep_columns - feature_spec = tf.feature_column.make_parse_example_spec(columns) - example_input_fn = ( - tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)) - model.export_savedmodel(export_dir, example_input_fn, - strip_default_attrs=True) - - -def run_loop(name, train_input_fn, eval_input_fn, model_column_fn, - build_estimator_fn, flags_obj, tensors_to_log, early_stop=False): - """Define training loop.""" - model_helpers.apply_clean(flags.FLAGS) - model = build_estimator_fn( - model_dir=flags_obj.model_dir, model_type=flags_obj.model_type, - model_column_fn=model_column_fn, - inter_op=flags_obj.inter_op_parallelism_threads, - intra_op=flags_obj.intra_op_parallelism_threads) - - run_params = { - 'batch_size': flags_obj.batch_size, - 'train_epochs': flags_obj.train_epochs, - 'model_type': flags_obj.model_type, - } - - benchmark_logger = logger.get_benchmark_logger() - benchmark_logger.log_run_info('wide_deep', name, run_params, - test_id=flags_obj.benchmark_test_id) - - loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '') - tensors_to_log = {k: v.format(loss_prefix=loss_prefix) - for k, v in tensors_to_log.items()} - train_hooks = hooks_helper.get_train_hooks( - flags_obj.hooks, model_dir=flags_obj.model_dir, - batch_size=flags_obj.batch_size, tensors_to_log=tensors_to_log) - - # Train and evaluate the model every `flags.epochs_between_evals` epochs. 
- for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): - model.train(input_fn=train_input_fn, hooks=train_hooks) - - results = model.evaluate(input_fn=eval_input_fn) - - # Display evaluation metrics - tf.logging.info('Results at epoch %d / %d', - (n + 1) * flags_obj.epochs_between_evals, - flags_obj.train_epochs) - tf.logging.info('-' * 60) - - for key in sorted(results): - tf.logging.info('%s: %s' % (key, results[key])) - - benchmark_logger.log_evaluation_result(results) - - if early_stop and model_helpers.past_stop_threshold( - flags_obj.stop_threshold, results['accuracy']): - break - - # Export the model - if flags_obj.export_dir is not None: - export_model(model, flags_obj.model_type, flags_obj.export_dir, - model_column_fn) From 3a3d1e24d7c5dc2e922963f9e3337f22f1029f9e Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 6 Sep 2019 15:31:40 -0700 Subject: [PATCH 24/37] update versions; remove more outdated examples --- docs/source/conf.py | 4 +- examples/mnist/streaming/__init__.py | 0 examples/mnist/streaming/mnist_dist.py | 170 ------------------------ examples/mnist/streaming/mnist_spark.py | 65 --------- pom.xml | 2 +- setup.py | 2 +- tensorflowonspark/__init__.py | 2 +- 7 files changed, 5 insertions(+), 240 deletions(-) delete mode 100644 examples/mnist/streaming/__init__.py delete mode 100755 examples/mnist/streaming/mnist_dist.py delete mode 100755 examples/mnist/streaming/mnist_spark.py diff --git a/docs/source/conf.py b/docs/source/conf.py index b85ec818..6dd3ee39 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,9 +27,9 @@ author = 'Yahoo Inc' # The short X.Y version -version = '1.4.3' +version = '2.0.0' # The full version, including alpha/beta/rc tags -release = '1.4.3' +release = '2.0.0.rc0' # -- General configuration --------------------------------------------------- diff --git a/examples/mnist/streaming/__init__.py b/examples/mnist/streaming/__init__.py deleted file mode 100644 index e69de29b..00000000 diff 
--git a/examples/mnist/streaming/mnist_dist.py b/examples/mnist/streaming/mnist_dist.py deleted file mode 100755 index 98782ca3..00000000 --- a/examples/mnist/streaming/mnist_dist.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. - -# Distributed MNIST on grid based on TensorFlow MNIST example - -from __future__ import absolute_import -from __future__ import division -from __future__ import nested_scopes -from __future__ import print_function - - -def print_log(worker_num, arg): - print("{0}: {1}".format(worker_num, arg)) - - -def map_fun(args, ctx): - from tensorflowonspark import TFNode - from datetime import datetime - import math - import numpy - import tensorflow as tf - import time - - worker_num = ctx.worker_num - job_name = ctx.job_name - task_index = ctx.task_index - - IMAGE_PIXELS = 28 - - # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) - if job_name == "ps": - time.sleep((worker_num + 1) * 5) - - # Parameters - hidden_units = 128 - batch_size = args.batch_size - - # Get TF cluster and server instances - cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma) - - def feed_dict(batch): - # Convert from [(images, labels)] to two numpy arrays of the proper type - images = [] - labels = [] - for item in batch: - images.append(item[0]) - labels.append(item[1]) - xs = numpy.array(images) - xs = xs.astype(numpy.float32) - xs = xs / 255.0 - ys = numpy.array(labels) - ys = ys.astype(numpy.uint8) - return (xs, ys) - - if job_name == "ps": - server.join() - elif job_name == "worker": - - # Assigns ops to the local worker by default. 
- with tf.device(tf.train.replica_device_setter( - worker_device="/job:worker/task:%d" % task_index, - cluster=cluster)): - - # Variables of the hidden layer - hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], - stddev=1.0 / IMAGE_PIXELS), name="hid_w") - hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") - tf.summary.histogram("hidden_weights", hid_w) - - # Variables of the softmax layer - sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], - stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") - sm_b = tf.Variable(tf.zeros([10]), name="sm_b") - tf.summary.histogram("softmax_weights", sm_w) - - # Placeholders or QueueRunner/Readers for input data - x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x") - y_ = tf.placeholder(tf.float32, [None, 10], name="y_") - - x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) - tf.summary.image("x_img", x_img) - - hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) - hid = tf.nn.relu(hid_lin) - - y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) - - global_step = tf.Variable(0) - - loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) - tf.summary.scalar("loss", loss) - - train_op = tf.train.AdagradOptimizer(0.01).minimize( - loss, global_step=global_step) - - # Test trained model - label = tf.argmax(y_, 1, name="label") - prediction = tf.argmax(y, 1, name="prediction") - correct_prediction = tf.equal(prediction, label) - - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") - tf.summary.scalar("acc", accuracy) - - saver = tf.train.Saver() - summary_op = tf.summary.merge_all() - init_op = tf.global_variables_initializer() - - # Create a "supervisor", which oversees the training process and stores model state into HDFS - logdir = TFNode.hdfs_path(ctx, args.model) - print("tensorflow model path: {0}".format(logdir)) - summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, 
graph=tf.get_default_graph()) - - if args.mode == "train": - sv = tf.train.Supervisor(is_chief=(task_index == 0), - logdir=logdir, - init_op=init_op, - summary_op=None, - saver=saver, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=10) - else: - sv = tf.train.Supervisor(is_chief=(task_index == 0), - logdir=logdir, - summary_op=None, - saver=saver, - global_step=global_step, - stop_grace_secs=300, - save_model_secs=0) - - # The supervisor takes care of session initialization, restoring from - # a checkpoint, and closing when done or an error occurs. - with sv.managed_session(server.target) as sess: - print("{0} session ready".format(datetime.now().isoformat())) - - # Loop until the supervisor shuts down or 1000000 steps have completed. - step = 0 - tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train") - while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: - # Run a training step asynchronously. - # See `tf.train.SyncReplicasOptimizer` for additional details on how to - # perform *synchronous* training. 
- - # using feed_dict - batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) - feed = {x: batch_xs, y_: batch_ys} - - if len(batch_xs) > 0: - if args.mode == "train": - _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) - # print accuracy and save model checkpoint to HDFS every 100 steps - if (step % 100 == 0): - print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys}))) - - if sv.is_chief: - summary_writer.add_summary(summary, step) - else: # args.mode == "inference" - labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed) - - results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)] - tf_feed.batch_results(results) - print("acc: {0}".format(acc)) - - if sv.should_stop() or step >= args.steps: - tf_feed.terminate() - - # Ask for all the services to stop. - print("{0} stopping supervisor".format(datetime.now().isoformat())) - sv.stop() diff --git a/examples/mnist/streaming/mnist_spark.py b/examples/mnist/streaming/mnist_spark.py deleted file mode 100755 index 72f1c1f4..00000000 --- a/examples/mnist/streaming/mnist_spark.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from pyspark.context import SparkContext -from pyspark.conf import SparkConf -from pyspark.streaming import StreamingContext - -import argparse -import numpy -from datetime import datetime - -from tensorflowonspark import TFCluster -import mnist_dist - -sc = SparkContext(conf=SparkConf().setAppName("mnist_streaming")) -ssc = StreamingContext(sc, 60) -executors = sc._conf.get("spark.executor.instances") -num_executors = int(executors) if executors is not None else 1 -num_ps = 1 - -parser = argparse.ArgumentParser() -parser.add_argument("--batch_size", help="number of records per batch", type=int, default=100) -parser.add_argument("--epochs", help="number of epochs", type=int, default=1) -parser.add_argument("--format", help="example format: (csv|csv2|pickle|tfr)", choices=["csv", "csv2", "pickle", "tfr"], default="stream") -parser.add_argument("--images", help="HDFS path to MNIST images in parallelized format") -parser.add_argument("--model", help="HDFS path to save/load model during train/inference", default="mnist_model") -parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) -parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") -parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) -parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") -parser.add_argument("--mode", help="train|inference", default="train") -parser.add_argument("--rdma", help="use rdma connection", default=False) -args = parser.parse_args() -print("args:", args) - -print("{0} ===== Start".format(datetime.now().isoformat())) - - -def parse(ln): - lbl, img = ln.split('|') - image = [int(x) for x in img.split(',')] - label = numpy.zeros(10) - label[int(lbl)] = 1.0 - return (image, label) - - -stream = 
ssc.textFileStream(args.images) -imageRDD = stream.map(lambda ln: parse(ln)) - -cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) -if args.mode == "train": - cluster.train(imageRDD) -else: - labelRDD = cluster.inference(imageRDD) - labelRDD.saveAsTextFiles(args.output) - -ssc.start() -cluster.shutdown(ssc) - -print("{0} ===== Stop".format(datetime.now().isoformat())) diff --git a/pom.xml b/pom.xml index 281b1c5d..1b56730d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 com.yahoo.ml tensorflowonspark - 1.0.2 + 2.0.0-SNAPSHOT jar tensorflowonspark Spark Scala inferencing for TensorFlowOnSpark diff --git a/setup.py b/setup.py index 11dbfa8c..32b04dd9 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='tensorflowonspark', packages=['tensorflowonspark'], - version='1.4.3', + version='2.0.0.rc0', description='Deep learning with TensorFlow on Apache Spark clusters', long_description=long_description, long_description_content_type='text/markdown', diff --git a/tensorflowonspark/__init__.py b/tensorflowonspark/__init__.py index 96cf15c2..cec98d00 100644 --- a/tensorflowonspark/__init__.py +++ b/tensorflowonspark/__init__.py @@ -2,4 +2,4 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s (%(threadName)s-%(process)d) %(message)s") -__version__ = "1.4.3" +__version__ = "2.0.0.rc0" From 3a32a3e005df5865d833e8fa1628118ef79e1c73 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 6 Sep 2019 15:32:43 -0700 Subject: [PATCH 25/37] remove outdated examples/mnist/README.md --- examples/mnist/README.md | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 examples/mnist/README.md diff --git a/examples/mnist/README.md b/examples/mnist/README.md deleted file mode 100644 index 3d2e5ee5..00000000 --- a/examples/mnist/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# MNIST - -Original Source: 
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py - -Note: this has been heavily modified to support different input formats (CSV and TFRecords) as well as to demonstrate the different data ingestion methods (feed_dict and QueueRunner). - -Please follow [these instructions](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARN) to run this example. From d04f6cb2713fcef0cf30e806b808d33e2ac35498 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 6 Sep 2019 16:01:38 -0700 Subject: [PATCH 26/37] spark 2.4.4; use sphinx_rtd_theme --- .travis.yml | 8 ++++---- docs/source/conf.py | 4 +++- requirements.txt | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9f1557bd..78225dff 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,10 +4,10 @@ matrix: python: 2.7 dist: trusty before_install: - - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz + - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - export SPARK_HOME=./spark - mkdir $SPARK_HOME - - tar -xf spark-2.4.3-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 + - tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 - export PATH=$SPARK_HOME/bin:$PATH - export SPARK_LOCAL_IP=127.0.0.1 - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar @@ -21,10 +21,10 @@ matrix: python: 3.6 dist: trusty before_install: - - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz + - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - export SPARK_HOME=./spark - mkdir $SPARK_HOME - - tar -xf spark-2.4.3-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 + - tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 - export PATH=$SPARK_HOME/bin:$PATH - export SPARK_LOCAL_IP=127.0.0.1 - export 
SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar diff --git a/docs/source/conf.py b/docs/source/conf.py index 6dd3ee39..7365f018 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,6 +13,7 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import os +import sphinx_rtd_theme import sys _pysrc = os.path.abspath(os.path.join(os.path.abspath(__file__), '..', '..', '..')) @@ -45,6 +46,7 @@ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', + 'sphinx_rtd_theme' ] # Add any paths that contain templates here, relative to this directory. @@ -84,7 +86,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'classic' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/requirements.txt b/requirements.txt index 142c3519..585302cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ py4j pyspark scipy sphinx +sphinx_rtd_theme tensorflow==2.0.0-beta1 From 688a23774e2e0f1b34fda61b0c95ffd7b1495b3e Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 6 Sep 2019 16:13:51 -0700 Subject: [PATCH 27/37] fix h5py version; use TF2.0.0rc0 --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 585302cc..1884e552 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ +h5py==2.9.0 numpy>=1.14.0 py4j pyspark scipy sphinx sphinx_rtd_theme -tensorflow==2.0.0-beta1 +tensorflow==2.0.0rc0 From 3fd36767b90fd0d61dc15f3d8ca376d157dfe3f2 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Fri, 6 Sep 2019 16:19:08 -0700 Subject: [PATCH 28/37] go back to 2.0.0b1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1884e552..faf5b79e 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,4 @@ pyspark scipy sphinx sphinx_rtd_theme -tensorflow==2.0.0rc0 +tensorflow==2.0.0b1 From fea0d5879189f53928b07471fd949d2286107d0e Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 16 Sep 2019 15:37:55 -0700 Subject: [PATCH 29/37] fix compatibility issues --- test/test_pipeline.py | 57 +++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 7831fd70..3c6982e9 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -94,30 +94,55 @@ def _spark_train(args, ctx): from tensorflowonspark import TFNode tf.compat.v1.reset_default_graph() + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - model = Sequential() - model.add(Dense(1, activation='linear', input_shape=(2,))) - model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) - model.summary() + with strategy.scope(): + model = Sequential() + model.add(Dense(1, activation='linear', input_shape=[2])) + model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse']) + model.summary() tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) - while not tf_feed.should_stop(): - batch = tf_feed.next_batch(args.batch_size) - if args.input_mapping: + + def rdd_generator(): + while not tf_feed.should_stop(): + batch = tf_feed.next_batch(1) if len(batch['x']) > 0: - model.fit(np.array(batch['x']), np.array(batch['y_'])) + features = batch['x'][0] + label = batch['y_'][0] + yield (features, label) + else: + return + + ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (tf.TensorShape([2]), tf.TensorShape([1]))) + ds = ds.batch(args.batch_size) + + # disable auto-sharding dataset + options = tf.data.Options() + options.experimental_distribute.auto_shard = False + ds = ds.with_options(options) + + # only train 90% of each epoch to account for uneven RDD 
partition sizes + steps_per_epoch = 1000 * 0.9 // (args.batch_size * ctx.num_workers) + + tf.io.gfile.makedirs(args.model_dir) + filepath = args.model_dir + "/weights-{epoch:04d}" + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True)] + + model.fit(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks) + + # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy" + # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks) - if ctx.job_name == 'chief': - print("saving checkpoint to: {}".format(args.model_dir)) - tf.saved_model.save(model, args.model_dir) - # model.save_weights(args.model_dir + "/model", overwrite=True, save_format='tf') + if ctx.job_name == 'chief' and args.export_dir: + print("exporting model to: {}".format(args.export_dir)) + tf.keras.experimental.export_saved_model(model, args.export_dir) - if args.export_dir: - print("exporting model to: {}".format(args.export_dir)) - tf.keras.experimental.export_saved_model(model, args.export_dir) + tf_feed.terminate() # create a Spark DataFrame of training examples (features, labels) - trainDF = self.spark.createDataFrame(self.train_examples, ['col1', 'col2']).repartition(3) + rdd = self.sc.parallelize(self.train_examples, 3) + trainDF = rdd.toDF(['col1', 'col2']) # train and export model args = {} From 14d4aa9d75bc9386354c19724086cc865f8710f8 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 16 Sep 2019 16:26:14 -0700 Subject: [PATCH 30/37] minor edits --- examples/mnist/keras/README.md | 2 +- examples/mnist/keras/mnist_spark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mnist/keras/README.md b/examples/mnist/keras/README.md index ba816230..9b7d4bb0 100644 --- a/examples/mnist/keras/README.md +++ b/examples/mnist/keras/README.md @@ -75,7 +75,7 @@ The training code 
will automatically export a TensorFlow SavedModel, which can b export SAVED_MODEL=${MODEL_BASE}/${MODEL_VERSION} # use a CSV formatted test example (reshaping from [784] to [28, 28, 1]) - IMG=$(head -n 1 $TFoS_HOME/data/mnist/csv/test/part-00000 | python ${TFoS_HOME}/examples/utils/mnist_reshape.py) + IMG=$(head -n 1 ${TFoS_HOME}/data/mnist/csv/test/part-00000 | python ${TFoS_HOME}/examples/utils/mnist_reshape.py) # introspect model saved_model_cli show --dir $SAVED_MODEL --all diff --git a/examples/mnist/keras/mnist_spark.py b/examples/mnist/keras/mnist_spark.py index 8e3c6a15..ab6ad340 100644 --- a/examples/mnist/keras/mnist_spark.py +++ b/examples/mnist/keras/mnist_spark.py @@ -50,7 +50,7 @@ def rdd_generator(): # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)] tf.io.gfile.makedirs(args.model_dir) filepath = args.model_dir + "/weights-{epoch:04d}" - callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, load_weights_on_restart=True, save_weights_only=True)] + callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)] with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() From 5e6a69b5f7eedea45d06b8596ed1c95be2eb2b9a Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Mon, 30 Sep 2019 16:23:04 -0700 Subject: [PATCH 31/37] disable Java build; update README.md --- .travis.yml | 16 ++++++++-------- README.md | 17 +++++++++++------ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 78225dff..9e88a89d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,9 +35,9 @@ matrix: script: - sphinx-build -b html docs/source docs/build/html - test/run_tests.sh - - language: java - dist: trusty - jdk: oraclejdk8 +# - language: java +# dist: trusty +# jdk: oraclejdk8 notifications: email: false deploy: @@ -63,8 +63,8 @@ deploy: - provider: script script: mvn deploy -DskipTests --settings .travis.settings.xml skip_cleanup: true - on: - 
branch: master - jdk: oraclejdk8 - tags: true - condition: "$TRAVIS_TAG =~ ^scala_.*$" +# on: +# branch: master +# jdk: oraclejdk8 +# tags: true +# condition: "$TRAVIS_TAG =~ ^scala_.*$" diff --git a/README.md b/README.md index 131b925d..0d4d3f8b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ @@ -21,7 +21,7 @@ cluster with the following steps: 1. **Startup** - launches the Tensorflow main function on the executors, along with listeners for data/control messages. 1. **Data ingestion** - **InputMode.TENSORFLOW** - leverages TensorFlow's built-in APIs to read data files directly from HDFS. - - **InputMode.SPARK** - sends Spark RDD data to the TensorFlow nodes via the [feed_dict](https://www.tensorflow.org/how_tos/reading_data/#feeding) mechanism. Note that we leverage the [Hadoop Input/Output Format](https://github.com/tensorflow/ecosystem/tree/master/hadoop) to access TFRecords on HDFS. + - **InputMode.SPARK** - sends Spark RDD data to the TensorFlow nodes via a generator. Note that we leverage the [Hadoop Input/Output Format](https://github.com/tensorflow/ecosystem/tree/master/hadoop) to access TFRecords on HDFS. 1. **Shutdown** - shuts down the Tensorflow workers and PS nodes on the executors. ## Table of Contents @@ -36,7 +36,7 @@ cluster with the following steps: ## Background TensorFlowOnSpark was developed by Yahoo for large-scale distributed -deep learning on our Hadoop clusters in Yahoo's private cloud. +deep learning on our Hadoop clusters in Yahoo's private cloud. TensorFlowOnSpark provides some important benefits (see [our blog](http://yahoohadoop.tumblr.com/post/157196317141/open-sourcing-tensorflowonspark-distributed-deep)) @@ -44,15 +44,19 @@ over alternative deep learning solutions. 
* Easily migrate all existing TensorFlow programs with <10 lines of code change; * Support all TensorFlow functionalities: synchronous/asynchronous training, model/data parallelism, inferencing and TensorBoard; * Server-to-server direct communication achieves faster learning when available; - * Allow datasets on HDFS and other sources pushed by Spark or pulled by TensorFlow; + * Allow datasets on HDFS and other sources pushed by Spark or pulled by TensorFlow; * Easily integrate with your existing data processing pipelines and machine learning algorithms (ex. MLlib, CaffeOnSpark); - * Easily deployed on cloud or on-premise: CPU & GPU, Ethernet and Infiniband. + * Easily deployed on cloud or on-premise: CPU & GPU, Ethernet and Infiniband. ## Install TensorFlowOnSpark is provided as a pip package, which can be installed on single machines via: ``` +# for tensorflow>=2.0.0 pip install tensorflowonspark + +# for tensorflow<2.0.0 +pip install tensorflowonspark==1.4.4 ``` For distributed clusters, please see our [wiki site](../../wiki) for detailed documentation for specific environments, such as our getting started guides for [single-node Spark Standalone](https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_Standalone), [YARN clusters](../../wiki/GetStarted_YARN) and [AWS EC2](../../wiki/GetStarted_EC2). Note: the Windows operating system is not currently supported due to [this issue](https://github.com/yahoo/TensorFlowOnSpark/issues/36). @@ -61,6 +65,8 @@ For distributed clusters, please see our [wiki site](../../wiki) for detailed do To use TensorFlowOnSpark with an existing TensorFlow application, you can follow our [Conversion Guide](../../wiki/Conversion-Guide) to describe the required changes. Additionally, our [wiki site](../../wiki) has pointers to some presentations which provide an overview of the platform. +**Note: since TensorFlow 2.x breaks API compatibility with TensorFlow 1.x, the examples have been updated accordingly. 
If you are using TensorFlow 1.x, you will need to checkout the `v1.4.4` tag for compatible examples and instructions.** + ## API [API Documentation](https://yahoo.github.io/TensorFlowOnSpark/) is automatically generated from the code. @@ -71,7 +77,6 @@ Please join the [TensorFlowOnSpark user group](https://groups.google.com/forum/# Contributions are always welcome. For more information, please see our [guide for getting involved](Contributing.md). - ## License The use and distribution terms for this software are covered by the Apache 2.0 license. From 2941b020978c9a6519367c631d2d0b6764fad393 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 09:23:56 -0700 Subject: [PATCH 32/37] try xenial env --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9e88a89d..6c01d327 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ matrix: include: - language: python python: 2.7 - dist: trusty + dist: xenial before_install: - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - export SPARK_HOME=./spark @@ -19,7 +19,7 @@ matrix: - test/run_tests.sh - language: python python: 3.6 - dist: trusty + dist: xenial before_install: - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - export SPARK_HOME=./spark @@ -36,7 +36,7 @@ matrix: - sphinx-build -b html docs/source docs/build/html - test/run_tests.sh # - language: java -# dist: trusty +# dist: xenial # jdk: oraclejdk8 notifications: email: false From 2dbc0be45f2889738edcf0da6326d510eee303cc Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 11:05:48 -0700 Subject: [PATCH 33/37] update pip version --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 6c01d327..2c5c6d74 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ matrix: - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar - export 
PYTHONPATH=$(pwd) install: + - pip install --upgrade pip - pip install -r requirements.txt - pip install mock script: @@ -30,6 +31,7 @@ matrix: - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar - export PYTHONPATH=$(pwd) install: + - pip install --upgrade pip - pip install -r requirements.txt - pip list script: From 36b60155569aeabaf9d4456225efe176d0b8c0c1 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 11:16:12 -0700 Subject: [PATCH 34/37] adjust requirements.txt dependencies --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 477a3be0..6be7c9b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -h5py==2.9.0 +h5py>=2.9.0 numpy>=1.14.0 -py4j +py4j==0.10.7 pyspark scipy +setuptools>=41.0.0 sphinx sphinx_rtd_theme tensorflow>=2.0.0 From 7a9053e38c24f502f84d52bb4655bc469e415a92 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 13:11:13 -0700 Subject: [PATCH 35/37] move travis spark setup to script --- .travis.yml | 33 ++++----------------------------- scripts/local-setup-spark.sh | 10 ---------- scripts/travis_install_spark.sh | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 39 deletions(-) delete mode 100755 scripts/local-setup-spark.sh create mode 100644 scripts/travis_install_spark.sh diff --git a/.travis.yml b/.travis.yml index 2c5c6d74..0be8ec2f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,16 +4,9 @@ matrix: python: 2.7 dist: xenial before_install: - - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - - export SPARK_HOME=./spark - - mkdir $SPARK_HOME - - tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 - - export PATH=$SPARK_HOME/bin:$PATH - - export SPARK_LOCAL_IP=127.0.0.1 - - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar - - export PYTHONPATH=$(pwd) - install: + - source scripts/travis_install_spark.sh - pip install 
--upgrade pip + install: - pip install -r requirements.txt - pip install mock script: @@ -22,24 +15,14 @@ matrix: python: 3.6 dist: xenial before_install: - - curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz - - export SPARK_HOME=./spark - - mkdir $SPARK_HOME - - tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 - - export PATH=$SPARK_HOME/bin:$PATH - - export SPARK_LOCAL_IP=127.0.0.1 - - export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar - - export PYTHONPATH=$(pwd) - install: + - source scripts/travis_install_spark.sh - pip install --upgrade pip + install: - pip install -r requirements.txt - pip list script: - sphinx-build -b html docs/source docs/build/html - test/run_tests.sh -# - language: java -# dist: xenial -# jdk: oraclejdk8 notifications: email: false deploy: @@ -62,11 +45,3 @@ deploy: python: 3.6 tags: true condition: "$TRAVIS_TAG =~ ^v.*$" -- provider: script - script: mvn deploy -DskipTests --settings .travis.settings.xml - skip_cleanup: true -# on: -# branch: master -# jdk: oraclejdk8 -# tags: true -# condition: "$TRAVIS_TAG =~ ^scala_.*$" diff --git a/scripts/local-setup-spark.sh b/scripts/local-setup-spark.sh deleted file mode 100755 index d041e5ee..00000000 --- a/scripts/local-setup-spark.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2017 Yahoo Inc. -# Licensed under the terms of the Apache 2.0 license. -# Please see LICENSE file in the project root for terms. 
-# -# This script install Spark locally - -wget http://archive.apache.org/dist/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz -gunzip spark-1.6.0-bin-hadoop2.6.tgz -tar -xvf spark-1.6.0-bin-hadoop2.6.tar diff --git a/scripts/travis_install_spark.sh b/scripts/travis_install_spark.sh new file mode 100644 index 00000000..03154f27 --- /dev/null +++ b/scripts/travis_install_spark.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# TensorFlow 2.0.0 is tested/supported on Ubuntu 16 (xenial) or later +# But Travis' xenial build env uses JDK11, while Spark requires JDK8 + +# Install JDK8 +add-apt-repository -y ppa:openjdk-r/ppa +apt-get update +apt-get install -y openjdk-8-jdk --no-install-recommends +update-java-alternatives -s java-1.8.0-openjdk-amd64 + +# Download and install Spark +curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz +export SPARK_HOME=./spark +mkdir $SPARK_HOME +tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 +export SPARK_LOCAL_IP=127.0.0.1 +export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar +export PATH=$SPARK_HOME/bin:$PATH +export PYTHONPATH=$(pwd) +export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 From 6968b3c158f4994741c24fc0c8f4ecd1a7fbae40 Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 13:41:39 -0700 Subject: [PATCH 36/37] more travis setup --- .travis.yml | 6 ++---- ...stall_spark.sh => travis_before_install.sh} | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) rename scripts/{travis_install_spark.sh => travis_before_install.sh} (67%) diff --git a/.travis.yml b/.travis.yml index 0be8ec2f..5e677fe7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,7 @@ matrix: python: 2.7 dist: xenial before_install: - - source scripts/travis_install_spark.sh - - pip install --upgrade pip + - source scripts/travis_before_install.sh install: - pip install -r requirements.txt - pip install mock @@ -15,8 +14,7 @@ matrix: python: 3.6 dist: xenial before_install: 
- - source scripts/travis_install_spark.sh - - pip install --upgrade pip + - source scripts/travis_before_install.sh install: - pip install -r requirements.txt - pip list diff --git a/scripts/travis_install_spark.sh b/scripts/travis_before_install.sh similarity index 67% rename from scripts/travis_install_spark.sh rename to scripts/travis_before_install.sh index 03154f27..096b1267 100644 --- a/scripts/travis_install_spark.sh +++ b/scripts/travis_before_install.sh @@ -4,10 +4,10 @@ # But Travis' xenial build env uses JDK11, while Spark requires JDK8 # Install JDK8 -add-apt-repository -y ppa:openjdk-r/ppa -apt-get update -apt-get install -y openjdk-8-jdk --no-install-recommends -update-java-alternatives -s java-1.8.0-openjdk-amd64 +sudo add-apt-repository -y ppa:openjdk-r/ppa +sudo apt-get update +sudo apt-get install -y openjdk-8-jdk --no-install-recommends +sudo update-java-alternatives -s java-1.8.0-openjdk-amd64 # Download and install Spark curl -LO http://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz @@ -17,5 +17,13 @@ tar -xf spark-2.4.4-bin-hadoop2.7.tgz -C $SPARK_HOME --strip-components=1 export SPARK_LOCAL_IP=127.0.0.1 export SPARK_CLASSPATH=./lib/tensorflow-hadoop-1.0-SNAPSHOT.jar export PATH=$SPARK_HOME/bin:$PATH -export PYTHONPATH=$(pwd) export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 + +# Update Python +pip install --upgrade pip +export PYTHONPATH=$(pwd) + +# Echo versions +java -version +pyspark --version +python --version From cb0a169da1b27901077754845f9da0d188052acb Mon Sep 17 00:00:00 2001 From: Lee Yang Date: Tue, 1 Oct 2019 16:40:09 -0700 Subject: [PATCH 37/37] fix test --- scripts/travis_before_install.sh | 6 +----- test/test_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/travis_before_install.sh b/scripts/travis_before_install.sh index 096b1267..97e7ee3e 100644 --- a/scripts/travis_before_install.sh +++ b/scripts/travis_before_install.sh @@ -20,10 +20,6 @@ export 
PATH=$SPARK_HOME/bin:$PATH export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 # Update Python +# Note: TensorFlow 2.0.0 requires pip>19.0 pip install --upgrade pip export PYTHONPATH=$(pwd) - -# Echo versions -java -version -pyspark --version -python --version diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 3c6982e9..3fe1c19d 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -141,7 +141,7 @@ def rdd_generator(): tf_feed.terminate() # create a Spark DataFrame of training examples (features, labels) - rdd = self.sc.parallelize(self.train_examples, 3) + rdd = self.sc.parallelize(self.train_examples, 2) trainDF = rdd.toDF(['col1', 'col2']) # train and export model