Handle remote GCS files

GoogleCloudPlatform · Aug 20, 2019 · 90153a9 · 90153a9
1 parent 4e29e27
commit 90153a9
Show file tree

Hide file tree

Showing 6 changed files with 174 additions and 46 deletions.
diff --git a/tutorials/tensorflow/mlflow_gcp/README.md b/tutorials/tensorflow/mlflow_gcp/README.md
@@ -37,8 +37,64 @@ This dataset is provided by a third party. Google provides no
 representation, warranty, or other guarantees about the validity or any 
 other aspects of this dataset.
 
+### Create a Compute Engine instance
+
+Create a new Deep Learning Virtual Machine instance
+
+```
+export IMAGE_FAMILY="tf-latest-cpu"
+export ZONE="us-central1-b"
+export INSTANCE_NAME="mlflow-server"
+gcloud compute instances create $INSTANCE_NAME \
+       --zone=$ZONE \
+       --image-family=$IMAGE_FAMILY \
+       --machine-type=n1-standard-8 \
+       --image-project=deeplearning-platform-release \
+       --maintenance-policy=TERMINATE \
+       --scopes=https://www.googleapis.com/auth/cloud-platform \
+       --tags http-server,https-server
+```
+
+#### Installing MLflow
+
+Install git, pip and virtual environment
+
+```
+sudo apt-get install git -y
+sudo apt-get install python-pip -y
+pip install virtualenv
+```
+
+Create virtual environment
+
+```
+virtualenv -p `which python3` mlflow_env
+source mlflow_env/bin/activate
+```
+
+Install MLflow
+
+```
+pip install mlflow
+```
+Verify installation
+
+```
+pip freeze | grep mlflow
+mlflow==1.2.0
+```
+
 ### **Install dependencies**
 
+In this tutorial we will train a TensorFlow model and use different 
+parameters. We will use MLflow to track those different parameters and 
+their metrics. Start by cloning the repo.
+
+```
+git clone https://github.com/GoogleCloudPlatform/ml-on-gcp.git
+cd ml-on-gcp/tutorials/tensorflow/mlflow_gcp/
+```
+
 Install the python dependencies. 
 
 ```
@@ -189,19 +245,19 @@ export JOB_NAME=mlflow_$DATE
 export REGION=us-central1
 export GCS_JOB_DIR=gs://mlflow_gcp/jobs/$JOB_NAME
 
-gcloud ai-platform job sumit training $JOB_NAME \
-    --stream-logs \
-    --runtime-version 1.14 \
-    --package-path trainer \
-    --module-name trainer.task \
-    --region $REGION \
-    -- \
-    --train-files $TRAIN_FILE \
-    --eval-files $EVAL_FILE \
-    --job-dir $GCS_JOB_DIR \
-    --train-steps $TRAIN_STEPS \
-    --eval-steps $EVAL_STEPS
-    --mlflow-tracking-uri http://<MLFlow Public IP Address>:5000
+gcloud ai-platform jobs submit training $JOB_NAME \
+   --stream-logs \
+   --runtime-version 1.14 \
+   --job-dir $GCS_JOB_DIR \
+   --package-path trainer \
+   --module-name trainer.task \
+   --region $REGION \
+   -- \
+   --train-files $TRAIN_FILE \
+   --eval-files $EVAL_FILE \
+   --train-steps $TRAIN_STEPS \
+   --eval-steps $EVAL_STEPS \
+   --mlflow-tracking-uri http://<MLflow Server Public IP Address>:<MLflow server port>
 ```
 
 

diff --git a/tutorials/tensorflow/mlflow_gcp/requirements.txt b/tutorials/tensorflow/mlflow_gcp/requirements.txt
@@ -1,7 +1,8 @@
-numpy>=1.14
-pandas>=0.22
-six>=1.11
-google-api-python-client
-google-cloud-storage
-tensorflow>=1.14,<2
-mlflow>1.0,<2
+# The pip syntax below allows us to not repeat
+# In order to not maintain two separate dependency
+# lists in setup.py vs requirements.txt
+# See https://caremad.io/posts/2013/07/setup-vs-requirement/
+
+--index-url https://pypi.python.org/simple/
+
+-e .
diff --git a/tutorials/tensorflow/mlflow_gcp/setup.py b/tutorials/tensorflow/mlflow_gcp/setup.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# Copyright 2019 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from setuptools import find_packages
+from setuptools import setup
+
+REQUIRED_PACKAGES = [
+    'numpy>=1.14',
+    'pandas>=0.22',
+    'six>=1.11',
+    'google-api-python-client',
+    'google-cloud-storage',
+    'tensorflow>=1.14,<2',
+    'mlflow>1.0,<2'
+]
+
+setup(
+    name='trainer',
+    version='0.1',
+    install_requires=REQUIRED_PACKAGES,
+    packages=find_packages(),
+    include_package_data=True,
+    description='AI Platform trainer'
+)
diff --git a/tutorials/tensorflow/mlflow_gcp/trainer/model_deployment.py b/tutorials/tensorflow/mlflow_gcp/trainer/model_deployment.py
@@ -33,24 +33,27 @@ def _create_service():
     return discovery.build('ml', 'v1')
 
 
+def copy_artifacts(source_path, destination_path):
+    """
+
+    :param source_path:
+    :param destination_path:
+    :return:
+    """
+    logging.info(
+        'Moving model directory from {} to {}'.format(source_path,
+                                                      destination_path))
+    subprocess.call(
+        "gsutil -m cp -r {} {}".format(source_path, destination_path),
+        shell=True)
+
+
 class AIPlatformModel(object):
     def __init__(self, project_id):
         self._project_id = project_id
         self._service = _create_service()
 
-    def upload_model(self, model_local_path, model_gcs_path):
-        """
 
-        :param model_local_path:
-        :param model_gcs_path:
-        :return:
-        """
-        logging.info(
-            'Moving model directory from {} to {}'.format(model_local_path,
-                                                          model_gcs_path))
-        subprocess.call(
-            "gsutil -m cp -r {} {}".format(model_local_path, model_gcs_path),
-            shell=True)
 
     def model_exists(self, model_name):
         """

diff --git a/tutorials/tensorflow/mlflow_gcp/trainer/task.py b/tutorials/tensorflow/mlflow_gcp/trainer/task.py
@@ -19,7 +19,9 @@
 
 import argparse
 import logging
+import tempfile
 import os
+import shutil
 
 from builtins import int
 from mlflow import pyfunc
@@ -157,7 +159,7 @@ def train_and_evaluate(args):
       args: dictionary of arguments - see get_args() for details
     """
 
-    logging.info('Resume training:', args.reuse_job_dir)
+    logging.info('Resume training: {}'.format(args.reuse_job_dir))
     if not args.reuse_job_dir:
         if tf.io.gfile.exists(args.job_dir):
             tf.io.gfile.rmtree(args.job_dir)
@@ -199,13 +201,23 @@ def train_and_evaluate(args):
     # Train model
     with mlflow.start_run() as active_run:
         run_id = active_run.info.run_id
+
+        class MlflowCallback(tf.keras.callbacks.Callback):
+            # This function will be called after training completes.
+            def on_train_end(self, logs=None):
+                mlflow.log_param('num_layers', len(self.model.layers))
+                mlflow.log_param('optimizer_name',
+                                 type(self.model.optimizer).__name__)
+
+        mlflow_callback = MlflowCallback()
         # Setup Learning Rate decay.
         lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
             lambda epoch: args.learning_rate + 0.02 * (0.5 ** (1 + epoch)),
             verbose=False)
         # Setup TensorBoard callback.
+        tensorboard_path = os.path.join(args.job_dir, run_id, 'tensorboard')
         tensorboard_callback = tf.keras.callbacks.TensorBoard(
-            os.path.join(args.job_dir, run_id, 'tensorboard'),
+            tensorboard_path,
             histogram_freq=1)
         history = keras_model.fit(
             training_dataset,
@@ -214,7 +226,8 @@ def train_and_evaluate(args):
             validation_data=validation_dataset,
             validation_steps=args.eval_steps,
             verbose=1,
-            callbacks=[lr_decay_callback, tensorboard_callback])
+            callbacks=[lr_decay_callback, tensorboard_callback,
+                       mlflow_callback])
         metrics = history.history
         logging.info(metrics)
         keras_model.summary()
@@ -238,7 +251,19 @@ def train_and_evaluate(args):
         model_local_path = os.path.join(args.job_dir, run_id, 'model')
         tf.keras.experimental.export_saved_model(keras_model, model_local_path)
         # Define artifacts.
-        logging.info('Model exported to: ', model_local_path)
+        logging.info('Model exported to: {}'.format(model_local_path))
+        # MLflow workaround since is unable to read GCS path.
+        if model_local_path.startswith('gs://'):
+            logging.info('Creating temp folder')
+            temp = tempfile.mkdtemp()
+            model_deployment.copy_artifacts(model_local_path, temp)
+            model_local_path = os.path.join(temp, 'model')
+        if tensorboard_path.startswith('gs://'):
+            logging.info('Creating temp folder')
+            temp = tempfile.mkdtemp()
+            model_deployment.copy_artifacts(tensorboard_path, temp)
+            tensorboard_path = temp
+
         mlflow.tensorflow.log_model(tf_saved_model_dir=model_local_path,
                                     tf_meta_graph_tags=[tag_constants.SERVING],
                                     tf_signature_def_key='serving_default',
@@ -247,29 +272,36 @@ def train_and_evaluate(args):
         pyfunc_model = mlflow.pyfunc.load_model(
             mlflow.get_artifact_uri('model'))
         logging.info('Uploading TensorFlow events as a run artifact.')
-        mlflow.log_artifacts(os.path.join(args.job_dir, run_id, 'tensorboard'),
-                             artifact_path='events')
-        print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s" %
-              os.path.join(mlflow.get_artifact_uri(), 'events'))
+        mlflow.log_artifacts(tensorboard_path)
+        logging.info(
+            'Launch TensorBoard with:\n\ntensorboard --logdir=%s' %
+            tensorboard_path)
         duration = time() - start_time
         mlflow.log_metric('duration', duration)
         mlflow.end_run()
+        if model_local_path.startswith('gs://') and tensorboard_path.startswith(
+            'gs://'):
+            shutil.rmtree(model_local_path)
+            shutil.rmtree(tensorboard_path)
 
     # Deploy to AI Platform.
     if args.deploy_gcp:
         # Create AI Platform helper instance.
         model_helper = model_deployment.AIPlatformModel(
             project_id=args.project_id)
         # Copy local model to GCS for deployment.
-        model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id, 'model')
-        model_helper.upload_model(model_local_path, model_gcs_path)
+        if not model_local_path.startswith('gs://'):
+            model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id,
+                                          'model')
+            model_deployment.copy_artifacts(model_local_path, model_gcs_path)
         # Create model
         model_helper.create_model(args.model_name)
         # Create model version
         model_helper.deploy_model(model_gcs_path, args.model_name, run_id,
                                   args.run_time_version)
-        print('Model deployment in GCP completed')
-    print('This model took: ', duration, 'seconds to train and test.')
+        logging.info('Model deployment in GCP completed')
+    logging.info(
+        'This model took: {} seconds to train and test.'.format(duration))
 
 
 if __name__ == '__main__':

diff --git a/tutorials/tensorflow/mlflow_gcp/trainer/utils.py b/tutorials/tensorflow/mlflow_gcp/trainer/utils.py
@@ -205,8 +205,8 @@ def load_data(training_file_path, eval_file_path, *args, **kwargs):
     """
 
     # TODO Download and clean custom files.
-    print('Location train file: %s, eval file %s', training_file_path,
-          eval_file_path)
+    print('Location train file: {}, eval file {}'.format(training_file_path,
+                                                         eval_file_path))
     training_file_path, eval_file_path = download(DATA_DIR)
 
     # This census data uses the value '?' for missing entries. We use