Skip to content

Commit

Permalink
Handle remote GCS files
Browse files Browse the repository at this point in the history
  • Loading branch information
gogasca committed Aug 20, 2019
1 parent 4e29e27 commit 90153a9
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 46 deletions.
82 changes: 69 additions & 13 deletions tutorials/tensorflow/mlflow_gcp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,64 @@ This dataset is provided by a third party. Google provides no
representation, warranty, or other guarantees about the validity or any
other aspects of this dataset.

### Create a Compute Engine instance

Create a new Deep Learning Virtual Machine instance

```
export IMAGE_FAMILY="tf-latest-cpu"
export ZONE="us-central1-b"
export INSTANCE_NAME="mlflow-server"
gcloud compute instances create $INSTANCE_NAME \
--zone=$ZONE \
--image-family=$IMAGE_FAMILY \
--machine-type=n1-standard-8 \
--image-project=deeplearning-platform-release \
--maintenance-policy=TERMINATE \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--tags http-server,https-server
```

#### Installing MLflow

Install git, pip and virtual environment

```
sudo apt-get install git -y
sudo apt-get install python-pip -y
pip install virtualenv
```

Create virtual environment

```
virtualenv -p `which python3` mlflow_env
source mlflow_env/bin/activate
```

Install MLflow

```
pip install mlflow
```
Verify installation

```
pip freeze | grep mlflow
mlflow==1.2.0
```

### **Install dependencies**

In this tutorial we will train a TensorFlow model and use different
parameters. We will use MLflow to track those different parameters and
their metrics. Start by cloning the repo.

```
git clone https://github.com/GoogleCloudPlatform/ml-on-gcp.git
cd ml-on-gcp/tutorials/tensorflow/mlflow_gcp/
```

Install the python dependencies.

```
Expand Down Expand Up @@ -189,19 +245,19 @@ export JOB_NAME=mlflow_$DATE
export REGION=us-central1
export GCS_JOB_DIR=gs://mlflow_gcp/jobs/$JOB_NAME
gcloud ai-platform job sumit training $JOB_NAME \
--stream-logs \
--runtime-version 1.14 \
--package-path trainer \
--module-name trainer.task \
--region $REGION \
-- \
--train-files $TRAIN_FILE \
--eval-files $EVAL_FILE \
--job-dir $GCS_JOB_DIR \
--train-steps $TRAIN_STEPS \
--eval-steps $EVAL_STEPS
--mlflow-tracking-uri http://<MLFlow Public IP Address>:5000
gcloud ai-platform jobs submit training $JOB_NAME \
--stream-logs \
--runtime-version 1.14 \
--job-dir $GCS_JOB_DIR \
--package-path trainer \
--module-name trainer.task \
--region $REGION \
-- \
--train-files $TRAIN_FILE \
--eval-files $EVAL_FILE \
--train-steps $TRAIN_STEPS \
--eval-steps $EVAL_STEPS \
--mlflow-tracking-uri http://<MLflow Server Public IP Address>:<MLflow server port>
```


Expand Down
15 changes: 8 additions & 7 deletions tutorials/tensorflow/mlflow_gcp/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
numpy>=1.14
pandas>=0.22
six>=1.11
google-api-python-client
google-cloud-storage
tensorflow>=1.14,<2
mlflow>1.0,<2
# The pip syntax below allows us to not repeat
# In order to not maintain two separate dependency
# lists in setup.py vs requirements.txt
# See https://caremad.io/posts/2013/07/setup-vs-requirement/

--index-url https://pypi.python.org/simple/

-e .
36 changes: 36 additions & 0 deletions tutorials/tensorflow/mlflow_gcp/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = [
'numpy>=1.14',
'pandas>=0.22',
'six>=1.11',
'google-api-python-client',
'google-cloud-storage',
'tensorflow>=1.14,<2',
'mlflow>1.0,<2'
]

setup(
name='trainer',
version='0.1',
install_requires=REQUIRED_PACKAGES,
packages=find_packages(),
include_package_data=True,
description='AI Platform trainer'
)
27 changes: 15 additions & 12 deletions tutorials/tensorflow/mlflow_gcp/trainer/model_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,27 @@ def _create_service():
return discovery.build('ml', 'v1')


def copy_artifacts(source_path, destination_path):
"""
:param source_path:
:param destination_path:
:return:
"""
logging.info(
'Moving model directory from {} to {}'.format(source_path,
destination_path))
subprocess.call(
"gsutil -m cp -r {} {}".format(source_path, destination_path),
shell=True)


class AIPlatformModel(object):
def __init__(self, project_id):
self._project_id = project_id
self._service = _create_service()

def upload_model(self, model_local_path, model_gcs_path):
"""

:param model_local_path:
:param model_gcs_path:
:return:
"""
logging.info(
'Moving model directory from {} to {}'.format(model_local_path,
model_gcs_path))
subprocess.call(
"gsutil -m cp -r {} {}".format(model_local_path, model_gcs_path),
shell=True)

def model_exists(self, model_name):
"""
Expand Down
56 changes: 44 additions & 12 deletions tutorials/tensorflow/mlflow_gcp/trainer/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

import argparse
import logging
import tempfile
import os
import shutil

from builtins import int
from mlflow import pyfunc
Expand Down Expand Up @@ -157,7 +159,7 @@ def train_and_evaluate(args):
args: dictionary of arguments - see get_args() for details
"""

logging.info('Resume training:', args.reuse_job_dir)
logging.info('Resume training: {}'.format(args.reuse_job_dir))
if not args.reuse_job_dir:
if tf.io.gfile.exists(args.job_dir):
tf.io.gfile.rmtree(args.job_dir)
Expand Down Expand Up @@ -199,13 +201,23 @@ def train_and_evaluate(args):
# Train model
with mlflow.start_run() as active_run:
run_id = active_run.info.run_id

class MlflowCallback(tf.keras.callbacks.Callback):
# This function will be called after training completes.
def on_train_end(self, logs=None):
mlflow.log_param('num_layers', len(self.model.layers))
mlflow.log_param('optimizer_name',
type(self.model.optimizer).__name__)

mlflow_callback = MlflowCallback()
# Setup Learning Rate decay.
lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
lambda epoch: args.learning_rate + 0.02 * (0.5 ** (1 + epoch)),
verbose=False)
# Setup TensorBoard callback.
tensorboard_path = os.path.join(args.job_dir, run_id, 'tensorboard')
tensorboard_callback = tf.keras.callbacks.TensorBoard(
os.path.join(args.job_dir, run_id, 'tensorboard'),
tensorboard_path,
histogram_freq=1)
history = keras_model.fit(
training_dataset,
Expand All @@ -214,7 +226,8 @@ def train_and_evaluate(args):
validation_data=validation_dataset,
validation_steps=args.eval_steps,
verbose=1,
callbacks=[lr_decay_callback, tensorboard_callback])
callbacks=[lr_decay_callback, tensorboard_callback,
mlflow_callback])
metrics = history.history
logging.info(metrics)
keras_model.summary()
Expand All @@ -238,7 +251,19 @@ def train_and_evaluate(args):
model_local_path = os.path.join(args.job_dir, run_id, 'model')
tf.keras.experimental.export_saved_model(keras_model, model_local_path)
# Define artifacts.
logging.info('Model exported to: ', model_local_path)
logging.info('Model exported to: {}'.format(model_local_path))
# MLflow workaround since is unable to read GCS path.
if model_local_path.startswith('gs://'):
logging.info('Creating temp folder')
temp = tempfile.mkdtemp()
model_deployment.copy_artifacts(model_local_path, temp)
model_local_path = os.path.join(temp, 'model')
if tensorboard_path.startswith('gs://'):
logging.info('Creating temp folder')
temp = tempfile.mkdtemp()
model_deployment.copy_artifacts(tensorboard_path, temp)
tensorboard_path = temp

mlflow.tensorflow.log_model(tf_saved_model_dir=model_local_path,
tf_meta_graph_tags=[tag_constants.SERVING],
tf_signature_def_key='serving_default',
Expand All @@ -247,29 +272,36 @@ def train_and_evaluate(args):
pyfunc_model = mlflow.pyfunc.load_model(
mlflow.get_artifact_uri('model'))
logging.info('Uploading TensorFlow events as a run artifact.')
mlflow.log_artifacts(os.path.join(args.job_dir, run_id, 'tensorboard'),
artifact_path='events')
print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s" %
os.path.join(mlflow.get_artifact_uri(), 'events'))
mlflow.log_artifacts(tensorboard_path)
logging.info(
'Launch TensorBoard with:\n\ntensorboard --logdir=%s' %
tensorboard_path)
duration = time() - start_time
mlflow.log_metric('duration', duration)
mlflow.end_run()
if model_local_path.startswith('gs://') and tensorboard_path.startswith(
'gs://'):
shutil.rmtree(model_local_path)
shutil.rmtree(tensorboard_path)

# Deploy to AI Platform.
if args.deploy_gcp:
# Create AI Platform helper instance.
model_helper = model_deployment.AIPlatformModel(
project_id=args.project_id)
# Copy local model to GCS for deployment.
model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id, 'model')
model_helper.upload_model(model_local_path, model_gcs_path)
if not model_local_path.startswith('gs://'):
model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id,
'model')
model_deployment.copy_artifacts(model_local_path, model_gcs_path)
# Create model
model_helper.create_model(args.model_name)
# Create model version
model_helper.deploy_model(model_gcs_path, args.model_name, run_id,
args.run_time_version)
print('Model deployment in GCP completed')
print('This model took: ', duration, 'seconds to train and test.')
logging.info('Model deployment in GCP completed')
logging.info(
'This model took: {} seconds to train and test.'.format(duration))


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions tutorials/tensorflow/mlflow_gcp/trainer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,8 @@ def load_data(training_file_path, eval_file_path, *args, **kwargs):
"""

# TODO Download and clean custom files.
print('Location train file: %s, eval file %s', training_file_path,
eval_file_path)
print('Location train file: {}, eval file {}'.format(training_file_path,
eval_file_path))
training_file_path, eval_file_path = download(DATA_DIR)

# This census data uses the value '?' for missing entries. We use
Expand Down

0 comments on commit 90153a9

Please sign in to comment.