diff --git a/.gitignore b/.gitignore index 899063f52..b87118ac9 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ data/.DS_Store *.pyc *.gz .spyproject/ +*~ + diff --git a/yapf.yaml b/.style.yapf similarity index 100% rename from yapf.yaml rename to .style.yapf diff --git a/docs/index.rst b/docs/index.rst index 7bea600b1..12b8bdca5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,18 @@ method, this part of the documentation is for you. modules/distributed modules/db + +Command-line Reference +---------------------- + +TensorLayer provides a handy command-line tool `tl` to perform some common tasks. + +.. toctree:: + :maxdepth: 2 + + modules/cli + + Indices and tables ================== diff --git a/docs/modules/cli.rst b/docs/modules/cli.rst new file mode 100644 index 000000000..6e6fc0267 --- /dev/null +++ b/docs/modules/cli.rst @@ -0,0 +1,6 @@ +CLI +====== + +.. automodule:: tensorlayer.cli + +.. automodule:: tensorlayer.cli.train diff --git a/docs/requirements.txt b/docs/requirements.txt index 7fab94be5..d41735b82 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,3 +3,5 @@ numpydoc scipy scikit-image matplotlib +pymongo +sphinx diff --git a/example/tutorial_imagenet_inceptionV3_distributed.py b/example/tutorial_imagenet_inceptionV3_distributed.py index 094b7046a..fe72310f6 100644 --- a/example/tutorial_imagenet_inceptionV3_distributed.py +++ b/example/tutorial_imagenet_inceptionV3_distributed.py @@ -117,7 +117,7 @@ def load_data(file, task_spec=None, batch_size=16, epochs=1, shuffle_size=0): dataset = dataset.shuffle(buffer_size=shuffle_size) def _parse_example_fn(line): - line_split = line.split(',') + line_split = line.decode().split(',') filename = line_split[0] labels_names = line_split[1:] # labels diff --git a/setup.py b/setup.py index 299945fd6..943d60224 100755 --- a/setup.py +++ b/setup.py @@ -14,12 +14,14 @@ include_package_data=True, author='TensorLayer Contributors', author_email='hao.dong11@imperial.ac.uk', - 
url = "https://github.com/zsdonghao/tensorlayer" , - license = "apache" , - packages = find_packages(), + url="https://github.com/tensorlayer/tensorlayer", + license="apache", + packages=find_packages(), install_requires=install_requires, - # scripts=['tutorial_mnist.py'], - description = "Reinforcement Learning and Deep Learning Library for Researcher and Engineer.", - keywords = "deep learning, reinforcement learning, tensorflow", + scripts=[ + 'tl', + ], + description="Reinforcement Learning and Deep Learning Library for Researcher and Engineer.", + keywords="deep learning, reinforcement learning, tensorflow", platform=['any'], ) diff --git a/tensorlayer/cli/__init__.py b/tensorlayer/cli/__init__.py new file mode 100644 index 000000000..249749dde --- /dev/null +++ b/tensorlayer/cli/__init__.py @@ -0,0 +1,3 @@ +""" +The tensorlayer.cli module provides a command-line tool for some common tasks. +""" diff --git a/tensorlayer/cli/__main__.py b/tensorlayer/cli/__main__.py new file mode 100644 index 000000000..f20479e3b --- /dev/null +++ b/tensorlayer/cli/__main__.py @@ -0,0 +1,13 @@ +import argparse +from tensorlayer.cli import train + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='tl') + subparsers = parser.add_subparsers(dest='cmd') + train_parser = subparsers.add_parser('train', help='train a model using multiple local GPUs or CPUs.') + train.build_arg_parser(train_parser) + args = parser.parse_args() + if args.cmd == 'train': + train.main(args) + else: + parser.print_help() diff --git a/tensorlayer/cli/train.py b/tensorlayer/cli/train.py new file mode 100755 index 000000000..f6bae9b54 --- /dev/null +++ b/tensorlayer/cli/train.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +tl train +======== +(Alpha release) + +The tensorlayer.cli.train module provides the ``tl train`` subcommand. +It helps the user bootstrap a TensorFlow/TensorLayer program for distributed training +using multiple GPU cards or CPUs on a computer. 
+
+You need to first set up the CUDA_VISIBLE_DEVICES to tell ``tl train``
+which GPUs are available. If the CUDA_VISIBLE_DEVICES is not given,
+``tl train`` would try its best to discover all available GPUs.
+
+In distributed training, each TensorFlow program needs a TF_CONFIG environment variable to describe
+the cluster. It also needs a master daemon to
+monitor all trainers. ``tl train`` is responsible
+for automatically managing these two tasks.
+
+Usage
+-----
+
+tl train [-h] [-p NUM_PSS] [-c CPU_TRAINERS] [args [args ...]]
+
+.. code-block:: bash
+
+  tl train example/tutorial_mnist_distributed.py
+
+  # example of using customized number of CPUs
+  tl train -c 16 example/tutorial_imagenet_inceptionV3_distributed.py
+
+  # example of running training program with customized arguments
+  tl train example/tutorial_imagenet_inceptionV3_distributed.py -- --batch_size 16
+
+
+Parameters
+----------
+
+- ``file``: python file path.
+
+- ``NUM_PSS`` : The number of parameter servers.
+
+- ``CPU_TRAINERS``: The number of CPU trainers.
+
+  It is recommended that ``NUM_PSS + CPU_TRAINERS <= cpu count``
+
+- ``args``: Any parameter after ``--`` would be passed to the python program.
+
+
+Notes
+-----
+
+A parallel training program would require multiple parameter servers
+to help parallel trainers to exchange intermediate gradients.
+The best number of parameter servers is often proportional to the
+size of your model as well as the number of CPUs available.
+You can control the number of parameter servers using the ``-p`` parameter.
+
+If you have a single computer with massive CPUs, you can use the ``-c`` parameter
+to enable CPU-only parallel training.
+The reason we are not supporting GPU-CPU co-training is that GPU and
+CPU are running at different speeds. Using them together in training would
+incur stragglers. 
+
+"""
+
+import argparse
+import json
+import multiprocessing
+import os
+import platform
+import re
+import subprocess
+import sys
+
+PORT_BASE = 10000
+
+
+def _get_gpu_ids():
+    if 'CUDA_VISIBLE_DEVICES' in os.environ:
+        return [int(x) for x in os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')]
+    if platform.system() in ['Darwin', 'Linux']:
+        return [int(d.replace('nvidia', '')) for d in os.listdir('/dev') if re.match('^nvidia\d+$', d)]
+    else:
+        print('Please set CUDA_VISIBLE_DEVICES (see http://acceleware.com/blog/cudavisibledevices-masking-gpus)')
+        return []
+
+
+GPU_IDS = _get_gpu_ids()
+
+
+def create_tf_config(cluster_spec, task_type, task_index):
+    return {
+        'cluster': cluster_spec,
+        'task': {
+            'type': task_type,
+            'index': task_index
+        },
+    }
+
+
+def create_tf_jobs(cluster_spec, prog, args):
+    gpu_assignment = dict((('worker', idx), gpu_idx) for (idx, gpu_idx) in enumerate(GPU_IDS))
+    for job_type in cluster_spec:
+        for task_index in range(len(cluster_spec[job_type])):
+            new_env = os.environ.copy()
+            new_env.update({
+                'CUDA_VISIBLE_DEVICES': str(gpu_assignment.get((job_type, task_index), '')),
+                'TF_CONFIG': json.dumps(create_tf_config(cluster_spec, job_type, task_index)),
+            })
+            yield subprocess.Popen(['python3', prog] + args, env=new_env)
+
+
+def validate_arguments(args):
+    if args.num_pss < 1:
+        print('Value error: must have at least one parameter server.')
+        exit(1)
+
+    if not GPU_IDS:
+        num_cpus = multiprocessing.cpu_count()
+        if args.cpu_trainers > num_cpus:
+            print('Value error: there are %s available CPUs but you are requiring %s.' % (num_cpus, args.cpu_trainers))
+            exit(1)
+
+    if not os.path.isfile(args.file):
+        print('Value error: model training file does not exist')
+        exit(1)
+
+
+def main(args):
+    validate_arguments(args)
+    num_workers = len(GPU_IDS) if GPU_IDS else args.cpu_trainers
+    print('Using program %s with args %s' % (args.file, ' '.join(args.args)))
+    print('Using %d workers, %d parameter servers, %d GPUs.' 
% (num_workers, args.num_pss, len(GPU_IDS)))
+    cluster_spec = {
+        'ps': ['localhost:%d' % (PORT_BASE + i) for i in range(args.num_pss)],
+        'worker': ['localhost:%d' % (PORT_BASE + args.num_pss + i) for i in range(num_workers)]
+    }
+    processes = list(create_tf_jobs(cluster_spec, args.file, args.args))
+    try:
+        print('Press ENTER to exit the training ...')
+        sys.stdin.readline()
+    except KeyboardInterrupt:  # https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
+        print('Keyboard interrupt received')
+    finally:
+        print('stopping all subprocesses ...')
+        for p in processes:
+            p.kill()
+        for p in processes:
+            p.wait()
+        print('END')
+
+
+def build_arg_parser(parser):
+    parser.add_argument('-p', '--pss', dest='num_pss', type=int, default=1, help='number of parameter servers')
+    parser.add_argument('-c', '--cpu_trainers', dest='cpu_trainers', type=int, default=1, help='number of CPU trainers')
+    parser.add_argument('file', help='model training file path')
+    parser.add_argument('args', nargs='*', type=str, help='arguments to pass to the training program')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    build_arg_parser(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/tl b/tl
new file mode 100755
index 000000000..3c5a78a40
--- /dev/null
+++ b/tl
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+
+export PYTHONPATH="${DIR}/src:${PYTHONPATH}"
+
+python3 -m tensorlayer.cli "$@"