diff --git a/.gitignore b/.gitignore index 899063f52..b87118ac9 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ data/.DS_Store *.pyc *.gz .spyproject/ +*~ + diff --git a/yapf.yaml b/.style.yapf similarity index 100% rename from yapf.yaml rename to .style.yapf diff --git a/docs/index.rst b/docs/index.rst index 7bea600b1..12b8bdca5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,6 +56,18 @@ method, this part of the documentation is for you. modules/distributed modules/db + +Command-line Reference +---------------------- + +TensorLayer provides a handy command-line tool `tl` to perform some common tasks. + +.. toctree:: + :maxdepth: 2 + + modules/cli + + Indices and tables ================== diff --git a/docs/modules/cli.rst b/docs/modules/cli.rst new file mode 100644 index 000000000..6e6fc0267 --- /dev/null +++ b/docs/modules/cli.rst @@ -0,0 +1,6 @@ +CLI +====== + +.. automodule:: tensorlayer.cli + +.. automodule:: tensorlayer.cli.train diff --git a/docs/requirements.txt b/docs/requirements.txt index 7fab94be5..d41735b82 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,3 +3,5 @@ numpydoc scipy scikit-image matplotlib +pymongo +sphinx diff --git a/example/tutorial_imagenet_inceptionV3_distributed.py b/example/tutorial_imagenet_inceptionV3_distributed.py index 094b7046a..fe72310f6 100644 --- a/example/tutorial_imagenet_inceptionV3_distributed.py +++ b/example/tutorial_imagenet_inceptionV3_distributed.py @@ -117,7 +117,7 @@ def load_data(file, task_spec=None, batch_size=16, epochs=1, shuffle_size=0): dataset = dataset.shuffle(buffer_size=shuffle_size) def _parse_example_fn(line): - line_split = line.split(',') + line_split = line.decode().split(',') filename = line_split[0] labels_names = line_split[1:] # labels diff --git a/setup.py b/setup.py index 299945fd6..943d60224 100755 --- a/setup.py +++ b/setup.py @@ -14,12 +14,14 @@ include_package_data=True, author='TensorLayer Contributors', author_email='hao.dong11@imperial.ac.uk', - 
url = "https://github.com/zsdonghao/tensorlayer" , - license = "apache" , - packages = find_packages(), + url="https://github.com/tensorlayer/tensorlayer", + license="apache", + packages=find_packages(), install_requires=install_requires, - # scripts=['tutorial_mnist.py'], - description = "Reinforcement Learning and Deep Learning Library for Researcher and Engineer.", - keywords = "deep learning, reinforcement learning, tensorflow", + scripts=[ + 'tl', + ], + description="Reinforcement Learning and Deep Learning Library for Researcher and Engineer.", + keywords="deep learning, reinforcement learning, tensorflow", platform=['any'], ) diff --git a/tensorlayer/cli/__init__.py b/tensorlayer/cli/__init__.py new file mode 100644 index 000000000..249749dde --- /dev/null +++ b/tensorlayer/cli/__init__.py @@ -0,0 +1,3 @@ +""" +The tensorlayer.cli module provides a command-line tool for some common tasks. +""" diff --git a/tensorlayer/cli/__main__.py b/tensorlayer/cli/__main__.py new file mode 100644 index 000000000..f20479e3b --- /dev/null +++ b/tensorlayer/cli/__main__.py @@ -0,0 +1,13 @@ +import argparse +from tensorlayer.cli import train + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='tl') + subparsers = parser.add_subparsers(dest='cmd') + train_parser = subparsers.add_parser('train', help='train a model using multiple local GPUs or CPUs.') + train.build_arg_parser(train_parser) + args = parser.parse_args() + if args.cmd == 'train': + train.main(args) + else: + parser.print_help() diff --git a/tensorlayer/cli/train.py b/tensorlayer/cli/train.py new file mode 100755 index 000000000..f6bae9b54 --- /dev/null +++ b/tensorlayer/cli/train.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +tl train +======== +(Alpha release) + +The tensorlayer.cli.train module provides the ``tl train`` subcommand. +It helps the user bootstrap a TensorFlow/TensorLayer program for distributed training +using multiple GPU cards or CPUs on a computer. 
+
+You need to first set up the CUDA_VISIBLE_DEVICES to tell ``tl train``
+which GPUs are available. If the CUDA_VISIBLE_DEVICES is not given,
+``tl train`` would try its best to discover all available GPUs.
+
+In distributed training, each TensorFlow program needs a TF_CONFIG environment variable to describe
+the cluster. It also needs a master daemon to
+monitor all trainers. ``tl train`` is responsible
+for automatically managing these two tasks.
+
+Usage
+-----
+
+tl train [-h] [-p NUM_PSS] [-c CPU_TRAINERS] [args [args ...]]
+
+.. code-block:: bash
+
+  tl train example/tutorial_mnist_distributed.py
+
+  # example of using customized number of CPUs
+  tl train -c 16 example/tutorial_imagenet_inceptionV3_distributed.py
+
+  # example of running training program with customized arguments
+  tl train example/tutorial_imagenet_inceptionV3_distributed.py -- --batch_size 16
+
+
+Parameters
+----------
+
+- ``file``: python file path.
+
+- ``NUM_PSS`` : The number of parameter servers.
+
+- ``CPU_TRAINERS``: The number of CPU trainers.
+
+  It is recommended that ``NUM_PSS + CPU_TRAINERS <= cpu count``
+
+- ``args``: Any parameter after ``--`` would be passed to the python program.
+
+
+Notes
+-----
+
+A parallel training program would require multiple parameter servers
+to help parallel trainers to exchange intermediate gradients.
+The best number of parameter servers is often proportional to the
+size of your model as well as the number of CPUs available.
+You can control the number of parameter servers using the ``-p`` parameter.
+
+If you have a single computer with massive CPUs, you can use the ``-c`` parameter
+to enable CPU-only parallel training.
+The reason we are not supporting GPU-CPU co-training is that GPU and
+CPU are running at different speeds. Using them together in training would
+incur stragglers. 
+
+"""
+
+import argparse
+import json
+import multiprocessing
+import os
+import platform
+import re
+import subprocess
+import sys
+
+PORT_BASE = 10000
+
+
+def _get_gpu_ids():
+    if 'CUDA_VISIBLE_DEVICES' in os.environ:
+        return [int(x) for x in os.environ.get('CUDA_VISIBLE_DEVICES', '').split(',')]
+    if platform.system() in ['Darwin', 'Linux']:
+        return [int(d.replace('nvidia', '')) for d in os.listdir('/dev') if re.match('^nvidia\d+$', d)]
+    else:
+        print('Please set CUDA_VISIBLE_DEVICES (see http://acceleware.com/blog/cudavisibledevices-masking-gpus)')
+        return []
+
+
+GPU_IDS = _get_gpu_ids()
+
+
+def create_tf_config(cluster_spec, task_type, task_index):
+    return {
+        'cluster': cluster_spec,
+        'task': {
+            'type': task_type,
+            'index': task_index
+        },
+    }
+
+
+def create_tf_jobs(cluster_spec, prog, args):
+    gpu_assignment = dict((('worker', idx), gpu_idx) for (idx, gpu_idx) in enumerate(GPU_IDS))
+    for job_type in cluster_spec:
+        for task_index in range(len(cluster_spec[job_type])):
+            new_env = os.environ.copy()
+            new_env.update({
+                'CUDA_VISIBLE_DEVICES': str(gpu_assignment.get((job_type, task_index), '')),
+                'TF_CONFIG': json.dumps(create_tf_config(cluster_spec, job_type, task_index)),
+            })
+            yield subprocess.Popen(['python3', prog] + args, env=new_env)
+
+
+def validate_arguments(args):
+    if args.num_pss < 1:
+        print('Value error: must have at least one parameter server.')
+        exit(1)
+
+    if not GPU_IDS:
+        num_cpus = multiprocessing.cpu_count()
+        if args.cpu_trainers > num_cpus:
+            print('Value error: there are %s available CPUs but you are requiring %s.' % (num_cpus, args.cpu_trainers))
+            exit(1)
+
+    if not os.path.isfile(args.file):
+        print('Value error: model training file does not exist')
+        exit(1)
+
+
+def main(args):
+    validate_arguments(args)
+    num_workers = len(GPU_IDS) if GPU_IDS else args.cpu_trainers
+    print('Using program %s with args %s' % (args.file, ' '.join(args.args)))
+    print('Using %d workers, %d parameter servers, %d GPUs.' 
% (num_workers, args.num_pss, len(GPU_IDS)))
+    cluster_spec = {
+        'ps': ['localhost:%d' % (PORT_BASE + i) for i in range(args.num_pss)],
+        'worker': ['localhost:%d' % (PORT_BASE + args.num_pss + i) for i in range(num_workers)]
+    }
+    processes = list(create_tf_jobs(cluster_spec, args.file, args.args))
+    try:
+        print('Press ENTER to exit the training ...')
+        sys.stdin.readline()
+    except KeyboardInterrupt:  # https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
+        print('Keyboard interrupt received')
+    finally:
+        print('stopping all subprocesses ...')
+        for p in processes:
+            p.kill()
+        for p in processes:
+            p.wait()
+        print('END')
+
+
+def build_arg_parser(parser):
+    parser.add_argument('-p', '--pss', dest='num_pss', type=int, default=1, help='number of parameter servers')
+    parser.add_argument('-c', '--cpu_trainers', dest='cpu_trainers', type=int, default=1, help='number of CPU trainers')
+    parser.add_argument('file', help='model training file path')
+    parser.add_argument('args', nargs='*', type=str, help='arguments to pass to the training program')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    build_arg_parser(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/tl b/tl
new file mode 100755
index 000000000..3c5a78a40
--- /dev/null
+++ b/tl
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+
+export PYTHONPATH="${DIR}/src:${PYTHONPATH}"
+
+python3 -m tensorlayer.cli "$@"