diff --git a/examples/mnist_500_tasks/README.md b/examples/mnist_500_tasks/README.md index b510663e73..6fedc89520 100644 --- a/examples/mnist_500_tasks/README.md +++ b/examples/mnist_500_tasks/README.md @@ -7,4 +7,36 @@ Here we provide a CPU-only job with 500 tasks on a taskrole. The example use Con | ConvNet | CPU | 6h30m10s (500*5 epoch) | [Details](metrics/ConvNet_CPU_500Task.JPG) | 95.15% (lr: 0.0101) 98.53% (lr: 0.1001) 98.95% (lr: 0.9981)| [CPU_500Task_MNIST.yaml](yaml/CPU_500Task_MNIST.yaml) | ## Usage -To quickly submit a training job to the OpenPAI cluster, users can directly submit the corresponding yaml file as mentioned above (in the yaml folder). \ No newline at end of file +Before running this example, you should first make sure that you have at least one permitted storage in OpenPAI. If you don’t know how to use storage, please refer to [our doc](https://openpai.readthedocs.io/). + +Before submitting the yaml file mentioned above (in the yaml folder), you need to update the following commands with your own storage path: + +`master` taskrole: +``` +python get_results.py --number=500 --data_path /mnt/confignfs/mnist500_result/ +--> +python get_results.py --number=500 --data_path <your_storage_mount_path>/mnist500_result/ +``` + +`taskrole` taskrole: +``` +mount -t nfs4 10.151.40.235:/data data +--> +mount -t nfs4 <your_nfs_server>:<your_nfs_path> data +``` + +Now you can submit the yaml file to try this example, and **don't forget** to select the storage you want to use in the `data` area on the right side of the page. + +## Visualization of results + +When all instances in `taskrole` run successfully, you can view the visualized results through the running `master`. The following figure shows the final status of the successful job. It should be noted that the visualized results can only be viewed when the `master` is running. This taskrole will keep running until the user manually stops it. + + + +You can access the Jupyter notebook by visiting `<master_ip>:8888` in the browser. Then, click on the file `show_results.ipynb`. 
+ + + +Run it and get the following visualized result. + + \ No newline at end of file diff --git a/examples/mnist_500_tasks/images/final_status.JPG b/examples/mnist_500_tasks/images/final_status.JPG new file mode 100644 index 0000000000..2108707413 Binary files /dev/null and b/examples/mnist_500_tasks/images/final_status.JPG differ diff --git a/examples/mnist_500_tasks/images/show_results.JPG b/examples/mnist_500_tasks/images/show_results.JPG new file mode 100644 index 0000000000..8e8ab66922 Binary files /dev/null and b/examples/mnist_500_tasks/images/show_results.JPG differ diff --git a/examples/mnist_500_tasks/images/show_results_file.JPG b/examples/mnist_500_tasks/images/show_results_file.JPG new file mode 100644 index 0000000000..36e422145a Binary files /dev/null and b/examples/mnist_500_tasks/images/show_results_file.JPG differ diff --git a/examples/mnist_500_tasks/src/get_results.py b/examples/mnist_500_tasks/src/get_results.py new file mode 100644 index 0000000000..41ba81f85c --- /dev/null +++ b/examples/mnist_500_tasks/src/get_results.py @@ -0,0 +1,46 @@ + +import os +import csv +import time +import argparse +import shutil + +def summary(filepath, result_path): + """Append every row of the CSV file `filepath` to the CSV file `result_path`.""" + with open(filepath, 'r', newline='') as f: + csv_read = csv.reader(f) + with open(result_path, 'a', newline='') as r: + csv_write = csv.writer(r) + for line in csv_read: + csv_write.writerow(line) + +def main(): + """Collect the per-task result CSV files into --data_path, then merge them into ./results.csv.""" + parser = argparse.ArgumentParser(description='Display Results') + parser.add_argument('--number', type=int, default=500, + help='The number of learning rates') + parser.add_argument('--data_path', default='./mnist500_result/', + help='The directory where the result files are collected') + args = parser.parse_args() + + path = args.data_path + if not os.path.exists(path): + os.makedirs(path) + # Wait until --number result files (*.csv) are present; only CSV files count, because only CSV files are merged below + while sum(name.endswith('.csv') for name in os.listdir(path)) < args.number: + for file in os.listdir('.'): + if file.endswith('.csv'): + shutil.move(file, os.path.join(path, file)) + time.sleep(1) + for file in os.listdir('.'): + 
if file.endswith('.csv'): + shutil.move(file, os.path.join(path, file)) + + for file in os.listdir(path): + filepath = os.path.join(path, file) + if os.path.isfile(filepath) and file.endswith('.csv'): + summary(filepath, 'results.csv') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/mnist_500_tasks/src/mnist_lr_500.py b/examples/mnist_500_tasks/src/mnist_lr_500.py index 26c4ceb5c6..efd1eac0a9 100644 --- a/examples/mnist_500_tasks/src/mnist_lr_500.py +++ b/examples/mnist_500_tasks/src/mnist_lr_500.py @@ -6,7 +6,7 @@ import torch.optim as optim from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR - +import csv class Net(nn.Module): def __init__(self): @@ -68,7 +68,14 @@ def test(model, device, test_loader): print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset))) + + return 100. * correct / len(test_loader.dataset) +def write_result(filepath, lr, acc): + """Append one [lr, acc] row to the CSV file at filepath.""" + with open(filepath, 'a', newline='') as f: + csv_write = csv.writer(f) + csv_write.writerow([lr, acc]) def main(): # Training settings @@ -95,6 +102,8 @@ def main(): help='For Saving the current Model') parser.add_argument('--task_index', default=0, help='Multi-task Index') + parser.add_argument('--result_file', default='results.csv', + help='Accuracy of different learning rates') args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() @@ -131,13 +140,13 @@ def main(): scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) for epoch in range(1, args.epochs + 1): train(args, model, device, train_loader, optimizer, epoch) - test(model, device, test_loader) + acc = test(model, device, test_loader) scheduler.step() + write_result(args.result_file, lr, acc) if args.save_model: - torch.save(model.state_dict(), "mnist_cnn.pt") - + torch.save(model.state_dict(), "mnist_cnn.pt") if __name__ == '__main__': - 
main() + main() \ No newline at end of file diff --git a/examples/mnist_500_tasks/src/show_results.ipynb b/examples/mnist_500_tasks/src/show_results.ipynb new file mode 100644 index 0000000000..8733e6e640 --- /dev/null +++ b/examples/mnist_500_tasks/src/show_results.ipynb @@ -0,0 +1,37 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "results = np.genfromtxt('./results.csv', delimiter=\",\", names=[\"LR\",\"ACC\"])\n", + "plt.plot(results[\"LR\"], results[\"ACC\"], 'o')\n", + "plt.xlabel('Learning Rate')\n", + "plt.ylabel('Accuracy')\n", + "plt.show()" + ] + } + ] +} \ No newline at end of file diff --git a/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml b/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml index c01c9d4ede..021be13199 100644 --- a/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml +++ b/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml @@ -7,22 +7,52 @@ prerequisites: uri: 'openpai/standard:python_3.6-pytorch_1.4.0-cpu' name: docker_image_0 taskRoles: + master: + instances: 1 + completion: + minFailedInstances: 1 + taskRetryCount: 0 + dockerImage: docker_image_0 + resourcePerInstance: + gpu: 0 + cpu: 1 + memoryMB: 50000 + commands: + - >- + wget + https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/get_results.py + - >- + python get_results.py --number=500 --data_path + /mnt/confignfs/mnist500_result/ + - >- + wget + https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/show_results.ipynb + - jupyter 
notebook taskrole: instances: 500 completion: minFailedInstances: 1 + minSucceededInstances: -1 taskRetryCount: 0 dockerImage: docker_image_0 resourcePerInstance: gpu: 0 cpu: 1 - memoryMB: 51200 + memoryMB: 50000 commands: - >- - wget https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py + wget + https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py - >- python mnist_lr_500.py --epoch 5 --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX + - apt-get update + - apt-get install --assume-yes nfs-common + - mkdir -p data # mount point only; the result directory is created after mounting so it lands on the NFS share + - 'mount -t nfs4 10.151.40.235:/data data' + - >- + mkdir -p data/mnist500_result && cp results.csv + data/mnist500_result/results_$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX.csv defaults: virtualCluster: default extras: @@ -31,3 +61,7 @@ extras: - plugin: ssh parameters: jobssh: true + - plugin: teamwise_storage + parameters: + storageConfigNames: + - confignfs diff --git a/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile b/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile index 005c719740..51eab058c6 100644 --- a/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile +++ b/src/marketplace-restserver/build/marketplace-restserver.k8s.dockerfile @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -FROM docker.io/openpai/pai-marketplace-restserver:v1.2.0 +FROM docker.io/openpai/pai-marketplace-restserver:v1.3.0 diff --git a/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile b/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile index a2f12ff94a..71e47075f7 100644 --- a/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile +++ b/src/marketplace-webportal/build/marketplace-webportal.k8s.dockerfile @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-FROM docker.io/openpai/pai-marketplace-webportal:v1.2.0 +FROM docker.io/openpai/pai-marketplace-webportal:v1.3.0