From d931b9d62df613d6d980d41c169b77406a2a8848 Mon Sep 17 00:00:00 2001
From: srishtih
Date: Tue, 27 Feb 2024 13:20:09 -0800
Subject: [PATCH] refactor pytorch-cuda12 image to include torchvision (#2279)

---
 images/pytorch-cuda12/README.md            |   3 +-
 images/pytorch-cuda12/config/main.tf       |   7 +-
 images/pytorch-cuda12/main.tf              |   2 +-
 .../tests/pytorch-helm-install.sh          |   3 +-
 images/pytorch-cuda12/tests/quickstart.py  | 110 ++++++++++++++++++
 images/pytorch-cuda12/tests/torch_optim.py |  53 ---------
 6 files changed, 118 insertions(+), 60 deletions(-)
 create mode 100644 images/pytorch-cuda12/tests/quickstart.py
 delete mode 100644 images/pytorch-cuda12/tests/torch_optim.py

diff --git a/images/pytorch-cuda12/README.md b/images/pytorch-cuda12/README.md
index 974bbeded2..a3e714a33d 100644
--- a/images/pytorch-cuda12/README.md
+++ b/images/pytorch-cuda12/README.md
@@ -60,8 +60,9 @@ As a quick intro, we will use pytorch to create a very simple deep learning mode
 
 To run this script,
 
 ```bash
-docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest python /tmp/model_builder.py
+docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest -c "python /tmp/model_builder.py"
 ```
+The quickstart tutorial outlined [here](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html) can also be run using the tests/quickstart.py script, similar to the run above.
 
 ### Using Helm charts

diff --git a/images/pytorch-cuda12/config/main.tf b/images/pytorch-cuda12/config/main.tf
index d8e51a28c5..a31ecda3a9 100644
--- a/images/pytorch-cuda12/config/main.tf
+++ b/images/pytorch-cuda12/config/main.tf
@@ -7,7 +7,8 @@ terraform {
 variable "extra_packages" {
   description = "Additional packages to install."
   type        = list(string)
-  default     = ["pytorch-cuda12"]
+  # torchvision is currently built on top of torch and should include all the packages we expect from it
+  default     = ["torchvision-cuda12"]
 }
 
 variable "extra_repositories" {
@@ -37,10 +38,10 @@ output "config" {
     }
     accounts = module.accts.block
     environment = merge({
-      "PATH" : "/usr/share/pytorch/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+      "PATH" : "/usr/share/torchvision/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     }, var.environment)
     entrypoint = {
-      command = "/bin/bash -c"
+      command = "/bin/bash"
     }
     archs = ["x86_64"]
   })

diff --git a/images/pytorch-cuda12/main.tf b/images/pytorch-cuda12/main.tf
index 514d1545bd..e1073dba67 100644
--- a/images/pytorch-cuda12/main.tf
+++ b/images/pytorch-cuda12/main.tf
@@ -10,7 +10,7 @@ variable "target_repository" {
 
 module "config" {
   source         = "./config"
-  extra_packages = ["pytorch-cuda12", "busybox", "bash"]
+  extra_packages = ["torchvision-cuda12", "busybox", "bash"]
 }
 
 module "latest" {

diff --git a/images/pytorch-cuda12/tests/pytorch-helm-install.sh b/images/pytorch-cuda12/tests/pytorch-helm-install.sh
index 734fecf1cd..a3e558c01c 100755
--- a/images/pytorch-cuda12/tests/pytorch-helm-install.sh
+++ b/images/pytorch-cuda12/tests/pytorch-helm-install.sh
@@ -10,12 +10,11 @@ set -o errexit -o nounset -o errtrace -o pipefail -x
 
 my_d=$(cd "${0%/*}" && pwd)
 
-test_script="/home/runner/work/images-private/images-private/images/pytorch/tests/torch_optim.py"
 exit_code=1
 
 run_scripts() {
   pod_name=$(kubectl get pods -n ${RELEASE_NAMESPACE} -l "app.kubernetes.io/instance=${RELEASE_NAME}" -o custom-columns=:metadata.name --no-headers | head -n 1)
-  kubectl cp "${my_d}/torch_optim.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
+  kubectl cp "${my_d}/quickstart.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
   kubectl exec "$pod_name" -n ${RELEASE_NAMESPACE} -- python /tmp/pytorch.py
   exit_code=$?
   if [ $exit_code -eq 0 ]; then

diff --git a/images/pytorch-cuda12/tests/quickstart.py b/images/pytorch-cuda12/tests/quickstart.py
new file mode 100644
index 0000000000..5113fb580c
--- /dev/null
+++ b/images/pytorch-cuda12/tests/quickstart.py
@@ -0,0 +1,110 @@
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+# Download training data from open datasets.
+training_data = datasets.FashionMNIST(
+    root="data",
+    train=True,
+    download=True,
+    transform=ToTensor(),
+)
+
+# Download test data from open datasets.
+test_data = datasets.FashionMNIST(
+    root="data",
+    train=False,
+    download=True,
+    transform=ToTensor(),
+)
+
+batch_size = 64
+
+# Create data loaders.
+train_dataloader = DataLoader(training_data, batch_size=batch_size)
+test_dataloader = DataLoader(test_data, batch_size=batch_size)
+
+for X, y in test_dataloader:
+    print(f"Shape of X [N, C, H, W]: {X.shape}")
+    print(f"Shape of y: {y.shape} {y.dtype}")
+    break
+
+# Get cpu, gpu or mps device for training.
+device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+print(f"Using {device} device")
+
+# Define model
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28*28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10)
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+model = NeuralNetwork().to(device)
+print(model)
+
+
+loss_fn = nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+def train(dataloader, model, loss_fn, optimizer):
+    size = len(dataloader.dataset)
+    model.train()
+    for batch, (X, y) in enumerate(dataloader):
+        X, y = X.to(device), y.to(device)
+
+        # Compute prediction error
+        pred = model(X)
+        loss = loss_fn(pred, y)
+
+        # Backpropagation
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        if batch % 100 == 0:
+            loss, current = loss.item(), (batch + 1) * len(X)
+            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
+
+
+def test(dataloader, model, loss_fn):
+    size = len(dataloader.dataset)
+    num_batches = len(dataloader)
+    model.eval()
+    test_loss, correct = 0, 0
+    with torch.no_grad():
+        for X, y in dataloader:
+            X, y = X.to(device), y.to(device)
+            pred = model(X)
+            test_loss += loss_fn(pred, y).item()
+            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
+    test_loss /= num_batches
+    correct /= size
+    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+
+
+epochs = 3
+for t in range(epochs):
+    print(f"Epoch {t+1}\n-------------------------------")
+    train(train_dataloader, model, loss_fn, optimizer)
+    test(test_dataloader, model, loss_fn)
+print("Done!")
\ No newline at end of file

diff --git a/images/pytorch-cuda12/tests/torch_optim.py b/images/pytorch-cuda12/tests/torch_optim.py
deleted file mode 100644
index a79c57fbde..0000000000
--- a/images/pytorch-cuda12/tests/torch_optim.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# -*- coding: utf-8 -*-
-import torch
-import math
-
-
-# Create Tensors to hold input and outputs.
-x = torch.linspace(-math.pi, math.pi, 2000)
-y = torch.sin(x)
-
-# Prepare the input tensor (x, x^2, x^3).
-p = torch.tensor([1, 2, 3])
-xx = x.unsqueeze(-1).pow(p)
-
-# Use the nn package to define our model and loss function.
-model = torch.nn.Sequential(
-    torch.nn.Linear(3, 1),
-    torch.nn.Flatten(0, 1)
-)
-loss_fn = torch.nn.MSELoss(reduction='sum')
-
-# Use the optim package to define an Optimizer that will update the weights of
-# the model for us. Here we will use RMSprop; the optim package contains many other
-# optimization algorithms. The first argument to the RMSprop constructor tells the
-# optimizer which Tensors it should update.
-learning_rate = 1e-3
-optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
-for t in range(2000):
-    # Forward pass: compute predicted y by passing x to the model.
-    y_pred = model(xx)
-
-    # Compute and print loss.
-    loss = loss_fn(y_pred, y)
-    if t % 100 == 99:
-        print(t, loss.item())
-
-    # Before the backward pass, use the optimizer object to zero all of the
-    # gradients for the variables it will update (which are the learnable
-    # weights of the model). This is because by default, gradients are
-    # accumulated in buffers( i.e, not overwritten) whenever .backward()
-    # is called. Checkout docs of torch.autograd.backward for more details.
-    optimizer.zero_grad()
-
-    # Backward pass: compute gradient of the loss with respect to model
-    # parameters
-    loss.backward()
-
-    # Calling the step function on an Optimizer makes an update to its
-    # parameters
-    optimizer.step()
-
-
-linear_layer = model[0]
-print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')
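Because this patch switches the entrypoint from `/bin/bash -c` to `/bin/bash`, arguments passed to `docker run` now go to bash directly, which is why the README command wraps the python invocation in `-c "..."`. As a minimal sketch of how the new quickstart test might be exercised locally under that entrypoint (the host path and the `--gpus all` flag are assumptions about the local checkout and hardware):

```bash
# Mount the repo's quickstart test into the container and run it via the
# new /bin/bash entrypoint; adjust the host path to your checkout.
docker run --rm -it \
  -v "$PWD/images/pytorch-cuda12/tests/quickstart.py:/tmp/quickstart.py" \
  --gpus all \
  cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python /tmp/quickstart.py"
```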
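The config comment assumes `torchvision-cuda12` pulls in torch, with the venv now at `/usr/share/torchvision/.venv/bin` on PATH. A quick sanity check of that assumption against the rebuilt image might look like the following (versions printed are whatever the image ships):

```bash
# Import both packages from the venv on PATH; a failure here would mean
# torchvision-cuda12 no longer provides torch as expected.
docker run --rm cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python -c 'import torch, torchvision; print(torch.__version__, torchvision.__version__)'"
```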