From d931b9d62df613d6d980d41c169b77406a2a8848 Mon Sep 17 00:00:00 2001
From: srishtih
Date: Tue, 27 Feb 2024 13:20:09 -0800
Subject: [PATCH] refactor pytorch-cuda12 image to include torchvision (#2279)

---
 images/pytorch-cuda12/README.md            |   3 +-
 images/pytorch-cuda12/config/main.tf       |   7 +-
 images/pytorch-cuda12/main.tf              |   2 +-
 .../tests/pytorch-helm-install.sh          |   3 +-
 images/pytorch-cuda12/tests/quickstart.py  | 110 ++++++++++++++++++
 images/pytorch-cuda12/tests/torch_optim.py |  53 ---------
 6 files changed, 118 insertions(+), 60 deletions(-)
 create mode 100644 images/pytorch-cuda12/tests/quickstart.py
 delete mode 100644 images/pytorch-cuda12/tests/torch_optim.py

diff --git a/images/pytorch-cuda12/README.md b/images/pytorch-cuda12/README.md
index 974bbeded2..a3e714a33d 100644
--- a/images/pytorch-cuda12/README.md
+++ b/images/pytorch-cuda12/README.md
@@ -60,8 +60,9 @@ As a quick intro, we will use pytorch to create a very simple deep learning mode
 
 To run this script,
 
 ```bash
-docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest python /tmp/model_builder.py
+docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest -c "python /tmp/model_builder.py"
 ```
+The quickstart tutorial outlined [here](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html) can also be run using the tests/quickstart.py script, similar to the run above.
 
 ### Using Helm charts

diff --git a/images/pytorch-cuda12/config/main.tf b/images/pytorch-cuda12/config/main.tf
index d8e51a28c5..a31ecda3a9 100644
--- a/images/pytorch-cuda12/config/main.tf
+++ b/images/pytorch-cuda12/config/main.tf
@@ -7,7 +7,8 @@ terraform {
 variable "extra_packages" {
   description = "Additional packages to install."
   type        = list(string)
-  default     = ["pytorch-cuda12"]
+  # torchvision is currently built on top of torch and should include all the packages we expect from it
+  default     = ["torchvision-cuda12"]
 }
 
 variable "extra_repositories" {
@@ -37,10 +38,10 @@ output "config" {
     }
     accounts = module.accts.block
     environment = merge({
-      "PATH" : "/usr/share/pytorch/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+      "PATH" : "/usr/share/torchvision/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
     }, var.environment)
     entrypoint = {
-      command = "/bin/bash -c"
+      command = "/bin/bash"
     }
     archs = ["x86_64"]
   })

diff --git a/images/pytorch-cuda12/main.tf b/images/pytorch-cuda12/main.tf
index 514d1545bd..e1073dba67 100644
--- a/images/pytorch-cuda12/main.tf
+++ b/images/pytorch-cuda12/main.tf
@@ -10,7 +10,7 @@ variable "target_repository" {
 
 module "config" {
   source         = "./config"
-  extra_packages = ["pytorch-cuda12", "busybox", "bash"]
+  extra_packages = ["torchvision-cuda12", "busybox", "bash"]
 }
 
 module "latest" {

diff --git a/images/pytorch-cuda12/tests/pytorch-helm-install.sh b/images/pytorch-cuda12/tests/pytorch-helm-install.sh
index 734fecf1cd..a3e558c01c 100755
--- a/images/pytorch-cuda12/tests/pytorch-helm-install.sh
+++ b/images/pytorch-cuda12/tests/pytorch-helm-install.sh
@@ -10,12 +10,11 @@ set -o errexit -o nounset -o errtrace -o pipefail -x
 
 my_d=$(cd "${0%/*}" && pwd)
 
-test_script="/home/runner/work/images-private/images-private/images/pytorch/tests/torch_optim.py"
 exit_code=1
 
 run_scripts() {
   pod_name=$(kubectl get pods -n ${RELEASE_NAMESPACE} -l "app.kubernetes.io/instance=${RELEASE_NAME}" -o custom-columns=:metadata.name --no-headers | head -n 1)
-  kubectl cp "${my_d}/torch_optim.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
+  kubectl cp "${my_d}/quickstart.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
   kubectl exec "$pod_name" -n ${RELEASE_NAMESPACE} -- python /tmp/pytorch.py
   exit_code=$?
   if [ $exit_code -eq 0 ]; then

diff --git a/images/pytorch-cuda12/tests/quickstart.py b/images/pytorch-cuda12/tests/quickstart.py
new file mode 100644
index 0000000000..5113fb580c
--- /dev/null
+++ b/images/pytorch-cuda12/tests/quickstart.py
@@ -0,0 +1,110 @@
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision.transforms import ToTensor
+
+# Download training data from open datasets.
+training_data = datasets.FashionMNIST(
+    root="data",
+    train=True,
+    download=True,
+    transform=ToTensor(),
+)
+
+# Download test data from open datasets.
+test_data = datasets.FashionMNIST(
+    root="data",
+    train=False,
+    download=True,
+    transform=ToTensor(),
+)
+
+batch_size = 64
+
+# Create data loaders.
+train_dataloader = DataLoader(training_data, batch_size=batch_size)
+test_dataloader = DataLoader(test_data, batch_size=batch_size)
+
+for X, y in test_dataloader:
+    print(f"Shape of X [N, C, H, W]: {X.shape}")
+    print(f"Shape of y: {y.shape} {y.dtype}")
+    break
+
+# Get cpu, gpu or mps device for training.
+device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+print(f"Using {device} device")
+
+# Define model
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28*28, 512),
+            nn.ReLU(),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Linear(512, 10)
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+model = NeuralNetwork().to(device)
+print(model)
+
+
+loss_fn = nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+
+def train(dataloader, model, loss_fn, optimizer):
+    size = len(dataloader.dataset)
+    model.train()
+    for batch, (X, y) in enumerate(dataloader):
+        X, y = X.to(device), y.to(device)
+
+        # Compute prediction error
+        pred = model(X)
+        loss = loss_fn(pred, y)
+
+        # Backpropagation
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        if batch % 100 == 0:
+            loss, current = loss.item(), (batch + 1) * len(X)
+            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
+
+
+def test(dataloader, model, loss_fn):
+    size = len(dataloader.dataset)
+    num_batches = len(dataloader)
+    model.eval()
+    test_loss, correct = 0, 0
+    with torch.no_grad():
+        for X, y in dataloader:
+            X, y = X.to(device), y.to(device)
+            pred = model(X)
+            test_loss += loss_fn(pred, y).item()
+            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
+    test_loss /= num_batches
+    correct /= size
+    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
+
+
+epochs = 3
+for t in range(epochs):
+    print(f"Epoch {t+1}\n-------------------------------")
+    train(train_dataloader, model, loss_fn, optimizer)
+    test(test_dataloader, model, loss_fn)
+print("Done!")
\ No newline at end of file

diff --git a/images/pytorch-cuda12/tests/torch_optim.py b/images/pytorch-cuda12/tests/torch_optim.py
deleted file mode 100644
index a79c57fbde..0000000000
--- a/images/pytorch-cuda12/tests/torch_optim.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# -*- coding: utf-8 -*-
-import torch
-import math
-
-
-# Create Tensors to hold input and outputs.
-x = torch.linspace(-math.pi, math.pi, 2000)
-y = torch.sin(x)
-
-# Prepare the input tensor (x, x^2, x^3).
-p = torch.tensor([1, 2, 3])
-xx = x.unsqueeze(-1).pow(p)
-
-# Use the nn package to define our model and loss function.
-model = torch.nn.Sequential(
-    torch.nn.Linear(3, 1),
-    torch.nn.Flatten(0, 1)
-)
-loss_fn = torch.nn.MSELoss(reduction='sum')
-
-# Use the optim package to define an Optimizer that will update the weights of
-# the model for us. Here we will use RMSprop; the optim package contains many other
-# optimization algorithms. The first argument to the RMSprop constructor tells the
-# optimizer which Tensors it should update.
-learning_rate = 1e-3
-optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
-for t in range(2000):
-    # Forward pass: compute predicted y by passing x to the model.
-    y_pred = model(xx)
-
-    # Compute and print loss.
-    loss = loss_fn(y_pred, y)
-    if t % 100 == 99:
-        print(t, loss.item())
-
-    # Before the backward pass, use the optimizer object to zero all of the
-    # gradients for the variables it will update (which are the learnable
-    # weights of the model). This is because by default, gradients are
-    # accumulated in buffers( i.e, not overwritten) whenever .backward()
-    # is called. Checkout docs of torch.autograd.backward for more details.
-    optimizer.zero_grad()
-
-    # Backward pass: compute gradient of the loss with respect to model
-    # parameters
-    loss.backward()
-
-    # Calling the step function on an Optimizer makes an update to its
-    # parameters
-    optimizer.step()
-
-
-linear_layer = model[0]
-print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')
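Because this patch switches the entrypoint from `/bin/bash -c` to `/bin/bash`, arguments passed to `docker run` now go to bash directly, which is why the README command wraps the python invocation in `-c "..."`. As a minimal sketch of how the new quickstart test might be exercised locally under that entrypoint (the host path and the `--gpus all` flag are assumptions about the local checkout and hardware):

```bash
# Mount the repo's quickstart test into the container and run it via the
# new /bin/bash entrypoint; adjust the host path to your checkout.
docker run --rm -it \
  -v "$PWD/images/pytorch-cuda12/tests/quickstart.py:/tmp/quickstart.py" \
  --gpus all \
  cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python /tmp/quickstart.py"
```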
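The config comment assumes `torchvision-cuda12` pulls in torch, with the venv now at `/usr/share/torchvision/.venv/bin` on PATH. A quick sanity check of that assumption against the rebuilt image might look like the following (versions printed are whatever the image ships):

```bash
# Import both packages from the venv on PATH; a failure here would mean
# torchvision-cuda12 no longer provides torch as expected.
docker run --rm cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python -c 'import torch, torchvision; print(torch.__version__, torchvision.__version__)'"
```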