refactor pytorch-cuda12 image to include torchvision (#2279)
srishtih authored Feb 27, 2024
1 parent 4136733 commit d931b9d
Showing 6 changed files with 118 additions and 60 deletions.
3 changes: 2 additions & 1 deletion images/pytorch-cuda12/README.md
@@ -60,8 +60,9 @@ As a quick intro, we will use pytorch to create a very simple deep learning model

To run this script,
```bash
-docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest python /tmp/model_builder.py
+docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest -c "python /tmp/model_builder.py"
```
+A quickstart tutorial as outlined [here](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html) can also be run using the tests/quickstart.py script, similar to the invocation above.
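For illustration, running that quickstart test through the same entrypoint might look like the sketch below. The mount path assumes you are in a checkout of this repository; adjust it to wherever tests/quickstart.py lives on your machine.
```bash
# Hypothetical invocation: mount the repo's quickstart test into the container
# and pass the command string to the bash entrypoint via -c.
docker run --rm -it \
  -v "$PWD/images/pytorch-cuda12/tests/quickstart.py:/tmp/quickstart.py" \
  --gpus all \
  cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python /tmp/quickstart.py"
```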

### Using Helm charts

7 changes: 4 additions & 3 deletions images/pytorch-cuda12/config/main.tf
@@ -7,7 +7,8 @@ terraform {
variable "extra_packages" {
description = "Additional packages to install."
type = list(string)
default = ["pytorch-cuda12"]
# torchvision is currently built on top of torch and should include all the packages we expect from it
default = ["torchvision-cuda12"]
}

variable "extra_repositories" {
@@ -37,10 +38,10 @@ output "config" {
}
accounts = module.accts.block
environment = merge({
"PATH" : "/usr/share/pytorch/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"PATH" : "/usr/share/torchvision/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
}, var.environment)
entrypoint = {
command = "/bin/bash -c"
command = "/bin/bash"
}
archs = ["x86_64"]
})
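Because the entrypoint is now plain `/bin/bash` (without `-c`), a command string has to be passed explicitly with `-c`, and the PATH change means the torchvision venv should expose both torch and torchvision. A minimal smoke test along those lines, with the Python one-liner being illustrative rather than part of this change:
```bash
# Prints the torch/torchvision versions and whether CUDA is visible in the container.
docker run --rm --gpus all cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python -c 'import torch, torchvision; print(torch.__version__, torchvision.__version__, torch.cuda.is_available())'"
```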
2 changes: 1 addition & 1 deletion images/pytorch-cuda12/main.tf
@@ -10,7 +10,7 @@ variable "target_repository" {

module "config" {
source = "./config"
extra_packages = ["pytorch-cuda12", "busybox", "bash"]
extra_packages = ["torchvision-cuda12", "busybox", "bash"]
}

module "latest" {
3 changes: 1 addition & 2 deletions images/pytorch-cuda12/tests/pytorch-helm-install.sh
@@ -10,12 +10,11 @@
set -o errexit -o nounset -o errtrace -o pipefail -x

my_d=$(cd "${0%/*}" && pwd)
-test_script="/home/runner/work/images-private/images-private/images/pytorch/tests/torch_optim.py"
exit_code=1

run_scripts() {
pod_name=$(kubectl get pods -n ${RELEASE_NAMESPACE} -l "app.kubernetes.io/instance=${RELEASE_NAME}" -o custom-columns=:metadata.name --no-headers | head -n 1)
kubectl cp "${my_d}/torch_optim.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
kubectl cp "${my_d}/quickstart.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
kubectl exec "$pod_name" -n ${RELEASE_NAMESPACE} -- python /tmp/pytorch.py
exit_code=$?
if [ $exit_code -eq 0 ]; then
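The updated test copies tests/quickstart.py into the running pod instead of the deleted torch_optim.py. As a rough sketch, exercising the script by hand could look like this, assuming a pytorch Helm release is already installed; the release name and namespace values below are hypothetical:
```bash
# Hypothetical values; the script only needs RELEASE_NAME and RELEASE_NAMESPACE
# to find the pod it copies quickstart.py into and runs it in.
export RELEASE_NAME=pytorch
export RELEASE_NAMESPACE=default
./images/pytorch-cuda12/tests/pytorch-helm-install.sh
```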
110 changes: 110 additions & 0 deletions images/pytorch-cuda12/tests/quickstart.py
@@ -0,0 +1,110 @@
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")
break

# Get cpu, gpu or mps device for training.
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


epochs = 3
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
test(test_dataloader, model, loss_fn)
print("Done!")
53 changes: 0 additions & 53 deletions images/pytorch-cuda12/tests/torch_optim.py

This file was deleted.
