refactor pytorch-cuda12 image to include torchvision (#2279)
srishtih authored Feb 27, 2024
1 parent 4136733 commit d931b9d
Showing 6 changed files with 118 additions and 60 deletions.
3 changes: 2 additions & 1 deletion images/pytorch-cuda12/README.md
@@ -60,8 +60,9 @@ As a quick intro, we will use pytorch to create a very simple deep learning model

To run this script,
```bash
-docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest python /tmp/model_builder.py
+docker run --rm -it -v /home/srishihegde/quick.py:/tmp/model_builder.py --gpus all cgr.dev/chainguard/pytorch-cuda12:latest -c "python /tmp/model_builder.py"
```
+A quickstart tutorial as outlined [here](https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html) can also be run using the tests/quickstart.py script, similar to the invocation above.
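For illustration, running that quickstart test through the same entrypoint might look like the sketch below. The mount path assumes you are in a checkout of this repository; adjust it to wherever tests/quickstart.py lives on your machine.
```bash
# Hypothetical invocation: mount the repo's quickstart test into the container
# and pass the command string to the bash entrypoint via -c.
docker run --rm -it \
  -v "$PWD/images/pytorch-cuda12/tests/quickstart.py:/tmp/quickstart.py" \
  --gpus all \
  cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python /tmp/quickstart.py"
```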

### Using Helm charts

7 changes: 4 additions & 3 deletions images/pytorch-cuda12/config/main.tf
@@ -7,7 +7,8 @@ terraform {
variable "extra_packages" {
description = "Additional packages to install."
type = list(string)
default = ["pytorch-cuda12"]
# torchvision is currently built on top of torch and should include all the packages we expect from it
default = ["torchvision-cuda12"]
}

variable "extra_repositories" {
@@ -37,10 +38,10 @@ output "config" {
}
accounts = module.accts.block
environment = merge({
"PATH" : "/usr/share/pytorch/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"PATH" : "/usr/share/torchvision/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
}, var.environment)
entrypoint = {
command = "/bin/bash -c"
command = "/bin/bash"
}
archs = ["x86_64"]
})
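Because the entrypoint is now plain `/bin/bash` (without `-c`), a command string has to be passed explicitly with `-c`, and the PATH change means the torchvision venv should expose both torch and torchvision. A minimal smoke test along those lines, with the Python one-liner being illustrative rather than part of this change:
```bash
# Prints the torch/torchvision versions and whether CUDA is visible in the container.
docker run --rm --gpus all cgr.dev/chainguard/pytorch-cuda12:latest \
  -c "python -c 'import torch, torchvision; print(torch.__version__, torchvision.__version__, torch.cuda.is_available())'"
```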
2 changes: 1 addition & 1 deletion images/pytorch-cuda12/main.tf
@@ -10,7 +10,7 @@ variable "target_repository" {

module "config" {
source = "./config"
extra_packages = ["pytorch-cuda12", "busybox", "bash"]
extra_packages = ["torchvision-cuda12", "busybox", "bash"]
}

module "latest" {
3 changes: 1 addition & 2 deletions images/pytorch-cuda12/tests/pytorch-helm-install.sh
@@ -10,12 +10,11 @@
set -o errexit -o nounset -o errtrace -o pipefail -x

my_d=$(cd "${0%/*}" && pwd)
-test_script="/home/runner/work/images-private/images-private/images/pytorch/tests/torch_optim.py"
exit_code=1

run_scripts() {
pod_name=$(kubectl get pods -n ${RELEASE_NAMESPACE} -l "app.kubernetes.io/instance=${RELEASE_NAME}" -o custom-columns=:metadata.name --no-headers | head -n 1)
kubectl cp "${my_d}/torch_optim.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
kubectl cp "${my_d}/quickstart.py" "$pod_name":/tmp/pytorch.py -n ${RELEASE_NAMESPACE}
kubectl exec "$pod_name" -n ${RELEASE_NAMESPACE} -- python /tmp/pytorch.py
exit_code=$?
if [ $exit_code -eq 0 ]; then
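The updated test copies tests/quickstart.py into the running pod instead of the deleted torch_optim.py. As a rough sketch, exercising the script by hand could look like this, assuming a pytorch Helm release is already installed; the release name and namespace values below are hypothetical:
```bash
# Hypothetical values; the script only needs RELEASE_NAME and RELEASE_NAMESPACE
# to find the pod it copies quickstart.py into and runs it in.
export RELEASE_NAME=pytorch
export RELEASE_NAMESPACE=default
./images/pytorch-cuda12/tests/pytorch-helm-install.sh
```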
110 changes: 110 additions & 0 deletions images/pytorch-cuda12/tests/quickstart.py
@@ -0,0 +1,110 @@
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")
break

# Get cpu, gpu or mps device for training.
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


epochs = 3
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
test(test_dataloader, model, loss_fn)
print("Done!")
53 changes: 0 additions & 53 deletions images/pytorch-cuda12/tests/torch_optim.py

This file was deleted.
