Minor fixes (#6)

* Add tensorboard logger * Minor * Update affine coupling * Fix flow block Add autoflake * Minor fixes * Add tests for trainer * Minor fixes * Update README.md
kashperova · Jan 2, 2025 · 1f89492 · 1f89492
1 parent d28101a
commit 1f89492
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@ todo.py
 ./runs
 ./samples
 ./.misc/notebooks
+./checkpoints
diff --git a/README.md b/README.md
@@ -0,0 +1,16 @@
+# PyTorch Glow: Generative Flow with Invertible 1x1 Convolutions
+
+![MIT License](https://img.shields.io/badge/License-MIT-blue.svg)
+[![Paper](https://img.shields.io/badge/ArXiv-Paper-red)](https://arxiv.org/abs/1807.03039)
+
+Glow is a normalizing flow model introduced by OpenAI that uses an invertible generative architecture.
+Glow’s flow blocks consist of three components: actnorm, invertible 1x1 convolutions, and affine coupling layers.
+<br><br>
+This repository contains the complete workflow for training and testing Glow. All code was developed during the GenAI UCU course.
+It includes:
+- model implementation from scratch
+- training script with Hydra configs
+- TensorBoard logging
+- DDP trainer
+- tests with pytest
+- CI using GitHub Actions
diff --git a/src/modules/trainer/ddp_trainer.py b/src/modules/trainer/ddp_trainer.py
@@ -40,8 +40,8 @@ def __init__(
             test_dataset=self.test_dataset,
         )
 
-    def train_epoch(self) -> float:
-        train_loss = super().train_epoch()
+    def train_epoch(self, epoch: int) -> float:
+        train_loss = super().train_epoch(epoch)
 
         loss_tensor = torch.tensor(train_loss, device=self.ddp.rank)
         dist.all_reduce(loss_tensor, op=dist.ReduceOp.SUM)
@@ -65,5 +65,5 @@ def train(self):
 
             for i in tqdm(range(self.train_config.epochs)):
                 self.ddp.set_train_epoch(i)
-                self.train_epoch()
+                self.train_epoch(i)
                 self.test_epoch()
diff --git a/src/modules/trainer/trainer.py b/src/modules/trainer/trainer.py
@@ -119,7 +119,7 @@ def train(self):
             self.model.module(images)
 
         for i in tqdm(range(self.train_config.n_epochs)):
-            train_loss = self.train_epoch(1)
+            train_loss = self.train_epoch(i)
             train_loss /= len(self.train_dataset)
 
             test_loss = self.test_epoch()
@@ -135,11 +135,11 @@ def save_checkpoint(self, epoch: int):
             os.makedirs(self.train_config.save_dir, exist_ok=True)
 
         torch.save(
-            self.model.state_dict(), f"{self.train_config.save_dir}/model_{epoch}.pt"
+            self.model.state_dict(), f"{self.train_config.save_dir}/model_{epoch}.bin"
         )
         torch.save(
             self.optimizer.state_dict(),
-            f"{self.train_config.save_dir}/optimizer_{epoch}.pt",
+            f"{self.train_config.save_dir}/optimizer_{epoch}.bin",
         )
 
     @torch.inference_mode()