diff --git a/.gitignore b/.gitignore index 2e93839..8469650 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ todo.py ./runs ./samples ./.misc/notebooks +checkpoints/ diff --git a/README.md b/README.md index e69de29..03051f2 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,16 @@ +# PyTorch Glow: Generative Flow with Invertible 1x1 Convolutions + +![MIT License](https://img.shields.io/badge/License-MIT-blue.svg) +[![Paper](https://img.shields.io/badge/ArXiv-Paper-red)](https://arxiv.org/abs/1807.03039) + +Glow is a normalizing flow model introduced by OpenAI that uses an invertible generative architecture. +Glow’s flow blocks consist of 3 components: actnorm, invertible 1x1 convolutions, and affine coupling layers. +

+This repository contains the complete workflow for training and testing Glow. All code was developed during the GenAI UCU course. +It provides: +- model implementation from scratch +- training script with Hydra configs +- TensorBoard logging +- DDP trainer +- tests with pytest +- CI using GitHub Actions diff --git a/src/modules/trainer/ddp_trainer.py b/src/modules/trainer/ddp_trainer.py index a3859f9..c3704f5 100644 --- a/src/modules/trainer/ddp_trainer.py +++ b/src/modules/trainer/ddp_trainer.py @@ -40,8 +40,8 @@ def __init__( test_dataset=self.test_dataset, ) - def train_epoch(self) -> float: - train_loss = super().train_epoch() + def train_epoch(self, epoch: int) -> float: + train_loss = super().train_epoch(epoch) loss_tensor = torch.tensor(train_loss, device=self.ddp.rank) dist.all_reduce(loss_tensor, op=dist.ReduceOp.SUM) @@ -65,5 +65,5 @@ def train(self): for i in tqdm(range(self.train_config.epochs)): self.ddp.set_train_epoch(i) - self.train_epoch() + self.train_epoch(i) self.test_epoch() diff --git a/src/modules/trainer/trainer.py b/src/modules/trainer/trainer.py index 6ef55c5..2097516 100644 --- a/src/modules/trainer/trainer.py +++ b/src/modules/trainer/trainer.py @@ -119,7 +119,7 @@ def train(self): self.model.module(images) for i in tqdm(range(self.train_config.n_epochs)): - train_loss = self.train_epoch(1) + train_loss = self.train_epoch(i) train_loss /= len(self.train_dataset) test_loss = self.test_epoch() @@ -135,11 +135,11 @@ def save_checkpoint(self, epoch: int): os.makedirs(self.train_config.save_dir, exist_ok=True) torch.save( - self.model.state_dict(), f"{self.train_config.save_dir}/model_{epoch}.pt" + self.model.state_dict(), f"{self.train_config.save_dir}/model_{epoch}.bin" ) torch.save( self.optimizer.state_dict(), - f"{self.train_config.save_dir}/optimizer_{epoch}.pt", + f"{self.train_config.save_dir}/optimizer_{epoch}.bin", ) @torch.inference_mode()