Add progress bar for CTGAN fitting #317

Merged 4 commits on Sep 29, 2023
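For context, a minimal usage sketch (not part of the diff itself) of the feature this PR adds: fitting with verbose=True shows a tqdm progress bar whose description tracks the current generator and discriminator losses, and the per-epoch losses are recorded in the new loss_values DataFrame. The toy data below is hypothetical.

import numpy as np
import pandas as pd

from ctgan import CTGAN

# Hypothetical toy data: one continuous and one discrete column.
data = pd.DataFrame({
    'continuous': np.random.normal(size=1000),
    'discrete': np.random.choice(['a', 'b', 'c'], size=1000),
})

# verbose=True enables the tqdm progress bar added by this PR.
ctgan = CTGAN(epochs=10, verbose=True)
ctgan.fit(data, discrete_columns=['discrete'])

# loss_values holds one row per epoch with columns
# ['Epoch', 'Generator Loss', 'Discriminator Loss'].
print(ctgan.loss_values.tail())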
33 changes: 29 additions & 4 deletions ctgan/synthesizers/ctgan.py
@@ -7,6 +7,7 @@
import torch
from torch import optim
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional
from tqdm import tqdm

from ctgan.data_sampler import DataSampler
from ctgan.data_transformer import DataTransformer
@@ -175,6 +176,8 @@ def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_di
self._data_sampler = None
self._generator = None

self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])

@staticmethod
def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
"""Deals with the instability of the gumbel_softmax for older versions of torch.
@@ -335,8 +338,15 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
std = mean + 1

self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])

epoch_iterator = tqdm(range(epochs), disable=(not self._verbose))
if self._verbose:
description = 'Gen. ({gen:.2f}) | Discrim. ({dis:.2f})'
epoch_iterator.set_description(description.format(gen=0, dis=0))

steps_per_epoch = max(len(train_data) // self._batch_size, 1)
for i in range(epochs):
for i in epoch_iterator:
for id_ in range(steps_per_epoch):

for n in range(self._discriminator_steps):
@@ -412,10 +422,25 @@ def fit(self, train_data, discrete_columns=(), epochs=None):
loss_g.backward()
optimizerG.step()

generator_loss = loss_g.detach().cpu()
discriminator_loss = loss_d.detach().cpu()

epoch_loss_df = pd.DataFrame({
'Epoch': [i],
'Generator Loss': [generator_loss],
'Discriminator Loss': [discriminator_loss]
})
if not self.loss_values.empty:
self.loss_values = pd.concat(
[self.loss_values, epoch_loss_df]
).reset_index(drop=True)
else:
self.loss_values = epoch_loss_df

if self._verbose:
print(f'Epoch {i+1}, Loss G: {loss_g.detach().cpu(): .4f},' # noqa: T001
f'Loss D: {loss_d.detach().cpu(): .4f}',
flush=True)
epoch_iterator.set_description(
description.format(gen=generator_loss, dis=discriminator_loss)
)

@random_state
def sample(self, n, condition_column=None, condition_value=None):
1 change: 1 addition & 0 deletions setup.py
@@ -21,6 +21,7 @@
"torch>=1.8.0;python_version<'3.10'",
"torch>=1.11.0;python_version>='3.10' and python_version<'3.11'",
"torch>=2.0.0;python_version>='3.11'",
'tqdm>=4.15,<5',
'rdt>=1.6.1,<2.0',
]

16 changes: 16 additions & 0 deletions tests/integration/synthesizer/test_ctgan.py
@@ -32,6 +32,8 @@ def test_ctgan_no_categoricals():
assert sampled.shape == (100, 1)
assert isinstance(sampled, pd.DataFrame)
assert set(sampled.columns) == {'continuous'}
assert len(ctgan.loss_values) == 1
assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']


def test_ctgan_dataframe():
@@ -51,6 +53,8 @@ def test_ctgan_dataframe():
assert isinstance(sampled, pd.DataFrame)
assert set(sampled.columns) == {'continuous', 'discrete'}
assert set(sampled['discrete'].unique()) == {'a', 'b', 'c'}
assert len(ctgan.loss_values) == 1
assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']


def test_ctgan_numpy():
@@ -69,6 +73,8 @@ def test_ctgan_numpy():
assert sampled.shape == (100, 2)
assert isinstance(sampled, np.ndarray)
assert set(np.unique(sampled[:, 1])) == {'a', 'b', 'c'}
assert len(ctgan.loss_values) == 1
assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']


def test_log_frequency():
@@ -83,13 +89,23 @@ def test_log_frequency():
ctgan = CTGAN(epochs=100)
ctgan.fit(data, discrete_columns)

assert len(ctgan.loss_values) == 100
assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
pd.testing.assert_series_equal(ctgan.loss_values['Epoch'],
pd.Series(range(100), name='Epoch'))

sampled = ctgan.sample(10000)
counts = sampled['discrete'].value_counts()
assert counts['a'] < 6500

ctgan = CTGAN(log_frequency=False, epochs=100)
ctgan.fit(data, discrete_columns)

assert len(ctgan.loss_values) == 100
assert list(ctgan.loss_values.columns) == ['Epoch', 'Generator Loss', 'Discriminator Loss']
pd.testing.assert_series_equal(ctgan.loss_values['Epoch'],
pd.Series(range(100), name='Epoch'))

sampled = ctgan.sample(10000)
counts = sampled['discrete'].value_counts()
assert counts['a'] > 9000