
Commit

async add print logic as well
aldo authored and aldo committed Jun 21, 2024
1 parent 2731161 commit d4342a2
Showing 2 changed files with 24 additions and 23 deletions.
1 change: 1 addition & 0 deletions src/instructlab/training/async_logger.py
@@ -37,6 +37,7 @@ async def log(self, data):
data['timestamp'] = datetime.now().isoformat()
self.logs.append(data)
await self._write_logs_to_file(data)
{{ print(f"\033[92m{json.dumps(data, indent=4)}\033[0m") }}

Check warning on line 40 in src/instructlab/training/async_logger.py (GitHub Actions / lint): C0303: Trailing whitespace (trailing-whitespace)

Check failure on line 40 in src/instructlab/training/async_logger.py (GitHub Actions / lint): E1143: '{print(f'\x1b[92m{json.dumps(data, indent=4)}\x1b[0m')}' is unhashable and can't be used as a member in a set (unhashable-member)

async def _write_logs_to_file(self, data):
'''appends to the log instead of writing the whole log each time'''
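
The E1143 failure points at the double braces around the added print call: {{ ... }} parses as a set literal nested inside another set literal, the inner set is unhashable, and the same line also carries the trailing whitespace flagged by C0303. A minimal sketch with the call unwrapped is shown below; the class name, constructor, and file-append body are illustrative assumptions, while the two method bodies mirror what the diff shows:

# Sketch only: class name, __init__, and the append implementation are
# illustrative; the log() body mirrors the diff with the {{ ... }} removed.
import json
from datetime import datetime


class AsyncLoggerSketch:
    def __init__(self, log_file="training_log.jsonl"):
        self.log_file = log_file
        self.logs = []

    async def log(self, data):
        data["timestamp"] = datetime.now().isoformat()
        self.logs.append(data)
        await self._write_logs_to_file(data)
        # plain call: no surrounding braces, no trailing whitespace
        print(f"\033[92m{json.dumps(data, indent=4)}\033[0m")

    async def _write_logs_to_file(self, data):
        '''appends to the log instead of writing the whole log each time'''
        # illustrative synchronous append; the real method may use async file I/O
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(data) + "\n")
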
46 changes: 23 additions & 23 deletions src/instructlab/training/main_ds.py
@@ -401,18 +401,18 @@ def train(args, model, tokenizer, train_loader, grad_accum, metric_logger):
)
weight_norm = float(model.optimizer.single_partition_of_fp32_groups[0].norm())

print(
f"throughput: {overall_throughput} "
f"samples/s, lr: {current_lr}, "
f"loss: {loss.item()} "
f"cuda_mem_allocated: {cuda_mem_allocated} GB "
f"cuda_malloc_retries: {cuda_malloc_retries} "
f"num_loss_counted_tokens: {num_loss_counted_tokens} "
f"batch_size: {aggregated_values[1]} "
f"total loss: {aggregated_values[2]/num_loss_counted_tokens} "
f"gradnorm: {global_grad_norm} "
f"weight_norm: {weight_norm}"
)
# print(
# f"throughput: {overall_throughput} "
# f"samples/s, lr: {current_lr}, "
# f"loss: {loss.item()} "
# f"cuda_mem_allocated: {cuda_mem_allocated} GB "
# f"cuda_malloc_retries: {cuda_malloc_retries} "
# f"num_loss_counted_tokens: {num_loss_counted_tokens} "
# f"batch_size: {aggregated_values[1]} "
# f"total loss: {aggregated_values[2]/num_loss_counted_tokens} "
# f"gradnorm: {global_grad_norm} "
# f"weight_norm: {weight_norm}"
# )
metric_logger.log_sync(
{
"epoch": epoch,
@@ -510,17 +510,17 @@ def main(args):
)

if args.local_rank == 0:
print(
f"\033[96mnum_gpus: {torch.distributed.get_world_size()}\n"
f"avg_sample_len: {dataset.get_lengths().mean()}\n"
f"effective_batch_size: {args.effective_batch_size}\n"
f"max_batch_len_per_gpu: {args.max_batch_len}\n"
f"packing_max_batch_len: {packing_max_batch_len}\n"
f"grad_accum: {grad_accum}\n"
f"num batches: {len(train_loader)}\n"
f"avg_samples_per_batch: {len(dataset)/len(train_loader)}\n"
f"samples_per_gpu: {args.samples_per_gpu}\033[0m"
)
# print(
# f"\033[96mnum_gpus: {torch.distributed.get_world_size()}\n"
# f"avg_sample_len: {dataset.get_lengths().mean()}\n"
# f"effective_batch_size: {args.effective_batch_size}\n"
# f"max_batch_len_per_gpu: {args.max_batch_len}\n"
# f"packing_max_batch_len: {packing_max_batch_len}\n"
# f"grad_accum: {grad_accum}\n"
# f"num batches: {len(train_loader)}\n"
# f"avg_samples_per_batch: {len(dataset)/len(train_loader)}\n"
# f"samples_per_gpu: {args.samples_per_gpu}\033[0m"
# )
metric_logger.log_sync({
'num_gpus': torch.distributed.get_world_size(),
'avg_sample_len': dataset.get_lengths().mean(),
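
This payload is also cut off by the diff. A sketch continuing it with the remaining run-configuration fields reported by the commented-out print; the keys past the two visible ones are illustrative:

# Illustrative continuation of the visible payload; keys beyond the first two
# are guesses based on the commented-out print and may differ in the repo.
metric_logger.log_sync({
    'num_gpus': torch.distributed.get_world_size(),
    'avg_sample_len': dataset.get_lengths().mean(),
    'effective_batch_size': args.effective_batch_size,
    'max_batch_len_per_gpu': args.max_batch_len,
    'packing_max_batch_len': packing_max_batch_len,
    'grad_accum': grad_accum,
    'num_batches': len(train_loader),
    'avg_samples_per_batch': len(dataset) / len(train_loader),
    'samples_per_gpu': args.samples_per_gpu,
})
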
