From 287b49c9cde09fd9eb010b9d567f15ad13916318 Mon Sep 17 00:00:00 2001
From: Kye
Date: Fri, 7 Jul 2023 18:15:08 -0400
Subject: [PATCH] unit testing for pretraining

---
 testing/README.md | 22 +++++++++++++++
 testing/main.py   | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 testing/README.md
 create mode 100644 testing/main.py

diff --git a/testing/README.md b/testing/README.md
new file mode 100644
index 0000000..8fa999d
--- /dev/null
+++ b/testing/README.md
@@ -0,0 +1,22 @@
+To ensure the performance and reliability of the language model, we need to track a variety of metrics. These include both quantitative measures, such as accuracy, perplexity, speed, and memory consumption, and qualitative measures, such as the coherence, relevance, and versatility of the generated responses. Here is a list of potential metrics to consider:
+
+**1. Accuracy Metrics:**
+- **Perplexity**: A measure of how well the model predicts a sample. Lower values are better.
+- **BLEU (Bilingual Evaluation Understudy) Score**: Measures how many words overlap between the predicted and reference outputs, with particular emphasis on word order. It is most useful in tasks such as translation.
+- **ROUGE (Recall-Oriented Understudy for Gisting Evaluation) Score**: Measures the quality of summaries by counting overlapping units, such as n-grams, word sequences, and word pairs, between the generated and reference text.
+- **F1 Score**: The harmonic mean of precision (how many selected items are relevant) and recall (how many relevant items are selected).
+
+**2. Speed and Resource Metrics:**
+- **Latency**: The time it takes to generate a response after the input is given.
+- **Throughput**: The number of tasks the model can complete in a given time period.
+- **Memory consumption**: The amount of RAM consumed during the prediction phase.
+
+**3. Qualitative Metrics:**
+- **Coherence**: Whether the output makes sense.
+- **Relevance**: Whether the output is relevant to the input query.
+- **Versatility**: Whether the model can handle a variety of input types and still produce coherent, relevant output.
+
+
+This suite tests the model for speed (latency and throughput) and memory consumption. In addition, you should also run manual tests that evaluate the model's output on a variety of inputs for coherence, relevance, and versatility.
+
+Remember, there is no specific test for accuracy metrics such as perplexity, BLEU score, ROUGE score, or F1 score, because these are often task-specific and need to be evaluated on a per-task basis.
\ No newline at end of file
diff --git a/testing/main.py b/testing/main.py
new file mode 100644
index 0000000..6ac65f1
--- /dev/null
+++ b/testing/main.py
@@ -0,0 +1,69 @@
+import unittest
+import torch
+import time
+from Andromeda.model import Andromeda
+from Andromeda.utils.stable_adamw import StableAdamWUnfused
+
+class AndromedaTest(unittest.TestCase):
+
+    def setUp(self):
+        self.model = Andromeda().cuda()  # instantiate the model and move it to the GPU
+        self.optimizer = StableAdamWUnfused(self.model.parameters())
+        self.loss_function = torch.nn.CrossEntropyLoss()
+        self.test_input = torch.randint(0, 256, (1, 1024)).cuda()
+
+    def test_forward_pass(self):
+        # test that the model's forward pass works
+        output = self.model(self.test_input)
+        self.assertEqual(output.shape, (1, 1024, 64007))  # test that the output shape is correct
+
+    def test_backward_pass(self):
+        # test that the model's backward pass works correctly
+        self.optimizer.zero_grad()
+        output = self.model(self.test_input)
+        loss = self.loss_function(output.view(-1, output.size(-1)), self.test_input.view(-1))
+
+        loss.backward()
+        for name, parameter in self.model.named_parameters():
+            self.assertFalse(torch.isnan(parameter.grad).any(), f"Gradient for {name} contains NaNs")
+            self.assertFalse(torch.isinf(parameter.grad).any(), f"Gradient for {name} contains Infs")
+
+    def test_optimizer_step(self):
+        # test that an optimizer step updates the parameters
+        initial_params = [param.clone() for param in self.model.parameters()]
+        output = self.model(self.test_input)
+        loss = self.loss_function(output.view(-1, output.size(-1)), self.test_input.view(-1))
+
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+        for initial_param, param in zip(initial_params, self.model.parameters()):
+            self.assertFalse(torch.equal(initial_param, param), 'Model parameters did not change after an optimizer step')
+
+    def test_prediction(self):
+        start_time = time.time()
+        prediction = self.model(self.test_input)
+        latency = time.time() - start_time
+
+        self.assertLess(latency, 1)  # test that latency is less than 1 second
+        self.assertEqual(prediction.shape, (1, 1024, 64007))  # test that the output shape is correct
+
+
+    def test_memory_consumption(self):
+        start_mem = torch.cuda.memory_allocated()
+        prediction = self.model(self.test_input)
+        end_mem = torch.cuda.memory_allocated()
+        mem_diff = end_mem - start_mem
+        self.assertLess(mem_diff, 2 * 1024**3)  # memory growth should be less than 2 GB
+
+    def test_throughput(self):
+        start_time = time.time()
+        for _ in range(100):
+            prediction = self.model(self.test_input)
+        end_time = time.time()
+        throughput = 100 / (end_time - start_time)
+        self.assertGreater(throughput, 10)  # model should handle at least 10 inferences per second
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
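The README in this patch notes that accuracy metrics such as perplexity are not covered by the suite because they are task-specific. As a rough illustration of what such a check could look like, the sketch below computes perplexity as the exponential of the mean next-token cross-entropy, reusing the same random test input as testing/main.py. It assumes that Andromeda() builds a usable model with its default arguments and that the logits at position t predict the token at position t+1; the class name PerplexitySketch and the finite-value assertion are placeholders, since a meaningful threshold would have to come from a held-out corpus baseline.

import math
import unittest

import torch

from Andromeda.model import Andromeda


class PerplexitySketch(unittest.TestCase):
    """Illustrative sketch only; not part of the patch above."""

    def setUp(self):
        # Assumption: the default constructor yields a working model, as in testing/main.py
        self.model = Andromeda().cuda()
        self.loss_function = torch.nn.CrossEntropyLoss()
        self.test_input = torch.randint(0, 256, (1, 1024)).cuda()

    def test_perplexity_is_finite(self):
        with torch.no_grad():
            logits = self.model(self.test_input)  # expected shape: (1, 1024, 64007)
        # Shift so that logits at position t are scored against the token at t+1
        shifted_logits = logits[:, :-1, :].reshape(-1, logits.size(-1))
        shifted_targets = self.test_input[:, 1:].reshape(-1)
        loss = self.loss_function(shifted_logits, shifted_targets)
        perplexity = math.exp(loss.item())  # perplexity = exp(mean cross-entropy)
        self.assertTrue(math.isfinite(perplexity), "Perplexity should be a finite number")


if __name__ == "__main__":
    unittest.main()

On real data you would swap the random tokens for a tokenized evaluation corpus and compare the resulting perplexity against a task-specific baseline, which is the per-task evaluation the README recommends.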