Make GPTQ test less flaky (#1295)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
huggingface · Nov 28, 2023 · 624800c · 624800c
1 parent ba552e1
commit 624800c
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 8 deletions.
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
@@ -24,6 +24,7 @@
 
 
 class ResponseComparator(JSONSnapshotExtension):
+    rtol = 0.2
     def serialize(
         self,
         data,
@@ -58,7 +59,7 @@ def eq_token(token: Token, other: Token) -> bool:
             return (
                 token.id == other.id
                 and token.text == other.text
-                and math.isclose(token.logprob, other.logprob, rel_tol=0.2)
+                and math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
                 and token.special == other.special
             )
 
@@ -68,7 +69,7 @@ def eq_prefill_token(prefill_token: InputToken, other: InputToken) -> bool:
                     prefill_token.id == other.id
                     and prefill_token.text == other.text
                     and (
-                        math.isclose(prefill_token.logprob, other.logprob, rel_tol=0.2)
+                        math.isclose(prefill_token.logprob, other.logprob, rel_tol=self.rtol)
                         if prefill_token.logprob is not None
                         else prefill_token.logprob == other.logprob
                     )
@@ -148,6 +149,10 @@ def eq_response(response: Response, other: Response) -> bool:
         )
 
 
+class GenerousResponseComparator(ResponseComparator):
+    # Needed for GPTQ with exllama which has serious numerical fluctuations.
+    rtol = 0.75
+
 class LauncherHandle:
     def __init__(self, port: int):
         self.client = AsyncClient(f"http://localhost:{port}")
@@ -193,6 +198,10 @@ def _inner_health(self) -> bool:
 def response_snapshot(snapshot):
     return snapshot.use_extension(ResponseComparator)
 
+@pytest.fixture
+def generous_response_snapshot(snapshot):
+    return snapshot.use_extension(GenerousResponseComparator)
+
 
 @pytest.fixture(scope="module")
 def event_loop():

diff --git a/integration-tests/models/test_flash_starcoder_gptq.py b/integration-tests/models/test_flash_starcoder_gptq.py
@@ -15,20 +15,20 @@ async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_starcoder_gptq(flash_starcoder_gptq, response_snapshot):
+async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot):
     response = await flash_starcoder_gptq.generate(
         "def geometric_mean(L: List[float]):",
         max_new_tokens=20,
         decoder_input_details=True,
     )
     assert response.details.generated_tokens == 20
-    assert response == response_snapshot
+    assert response == generous_response_snapshot
 
 
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_starcoder_gptq_default_params(
-    flash_starcoder_gptq, response_snapshot
+    flash_starcoder_gptq, generous_response_snapshot
 ):
     response = await flash_starcoder_gptq.generate(
         "def geometric_mean(L: List[float]):",
@@ -39,13 +39,13 @@ async def test_flash_starcoder_gptq_default_params(
         seed=0,
     )
     assert response.details.generated_tokens == 20
-    assert response == response_snapshot
+    assert response == generous_response_snapshot
 
 
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_starcoder_gptq_load(
-    flash_starcoder_gptq, generate_load, response_snapshot
+    flash_starcoder_gptq, generate_load, generous_response_snapshot
 ):
     responses = await generate_load(
         flash_starcoder_gptq,
@@ -57,4 +57,4 @@ async def test_flash_starcoder_gptq_load(
     assert len(responses) == 4
     assert all([r.generated_text == responses[0].generated_text for r in responses])
 
-    assert responses == response_snapshot
+    assert responses == generous_response_snapshot