diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 2525f72cd14..7dbfc627c20 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,73 +1,469 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, + "finish_reason": "eos_token", + "generated_tokens": 76, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.8769531, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16027832, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2294922, "special": false, "text": " uses" + }, + { + "id": 29728, + "logprob": -0.66503906, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": -0.02960205, + "special": false, + "text": " networks" + }, + { + "id": 311, + "logprob": -0.7236328, + "special": false, + "text": " to" + }, + { + "id": 3960, + "logprob": -1.1914062, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.7089844, + "special": false, + "text": " from" + }, + { + "id": 821, + "logprob": -0.7729492, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.7836914, + "special": false, + "text": "." 
+ }, + { + "id": 1084, + "logprob": -0.9941406, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.52441406, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.9511719, + "special": false, + "text": " a" + }, + { + "id": 943, + "logprob": -0.8642578, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00030231476, + "special": false, + "text": " of" + }, + { + "id": 20443, + "logprob": -0.14416504, + "special": false, + "text": " artificial" + }, + { + "id": 11229, + "logprob": -0.013824463, + "special": false, + "text": " intelligence" + }, + { + "id": 429, + "logprob": -0.18762207, + "special": false, + "text": " that" + }, + { + "id": 646, + "logprob": -1.0087891, + "special": false, + "text": " can" + }, + { + "id": 3960, + "logprob": -0.90234375, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.54345703, + "special": false, + "text": " from" + }, + { + "id": 323, + "logprob": -1.0400391, + "special": false, + "text": " and" + }, + { + "id": 1281, + "logprob": -0.072509766, + "special": false, + "text": " make" + }, + { + "id": 19898, + "logprob": -0.16516113, + "special": false, + "text": " predictions" + }, + { + "id": 389, + "logprob": -0.4416504, + "special": false, + "text": " on" + }, + { + "id": 3460, + "logprob": -0.5385742, + "special": false, + "text": " large" + }, + { + "id": 14713, + "logprob": -0.4387207, + "special": false, + "text": " amounts" + }, + { + "id": 315, + "logprob": -0.00015091896, + "special": false, + "text": " of" + }, + { + "id": 821, + "logprob": -0.061431885, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.71875, + "special": false, + "text": "." + }, + { + "id": 18183, + "logprob": -0.23632812, + "special": false, + "text": " Deep" + }, + { + "id": 6832, + "logprob": -0.0017204285, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -1.1738281, + "special": false, + "text": " is" + }, + { + "id": 1483, + "logprob": -0.61083984, + "special": false, + "text": " used" + }, + { + "id": 304, + "logprob": -0.035003662, + "special": false, + "text": " in" + }, + { + "id": 264, + "logprob": -0.118652344, + "special": false, + "text": " a" + }, + { + "id": 8045, + "logprob": -0.42016602, + "special": false, + "text": " variety" + }, + { + "id": 315, + "logprob": -1.6212463e-05, + "special": false, + "text": " of" + }, + { + "id": 8357, + "logprob": -0.1315918, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.12915039, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.12463379, + "special": false, + "text": " including" + }, + { + "id": 2168, + "logprob": -0.37402344, + "special": false, + "text": " image" + }, + { + "id": 323, + "logprob": -0.1451416, + "special": false, + "text": " and" + }, + { + "id": 8806, + "logprob": -0.028869629, + "special": false, + "text": " speech" + }, + { + "id": 17843, + "logprob": -0.00024068356, + "special": false, + "text": " recognition" + }, + { + "id": 11, + "logprob": -0.00031018257, + "special": false, + "text": "," + }, + { + "id": 5810, + "logprob": -0.019821167, + "special": false, + "text": " natural" + }, + { + "id": 4128, + "logprob": -0.00012528896, + "special": false, + "text": " language" + }, + { + "id": 8692, + "logprob": -0.00089263916, + "special": false, + "text": " processing" + }, + { + "id": 11, + "logprob": -0.00073862076, + "special": false, + "text": "," + }, + { + "id": 323, + "logprob": 
-0.040161133, + "special": false, + "text": " and" + }, + { + "id": 38193, + "logprob": -0.4519043, + "special": false, + "text": " autonomous" + }, + { + "id": 11474, + "logprob": -0.39941406, + "special": false, + "text": " vehicles" + }, + { + "id": 13, + "logprob": -0.21166992, + "special": false, + "text": "." + }, + { + "id": 1084, + "logprob": -0.9082031, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.44213867, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -1.2177734, + "special": false, + "text": " a" + }, + { + "id": 18512, + "logprob": -0.5205078, + "special": false, + "text": " rapidly" + }, + { + "id": 7826, + "logprob": -0.15332031, + "special": false, + "text": " growing" + }, + { + "id": 2070, + "logprob": -0.0039978027, + "special": false, + "text": " field" + }, + { + "id": 448, + "logprob": -0.9091797, + "special": false, + "text": " with" + }, + { + "id": 1657, + "logprob": -0.17114258, + "special": false, + "text": " many" + }, + { + "id": 4650, + "logprob": -0.70703125, + "special": false, + "text": " potential" + }, + { + "id": 8357, + "logprob": -0.025131226, + "special": false, + "text": " applications" + }, + { + "id": 304, + "logprob": -0.6699219, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.35205078, + "special": false, + "text": " the" + }, + { + "id": 3853, + "logprob": -0.049194336, + "special": false, + "text": " future" + }, + { + "id": 13, + "logprob": -0.21972656, + "special": false, + "text": "." + }, + { + "id": 151643, + "logprob": -2.0019531, + "special": true, + "text": "<|endoftext|>" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses" + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 6b3f5092917..2c840e67124 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -7,67 +7,67 @@ "seed": 0, "tokens": [ { - "id": 1939, - "logprob": -2.2460938, + "id": 5267, + "logprob": -1.1464844, "special": false, - "text": "?\n\n" + "text": "?\n" }, { "id": 33464, - "logprob": 0.0, + "logprob": -0.83203125, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.48608398, + "logprob": -0.5625, "special": false, "text": " Learning" }, { - "id": 4102, - "logprob": -2.265625, + "id": 320, + "logprob": -2.1464844, "special": false, - "text": " " + "text": " (" }, { - "id": 285, + "id": 16524, "logprob": 0.0, "special": false, - "text": "is" + "text": "DL" }, { - "id": 458, - "logprob": -0.6328125, + "id": 701, + "logprob": -2.2089844, "special": false, - "text": " an" + "text": ")," }, { - "id": 20443, - "logprob": -0.1796875, + "id": 476, + "logprob": -0.27368164, "special": false, - "text": " artificial" + "text": " or" }, { - "id": 11229, - "logprob": 0.0, + "id": 20443, + "logprob": -0.09442139, "special": false, - "text": " intelligence" + "text": " artificial" }, { - "id": 320, - "logprob": -0.37695312, + "id": 29728, + "logprob": 0.0, "special": false, - "text": " (" + "text": " neural" }, { - "id": 15469, + "id": 14155, "logprob": 0.0, "special": false, - "text": "AI" + "text": " networks" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index 1fa4e33aa05..aee5698b474 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 
6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } diff --git 
a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json index 131631e65c8..49f332252bc 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1730164250, + "created": 1737645979, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.0.2-dev0-native", "usage": { "completion_tokens": 58, - "prompt_tokens": 349, - "total_tokens": 407 + "prompt_tokens": 1364, + "total_tokens": 1422 } } diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json index 3e2faca714d..3dc8fc6d6d5 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1730416361, + "created": 1737646031, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.0.2-dev0-native", "usage": null } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index a0b0416b861..17e12c221c2 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( ): response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", - max_new_tokens=10, + # prefer a longer response than the default, allow the llm to end generation + max_new_tokens=1000, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses" + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
) - assert response.details.generated_tokens == 10 + assert response.details.generated_tokens == 76 assert response == response_snapshot @@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" ) assert response == response_snapshot diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py index 97a533fc5d4..dacd92a87b3 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -1,81 +1,78 @@ -# Disabled because it's broken. -# import pytest -# -# -# @pytest.fixture(scope="module") -# def flash_qwen2_vl_handle(launcher): -# with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: -# yield handle -# -# -# @pytest.fixture(scope="module") -# async def flash_qwen2(flash_qwen2_vl_handle): -# await flash_qwen2_vl_handle.health(300) -# return flash_qwen2_vl_handle.client -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): -# response = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# ) -# -# assert ( -# response.choices[0].message.content -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." -# ) -# -# assert response == response_snapshot -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): -# responses = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# stream=True, -# ) -# -# count = 0 -# generated = "" -# last_response = None -# async for response in responses: -# count += 1 -# generated += response.choices[0].delta.content -# last_response = response -# -# assert ( -# generated -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." 
-# ) -# assert count == 58 -# assert last_response == response_snapshot +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_vl_handle(launcher): + with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2(flash_qwen2_vl_handle): + await flash_qwen2_vl_handle.health(300) + return flash_qwen2_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + ) + + assert ( + response.choices[0].message.content + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): + responses = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + stream=True, + ) + + count = 0 + generated = "" + last_response = None + async for response in responses: + count += 1 + generated += response.choices[0].delta.content + last_response = response + + assert ( + generated + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + assert count == 58 + assert last_response == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 394cc1e641e..597ebdde510 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -230,7 +230,14 @@ struct QuantizationConfig { } #[derive(Debug, Deserialize)] -struct VisionConfig {} +struct VisionConfig { + depth: Option<usize>, + embed_dim: Option<usize>, + mlp_ratio: Option<usize>, + in_chans: Option<usize>, + patch_size: Option<usize>, + temporal_patch_size: Option<usize>, +} #[derive(Debug, Deserialize)] struct Config { @@ -253,11 +260,6 @@ struct Config { impl Config { fn flop(&self) -> Option<u64> { - if self.vision_config.is_some() { - // VLM are much harder to predict and VRAM requirements - // Are more complex. - return None; - } let num_heads = self.num_heads? as u64; let num_kv_heads = self.num_kv_heads? as u64; let head_dim = self.head_dim?
as u64; @@ -277,8 +279,50 @@ impl Config { let gate_up_down_flops = 2 * 3 * hidden_size * intermediate_size; let layer_flops = attn_layer_flops + gate_up_down_flops; - let total = layer_flops * num_layers; - Some(total) + let text_flops = layer_flops * num_layers; + + tracing::debug!("Text flops: {}", human_size(text_flops as usize, "flop")); + + // text-only case + if self.vision_config.is_none() { + return Some(text_flops); + } + + let vision_config = self.vision_config.as_ref().unwrap(); + + // estimate vision flops for specific model types + match self.model_type.as_deref() { + Some("qwen2_vl") => { + let in_chans = vision_config.in_chans? as u64; + let patch_size = vision_config.patch_size? as u64; + let embed_dim = vision_config.embed_dim? as u64; + let vision_depth = vision_config.depth? as u64; + let mlp_ratio = vision_config.mlp_ratio? as u64; + let temporal_patch_size = vision_config.temporal_patch_size? as u64; + // 1. patch embedding: + // - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2 + // where the 2 accounts for multiply-add + let patch_flops = + 2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans; + // 2. self-attention + mlp: + // - qkv projections: 3 * d_model * d_model * 2 + // - attention: d_model * d_model * 2 + // - mlp: 2 * d_model * (mlp_ratio * d_model) * 2 + // simplified to: 2 * d_model * (4 + mlp_ratio * d_model) + let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim); + // 3. add with layer norm flops for total vision layer flops + let layer_flops = patch_flops + attn_flops + 2 * embed_dim; + let vision_flops = layer_flops * vision_depth; + tracing::debug!( + "Vision flops: {}", + human_size(vision_flops as usize, "flop") + ); + Some(text_flops + vision_flops) + } + // model has a vision config but is not supported for flops calculation + // we return None to avoid overestimating the memory requirements + _ => None, + } } fn kv_vram_per_tok(&self) -> Option { diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index e346d0f8946..9f1770ff6b0 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -86,6 +86,16 @@ def static(cls, config, dim, base, device): # `rope_type` is now standard in transformers, but some existing models # have `type` instead. 
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) + mrope_section = rope_scaling.get("mrope_section", None) + + # only apply mrope if sections are provided and the rope type is mrope or default + if mrope_section is not None and ( + rope_type == "mrope" or rope_type == "default" + ): + mrope_section = rope_scaling.get("mrope_section") + return RotaryPositionEmbeddingMultimodalSections( + inv_freq, scaling_factor, mrope_section + ) if rope_type == "linear": pass @@ -548,3 +558,76 @@ def apply_llama3_scaling( new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + +class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): + def __init__(self, inv_freq, scaling_factor, sections): + super().__init__(inv_freq, scaling_factor) + # expand the inv_freq for the 3 sections + self.inv_freq_exp = inv_freq[None, None, :, None].expand(3, -1, -1, 1) + self.sections = sections * 2 + self._cos_cached = None + self._sin_cached = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # process multi-modal rotary embeddings + split_cos, split_sin = [ + torch.split(t, self.sections, dim=-1) for t in (cos, sin) + ] + cos = torch.cat([m[i % 3] for i, m in enumerate(split_cos)], dim=-1).unsqueeze( + 1 + ) + sin = torch.cat([m[i % 3] for i, m in enumerate(split_sin)], dim=-1).unsqueeze( + 1 + ) + # prepare input tensors + q, k = [x.transpose(0, 1).unsqueeze(0) for x in (query, key)] + rotary_dim = cos.shape[-1] + q1, k1 = q[..., :rotary_dim], k[..., :rotary_dim] + q2 = torch.cat((-q[..., rotary_dim // 2 :], q[..., : rotary_dim // 2]), dim=-1) + k2 = torch.cat((-k[..., rotary_dim // 2 :], k[..., : rotary_dim // 2]), dim=-1) + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, True) + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, True) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # always cache the cos/sin for the full sequence length to avoid + # recomputing if the sequence length is smaller than the cached one + if ( + seqlen > self._seq_len_cached + or self._cos_cached_exp.device != device + or self._cos_cached_exp.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + freqs = freqs.expand(3, -1, -1) + self._cos_cached_exp = freqs.cos().to(dtype) + self._sin_cached_exp = freqs.sin().to(dtype) + + def get_cos_sin( + self, + position_ids: torch.Tensor, + max_s: int, + dtype: torch.dtype, + ): + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + # expand the position_ids to match the shape of the cached cos/sin + indices = ( + position_ids.squeeze(1) + .unsqueeze(-1) + .expand(-1, -1, self._cos_cached_exp.shape[-1]) + ) + cos_c = torch.gather(self._cos_cached_exp, 1, indices) + cos_c = torch.cat([cos_c, cos_c], dim=-1).unsqueeze(1) + + sin_c = torch.gather(self._sin_cached_exp, 1, indices) + sin_c = torch.cat([sin_c, sin_c], dim=-1).unsqueeze(1) + + return cos_c, sin_c diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 3b437ce0492..2dbe206742a 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1249,6 +1249,7 @@ def get_model( quantize=quantize, speculator=speculator, dtype=dtype, + default_dtype=torch.bfloat16, 
kv_cache_dtype=kv_cache_dtype, trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index cc4039b1cbc..78ae3020cb8 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -61,11 +61,6 @@ def __init__( config.sliding_window if config.sliding_window is not None else -1 ) self.num_heads = config.num_attention_heads - self.mrope_section = ( - config.rope_scaling.get("mrope_section", None) - if config.rope_scaling is not None - else None - ) self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads @@ -127,17 +122,6 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) - if self.mrope_section is not None: - # if mrope_section is set, we need to split the cos and sin into 3 parts and concatenate them in a specific order - cos = torch.cat( - [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], - dim=-1, - ) - sin = torch.cat( - [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], - dim=-1, - ) - self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) if prefill_cache_indices is not None: @@ -251,7 +235,8 @@ def forward( max_s, prefill_cache_indices, ): - normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + residual = hidden_states + normed_hidden_states, _ = self.input_layernorm(hidden_states) # Self Attention attn_output = self.self_attn( @@ -266,15 +251,14 @@ def forward( max_s, prefill_cache_indices, ) + hidden_states = attn_output + residual # faster post attention rms norm - normed_attn_res_output, attn_res = self.post_attention_layernorm( - attn_output, res - ) - - mlp_output = self.mlp(normed_attn_res_output) - - return mlp_output, attn_res + residual = hidden_states + hidden_states, _ = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(hidden_states) + hidden_states = mlp_output + residual + return hidden_states class Qwen2Model(torch.nn.Module): @@ -322,18 +306,15 @@ def forward( ) -> torch.Tensor: hidden_states = inputs_embeds - # flatten position ids from 2D to 1D cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids.flatten(), true_max_s, hidden_states.dtype + position_ids, + true_max_s, + hidden_states.dtype, ) - # reshape back to 2D if the position_ids were 2D - if position_ids.size(0) != cos.size(0): - cos = cos.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) - sin = sin.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) residual = None for i, layer in enumerate(self.layers): - hidden_states, residual = layer( + hidden_states = layer( hidden_states, residual, cos, @@ -347,7 +328,7 @@ def forward( prefill_cache_indices, ) - hidden_states, _ = self.norm(hidden_states, residual) + hidden_states, _ = self.norm(hidden_states) return hidden_states diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index a8e1e8c1593..e0ae19df766 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -222,12 +222,11 @@ def __init__(self, prefix, config, 
weights): def forward( self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen ) -> torch.Tensor: - hidden_states_post_norm1, res = self.norm1(hidden_states) - hidden_states = hidden_states + self.attn( - hidden_states_post_norm1, cu_seqlens, rotary_pos_emb, max_seqlen - ) - hidden_states_post_norm2, res = self.norm2(hidden_states) - hidden_states = hidden_states + self.mlp(hidden_states_post_norm2) + norm1_out, _ = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = hidden_states + attn_out + norm2_out, _ = self.norm2(hidden_states) + hidden_states = hidden_states + self.mlp(norm2_out) return hidden_states @@ -527,6 +526,7 @@ def forward( # apply the visual model to the pixel values if they are provided if pixel_values is not None and len(pixel_values) > 0: + pixel_values = pixel_values.to(inputs_embeds.dtype) if pixel_values is not None: image_embeds = self.visual( pixel_values, grid_thw=image_grid_thw diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index d097c54fc2c..ac69f11537a 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1400,7 +1400,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): cache_lengths = [0] * bs if max_bs is None: input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + if hasattr(self.model, "get_position_ids"): + # use model specific position ids for initialization + position_ids = self.model.get_position_ids(input_ids) + else: + position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s @@ -1427,7 +1431,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): "Cuda graphs should be generated in decreasing order size to reduce VRAM usage" ) input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs] - position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs] + position_ids = self.cuda_graphs[max_bs]["position_ids"][..., :bs] if ATTENTION == "flashinfer": block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt] else: @@ -1456,14 +1460,6 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): else: state = None - if ( - hasattr(self.model, "config") - and hasattr(self.model.config, "model_type") - and self.model.config.model_type == "qwen2_vl" - ): - if position_ids.dim() == 1: - position_ids = self.model.get_position_ids(input_ids) - graph = torch.cuda.CUDAGraph() self.cuda_graphs[bs] = { "input_ids": input_ids,
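
As a quick sanity check on the vision-flop branch added to Config::flop in launcher/src/main.rs above, the same arithmetic can be reproduced in a few lines. The numeric values (depth 32, embed_dim 1280, mlp_ratio 4, in_chans 3, patch_size 14, temporal_patch_size 2) are taken from the upstream Qwen/Qwen2-VL-7B-Instruct vision config, not from this diff, so treat them as assumptions; this is a minimal illustrative sketch, not launcher code.

# Re-evaluates the qwen2_vl arm of Config::flop with assumed Qwen2-VL vision config values.
in_chans, patch_size, embed_dim = 3, 14, 1280
depth, mlp_ratio, temporal_patch_size = 32, 4, 2

# conv3d patch embedding, factor 2 for multiply-add
patch_flops = 2 * temporal_patch_size * patch_size**2 * embed_dim * in_chans
# attention + mlp term exactly as written in the diff
attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim)
# per-layer total plus the small layer-norm term, times the vision depth
layer_flops = patch_flops + attn_flops + 2 * embed_dim
vision_flops = layer_flops * depth
print(f"estimated vision flops: {vision_flops / 1e9:.3f} GFLOP")  # ~0.516

The point of the branch is visible in the removed early return: previously any model with a vision_config skipped the estimate entirely, whereas now only vision models without a supported flops formula fall through to None.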
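
The other central piece of the diff is RotaryPositionEmbeddingMultimodalSections in rotary.py, which stitches cos/sin values from three position streams (temporal, height, width) according to mrope_section. The selection step can be illustrated in isolation with the sketch below; it only mirrors the m[i % 3] splitting pattern from the hunk above, and the [16, 24, 24] section sizes are the ones Qwen2-VL ships in its rope_scaling config, assumed here purely for the toy shapes.

import torch

def mrope_select(cos, sin, mrope_section):
    # cos/sin: [3, seq_len, rotary_dim], one slice per position stream
    # (temporal, height, width); rotary_dim == 2 * sum(mrope_section) because
    # the frequency pattern is stored twice (cat([freqs, freqs], dim=-1)).
    sections = mrope_section * 2  # list repetition, mirroring `self.sections = sections * 2`
    split_cos = torch.split(cos, sections, dim=-1)
    split_sin = torch.split(sin, sections, dim=-1)
    # block i is read from stream i % 3, the same `m[i % 3]` pattern as the diff
    cos_out = torch.cat([m[i % 3] for i, m in enumerate(split_cos)], dim=-1)
    sin_out = torch.cat([m[i % 3] for i, m in enumerate(split_sin)], dim=-1)
    return cos_out, sin_out

mrope_section = [16, 24, 24]  # assumed Qwen2-VL section sizes
seq_len, rotary_dim = 4, 2 * sum(mrope_section)
cos = torch.randn(3, seq_len, rotary_dim)
sin = torch.randn(3, seq_len, rotary_dim)
cos_sel, sin_sel = mrope_select(cos, sin, mrope_section)
print(cos_sel.shape, sin_sel.shape)  # torch.Size([4, 128]) torch.Size([4, 128])

In the real class the selected cos/sin are additionally unsqueezed to broadcast over heads before rotary_emb.apply_rotary is called; that part is omitted here.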