diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 2525f72cd14..7dbfc627c20 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,73 +1,469 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, + "finish_reason": "eos_token", + "generated_tokens": 76, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.8769531, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16027832, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2294922, "special": false, "text": " uses" + }, + { + "id": 29728, + "logprob": -0.66503906, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": -0.02960205, + "special": false, + "text": " networks" + }, + { + "id": 311, + "logprob": -0.7236328, + "special": false, + "text": " to" + }, + { + "id": 3960, + "logprob": -1.1914062, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.7089844, + "special": false, + "text": " from" + }, + { + "id": 821, + "logprob": -0.7729492, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.7836914, + "special": false, + "text": "." 
+ }, + { + "id": 1084, + "logprob": -0.9941406, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.52441406, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.9511719, + "special": false, + "text": " a" + }, + { + "id": 943, + "logprob": -0.8642578, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00030231476, + "special": false, + "text": " of" + }, + { + "id": 20443, + "logprob": -0.14416504, + "special": false, + "text": " artificial" + }, + { + "id": 11229, + "logprob": -0.013824463, + "special": false, + "text": " intelligence" + }, + { + "id": 429, + "logprob": -0.18762207, + "special": false, + "text": " that" + }, + { + "id": 646, + "logprob": -1.0087891, + "special": false, + "text": " can" + }, + { + "id": 3960, + "logprob": -0.90234375, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.54345703, + "special": false, + "text": " from" + }, + { + "id": 323, + "logprob": -1.0400391, + "special": false, + "text": " and" + }, + { + "id": 1281, + "logprob": -0.072509766, + "special": false, + "text": " make" + }, + { + "id": 19898, + "logprob": -0.16516113, + "special": false, + "text": " predictions" + }, + { + "id": 389, + "logprob": -0.4416504, + "special": false, + "text": " on" + }, + { + "id": 3460, + "logprob": -0.5385742, + "special": false, + "text": " large" + }, + { + "id": 14713, + "logprob": -0.4387207, + "special": false, + "text": " amounts" + }, + { + "id": 315, + "logprob": -0.00015091896, + "special": false, + "text": " of" + }, + { + "id": 821, + "logprob": -0.061431885, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.71875, + "special": false, + "text": "." + }, + { + "id": 18183, + "logprob": -0.23632812, + "special": false, + "text": " Deep" + }, + { + "id": 6832, + "logprob": -0.0017204285, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -1.1738281, + "special": false, + "text": " is" + }, + { + "id": 1483, + "logprob": -0.61083984, + "special": false, + "text": " used" + }, + { + "id": 304, + "logprob": -0.035003662, + "special": false, + "text": " in" + }, + { + "id": 264, + "logprob": -0.118652344, + "special": false, + "text": " a" + }, + { + "id": 8045, + "logprob": -0.42016602, + "special": false, + "text": " variety" + }, + { + "id": 315, + "logprob": -1.6212463e-05, + "special": false, + "text": " of" + }, + { + "id": 8357, + "logprob": -0.1315918, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.12915039, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.12463379, + "special": false, + "text": " including" + }, + { + "id": 2168, + "logprob": -0.37402344, + "special": false, + "text": " image" + }, + { + "id": 323, + "logprob": -0.1451416, + "special": false, + "text": " and" + }, + { + "id": 8806, + "logprob": -0.028869629, + "special": false, + "text": " speech" + }, + { + "id": 17843, + "logprob": -0.00024068356, + "special": false, + "text": " recognition" + }, + { + "id": 11, + "logprob": -0.00031018257, + "special": false, + "text": "," + }, + { + "id": 5810, + "logprob": -0.019821167, + "special": false, + "text": " natural" + }, + { + "id": 4128, + "logprob": -0.00012528896, + "special": false, + "text": " language" + }, + { + "id": 8692, + "logprob": -0.00089263916, + "special": false, + "text": " processing" + }, + { + "id": 11, + "logprob": -0.00073862076, + "special": false, + "text": "," + }, + { + "id": 323, + "logprob": 
-0.040161133, + "special": false, + "text": " and" + }, + { + "id": 38193, + "logprob": -0.4519043, + "special": false, + "text": " autonomous" + }, + { + "id": 11474, + "logprob": -0.39941406, + "special": false, + "text": " vehicles" + }, + { + "id": 13, + "logprob": -0.21166992, + "special": false, + "text": "." + }, + { + "id": 1084, + "logprob": -0.9082031, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.44213867, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -1.2177734, + "special": false, + "text": " a" + }, + { + "id": 18512, + "logprob": -0.5205078, + "special": false, + "text": " rapidly" + }, + { + "id": 7826, + "logprob": -0.15332031, + "special": false, + "text": " growing" + }, + { + "id": 2070, + "logprob": -0.0039978027, + "special": false, + "text": " field" + }, + { + "id": 448, + "logprob": -0.9091797, + "special": false, + "text": " with" + }, + { + "id": 1657, + "logprob": -0.17114258, + "special": false, + "text": " many" + }, + { + "id": 4650, + "logprob": -0.70703125, + "special": false, + "text": " potential" + }, + { + "id": 8357, + "logprob": -0.025131226, + "special": false, + "text": " applications" + }, + { + "id": 304, + "logprob": -0.6699219, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.35205078, + "special": false, + "text": " the" + }, + { + "id": 3853, + "logprob": -0.049194336, + "special": false, + "text": " future" + }, + { + "id": 13, + "logprob": -0.21972656, + "special": false, + "text": "." + }, + { + "id": 151643, + "logprob": -2.0019531, + "special": true, + "text": "<|endoftext|>" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses" + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 6b3f5092917..2c840e67124 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -7,67 +7,67 @@ "seed": 0, "tokens": [ { - "id": 1939, - "logprob": -2.2460938, + "id": 5267, + "logprob": -1.1464844, "special": false, - "text": "?\n\n" + "text": "?\n" }, { "id": 33464, - "logprob": 0.0, + "logprob": -0.83203125, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.48608398, + "logprob": -0.5625, "special": false, "text": " Learning" }, { - "id": 4102, - "logprob": -2.265625, + "id": 320, + "logprob": -2.1464844, "special": false, - "text": " " + "text": " (" }, { - "id": 285, + "id": 16524, "logprob": 0.0, "special": false, - "text": "is" + "text": "DL" }, { - "id": 458, - "logprob": -0.6328125, + "id": 701, + "logprob": -2.2089844, "special": false, - "text": " an" + "text": ")," }, { - "id": 20443, - "logprob": -0.1796875, + "id": 476, + "logprob": -0.27368164, "special": false, - "text": " artificial" + "text": " or" }, { - "id": 11229, - "logprob": 0.0, + "id": 20443, + "logprob": -0.09442139, "special": false, - "text": " intelligence" + "text": " artificial" }, { - "id": 320, - "logprob": -0.37695312, + "id": 29728, + "logprob": 0.0, "special": false, - "text": " (" + "text": " neural" }, { - "id": 15469, + "id": 14155, "logprob": 0.0, "special": false, - "text": "AI" + "text": " networks" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index 1fa4e33aa05..aee5698b474 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 
6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } diff --git 
a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json index 131631e65c8..49f332252bc 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1730164250, + "created": 1737645979, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.0.2-dev0-native", "usage": { "completion_tokens": 58, - "prompt_tokens": 349, - "total_tokens": 407 + "prompt_tokens": 1364, + "total_tokens": 1422 } } diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json index 3e2faca714d..3dc8fc6d6d5 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1730416361, + "created": 1737646031, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.0.2-dev0-native", "usage": null } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index a0b0416b861..17e12c221c2 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( ): response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", - max_new_tokens=10, + # prefer a longer response than the default, allow the llm to end generation + max_new_tokens=1000, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses" + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
) - assert response.details.generated_tokens == 10 + assert response.details.generated_tokens == 76 assert response == response_snapshot @@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" ) assert response == response_snapshot diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py index 97a533fc5d4..dacd92a87b3 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -1,81 +1,78 @@ -# Disabled because it's broken. -# import pytest -# -# -# @pytest.fixture(scope="module") -# def flash_qwen2_vl_handle(launcher): -# with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: -# yield handle -# -# -# @pytest.fixture(scope="module") -# async def flash_qwen2(flash_qwen2_vl_handle): -# await flash_qwen2_vl_handle.health(300) -# return flash_qwen2_vl_handle.client -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): -# response = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# ) -# -# assert ( -# response.choices[0].message.content -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." -# ) -# -# assert response == response_snapshot -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): -# responses = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# stream=True, -# ) -# -# count = 0 -# generated = "" -# last_response = None -# async for response in responses: -# count += 1 -# generated += response.choices[0].delta.content -# last_response = response -# -# assert ( -# generated -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." 
-# ) -# assert count == 58 -# assert last_response == response_snapshot +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_vl_handle(launcher): + with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2(flash_qwen2_vl_handle): + await flash_qwen2_vl_handle.health(300) + return flash_qwen2_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + ) + + assert ( + response.choices[0].message.content + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): + responses = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + stream=True, + ) + + count = 0 + generated = "" + last_response = None + async for response in responses: + count += 1 + generated += response.choices[0].delta.content + last_response = response + + assert ( + generated + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + assert count == 58 + assert last_response == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 394cc1e641e..597ebdde510 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -230,7 +230,14 @@ struct QuantizationConfig { } #[derive(Debug, Deserialize)] -struct VisionConfig {} +struct VisionConfig { + depth: Option<usize>, + embed_dim: Option<usize>, + mlp_ratio: Option<usize>, + in_chans: Option<usize>, + patch_size: Option<usize>, + temporal_patch_size: Option<usize>, +} #[derive(Debug, Deserialize)] struct Config { @@ -253,11 +260,6 @@ struct Config { impl Config { fn flop(&self) -> Option<u64> { - if self.vision_config.is_some() { - // VLM are much harder to predict and VRAM requirements - // Are more complex. - return None; - } let num_heads = self.num_heads? as u64; let num_kv_heads = self.num_kv_heads? as u64; let head_dim = self.head_dim?
as u64; @@ -277,8 +279,50 @@ impl Config { let gate_up_down_flops = 2 * 3 * hidden_size * intermediate_size; let layer_flops = attn_layer_flops + gate_up_down_flops; - let total = layer_flops * num_layers; - Some(total) + let text_flops = layer_flops * num_layers; + + tracing::debug!("Text flops: {}", human_size(text_flops as usize, "flop")); + + // text-only case + if self.vision_config.is_none() { + return Some(text_flops); + } + + let vision_config = self.vision_config.as_ref().unwrap(); + + // estimate vision flops for specific model types + match self.model_type.as_deref() { + Some("qwen2_vl") => { + let in_chans = vision_config.in_chans? as u64; + let patch_size = vision_config.patch_size? as u64; + let embed_dim = vision_config.embed_dim? as u64; + let vision_depth = vision_config.depth? as u64; + let mlp_ratio = vision_config.mlp_ratio? as u64; + let temporal_patch_size = vision_config.temporal_patch_size? as u64; + // 1. patch embedding: + // - conv3d operation: (t*h*w) * (k_t*k_h*k_w) * c_in * c_out * 2 + // where the 2 accounts for multiply-add + let patch_flops = + 2 * temporal_patch_size * patch_size.pow(2) * embed_dim * in_chans; + // 2. self-attention + mlp: + // - qkv projections: 3 * d_model * d_model * 2 + // - attention: d_model * d_model * 2 + // - mlp: 2 * d_model * (mlp_ratio * d_model) * 2 + // simplified to: 2 * d_model * (4 + mlp_ratio * d_model) + let attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim); + // 3. add with layer norm flops for total vision layer flops + let layer_flops = patch_flops + attn_flops + 2 * embed_dim; + let vision_flops = layer_flops * vision_depth; + tracing::debug!( + "Vision flops: {}", + human_size(vision_flops as usize, "flop") + ); + Some(text_flops + vision_flops) + } + // model has a vision config but is not supported for flops calculation + // we return None to avoid overestimating the memory requirements + _ => None, + } } fn kv_vram_per_tok(&self) -> Option { diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index e346d0f8946..9f1770ff6b0 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -86,6 +86,16 @@ def static(cls, config, dim, base, device): # `rope_type` is now standard in transformers, but some existing models # have `type` instead. 
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) + mrope_section = rope_scaling.get("mrope_section", None) + + # only apply mrope if sections are provided and the rope type is mrope or default + if mrope_section is not None and ( + rope_type == "mrope" or rope_type == "default" + ): + mrope_section = rope_scaling.get("mrope_section") + return RotaryPositionEmbeddingMultimodalSections( + inv_freq, scaling_factor, mrope_section + ) if rope_type == "linear": pass @@ -548,3 +558,76 @@ def apply_llama3_scaling( new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + +class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): + def __init__(self, inv_freq, scaling_factor, sections): + super().__init__(inv_freq, scaling_factor) + # expand the inv_freq for the 3 sections + self.inv_freq_exp = inv_freq[None, None, :, None].expand(3, -1, -1, 1) + self.sections = sections * 2 + self._cos_cached = None + self._sin_cached = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # process multi-modal rotary embeddings + split_cos, split_sin = [ + torch.split(t, self.sections, dim=-1) for t in (cos, sin) + ] + cos = torch.cat([m[i % 3] for i, m in enumerate(split_cos)], dim=-1).unsqueeze( + 1 + ) + sin = torch.cat([m[i % 3] for i, m in enumerate(split_sin)], dim=-1).unsqueeze( + 1 + ) + # prepare input tensors + q, k = [x.transpose(0, 1).unsqueeze(0) for x in (query, key)] + rotary_dim = cos.shape[-1] + q1, k1 = q[..., :rotary_dim], k[..., :rotary_dim] + q2 = torch.cat((-q[..., rotary_dim // 2 :], q[..., : rotary_dim // 2]), dim=-1) + k2 = torch.cat((-k[..., rotary_dim // 2 :], k[..., : rotary_dim // 2]), dim=-1) + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, True) + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, True) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # always cache the cos/sin for the full sequence length to avoid + # recomputing if the sequence length is smaller than the cached one + if ( + seqlen > self._seq_len_cached + or self._cos_cached_exp.device != device + or self._cos_cached_exp.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + freqs = freqs.expand(3, -1, -1) + self._cos_cached_exp = freqs.cos().to(dtype) + self._sin_cached_exp = freqs.sin().to(dtype) + + def get_cos_sin( + self, + position_ids: torch.Tensor, + max_s: int, + dtype: torch.dtype, + ): + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + # expand the position_ids to match the shape of the cached cos/sin + indices = ( + position_ids.squeeze(1) + .unsqueeze(-1) + .expand(-1, -1, self._cos_cached_exp.shape[-1]) + ) + cos_c = torch.gather(self._cos_cached_exp, 1, indices) + cos_c = torch.cat([cos_c, cos_c], dim=-1).unsqueeze(1) + + sin_c = torch.gather(self._sin_cached_exp, 1, indices) + sin_c = torch.cat([sin_c, sin_c], dim=-1).unsqueeze(1) + + return cos_c, sin_c diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 3b437ce0492..2dbe206742a 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1249,6 +1249,7 @@ def get_model( quantize=quantize, speculator=speculator, dtype=dtype, + default_dtype=torch.bfloat16, 
kv_cache_dtype=kv_cache_dtype, trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index cc4039b1cbc..78ae3020cb8 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -61,11 +61,6 @@ def __init__( config.sliding_window if config.sliding_window is not None else -1 ) self.num_heads = config.num_attention_heads - self.mrope_section = ( - config.rope_scaling.get("mrope_section", None) - if config.rope_scaling is not None - else None - ) self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads @@ -127,17 +122,6 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) - if self.mrope_section is not None: - # if mrope_section is set, we need to split the cos and sin into 3 parts and concatenate them in a specific order - cos = torch.cat( - [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], - dim=-1, - ) - sin = torch.cat( - [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], - dim=-1, - ) - self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) if prefill_cache_indices is not None: @@ -251,7 +235,8 @@ def forward( max_s, prefill_cache_indices, ): - normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + residual = hidden_states + normed_hidden_states, _ = self.input_layernorm(hidden_states) # Self Attention attn_output = self.self_attn( @@ -266,15 +251,14 @@ def forward( max_s, prefill_cache_indices, ) + hidden_states = attn_output + residual # faster post attention rms norm - normed_attn_res_output, attn_res = self.post_attention_layernorm( - attn_output, res - ) - - mlp_output = self.mlp(normed_attn_res_output) - - return mlp_output, attn_res + residual = hidden_states + hidden_states, _ = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(hidden_states) + hidden_states = mlp_output + residual + return hidden_states class Qwen2Model(torch.nn.Module): @@ -322,18 +306,15 @@ def forward( ) -> torch.Tensor: hidden_states = inputs_embeds - # flatten position ids from 2D to 1D cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids.flatten(), true_max_s, hidden_states.dtype + position_ids, + true_max_s, + hidden_states.dtype, ) - # reshape back to 2D if the position_ids were 2D - if position_ids.size(0) != cos.size(0): - cos = cos.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) - sin = sin.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) residual = None for i, layer in enumerate(self.layers): - hidden_states, residual = layer( + hidden_states = layer( hidden_states, residual, cos, @@ -347,7 +328,7 @@ def forward( prefill_cache_indices, ) - hidden_states, _ = self.norm(hidden_states, residual) + hidden_states, _ = self.norm(hidden_states) return hidden_states diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index a8e1e8c1593..e0ae19df766 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -222,12 +222,11 @@ def __init__(self, prefix, config, 
weights): def forward( self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen ) -> torch.Tensor: - hidden_states_post_norm1, res = self.norm1(hidden_states) - hidden_states = hidden_states + self.attn( - hidden_states_post_norm1, cu_seqlens, rotary_pos_emb, max_seqlen - ) - hidden_states_post_norm2, res = self.norm2(hidden_states) - hidden_states = hidden_states + self.mlp(hidden_states_post_norm2) + norm1_out, _ = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = hidden_states + attn_out + norm2_out, _ = self.norm2(hidden_states) + hidden_states = hidden_states + self.mlp(norm2_out) return hidden_states @@ -527,6 +526,7 @@ def forward( # apply the visual model to the pixel values if they are provided if pixel_values is not None and len(pixel_values) > 0: + pixel_values = pixel_values.to(inputs_embeds.dtype) if pixel_values is not None: image_embeds = self.visual( pixel_values, grid_thw=image_grid_thw diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index d097c54fc2c..ac69f11537a 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1400,7 +1400,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): cache_lengths = [0] * bs if max_bs is None: input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + if hasattr(self.model, "get_position_ids"): + # use model specific position ids for initialization + position_ids = self.model.get_position_ids(input_ids) + else: + position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s @@ -1427,7 +1431,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): "Cuda graphs should be generated in decreasing order size to reduce VRAM usage" ) input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs] - position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs] + position_ids = self.cuda_graphs[max_bs]["position_ids"][..., :bs] if ATTENTION == "flashinfer": block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt] else: @@ -1456,14 +1460,6 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): else: state = None - if ( - hasattr(self.model, "config") - and hasattr(self.model.config, "model_type") - and self.model.config.model_type == "qwen2_vl" - ): - if position_ids.dim() == 1: - position_ids = self.model.get_position_ids(input_ids) - graph = torch.cuda.CUDAGraph() self.cuda_graphs[bs] = { "input_ids": input_ids,
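
As a quick sanity check on the vision-flop branch added to Config::flop in launcher/src/main.rs above, the same arithmetic can be reproduced in a few lines. The numeric values (depth 32, embed_dim 1280, mlp_ratio 4, in_chans 3, patch_size 14, temporal_patch_size 2) are taken from the upstream Qwen/Qwen2-VL-7B-Instruct vision config, not from this diff, so treat them as assumptions; this is a minimal illustrative sketch, not launcher code.

# Re-evaluates the qwen2_vl arm of Config::flop with assumed Qwen2-VL vision config values.
in_chans, patch_size, embed_dim = 3, 14, 1280
depth, mlp_ratio, temporal_patch_size = 32, 4, 2

# conv3d patch embedding, factor 2 for multiply-add
patch_flops = 2 * temporal_patch_size * patch_size**2 * embed_dim * in_chans
# attention + mlp term exactly as written in the diff
attn_flops = 2 * embed_dim * (4 + mlp_ratio * embed_dim)
# per-layer total plus the small layer-norm term, times the vision depth
layer_flops = patch_flops + attn_flops + 2 * embed_dim
vision_flops = layer_flops * depth
print(f"estimated vision flops: {vision_flops / 1e9:.3f} GFLOP")  # ~0.516

The point of the branch is visible in the removed early return: previously any model with a vision_config skipped the estimate entirely, whereas now only vision models without a supported flops formula fall through to None.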
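
The other central piece of the diff is RotaryPositionEmbeddingMultimodalSections in rotary.py, which stitches cos/sin values from three position streams (temporal, height, width) according to mrope_section. The selection step can be illustrated in isolation with the sketch below; it only mirrors the m[i % 3] splitting pattern from the hunk above, and the [16, 24, 24] section sizes are the ones Qwen2-VL ships in its rope_scaling config, assumed here purely for the toy shapes.

import torch

def mrope_select(cos, sin, mrope_section):
    # cos/sin: [3, seq_len, rotary_dim], one slice per position stream
    # (temporal, height, width); rotary_dim == 2 * sum(mrope_section) because
    # the frequency pattern is stored twice (cat([freqs, freqs], dim=-1)).
    sections = mrope_section * 2  # list repetition, mirroring `self.sections = sections * 2`
    split_cos = torch.split(cos, sections, dim=-1)
    split_sin = torch.split(sin, sections, dim=-1)
    # block i is read from stream i % 3, the same `m[i % 3]` pattern as the diff
    cos_out = torch.cat([m[i % 3] for i, m in enumerate(split_cos)], dim=-1)
    sin_out = torch.cat([m[i % 3] for i, m in enumerate(split_sin)], dim=-1)
    return cos_out, sin_out

mrope_section = [16, 24, 24]  # assumed Qwen2-VL section sizes
seq_len, rotary_dim = 4, 2 * sum(mrope_section)
cos = torch.randn(3, seq_len, rotary_dim)
sin = torch.randn(3, seq_len, rotary_dim)
cos_sel, sin_sel = mrope_select(cos, sin, mrope_section)
print(cos_sel.shape, sin_sel.shape)  # torch.Size([4, 128]) torch.Size([4, 128])

In the real class the selected cos/sin are additionally unsqueezed to broadcast over heads before rotary_emb.apply_rotary is called; that part is omitted here.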