set kv cache in f16 (#3101)
### Changes

Explicitly disable KV cache compression to u8; f16 precision is used instead.

### Reason for changes

PTWC nightly shows different metrics (ticket 157594).
This happens because, since openvinotoolkit/openvino#27454, the KV cache is compressed to u8 by default, which affects the accuracy of fp32 models (ticket 157571).

We propose using the KV cache in f16 so the issue is handled in NNCF rather than in OV (there is still an open issue with KV cache compression, and its default behavior may change in the near future).
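
For reference, a minimal sketch of what the change means on the user side, assuming an optimum-intel setup; the model ID and the `export=True` flag below are placeholders, only the `ov_config` keys come from this PR:

```python
from optimum.intel.openvino import OVModelForCausalLM

# Keep the KV cache in f16 instead of letting the runtime compress it to u8
# (u8 became the default after openvinotoolkit/openvino#27454).
ov_config = {
    "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
    "KV_CACHE_PRECISION": "f16",
}

# Placeholder model ID for illustration; any OVModelForCausalLM checkpoint works.
model = OVModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    export=True,  # assumed here: convert from the Transformers checkpoint on the fly
    ov_config=ov_config,
)
```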

### Related tickets

157571
157594

### Tests

- [x] openvino-nightly/job/post_training_weight_compression/56

![image](https://github.com/user-attachments/assets/0772a8e5-0f92-4f53-8ac0-e16841bd8193)
- [x] https://github.com/openvinotoolkit/nncf/actions/runs/11934079602
- [x] job/weekly/job/openvino-nightly/job/test_examples/77
ljaljushkin authored Nov 22, 2024
1 parent 8d501c7 commit dc9f5cb
Showing 3 changed files with 11 additions and 3 deletions.
4 changes: 3 additions & 1 deletion examples/llm_compression/openvino/tiny_llama/main.py
@@ -67,7 +67,9 @@ def transform_fn(data, model, tokenizer):
 )
 model.save_pretrained(OUTPUT_DIR)
 
-model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
+model = OVModelForCausalLM.from_pretrained(
+    OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
+)
 input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)
 
 start_t = time.time()

@@ -249,6 +249,7 @@ def main():
         "NUM_STREAMS": "1",
         "CACHE_DIR": "",
         "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
+        "KV_CACHE_PRECISION": "f16",
     }
     model = OVModelForCausalLM.from_pretrained(
         model_id,

9 changes: 7 additions & 2 deletions tests/post_training/pipelines/lm_weight_compression.py
@@ -271,7 +271,12 @@ def _validate(self):
         if os.getenv("NNCF_TEST_REGEN_DOT") is not None:
             print("Collection ground-truth reference data")
             model_gold = OVModelForCausalLM.from_pretrained(
-                self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
+                self.fp32_model_dir,
+                trust_remote_code=True,
+                load_in_8bit=False,
+                compile=False,
+                stateful=is_stateful,
+                ov_config={"KV_CACHE_PRECISION": "f16"},
             )
             evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",))
             evaluator.dump_gt(str(gt_data_path))
@@ -290,7 +295,7 @@ def _validate(self):
                 load_in_8bit=False,
                 compile=False,
                 stateful=is_stateful,
-                ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+                ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"},
             )
         print("Evaluation of the target model")
         _, all_metrics = evaluator.score(compressed_model_hf)
