From 1167a3408adc49898aa4b00c6235adbfceabf9d1 Mon Sep 17 00:00:00 2001 From: Elisei Rykov Date: Tue, 15 Oct 2024 10:38:57 +0300 Subject: [PATCH] in progress --- .../llava_with_textual_attention.json | 81 +++++++ .../train/multimodal/attention_pooling.json | 6 +- .../attention_pooling_with_n_heads.json | 205 ++++++++++++++++++ .../train/multimodal/llava_with_replica.json | 205 ++++++++++++++++++ .../multimodal/projectors/__init__.py | 1 + .../projectors/attention_pooling.py | 132 ++++++++++- turbo_alignment/settings/modality.py | 1 + 7 files changed, 623 insertions(+), 8 deletions(-) create mode 100755 configs/exp/multimodal/llava_with_textual_attention.json create mode 100644 configs/exp/train/multimodal/attention_pooling_with_n_heads.json create mode 100644 configs/exp/train/multimodal/llava_with_replica.json diff --git a/configs/exp/multimodal/llava_with_textual_attention.json b/configs/exp/multimodal/llava_with_textual_attention.json new file mode 100755 index 0000000..71b023b --- /dev/null +++ b/configs/exp/multimodal/llava_with_textual_attention.json @@ -0,0 +1,81 @@ +{ + "inference_settings": [ + { + "metric_settings": [], + "model_settings": { + "model_path": "/from_s3/model", + "projections_path": "/from_s3/projections/modality_adapters.pt", + "n_modality_embeddings": 256, + "model_type": "causal", + "transformers_settings": {}, + "adapter_path": "/from_s3/adapter" + }, + "modality_encoder_settings_mapping": { + "image": { + "modality_encoder_type": "clip", + "is_pickle": true, + "encoder_path": "openai/clip-vit-large-patch14" + }, + "audio": null + }, + "modality_projector_mapping": { + "image": "llava", + "audio": null + }, + "tokenizer_settings": { + "use_fast": false, + "tokenizer_path": "/from_s3/tokenizer" + }, + "generation_settings": [ + { + "transformers_settings": { + "num_beams": 1, + "max_new_tokens": 128 + }, + "custom_settings": {} + } + ], + "use_vllm": false, + "batch": 1, + "micro_batch": 1 + } + ], + "dataset_settings": { + "sources": [ + { + "name": "rullava", + "records_path": "/app/data/multimodal/rullava/val_chat.jsonl", + "num_samples": 50 + } + ], + "prompt_template": { + "role_tag_mapping": { + "bot": "", + "user": "", + "system": "" + }, + "prefix_template": "{role}", + "suffix_template": "" + }, + "dataset_type": "multimodal", + "max_tokens_count": 2000, + "n_modality_embeddings": 256, + "start_modality_token": "", + "end_modality_token": "", + "only_answer_loss": true, + "modality_token_mapping": { + "image": "", + "audio": "