diff --git a/configs/exp/train/multimodal/attention_based_selector.json b/configs/exp/train/multimodal/attention_based_selector.json
index b22f3c3..a90dc4c 100644
--- a/configs/exp/train/multimodal/attention_based_selector.json
+++ b/configs/exp/train/multimodal/attention_based_selector.json
@@ -127,9 +127,9 @@
"save_total_limit": 5,
"load_best_model_at_end": false,
"deepspeed": "configs/exp/deepspeed/stage2.json",
- "dispatch_batches": false,
"dataloader_num_workers": 16,
- "dataloader_pin_memory": false
+ "dataloader_pin_memory": false,
+ "dispatch_batches": false
},
"wandb_settings": {
"project_name": "rykov",
diff --git a/configs/exp/train/multimodal/attention_pooling.json b/configs/exp/train/multimodal/attention_pooling.json
index 0b48b50..ba51284 100644
--- a/configs/exp/train/multimodal/attention_pooling.json
+++ b/configs/exp/train/multimodal/attention_pooling.json
@@ -63,11 +63,7 @@
},
"audio": null
},
-<<<<<<< HEAD
"n_modality_embeddings": 64,
-=======
- "n_modality_embeddings": 128,
->>>>>>> 1167a3408adc49898aa4b00c6235adbfceabf9d1
"start_modality_token": "",
"end_modality_token": "",
"dataset_type": "multimodal",
@@ -186,11 +182,7 @@
},
"dataset_type": "multimodal",
"max_tokens_count": 2000,
-<<<<<<< HEAD
"n_modality_embeddings": 64,
-=======
- "n_modality_embeddings": 128,
->>>>>>> 1167a3408adc49898aa4b00c6235adbfceabf9d1
"start_modality_token": "",
"end_modality_token": "",
"only_answer_loss": true,
diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
index d95ee19..68a105f 100644
--- a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
+++ b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
@@ -22,12 +22,12 @@
},
"modality_reader_settings_mapping": {
"image": {
- "reader_type": "clip",
+ "reader_type": "pickle",
"reader_path": "tests/fixtures/models/clip_tiny"
},
"audio": null
},
- "n_modality_embeddings": 225,
+ "n_modality_embeddings": 32,
"start_modality_token": "",
"end_modality_token": "",
"dataset_type": "multimodal",
@@ -58,12 +58,12 @@
},
"modality_reader_settings_mapping": {
"image": {
- "reader_type": "clip",
+ "reader_type": "pickle",
"reader_path": "tests/fixtures/models/clip_tiny"
},
"audio": null
},
- "n_modality_embeddings": 225,
+ "n_modality_embeddings": 32,
"start_modality_token": "",
"end_modality_token": "",
"dataset_type": "multimodal",
@@ -111,35 +111,40 @@
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
- "per_device_train_batch_size": 1,
- "per_device_eval_batch_size": 1,
- "gradient_accumulation_steps": 1,
+ "per_device_train_batch_size": 2,
+ "per_device_eval_batch_size": 2,
+ "gradient_accumulation_steps": 2,
"logging_steps": 1,
"learning_rate": 0.00002,
"num_train_epochs": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 0,
"fp16": false,
- "bf16": false,
+ "bf16": true,
"optim": "adamw_torch",
"save_total_limit": 1,
- "no_cuda": true
+ "no_cuda": false,
+ "dispatch_batches": false,
+ "load_best_model_at_end": false,
+ "deepspeed": null,
+ "dataloader_num_workers": 16,
+ "dataloader_pin_memory": false
},
"logging_settings": {
- "project_name": "alignment",
- "run_name": "multimodal",
- "entity": "turbo-alignment"
+ "project_name": "rykov",
+ "run_name": "multimodal",
+ "entity": "rykov"
},
"modality_encoder_settings_mapping": {
"image": {
"modality_encoder_type": "clip",
- "is_pickle": false,
+ "is_pickle": true,
"encoder_path": "tests/fixtures/models/clip_tiny"
},
"audio": null
},
"modality_projector_mapping": {
- "image": "llava",
+ "image": "top_k_attention_pooling_with_n_heads",
"audio": null
},
"modality_projector_initialization_mapping": {
@@ -176,7 +181,7 @@
},
"dataset_type": "multimodal",
"max_tokens_count": 2000,
- "n_modality_embeddings": 225,
+ "n_modality_embeddings": 32,
"start_modality_token": "",
"end_modality_token": "",
"only_answer_loss": true,
@@ -186,7 +191,7 @@
},
"modality_reader_settings_mapping": {
"image": {
- "reader_type": "clip",
+ "reader_type": "pickle",
"reader_path": "tests/fixtures/models/clip_tiny"
},
"audio": null
diff --git a/tests/fixtures/datasets/multimodal/image_chat.jsonl b/tests/fixtures/datasets/multimodal/image_chat.jsonl
index e53f0b5..70761bc 100644
--- a/tests/fixtures/datasets/multimodal/image_chat.jsonl
+++ b/tests/fixtures/datasets/multimodal/image_chat.jsonl
@@ -1,4 +1,4 @@
-{"id": "0", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_1.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "Sorry, I will not describe the scene."}]}
-{"id": "1", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_2.jpg"}, {"role": "user", "type": "text", "content": "What do you see on the image?"}, {"role": "bot", "type": "text", "content": "I see nothing."}, {"role": "user", "type": "text", "content": "What about this one?"}, {"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_1.jpg"}, {"role": "bot", "type": "text", "content": "Sorry..."}]}
-{"id": "2", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_3.jpg"}, {"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_4.jpg"}, {"role": "user", "type": "text", "content": "Please, describe these two photos."}, {"role": "bot", "type": "text", "content": "OK."}]}
-{"id": "3", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_5.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "No."}]}
\ No newline at end of file
+{"id": "0", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000935.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "Sorry, I will not describe the scene."}]}
+{"id": "1", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000934.jpg"}, {"role": "user", "type": "text", "content": "What do you see on the image?"}, {"role": "bot", "type": "text", "content": "I see nothing."}, {"role": "user", "type": "text", "content": "What about this one?"}]}
+{"id": "2", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000933.jpg"}, {"role": "user", "type": "text", "content": "Please describe this photo."}, {"role": "bot", "type": "text", "content": "OK."}]}
+{"id": "3", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000932.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "No."}]}
\ No newline at end of file