diff --git a/configs/exp/train/multimodal/attention_based_selector.json b/configs/exp/train/multimodal/attention_based_selector.json index b22f3c3..a90dc4c 100644 --- a/configs/exp/train/multimodal/attention_based_selector.json +++ b/configs/exp/train/multimodal/attention_based_selector.json @@ -127,9 +127,9 @@ "save_total_limit": 5, "load_best_model_at_end": false, "deepspeed": "configs/exp/deepspeed/stage2.json", - "dispatch_batches": false, "dataloader_num_workers": 16, - "dataloader_pin_memory": false + "dataloader_pin_memory": false, + "dispatch_batches": false }, "wandb_settings": { "project_name": "rykov", diff --git a/configs/exp/train/multimodal/attention_pooling.json b/configs/exp/train/multimodal/attention_pooling.json index 0b48b50..ba51284 100644 --- a/configs/exp/train/multimodal/attention_pooling.json +++ b/configs/exp/train/multimodal/attention_pooling.json @@ -63,11 +63,7 @@ }, "audio": null }, -<<<<<<< HEAD "n_modality_embeddings": 64, -======= - "n_modality_embeddings": 128, ->>>>>>> 1167a3408adc49898aa4b00c6235adbfceabf9d1 "start_modality_token": "", "end_modality_token": "", "dataset_type": "multimodal", @@ -186,11 +182,7 @@ }, "dataset_type": "multimodal", "max_tokens_count": 2000, -<<<<<<< HEAD "n_modality_embeddings": 64, -======= - "n_modality_embeddings": 128, ->>>>>>> 1167a3408adc49898aa4b00c6235adbfceabf9d1 "start_modality_token": "", "end_modality_token": "", "only_answer_loss": true, diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json index d95ee19..68a105f 100644 --- a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json +++ b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json @@ -22,12 +22,12 @@ }, "modality_reader_settings_mapping": { "image": { - "reader_type": "clip", + "reader_type": "pickle", "reader_path": "tests/fixtures/models/clip_tiny" }, "audio": null }, - "n_modality_embeddings": 225, + "n_modality_embeddings": 32, "start_modality_token": "", "end_modality_token": "", "dataset_type": "multimodal", @@ -58,12 +58,12 @@ }, "modality_reader_settings_mapping": { "image": { - "reader_type": "clip", + "reader_type": "pickle", "reader_path": "tests/fixtures/models/clip_tiny" }, "audio": null }, - "n_modality_embeddings": 225, + "n_modality_embeddings": 32, "start_modality_token": "", "end_modality_token": "", "dataset_type": "multimodal", @@ -111,35 +111,40 @@ "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", - "per_device_train_batch_size": 1, - "per_device_eval_batch_size": 1, - "gradient_accumulation_steps": 1, + "per_device_train_batch_size": 2, + "per_device_eval_batch_size": 2, + "gradient_accumulation_steps": 2, "logging_steps": 1, "learning_rate": 0.00002, "num_train_epochs": 1, "lr_scheduler_type": "cosine", "warmup_steps": 0, "fp16": false, - "bf16": false, + "bf16": true, "optim": "adamw_torch", "save_total_limit": 1, - "no_cuda": true + "no_cuda": false, + "dispatch_batches": false, + "load_best_model_at_end": false, + "deepspeed": null, + "dataloader_num_workers": 16, + "dataloader_pin_memory": false }, "logging_settings": { - "project_name": "alignment", - "run_name": "multimodal", - "entity": "turbo-alignment" + "project_name": "rykov", + "run_name": "multimodal", + "entity": "rykov" }, "modality_encoder_settings_mapping": { "image": { "modality_encoder_type": "clip", - "is_pickle": false, + "is_pickle": true, "encoder_path": "tests/fixtures/models/clip_tiny" }, "audio": null }, "modality_projector_mapping": { - "image": "llava", + "image": "top_k_attention_pooling_with_n_heads", "audio": null }, "modality_projector_initialization_mapping": { @@ -176,7 +181,7 @@ }, "dataset_type": "multimodal", "max_tokens_count": 2000, - "n_modality_embeddings": 225, + "n_modality_embeddings": 32, "start_modality_token": "", "end_modality_token": "", "only_answer_loss": true, @@ -186,7 +191,7 @@ }, "modality_reader_settings_mapping": { "image": { - "reader_type": "clip", + "reader_type": "pickle", "reader_path": "tests/fixtures/models/clip_tiny" }, "audio": null diff --git a/tests/fixtures/datasets/multimodal/image_chat.jsonl b/tests/fixtures/datasets/multimodal/image_chat.jsonl index e53f0b5..70761bc 100644 --- a/tests/fixtures/datasets/multimodal/image_chat.jsonl +++ b/tests/fixtures/datasets/multimodal/image_chat.jsonl @@ -1,4 +1,4 @@ -{"id": "0", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_1.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "Sorry, I will not describe the scene."}]} -{"id": "1", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_2.jpg"}, {"role": "user", "type": "text", "content": "What do you see on the image?"}, {"role": "bot", "type": "text", "content": "I see nothing."}, {"role": "user", "type": "text", "content": "What about this one?"}, {"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_1.jpg"}, {"role": "bot", "type": "text", "content": "Sorry..."}]} -{"id": "2", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_3.jpg"}, {"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_4.jpg"}, {"role": "user", "type": "text", "content": "Please, describe these two photos."}, {"role": "bot", "type": "text", "content": "OK."}]} -{"id": "3", "messages": [{"role": "user", "type": "image", "content": "tests/fixtures/datasets/multimodal/images/img_5.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "No."}]} \ No newline at end of file +{"id": "0", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000935.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "Sorry, I will not describe the scene."}]} +{"id": "1", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000934.jpg"}, {"role": "user", "type": "text", "content": "What do you see on the image?"}, {"role": "bot", "type": "text", "content": "I see nothing."}, {"role": "user", "type": "text", "content": "What about this one?"}]} +{"id": "2", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000933.jpg"}, {"role": "user", "type": "text", "content": "Please, describe these two photos."}, {"role": "bot", "type": "text", "content": "OK."}]} +{"id": "3", "messages": [{"role": "user", "type": "image", "content": "images/00000/000000932.jpg"}, {"role": "user", "type": "text", "content": "Describe the scene"}, {"role": "bot", "type": "text", "content": "No."}]} \ No newline at end of file