From 9b822067222a29cb13861eca6bf484aa40967035 Mon Sep 17 00:00:00 2001
From: Jintao
Date: Thu, 17 Oct 2024 13:47:52 +0800
Subject: [PATCH] support mplug3 1b/2b (#2271)

---
 README.md                                     |  2 +-
 README_CN.md                                  |  2 +-
 ...14\346\225\260\346\215\256\351\233\206.md" |  2 ++
 .../Instruction/Supported-models-datasets.md  |  2 ++
 swift/llm/utils/model.py                      | 20 +++++++++++++++++++
 swift/llm/utils/template.py                   |  5 ++---
 6 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2f0775e43..d48d00691 100644
--- a/README.md
+++ b/README.md
@@ -635,7 +635,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
 | Llava-HF | [Llava-HF series models](https://huggingface.co/llava-hf) | English | 0.5B-110B | chat model |
 | Llava1.5<br>Llava1.6 | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
 | Llava-Next<br>Llava-Next-Video | [Llava-Next series models](https://github.com/LLaVA-VL/LLaVA-NeXT) | Chinese<br>English | 7B-110B | chat model |
-| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 11B | chat model |
+| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 1B-11B | chat model |
 | InternVL<br>Mini-InternVL<br>InternVL2 | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 1B-40B<br>including quantized version | chat model |
 | Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | English | 8B | chat model |
 | Phi3-Vision | Microsoft | English | 4B | chat model |
diff --git a/README_CN.md b/README_CN.md
index 38c36ea04..d37fb894a 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -628,7 +628,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Llava-HF | [Llava-HF系列模型](https://huggingface.co/llava-hf) | 英文 | 0.5B-110B | chat模型 |
 | Llava1.5<br>Llava1.6 | [Llava系列模型](https://github.com/haotian-liu/LLaVA) | 英文 | 7B-34B | chat模型 |
 | Llava-Next<br>Llava-Next-Video | [Llava-Next系列模型](https://github.com/LLaVA-VL/LLaVA-NeXT) | 中文<br>英文 | 7B-110B | chat模型 |
-| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl系列模型](https://github.com/X-PLUG/mPLUG-Owl) | 英文 | 11B | chat模型 |
+| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl系列模型](https://github.com/X-PLUG/mPLUG-Owl) | 英文 | 1B-11B | chat模型 |
 | InternVL<br>Mini-InternVL<br>InternVL2 | [InternVL](https://github.com/OpenGVLab/InternVL) | 中文<br>英文 | 1B-40B<br>包含量化版本 | chat模型 |
 | Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | 英文 | 8B | chat模型 |
 | Phi3-Vision | 微软 | 英文 | 4B | chat模型 |
diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index b673f4ad7..85c82980e 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -506,6 +506,8 @@
 |pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
 |mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
 |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index fcacf3130..4cb94ec65 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -506,6 +506,8 @@ The table below introduces all models supported by SWIFT:
 |pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
 |mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
 |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 07be42bb9..6acb24a0d 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -523,6 +523,8 @@ class ModelType:
     # owl
     mplug_owl2_chat = 'mplug-owl2-chat'  # llama
     mplug_owl2_1_chat = 'mplug-owl2_1-chat'  # qwen
+    mplug_owl3_1b_chat = 'mplug-owl3-1b-chat'
+    mplug_owl3_2b_chat = 'mplug-owl3-2b-chat'
     mplug_owl3_7b_chat = 'mplug-owl3-7b-chat'
     # yuan
     yuan2_2b_instruct = 'yuan2-2b-instruct'
@@ -2887,6 +2889,24 @@ def update(self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx
     return model, tokenizer
 
 
+@register_model(
+    ModelType.mplug_owl3_1b_chat,
+    'iic/mPLUG-Owl3-1B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-1B-241014')
+@register_model(
+    ModelType.mplug_owl3_2b_chat,
+    'iic/mPLUG-Owl3-2B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-2B-241014')
 @register_model(
     ModelType.mplug_owl3_7b_chat,
     'iic/mPLUG-Owl3-7B-240728',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 72ccbd56c..9adeb7394 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -3800,11 +3800,10 @@ def _get_image_token_list(self, cut_shape):
         processor = self.tokenizer.processor
         text = processor.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0], w=cut_shape[1])
         text_list = text.split('<|image|>')
-        if text_list[-1] == '':
-            text_list.pop()
         res_text_list = []
-        for text in text_list:
+        for text in text_list[:-1]:
             res_text_list += [text, '<|image|>']
+        res_text_list += text_list[-1]
         token_list = self._encode_context_list(res_text_list)[0]
         return token_list
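
Note on the swift/llm/utils/template.py hunk: the old code dropped a trailing
empty segment and then appended '<|image|>' after every remaining piece; the
new code interleaves '<|image|>' only between segments and keeps whatever text
follows the last placeholder. A minimal standalone sketch of the new
interleaving logic, using a made-up cut_prompt string in place of the real
cut_prompt_template() output:

    # Plain-Python sketch of the new interleaving; no swift imports needed.
    cut_prompt = '<|image|><|image|>\n<|image|>\n'   # hypothetical example
    text_list = cut_prompt.split('<|image|>')        # ['', '', '\n', '\n']
    res_text_list = []
    for text in text_list[:-1]:       # every piece that precedes an '<|image|>'
        res_text_list += [text, '<|image|>']
    res_text_list += text_list[-1]    # '+=' with a string extends the list
                                      # character by character; an empty tail
                                      # adds nothing
    print(res_text_list)
    # ['', '<|image|>', '', '<|image|>', '\n', '<|image|>', '\n']

Each element of the resulting list is then tokenized by _encode_context_list
and concatenated into token_list.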
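With these registrations in place, the 1B/2B checkpoints should load through
the same swift.llm utilities as the existing mplug-owl3-7b-chat entry. A hedged
usage sketch, assuming the v2-style Python API (get_model_tokenizer,
get_default_template_type, get_template, inference) applies to the new model
types exactly as it does to the 7B variant; the query and image URL below are
made up:

    import torch
    from swift.llm import (ModelType, get_default_template_type,
                           get_model_tokenizer, get_template, inference)

    model_type = ModelType.mplug_owl3_1b_chat        # or mplug_owl3_2b_chat
    template_type = get_default_template_type(model_type)  # 'mplug_owl3'
    model, tokenizer = get_model_tokenizer(model_type, torch.float16,
                                           model_kwargs={'device_map': 'auto'})
    template = get_template(template_type, tokenizer)
    # swift's multimodal convention: <image> tags in the query, files in images=
    query = '<image>Describe this picture.'
    images = ['https://example.com/cat.png']         # hypothetical URL
    response, history = inference(model, template, query, images=images)
    print(response)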