From 9b822067222a29cb13861eca6bf484aa40967035 Mon Sep 17 00:00:00 2001
From: Jintao
Date: Thu, 17 Oct 2024 13:47:52 +0800
Subject: [PATCH] support mplug3 1b/2b (#2271)

---
 README.md                                     |  2 +-
 README_CN.md                                  |  2 +-
 ...14\346\225\260\346\215\256\351\233\206.md" |  2 ++
 .../Instruction/Supported-models-datasets.md  |  2 ++
 swift/llm/utils/model.py                      | 20 +++++++++++++++++++
 swift/llm/utils/template.py                   |  5 ++---
 6 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2f0775e43..d48d00691 100644
--- a/README.md
+++ b/README.md
@@ -635,7 +635,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
 | Llava-HF | [Llava-HF series models](https://huggingface.co/llava-hf) | English | 0.5B-110B | chat model |
 | Llava1.5<br>Llava1.6 | [Llava series models](https://github.com/haotian-liu/LLaVA) | English | 7B-34B | chat model |
 | Llava-Next<br>Llava-Next-Video | [Llava-Next series models](https://github.com/LLaVA-VL/LLaVA-NeXT) | Chinese<br>English | 7B-110B | chat model |
-| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 11B | chat model |
+| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl series models](https://github.com/X-PLUG/mPLUG-Owl) | English | 1B-11B | chat model |
 | InternVL<br>Mini-InternVL<br>InternVL2 | [InternVL](https://github.com/OpenGVLab/InternVL) | Chinese<br>English | 1B-40B<br>including quantized version | chat model |
 | Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | English | 8B | chat model |
 | Phi3-Vision | Microsoft | English | 4B | chat model |
diff --git a/README_CN.md b/README_CN.md
index 38c36ea04..d37fb894a 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -628,7 +628,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Llava-HF | [Llava-HF系列模型](https://huggingface.co/llava-hf) | 英文 | 0.5B-110B | chat模型 |
 | Llava1.5<br>Llava1.6 | [Llava系列模型](https://github.com/haotian-liu/LLaVA) | 英文 | 7B-34B | chat模型 |
 | Llava-Next<br>Llava-Next-Video | [Llava-Next系列模型](https://github.com/LLaVA-VL/LLaVA-NeXT) | 中文<br>英文 | 7B-110B | chat模型 |
-| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl系列模型](https://github.com/X-PLUG/mPLUG-Owl) | 英文 | 11B | chat模型 |
+| mPLUG-Owl2<br>mPLUG-Owl2.1<br>mPLUG-Owl3 | [mPLUG-Owl系列模型](https://github.com/X-PLUG/mPLUG-Owl) | 英文 | 1B-11B | chat模型 |
 | InternVL<br>Mini-InternVL<br>InternVL2 | [InternVL](https://github.com/OpenGVLab/InternVL) | 中文<br>英文 | 1B-40B<br>包含量化版本 | chat模型 |
 | Llava-llama3 | [xtuner](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers) | 英文 | 8B | chat模型 |
 | Phi3-Vision | 微软 | 英文 | 4B | chat模型 |
diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index b673f4ad7..85c82980e 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -506,6 +506,8 @@
 |pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
 |mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
 |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index fcacf3130..4cb94ec65 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -506,6 +506,8 @@ The table below introduces all models supported by SWIFT:
 |pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
 |mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
 |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
 |mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
 |phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
 |phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|✔|✔|✘|✘|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 07be42bb9..6acb24a0d 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -523,6 +523,8 @@ class ModelType:
     # owl
     mplug_owl2_chat = 'mplug-owl2-chat'  # llama
     mplug_owl2_1_chat = 'mplug-owl2_1-chat'  # qwen
+    mplug_owl3_1b_chat = 'mplug-owl3-1b-chat'
+    mplug_owl3_2b_chat = 'mplug-owl3-2b-chat'
     mplug_owl3_7b_chat = 'mplug-owl3-7b-chat'
     # yuan
     yuan2_2b_instruct = 'yuan2-2b-instruct'
@@ -2887,6 +2889,24 @@ def update(self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx
     return model, tokenizer
 
 
+@register_model(
+    ModelType.mplug_owl3_1b_chat,
+    'iic/mPLUG-Owl3-1B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-1B-241014')
+@register_model(
+    ModelType.mplug_owl3_2b_chat,
+    'iic/mPLUG-Owl3-2B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-2B-241014')
 @register_model(
     ModelType.mplug_owl3_7b_chat,
     'iic/mPLUG-Owl3-7B-240728',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 72ccbd56c..9adeb7394 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -3800,11 +3800,10 @@ def _get_image_token_list(self, cut_shape):
         processor = self.tokenizer.processor
         text = processor.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0], w=cut_shape[1])
         text_list = text.split('<|image|>')
-        if text_list[-1] == '':
-            text_list.pop()
         res_text_list = []
-        for text in text_list:
+        for text in text_list[:-1]:
             res_text_list += [text, '<|image|>']
+        res_text_list += text_list[-1]
         token_list = self._encode_context_list(res_text_list)[0]
         return token_list
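
Note on the swift/llm/utils/template.py hunk: the old code dropped a trailing
empty segment and then appended '<|image|>' after every remaining piece; the
new code interleaves '<|image|>' only between segments and keeps whatever text
follows the last placeholder. A minimal standalone sketch of the new
interleaving logic, using a made-up cut_prompt string in place of the real
cut_prompt_template() output:

    # Plain-Python sketch of the new interleaving; no swift imports needed.
    cut_prompt = '<|image|><|image|>\n<|image|>\n'   # hypothetical example
    text_list = cut_prompt.split('<|image|>')        # ['', '', '\n', '\n']
    res_text_list = []
    for text in text_list[:-1]:       # every piece that precedes an '<|image|>'
        res_text_list += [text, '<|image|>']
    res_text_list += text_list[-1]    # '+=' with a string extends the list
                                      # character by character; an empty tail
                                      # adds nothing
    print(res_text_list)
    # ['', '<|image|>', '', '<|image|>', '\n', '<|image|>', '\n']

Each element of the resulting list is then tokenized by _encode_context_list
and concatenated into token_list.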
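With these registrations in place, the 1B/2B checkpoints should load through
the same swift.llm utilities as the existing mplug-owl3-7b-chat entry. A hedged
usage sketch, assuming the v2-style Python API (get_model_tokenizer,
get_default_template_type, get_template, inference) applies to the new model
types exactly as it does to the 7B variant; the query and image URL below are
made up:

    import torch
    from swift.llm import (ModelType, get_default_template_type,
                           get_model_tokenizer, get_template, inference)

    model_type = ModelType.mplug_owl3_1b_chat        # or mplug_owl3_2b_chat
    template_type = get_default_template_type(model_type)  # 'mplug_owl3'
    model, tokenizer = get_model_tokenizer(model_type, torch.float16,
                                           model_kwargs={'device_map': 'auto'})
    template = get_template(template_type, tokenizer)
    # swift's multimodal convention: <image> tags in the query, files in images=
    query = '<image>Describe this picture.'
    images = ['https://example.com/cat.png']         # hypothetical URL
    response, history = inference(model, template, query, images=images)
    print(response)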