support mplug3
Jintao-Huang committed Oct 17, 2024
1 parent 82e2522 commit e0ff7b9
Showing 4 changed files with 26 additions and 3 deletions.
2 changes: 2 additions & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -506,6 +506,8 @@
|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
|phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
|phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
2 changes: 2 additions & 0 deletions docs/source_en/Instruction/Supported-models-datasets.md
@@ -506,6 +506,8 @@ The table below introduces all models supported by SWIFT:
|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
+|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
+|mplug-owl3-2b-chat|[iic/mPLUG-Owl3-2B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-2B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-2B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
|phi3-vision-128k-instruct|[LLM-Research/Phi-3-vision-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-vision-128k-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|vision|[microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)|
|phi3_5-vision-instruct|[LLM-Research/Phi-3.5-vision-instruct](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct/summary)|^(model.layers\|model.vision_embed_tokens.img_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|phi3-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.36|vision|[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)|
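The `lora_target_modules` column for the new mplug-owl3 rows is a regex (the backslashes above only escape the markdown table pipes): it selects submodules under `language_model` or `vision2text_model`, with a negative lookahead excluding heads and embeddings. A minimal sketch of how such a pattern filters module names; the candidate names below are illustrative, not taken from the actual checkpoint:

```python
import re

# The LoRA target pattern from the table, with markdown escapes removed.
pattern = re.compile(r'^(language_model|vision2text_model)'
                     r'(?!.*(lm_head|output|emb|wte|shared)).*')

# Hypothetical module names, standing in for model.named_modules().
candidates = [
    'language_model.layers.0.self_attn.q_proj',  # under language_model -> match
    'vision2text_model.cross_attn.v_proj',       # under vision2text_model -> match
    'language_model.lm_head',                    # excluded by the negative lookahead
    'vision_model.patch_embed',                  # outside both prefixes -> no match
]
for name in candidates:
    print(f'{name}: {bool(pattern.match(name))}')
```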
20 changes: 20 additions & 0 deletions swift/llm/utils/model.py
@@ -523,6 +523,8 @@ class ModelType:
    # owl
    mplug_owl2_chat = 'mplug-owl2-chat'  # llama
    mplug_owl2_1_chat = 'mplug-owl2_1-chat'  # qwen
+    mplug_owl3_1b_chat = 'mplug-owl3-1b-chat'
+    mplug_owl3_2b_chat = 'mplug-owl3-2b-chat'
    mplug_owl3_7b_chat = 'mplug-owl3-7b-chat'
    # yuan
    yuan2_2b_instruct = 'yuan2-2b-instruct'
@@ -2887,6 +2889,24 @@ def update(self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx
    return model, tokenizer


+@register_model(
+    ModelType.mplug_owl3_1b_chat,
+    'iic/mPLUG-Owl3-1B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-1B-241014')
+@register_model(
+    ModelType.mplug_owl3_2b_chat,
+    'iic/mPLUG-Owl3-2B-241014',
+    LoRATM.mplug_owl3,
+    TemplateType.mplug_owl3,
+    requires=['transformers>=4.36', 'icecream'],  # decord
+    support_flash_attn=True,
+    tags=['multi-modal', 'vision', 'video'],
+    hf_model_id='mPLUG/mPLUG-Owl3-2B-241014')
@register_model(
    ModelType.mplug_owl3_7b_chat,
    'iic/mPLUG-Owl3-7B-240728',
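The two new `@register_model` decorators stack on top of the existing 7B one, so all three model types wrap the same loader function, each adding one entry to SWIFT's model registry. A simplified sketch of the decorator-registry pattern in use here; the signature is reduced for illustration and is not SWIFT's exact API:

```python
from typing import Any, Callable, Dict

MODEL_MAPPING: Dict[str, Dict[str, Any]] = {}

def register_model(model_type: str, model_id: str, **kwargs: Any) -> Callable:
    """Store the model's metadata under its type, then return the loader unchanged."""
    def wrapper(get_function: Callable) -> Callable:
        MODEL_MAPPING[model_type] = {
            'model_id': model_id,
            'get_function': get_function,
            **kwargs,
        }
        return get_function
    return wrapper

# Stacking the decorator registers several model types against one loader,
# which is how the 1B/2B/7B mPLUG-Owl3 variants share a single function.
@register_model('mplug-owl3-1b-chat', 'iic/mPLUG-Owl3-1B-241014', support_flash_attn=True)
@register_model('mplug-owl3-2b-chat', 'iic/mPLUG-Owl3-2B-241014', support_flash_attn=True)
def get_model_tokenizer_mplug_owl3(model_dir: str, **kwargs: Any):
    ...  # load the model and tokenizer from model_dir

print(sorted(MODEL_MAPPING))  # ['mplug-owl3-1b-chat', 'mplug-owl3-2b-chat']
```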
5 changes: 2 additions & 3 deletions swift/llm/utils/template.py
@@ -3800,11 +3800,10 @@ def _get_image_token_list(self, cut_shape):
        processor = self.tokenizer.processor
        text = processor.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0], w=cut_shape[1])
        text_list = text.split('<|image|>')
-        if text_list[-1] == '':
-            text_list.pop()
        res_text_list = []
-        for text in text_list:
+        for text in text_list[:-1]:
            res_text_list += [text, '<|image|>']
+        res_text_list += text_list[-1]
        token_list = self._encode_context_list(res_text_list)[0]
        return token_list

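The `template.py` change alters how `_get_image_token_list` interleaves text with `<|image|>` tokens: the old code appended an image token after every split segment (popping only an empty tail), so a non-empty trailing text segment picked up a spurious `<|image|>`; the new code emits the token only between segments and keeps the tail as plain text. A standalone sketch of the new interleaving, assuming a plain string input (note the committed line uses `+= text_list[-1]`, which extends the list with the tail string's individual characters; `append` is used here for clarity):

```python
def interleave_image_tokens(text: str) -> list:
    """Rebuild a cut-prompt template as a list of text pieces and
    '<|image|>' markers, one marker between consecutive pieces."""
    text_list = text.split('<|image|>')
    res_text_list = []
    for segment in text_list[:-1]:
        res_text_list += [segment, '<|image|>']
    res_text_list.append(text_list[-1])  # tail text, no trailing image token
    return res_text_list

print(interleave_image_tokens('<row 1><|image|><row 2><|image|>'))
# -> ['<row 1>', '<|image|>', '<row 2>', '<|image|>', '']
print(interleave_image_tokens('a<|image|>b'))
# -> ['a', '<|image|>', 'b']  (the old code would have appended a spurious '<|image|>')
```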
