diff --git a/README.md b/README.md
index a62836f8a..14dd3dccb 100644
--- a/README.md
+++ b/README.md
@@ -284,6 +284,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 4dfc00e2d..ed8d2df54 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -20,6 +20,7 @@
 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index fca98bc9e..be632b92e 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -289,6 +289,23 @@
             'facebook/detr-resnet-50-panoptic',
         ],
     },
+    'dinov2': {
+        # Feature extraction
+        'feature-extraction': [
+            'facebook/dinov2-small',
+            'facebook/dinov2-base',
+            'facebook/dinov2-large',
+            # 'facebook/dinov2-giant', # TODO add
+        ],
+
+        # Image classification
+        'image-classification': [
+            'facebook/dinov2-small-imagenet1k-1-layer',
+            'facebook/dinov2-base-imagenet1k-1-layer',
+            'facebook/dinov2-large-imagenet1k-1-layer',
+            # 'facebook/dinov2-giant-imagenet1k-1-layer', # TODO add
+        ],
+    },
     'distilbert': {
         # Feature extraction
         'feature-extraction': [
diff --git a/src/models.js b/src/models.js
index 48fab2f08..f4e8a8b5d 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3605,6 +3605,28 @@ export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel
 }
 //////////////////////////////////////////////////
 
+//////////////////////////////////////////////////
+export class Dinov2PreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.
+ */
+export class Dinov2Model extends Dinov2PreTrainedModel { }
+
+/**
+ * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet.
+ */
+export class Dinov2ForImageClassification extends Dinov2PreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+
 //////////////////////////////////////////////////
 export class YolosPreTrainedModel extends PreTrainedModel { }
 export class YolosModel extends YolosPreTrainedModel { }
@@ -4296,6 +4318,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['deit', ['DeiTModel', DeiTModel]],
     ['convnext', ['ConvNextModel', ConvNextModel]],
     ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]],
+    ['dinov2', ['Dinov2Model', Dinov2Model]],
     ['resnet', ['ResNetModel', ResNetModel]],
     ['swin', ['SwinModel', SwinModel]],
     ['swin2sr', ['Swin2SRModel', Swin2SRModel]],
@@ -4450,6 +4473,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['deit', ['DeiTForImageClassification', DeiTForImageClassification]],
     ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]],
     ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]],
+    ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]],
     ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
     ['swin', ['SwinForImageClassification', SwinForImageClassification]],
 ]);
diff --git a/src/processors.js b/src/processors.js
index a6f1351d1..8750894a2 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -606,6 +606,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
 
 }
 
+export class BitImageProcessor extends ImageFeatureExtractor { }
 export class DPTFeatureExtractor extends ImageFeatureExtractor { }
 export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
 export class CLIPFeatureExtractor extends ImageFeatureExtractor { }
@@ -1652,6 +1653,7 @@ export class AutoProcessor {
         CLIPFeatureExtractor,
         ConvNextFeatureExtractor,
         ConvNextImageProcessor,
+        BitImageProcessor,
         DPTFeatureExtractor,
         GLPNFeatureExtractor,
         BeitFeatureExtractor,
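For context, here is a minimal sketch (not part of the diff) of the end-to-end usage this change enables through the high-level `pipeline` API. The checkpoint id `Xenova/dinov2-small-imagenet1k-1-layer` and the image path are illustrative placeholders, assuming a converted ONNX checkpoint is available on the Hub:

```js
// Minimal sketch: DINOv2 image classification via the pipeline API.
// 'Xenova/dinov2-small-imagenet1k-1-layer' is a hypothetical converted checkpoint.
import { pipeline } from '@xenova/transformers';

// Loads the BitImageProcessor and Dinov2ForImageClassification added above.
const classifier = await pipeline(
    'image-classification',
    'Xenova/dinov2-small-imagenet1k-1-layer',
);

// Any local path or URL works here; the path is a placeholder.
const output = await classifier('path/to/tiger.jpg');
console.log(output); // e.g. [{ label: 'tiger, Panthera tigris', score: ... }]
```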
diff --git a/tests/processors.test.js b/tests/processors.test.js
index c6703f14e..73d5b3e86 100644
--- a/tests/processors.test.js
+++ b/tests/processors.test.js
@@ -43,6 +43,7 @@ describe('Processors', () => {
         nougat: 'facebook/nougat-small',
         owlvit: 'google/owlvit-base-patch32',
         clip: 'openai/clip-vit-base-patch16',
+        dinov2: 'facebook/dinov2-small-imagenet1k-1-layer',
     }
 
     const TEST_IMAGES = {
@@ -336,6 +337,22 @@ describe('Processors', () => {
             compare(reshaped_input_sizes, [[224, 224]]);
         }
     }, MAX_TEST_EXECUTION_TIME);
+
+    // BitImageProcessor
+    it(MODELS.dinov2, async () => {
+        const processor = await AutoProcessor.from_pretrained(m(MODELS.dinov2))
+
+        {
+            const image = await load_image(TEST_IMAGES.tiger);
+            const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+            compare(pixel_values.dims, [1, 3, 224, 224]);
+            compare(avg(pixel_values.data), 0.06262318789958954);
+
+            compare(original_sizes, [[408, 612]]);
+            compare(reshaped_input_sizes, [[224, 224]]);
+        }
+    }, MAX_TEST_EXECUTION_TIME);
 });
 
 describe('Audio processors', () => {
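Similarly, for the `feature-extraction` side (the bare `Dinov2Model`), a sketch mirroring what the new processor test exercises; again the checkpoint id `Xenova/dinov2-small` is a placeholder for a converted ONNX model, and the output shape is an expectation for the small variant, not a verified result:

```js
// Minimal sketch: extracting DINOv2 features with the bare model.
import { AutoProcessor, AutoModel, RawImage } from '@xenova/transformers';

const processor = await AutoProcessor.from_pretrained('Xenova/dinov2-small');
const model = await AutoModel.from_pretrained('Xenova/dinov2-small');

const image = await RawImage.read('path/to/tiger.jpg');     // placeholder path
const { pixel_values } = await processor(image);            // dims [1, 3, 224, 224], as asserted in the test above
const { last_hidden_state } = await model({ pixel_values });
console.log(last_hidden_state.dims);                        // expected e.g. [1, 257, 384]: 256 patches + [CLS], hidden size 384
```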