diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index c591b2b..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: πŸ“š Docs WorkFlow - -on: - push: - branches: [main, develop] - -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - name: πŸ›ŽοΈ Checkout - uses: actions/checkout@v4 - - name: 🐍 Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.x - - name: πŸ“¦ Install dependencies - run: | - pip install mkdocs-material "mkdocstrings[python]" - - name: πŸš€ Deploy Docs - run: mkdocs gh-deploy --force diff --git a/.github/workflows/maestro-tests.yml b/.github/workflows/maestro-tests.yml index 0a27b43..df80a17 100644 --- a/.github/workflows/maestro-tests.yml +++ b/.github/workflows/maestro-tests.yml @@ -1,4 +1,4 @@ -name: Test WorkFlow +name: Maestro PyTest WorkFlow πŸ§ͺ on: pull_request: @@ -40,5 +40,5 @@ jobs: pip install . pip install pytest - - name: πŸ§ͺ Test + - name: πŸ§ͺ Run Tests run: "python -m pytest ./test" diff --git a/.github/workflows/publish-dev-docs.yml b/.github/workflows/publish-dev-docs.yml new file mode 100644 index 0000000..d584941 --- /dev/null +++ b/.github/workflows/publish-dev-docs.yml @@ -0,0 +1,39 @@ +name: Maestro Develop Documentation Workflow πŸ“š + +on: + push: + branches: + - develop + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name == 'push' && github.ref}} + cancel-in-progress: true + +permissions: + contents: write + pages: write + pull-requests: write + + +jobs: + maestro-dev-docs: + runs-on: ubuntu-latest + steps: + - name: πŸ”„ Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: 🐍 Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: πŸ“¦ Install Packages + run: pip install "mkdocs-material" "mkdocstrings[python]" "mkdocs-material[imaging]" mike + - name: βš™οΈ Configure git for github-actions + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + - name: πŸš€ Deploy MkDoc-Material with mike + run: | + MKDOCS_GIT_COMMITTERS_APIKEY=${{ secrets.GITHUB_TOKEN }} mike deploy --push develop diff --git a/.github/workflows/publish-release-docs.yml b/.github/workflows/publish-release-docs.yml new file mode 100644 index 0000000..7ff5cb3 --- /dev/null +++ b/.github/workflows/publish-release-docs.yml @@ -0,0 +1,41 @@ +name: Maestro Release Documentation Workflow πŸ“š +on: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name == 'push' && github.ref}} + cancel-in-progress: true + +permissions: + contents: write + pages: write + pull-requests: write + + +jobs: + maestro-release-docs: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - name: πŸ›ŽοΈ Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref }} + + - name: 🐍 Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: πŸ“¦ Install Packages + run: pip install "mkdocs-material" "mkdocstrings[python]" "mkdocs-material[imaging]" mike + - name: βš™οΈ Configure git for github-actions πŸ‘· + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + - name: πŸš€ Deploy MkDoc-Material πŸ“š + run: | + latest_tag=$(git describe --tags `git rev-list --tags 
--max-count=1`) + MKDOCS_GIT_COMMITTERS_APIKEY=${{ secrets.GITHUB_TOKEN }} mike deploy --push --update-aliases $latest_tag latest diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 4d74d52..d4a9414 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -1,8 +1,9 @@ -name: Multimodal Maestro Releases to PyPi +name: Maestro Releases to PyPi on: push: tags: - '[0-9]+.[0-9]+[0-9]+.[0-9]' + - '[0-9]+.[0-9]+.[0-9]' workflow_dispatch: @@ -27,14 +28,12 @@ jobs: python -m pip install --upgrade build twine python -m build twine check --strict dist/* - - name: πŸš€ Publish to PyPi - uses: pypa/gh-action-pypi-publish@release/v1 + - name: πŸš€ Publish to PyPi - Release + uses: pypa/gh-action-pypi-publish@release/v1.10 with: - user: ${{ secrets.PYPI_USERNAME }} - password: ${{ secrets.PYPI_PASSWORD }} - - name: πŸš€ Publish to Test-PyPi - uses: pypa/gh-action-pypi-publish@release/v1 + password: ${{ secrets.PYPI_API_TOKEN }} + - name: πŸš€ Publish to Test-PyPi - Release + uses: pypa/gh-action-pypi-publish@release/v1.10 with: repository-url: https://test.pypi.org/legacy/ - user: ${{ secrets.PYPI_TEST_USERNAME }} - password: ${{ secrets.PYPI_TEST_PASSWORD }} + password: ${{ secrets.TEST_PYPI_API_TOKEN }} diff --git a/.github/workflows/pypi-test-publish.yml b/.github/workflows/pypi-test-publish.yml index b6e2d4d..40e6a10 100644 --- a/.github/workflows/pypi-test-publish.yml +++ b/.github/workflows/pypi-test-publish.yml @@ -1,10 +1,13 @@ -name: Multimodal Maestro Test Releases to PyPi +name: Maestro Test Releases to PyPi on: push: tags: - '[0-9]+.[0-9]+[0-9]+.[0-9]+a[0-9]' - '[0-9]+.[0-9]+[0-9]+.[0-9]+b[0-9]' - '[0-9]+.[0-9]+[0-9]+.[0-9]+rc[0-9]' + - '[0-9]+.[0-9]+.[0-9]+a[0-9]' + - '[0-9]+.[0-9]+.[0-9]+b[0-9]' + - '[0-9]+.[0-9]+.[0-9]+rc[0-9]' workflow_dispatch: @@ -30,14 +33,12 @@ jobs: python -m pip install --upgrade build twine python -m build twine check --strict dist/* - - name: πŸš€ Publish distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + - name: πŸš€ Publish to PyPi - Prelease + uses: pypa/gh-action-pypi-publish@release/v1.10 with: - user: ${{ secrets.PYPI_USERNAME }} - password: ${{ secrets.PYPI_PASSWORD }} - - name: πŸš€ Publish to Test-PyPi - uses: pypa/gh-action-pypi-publish@release/v1 + password: ${{ secrets.PYPI_API_TOKEN }} + - name: πŸš€ Publish to Test-PyPi - Prelease + uses: pypa/gh-action-pypi-publish@release/v1.10 with: repository-url: https://test.pypi.org/legacy/ - user: ${{ secrets.PYPI_TEST_USERNAME }} - password: ${{ secrets.PYPI_TEST_PASSWORD }} + password: ${{ secrets.TEST_PYPI_API_TOKEN }} diff --git a/README.md b/README.md index 8fcbc32..4f3eccc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ **maestro** is a tool designed to streamline and accelerate the fine-tuning process for multimodal models. It provides ready-to-use recipes for fine-tuning popular vision-language models (VLMs) such as **Florence-2**, **PaliGemma**, and -**Phi-3.5 Vision** on downstream vision-language tasks. +**Qwen2-VL** on downstream vision-language tasks. 
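A note on the tag filters in `pypi-publish.yml` and `pypi-test-publish.yml` above: in GitHub Actions glob syntax, `+` means "one or more of the preceding character" and `.` is literal, so the pre-existing pattern `[0-9]+.[0-9]+[0-9]+.[0-9]` only matches tags whose second component has at least two digits, which is presumably why the conventional three-part patterns are added alongside it. The snippet below is a rough regex approximation for checking sample tags locally; it is illustrative only and not part of this change.

```python
import re

# Rough regex equivalents of the workflow tag globs (an approximation of
# GitHub Actions filter syntax, where "+" repeats the preceding character).
old_release = re.compile(r"^[0-9]+\.[0-9]+[0-9]+\.[0-9]$")  # pattern kept from before
new_release = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]$")        # pattern added in this diff

for tag in ["0.2.0", "1.12.3", "1.2.34"]:
    print(tag, bool(old_release.match(tag)), bool(new_release.match(tag)))
# 0.2.0  -> old: False, new: True   (a single-digit second component never matched before)
# 1.12.3 -> old: True,  new: True
# 1.2.34 -> old: False, new: False  (the last component is a single digit in both patterns)
```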
## πŸ’» install @@ -40,9 +40,9 @@ arguments as the CLI example above: ```python from maestro.trainer.common import MeanAveragePrecisionMetric -from maestro.trainer.models.florence_2 import train, TrainingConfiguration +from maestro.trainer.models.florence_2 import train, Configuration -config = TrainingConfiguration( +config = Configuration( dataset='', epochs=10, batch_size=8, diff --git a/docs/florence-2.md b/docs/florence-2.md new file mode 100644 index 0000000..cb31b14 --- /dev/null +++ b/docs/florence-2.md @@ -0,0 +1,219 @@ +## Overview + +Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the +MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities +across tasks such as captioning, object detection, grounding, and segmentation. + + +*Florence-2: Fine-tune Microsoft’s Multimodal Model.* + +## Architecture + +The model takes images and task prompts as input, generating the desired results in +text format. It uses a DaViT vision encoder to convert images into visual token +embeddings. These are then concatenated with BERT-generated text embeddings and +processed by a transformer-based multi-modal encoder-decoder to generate the response. + +![florence-2-architecture](https://storage.googleapis.com/com-roboflow-marketing/maestro/florence-2-architecture.webp) +*Overview of Florence-2 architecture. Source: Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks.* + + +## Fine-tuning Examples + +### Dataset Format + +The Florence-2 model expects a specific dataset structure for training and evaluation. +The dataset should be organized into train, test, and validation splits, with each +split containing image files and an `annotations.jsonl` file. + +``` +dataset/ +β”œβ”€β”€ train/ +β”‚ β”œβ”€β”€ 123e4567-e89b-12d3-a456-426614174000.png +β”‚ β”œβ”€β”€ 987f6543-a21c-43c3-a562-926514273001.png +β”‚ β”œβ”€β”€ ... +β”‚ β”œβ”€β”€ annotations.jsonl +β”œβ”€β”€ test/ +β”‚ β”œβ”€β”€ 456b7890-e32d-44f5-b678-724564172002.png +β”‚ β”œβ”€β”€ 678c1234-e45b-67f6-c789-813264172003.png +β”‚ β”œβ”€β”€ ... +β”‚ β”œβ”€β”€ annotations.jsonl +└── valid/ + β”œβ”€β”€ 789d2345-f67c-89d7-e891-912354172004.png + β”œβ”€β”€ 135e6789-d89f-12e3-f012-456464172005.png + β”œβ”€β”€ ... + └── annotations.jsonl +``` + +Depending on the vision task being performed, the structure of the `annotations.jsonl` +file will vary slightly. + +!!! warning + The dataset samples shown below are formatted for improved readability, with each + JSON structure spread across multiple lines. In practice, the `annotations.jsonl` + file must contain each JSON object on a single line, without any line breaks + between the key-value pairs. Make sure to adhere to this structure to avoid parsing + errors during model training. + +=== "Object Detection" + + ```txt + { + "image":"123e4567-e89b-12d3-a456-426614174000.png", + "prefix":"", + "suffix":"9 of clubs10 of clubs" + } + { + "image":"987f6543-a21c-43c3-a562-926514273001.png", + "prefix":"", + "suffix":"5 of clubs6 of clubs" + } + ... + ``` + +=== "Visual Question Answering (VQA)" + + ```txt + { + "image":"123e4567-e89b-12d3-a456-426614174000.png", + "prefix":" Is the value of Favorable 38 in 2015?", + "suffix":"Yes" + } + { + "image":"987f6543-a21c-43c3-a562-926514273001.png", + "prefix":" How many values are below 40 in Unfavorable graph?", + "suffix":"6" + } + ... 
+ ``` + +=== "Object Character Recognition (OCR)" + + ```txt + { + "image":"123e4567-e89b-12d3-a456-426614174000.png", + "prefix":"", + "suffix":"ke begherte Die mi" + } + { + "image":"987f6543-a21c-43c3-a562-926514273001.png", + "prefix":"", + "suffix":"mi uort in de middelt" + } + ... + ``` + +### CLI + +!!! tip + Depending on the GPU you are using, you may need to adjust the `batch-size` to + ensure that your model trains within memory limits. For larger GPUs with more + memory, you can increase the batch size for better performance. + +!!! tip + Depending on the vision task you are executing, you may need to select different + vision metrics. For example, tasks like object detection typically use + `mean_average_precision`, while VQA and OCR tasks use metrics like + `word_error_rate` and `character_error_rate`. + +!!! tip + You may need to use different learning rates depending on the task. We have found + that lower learning rates work better for tasks like OCR or VQA, as these tasks + require more precision. + + +=== "Object Detection" + + ```bash + maestro florence2 train --dataset='' \ + --epochs=10 --batch-size=8 --lr=5e-6 --metrics=mean_average_precision + ``` + +=== "Visual Question Answering (VQA)" + + ```bash + maestro florence2 train --dataset='' \ + --epochs=10 --batch-size=8 --lr=1e-6 \ + --metrics=word_error_rate, character_error_rate + ``` + +=== "Object Character Recognition (OCR)" + + ```bash + maestro florence2 train --dataset='' \ + --epochs=10 --batch-size=8 --lr=1e-6 \ + --metrics=word_error_rate, character_error_rate + ``` + +### SDK + +=== "Object Detection" + + ```python + from maestro.trainer.common import MeanAveragePrecisionMetric + from maestro.trainer.models.florence_2 import train, Configuration + + config = Configuration( + dataset='', + epochs=10, + batch_size=8, + lr=5e-6, + metrics=[MeanAveragePrecisionMetric()] + ) + + train(config) + ``` + +=== "Visual Question Answering (VQA)" + + ```python + from maestro.trainer.common import WordErrorRateMetric, CharacterErrorRateMetric + from maestro.trainer.models.florence_2 import train, Configuration + + config = Configuration( + dataset='', + epochs=10, + batch_size=8, + lr=1e-6, + metrics=[WordErrorRateMetric(), CharacterErrorRateMetric()] + ) + + train(config) + ``` + +=== "Object Character Recognition (OCR)" + + ```python + from maestro.trainer.common import WordErrorRateMetric, CharacterErrorRateMetric + from maestro.trainer.models.florence_2 import train, Configuration + + config = Configuration( + dataset='', + epochs=10, + batch_size=8, + lr=1e-6, + metrics=[WordErrorRateMetric(), CharacterErrorRateMetric()] + ) + + train(config) + ``` + +## API + + + +:::maestro.trainer.models.florence_2.core.Configuration + +
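The API entries in this section cover `Configuration`, `train`, and `evaluate`. The fine-tuning examples above only call `train`, so here is a minimal sketch of running evaluation with the same kind of configuration object, assuming the remaining `Configuration` fields keep their defaults; the dataset path is a placeholder.

```python
from maestro.trainer.common import CharacterErrorRateMetric, WordErrorRateMetric
from maestro.trainer.models.florence_2 import Configuration
from maestro.trainer.models.florence_2.core import evaluate

# Reuse the same dataset layout and metrics as in the OCR example above.
config = Configuration(
    dataset='<DATASET_PATH>',
    epochs=10,  # training-only fields are ignored by evaluate
    batch_size=8,
    metrics=[WordErrorRateMetric(), CharacterErrorRateMetric()]
)

# Runs predictions on the evaluation split, computes the configured metrics,
# and writes the results to <output_dir>/metrics/evaluation.json.
evaluate(config)
```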
+

train

+
+ +:::maestro.trainer.models.florence_2.core.train + + + +:::maestro.trainer.models.florence_2.core.evaluate diff --git a/docs/get_started.md b/docs/get_started.md deleted file mode 100644 index efa305f..0000000 --- a/docs/get_started.md +++ /dev/null @@ -1 +0,0 @@ -## 🚧 under construction diff --git a/docs/index.md b/docs/index.md index fe5b5e8..e66b525 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,24 +1,51 @@ -## πŸ‘‹ hello +
-Multimodal-Maestro gives you more control over large multimodal models to get the
-outputs you want. With more effective prompting tactics, you can get multimodal models
-to do tasks you didn't know (or think!) were possible. Curious how it works? Try our
-HF [space](https://huggingface.co/spaces/Roboflow/SoM)!
+
+# maestro
+
-🚧 The project is still under construction and the API is prone to change.
+
+coming: when it's ready...
+
-## πŸ’» install
+
-⚠️ Our package has been renamed to `maestro`. Install package in a -[**3.11>=Python>=3.8**](https://www.python.org/) environment. +**maestro** is a tool designed to streamline and accelerate the fine-tuning process for +multimodal models. It provides ready-to-use recipes for fine-tuning popular +vision-language models (VLMs) such as **Florence-2**, **PaliGemma**, and +**Qwen2-VL** on downstream vision-language tasks. + +## install + +Pip install the supervision package in a +[**Python>=3.8**](https://www.python.org/) environment. ```bash pip install maestro ``` -## 🦸 contribution +## quickstart + +### CLI + +VLMs can be fine-tuned on downstream tasks directly from the command line with +`maestro` command: + +```bash +maestro florence2 train --dataset='' --epochs=10 --batch-size=8 +``` + +### SDK + +Alternatively, you can fine-tune VLMs using the Python SDK, which accepts the same +arguments as the CLI example above: -We would love your help in making this repository even better! If you noticed any bug, -or if you have any suggestions for improvement, feel free to open an -[issue](https://github.com/roboflow/multimodal-maestro/issues) or submit a -[pull request](https://github.com/roboflow/multimodal-maestro/pulls). +```python +from maestro.trainer.common import MeanAveragePrecisionMetric +from maestro.trainer.models.florence_2 import train, Configuration + +config = Configuration( + dataset='', + epochs=10, + batch_size=8, + metrics=[MeanAveragePrecisionMetric()] +) + +train(config) +``` diff --git a/docs/lmms.md b/docs/lmms.md deleted file mode 100644 index f114eef..0000000 --- a/docs/lmms.md +++ /dev/null @@ -1,3 +0,0 @@ -## GPT-4 Vision - -:::maestro.lmms.gpt4.prompt_image diff --git a/docs/markers.md b/docs/markers.md deleted file mode 100644 index 72f4bd6..0000000 --- a/docs/markers.md +++ /dev/null @@ -1,3 +0,0 @@ -## Segment Anything - -:::maestro.markers.sam.SegmentAnythingMarkGenerator diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 0000000..823b5cd --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,17 @@ + + +:::maestro.trainer.common.MeanAveragePrecisionMetric + + + +:::maestro.trainer.common.WordErrorRateMetric + + + +:::maestro.trainer.common.CharacterErrorRateMetric diff --git a/docs/tasks.md b/docs/tasks.md new file mode 100644 index 0000000..c9943ae --- /dev/null +++ b/docs/tasks.md @@ -0,0 +1,11 @@ +## Object Detection + +Object Detection is a core computer vision task where a model is trained to identify and locate multiple objects within an image by drawing bounding boxes around them. In the context of Vision-Language Models (VLMs), object detection is enhanced by the model's ability to not only recognize objects but also describe them in natural language. VLMs can provide additional context by naming objects, detailing attributes (such as color, size, or type), and offering richer descriptions of the scene. This fusion of vision and language supports more detailed and semantically aware detection, where object recognition can be linked to more complex visual understanding tasks. + +## Visual Question Answering (VQA) + +Visual Question Answering (VQA) merges vision and language by requiring a model to analyze an image and answer questions about its content. VLMs excel in VQA because they jointly understand both the visual components of an image and the linguistic details of the question. This allows the model to perform tasks like answering "How many dogs are there?" or "Is the person in the image wearing glasses?" with high accuracy. 
VQA is a key task for VLMs, demonstrating their ability to reason about complex visual scenes while considering natural language prompts. + +## Object Character Recognition (OCR) + +Object Character Recognition (OCR) involves detecting and recognizing text within an image, often from signs, documents, or other real-world scenes. With VLMs, OCR capabilities go beyond simple text extraction. These models understand the context in which the text appears, enabling them to answer questions, perform translations, or incorporate textual information into broader visual tasks. This contextual awareness makes VLMs particularly adept at handling tasks like reading and interpreting text embedded in images, and answering questions like "What does the sign say?" or "Translate the text in the image." diff --git a/docs/visualizers.md b/docs/visualizers.md deleted file mode 100644 index 94d8fff..0000000 --- a/docs/visualizers.md +++ /dev/null @@ -1,3 +0,0 @@ -## MarkVisualizer - -:::maestro.visualizers.MarkVisualizer diff --git a/maestro/trainer/common/__init__.py b/maestro/trainer/common/__init__.py index 0bfe6b2..c54680c 100644 --- a/maestro/trainer/common/__init__.py +++ b/maestro/trainer/common/__init__.py @@ -1 +1,5 @@ -from maestro.trainer.common.utils.metrics import MeanAveragePrecisionMetric +from maestro.trainer.common.utils.metrics import ( + CharacterErrorRateMetric, + MeanAveragePrecisionMetric, + WordErrorRateMetric, +) diff --git a/maestro/trainer/common/data_loaders/datasets.py b/maestro/trainer/common/data_loaders/datasets.py index 62bd05e..7e0e316 100644 --- a/maestro/trainer/common/data_loaders/datasets.py +++ b/maestro/trainer/common/data_loaders/datasets.py @@ -30,7 +30,7 @@ def __getitem__(self, idx: int) -> tuple[Image.Image, dict[str, Any]]: entry = self.entries[idx] image_path = os.path.join(self.image_directory_path, entry["image"]) try: - image = Image.open(image_path) + image = Image.open(image_path).convert("RGB") except FileNotFoundError: raise FileNotFoundError(f"Image file {image_path} not found.") else: diff --git a/maestro/trainer/common/utils/metrics.py b/maestro/trainer/common/utils/metrics.py index d855199..184fdbf 100644 --- a/maestro/trainer/common/utils/metrics.py +++ b/maestro/trainer/common/utils/metrics.py @@ -11,6 +11,7 @@ import matplotlib.pyplot as plt import supervision as sv +from jiwer import cer, wer from PIL import Image from supervision.metrics.mean_average_precision import MeanAveragePrecision @@ -45,7 +46,13 @@ def compute(self, targets: list[Any], predictions: list[Any]) -> dict[str, float class MeanAveragePrecisionMetric(BaseMetric): - """A class used to compute the Mean Average Precision (mAP) metric.""" + """A class used to compute the Mean Average Precision (mAP) metric. + + mAP is a popular metric for object detection tasks, measuring the average precision + across all classes and IoU thresholds. + """ + + name = "mean_average_precision" def describe(self) -> list[str]: """Returns a list of metric names that this class will compute. @@ -70,6 +77,88 @@ def compute(self, targets: list[sv.Detections], predictions: list[sv.Detections] return {"map50:95": result.map50_95, "map50": result.map50, "map75": result.map75} +class WordErrorRateMetric(BaseMetric): + """A class used to compute the Word Error Rate (WER) metric. + + WER measures the edit distance between predicted and reference transcriptions + at the word level, commonly used in speech recognition and machine translation. 
+ """ + + name = "word_error_rate" + + def describe(self) -> list[str]: + """Returns a list of metric names that this class will compute. + + Returns: + List[str]: A list of metric names. + """ + return ["wer"] + + def compute(self, targets: list[str], predictions: list[str]) -> dict[str, float]: + """Computes the WER metric based on the targets and predictions. + + Args: + targets (List[str]): The ground truth texts. + predictions (List[str]): The predicted texts. + + Returns: + Dict[str, float]: A dictionary of computed WER metrics with metric names as + keys and their values. + """ + if len(targets) != len(predictions): + raise ValueError("The number of targets and predictions must be the same.") + + total_wer = 0.0 + count = len(targets) + + for target, prediction in zip(targets, predictions): + total_wer += wer(target, prediction) + + average_wer = total_wer / count if count > 0 else 0.0 + return {"wer": average_wer} + + +class CharacterErrorRateMetric(BaseMetric): + """A class used to compute the Character Error Rate (CER) metric. + + CER is similar to WER but operates at the character level, making it useful for + tasks like optical character recognition (OCR) and handwriting recognition. + """ + + name = "character_error_rate" + + def describe(self) -> list[str]: + """Returns a list of metric names that this class will compute. + + Returns: + List[str]: A list of metric names. + """ + return ["cer"] + + def compute(self, targets: list[str], predictions: list[str]) -> dict[str, float]: + """Computes the CER metric based on the targets and predictions. + + Args: + targets (List[str]): The ground truth texts. + predictions (List[str]): The predicted texts. + + Returns: + Dict[str, float]: A dictionary of computed CER metrics with metric names as + keys and their values. 
+ """ + if len(targets) != len(predictions): + raise ValueError("The number of targets and predictions must be the same.") + + total_cer = 0.0 + count = len(targets) + + for target, prediction in zip(targets, predictions): + total_cer += cer(target, prediction) + + average_cer = total_cer / count if count > 0 else 0.0 + return {"cer": average_cer} + + class MetricsTracker: @classmethod def init(cls, metrics: list[str]) -> MetricsTracker: diff --git a/maestro/trainer/models/florence_2/__init__.py b/maestro/trainer/models/florence_2/__init__.py index 77d45b3..62aab7d 100644 --- a/maestro/trainer/models/florence_2/__init__.py +++ b/maestro/trainer/models/florence_2/__init__.py @@ -1 +1 @@ -from maestro.trainer.models.florence_2.core import TrainingConfiguration, train +from maestro.trainer.models.florence_2.core import Configuration, train diff --git a/maestro/trainer/models/florence_2/core.py b/maestro/trainer/models/florence_2/core.py index 4b6e871..d25c18a 100644 --- a/maestro/trainer/models/florence_2/core.py +++ b/maestro/trainer/models/florence_2/core.py @@ -26,18 +26,19 @@ CheckpointManager, load_model, ) -from maestro.trainer.models.florence_2.data_loading import prepare_data_loaders +from maestro.trainer.models.florence_2.inference import run_predictions +from maestro.trainer.models.florence_2.loaders import create_data_loaders from maestro.trainer.models.florence_2.metrics import ( - extract_unique_detection_dataset_classes, - postprocess_florence2_output_for_mean_average_precision, - run_predictions, + get_unique_detection_classes, + process_output_for_detection_metric, + process_output_for_text_metric, ) from maestro.trainer.models.paligemma.training import LoraInitLiteral @dataclass(frozen=True) -class TrainingConfiguration: - """Configuration for training a Florence-2 model. +class Configuration: + """Configuration for a Florence-2 model. This class encapsulates all the parameters needed for training a Florence-2 model, including dataset paths, model specifications, training hyperparameters, and output @@ -92,7 +93,22 @@ class TrainingConfiguration: metrics: list[BaseMetric] = field(default_factory=list) -def train(config: TrainingConfiguration) -> None: +def train(config: Configuration) -> None: + """Train a Florence-2 model using the provided configuration. + + This function sets up the training environment, prepares the model and data loaders, + and runs the training loop. It also handles metric tracking and checkpoint saving. + + Args: + config (Configuration): The configuration object containing all necessary + parameters for training. + + Returns: + None + + Raises: + ValueError: If an unsupported optimizer is specified in the configuration. 
+ """ make_it_reproducible(avoid_non_deterministic_algorithms=False) run_dir = create_new_run_directory( base_output_dir=config.output_dir, @@ -109,7 +125,7 @@ def train(config: TrainingConfiguration) -> None: device=config.device, cache_dir=config.cache_dir, ) - train_loader, val_loader, test_loader = prepare_data_loaders( + train_loader, val_loader, test_loader = create_data_loaders( dataset_location=config.dataset, train_batch_size=config.batch_size, processor=processor, @@ -190,7 +206,7 @@ def run_training_loop( processor: AutoProcessor, model: PeftModel, data_loaders: tuple[DataLoader, Optional[DataLoader]], - config: TrainingConfiguration, + config: Configuration, training_metrics_tracker: MetricsTracker, validation_metrics_tracker: MetricsTracker, checkpoint_manager: CheckpointManager, @@ -226,7 +242,7 @@ def run_training_epoch( train_loader: DataLoader, val_loader: Optional[DataLoader], epoch: int, - config: TrainingConfiguration, + config: Configuration, optimizer: Optimizer, lr_scheduler: LRScheduler, training_metrics_tracker: MetricsTracker, @@ -234,39 +250,37 @@ def run_training_epoch( checkpoint_manager: CheckpointManager, ) -> None: model.train() - training_losses: list[float] = [] - - with tqdm(total=len(train_loader), desc=f"Epoch {epoch}/{config.epochs}", unit="batch") as pbar: - for step_id, (inputs, answers) in enumerate(train_loader): - input_ids = inputs["input_ids"] - pixel_values = inputs["pixel_values"] + loss_values: list[float] = [] + progress_bar = tqdm(total=len(train_loader), desc=f"training {epoch}/{config.epochs}", unit="batch") + with progress_bar: + for batch_id, (inputs, _, answers, _) in enumerate(train_loader): labels = processor.tokenizer( text=answers, return_tensors="pt", padding=True, return_token_type_ids=False ).input_ids.to(config.device) - outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels) + outputs = model(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], labels=labels) + loss = outputs.loss loss.backward() optimizer.step() lr_scheduler.step() optimizer.zero_grad() loss = loss.item() + training_metrics_tracker.register( metric="loss", epoch=epoch, - step=step_id + 1, + step=batch_id + 1, value=loss, ) - training_losses.append(loss) + loss_values.append(loss) + average_loss = sum(loss_values) / len(loss_values) if loss_values else 0.0 - # Update progress bar - last_100_losses = training_losses[-100:] - loss_moving_average = sum(last_100_losses) / len(last_100_losses) if last_100_losses else 0.0 - pbar.set_postfix({"Loss": f"{loss_moving_average:.4f}"}) - pbar.update(1) + progress_bar.set_postfix({"loss": f"{average_loss: .4f}"}) + progress_bar.update(1) # Save checkpoints based on training loss if no validation loader if val_loader is None or len(val_loader) == 0: - train_loss = sum(training_losses) / len(training_losses) + train_loss = sum(loss_values) / len(loss_values) if loss_values else 0.0 checkpoint_manager.save_latest(processor, model) checkpoint_manager.save_best(processor, model, train_loss) return @@ -289,44 +303,39 @@ def run_validation_epoch( processor: AutoProcessor, model: Union[PeftModel, AutoModelForCausalLM], loader: DataLoader, - config: TrainingConfiguration, + config: Configuration, metrics_tracker: MetricsTracker, epoch_number: int, ) -> None: - val_loss = 0.0 + loss_values: list[float] = [] with torch.no_grad(): - for inputs, targets in loader: - input_ids = inputs["input_ids"] - pixel_values = inputs["pixel_values"] + progress_bar = tqdm(loader, desc="running validation", 
unit="batch") + for inputs, questions, answers, images in progress_bar: labels = processor.tokenizer( - text=targets, return_tensors="pt", padding=True, return_token_type_ids=False + text=answers, return_tensors="pt", padding=True, return_token_type_ids=False ).input_ids.to(config.device) - outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels) - loss = outputs.loss - val_loss += loss.item() - avg_val_loss = val_loss / len(loader) + outputs = model(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], labels=labels) + loss_values.append(outputs.loss.item()) + average_loss = sum(loss_values) / len(loss_values) if loss_values else 0.0 metrics_tracker.register( metric="loss", epoch=epoch_number, step=1, - value=avg_val_loss, + value=average_loss, ) # Run inference once for all metrics - prompts, expected_responses, generated_texts, images = run_predictions( - dataset=loader.dataset, - processor=processor, - model=model, - device=config.device, + questions, expected_answers, generated_answers, images = run_predictions( + loader=loader, processor=processor, model=model ) - metrics_results = {"loss": avg_val_loss} + metrics_results = {"loss": average_loss} for metric in config.metrics: if isinstance(metric, MeanAveragePrecisionMetric): - classes = extract_unique_detection_dataset_classes(loader.dataset) - targets, predictions = postprocess_florence2_output_for_mean_average_precision( - expected_responses=expected_responses, - generated_texts=generated_texts, + classes = get_unique_detection_classes(loader.dataset) + targets, predictions = process_output_for_detection_metric( + expected_answers=expected_answers, + generated_answers=generated_answers, images=images, classes=classes, processor=processor, @@ -340,14 +349,29 @@ def run_validation_epoch( value=value, ) metrics_results[key] = value + else: + predictions = process_output_for_text_metric( + generated_answers=generated_answers, + images=images, + processor=processor, + ) + result = metric.compute(predictions=predictions, targets=expected_answers) + for key, value in result.items(): + metrics_tracker.register( + metric=key, + epoch=epoch_number, + step=1, + value=value, + ) + metrics_results[key] = value print("Validation Metrics:", ", ".join([f"{k}: {v:.4f}" for k, v in metrics_results.items()])) # Display inference results in IPython environments - display_results(prompts, expected_responses, generated_texts, images) + display_results(questions, expected_answers, generated_answers, images) -def get_optimizer(model: PeftModel, config: TrainingConfiguration) -> Optimizer: +def get_optimizer(model: PeftModel, config: Configuration) -> Optimizer: optimizer_type = config.optimizer.lower() if optimizer_type == "adamw": return AdamW(model.parameters(), lr=config.lr) @@ -358,14 +382,26 @@ def get_optimizer(model: PeftModel, config: TrainingConfiguration) -> Optimizer: raise ValueError(f"Unsupported optimizer: {config.optimizer}") -def evaluate(config: TrainingConfiguration) -> None: +def evaluate(config: Configuration) -> None: + """Evaluate a Florence-2 model using the provided configuration. + + This function loads the model and data, runs predictions on the evaluation dataset, + computes specified metrics, and saves the results. + + Args: + config (Configuration): The configuration object containing all necessary + parameters for evaluation. 
+ + Returns: + None + """ processor, model = load_model( model_id_or_path=config.model_id, revision=config.revision, device=config.device, cache_dir=config.cache_dir, ) - train_loader, val_loader, test_loader = prepare_data_loaders( + train_loader, val_loader, test_loader = create_data_loaders( dataset_location=config.dataset, train_batch_size=config.batch_size, processor=processor, @@ -381,19 +417,16 @@ def evaluate(config: TrainingConfiguration) -> None: evaluation_metrics_tracker = MetricsTracker.init(metrics=metrics) # Run inference once for all metrics - _, expected_responses, generated_texts, images = run_predictions( - dataset=evaluation_loader.dataset, - processor=processor, - model=model, - device=config.device, + _, expected_answers, generated_answers, images = run_predictions( + loader=evaluation_loader, processor=processor, model=model ) for metric in config.metrics: if isinstance(metric, MeanAveragePrecisionMetric): - classes = extract_unique_detection_dataset_classes(train_loader.dataset) - targets, predictions = postprocess_florence2_output_for_mean_average_precision( - expected_responses=expected_responses, - generated_texts=generated_texts, + classes = get_unique_detection_classes(train_loader.dataset) + targets, predictions = process_output_for_detection_metric( + expected_answers=expected_answers, + generated_answers=generated_answers, images=images, classes=classes, processor=processor, @@ -406,6 +439,20 @@ def evaluate(config: TrainingConfiguration) -> None: step=1, value=value, ) + else: + predictions = process_output_for_text_metric( + generated_answers=generated_answers, + images=images, + processor=processor, + ) + result = metric.compute(targets=expected_answers, predictions=predictions) + for key, value in result.items(): + evaluation_metrics_tracker.register( + metric=key, + epoch=1, + step=1, + value=value, + ) evaluation_metrics_tracker.as_json( output_dir=os.path.join(config.output_dir, "metrics"), filename="evaluation.json" diff --git a/maestro/trainer/models/florence_2/entrypoint.py b/maestro/trainer/models/florence_2/entrypoint.py index c11441c..aecefd9 100644 --- a/maestro/trainer/models/florence_2/entrypoint.py +++ b/maestro/trainer/models/florence_2/entrypoint.py @@ -5,13 +5,18 @@ import torch import typer -from maestro.trainer.common.utils.metrics import BaseMetric, MeanAveragePrecisionMetric +from maestro.trainer.common.utils.metrics import ( + BaseMetric, + CharacterErrorRateMetric, + MeanAveragePrecisionMetric, + WordErrorRateMetric, +) from maestro.trainer.models.florence_2.checkpoints import ( DEFAULT_FLORENCE2_MODEL_ID, DEFAULT_FLORENCE2_MODEL_REVISION, DEVICE, ) -from maestro.trainer.models.florence_2.core import LoraInitLiteral, TrainingConfiguration +from maestro.trainer.models.florence_2.core import Configuration, LoraInitLiteral from maestro.trainer.models.florence_2.core import evaluate as florence2_evaluate from maestro.trainer.models.florence_2.core import train as florence2_train @@ -19,7 +24,9 @@ METRIC_CLASSES: dict[str, type[BaseMetric]] = { - "mean_average_precision": MeanAveragePrecisionMetric, + MeanAveragePrecisionMetric.name: MeanAveragePrecisionMetric, + WordErrorRateMetric.name: WordErrorRateMetric, + CharacterErrorRateMetric.name: CharacterErrorRateMetric, } @@ -124,7 +131,7 @@ def train( ] = [], ) -> None: metric_objects = parse_metrics(metrics) - config = TrainingConfiguration( + config = Configuration( dataset=dataset, model_id=model_id, revision=revision, @@ -196,7 +203,7 @@ def evaluate( ] = [], ) -> None: metric_objects 
= parse_metrics(metrics) - config = TrainingConfiguration( + config = Configuration( dataset=dataset, model_id=model_id, revision=revision, diff --git a/maestro/trainer/models/florence_2/inference.py b/maestro/trainer/models/florence_2/inference.py new file mode 100644 index 0000000..6e8bd6f --- /dev/null +++ b/maestro/trainer/models/florence_2/inference.py @@ -0,0 +1,33 @@ +import torch +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoProcessor + + +def run_predictions( + loader: DataLoader, processor: AutoProcessor, model: AutoModelForCausalLM +) -> tuple[list[str], list[str], list[str], list[Image.Image]]: + questions_total = [] + expected_answers_total = [] + generated_answers_total = [] + images_total = [] + + with torch.no_grad(): + progress_bar = tqdm(loader, desc="running predictions", unit="batch") + for inputs, questions, answers, images in progress_bar: + generated_ids = model.generate( + input_ids=inputs["input_ids"], + pixel_values=inputs["pixel_values"], + max_new_tokens=1024, + do_sample=False, + num_beams=3, + ) + generated_answers = processor.batch_decode(generated_ids, skip_special_tokens=False) + + questions_total.extend(questions) + expected_answers_total.extend(answers) + generated_answers_total.extend(generated_answers) + images_total.extend(images) + + return (questions_total, expected_answers_total, generated_answers_total, images_total) diff --git a/maestro/trainer/models/florence_2/data_loading.py b/maestro/trainer/models/florence_2/loaders.py similarity index 83% rename from maestro/trainer/models/florence_2/data_loading.py rename to maestro/trainer/models/florence_2/loaders.py index 5794d6e..c35bff5 100644 --- a/maestro/trainer/models/florence_2/data_loading.py +++ b/maestro/trainer/models/florence_2/loaders.py @@ -11,7 +11,7 @@ from maestro.trainer.common.data_loaders.datasets import DetectionDataset -def prepare_data_loaders( +def create_data_loaders( dataset_location: str, train_batch_size: int, processor: AutoProcessor, @@ -26,7 +26,7 @@ def prepare_data_loaders( ]: test_batch_size = test_batch_size or train_batch_size test_loaders_workers = test_loaders_workers or num_workers - train_data_loader = prepare_detection_data_loader( + train_data_loader = create_split_data_loader( dataset_location=dataset_location, split_name="train", batch_size=train_batch_size, @@ -37,7 +37,7 @@ def prepare_data_loaders( ) if train_data_loader is None: raise RuntimeError("Could not initialise train data loader") - valid_data_loader = prepare_detection_data_loader( + valid_data_loader = create_split_data_loader( dataset_location=dataset_location, split_name="valid", batch_size=test_batch_size, @@ -46,7 +46,7 @@ def prepare_data_loaders( num_workers=test_loaders_workers, shuffle=False, ) - test_data_loader = prepare_detection_data_loader( + test_data_loader = create_split_data_loader( dataset_location=dataset_location, split_name="test", batch_size=test_batch_size, @@ -58,7 +58,7 @@ def prepare_data_loaders( return train_data_loader, valid_data_loader, test_data_loader -def prepare_detection_data_loader( +def create_split_data_loader( dataset_location: str, split_name: str, batch_size: int, @@ -67,7 +67,7 @@ def prepare_detection_data_loader( num_workers: int = 0, shuffle: bool = True, ) -> Optional[DataLoader]: - dataset = prepare_detection_dataset( + dataset = load_split_dataset( dataset_location=dataset_location, split_name=split_name, ) @@ -76,20 +76,20 @@ def 
prepare_detection_data_loader( return DataLoader( dataset, batch_size=batch_size, - collate_fn=partial(collate_fn, processor=processor, device=device), + collate_fn=partial(process_batch, processor=processor, device=device), num_workers=num_workers, shuffle=shuffle, ) -def prepare_detection_dataset( +def load_split_dataset( dataset_location: str, split_name: str, ) -> Optional[DetectionDataset]: image_directory_path = os.path.join(dataset_location, split_name) jsonl_file_path = os.path.join(dataset_location, split_name, "annotations.jsonl") if not os.path.exists(image_directory_path): - logging.warning(f"Could not data directory: {image_directory_path}") + logging.warning(f"Could not find data directory: {image_directory_path}") return None if not os.path.exists(jsonl_file_path): logging.warning(f"Could not find JSONL file: {jsonl_file_path}") @@ -100,11 +100,11 @@ def prepare_detection_dataset( ) -def collate_fn( +def process_batch( batch: tuple[list[str], list[str], list[Image.Image]], processor: AutoProcessor, device: torch.device, -) -> tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: questions, answers, images = zip(*batch) inputs = processor(text=list(questions), images=list(images), return_tensors="pt", padding=True).to(device) - return inputs, answers + return inputs, questions, answers, images diff --git a/maestro/trainer/models/florence_2/metrics.py b/maestro/trainer/models/florence_2/metrics.py index be805db..501bc96 100644 --- a/maestro/trainer/models/florence_2/metrics.py +++ b/maestro/trainer/models/florence_2/metrics.py @@ -2,19 +2,17 @@ import numpy as np import supervision as sv -import torch from PIL import Image -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoProcessor +from transformers import AutoProcessor from maestro.trainer.common.data_loaders.datasets import DetectionDataset DETECTION_CLASS_PATTERN = r"([a-zA-Z0-9 -]+)" -def postprocess_florence2_output_for_mean_average_precision( - expected_responses: list[str], - generated_texts: list[str], +def process_output_for_detection_metric( + expected_answers: list[str], + generated_answers: list[str], images: list[Image.Image], classes: list[str], processor: AutoProcessor, @@ -22,7 +20,7 @@ def postprocess_florence2_output_for_mean_average_precision( targets = [] predictions = [] - for image, suffix, generated_text in zip(images, expected_responses, generated_texts): + for image, suffix, generated_text in zip(images, expected_answers, generated_answers): # Postprocess prediction for mean average precision calculation prediction = processor.post_process_generation(generated_text, task="", image_size=image.size) prediction = sv.Detections.from_lmm(sv.LMM.FLORENCE_2, prediction, resolution_wh=image.size) @@ -46,37 +44,22 @@ def postprocess_florence2_output_for_mean_average_precision( return targets, predictions -def run_predictions( - dataset: DetectionDataset, +def process_output_for_text_metric( + generated_answers: list[str], + images: list[Image.Image], processor: AutoProcessor, - model: AutoModelForCausalLM, - device: torch.device, -) -> tuple[list[str], list[str], list[str], list[Image.Image]]: - prompts = [] - expected_responses = [] - generated_texts = [] - images = [] - - for idx in tqdm(list(range(len(dataset))), desc="Generating predictions..."): - image, data = dataset.dataset[idx] - prefix = data["prefix"] - suffix = data["suffix"] - - inputs = processor(text=prefix, images=image, return_tensors="pt").to(device) - generated_ids = 
model.generate( - input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3 - ) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] - - prompts.append(prefix) - expected_responses.append(suffix) - generated_texts.append(generated_text) - images.append(image) +) -> list[str]: + predictions = [] + for image, generated_text in zip(images, generated_answers): + prediction = processor.post_process_generation(generated_text, task="pure_text", image_size=image.size)[ + "pure_text" + ] + predictions.append(prediction) - return prompts, expected_responses, generated_texts, images + return predictions -def extract_unique_detection_dataset_classes(dataset: DetectionDataset) -> list[str]: +def get_unique_detection_classes(dataset: DetectionDataset) -> list[str]: class_set = set() for i in range(len(dataset)): _, suffix, _ = dataset[i] diff --git a/mkdocs.yaml b/mkdocs.yaml index 25dfd02..d32e41e 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -1,11 +1,11 @@ -site_name: multimodal-maestro +site_name: maestro site_url: https://roboflow.github.io/multimodal-maestro/ site_author: Roboflow -site_description: Effective prompting for Large Multimodal Models like GPT-4 Vision or LLaVA. πŸ”₯ +site_description: 'Streamline the fine-tuning process for multimodal models: PaliGemma, Florence-2, Qwen2-VL.' repo_name: roboflow/multimodal-maestro repo_url: https://github.com/roboflow/multimodal-maestro edit_uri: https://github.com/roboflow/multimodal-maestro/tree/main/docs -copyright: Roboflow 2023. All rights reserved. +copyright: Roboflow 2024. All rights reserved. extra: social: @@ -17,14 +17,16 @@ extra: link: https://www.linkedin.com/company/roboflow-ai/mycompany/ - icon: fontawesome/brands/twitter link: https://twitter.com/roboflow + version: + provider: mike + nav: - - Home: index.md - - Get Started: get_started.md - - API: - - LMMs: lmms.md - - Markers: markers.md - - Visualizers: visualizers.md + - Maestro: index.md + - Models: + - Florence-2: florence-2.md + - Tasks: tasks.md + - Metrics: metrics.md theme: @@ -51,5 +53,43 @@ theme: code: Roboto Mono plugins: - - mkdocstrings - search + - mkdocstrings: + default_handler: python + handlers: + python: + options: + parameter_headings: true + paths: [maestro] + load_external_modules: true + allow_inspection: true + show_bases: true + group_by_category: true + docstring_style: google + show_symbol_type_heading: true + show_symbol_type_toc: true + show_category_heading: true + domains: [std, py] + + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.inlinehilite + - pymdownx.snippets + - attr_list + - md_in_html + - pymdownx.tabbed: + alternate_style: true + - toc: + permalink: true + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - pymdownx.snippets: + check_paths: true + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true diff --git a/pyproject.toml b/pyproject.toml index 6a3e163..17f6e87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Roboflow", email = "help@roboflow.com"} ] license = {file = "LICENSE"} -keywords = ["roboflow","maestro","multimodal-maestro","transformers", "torch", "accelerate", "multimodal", "lmm", "vision", "nlp", "prompting"] +keywords = ["roboflow","maestro","transformers", "torch", "accelerate", "multimodal", "lmm", "vision", "nlp", 
"prompting", "vlm"] requires-python = ">=3.9,<3.13" classifiers = [ "Development Status :: 3 - Alpha", @@ -44,7 +44,8 @@ dependencies = [ "flash-attn~=2.6.3; sys_platform != 'darwin'", "einops~=0.8.0", "timm~=1.0.9", - "typer~=0.12.5" + "typer~=0.12.5", + "jiwer~=3.0.4", ] [project.urls] @@ -57,8 +58,10 @@ Changelog = "https://github.com/roboflow/multimodal-maestro/blob/main/CHANGELOG. [project.optional-dependencies] docs = [ - "mkdocs-material~=9.5.33", - "mkdocstrings[python]>=0.20.0,<0.26.2" + "mkdocs-material~=9.5.35", + "mkdocstrings[python]>=0.20.0,<0.26.2", + "mike~=2.1.3", + ] dev = [ "pytest~=8.3.2",