diff --git a/.typos.toml b/.typos.toml index 1c3a4859..9398d36d 100644 --- a/.typos.toml +++ b/.typos.toml @@ -7,7 +7,12 @@ extend-exclude = [ "customer-satisfaction/streamlit_app.py", "nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb", "customer-satisfaction/tests/data_test.py", - "end-to-end-computer-vision/**/*.ipynb" + "end-to-end-computer-vision/**/*.ipynb", + "classifier-e2e/run_skip_basics.ipynb", + "classifier-e2e/run_full.ipynb", + "classifier-e2e/run_skip_basics.ipynb", + "classifier-e2e/run_full.ipynb", + "classifier-e2e/run_skip_basics.ipynb" ] [default.extend-identifiers] @@ -26,6 +31,7 @@ Implicitly = "Implicitly" fo = "fo" mapp = "mapp" polution = "polution" +magent = "magent" [default] locale = "en-us" diff --git a/end-to-end-computer-vision/README.md b/end-to-end-computer-vision/README.md index 34f62b5a..20fcf34b 100644 --- a/end-to-end-computer-vision/README.md +++ b/end-to-end-computer-vision/README.md @@ -26,13 +26,13 @@ things that you'll need to do. ## ZenML We recommend using our [ZenML Pro offering](https://cloud.zenml.io/) to get a -deployed instance of zenml: +deployed instance of ZenML: ### Set up your environment ```bash pip install -r requirements.txt -zenml integration install label_studio torch gcp mlflow -y +zenml integration install torch gcp mlflow label_studio -y pip uninstall wandb # This comes in automatically ``` @@ -63,37 +63,50 @@ zenml connect --url We will use GCP in the commands listed below, but it will work for other cloud providers. -### Follow our guide to set up your credential for GCP +1) Follow our guide to set up your credentials for GCP [here](https://docs.zenml.io/how-to/auth-management/gcp-service-connector) -[Set up a GCP service -connector](https://docs.zenml.io/how-to/auth-management/gcp-service-connector) +2) Set up a bucket in GCP to persist your training data -### Set up a bucket to persist your training data - -### Set up a bucket to use as artifact store within ZenML - -[Learn how to set up a GCP artifact store stack component within zenml -here](https://docs.zenml.io/stack-components/artifact-stores) -### Set up vertex for pipeline orchestration - -[Learn how to set up a Vertex orchestrator stack component within zenml -here](https://docs.zenml.io/stack-components/orchestrators/vertex) -### For training on accelerators like GPUs/TPUs set up Vertex - -[Learn how to set up a Vertex step operator stack component within zenml -here](https://docs.zenml.io/stack-components/step-operators/vertex) -### Set up Container Registry - -[Learn how to set up a google cloud container registry component within zenml -here](https://docs.zenml.io/stack-components/container-registries/gcp) +3) Set up a bucket to use as artifact store within ZenML +Learn how to set up a GCP artifact store stack component within ZenML +[here](https://docs.zenml.io/stack-components/artifact-stores) +4) Set up Vertex for pipeline orchestration +Learn how to set up a Vertex orchestrator stack component within ZenML +[here](https://docs.zenml.io/stack-components/orchestrators/vertex) +5) For training on accelerators like GPUs/TPUs set up Vertex +Learn how to set up a Vertex step operator stack component within ZenML +[here](https://docs.zenml.io/stack-components/step-operators/vertex) +6) Set up a Container Registry in GCP. 
Learn how to set up a Google Cloud container registry component within ZenML
+[here](https://docs.zenml.io/stack-components/container-registries/gcp)

## Label Studio

-### [Start Label Studio locally](https://labelstud.io/guide/start)
-### [Follow these ZenML instructions to set up Label Studio as a stack component](https://docs.zenml.io/stack-components/annotators/label-studio)
-### Create a project within Label Studio and name it `ship_detection_gcp`
-### [Set up Label Studio to use external storage](https://labelstud.io/guide/storage)
-use the first bucket that you created to data persistence
+1) [Start Label Studio locally](https://labelstud.io/guide/start)
+For Label Studio, we recommend using Docker Compose to deploy a local instance:
+```bash
+git clone https://github.com/HumanSignal/label-studio.git
+cd label-studio
+docker-compose up -d # starts Label Studio at http://localhost:8080
+```
+2) [Follow these ZenML instructions to set up Label Studio as a stack component](https://docs.zenml.io/stack-components/annotators/label-studio#how-to-deploy-it)
+3) Create a project within Label Studio and name it `ship_detection_gcp`
+![img.png](_assets/project_creation_label_studio.png)
+4) Configure your project to use `Object Detection with Bounding Boxes` as the Labeling Setup
+![img.png](_assets/labeling_setup.png)
+In the following screen, you now need to configure the labeling interface. This is where you define the different classes that you want to detect. In our case, this is a single `ship` class.
+![img.png](_assets/labeling_interface.png)
+Additionally, you might want to allow users to zoom during labeling. This can be configured further down on the same screen.
+5) [Set up Label Studio to use external storage](https://labelstud.io/guide/storage)
+Use the first bucket that you created for data persistence.
+
+## Hugging Face
+
+This specific project relies on a dataset loaded from Hugging Face. As such, a free Hugging Face account is needed.
+
+1) Log in via the CLI. Simply follow the instructions from this command:
+```bash
+huggingface-cli login
+```

## ZenML Stacks
@@ -126,7 +139,7 @@ The project consists of the following pipelines:
This pipeline downloads the [Ship Detection dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection).
This dataset contains some truly huge images with a few hundred million pixels. In
-order to make these useable, we break down all source images into manageable
+order to make these usable, we break down all source images into manageable
tiles with a maximum height/width of 1000 pixels. After this preprocessing is
done, the images are uploaded into a cloud bucket and the ground truth
annotations are uploaded to a local Label Studio instance.
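To make the tiling step in that last hunk concrete, here is a minimal sketch. It is not the project's actual implementation (the function name and output paths are invented for illustration); it only shows how one oversized image from the `datadrivenscience/ship-detection` dataset could be cut into tiles of at most 1000×1000 pixels. The real pipeline additionally has to remap the ground-truth bounding boxes into tile coordinates.

```python
import os

from datasets import load_dataset
from PIL import Image

# The source images can have hundreds of millions of pixels, so lift
# Pillow's decompression-bomb guard before opening them.
Image.MAX_IMAGE_PIXELS = None

MAX_TILE = 1000  # maximum tile height/width in pixels


def tile_image(img: Image.Image, out_dir: str, prefix: str) -> None:
    """Split a single image into tiles of at most MAX_TILE x MAX_TILE pixels."""
    os.makedirs(out_dir, exist_ok=True)
    width, height = img.size
    for top in range(0, height, MAX_TILE):
        for left in range(0, width, MAX_TILE):
            box = (left, top, min(left + MAX_TILE, width), min(top + MAX_TILE, height))
            img.crop(box).save(os.path.join(out_dir, f"{prefix}_{top}_{left}.png"))


# Illustration only: tile the first training image of the ship-detection dataset.
data = load_dataset("datadrivenscience/ship-detection", split="train")
tile_image(data[0]["image"], out_dir="tiles", prefix="image_0")
```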
diff --git a/end-to-end-computer-vision/_assets/labeling_interface.png b/end-to-end-computer-vision/_assets/labeling_interface.png new file mode 100644 index 00000000..e6b02f4d Binary files /dev/null and b/end-to-end-computer-vision/_assets/labeling_interface.png differ diff --git a/end-to-end-computer-vision/_assets/labeling_setup.png b/end-to-end-computer-vision/_assets/labeling_setup.png new file mode 100644 index 00000000..ed4cd717 Binary files /dev/null and b/end-to-end-computer-vision/_assets/labeling_setup.png differ diff --git a/end-to-end-computer-vision/_assets/project_creation_label_studio.png b/end-to-end-computer-vision/_assets/project_creation_label_studio.png new file mode 100644 index 00000000..3b6db829 Binary files /dev/null and b/end-to-end-computer-vision/_assets/project_creation_label_studio.png differ diff --git a/end-to-end-computer-vision/configs/ingest_data.yaml b/end-to-end-computer-vision/configs/ingest_data.yaml index 420b1fa4..b4072c3f 100644 --- a/end-to-end-computer-vision/configs/ingest_data.yaml +++ b/end-to-end-computer-vision/configs/ingest_data.yaml @@ -5,7 +5,7 @@ steps: enable_step_logs: False parameters: dataset: "datadrivenscience/ship-detection" - data_source: # Insert your bucket path here where the training images will live e.g. "gs://foo/bar" + data_source: # Replace this with the path to a data source upload_labels_to_label_studio: enable_cache: False parameters: diff --git a/end-to-end-computer-vision/configs/training_pipeline.yaml b/end-to-end-computer-vision/configs/training_pipeline.yaml index f49ba554..89195f0b 100644 --- a/end-to-end-computer-vision/configs/training_pipeline.yaml +++ b/end-to-end-computer-vision/configs/training_pipeline.yaml @@ -11,6 +11,7 @@ steps: batch_size: 8 imgsz: 720 epochs: 1 + is_apple_silicon_env: False settings: docker: diff --git a/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml b/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml index 1ee643ab..8d685056 100644 --- a/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml +++ b/end-to-end-computer-vision/configs/training_pipeline_remote_gpu.yaml @@ -26,6 +26,7 @@ steps: imgsz: 720 epochs: 50000 is_quad_gpu_env: True + is_apple_silicon_env: False settings: step_operator.vertex: accelerator_type: NVIDIA_TESLA_T4 # see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#AcceleratorType diff --git a/end-to-end-computer-vision/steps/download_and_tile_from_hf.py b/end-to-end-computer-vision/steps/download_and_tile_from_hf.py deleted file mode 100644 index e69de29b..00000000 diff --git a/end-to-end-computer-vision/steps/download_from_hf.py b/end-to-end-computer-vision/steps/download_from_hf.py deleted file mode 100644 index 45bf7657..00000000 --- a/end-to-end-computer-vision/steps/download_from_hf.py +++ /dev/null @@ -1,86 +0,0 @@ -# Apache Software License 2.0 -# -# Copyright (c) ZenML GmbH 2024. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import os -from typing import Any, Dict - -from datasets import load_dataset -from PIL import Image -from zenml import step -from zenml.io import fileio -from zenml.logger import get_logger - -Image.MAX_IMAGE_PIXELS = None - -logger = get_logger(__name__) - - -@step -def download_dataset_from_hf(dataset: str, data_source: str) -> Dict[str, Any]: - dataset = load_dataset(dataset) - data = dataset["train"] - - output_dir = "data" - if not os.path.exists(output_dir): - os.mkdir(output_dir) - - all_images = {} - - for i, d in enumerate(data): - img = d["image"] - img_name = f"image_{i}.png" - logger.info(f"Handling {img_name}") - img_path = f"{output_dir}/{img_name}" - - logger.info(f"Storing image to {img_path}.") - img.save(img_path) - - bucket_path = os.path.join(data_source, img_name) - logger.info(f"Copying into gcp bucket {bucket_path}") - fileio.copy(img_path, bucket_path, overwrite=True) - - width, height = d["image"].size - - results = [] - for j, bbox in enumerate(d["objects"]["bbox"]): - x1, y1, x2, y2 = bbox - x = x1 / width - y = y1 / height - w = (x2 - x1) / width - h = (y2 - y1) / height - results.append( - { - "original_width": width, - "original_height": height, - "image_rotation": 0, - "value": { - "x": x * 100, - "y": y * 100, - "width": w * 100, - "height": h * 100, - "rotation": 0, - "rectanglelabels": ["ship"], - }, - "from_name": "label", - "to_name": "image", - "type": "rectanglelabels", - "origin": "manual", - } - ) - - all_images[img_name] = results - - return all_images diff --git a/end-to-end-computer-vision/steps/train_model.py b/end-to-end-computer-vision/steps/train_model.py index 67be2774..3b90a04d 100644 --- a/end-to-end-computer-vision/steps/train_model.py +++ b/end-to-end-computer-vision/steps/train_model.py @@ -41,6 +41,7 @@ def train_model( batch_size: int = 16, imgsz: int = 640, is_quad_gpu_env: bool = False, + is_apple_silicon_env: bool = False, ) -> Tuple[ Annotated[ YOLO, ArtifactConfig(name="Trained_YOLO", is_model_artifact=True) @@ -57,6 +58,7 @@ def train_model( dataset: Dataset to train the model on. data_source: Source where the data lives is_quad_gpu_env: Whether we are in an env with 4 gpus + is_apple_silicon_env: In case we are running on Apple compute Returns: Tuple[YOLO, Dict[str, Any]]: Trained model and validation metrics. @@ -75,6 +77,14 @@ def train_model( imgsz=imgsz, device=[0, 1, 2, 3], ) + elif is_apple_silicon_env: + model.train( + data=data_path, + epochs=epochs, + batch=batch_size, + imgsz=imgsz, + device="mps", + ) else: model.train( data=data_path, diff --git a/end-to-end-computer-vision/utils/dataset_utils.py b/end-to-end-computer-vision/utils/dataset_utils.py index 20e38a78..81a47297 100644 --- a/end-to-end-computer-vision/utils/dataset_utils.py +++ b/end-to-end-computer-vision/utils/dataset_utils.py @@ -45,6 +45,20 @@ def load_images_from_folder(folder): return images +def load_images_from_source(data_source, download_dir, filenames): + total_images = len(filenames) + for index, filename in enumerate(filenames): + src_path = f"{data_source}/{filename}.png" + dst_path = os.path.join(download_dir, f"{filename}.png") + if not os.path.exists(dst_path): + fileio.copy(src_path, dst_path) + + if (index + 1) % 100 == 0 or index == total_images - 1: + logger.info( + f"{index + 1} of {total_images} images have been downloaded..." 
+ ) + + def load_and_split_data( dataset: LabelStudioAnnotationExport, data_source: str ) -> str: @@ -71,21 +85,33 @@ def load_and_split_data( if f.endswith(".txt") ] - # Download corresponding images from gcp bucket - images_folder = os.path.join(extract_location, "images") + # Download images from source bucket and if successful keep them to reuse for future runs + load_images = False + download_dir = os.path.join(os.getcwd(), "images") # Temporary dirname that represents a still incomplete download + loaded_images = os.path.join(os.getcwd(), "loaded-images") # The dirname used once the download fully completes + images_folder = os.path.join(extract_location, "images") # tmp dirpath used for the current run only + + # Check that images have not already been downloaded + if not os.path.exists(loaded_images): + os.makedirs(download_dir, exist_ok=True) + load_images = True + + # Checks that new images have not been added since previous download + if os.path.exists(loaded_images): + if len(os.listdir(loaded_images)) != len(filenames): + download_dir = loaded_images + load_images = True + + if load_images: + logger.info(f"Downloading images from {data_source}") + load_images_from_source(data_source, download_dir, filenames) + os.rename(download_dir, loaded_images) + os.makedirs(images_folder, exist_ok=True) - total_images = len(filenames) - logger.info(f"Downloading images from {data_source}") - for index, filename in enumerate(filenames): - src_path = f"{data_source}/{filename}.png" - dst_path = os.path.join(images_folder, f"{filename}.png") - fileio.copy(src_path, dst_path) + logger.info(f"Copy images to {images_folder}") + load_images_from_source(loaded_images, images_folder, filenames) - if (index + 1) % 100 == 0 or index == total_images - 1: - logger.info( - f"{index + 1} of {total_images} images have been downloaded..." - ) split_dataset(extract_location, ratio=(0.7, 0.15, 0.15), seed=42) yaml_path = generate_yaml(extract_location) return yaml_path diff --git a/end-to-end-computer-vision/utils/split_data.py b/end-to-end-computer-vision/utils/split_data.py index 5789736d..c019cad2 100644 --- a/end-to-end-computer-vision/utils/split_data.py +++ b/end-to-end-computer-vision/utils/split_data.py @@ -1,3 +1,4 @@ +import math import os import random import shutil @@ -37,7 +38,7 @@ def split_dataset( seed: Random seed for reproducibility. """ # Ensure the ratio is correct - assert sum(ratio) == 1.0 + assert math.isclose(sum(ratio), 1.0, rel_tol=1e-9) # Seed to get consistent results if seed is not None: diff --git a/llm-agents/README.md b/llm-agents/README.md index c10e7993..5ba8e487 100644 --- a/llm-agents/README.md +++ b/llm-agents/README.md @@ -63,7 +63,7 @@ You can sign up for a free trial of the cloud at https://cloud.zenml.io. Once si ### Models Tab in the Dashboard -The models tab acts as a central control plane for all of your models. You can view the different versions that get created implictly with your pipeline runs, check their metadata, deployments and more! +The models tab acts as a central control plane for all of your models. You can view the different versions that get created implicitly with your pipeline runs, check their metadata, deployments and more! 
![model versions](./assets/llm-agent/model_versions.png) diff --git a/llm-finetuning/README.md b/llm-finetuning/README.md index 22252c03..d4a9863f 100644 --- a/llm-finetuning/README.md +++ b/llm-finetuning/README.md @@ -160,7 +160,7 @@ This project recently did a [call of volunteers](https://www.linkedin.com/feed/u While the work here is solely based on the task of finetuning the model for the ZenML library, the pipeline can be changed with minimal effort to point to any set of repositories on GitHub. Theoretically, one could extend this work to point to proprietary codebases to learn from them for any use-case. -For example, see how [VMWare fine-tuned StarCoder to learn their style](https://octo.vmware.com/fine-tuning-starcoder-to-learn-vmwares-coding-style/). +For example, see how [VMWare fine-tuned StarCoder to learn their style](https://entreprenerdly.com/fine-tuning-starcoder-to-create-a-coding-assistant-that-adapts-to-your-coding-style/). Also, make sure to join our Slack diff --git a/llm-litgpt-finetuning/lit_gpt/lora.py b/llm-litgpt-finetuning/lit_gpt/lora.py index 6e9274e1..105d6441 100644 --- a/llm-litgpt-finetuning/lit_gpt/lora.py +++ b/llm-litgpt-finetuning/lit_gpt/lora.py @@ -383,7 +383,7 @@ def conv1d( If the number of heads is equal to the number of query groups - grouped queries are disabled (see scheme in `lit_gpt/config.py:Config`). In this case the combined QKV matrix consists of equally sized query, key and value parts, which means we can utilize `groups` argument from `conv1d`: with this argument the - input and weight matrices will be splitted in equally sized parts and applied separately (like having multiple + input and weight matrices will be split in equally sized parts and applied separately (like having multiple conv layers side by side). Otherwise QKV matrix consists of unequally sized parts and thus we have to split input and weight matrices manually, @@ -408,14 +408,14 @@ def conv1d( # ⚬ C_output': embeddings size for each LoRA layer (not equal in size) # ⚬ r: rank of all LoRA layers (equal in size) - input_splitted = input.chunk( + input_split = input.chunk( sum(self.enable_lora), dim=1 ) # N * (B, C // N, T) - weight_splitted = weight.split( + weight_split = weight.split( self.qkv_shapes ) # N * (C_output', r, 1) return torch.cat( - [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], + [F.conv1d(a, b) for a, b in zip(input_split, weight_split)], dim=1, # (B, C_output', T) ) # (B, C_output, T)
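As an aside on the `conv1d` docstring touched in that last hunk: the equivalence it describes — a single grouped convolution versus splitting the input and weight into equally sized parts and convolving them separately — can be checked with a small, self-contained PyTorch snippet. This is only an illustration of that statement (shapes and variable names chosen arbitrarily), not part of the change itself.

```python
import torch
import torch.nn.functional as F

B, T, N = 2, 16, 3        # batch size, sequence length, number of groups
c_in, c_out = 8, 4        # input/output channels per group

x = torch.randn(B, N * c_in, T)
w = torch.randn(N * c_out, c_in, 1)  # each output group only sees its own input group

# One grouped convolution over the combined matrices ...
grouped = F.conv1d(x, w, groups=N)

# ... equals splitting input and weight into N parts, convolving, and concatenating.
manual = torch.cat(
    [F.conv1d(a, b) for a, b in zip(x.chunk(N, dim=1), w.chunk(N, dim=0))],
    dim=1,
)

assert torch.allclose(grouped, manual, atol=1e-6)
```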