Commit

Merge branch 'project/cv-webinar-improved-datagen' of
github.com:zenml-io/zenml-projects into project/cv-webinar-improved-datagen
AlexejPenner committed May 7, 2024
2 parents c26af1b + 2e62f9b commit bbb9f47
Showing 29 changed files with 1,793 additions and 165 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -145,3 +145,8 @@ zencoder/cloned_public_repos

.DS_Store
.local
+
+# local files
+llm-lora-finetuning/ckpt/
+llm-lora-finetuning/data_generation/
+llm-lora-finetuning/datagen/
14 changes: 13 additions & 1 deletion .typos.toml
@@ -1,5 +1,14 @@
[files]
extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analysis/steps/report.py", "customer-satisfaction/pipelines/deployment_pipeline.py", "customer-satisfaction/streamlit_app.py", "nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb", "customer-satisfaction/tests/data_test.py"]
extend-exclude = [
"*.csv",
"sign-language-detection-yolov5/*",
"orbit-user-analysis/steps/report.py",
"customer-satisfaction/pipelines/deployment_pipeline.py",
"customer-satisfaction/streamlit_app.py",
"nba-pipeline/Building and Using An MLOPs Stack With ZenML.ipynb",
"customer-satisfaction/tests/data_test.py",
"end-to-end-computer-vision/**/*.ipynb"
]

[default.extend-identifiers]
# HashiCorp = "HashiCorp"
@@ -14,6 +23,9 @@ lenght = "lenght"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"
Implicitly = "Implicitly"
fo = "fo"
mapp = "mapp"
polution = "polution"

[default]
locale = "en-us"
135 changes: 71 additions & 64 deletions end-to-end-computer-vision/README.md
@@ -14,18 +14,18 @@ label images to improve the model's performance, as well as feedback using

The project uses the [Ship Detection
dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection) from
-[DataDrivenScience](https://datadrivenscience.com/) on the Hugging Face Hub, which contains images of ships
-in satellite imagery. The goal is to train a model to detect ships in the images.
-Note that this isn't something that our YOLOv8 model is particularly good at out
-of the box, so it serves as a good example of how to build a pipeline that can
-be extended to other use cases.
+[DataDrivenScience](https://datadrivenscience.com/) on the Hugging Face Hub,
+which contains images of ships in satellite imagery. The goal is to train a
+model to detect ships in the images. Note that this isn't something that our
+YOLOv8 model is particularly good at out of the box, so it serves as a good
+example of how to build a pipeline that can be extended to other use cases.

-This project needs some infrastructure and tool setup to work. Here is a list
-of things that you'll need to do.
+This project needs some infrastructure and tool setup to work. Here is a list of
+things that you'll need to do.

## ZenML

-We recommend using our [ZenML Cloud offering](https://cloud.zenml.io/) to get a
+We recommend using our [ZenML Cloud offering](https://cloud.zenml.io/) to get a
deployed instance of zenml:

### Set up your environment
@@ -36,8 +36,8 @@ zenml integration install label_studio torch gcp mlflow -y
pip uninstall wandb # This comes in automatically
```

-And to use the Albumentations and annotation plugins in the last step,
-you'll need to install them:
+And to use the Albumentations and annotation plugins in the last step, you'll
+need to install them:

```bash
fiftyone plugins download https://github.com/jacobmarks/fiftyone-albumentations-plugin
@@ -52,47 +52,56 @@ export DATA_UPLOAD_MAX_NUMBER_FILES=1000000
export WANDB_DISABLED=True
```

-### Connect to your deployed zenml instance
+### Connect to your deployed ZenML instance

```bash
zenml connect --url <INSERT_ZENML_URL_HERE>
```

-## Cloud Provider (we will use GCP in our examples here)
+## Cloud Provider

+We will use GCP in the commands listed below, but it will work for other cloud
+providers.
+
-### Follow our guide to set up your credential for gcp
+### Follow our guide to set up your credential for GCP

-[Set up a gcp service connector](https://docs.zenml.io/stacks-and-components/auth-management/gcp-service-connector)
+[Set up a GCP service
+connector](https://docs.zenml.io/stacks-and-components/auth-management/gcp-service-connector)

+### Set up a bucket to persist your training data
+
### Set up a bucket to use as artifact store within ZenML

-[Learn how to set up a gcp artifact store stack component within zenml here](https://docs.zenml.io/stacks-and-components/component-guide/artifact-stores)
+[Learn how to set up a GCP artifact store stack component within zenml
+here](https://docs.zenml.io/stacks-and-components/component-guide/artifact-stores)
### Set up vertex for pipeline orchestration

-[Learn how to set up a vertex orchestrator stack component within zenml here](https://docs.zenml.io/stacks-and-components/component-guide/orchestrators/vertex)
-### For training on accelerators like GPUs/TPUs set up vertex
+[Learn how to set up a Vertex orchestrator stack component within zenml
+here](https://docs.zenml.io/stacks-and-components/component-guide/orchestrators/vertex)
+### For training on accelerators like GPUs/TPUs set up Vertex

-[Learn how to set up a vertex step operator stack component within zenml here](https://docs.zenml.io/stacks-and-components/component-guide/step-operators/vertex)
+[Learn how to set up a Vertex step operator stack component within zenml
+here](https://docs.zenml.io/stacks-and-components/component-guide/step-operators/vertex)
### Set up Container Registry

-[Learn how to set up a google cloud container registry component within zenml here](https://docs.zenml.io/stacks-and-components/component-guide/container-registries/gcp)
+[Learn how to set up a google cloud container registry component within zenml
+here](https://docs.zenml.io/stacks-and-components/component-guide/container-registries/gcp)

## Label Studio

-### [Start label studio locally](https://labelstud.io/guide/start)
-### [Follow these zenml instructions to set up label studio as a stack component](https://docs.zenml.io/stacks-and-components/component-guide/annotators/label-studio)
-### Create a project within label studio and name it `ship_detection_gcp`
-### [Set up label studio to use external storage](https://labelstud.io/guide/storage)
+### [Start Label Studio locally](https://labelstud.io/guide/start)
+### [Follow these ZenML instructions to set up Label Studio as a stack component](https://docs.zenml.io/stacks-and-components/component-guide/annotators/label-studio)
+### Create a project within Label Studio and name it `ship_detection_gcp`
+### [Set up Label Studio to use external storage](https://labelstud.io/guide/storage)
use the first bucket that you created to data persistence

## ZenML Stacks

### Local Stack

-The local stack should use the `default` orchestrator, a gcp remote artifact
-store that we'll call `gcp_artifact_store` here and a local label-studio annotator
-that we'll refer to as `label_studio_local`
+The local stack should use the `default` orchestrator, a gcp remote artifact
+store that we'll call `gcp_artifact_store` here and a local label-studio
+annotator that we'll refer to as `label_studio_local`.

```bash
# Make sure to replace the names with the names that you choose for your setup
@@ -110,28 +119,27 @@ a `gcp_container_registry` and a `vertex_step_operator`.
zenml stack register <gcp_stack> -o <vertex_orchestrator> -a <gcp_artifact_store> -c <gcp_container_registry> -s <vertex_step_operator>
```

-
-The project consists of the following pipelines
+The project consists of the following pipelines:

## data_ingestion_pipeline

-This pipeline downloads the [Ship Detection dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection). This dataset contains
-some truly huge images with a few hundred million pixels. In order to make these
-useable, we break down all source images into managable tiles with a max
-height/width of 1000 pixels. After this preprocessing is done, the images are
-uploaded into a cloud bucket and the ground truth annotations are uploaded to
-a local label studio instance.
+This pipeline downloads the [Ship Detection
+dataset](https://huggingface.co/datasets/datadrivenscience/ship-detection). This
+dataset contains some truly huge images with a few hundred million pixels. In
+order to make these useable, we break down all source images into manageable
+tiles with a maximum height/width of 1000 pixels. After this preprocessing is
+done, the images are uploaded into a cloud bucket and the ground truth
+annotations are uploaded to a local Label Studio instance.

### Configure this pipeline
The configuration file for this pipeline lives at `./configs/ingest_data.yaml`.
-Make sure in particular to change `data_source` to point at the gcp bucket
-which is dedicated to be the storage location for the data. Also make sure to
-adjust the `ls_project_id` to correspond to the id of your project within label
-studio.
+Make sure in particular to change `data_source` to point at the GCP bucket which
+is dedicated to be the storage location for the data. Also make sure to adjust
+the `ls_project_id` to correspond to the id of your project within Label Studio.

### Run this pipeline

-Label studio should be up and running for the whole duration of this pipeline
+Label Studio should be up and running for the whole duration of this pipeline
run.

```bash
@@ -141,17 +149,17 @@ python run.py --ingest

## data_export_pipeline

-This pipeline exports the annotations from label studio and loads it into the
-zenml artifact store to make them accessible to downstream pipelines.
+This pipeline exports the annotations from Label Studio and loads it into the
+ZenML artifact store to make them accessible to downstream pipelines.

### Configure this pipeline
The configuration file for this pipeline lives at `./configs/data_export.yaml`.
-Make sure in particular to change `dataset_name` to reflect the name of the
+Make sure in particular to change `dataset_name` to reflect the name of the
dataset within Label Studio.

### Run this pipeline

-Label studio should be up and running for the whole duration of this pipeline
+Label Studio should be up and running for the whole duration of this pipeline
run.

```bash
@@ -161,21 +169,21 @@ python run.py --export

## training_pipeline

-This pipeline trains a yolo v8 object detection model.
+This pipeline trains a YOLOv8 object detection model.

### Configure this pipeline
-You can choose to run this pipeline locally or on the cloud. These two options
-use two different configuration files. For local training:
-`./configs/training_pipeline.yaml`. For training on the cloud:
-`./configs/training_pipeline_remote_gpu.yaml`. Make sure
-`data_source` points to your cloud storage bucket.
+You can choose to run this pipeline locally or on the cloud. These two options
+use two different configuration files. For local training:
+`./configs/training_pipeline.yaml`. For training on the cloud:
+`./configs/training_pipeline_remote_gpu.yaml`. Make sure `data_source` points to
+your cloud storage bucket.

### Run this pipeline

This pipeline requires the associated model (see the model section of the
-configuration yaml file) to have a version in the `staging` stage.
-In order to promote the model produced by the latest run of the
-`data_export_pipeline`, run the following code:
+configuration yaml file) to have a version in the `staging` stage. In order to
+promote the model produced by the latest run of the `data_export_pipeline`, run
+the following code:

```bash
zenml model version update <MODEL_NAME> latest -s staging
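# Note: `latest` selects the most recently created version of <MODEL_NAME>;
# `-s staging` moves it into the `staging` stage that the training pipeline expects.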
@@ -195,23 +203,22 @@ zenml stack set <remote_stack>
python run.py --training
```

-
## inference_pipeline

-This pipeline perform inference on the object detection model.
+This pipeline performs inference on the object detection model.

### Configure this pipeline
You can configure this pipeline at the following yaml file
-`./configs/inference_pipeline.yaml`. Make sure
-`data_source` points to your cloud storage bucket that contains images that you
-want to perform batch inference on
+`./configs/inference_pipeline.yaml`. Make sure `data_source` points to your
+cloud storage bucket that contains images that you want to perform batch
+inference on

### Run this pipeline

This pipeline requires the associated model (see the model section of the
-configuration yaml file) to have a version in the `production` stage.
-In order to promote the model produced by the latest run of the
-`training_pipeline`, run the following code:
+configuration yaml file) to have a version in the `production` stage. In order
+to promote the model produced by the latest run of the `training_pipeline`, run
+the following code:

```bash
zenml model version update <MODEL_NAME> staging -s production
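# Note: this moves the version currently in `staging` into `production`,
# the stage required by the inference pipeline.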
@@ -223,14 +230,14 @@ python run.py --inference
```


-## Analyze and Curate your data through fiftyone
+## Analyze and Curate your data through FiftyOne

-Now to close the loop, we will import the predictions into fiftyone. All you'll
+Now to close the loop, we will import the predictions into FiftyOne. All you'll
need to do is run:

```bash
python run.py --fiftyone
```

-Within fiftyone, you can now analyze all the predictions and export them back
-to label studio for finetuned labeling and retraining.
+Within FiftyOne, you can now analyze all the predictions and export them back to
+Label Studio for finetuned labeling and retraining.
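
For readers following along, the `--fiftyone` entrypoint hands control to the FiftyOne App for interactive curation. A minimal sketch of the equivalent API calls, assuming the predictions were imported into a dataset named `ship_detection` (a hypothetical name — the real dataset name and import logic live in the project's `run.py`):

```python
# Hedged sketch — illustrates launching the FiftyOne App, not the project's code.
import fiftyone as fo

# Load the dataset that `run.py --fiftyone` is assumed to have created
dataset = fo.load_dataset("ship_detection")  # hypothetical dataset name

# Open the interactive app to inspect predictions and curate samples
session = fo.launch_app(dataset)
session.wait()  # keep the app open until the browser tab is closed
```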
2 changes: 1 addition & 1 deletion end-to-end-computer-vision/steps/process_hf_dataset.py
@@ -34,7 +34,7 @@
def process_hf_dataset(
    dataset: str, data_source: str, max_tile_size: int = 1000
) -> Dict[str, Any]:
"""Downloads a hf dataset and does some processing.
"""Downloads a Hugging Face dataset and does some processing.
Converts the labels into the label_studio format.
Also uploads the images to the datasource path.
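
The tiling behaviour this step implements (breaking very large source images into tiles of at most `max_tile_size` pixels per side, per the README above) follows a standard pattern. A self-contained sketch of the idea, assuming Pillow — illustrative only, and omitting the label conversion and upload that the real step also performs:

```python
# Hedged sketch of max-size tiling; not the project's actual implementation.
from PIL import Image

def tile_image(path: str, max_tile_size: int = 1000) -> list[Image.Image]:
    """Split one large image into tiles no bigger than max_tile_size per side."""
    Image.MAX_IMAGE_PIXELS = None  # satellite images can exceed Pillow's default cap
    img = Image.open(path)
    width, height = img.size
    tiles = []
    for top in range(0, height, max_tile_size):
        for left in range(0, width, max_tile_size):
            # Clamp the crop box at the image borders so edge tiles stay valid
            box = (left, top, min(left + max_tile_size, width), min(top + max_tile_size, height))
            tiles.append(img.crop(box))
    return tiles
```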
56 changes: 44 additions & 12 deletions llm-complete-guide/README.md
@@ -110,6 +110,17 @@ Note that Claude will require a different API key from Anthropic. See [the
`litellm` docs](https://docs.litellm.ai/docs/providers/anthropic) on how to set
this up.

+### Run the evaluation pipeline
+
+To run the evaluation pipeline, you can use the following command:
+
+```shell
+python run.py --evaluation
+```
+
+You'll need to have first run the RAG pipeline to have the necessary assets in
+the database to evaluate.
+
## ☁️ Running in your own VPC

The basic RAG pipeline will run using a local stack, but if you want to improve
@@ -120,6 +131,9 @@ guides](https://docs.zenml.io/user-guide/cloud-guide) (currently available for
[GCP](https://docs.zenml.io/user-guide/cloud-guide/gcp-guide)) to learn how you
can run the pipelines on a remote stack.

+If you run the pipeline using a cloud artifact store, logs from all the steps as
+well as assets like the visualizations will all be shown in the ZenML dashboard.
+
### BONUS: Connect to ZenML Cloud

If you run the pipeline using ZenML Cloud you'll have access to the managed
@@ -137,20 +151,38 @@ The project loosely follows [the recommended ZenML project structure](https://do
```
.
├── LICENSE # License file
-├── README.md # This file
-├── constants.py # Constants for the project
+├── README.md # Project documentation
+├── __init__.py
+├── constants.py # Constants used throughout the project
+├── materializers
+│   ├── __init__.py
+│   └── document_materializer.py # Document materialization logic
+├── most_basic_eval.py # Basic evaluation script
+├── most_basic_rag_pipeline.py # Basic RAG pipeline script
+├── notebooks
+│   └── visualise_embeddings.ipynb # Notebook to visualize embeddings
├── pipelines
-│   ├── __init__.py
-│   └── llm_basic_rag.py # Basic RAG pipeline
-├── requirements.txt # Requirements file
-├── run.py # Script to run the pipelines
+│   ├── __init__.py
+│   ├── generate_chunk_questions.py # Pipeline to generate chunk questions
+│   ├── llm_basic_rag.py # Basic RAG pipeline using LLM
+│   └── llm_eval.py # Pipeline for LLM evaluation
+├── requirements.txt # Project dependencies
+├── run.py # Main script to run the project
├── steps
-│   ├── __init__.py
-│   ├── populate_index.py # Step to populate the index
-│   ├── url_scraper.py # Step to scrape the URLs
-│   ├── url_scraping_utils.py # Utilities for the URL scraper
-│   └── web_url_loader.py # Step to load the URLs
-└── utils
+│   ├── __init__.py
+│   ├── eval_e2e.py # End-to-end evaluation step
+│   ├── eval_retrieval.py # Retrieval evaluation step
+│   ├── eval_visualisation.py # Evaluation visualization step
+│   ├── populate_index.py # Step to populate the index
+│   ├── synthetic_data.py # Step to generate synthetic data
+│   ├── url_scraper.py # Step to scrape URLs
+│   ├── url_scraping_utils.py # Utilities for URL scraping
+│   └── web_url_loader.py # Step to load web URLs
+├── structures.py # Data structures used in the project
+├── tests
+│   ├── __init__.py
+│   └── test_url_scraping_utils.py # Tests for URL scraping utilities
+└── utils
    ├── __init__.py
    └── llm_utils.py # Utilities related to the LLM
```
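
Given this layout, the evaluation section above implies a two-step run order: populate the index with the RAG pipeline first, then evaluate. A hedged sketch — the `--rag` flag is an assumption based on the project's flag style; only `--evaluation` is confirmed above:

```shell
python run.py --rag         # assumed flag: run the basic RAG/indexing pipeline first
python run.py --evaluation  # then evaluate retrieval and end-to-end quality
```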
Empty file added llm-complete-guide/__init__.py
5 changes: 3 additions & 2 deletions llm-complete-guide/constants.py
@@ -15,8 +15,9 @@
# limitations under the License.
#

+
# Vector Store constants
-CHUNK_SIZE = 500
+CHUNK_SIZE = 2000
CHUNK_OVERLAP = 50
EMBEDDING_DIMENSIONALITY = (
    384  # Update this to match the dimensionality of the new model
@@ -29,7 +30,7 @@
OPENAI_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
MODEL_NAME_MAP = {
"gpt4": "gpt-4-0125-preview",
"gpt4": "gpt-4",
"gpt35": "gpt-3.5-turbo",
"claude3": "claude-3-opus-20240229",
"claudehaiku": "claude-3-haiku-20240307",
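
The jump from `CHUNK_SIZE = 500` to `2000` trades retrieval granularity for more context per chunk. For intuition, a minimal sketch of how these two constants typically interact in a character-based splitter — the project's actual splitting logic may differ:

```python
# Illustrative chunker only — assumes character-based splitting with overlap.
def split_text(text: str, chunk_size: int = 2000, overlap: int = 50) -> list[str]:
    """Greedily cut text into chunk_size pieces, overlapping by `overlap` chars."""
    step = chunk_size - overlap  # each new chunk starts `step` chars after the last
    return [text[start : start + chunk_size] for start in range(0, len(text), step)]
```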