From aea6d606db8888b7c7743396d9bd9a21280960fe Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Thu, 31 Oct 2024 13:17:33 +0100 Subject: [PATCH 01/14] Inconsequential change --- llm-complete-guide/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index 5e351478..50bbf8fe 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -264,4 +264,4 @@ def main( materializer_registry.register_materializer_type( Document, DocumentMaterializer ) - main() + main() \ No newline at end of file From f82e6a145e5556e82c563cd380d078f8b89fc2d2 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Thu, 31 Oct 2024 14:49:13 +0100 Subject: [PATCH 02/14] Enable running on full dataset --- llm-complete-guide/configs/rag_local_dev.yaml | 2 +- llm-complete-guide/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llm-complete-guide/configs/rag_local_dev.yaml b/llm-complete-guide/configs/rag_local_dev.yaml index 334044b5..3d7c9ac8 100644 --- a/llm-complete-guide/configs/rag_local_dev.yaml +++ b/llm-complete-guide/configs/rag_local_dev.yaml @@ -31,4 +31,4 @@ model: steps: url_scraper: parameters: - docs_url: https://docs.zenml.io/stack-components/orchestrators + docs_url: https://docs.zenml.io diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index 50bbf8fe..5e351478 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -264,4 +264,4 @@ def main( materializer_registry.register_materializer_type( Document, DocumentMaterializer ) - main() \ No newline at end of file + main() From eaefa0d6a35d01a297bcaabdf800b28b1853fe29 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Thu, 31 Oct 2024 16:18:52 +0100 Subject: [PATCH 03/14] Refactored configs --- .../workflows/production_run_complete_llm.yml | 2 +- .../workflows/staging_run_complete_llm.yml | 2 +- .../configs/{ => dev}/embeddings.yaml | 2 +- .../{rag_local_dev.yaml => dev/rag.yaml} | 4 +- .../configs/{ => dev}/rag_eval.yaml | 2 +- .../configs/{ => dev}/synthetic.yaml | 2 +- .../configs/production/embeddings.yaml | 48 +++++ .../{rag_gcp.yaml => production/eval.yaml} | 27 +-- .../configs/production/rag.yaml | 44 +++++ .../configs/production/synthetic.yaml | 39 ++++ .../configs/staging/embeddings.yaml | 40 ++++ llm-complete-guide/configs/staging/eval.yaml | 32 ++++ llm-complete-guide/configs/staging/rag.yaml | 38 ++++ .../configs/staging/synthetic.yaml | 39 ++++ llm-complete-guide/run.py | 181 +++++++----------- 15 files changed, 361 insertions(+), 141 deletions(-) rename llm-complete-guide/configs/{ => dev}/embeddings.yaml (94%) rename llm-complete-guide/configs/{rag_local_dev.yaml => dev/rag.yaml} (84%) rename llm-complete-guide/configs/{ => dev}/rag_eval.yaml (90%) rename llm-complete-guide/configs/{ => dev}/synthetic.yaml (94%) create mode 100644 llm-complete-guide/configs/production/embeddings.yaml rename llm-complete-guide/configs/{rag_gcp.yaml => production/eval.yaml} (51%) create mode 100644 llm-complete-guide/configs/production/rag.yaml create mode 100644 llm-complete-guide/configs/production/synthetic.yaml create mode 100644 llm-complete-guide/configs/staging/embeddings.yaml create mode 100644 llm-complete-guide/configs/staging/eval.yaml create mode 100644 llm-complete-guide/configs/staging/rag.yaml create mode 100644 llm-complete-guide/configs/staging/synthetic.yaml diff --git a/.github/workflows/production_run_complete_llm.yml b/.github/workflows/production_run_complete_llm.yml index 354c9f9e..efc93f8b 100644 --- a/.github/workflows/production_run_complete_llm.yml +++ b/.github/workflows/production_run_complete_llm.yml @@ -56,4 +56,4 @@ jobs: - name: Run pipeline, create pipeline, configure trigger (Production) working-directory: ./llm-complete-guide run: | - python gh_action_rag.py --no-cache --create-template ----event-source-id --service-account-id ${{ env.ZENML_SERVICE_ACCOUNT_ID }} --action-id ${{ env.ZENML_ACTION_ID }} --config rag_gcp.yaml \ No newline at end of file + python gh_action_rag.py --no-cache --create-template ----event-source-id --service-account-id ${{ env.ZENML_SERVICE_ACCOUNT_ID }} --action-id ${{ env.ZENML_ACTION_ID }} --config production/rag.yaml \ No newline at end of file diff --git a/.github/workflows/staging_run_complete_llm.yml b/.github/workflows/staging_run_complete_llm.yml index 57125f84..0a68c797 100644 --- a/.github/workflows/staging_run_complete_llm.yml +++ b/.github/workflows/staging_run_complete_llm.yml @@ -52,4 +52,4 @@ jobs: - name: Run pipeline (Staging) working-directory: ./llm-complete-guide run: | - python gh_action_rag.py --no-cache --config rag_local_dev.yaml \ No newline at end of file + python gh_action_rag.py --no-cache --config staging/rag.yaml \ No newline at end of file diff --git a/llm-complete-guide/configs/embeddings.yaml b/llm-complete-guide/configs/dev/embeddings.yaml similarity index 94% rename from llm-complete-guide/configs/embeddings.yaml rename to llm-complete-guide/configs/dev/embeddings.yaml index 6fb2cc20..f7a66d09 100644 --- a/llm-complete-guide/configs/embeddings.yaml +++ b/llm-complete-guide/configs/dev/embeddings.yaml @@ -33,7 +33,7 @@ settings: # configuration of the Model Control Plane model: - name: finetuned-zenml-docs-embeddings + name: dev_finetuned-zenml-docs-embeddings version: latest license: Apache 2.0 description: Finetuned LLM on ZenML docs diff --git a/llm-complete-guide/configs/rag_local_dev.yaml b/llm-complete-guide/configs/dev/rag.yaml similarity index 84% rename from llm-complete-guide/configs/rag_local_dev.yaml rename to llm-complete-guide/configs/dev/rag.yaml index 3d7c9ac8..8a3848ed 100644 --- a/llm-complete-guide/configs/rag_local_dev.yaml +++ b/llm-complete-guide/configs/dev/rag.yaml @@ -23,7 +23,7 @@ settings: # configuration of the Model Control Plane model: - name: finetuned-zenml-docs-embeddings + name: dev_finetuned-zenml-docs-embeddings license: Apache 2.0 description: Finetuned LLM on ZenML docs tags: ["rag", "finetuned"] @@ -31,4 +31,4 @@ model: steps: url_scraper: parameters: - docs_url: https://docs.zenml.io + docs_url: https://docs.zenml.io/stack-components/orchestrators diff --git a/llm-complete-guide/configs/rag_eval.yaml b/llm-complete-guide/configs/dev/rag_eval.yaml similarity index 90% rename from llm-complete-guide/configs/rag_eval.yaml rename to llm-complete-guide/configs/dev/rag_eval.yaml index 6116f3bc..904c1a08 100644 --- a/llm-complete-guide/configs/rag_eval.yaml +++ b/llm-complete-guide/configs/dev/rag_eval.yaml @@ -16,7 +16,7 @@ settings: # configuration of the Model Control Plane model: - name: finetuned-zenml-docs-embeddings + name: dev_finetuned-zenml-docs-embeddings license: Apache 2.0 description: Finetuned LLM on ZenML docs tags: ["rag", "finetuned"] \ No newline at end of file diff --git a/llm-complete-guide/configs/synthetic.yaml b/llm-complete-guide/configs/dev/synthetic.yaml similarity index 94% rename from llm-complete-guide/configs/synthetic.yaml rename to llm-complete-guide/configs/dev/synthetic.yaml index 6b052429..9577e096 100644 --- a/llm-complete-guide/configs/synthetic.yaml +++ b/llm-complete-guide/configs/dev/synthetic.yaml @@ -31,7 +31,7 @@ settings: # configuration of the Model Control Plane model: - name: finetuned-zenml-docs-embeddings + name: dev_finetuned-zenml-docs-embeddings version: latest license: Apache 2.0 description: Finetuned LLM on ZenML docs diff --git a/llm-complete-guide/configs/production/embeddings.yaml b/llm-complete-guide/configs/production/embeddings.yaml new file mode 100644 index 00000000..b681a11b --- /dev/null +++ b/llm-complete-guide/configs/production/embeddings.yaml @@ -0,0 +1,48 @@ +# enable_cache: False + +# environment configuration +settings: + docker: + parent_image: "zenmldocker/prepare-release:base-0.68.1" + requirements: + - langchain-community + - ratelimit + - langchain>=0.0.325 + - langchain-openai + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers[torch]==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + + +# configuration of the Model Control Plane +model: + name: prod_finetuned-zenml-docs-embeddings + version: latest + license: Apache 2.0 + description: Finetuned LLM on ZenML docs + tags: ["rag", "finetuned"] + +steps: + finetune: + step_operator: "gcp_a100" + settings: + step_operator.vertex: + accelerator_count: 1 + accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file diff --git a/llm-complete-guide/configs/rag_gcp.yaml b/llm-complete-guide/configs/production/eval.yaml similarity index 51% rename from llm-complete-guide/configs/rag_gcp.yaml rename to llm-complete-guide/configs/production/eval.yaml index 462b5790..7b07c33b 100644 --- a/llm-complete-guide/configs/rag_gcp.yaml +++ b/llm-complete-guide/configs/production/eval.yaml @@ -1,3 +1,5 @@ +enable_cache: False + # environment configuration settings: docker: @@ -11,31 +13,20 @@ settings: - psycopg2-binary - tiktoken - ratelimit - - rerankers + - rerankers[flashrank] + - matplotlib + - pillow - pygithub environment: ZENML_PROJECT_SECRET_NAME: llm_complete ZENML_ENABLE_RICH_TRACEBACK: FALSE ZENML_LOGGING_VERBOSITY: INFO -steps: - url_scraper: - parameters: - docs_url: https://docs.zenml.io - repo_url: https://github.com/zenml-io/zenml - website_url: https://zenml.io - -# generate_embeddings: -# step_operator: "terraform-gcp-6c0fd52233ca" -# settings: -# step_operator.vertex: -# accelerator_type: "NVIDIA_TESLA_P100" -# accelerator_count: 1 -# machine_type: "n1-standard-8" - # configuration of the Model Control Plane model: - name: finetuned-zenml-docs-embeddings + name: prod_finetuned-zenml-docs-embeddings + version: latest license: Apache 2.0 description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] \ No newline at end of file + tags: ["rag", "finetuned"] + limitations: "Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split." \ No newline at end of file diff --git a/llm-complete-guide/configs/production/rag.yaml b/llm-complete-guide/configs/production/rag.yaml new file mode 100644 index 00000000..76de23d7 --- /dev/null +++ b/llm-complete-guide/configs/production/rag.yaml @@ -0,0 +1,44 @@ +enable_cache: True + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: FALSE + ZENML_LOGGING_VERBOSITY: INFO + + +# configuration of the Model Control Plane +model: + name: prod_finetuned-zenml-docs-embeddings + license: Apache 2.0 + description: A fine-tuned embeddings model for ZenML documentation. Used for RAG retrieval. + tags: ["rag", "finetuned"] + limitations: Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split. + trade_offs: Focused on a specific RAG retrieval use case. Not generalizable to other domains. + audience: ZenML users + use_cases: RAG retrieval + +steps: + url_scraper: + parameters: + docs_url: https://docs.zenml.io + generate_embeddings: + step_operator: "gcp_a100" + settings: + step_operator.vertex: + accelerator_count: 1 + accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file diff --git a/llm-complete-guide/configs/production/synthetic.yaml b/llm-complete-guide/configs/production/synthetic.yaml new file mode 100644 index 00000000..ff071427 --- /dev/null +++ b/llm-complete-guide/configs/production/synthetic.yaml @@ -0,0 +1,39 @@ +# environment configuration +settings: + docker: + requirements: + - langchain-community + - ratelimit + - langchain>=0.0.325 + - langchain-openai + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - distilabel + - argilla + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + + +# configuration of the Model Control Plane +model: + name: prod_finetuned-zenml-docs-embeddings + version: latest + license: Apache 2.0 + description: Finetuned LLM on ZenML docs + tags: ["rag", "finetuned"] diff --git a/llm-complete-guide/configs/staging/embeddings.yaml b/llm-complete-guide/configs/staging/embeddings.yaml new file mode 100644 index 00000000..c0b050bb --- /dev/null +++ b/llm-complete-guide/configs/staging/embeddings.yaml @@ -0,0 +1,40 @@ +# enable_cache: False + +# environment configuration +settings: + docker: + parent_image: "zenmldocker/prepare-release:base-0.68.0" + requirements: + - langchain-community + - ratelimit + - langchain>=0.0.325 + - langchain-openai + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers[torch]==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + + +# configuration of the Model Control Plane +model: + name: staging_finetuned-zenml-docs-embeddings + version: latest + license: Apache 2.0 + description: Finetuned LLM on ZenML docs + tags: ["rag", "finetuned"] \ No newline at end of file diff --git a/llm-complete-guide/configs/staging/eval.yaml b/llm-complete-guide/configs/staging/eval.yaml new file mode 100644 index 00000000..aee20b09 --- /dev/null +++ b/llm-complete-guide/configs/staging/eval.yaml @@ -0,0 +1,32 @@ +enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers[flashrank] + - matplotlib + - pillow + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: FALSE + ZENML_LOGGING_VERBOSITY: INFO + +# configuration of the Model Control Plane +model: + name: staging_finetuned-zenml-docs-embeddings + version: latest + license: Apache 2.0 + description: Finetuned LLM on ZenML docs + tags: ["rag", "finetuned"] + limitations: "Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split." \ No newline at end of file diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml new file mode 100644 index 00000000..59c0c736 --- /dev/null +++ b/llm-complete-guide/configs/staging/rag.yaml @@ -0,0 +1,38 @@ +enable_cache: False + +# environment configuration +settings: + docker: + requirements: + - unstructured + - sentence-transformers>=3 + - pgvector + - datasets + - litellm + - numpy + - psycopg2-binary + - tiktoken + - ratelimit + - rerankers + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + ZENML_ENABLE_RICH_TRACEBACK: FALSE + ZENML_LOGGING_VERBOSITY: INFO + + +# configuration of the Model Control Plane +model: + name: staging_finetuned-zenml-docs-embeddings + license: Apache 2.0 + description: A fine-tuned embeddings model for ZenML documentation. Used for RAG retrieval. + tags: ["rag", "finetuned"] + limitations: Only works for ZenML documentation. Not generalizable to other domains. Entirely build with synthetic data. The data is also quite noisy on account of how the chunks were split. + trade_offs: Focused on a specific RAG retrieval use case. Not generalizable to other domains. + audience: ZenML users + use_cases: RAG retrieval + +steps: + url_scraper: + parameters: + docs_url: https://docs.zenml.io diff --git a/llm-complete-guide/configs/staging/synthetic.yaml b/llm-complete-guide/configs/staging/synthetic.yaml new file mode 100644 index 00000000..91b1e13c --- /dev/null +++ b/llm-complete-guide/configs/staging/synthetic.yaml @@ -0,0 +1,39 @@ +# environment configuration +settings: + docker: + requirements: + - langchain-community + - ratelimit + - langchain>=0.0.325 + - langchain-openai + - pgvector + - psycopg2-binary + - beautifulsoup4 + - unstructured + - pandas + - numpy + - sentence-transformers>=3 + - transformers==4.43.1 + - litellm + - ollama + - tiktoken + - umap-learn + - matplotlib + - pyarrow + - rerankers[flashrank] + - datasets + - torch + - distilabel + - argilla + - pygithub + environment: + ZENML_PROJECT_SECRET_NAME: llm_complete + + +# configuration of the Model Control Plane +model: + name: staging_finetuned-zenml-docs-embeddings + version: latest + license: Apache 2.0 + description: Finetuned LLM on ZenML docs + tags: ["rag", "finetuned"] diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index 5e351478..c007521b 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -15,6 +15,7 @@ # limitations under the License. import os import warnings +from pathlib import Path # Suppress the specific FutureWarning from huggingface_hub warnings.filterwarnings( @@ -62,33 +63,18 @@ Run the ZenML LLM RAG complete guide project pipelines. """ ) -@click.option( - "--rag", - "rag", - is_flag=True, - default=False, - help="Whether to run the pipeline that creates the dataset.", -) -@click.option( - "--deploy", - "deploy", - is_flag=True, - default=False, - help="Whether to deploy a Gradio app to serve the RAG functionality.", -) -@click.option( - "--evaluation", - "evaluation", - is_flag=True, - default=False, - help="Whether to run the evaluation pipeline.", -) -@click.option( - "--query", - "query", - type=str, - required=False, - help="Query the RAG model.", +@click.argument( + "pipeline", + type=click.Choice([ + "rag", + "deploy", + "evaluation", + "query", + "synthetic", + "embeddings", + "chunks" + ]), + required=True ) @click.option( "--model", @@ -112,41 +98,20 @@ default=False, help="Disable cache.", ) -@click.option( - "--synthetic", - "synthetic", - is_flag=True, - default=False, - help="Run the synthetic data pipeline.", -) -@click.option( - "--embeddings", - "embeddings", - is_flag=True, - default=False, - help="Fine-tunes embeddings.", -) @click.option( "--argilla", - "argilla", + "use_argilla", is_flag=True, default=False, help="Uses Argilla annotations.", ) @click.option( "--reranked", - "reranked", + "use_reranker", is_flag=True, default=False, help="Whether to use the reranker.", ) -@click.option( - "--chunks", - "chunks", - is_flag=True, - default=False, - help="Generate chunks for Hugging Face dataset", -) @click.option( "--config", "config", @@ -154,107 +119,91 @@ help="Generate chunks for Hugging Face dataset", ) def main( - rag: bool = False, - deploy: bool = False, - evaluation: bool = False, - query: Optional[str] = None, + pipeline: str, + query_text: Optional[str] = None, model: str = OPENAI_MODEL, no_cache: bool = False, - synthetic: bool = False, - embeddings: bool = False, - argilla: bool = False, - reranked: bool = False, - chunks: bool = False, - config: str = None, + use_argilla: bool = False, + use_reranker: bool = False, + config: Optional[str] = None, ): """Main entry point for the pipeline execution. Args: - rag (bool): If `True`, the basic RAG pipeline will be run. - deploy (bool): If `True`, a Gradio app will be deployed to serve the RAG functionality. - evaluation (bool): If `True`, the evaluation pipeline will be run. - query (Optional[str]): If provided, the RAG model will be queried with this string. - model (str): The model to use for the completion. Default is OPENAI_MODEL. - no_cache (bool): If `True`, cache will be disabled. - synthetic (bool): If `True`, the synthetic data pipeline will be run. - embeddings (bool): If `True`, the embeddings will be fine-tuned. - argilla (bool): If `True`, the Argilla annotations will be used. - chunks (bool): If `True`, the chunks pipeline will be run. - reranked (bool): If `True`, rerankers will be used - config (str: Path to config + pipeline (str): The pipeline to execute (rag, deploy, evaluation, etc.) + query_text (Optional[str]): Query text when using 'query' command + model (str): The model to use for the completion + no_cache (bool): If True, cache will be disabled + use_argilla (bool): If True, Argilla annotations will be used + use_reranker (bool): If True, rerankers will be used + config (Optional[str]): Path to config file """ pipeline_args = {"enable_cache": not no_cache} embeddings_finetune_args = { "enable_cache": not no_cache, "steps": { "prepare_load_data": { - "parameters": {"use_argilla_annotations": argilla} + "parameters": {"use_argilla_annotations": use_argilla} } }, } - if query: + # Handle config path + config_path = None + if config: + config_path = Path(__file__).parent / "configs" / config + + # Set default config paths based on pipeline + if not config_path: + config_mapping = { + "rag": "dev/rag.yaml", + "evaluation": "dev/rag_eval.yaml", + "synthetic": "dev/synthetic.yaml", + "embeddings": "dev/embeddings.yaml" + } + if pipeline in config_mapping: + config_path = Path(__file__).parent / "configs" / config_mapping[pipeline] + + + # Execute query + if pipeline == "query": + if not query_text: + raise click.UsageError("--query-text is required when using 'query' command") response = process_input_with_retrieval( - query, model=model, use_reranking=reranked + query_text, model=model, use_reranking=use_reranker ) - - # print rich markdown to the console console = Console() md = Markdown(response) console.print(md) + return - config_path = None - if config: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - config, - ) - - if rag: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "rag_local_dev.yaml", - ) + # Execute the appropriate pipeline + if pipeline == "rag": llm_basic_rag.with_options(config_path=config_path, **pipeline_args)() - if deploy: + # Also deploy if config is provided + if config: rag_deployment.with_options( config_path=config_path, **pipeline_args )() - if deploy: + + elif pipeline == "deploy": rag_deployment.with_options(**pipeline_args)() - if evaluation: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "rag_eval.yaml", - ) + + elif pipeline == "evaluation": pipeline_args["enable_cache"] = False llm_eval.with_options(config_path=config_path)() - if synthetic: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "synthetic.yaml", - ) + + elif pipeline == "synthetic": generate_synthetic_data.with_options( config_path=config_path, **pipeline_args )() - if embeddings: - if not config_path: - config_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "configs", - "embeddings.yaml", - ) + + elif pipeline == "embeddings": finetune_embeddings.with_options( config_path=config_path, **embeddings_finetune_args )() - if chunks: + + elif pipeline == "chunks": generate_chunk_questions.with_options(**pipeline_args)() From d52526b3e2d89d63b39719429970565d0272e50d Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Thu, 31 Oct 2024 16:43:03 +0100 Subject: [PATCH 04/14] Use new commands --- llm-complete-guide/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index 033de4d6..fc6f4072 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -85,7 +85,7 @@ to run the pipelines in the correct order. You can run the script with the following command: ```shell -python run.py --rag +python run.py rag ``` This will run the basic RAG pipeline, which scrapes the ZenML documentation and @@ -100,7 +100,7 @@ use for the LLM. When you're ready to make the query, run the following command: ```shell -python run.py --query "how do I use a custom materializer inside my own zenml steps? i.e. how do I set it? inside the @step decorator?" --model=gpt4 +python run.py query "how do I use a custom materializer inside my own zenml steps? i.e. how do I set it? inside the @step decorator?" --model=gpt4 ``` Alternative options for LLMs to use include: @@ -119,7 +119,7 @@ this up. To run the evaluation pipeline, you can use the following command: ```shell -python run.py --evaluation +python run.py evaluation ``` You'll need to have first run the RAG pipeline to have the necessary assets in @@ -137,7 +137,7 @@ To run the `distilabel` synthetic data generation pipeline, you can use the foll ```shell pip install -r requirements-argilla.txt # special requirements -python run.py --synthetic +python run.py synthetic ``` You will also need to have set up and connected to an Argilla instance for this @@ -177,7 +177,7 @@ commands: ```shell pip install -r requirements-argilla.txt # special requirements -python run.py --embeddings +python run.py embeddings ``` *Credit to Phil Schmid for his [tutorial on embeddings finetuning with Matryoshka From a0535df9d694334eb011a823735a9e5097ad6591 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Sat, 2 Nov 2024 08:31:10 +0100 Subject: [PATCH 05/14] Reformatted --- llm-complete-guide/gh_action_rag.py | 61 ++- llm-complete-guide/notebooks/reranking.ipynb | 351 +++--------------- llm-complete-guide/pipelines/llm_basic_rag.py | 3 +- llm-complete-guide/pipelines/llm_eval.py | 14 +- llm-complete-guide/run.py | 36 +- .../steps/eval_visualisation.py | 2 +- .../steps/finetune_embeddings.py | 5 +- llm-complete-guide/steps/push_to_argilla.py | 3 +- 8 files changed, 105 insertions(+), 370 deletions(-) diff --git a/llm-complete-guide/gh_action_rag.py b/llm-complete-guide/gh_action_rag.py index 49c8c0f3..4828b57d 100644 --- a/llm-complete-guide/gh_action_rag.py +++ b/llm-complete-guide/gh_action_rag.py @@ -21,11 +21,10 @@ import click import yaml +from pipelines.llm_basic_rag import llm_basic_rag from zenml.client import Client from zenml.exceptions import ZenKeyError -from pipelines.llm_basic_rag import llm_basic_rag - @click.command( help=""" @@ -39,7 +38,6 @@ default=False, help="Disable cache.", ) - @click.option( "--create-template", "create_template", @@ -51,26 +49,26 @@ "--config", "config", default="rag_local_dev.yaml", - help="Specify a configuration file" + help="Specify a configuration file", ) @click.option( "--service-account-id", "service_account_id", default=None, - help="Specify a service account ID" + help="Specify a service account ID", ) @click.option( "--event-source-id", "event_source_id", default=None, - help="Specify an event source ID" + help="Specify an event source ID", ) def main( no_cache: bool = False, - config: Optional[str]= "rag_local_dev.yaml", + config: Optional[str] = "rag_local_dev.yaml", create_template: bool = False, service_account_id: Optional[str] = None, - event_source_id: Optional[str] = None + event_source_id: Optional[str] = None, ): """ Executes the pipeline to train a basic RAG model. @@ -86,43 +84,43 @@ def main( client = Client() config_path = Path(__file__).parent / "configs" / config - with (open(config_path,"r") as file): + with open(config_path, "r") as file: config = yaml.safe_load(file) if create_template: - # run pipeline run = llm_basic_rag.with_options( - config_path=str(config_path), - enable_cache=not no_cache + config_path=str(config_path), enable_cache=not no_cache )() # create new run template rt = client.create_run_template( name=f"production-llm-complete-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}", - deployment_id=run.deployment_id + deployment_id=run.deployment_id, ) try: # Check if an action ahs already be configured for this pipeline action = client.get_action( name_id_or_prefix="LLM Complete (production)", - allow_name_prefix_match=True + allow_name_prefix_match=True, ) except ZenKeyError: if not event_source_id: - raise RuntimeError("An event source is required for this workflow.") + raise RuntimeError( + "An event source is required for this workflow." + ) if not service_account_id: service_account_id = client.create_service_account( name="github-action-sa", - description="To allow triggered pipelines to run with M2M authentication." + description="To allow triggered pipelines to run with M2M authentication.", ).id action_id = client.create_action( name="LLM Complete (production)", configuration={ "template_id": str(rt.id), - "run_config": pop_restricted_configs(config) + "run_config": pop_restricted_configs(config), }, service_account_id=service_account_id, auth_window=0, @@ -132,7 +130,7 @@ def main( event_source_id=UUID(event_source_id), event_filter={"event_type": "tag_event"}, action_id=action_id, - description="Trigger pipeline to reindex everytime the docs are updated through git." + description="Trigger pipeline to reindex everytime the docs are updated through git.", ) else: # update the action with the new template @@ -141,14 +139,13 @@ def main( name_id_or_prefix=action.id, configuration={ "template_id": str(rt.id), - "run_config": pop_restricted_configs(config) - } + "run_config": pop_restricted_configs(config), + }, ) else: llm_basic_rag.with_options( - config_path=str(config_path), - enable_cache=not no_cache + config_path=str(config_path), enable_cache=not no_cache )() @@ -162,22 +159,22 @@ def pop_restricted_configs(run_configuration: dict) -> dict: Modified dictionary with restricted items removed """ # Pop top-level restricted items - run_configuration.pop('parameters', None) - run_configuration.pop('build', None) - run_configuration.pop('schedule', None) + run_configuration.pop("parameters", None) + run_configuration.pop("build", None) + run_configuration.pop("schedule", None) # Pop docker settings if they exist - if 'settings' in run_configuration: - run_configuration['settings'].pop('docker', None) + if "settings" in run_configuration: + run_configuration["settings"].pop("docker", None) # Pop docker settings from steps if they exist - if 'steps' in run_configuration: - for step in run_configuration['steps'].values(): - if 'settings' in step: - step['settings'].pop('docker', None) + if "steps" in run_configuration: + for step in run_configuration["steps"].values(): + if "settings" in step: + step["settings"].pop("docker", None) return run_configuration if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/llm-complete-guide/notebooks/reranking.ipynb b/llm-complete-guide/notebooks/reranking.ipynb index 94342811..80f8507a 100644 --- a/llm-complete-guide/notebooks/reranking.ipynb +++ b/llm-complete-guide/notebooks/reranking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -12,128 +12,9 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading TransformerRanker model mixedbread-ai/mxbai-rerank-large-v1\n", - "No device set\n", - "Using device cuda\n", - "No dtype set\n", - "Using dtype torch.float16\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "867edac78ccb49aea85b6e96c03c201b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/970 [00:00RankedResults(\n", - " results=[\n", - " Result(doc_id=0, text='I like to play soccer', score=-1.2607421875, rank=1),\n", - " Result(doc_id=2, text='I like to play basketball', score=-1.2890625, rank=2),\n", - " Result(doc_id=1, text='I like to play football', score=-1.9384765625, rank=3),\n", - " Result(doc_id=3, text='I love dogs', score=-5.12109375, rank=4),\n", - " Result(doc_id=4, text='Catcher in the Rye is a great book', score=-6.19140625, rank=5)\n", - " ],\n", - " query=\"What's your favorite sport?\",\n", - " has_scores=True\n", - ")\n", - "\n" - ], - "text/plain": [ - "\u001b[1;35mRankedResults\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mresults\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play soccer'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.2607421875\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play basketball'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.2890625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m2\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play football'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-1.9384765625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I love dogs'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.12109375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m4\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'Catcher in the Rye is a great book'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-6.19140625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m5\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mquery\u001b[0m=\u001b[32m\"What\u001b[0m\u001b[32m's your favorite sport?\"\u001b[0m,\n", - " \u001b[33mhas_scores\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "results = ranker.rank(query=\"What's your favorite sport?\", docs=texts)\n", "\n", @@ -207,35 +56,9 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[\n",
-       "    'I like to play soccer',\n",
-       "    'I like to play basketball',\n",
-       "    'I like to play football',\n",
-       "    'I love dogs',\n",
-       "    'Catcher in the Rye is a great book'\n",
-       "]\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[32m'I like to play soccer'\u001b[0m,\n", - " \u001b[32m'I like to play basketball'\u001b[0m,\n", - " \u001b[32m'I like to play football'\u001b[0m,\n", - " \u001b[32m'I love dogs'\u001b[0m,\n", - " \u001b[32m'Catcher in the Rye is a great book'\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print([document.text for document in results.results])" ] @@ -256,36 +79,24 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "attempted relative import with no known parent package", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[46], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msteps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meval_retrieval\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m query_similar_docs\n", - "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" - ] - } - ], + "outputs": [], "source": [ "embedded_question = get_embeddings(question)\n", - " db_conn = get_db_conn()\n", - " num_docs = 20 if use_reranking else 5\n", - " # get (content, url) tuples for the top n similar documents\n", - " top_similar_docs = get_topn_similar_docs(\n", - " embedded_question, db_conn, n=num_docs, include_metadata=True\n", - " )\n", - "\n", - " if use_reranking:\n", - " urls = rerank_documents(question, top_similar_docs)[:5]\n", - " else:\n", - " urls = [doc[1] for doc in top_similar_docs] # Unpacking URLs\n", - "\n", - " return (question, url_ending, urls)\n" + "db_conn = get_db_conn()\n", + "num_docs = 20 if use_reranking else 5\n", + "# get (content, url) tuples for the top n similar documents\n", + "top_similar_docs = get_topn_similar_docs(\n", + " embedded_question, db_conn, n=num_docs, include_metadata=True\n", + ")\n", + "\n", + "if use_reranking:\n", + " urls = rerank_documents(question, top_similar_docs)[:5]\n", + "else:\n", + " urls = [doc[1] for doc in top_similar_docs] # Unpacking URLs\n", + "\n", + "return (question, url_ending, urls)" ] }, { @@ -297,32 +108,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ranked documents for query: quick brown fox\n", - "Document: A quick brown fox jumps over the lazy dog\n", - "Score: 0.6937165451385258\n", - "\n", - "Document: The quick brown fox jumps over the lazy dog\n", - "Score: 0.6928630071635998\n", - "\n", - "Document: The quick brown fox is quick and brown\n", - "Score: 0.6868308019742143\n", - "\n", - "Document: The quick brown fox is different from the lazy dog\n", - "Score: 0.6802242759508812\n", - "\n", - "Document: The lazy dog is lazy and sleepy\n", - "Score: 0.5727275080137214\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", @@ -338,7 +126,11 @@ "]\n", "\n", "# Toy queries and their corresponding relevant document indices\n", - "queries = [(\"quick fox\", [0, 1, 2]), (\"lazy dog\", [3, 4]), (\"brown fox\", [0, 1, 2, 4])]\n", + "queries = [\n", + " (\"quick fox\", [0, 1, 2]),\n", + " (\"lazy dog\", [3, 4]),\n", + " (\"brown fox\", [0, 1, 2, 4]),\n", + "]\n", "\n", "# Create TF-IDF vectorizer\n", "vectorizer = TfidfVectorizer()\n", @@ -355,7 +147,9 @@ " query_vector = vectorizer.transform([query])\n", " for doc_idx, doc_vector in enumerate(document_vectors):\n", " X_train.append(\n", - " np.concatenate((query_vector.toarray()[0], doc_vector.toarray()[0]))\n", + " np.concatenate(\n", + " (query_vector.toarray()[0], doc_vector.toarray()[0])\n", + " )\n", " )\n", " y_train.append(1 if doc_idx in relevant_docs else 0)\n", "\n", @@ -367,7 +161,9 @@ "scores = []\n", "\n", "for doc_vector in document_vectors:\n", - " input_vector = np.concatenate((query_vector.toarray()[0], doc_vector.toarray()[0]))\n", + " input_vector = np.concatenate(\n", + " (query_vector.toarray()[0], doc_vector.toarray()[0])\n", + " )\n", " score = reranker.predict_proba([input_vector])[0][1]\n", " scores.append(score)\n", "\n", @@ -381,28 +177,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading default cross-encoder model for language en\n", - "Warning: Model type could not be auto-mapped with the defaults list. Defaulting to TransformerRanker.\n", - "If your model is NOT intended to be ran as a one-label cross-encoder, please reload it and specify the model_type! Otherwise, you may ignore this warning. You may specify `model_type='cross-encoder'` to suppress this warning in the future.\n", - "Default Model: mixedbread-ai/mxbai-rerank-base-v1\n", - "Loading TransformerRanker model mixedbread-ai/mxbai-rerank-base-v1\n", - "No device set\n", - "Using device cuda\n", - "No dtype set\n", - "Using dtype torch.float16\n", - "Loaded model mixedbread-ai/mxbai-rerank-base-v1\n", - "Using device cuda.\n", - "Using dtype torch.float16.\n" - ] - } - ], + "outputs": [], "source": [ "from rerankers import Reranker\n", "\n", @@ -422,45 +199,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RankedResults(\n",
-       "    results=[\n",
-       "        Result(doc_id=5, text='I like to play basketball', score=-0.46533203125, rank=1),\n",
-       "        Result(doc_id=0, text='I like to play soccer', score=-0.7353515625, rank=2),\n",
-       "        Result(doc_id=1, text='I like to play football', score=-0.9677734375, rank=3),\n",
-       "        Result(doc_id=2, text='War and Peace is a great book', score=-5.40234375, rank=4),\n",
-       "        Result(doc_id=3, text='I love dogs', score=-5.5859375, rank=5),\n",
-       "        Result(doc_id=4, text=\"Ginger cats aren't very smart\", score=-5.94921875, rank=6)\n",
-       "    ],\n",
-       "    query=\"What's your favorite sport?\",\n",
-       "    has_scores=True\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRankedResults\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mresults\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m5\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play basketball'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.46533203125\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m0\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play soccer'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.7353515625\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m2\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I like to play football'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-0.9677734375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'War and Peace is a great book'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.40234375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'I love dogs'\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.5859375\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m5\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mResult\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdoc_id\u001b[0m=\u001b[1;36m4\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m\"Ginger\u001b[0m\u001b[32m cats aren't very smart\"\u001b[0m, \u001b[33mscore\u001b[0m=\u001b[1;36m-5.94921875\u001b[0m, \u001b[33mrank\u001b[0m=\u001b[1;36m6\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mquery\u001b[0m=\u001b[32m\"What\u001b[0m\u001b[32m's your favorite sport?\"\u001b[0m,\n", - " \u001b[33mhas_scores\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(results)" ] @@ -475,7 +216,7 @@ ], "metadata": { "kernelspec": { - "display_name": "new-rag", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -489,9 +230,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.9" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/llm-complete-guide/pipelines/llm_basic_rag.py b/llm-complete-guide/pipelines/llm_basic_rag.py index 3cfb4051..6cf99f08 100644 --- a/llm-complete-guide/pipelines/llm_basic_rag.py +++ b/llm-complete-guide/pipelines/llm_basic_rag.py @@ -15,8 +15,6 @@ # limitations under the License. # -from zenml import pipeline - from steps.populate_index import ( generate_embeddings, index_generator, @@ -24,6 +22,7 @@ ) from steps.url_scraper import url_scraper from steps.web_url_loader import web_url_loader +from zenml import pipeline @pipeline diff --git a/llm-complete-guide/pipelines/llm_eval.py b/llm-complete-guide/pipelines/llm_eval.py index d310fd18..8f604dac 100644 --- a/llm-complete-guide/pipelines/llm_eval.py +++ b/llm-complete-guide/pipelines/llm_eval.py @@ -13,12 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from pathlib import Path from typing import Optional import click - from steps.eval_e2e import e2e_evaluation, e2e_evaluation_llm_judged from steps.eval_retrieval import ( retrieval_evaluation_full, @@ -82,12 +80,9 @@ def llm_eval() -> None: "--config", "config", default="rag_local_dev.yaml", - help="Specify a configuration file" + help="Specify a configuration file", ) -def main( - no_cache: bool = False, - config: Optional[str] = "rag_eval.yaml" -): +def main(no_cache: bool = False, config: Optional[str] = "rag_eval.yaml"): """ Executes the pipeline to train a basic RAG model. @@ -98,10 +93,9 @@ def main( config_path = Path(__file__).parent.parent / "configs" / config llm_eval.with_options( - config_path=str(config_path), - enable_cache=not no_cache + config_path=str(config_path), enable_cache=not no_cache )() if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index 58634858..a84c37ec 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os import warnings from pathlib import Path @@ -65,16 +64,18 @@ ) @click.argument( "pipeline", - type=click.Choice([ - "rag", - "deploy", - "evaluation", - "query", - "synthetic", - "embeddings", - "chunks" - ]), - required=True + type=click.Choice( + [ + "rag", + "deploy", + "evaluation", + "query", + "synthetic", + "embeddings", + "chunks", + ] + ), + required=True, ) @click.option( "--model", @@ -159,16 +160,19 @@ def main( "rag": "dev/rag.yaml", "evaluation": "dev/rag_eval.yaml", "synthetic": "dev/synthetic.yaml", - "embeddings": "dev/embeddings.yaml" + "embeddings": "dev/embeddings.yaml", } if pipeline in config_mapping: - config_path = Path(__file__).parent / "configs" / config_mapping[pipeline] - + config_path = ( + Path(__file__).parent / "configs" / config_mapping[pipeline] + ) # Execute query if pipeline == "query": if not query_text: - raise click.UsageError("--query-text is required when using 'query' command") + raise click.UsageError( + "--query-text is required when using 'query' command" + ) response = process_input_with_retrieval( query_text, model=model, use_reranking=use_reranker ) @@ -213,4 +217,4 @@ def main( materializer_registry.register_materializer_type( Document, DocumentMaterializer ) - main() \ No newline at end of file + main() diff --git a/llm-complete-guide/steps/eval_visualisation.py b/llm-complete-guide/steps/eval_visualisation.py index 4b7b004b..badd62c1 100644 --- a/llm-complete-guide/steps/eval_visualisation.py +++ b/llm-complete-guide/steps/eval_visualisation.py @@ -65,7 +65,7 @@ def create_image( fontweight="bold", ) else: - bar_color = colors[i] if alternate_colours else "blue" + colors[i] if alternate_colours else "blue" text_color = "white" ax.text( v diff --git a/llm-complete-guide/steps/finetune_embeddings.py b/llm-complete-guide/steps/finetune_embeddings.py index ad9d9469..3117c473 100644 --- a/llm-complete-guide/steps/finetune_embeddings.py +++ b/llm-complete-guide/steps/finetune_embeddings.py @@ -23,7 +23,8 @@ DATASET_NAME_DISTILABEL, EMBEDDINGS_MODEL_ID_BASELINE, EMBEDDINGS_MODEL_ID_FINE_TUNED, - EMBEDDINGS_MODEL_MATRYOSHKA_DIMS, SECRET_NAME, + EMBEDDINGS_MODEL_MATRYOSHKA_DIMS, + SECRET_NAME, ) from datasets import DatasetDict, concatenate_datasets, load_dataset from datasets.arrow_dataset import Dataset @@ -294,7 +295,7 @@ def finetune( trainer.model.push_to_hub( f"zenml/{EMBEDDINGS_MODEL_ID_FINE_TUNED}", exist_ok=True, - token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"] + token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"], ) log_model_metadata( diff --git a/llm-complete-guide/steps/push_to_argilla.py b/llm-complete-guide/steps/push_to_argilla.py index 90c3d2d9..e67bf621 100644 --- a/llm-complete-guide/steps/push_to_argilla.py +++ b/llm-complete-guide/steps/push_to_argilla.py @@ -16,7 +16,6 @@ import argilla as rg import torch from argilla._exceptions import ConflictError - from constants import ( DATASET_NAME_ARGILLA, EMBEDDINGS_MODEL_ID_BASELINE, @@ -115,7 +114,7 @@ def push_to_argilla(train_dataset: Dataset, test_dataset: Dataset) -> None: try: ds.create() except ConflictError: - ds = client.datasets(DATASET_NAME_ARGILLA) + ds = client.datasets(DATASET_NAME_ARGILLA) # process original HF dataset try: From 63eca31d8b24689d4cd9e90df8027ef29c01fbad Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Mon, 4 Nov 2024 23:01:48 +0100 Subject: [PATCH 06/14] Update llm-complete-guide/configs/staging/synthetic.yaml Co-authored-by: Alex Strick van Linschoten --- llm-complete-guide/configs/staging/synthetic.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/llm-complete-guide/configs/staging/synthetic.yaml b/llm-complete-guide/configs/staging/synthetic.yaml index 91b1e13c..1cfd772f 100644 --- a/llm-complete-guide/configs/staging/synthetic.yaml +++ b/llm-complete-guide/configs/staging/synthetic.yaml @@ -2,10 +2,7 @@ settings: docker: requirements: - - langchain-community - ratelimit - - langchain>=0.0.325 - - langchain-openai - pgvector - psycopg2-binary - beautifulsoup4 From fbb45fee42c1c3e7bbf2ab1b2f6ce75924f230b0 Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Mon, 4 Nov 2024 23:02:10 +0100 Subject: [PATCH 07/14] Update llm-complete-guide/configs/staging/embeddings.yaml Co-authored-by: Alex Strick van Linschoten --- llm-complete-guide/configs/staging/embeddings.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/llm-complete-guide/configs/staging/embeddings.yaml b/llm-complete-guide/configs/staging/embeddings.yaml index c0b050bb..22e0ba09 100644 --- a/llm-complete-guide/configs/staging/embeddings.yaml +++ b/llm-complete-guide/configs/staging/embeddings.yaml @@ -5,10 +5,7 @@ settings: docker: parent_image: "zenmldocker/prepare-release:base-0.68.0" requirements: - - langchain-community - ratelimit - - langchain>=0.0.325 - - langchain-openai - pgvector - psycopg2-binary - beautifulsoup4 From 61987b28efffe96077b6b4e038488be6973e4801 Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Mon, 4 Nov 2024 23:04:02 +0100 Subject: [PATCH 08/14] Update llm-complete-guide/configs/production/synthetic.yaml Co-authored-by: Alex Strick van Linschoten --- llm-complete-guide/configs/production/synthetic.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/llm-complete-guide/configs/production/synthetic.yaml b/llm-complete-guide/configs/production/synthetic.yaml index ff071427..9f2c0bb3 100644 --- a/llm-complete-guide/configs/production/synthetic.yaml +++ b/llm-complete-guide/configs/production/synthetic.yaml @@ -2,10 +2,7 @@ settings: docker: requirements: - - langchain-community - ratelimit - - langchain>=0.0.325 - - langchain-openai - pgvector - psycopg2-binary - beautifulsoup4 From cfc5567c604192f38c6acd18e2e2976caea07677 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Mon, 4 Nov 2024 23:33:37 +0100 Subject: [PATCH 09/14] Fixed configs and Readme --- llm-complete-guide/README.md | 4 ++-- llm-complete-guide/configs/dev/embeddings.yaml | 2 +- llm-complete-guide/configs/dev/synthetic.yaml | 1 + llm-complete-guide/configs/production/embeddings.yaml | 5 +---- llm-complete-guide/configs/production/synthetic.yaml | 1 + llm-complete-guide/configs/staging/embeddings.yaml | 2 +- llm-complete-guide/configs/staging/synthetic.yaml | 1 + 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llm-complete-guide/README.md b/llm-complete-guide/README.md index fc6f4072..3aaa271e 100644 --- a/llm-complete-guide/README.md +++ b/llm-complete-guide/README.md @@ -23,7 +23,7 @@ instructions are provided below for how to set that up. ## 📽️ Watch the webinars -We've recently been holding some webinars about this repository and project. Watche the videos below if you want an introduction and context around the code and ideas covered in this project. +We've recently been holding some webinars about this repository and project. Watch the videos below if you want an introduction and context around the code and ideas covered in this project. [![Building and Optimizing RAG Pipelines: Data Preprocessing, Embeddings, and Evaluation with ZenML](https://github.com/user-attachments/assets/1aea2bd4-8079-4ea2-98e1-8da6ba9aeebe)](https://www.youtube.com/watch?v=PazRMY8bo3U) @@ -45,7 +45,7 @@ pip install -r requirements.txt Depending on your hardware you may run into some issues when running the `pip install` command with the `flash_attn` package. In that case running `FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation` -could help you. +could help you. Possibly you might also need to install torch separately. In order to use the default LLM for this query, you'll need an account and an API key from OpenAI specified as a ZenML secret: diff --git a/llm-complete-guide/configs/dev/embeddings.yaml b/llm-complete-guide/configs/dev/embeddings.yaml index f7a66d09..4f00a98a 100644 --- a/llm-complete-guide/configs/dev/embeddings.yaml +++ b/llm-complete-guide/configs/dev/embeddings.yaml @@ -3,7 +3,6 @@ # environment configuration settings: docker: - parent_image: "zenmldocker/prepare-release:base-0.68.0" requirements: - langchain-community - ratelimit @@ -27,6 +26,7 @@ settings: - datasets - torch - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete diff --git a/llm-complete-guide/configs/dev/synthetic.yaml b/llm-complete-guide/configs/dev/synthetic.yaml index 9577e096..bb7ebbf7 100644 --- a/llm-complete-guide/configs/dev/synthetic.yaml +++ b/llm-complete-guide/configs/dev/synthetic.yaml @@ -25,6 +25,7 @@ settings: - torch - distilabel - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete diff --git a/llm-complete-guide/configs/production/embeddings.yaml b/llm-complete-guide/configs/production/embeddings.yaml index b681a11b..7d027743 100644 --- a/llm-complete-guide/configs/production/embeddings.yaml +++ b/llm-complete-guide/configs/production/embeddings.yaml @@ -3,12 +3,8 @@ # environment configuration settings: docker: - parent_image: "zenmldocker/prepare-release:base-0.68.1" requirements: - - langchain-community - ratelimit - - langchain>=0.0.325 - - langchain-openai - pgvector - psycopg2-binary - beautifulsoup4 @@ -27,6 +23,7 @@ settings: - datasets - torch - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete diff --git a/llm-complete-guide/configs/production/synthetic.yaml b/llm-complete-guide/configs/production/synthetic.yaml index 9f2c0bb3..5d5bac27 100644 --- a/llm-complete-guide/configs/production/synthetic.yaml +++ b/llm-complete-guide/configs/production/synthetic.yaml @@ -23,6 +23,7 @@ settings: - distilabel - argilla - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete diff --git a/llm-complete-guide/configs/staging/embeddings.yaml b/llm-complete-guide/configs/staging/embeddings.yaml index 22e0ba09..d8bbfc45 100644 --- a/llm-complete-guide/configs/staging/embeddings.yaml +++ b/llm-complete-guide/configs/staging/embeddings.yaml @@ -3,7 +3,6 @@ # environment configuration settings: docker: - parent_image: "zenmldocker/prepare-release:base-0.68.0" requirements: - ratelimit - pgvector @@ -24,6 +23,7 @@ settings: - datasets - torch - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete diff --git a/llm-complete-guide/configs/staging/synthetic.yaml b/llm-complete-guide/configs/staging/synthetic.yaml index 1cfd772f..ba16d590 100644 --- a/llm-complete-guide/configs/staging/synthetic.yaml +++ b/llm-complete-guide/configs/staging/synthetic.yaml @@ -23,6 +23,7 @@ settings: - distilabel - argilla - pygithub + - openai environment: ZENML_PROJECT_SECRET_NAME: llm_complete From e2d66d382bf2c56bdb6bdacd626a3de8897f757e Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Mon, 4 Nov 2024 23:51:29 +0100 Subject: [PATCH 10/14] Only load hf_token when needed --- llm-complete-guide/steps/rag_deployment.py | 9 +++++---- llm-complete-guide/utils/hf_utils.py | 8 ++++++++ llm-complete-guide/utils/openai_utils.py | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 llm-complete-guide/utils/hf_utils.py diff --git a/llm-complete-guide/steps/rag_deployment.py b/llm-complete-guide/steps/rag_deployment.py index a750dde6..4aa2fec2 100644 --- a/llm-complete-guide/steps/rag_deployment.py +++ b/llm-complete-guide/steps/rag_deployment.py @@ -2,6 +2,8 @@ import webbrowser from huggingface_hub import HfApi + +from utils.hf_utils import get_hf_token from utils.llm_utils import process_input_with_retrieval from zenml import step from zenml.client import Client @@ -11,7 +13,6 @@ ZENML_API_TOKEN = secret.secret_values["zenml_api_token"] ZENML_STORE_URL = secret.secret_values["zenml_store_url"] -HF_TOKEN = os.getenv("HF_TOKEN") SPACE_USERNAME = os.environ.get("ZENML_HF_USERNAME", "zenml") SPACE_NAME = os.environ.get("ZENML_HF_SPACE_NAME", "llm-complete-guide-rag") @@ -50,7 +51,7 @@ def predict(message, history): def upload_files_to_repo( - api, repo_id: str, files_mapping: dict, token: str = HF_TOKEN + api, repo_id: str, files_mapping: dict, token: str ): """Upload multiple files to a Hugging Face repository @@ -89,7 +90,7 @@ def gradio_rag_deployment() -> None: space_sdk="gradio", private=True, exist_ok=True, - token=HF_TOKEN, + token=get_hf_token(), ) api.add_space_secret( repo_id=hf_repo_id, @@ -112,6 +113,6 @@ def gradio_rag_deployment() -> None: hf_repo_requirements: "requirements.txt", } - upload_files_to_repo(api, hf_repo_id, files_to_upload, HF_TOKEN) + upload_files_to_repo(api, hf_repo_id, files_to_upload, get_hf_token()) webbrowser.open(f"https://huggingface.co/spaces/{hf_repo_id}") diff --git a/llm-complete-guide/utils/hf_utils.py b/llm-complete-guide/utils/hf_utils.py new file mode 100644 index 00000000..2de954fa --- /dev/null +++ b/llm-complete-guide/utils/hf_utils.py @@ -0,0 +1,8 @@ +from constants import SECRET_NAME +from zenml.client import Client + + +def get_hf_token() -> str: + api_key = Client().get_secret(SECRET_NAME).secret_values["hf_token"] + + return api_key diff --git a/llm-complete-guide/utils/openai_utils.py b/llm-complete-guide/utils/openai_utils.py index e67ba5f9..15b84cc5 100644 --- a/llm-complete-guide/utils/openai_utils.py +++ b/llm-complete-guide/utils/openai_utils.py @@ -2,7 +2,7 @@ from zenml.client import Client -def get_openai_api_key(): +def get_openai_api_key() -> str: api_key = Client().get_secret(SECRET_NAME).secret_values["openai_api_key"] return api_key From 6ebbd74f74a3766726375fe464f2a90278b768d6 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Tue, 5 Nov 2024 00:09:28 +0100 Subject: [PATCH 11/14] Adjusted zenml env vars --- .github/workflows/production_run_complete_llm.yml | 4 ++-- .github/workflows/staging_run_complete_llm.yml | 4 ++-- llm-complete-guide/steps/rag_deployment.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/production_run_complete_llm.yml b/.github/workflows/production_run_complete_llm.yml index efc93f8b..c57b84d5 100644 --- a/.github/workflows/production_run_complete_llm.yml +++ b/.github/workflows/production_run_complete_llm.yml @@ -14,7 +14,7 @@ jobs: run-staging-workflow: runs-on: ubuntu-latest env: - ZENML_HOST: ${{ secrets.ZENML_PROJECTS_HOST }} + ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} ZENML_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} ZENML_PRODUCTION_STACK: b3951d43-0fb2-4d32-89c5-3399374e7c7e # Set this to your production stack ID ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} @@ -46,7 +46,7 @@ jobs: working-directory: ./llm-complete-guide run: | zenml init - zenml connect --url $ZENML_HOST --api-key $ZENML_API_KEY + zenml connect --url $ZENML_STORE_URL --api-key $ZENML_API_KEY - name: Set stack (Production) working-directory: ./llm-complete-guide diff --git a/.github/workflows/staging_run_complete_llm.yml b/.github/workflows/staging_run_complete_llm.yml index 0a68c797..2c7ad35b 100644 --- a/.github/workflows/staging_run_complete_llm.yml +++ b/.github/workflows/staging_run_complete_llm.yml @@ -12,7 +12,7 @@ jobs: run-staging-workflow: runs-on: ubuntu-latest env: - ZENML_HOST: ${{ secrets.ZENML_PROJECTS_HOST }} + ZENML_STORE_URL: ${{ secrets.ZENML_PROJECTS_HOST }} ZENML_API_KEY: ${{ secrets.ZENML_PROJECTS_API_KEY }} ZENML_STAGING_STACK : 67166d73-a44e-42f9-b67f-011e9afab9b5 # Set this to your staging stack ID ZENML_GITHUB_SHA: ${{ github.event.pull_request.head.sha }} @@ -42,7 +42,7 @@ jobs: working-directory: ./llm-complete-guide run: | zenml init - zenml connect --url $ZENML_HOST --api-key $ZENML_API_KEY + zenml connect --url $ZENML_STORE_URL --api-key $ZENML_API_KEY - name: Set stack (Staging) working-directory: ./llm-complete-guide diff --git a/llm-complete-guide/steps/rag_deployment.py b/llm-complete-guide/steps/rag_deployment.py index 4aa2fec2..99a8c911 100644 --- a/llm-complete-guide/steps/rag_deployment.py +++ b/llm-complete-guide/steps/rag_deployment.py @@ -11,8 +11,8 @@ secret = Client().get_secret("llm-complete") -ZENML_API_TOKEN = secret.secret_values["zenml_api_token"] -ZENML_STORE_URL = secret.secret_values["zenml_store_url"] +ZENML_API_TOKEN = os.environ.get("ZENML_API_TOKEN") +ZENML_STORE_URL = os.environ.get("ZENML_STORE_URL") SPACE_USERNAME = os.environ.get("ZENML_HF_USERNAME", "zenml") SPACE_NAME = os.environ.get("ZENML_HF_SPACE_NAME", "llm-complete-guide-rag") From 1a0d75bd3794521c9e459c310dca5a42ff8cc512 Mon Sep 17 00:00:00 2001 From: AlexejPenner Date: Sun, 10 Nov 2024 17:34:01 +0100 Subject: [PATCH 12/14] Use only subset locally --- llm-complete-guide/configs/dev/rag.yaml | 3 ++- .../configs/production/rag.yaml | 13 +++++----- llm-complete-guide/configs/staging/rag.yaml | 1 + llm-complete-guide/pipelines/llm_basic_rag.py | 1 + llm-complete-guide/steps/url_scraper.py | 26 ++++++++++--------- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llm-complete-guide/configs/dev/rag.yaml b/llm-complete-guide/configs/dev/rag.yaml index 8a3848ed..637da8bb 100644 --- a/llm-complete-guide/configs/dev/rag.yaml +++ b/llm-complete-guide/configs/dev/rag.yaml @@ -31,4 +31,5 @@ model: steps: url_scraper: parameters: - docs_url: https://docs.zenml.io/stack-components/orchestrators + docs_url: https://docs.zenml.io/ + use_dev_set: true diff --git a/llm-complete-guide/configs/production/rag.yaml b/llm-complete-guide/configs/production/rag.yaml index 76de23d7..7e11a615 100644 --- a/llm-complete-guide/configs/production/rag.yaml +++ b/llm-complete-guide/configs/production/rag.yaml @@ -36,9 +36,10 @@ steps: url_scraper: parameters: docs_url: https://docs.zenml.io - generate_embeddings: - step_operator: "gcp_a100" - settings: - step_operator.vertex: - accelerator_count: 1 - accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file + use_dev_set: false +# generate_embeddings: +# step_operator: "sagemaker" +# settings: +# step_operator.sagemaker: +# accelerator_count: 1 +# accelerator_type: NVIDIA_TESLA_A100 \ No newline at end of file diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml index 59c0c736..e02e1e26 100644 --- a/llm-complete-guide/configs/staging/rag.yaml +++ b/llm-complete-guide/configs/staging/rag.yaml @@ -36,3 +36,4 @@ steps: url_scraper: parameters: docs_url: https://docs.zenml.io + use_dev_set: false diff --git a/llm-complete-guide/pipelines/llm_basic_rag.py b/llm-complete-guide/pipelines/llm_basic_rag.py index 895c4df3..82a97b21 100644 --- a/llm-complete-guide/pipelines/llm_basic_rag.py +++ b/llm-complete-guide/pipelines/llm_basic_rag.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from litellm import config_path from steps.populate_index import ( generate_embeddings, diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py index f7910e26..ef7c1a9d 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -21,11 +21,12 @@ from steps.url_scraping_utils import get_all_pages -@step(enable_cache=True) +@step(enable_cache=True, step_operator="gcp_a100") def url_scraper( docs_url: str = "https://docs.zenml.io", repo_url: str = "https://github.com/zenml-io/zenml", website_url: str = "https://zenml.io", + use_dev_set: bool = False ) -> Annotated[str, ArtifactConfig(name="urls")]: """Generates a list of relevant URLs to scrape. @@ -39,18 +40,19 @@ def url_scraper( """ # We comment this out to make this pipeline faster # examples_readme_urls = get_nested_readme_urls(repo_url) - docs_urls = get_all_pages(docs_url) + if use_dev_set: - # FOR TESTING ONLY - # docs_urls = [ - # "https://docs.zenml.io/getting-started/system-architectures", - # "https://docs.zenml.io/getting-started/core-concepts", - # "https://docs.zenml.io/user-guide/llmops-guide/rag-with-zenml/rag-85-loc", - # "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", - # "https://docs.zenml.io/how-to/debug-and-solve-issues", - # "https://docs.zenml.io/stack-components/step-operators/azureml", - # "https://docs.zenml.io/how-to/interact-with-secrets", - # ] + docs_urls = [ + "https://docs.zenml.io/getting-started/system-architectures", + "https://docs.zenml.io/getting-started/core-concepts", + "https://docs.zenml.io/user-guide/llmops-guide/rag-with-zenml/rag-85-loc", + "https://docs.zenml.io/how-to/track-metrics-metadata/logging-metadata", + "https://docs.zenml.io/how-to/debug-and-solve-issues", + "https://docs.zenml.io/stack-components/step-operators/azureml", + "https://docs.zenml.io/how-to/interact-with-secrets", + ] + else: + docs_urls = get_all_pages(docs_url) # website_urls = get_all_pages(website_url) # all_urls = docs_urls + website_urls + examples_readme_urls From b36081fd5a89e49bbb0cadf86d3f9950284d62f2 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Sun, 10 Nov 2024 20:54:53 +0100 Subject: [PATCH 13/14] Update url_scraper.py with new step parameters --- llm-complete-guide/configs/production/rag.yaml | 1 + llm-complete-guide/configs/staging/rag.yaml | 2 ++ llm-complete-guide/steps/url_scraper.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llm-complete-guide/configs/production/rag.yaml b/llm-complete-guide/configs/production/rag.yaml index 7e11a615..59ad858f 100644 --- a/llm-complete-guide/configs/production/rag.yaml +++ b/llm-complete-guide/configs/production/rag.yaml @@ -37,6 +37,7 @@ steps: parameters: docs_url: https://docs.zenml.io use_dev_set: false + enable_cache: true # generate_embeddings: # step_operator: "sagemaker" # settings: diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml index e02e1e26..5107f01e 100644 --- a/llm-complete-guide/configs/staging/rag.yaml +++ b/llm-complete-guide/configs/staging/rag.yaml @@ -37,3 +37,5 @@ steps: parameters: docs_url: https://docs.zenml.io use_dev_set: false + enable_cache: true + step_operator: "gcp_a100" \ No newline at end of file diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py index ef7c1a9d..e2d85df5 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -21,7 +21,7 @@ from steps.url_scraping_utils import get_all_pages -@step(enable_cache=True, step_operator="gcp_a100") +@step def url_scraper( docs_url: str = "https://docs.zenml.io", repo_url: str = "https://github.com/zenml-io/zenml", From ea6228cc441f42e6d9d28ea904f74d76f95a3737 Mon Sep 17 00:00:00 2001 From: Hamza Tahir Date: Mon, 11 Nov 2024 10:15:11 +0100 Subject: [PATCH 14/14] Add latest version for dev_finetuned model --- llm-complete-guide/configs/dev/rag_eval.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm-complete-guide/configs/dev/rag_eval.yaml b/llm-complete-guide/configs/dev/rag_eval.yaml index 904c1a08..fd8df1e5 100644 --- a/llm-complete-guide/configs/dev/rag_eval.yaml +++ b/llm-complete-guide/configs/dev/rag_eval.yaml @@ -19,4 +19,5 @@ model: name: dev_finetuned-zenml-docs-embeddings license: Apache 2.0 description: Finetuned LLM on ZenML docs - tags: ["rag", "finetuned"] \ No newline at end of file + tags: ["rag", "finetuned"] + version: latest \ No newline at end of file