diff --git a/llm-complete-guide/pipelines/local_deployment.py b/llm-complete-guide/pipelines/local_deployment.py index 314d8a53..94a062f7 100644 --- a/llm-complete-guide/pipelines/local_deployment.py +++ b/llm-complete-guide/pipelines/local_deployment.py @@ -1,5 +1,6 @@ from steps.bento_builder import bento_builder from steps.bento_deployment import bento_deployment +from steps.visualize_chat import create_chat_interface from zenml import pipeline @@ -7,5 +8,6 @@ def local_deployment(): bento = bento_builder() bento_deployment(bento) + create_chat_interface() #vllm_model_deployer_step() diff --git a/llm-complete-guide/run.py b/llm-complete-guide/run.py index c32c2365..1a0e1f55 100644 --- a/llm-complete-guide/run.py +++ b/llm-complete-guide/run.py @@ -50,6 +50,7 @@ rag_deployment, llm_index_and_evaluate, local_deployment, + production_deployment, ) from structures import Document from zenml.materializers.materializer_registry import materializer_registry @@ -144,6 +145,12 @@ default=None, help="Path to config", ) +@click.option( + "--env", + "env", + default="local", + help="The environment to use for the completion.", +) def main( pipeline: str, query_text: Optional[str] = None, @@ -154,6 +161,7 @@ def main( use_argilla: bool = False, use_reranker: bool = False, config: Optional[str] = None, + env: str = "local", ): """Main entry point for the pipeline execution. @@ -167,6 +175,7 @@ def main( use_argilla (bool): If True, Argilla an notations will be used use_reranker (bool): If True, rerankers will be used config (Optional[str]): Path to config file + env (str): The environment to use for the deployment (local, huggingface space, k8s etc.) """ pipeline_args = {"enable_cache": not no_cache} embeddings_finetune_args = { @@ -259,9 +268,18 @@ def main( )() elif pipeline == "deploy": - #rag_deployment.with_options(model=zenml_model, **pipeline_args)() - local_deployment.with_options(model=zenml_model, **pipeline_args)() - + if env == "local": + local_deployment.with_options( + model=zenml_model, config_path=config_path, **pipeline_args + )() + elif env == "huggingface": + rag_deployment.with_options( + model=zenml_model, config_path=config_path, **pipeline_args + )() + elif env == "k8s": + production_deployment.with_options( + model=zenml_model, config_path=config_path, **pipeline_args + )() elif pipeline == "evaluation": pipeline_args["enable_cache"] = False llm_eval.with_options(model=zenml_model, config_path=config_path)() diff --git a/llm-complete-guide/service.py b/llm-complete-guide/service.py index 8ccbf7e6..adec9f44 100644 --- a/llm-complete-guide/service.py +++ b/llm-complete-guide/service.py @@ -26,6 +26,18 @@ "timeout": 300, "concurrency": 256, }, + http={ + "cors": { + "enabled": True, + "access_control_allow_origins": ["https://cloud.zenml.io"], # Add your allowed origins + "access_control_allow_methods": ["GET", "OPTIONS", "POST", "HEAD", "PUT"], + "access_control_allow_credentials": True, + "access_control_allow_headers": ["*"], + # "access_control_allow_origin_regex": "https://.*\.my_org\.com", # Optional regex + "access_control_max_age": 1200, + "access_control_expose_headers": ["Content-Length"], + } + } ) class RAGService: """RAG service for generating responses using LLM and RAG.""" diff --git a/llm-complete-guide/steps/bento_builder.py b/llm-complete-guide/steps/bento_builder.py index 98e5bd1f..b89d571b 100644 --- a/llm-complete-guide/steps/bento_builder.py +++ b/llm-complete-guide/steps/bento_builder.py @@ -31,6 +31,7 @@ ) from zenml.integrations.bentoml.steps import bento_builder_step from zenml.logger import get_logger +from zenml.orchestrators.utils import get_config_environment_vars from zenml.utils import source_utils logger = get_logger(__name__) @@ -64,6 +65,7 @@ def bento_builder() -> ( if Client().active_stack.orchestrator.flavor == "local": model = get_step_context().model version_to_deploy = Model(name=model.name, version="production") + logger.info(f"Building BentoML bundle for model: {version_to_deploy.name}") # Build the BentoML bundle bento = bentos.build( service="service.py:RAGService", diff --git a/llm-complete-guide/steps/bento_dockerizer.py b/llm-complete-guide/steps/bento_dockerizer.py index 813f58a8..4e52dcba 100644 --- a/llm-complete-guide/steps/bento_dockerizer.py +++ b/llm-complete-guide/steps/bento_dockerizer.py @@ -28,7 +28,7 @@ logger = get_logger(__name__) -@step +@step(enable_cache=False) def bento_dockerizer() -> ( Annotated[ str, @@ -40,12 +40,11 @@ def bento_dockerizer() -> ( This step is responsible for dockerizing the BentoML model. """ ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + zenml_client = Client() model = get_step_context().model - version_to_deploy = Model(name=model.name, version="production") - bentoml_deployment = version_to_deploy.get_model_artifact(name="bentoml_rag_deployment") + version_to_deploy = Model(name=model.name) + bentoml_deployment = zenml_client.get_artifact_version(name_id_or_prefix="bentoml_rag_deployment") bento_tag = f'{bentoml_deployment.run_metadata["bento_tag_name"]}:{bentoml_deployment.run_metadata["bento_info_version"]}' - - zenml_client = Client() container_registry = zenml_client.active_stack.container_registry assert container_registry, "Container registry is not configured." image_name = f"{container_registry.config.uri}/{bento_tag}" diff --git a/llm-complete-guide/steps/k8s_deployment.py b/llm-complete-guide/steps/k8s_deployment.py index 7ca1839c..9726dbd3 100644 --- a/llm-complete-guide/steps/k8s_deployment.py +++ b/llm-complete-guide/steps/k8s_deployment.py @@ -11,15 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -from pathlib import Path -from typing import Dict, Optional import re +from pathlib import Path +from typing import Dict, Optional, cast + import yaml from kubernetes import client, config from kubernetes.client.rest import ApiException from zenml import get_step_context, step from zenml.client import Client +from zenml.integrations.bentoml.services.bentoml_local_deployment import ( + BentoMLLocalDeploymentConfig, + BentoMLLocalDeploymentService, +) from zenml.logger import get_logger +from zenml.orchestrators.utils import get_config_environment_vars logger = get_logger(__name__) @@ -93,7 +99,7 @@ def apply_kubernetes_configuration(k8s_configs: list) -> None: logger.error(f"Error applying {kind} {name}: {e}") raise e -@step +@step(enable_cache=False) def k8s_deployment( docker_image_tag: str, namespace: str = "default" @@ -103,6 +109,17 @@ def k8s_deployment( # Sanitize the model name model_name = sanitize_name(raw_model_name) + # Get environment variables + environment_vars = get_config_environment_vars() + + # Get current deployment + zenml_client = Client() + model_deployer = zenml_client.active_stack.model_deployer + services = model_deployer.find_model_server( + model_name=model_name, + model_version="production", + ) + # Read the K8s template template_path = Path(__file__).parent / "k8s_template.yaml" with open(template_path, "r") as f: @@ -120,6 +137,23 @@ def k8s_deployment( if config["kind"] == "Service": # Update service selector config["spec"]["selector"]["app"] = model_name + + # Update metadata annotations with SSL certificate ARN + config["metadata"]["annotations"] = { + "service.beta.kubernetes.io/aws-load-balancer-ssl-cert": "arn:aws:acm:eu-central-1:339712793861:certificate/0426ace8-5fa3-40dd-bd81-b0fb1064bd85", + "service.beta.kubernetes.io/aws-load-balancer-backend-protocol": "http", + "service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443", + "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600" + } + + # Update ports + config["spec"]["ports"] = [ + { + "name": "https", + "port": 443, + "targetPort": 3000 + } + ] elif config["kind"] == "Deployment": # Update deployment selector and template @@ -131,6 +165,12 @@ def k8s_deployment( for container in containers: container["name"] = model_name container["image"] = docker_image_tag + + # Add environment variables to the container + env_vars = [] + for key, value in environment_vars.items(): + env_vars.append({"name": key, "value": value}) + container["env"] = env_vars # Apply the configurations try: @@ -149,9 +189,22 @@ def k8s_deployment( "namespace": namespace, "status": deployment_status, "service_port": 3000, - "configurations": k8s_configs + "configurations": k8s_configs, + "url": "chat-rag.staging.cloudinfra.zenml.io" } + if services: + bentoml_deployment= cast(BentoMLLocalDeploymentService, services[0]) + zenml_client.update_service( + id=bentoml_deployment.uuid, + prediction_url="https://chat-rag.staging.cloudinfra.zenml.io", + health_check_url="https://chat-rag.staging.cloudinfra.zenml.io/healthz", + labels={ + "docker_image": docker_image_tag, + "namespace": namespace, + } + ) + return deployment_info diff --git a/llm-complete-guide/k8s_template.yaml b/llm-complete-guide/steps/k8s_template.yaml similarity index 51% rename from llm-complete-guide/k8s_template.yaml rename to llm-complete-guide/steps/k8s_template.yaml index dd6b918f..2ad971b2 100644 --- a/llm-complete-guide/k8s_template.yaml +++ b/llm-complete-guide/steps/k8s_template.yaml @@ -1,17 +1,22 @@ apiVersion: v1 kind: Service metadata: + name: placeholder labels: app: placeholder - name: placeholder + annotations: + service.beta.kubernetes.io/aws-load-balancer-ssl-cert: arn:aws:acm:region:account-id:certificate/certificate-id + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443" + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" spec: - ports: - - name: http # Changed from 'predict' to 'http' for clarity - port: 80 # External port exposed by LoadBalancer - targetPort: 3000 # Internal container port selector: app: placeholder type: LoadBalancer + ports: + - name: https + port: 443 # External port exposed by LoadBalancer (HTTPS) + targetPort: 3000 # Internal container port --- apiVersion: apps/v1 kind: Deployment diff --git a/llm-complete-guide/steps/visualize_chat.py b/llm-complete-guide/steps/visualize_chat.py new file mode 100644 index 00000000..f726db53 --- /dev/null +++ b/llm-complete-guide/steps/visualize_chat.py @@ -0,0 +1,260 @@ +from typing import Optional +from zenml import pipeline, step +from zenml.types import HTMLString + +@step(enable_cache=False) +def create_chat_interface() -> HTMLString: + html = """ +
+ + +
+
+

ZenML Assistant

+
+ +
+
+
+ Hi! I'm your ZenML assistant. How can I help you today? +
+
+
+ Assistant is typing... +
+
+ +
+ + +
+
+ + + +
+ """ + return HTMLString(html) \ No newline at end of file diff --git a/llm-complete-guide/utils/openai_utils.py b/llm-complete-guide/utils/openai_utils.py index 15b84cc5..9f5e8ac8 100644 --- a/llm-complete-guide/utils/openai_utils.py +++ b/llm-complete-guide/utils/openai_utils.py @@ -5,4 +5,5 @@ def get_openai_api_key() -> str: api_key = Client().get_secret(SECRET_NAME).secret_values["openai_api_key"] + return api_key