From a4e09b94f52ba7bfb97dbe839ab32bfc293c2f5d Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 01:41:40 +0000
Subject: [PATCH 1/6] Add Llama2 example

---
 advanced/llama2/README.md | 107 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 advanced/llama2/README.md

diff --git a/advanced/llama2/README.md b/advanced/llama2/README.md
new file mode 100644
index 0000000..5a5fc42
--- /dev/null
+++ b/advanced/llama2/README.md
@@ -0,0 +1,107 @@
# Llama2

[Llama2](https://ai.meta.com/llama/) is the latest collection of pretrained and fine-tuned generative text models released by Meta, ranging in scale from 7 billion to 70 billion parameters. In this example we are going to use the Llama2-7B model to demonstrate how to get state-of-the-art LLMs running on Lepton in just seconds.

There are two ways to access Llama2 models on Lepton:

## Fully managed Llama2 inference API

Lepton provides the standard Llama2 models as fully managed API endpoints at https://llama2.lepton.run. This endpoint is fully compatible with OpenAI's ChatGPT API, so users can directly use OpenAI's SDK, or any tool built on the ChatGPT API, to seamlessly switch to the Llama2 model service. For example, if you are using OpenAI's Python SDK, you can simply switch to Lepton's Llama2 inference API with:

```python
import openai

openai.api_base = "https://llama2.lepton.run/api/v1"
openai.api_key = "sk-" + "a" * 48
```
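Because the endpoint is wire-compatible with OpenAI's API, you can also call it without any SDK at all. Below is a minimal `curl` sketch; it assumes the service exposes the same `/chat/completions` route that the OpenAI SDK targets under the hood:

```shell
# Minimal sketch: assumes the OpenAI-compatible /chat/completions route.
# The key is the same dummy "sk-" plus 48 "a" characters used above.
curl https://llama2.lepton.run/api/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" \
  -d '{
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "tell me a short story"}],
        "max_tokens": 64
      }'
```

If the compatibility is complete, the response should mirror OpenAI's chat completion JSON shape.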
After setting the `api_base` (and `api_key`) configuration, all existing code is compatible with the Llama2 inference API, e.g. the following typical Python code that uses OpenAI's ChatGPT API simply works without any modifications:

```python
sys_prompt = """
The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.
"""
# Create a completion
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": "tell me a short story"},
    ],
    stream=True,
    max_tokens=64,
)
for chunk in completion:
    content = chunk["choices"][0]["delta"].get("content")
    if content:
        print(content, end="")
print()
```

## Dedicated Llama2 inference service

If the fully managed API does not fit your use case, you can also easily launch a dedicated Llama2 model inference service on the Lepton platform.

Note:
Meta hosts the Llama2 model weights on Hugging Face. You should obtain access to these weights by going to the corresponding model page (e.g. [llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) and requesting access. Once you have access, go to Hugging Face's [token management page](https://huggingface.co/settings/tokens) to generate a token.

### Use Lepton's secret management

As you may use the token multiple times, we recommend storing it in Lepton's secret store. Simply run the following, remembering to replace the token with your own:
```shell
lep secret create -n HUGGING_FACE_HUB_TOKEN -v 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
```
(Don't worry, the above token is only an example and isn't active.)

You can verify the secret exists with `lep secret list`:
```shell
>> lep secret list
          Secrets
┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ ID                     ┃ Value    ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ HUGGING_FACE_HUB_TOKEN │ (hidden) │
└────────────────────────┴──────────┘
```

### Launch the Llama2 inference service locally

Ensure that you have installed the required dependencies; a minimal install sketch follows.
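As a sketch, assuming the `lep` CLI used throughout this example ships with the `leptonai` package on PyPI (an assumption worth verifying against the repository's own setup instructions), the install would be:

```shell
# Assumption: the lep CLI comes from the leptonai package on PyPI.
pip install -U leptonai
```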
Then, run:
```shell
lep photon run -n llama2 -m hf:meta-llama/Llama-2-7b-hf
```
Note that you will need a relatively large GPU (>20GB of memory).

### Launch the Llama2 inference service in the cloud

Similar to other examples, you can run Llama2 in the cloud with the following commands. Remember to pass in the Hugging Face access token secret, and use a reasonably sized GPU such as `gpu.a10` to ensure that things run.

```shell
lep photon create -n llama2 -m hf:meta-llama/Llama-2-7b-hf
lep photon push -n llama2
lep photon run \
  -n llama2 \
  --secret HUGGING_FACE_HUB_TOKEN \
  --resource-shape gpu.a10
```

And visit [dashboard.lepton.ai](https://dashboard.lepton.ai/) to try out the model.

Note: by default, the server is protected via a token, so you won't be able to access the Gradio UI. This is by design to provide adequate security. If you want to make the UI public, you can either add the `--public` argument to `lep photon run`, or update the deployment with:

```shell
lep deployment update -n llama2 --public
```

### Client

Once the inference service is up (either locally or in the cloude), you can use the client to access it in a programmatic way:

```python
>>> from leptonai.client import Client

>>> client = Client(...)

>>> client.run(inputs=["what is 2 + 3"])
'what is 2 + 3? 5 is the answer? Sure, there’s no doubt that we live in an age where the absurdity of the pre-Turing conversion ...'
```
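As a purely illustrative sketch of how the elided `Client(...)` arguments might be filled in, the snippet below assumes a local run serving on the default port 8080 and that `Client` accepts a plain base URL; the cloud variant's workspace id, deployment name, and token are hypothetical placeholders:

```python
# Illustrative sketch only: the URL, workspace id, and token are placeholders.
from leptonai.client import Client

# Local run: assumes `lep photon run` is serving on http://localhost:8080.
client = Client("http://localhost:8080")

# Cloud run (hypothetical workspace id, deployment name, and token):
# client = Client("my-workspace-id", "llama2", token="MY_LEPTON_TOKEN")

print(client.run(inputs=["what is 2 + 3"]))
```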
From d841847dab1eef17d1ea75350c169b42ea249251 Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 01:44:14 +0000
Subject: [PATCH 2/6] typo

---
 advanced/llama2/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/advanced/llama2/README.md b/advanced/llama2/README.md
index 5a5fc42..70033ec 100644
--- a/advanced/llama2/README.md
+++ b/advanced/llama2/README.md
@@ -15,7 +15,7 @@ openai.api_base = "https://llama2.lepton.run/api/v1"
 openai.api_key = "sk-" + "a" * 48
 ```
 
-After setting the `api_base` (and `api_key`) configuration, all existing code is compatible with the Llama2 inference API, e.g. the following typical Python code that uses OpenAI's ChatGPT API simply works without any modifications:
+After setting the `api_base` (and `api_key`) configuration, all existing code is compatible with Lepton's Llama2 inference API, e.g. the following typical Python code that uses OpenAI's ChatGPT API simply works without any modifications:
 
 ```python
 sys_prompt = """
@@ -43,7 +43,7 @@
 If the fully managed API does not fit your use case, you can also easily launch a dedicated Llama2 model inference service on the Lepton platform.
 
 Note:
-Meta hosts the Llama2 model weights on Hugging Face. You should obtain access to these weights by going to the corresponding model page (e.g. [llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) and requesting access. Once you have access, go to Hugging Face's [token management page](https://huggingface.co/settings/tokens) to generate a token.
+Meta hosts the Llama2 model weights on Hugging Face. You should obtain access to these weights by going to the corresponding model page (e.g. [llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)) and requesting access. Once you have access, go to Hugging Face's [token management page](https://huggingface.co/settings/tokens) to generate a token.
 
 ### Use Lepton's secret management

From 389561e788d2b4f9a704057f886e64f8ad851e2f Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 01:45:42 +0000
Subject: [PATCH 3/6] typo

---
 advanced/llama2/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/advanced/llama2/README.md b/advanced/llama2/README.md
index 70033ec..82cd5fc 100644
--- a/advanced/llama2/README.md
+++ b/advanced/llama2/README.md
@@ -95,7 +95,7 @@
 
 ### Client
 
-Once the inference service is up (either locally or in the cloude), you can use the client to access it in a programmatic way:
+Once the inference service is up (either locally or in the cloud), you can use the client to access it in a programmatic way:
 
 ```python
 >>> from leptonai.client import Client

From 751647bdc1a201f6d1d6c59aa378e93a0ab0d96f Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 04:39:53 +0000
Subject: [PATCH 4/6] add max_new_token

---
 advanced/llama2/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/advanced/llama2/README.md b/advanced/llama2/README.md
index 82cd5fc..e088842 100644
--- a/advanced/llama2/README.md
+++ b/advanced/llama2/README.md
@@ -102,6 +102,6 @@ Once the inference service is up (either locally or in the cloud), you can use t
 
 >>> client = Client(...)
 
->>> client.run(inputs=["what is 2 + 3"])
-'what is 2 + 3? 5 is the answer? Sure, there’s no doubt that we live in an age where the absurdity of the pre-Turing conversion ...'
+>>> client.run(inputs=["what is 2 + 3"], max_new_tokens=128)
+"what is 2 + 3.\nThis is quite common in mathematics: variable height means variable growth and variable foot (puz- ulating, pus, pulsating), variable width for a three dimensional thing. Variable has an incorrect connotation for us. It would be better to say that the statistic is unsatisfactory in all conditions.\nBut...since he _says_ he's a 90th percentile man, and since the classification is as it is, and since those who classify him for that percentile have based it on other empirical evidence, you still have either an error in the percentile, or"
 ```

From 21b69ddc31a05449558e2f5260f2b7e2e09ec2dd Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 04:43:55 +0000
Subject: [PATCH 5/6] add notebook

---
 llama2-api.ipynb | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 llama2-api.ipynb

diff --git a/llama2-api.ipynb b/llama2-api.ipynb
new file mode 100644
index 0000000..9b6d33f
--- /dev/null
+++ b/llama2-api.ipynb
@@ -0,0 +1,92 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "lIYdn1woOS1n"
      },
      "outputs": [],
      "source": [
        "!pip install -qqq openai"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import openai\n",
        "\n",
        "openai.api_base = \"https://llama2.lepton.run/api/v1\"\n",
        "openai.api_key = \"sk-\" + \"a\" * 48"
      ],
      "metadata": {
        "id": "UCOfN-VEsy5m"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sys_prompt = \"\"\"\n",
        "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.\n",
        "\"\"\"\n",
        "# Create a completion\n",
        "completion = openai.ChatCompletion.create(\n",
        "    model=\"gpt-3.5-turbo\",\n",
        "    messages=[\n",
        "        {\"role\": \"system\", \"content\": sys_prompt},\n",
        "        {\"role\": \"user\", \"content\": \"tell me a short story\"},\n",
        "    ],\n",
        "    stream=True,\n",
        "    max_tokens=64,\n",
        ")\n",
        "for chunk in completion:\n",
        "    content = chunk[\"choices\"][0][\"delta\"].get(\"content\")\n",
        "    if content:\n",
        "        print(content, end=\"\")\n",
        "print()"
      ],
      "metadata": {
        "id": "y7eV3R87sz6Y",
        "outputId": "89b8a5a4-a767-4355-b738-1358dd9e30a6",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Of course, I'd be happy to tell you a short story! Here it is:\n",
            "\n",
            "Once upon a time, in a far-off land, there was a magical forest filled with towering trees, sparkling streams, and a variety of creatures great and small. Among these creatures lived\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "IuKq8VV0s3C_"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "name": "scratchpad",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
\ No newline at end of file

From 78ef49c224194f742293ce6a3c4276cec99acd9c Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Wed, 23 Aug 2023 04:45:55 +0000
Subject: [PATCH 6/6] update notebook

---
 .../llama2/llama2-api.ipynb | 25 ++++++-------------
 1 file changed, 8 insertions(+), 17 deletions(-)
 rename llama2-api.ipynb => advanced/llama2/llama2-api.ipynb (81%)

diff --git a/llama2-api.ipynb b/advanced/llama2/llama2-api.ipynb
similarity index 81%
rename from llama2-api.ipynb
rename to advanced/llama2/llama2-api.ipynb
index 9b6d33f..e0422a7 100644
--- a/llama2-api.ipynb
+++ b/advanced/llama2/llama2-api.ipynb
@@ -2,7 +2,7 @@
   "cells": [
     {
       "cell_type": "code",
-      "execution_count": 3,
+      "execution_count": 1,
       "metadata": {
         "id": "lIYdn1woOS1n"
       },
@@ -22,7 +22,7 @@
       "metadata": {
         "id": "UCOfN-VEsy5m"
       },
-      "execution_count": 4,
+      "execution_count": 2,
       "outputs": []
     },
     {
@@ -48,33 +48,24 @@
         "print()"
       ],
       "metadata": {
-        "id": "y7eV3R87sz6Y",
-        "outputId": "89b8a5a4-a767-4355-b738-1358dd9e30a6",
         "colab": {
           "base_uri": "https://localhost:8080/"
-        }
+        },
+        "id": "y7eV3R87sz6Y",
+        "outputId": "75896f74-408c-4946-8bbd-d392b1a4178b"
       },
-      "execution_count": 5,
+      "execution_count": 3,
       "outputs": [
         {
           "output_type": "stream",
           "name": "stdout",
           "text": [
-            "Of course, I'd be happy to tell you a short story! Here it is:\n",
+            "Of course! I'd be happy to tell you a short story. Here is one I came up with on the spot:\n",
             "\n",
-            "Once upon a time, in a far-off land, there was a magical forest filled with towering trees, sparkling streams, and a variety of creatures great and small. Among these creatures lived\n"
+            "Once upon a time, in a far-off land, there was a magical forest filled with towering trees, sparkling streams, and a variety of enchanting cre\n"
          ]
        }
      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [],
-      "metadata": {
-        "id": "IuKq8VV0s3C_"
-      },
-      "execution_count": null,
-      "outputs": []
     }
   ],
   "metadata": {