From 366c94257807d229a6e29a4e020e145defd10d04 Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <43818888+Dominastorm@users.noreply.github.com> Date: Fri, 29 Sep 2023 20:43:22 +0530 Subject: [PATCH] Update docs and add examples (#380) * rename variables * re-organize examples * Update api reference * Rename * Add examples --- README.md | 20 +- docs/api-reference/README.md | 86 +++++--- .../language/GuidelineAdherenceScore.md | 1 + .../ResponseCompletenessWrtContext.md | 1 + docs/getting-started/quickstart.mdx | 15 +- ...i-client.mdx => open-source-evaluator.mdx} | 22 +- .../refactor/qna-logs-eval.ipynb | 0 .../refactor/quality-monitoring.ipynb | 0 examples/{ => archive}/refactor/sql_test.py | 0 examples/{ => archive}/refactor/test_docs.py | 0 .../refactor/test_model_grading.py | 0 .../{ => archive}/refactor/test_w_client.py | 0 examples/{ => archive}/v0/README.md | 0 .../v0/automated_finetuning/README.md | 0 .../bert_finetuning_ww.ipynb | 0 .../v0/automated_finetuning/helper_funcs.py | 0 .../v0/conversation_summarization/README.md | 0 .../generate_output_and_embeddings.py | 0 .../grammar_check.ipynb | 0 .../v0/conversation_summarization/run.ipynb | 0 .../v0/fraud_detection/README.md | 0 .../v0/fraud_detection/helper_funcs.py | 0 .../v0/fraud_detection/run.ipynb | 0 .../README.md | 0 .../deepdive_examples/README.md | 0 .../helper_files/__init__.py | 0 .../helper_files/dataset.py | 0 .../helper_files/model_logistic_regression.py | 0 .../helper_files/model_tensorflow.py | 0 .../helper_files/model_torch.py | 0 .../helper_files/pushup_signal.py | 0 .../run.ipynb | 0 .../run_background.py | 0 .../v0/integrations/test_bq_integration.py | 0 .../integrations/test_postgres_integration.py | 0 .../v0/masked_language_modeling/.gitignore | 0 .../v0/masked_language_modeling/README.md | 0 .../masked_language_modeling/helper_funcs.py | 0 .../model_constants.py | 0 .../masked_language_modeling/model_train.py | 0 .../v0/masked_language_modeling/run.ipynb | 0 
.../v0/ride_time_estimation/README.md | 0 .../v0/ride_time_estimation/helper_funcs.py | 0 .../v0/ride_time_estimation/run.ipynb | 0 .../v0/shopping_cart_recommendation/README.md | 0 .../helper_funcs.py | 0 .../v0/shopping_cart_recommendation/run.ipynb | 0 .../{ => archive}/v0/speech_to_text/run.ipynb | 0 .../v0/text_summarization/README.md | 0 .../v0/text_summarization/helper_funcs.py | 0 .../v0/text_summarization/run.ipynb | 0 .../advanced}/openai_evals_tutorial.ipynb | 2 +- .../prompt_experiments_tutorial.ipynb | 2 +- .../advanced}/validation_tutorial.ipynb | 2 +- .../experiments_evaluation_tutorial.ipynb | 155 ++++++++++++++ .../managed_service_client_tutorial.ipynb | 176 ++++++++++++++++ .../open_source_evaluator_tutorial.ipynb | 193 ++++++++++++++++++ get_started.py | 4 +- pyproject.toml | 2 +- 59 files changed, 623 insertions(+), 58 deletions(-) create mode 100644 docs/api-reference/operators/language/GuidelineAdherenceScore.md create mode 100644 docs/api-reference/operators/language/ResponseCompletenessWrtContext.md rename docs/tutorials/{open-source-api-client.mdx => open-source-evaluator.mdx} (80%) rename examples/{ => archive}/refactor/qna-logs-eval.ipynb (100%) rename examples/{ => archive}/refactor/quality-monitoring.ipynb (100%) rename examples/{ => archive}/refactor/sql_test.py (100%) rename examples/{ => archive}/refactor/test_docs.py (100%) rename examples/{ => archive}/refactor/test_model_grading.py (100%) rename examples/{ => archive}/refactor/test_w_client.py (100%) rename examples/{ => archive}/v0/README.md (100%) rename examples/{ => archive}/v0/automated_finetuning/README.md (100%) rename examples/{ => archive}/v0/automated_finetuning/bert_finetuning_ww.ipynb (100%) rename examples/{ => archive}/v0/automated_finetuning/helper_funcs.py (100%) rename examples/{ => archive}/v0/conversation_summarization/README.md (100%) rename examples/{ => archive}/v0/conversation_summarization/generate_output_and_embeddings.py (100%) rename examples/{ => 
archive}/v0/conversation_summarization/grammar_check.ipynb (100%) rename examples/{ => archive}/v0/conversation_summarization/run.ipynb (100%) rename examples/{ => archive}/v0/fraud_detection/README.md (100%) rename examples/{ => archive}/v0/fraud_detection/helper_funcs.py (100%) rename examples/{ => archive}/v0/fraud_detection/run.ipynb (100%) rename examples/{ => archive}/v0/human_orientation_classification/README.md (100%) rename examples/{ => archive}/v0/human_orientation_classification/deepdive_examples/README.md (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/__init__.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/dataset.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/model_logistic_regression.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/model_tensorflow.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/model_torch.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/helper_files/pushup_signal.py (100%) rename examples/{ => archive}/v0/human_orientation_classification/run.ipynb (100%) rename examples/{ => archive}/v0/human_orientation_classification/run_background.py (100%) rename examples/{ => archive}/v0/integrations/test_bq_integration.py (100%) rename examples/{ => archive}/v0/integrations/test_postgres_integration.py (100%) rename examples/{ => archive}/v0/masked_language_modeling/.gitignore (100%) rename examples/{ => archive}/v0/masked_language_modeling/README.md (100%) rename examples/{ => archive}/v0/masked_language_modeling/helper_funcs.py (100%) rename examples/{ => archive}/v0/masked_language_modeling/model_constants.py (100%) rename examples/{ => archive}/v0/masked_language_modeling/model_train.py (100%) rename examples/{ => archive}/v0/masked_language_modeling/run.ipynb (100%) rename examples/{ => 
archive}/v0/ride_time_estimation/README.md (100%) rename examples/{ => archive}/v0/ride_time_estimation/helper_funcs.py (100%) rename examples/{ => archive}/v0/ride_time_estimation/run.ipynb (100%) rename examples/{ => archive}/v0/shopping_cart_recommendation/README.md (100%) rename examples/{ => archive}/v0/shopping_cart_recommendation/helper_funcs.py (100%) rename examples/{ => archive}/v0/shopping_cart_recommendation/run.ipynb (100%) rename examples/{ => archive}/v0/speech_to_text/run.ipynb (100%) rename examples/{ => archive}/v0/text_summarization/README.md (100%) rename examples/{ => archive}/v0/text_summarization/helper_funcs.py (100%) rename examples/{ => archive}/v0/text_summarization/run.ipynb (100%) rename examples/{ => python/advanced}/openai_evals_tutorial.ipynb (99%) rename examples/{ => python/advanced}/prompt_experiments_tutorial.ipynb (99%) rename examples/{ => python/advanced}/validation_tutorial.ipynb (99%) create mode 100644 examples/python/basic/experiments_evaluation_tutorial.ipynb create mode 100644 examples/python/basic/managed_service_client_tutorial.ipynb create mode 100644 examples/python/basic/open_source_evaluator_tutorial.ipynb diff --git a/README.md b/README.md index 41b54955..b53720e7 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,6 @@ Github banner 006 (1) -

Try out Evaluations @@ -47,9 +40,9 @@ | Evaluation | Description | | ------------- | ------------- | | [Factual Accuracy](https://uptrain-ai.github.io/uptrain/operators/language/ResponseFactualScore/) | Checks if the response is grounded by the context provided | -| [Guideline Adherence](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Checks if the response or the LLM adhers to the given guideline or not | +| [Guideline Adherence](https://uptrain-ai.github.io/uptrain/operators/language/GuidelineAdherenceScore/) | Checks if the response or the LLM adheres to the given guideline or not | | [Response Completeness](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Grades how if the response completes the given question | -| [Response Completeness wrt Context](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Grades how complete the response was for the question specified with respect to the information present in the context | +| [Response Completeness wrt Context](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompletenessWrtContext/) | Grades how complete the response was for the question specified with respect to the information present in the context | | [Context Relevance](https://uptrain-ai.github.io/uptrain/operators/language/ContextRelevance/) | Evaluates if the context has all the information to answer the given question | | [Response Relevance](https://uptrain-ai.github.io/uptrain/operators/language/ResponseRelevance/) | Grades how relevant the generated response is or if it has any additional irrelevant information for the question asked. | | [Tone Critique](https://uptrain-ai.github.io/uptrain/operators/language/ToneCritique/) | Assesses if the tone of machine-generated responses matches with the desired persona. | @@ -75,6 +68,7 @@ Follow the code snippet below to get started with UpTrain. 
```python from uptrain.framework import EvalLLM, Evals, CritiqueTone +import json OPENAI_API_KEY = "sk-***************" @@ -91,7 +85,7 @@ results = eval_llm.evaluate( checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) -print(results) +print(json.dumps(results, indent=3)) ``` If you have any questions, please join our [Slack community](https://join.slack.com/t/uptraincommunity/shared_invite/zt-1yih3aojn-CEoR_gAh6PDSknhFmuaJeg) @@ -103,6 +97,7 @@ If you have any questions, please join our [Slack community](https://join.slack. 2. Follow the code snippets below to get started with UpTrain. ```python from uptrain.framework import APIClient, Evals, CritiqueTone +import json UPTRAIN_API_KEY = "up-***************" @@ -120,7 +115,7 @@ results = client.log_and_evaluate( checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) -print(results) +print(json.dumps(results, indent=3)) ``` To have a customized onboarding, please book a [demo call here](https://calendly.com/uptrain-sourabh/uptrain-demo). @@ -131,6 +126,7 @@ Experiments help you perform A/B testing with prompts, so you can compare and ch ```python from uptrain.framework import APIClient, Evals, CritiqueTone +import json UPTRAIN_API_KEY = "up-***************" @@ -156,7 +152,7 @@ results = client.evaluate_experiments( exp_columns=['prompt_variation'] ) -print(results) +print(json.dumps(results, indent=3)) ```

diff --git a/docs/api-reference/README.md b/docs/api-reference/README.md index c6028e94..85a907c5 100644 --- a/docs/api-reference/README.md +++ b/docs/api-reference/README.md @@ -1,16 +1,15 @@

- uptrain + Github banner 006 (1)

-

-

An open-source framework to evaluate applications

-

Try out Evaluations - +Read Docs +- Slack Community - Feature Request @@ -23,8 +22,8 @@ - - Community + + Quickstart Website @@ -32,21 +31,22 @@

- Demo of UpTrain's LLM evaluations with scores for hallucinations, retrieved-context quality, response tonality for a customer support chatbot + Demo of UpTrain's LLM evaluations with scores for hallucinations, retrieved-context quality, response tonality for a customer support chatbot

-**[UpTrain](https://uptrain.ai)** is a Python framework that ensures your LLM applications are performing reliably by allowing users to check aspects such as correctness, structural integrity, bias, hallucination, etc. UpTrain can be used to: +**[UpTrain](https://uptrain.ai)** is an open-source tool to evaluate LLM applications. UpTrain provides pre-built metrics to check LLM responses on aspects such as correctness, hallucination, toxicity, etc. as well as provides an easy-to-use framework to configure custom checks. -# Evalulations We Offer 📝 +# Pre-built Evaluations We Offer 📝 | Evaluation | Description | | ------------- | ------------- | -| [Factual Accuracy](https://uptrain-ai.github.io/uptrain/operators/language/ResponseFactualScore/) | Checks if the response is factually accurate | -| [Response Completeness](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Grades how complete the response was for the question specified | -| [Response Completeness wrt Context](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Grades how complete the response was for the question specified with respect to the context | -| [Context Relevance](https://uptrain-ai.github.io/uptrain/operators/language/ContextRelevance/) | Evaluates how relevant the context is to the question specified | +| [Factual Accuracy](https://uptrain-ai.github.io/uptrain/operators/language/ResponseFactualScore/) | Checks if the response is grounded by the context provided | +| [Guideline Adherence](https://uptrain-ai.github.io/uptrain/operators/language/GuidelineAdherenceScore/) | Checks if the response or the LLM adheres to the given guideline or not | +| [Response Completeness](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompleteness/) | Grades whether the response completely answers the given question | +| [Response Completeness wrt Context](https://uptrain-ai.github.io/uptrain/operators/language/ResponseCompletenessWrtContext/) | Grades how 
complete the response was for the question specified with respect to the information present in the context | +| [Context Relevance](https://uptrain-ai.github.io/uptrain/operators/language/ContextRelevance/) | Evaluates if the context has all the information to answer the given question | | [Response Relevance](https://uptrain-ai.github.io/uptrain/operators/language/ResponseRelevance/) | Grades how relevant the generated response is or if it has any additional irrelevant information for the question asked. | -| [Tone Critique](https://uptrain-ai.github.io/uptrain/operators/language/ToneCritique/) | Assesses the tone of machine generated responses. | +| [Tone Critique](https://uptrain-ai.github.io/uptrain/operators/language/ToneCritique/) | Assesses if the tone of machine-generated responses matches with the desired persona. | | [Language Critique](https://uptrain-ai.github.io/uptrain/operators/language/LanguageCritique/) | Scores machine generated responses in a conversation. The response is evaluated on multiple aspects - fluence, politeness, grammar, and coherence. | # Get started 🙌 @@ -58,7 +58,42 @@ pip install uptrain ### How to use UpTrain: -1. Get your free UpTrain API Key [here](https://uptrain.ai/dashboard). +There are two ways to use UpTrain: +1. **Open-source framework:** You can evaluate your responses via the open-source version by providing your OpenAI API key to run evaluations. UpTrain leverages a pipeline comprising GPT-3.5 calls for the same. Note that the evaluation pipeline runs on UpTrain's server but none of the data is logged. + +2. **UpTrain API:** You can use UpTrain's managed service to log and evaluate your LLM responses. Just provide your UpTrain API key (no need for OpenAI keys) and UpTrain manages running evaluations for you with real-time dashboards and deep insights. + +#### Open-source framework: + +Follow the code snippet below to get started with UpTrain. 
+ +```python +from uptrain.framework import EvalLLM, Evals, CritiqueTone +import json + +OPENAI_API_KEY = "sk-***************" + +data = [{ + 'question': 'Which is the most popular global sport?', + 'context': "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.", + 'response': 'Football is the most popular sport with around 4 billion followers worldwide' +}] + +eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY) + +results = eval_llm.evaluate( + data=data, + checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] +) + +print(json.dumps(results, indent=3)) +``` +If you have any questions, please join our [Slack community](https://join.slack.com/t/uptraincommunity/shared_invite/zt-1yih3aojn-CEoR_gAh6PDSknhFmuaJeg) + + +#### UpTrain API: + +1. Get your free UpTrain API Key [here](https://uptrain.ai/). 2. Follow the code snippets below to get started with UpTrain. 
```python @@ -78,14 +113,12 @@ client = APIClient(uptrain_api_key=UPTRAIN_API_KEY) results = client.log_and_evaluate( project_name="Sample-Project", data=data, - evals=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] + checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) print(json.dumps(results, indent=3)) ``` -If you have any questions, please join our [Slack community](https://join.slack.com/t/uptraincommunity/shared_invite/zt-1yih3aojn-CEoR_gAh6PDSknhFmuaJeg) - To have a customized onboarding, please book a [demo call here](https://calendly.com/uptrain-sourabh/uptrain-demo). ### Performing experiments with UpTrain: @@ -93,7 +126,12 @@ To have a customized onboarding, please book a [demo call here](https://calendly Experiments help you perform A/B testing with prompts, so you can compare and choose the options most suitable for you. ```python -data = pd.DataFrame([{ +from uptrain.framework import APIClient, Evals, CritiqueTone +import json + +UPTRAIN_API_KEY = "up-***************" + +data = [{ 'question': 'Which is the most popular global sport?', 'context': "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. 
Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.", 'response': "1. The most popular global sport is determined by factors such as TV viewership, social media presence, number of participants, and economic impact. 2. Football is considered the most popular sport in the world, with events like the FIFA World Cup and star players like Ronaldo and Messi attracting over 4 billion followers. 3. Cricket is particularly popular in countries like India, Pakistan, Australia, and England, with events like the ICC Cricket World Cup and the Indian Premier League (IPL) having substantial viewership. 4. Basketball has gained global popularity, especially in the USA, Canada, China, and the Philippines, largely thanks to the NBA. 5. Tennis also has a significant global audience, with major tournaments like Wimbledon, the US Open, French Open, and Australian Open, and notable players like Roger Federer, Serena Williams, and Rafael Nadal. 6. Field Hockey enjoys popularity in countries like India, Netherlands, and Australia and has followers in many other parts of the world. So, the most popular global sport is football, followed by cricket, basketball, tennis, and field hockey, depending on the region and various factors.", @@ -104,12 +142,14 @@ data = pd.DataFrame([{ 'context': "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. 
The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.", 'response': "- Which is the most popular global sport? - Popularity of sports can be measured in different ways: - TV viewership - Social media presence - Number of participants - Economic impact - Football: - FIFA World Cup - Ronaldo and Messi draw over 4 billion followers - Cricket: - Popular in India, Pakistan, Australia, and England - ICC Cricket World Cup - Indian Premier League (IPL) - Basketball: - NBA - Popularity in the USA, Canada, China, Philippines - Tennis: - Major tournaments: Wimbledon, US Open, French Open, Australian Open - Players: Roger Federer, Serena Williams, Rafael Nadal - Field Hockey: - Popular in India, Netherlands, Australia - Followers in many parts of the world In summary, football is the most popular global sport, followed by cricket, basketball, tennis, and field hockey, with variations in popularity depending on region and measurement criteria.", 'prompt_variation': 'tree-of-thought' -}]) +}] + +client = APIClient(uptrain_api_key=UPTRAIN_API_KEY) results = client.evaluate_experiments( project_name="Sample-Experiment", data=data, - evals=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone()], + checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone()], exp_columns=['prompt_variation'] ) @@ -121,8 +161,8 @@ print(json.dumps(results, indent=3)) # Key Features 💡 -- **[Custom Grading 
Checks](https://uptrain-ai.github.io/uptrain/operators/language/ModelGradeScore/)** - Write your custom grading prompts. -- **[Embeddings Similarity Check](https://uptrain-ai.github.io/uptrain/operators/CosineSimilarity/)** - Compute cosine similarity between prompt and response embeddings +- **[Custom Grading Checks](https://uptrain-ai.github.io/uptrain/operators/language/ModelGradeScore/)** - Write your custom grading prompts to use LLM as an evaluator. +- **[Embeddings Similarity Check](https://uptrain-ai.github.io/uptrain/operators/CosineSimilarity/)** - Compute cosine similarity between prompt-response embeddings - **[UMAP Visualization and Clustering](https://uptrain-ai.github.io/uptrain/operators/UMAP/)** - Visualize your embedding space using tools like UMAP and t-SNE. - **[Feature Slicing]()** - Built-in pivoting functionalities for data dice and slice to pinpoint low-performing cohorts. - **[Realtime Dashboards]()** - Monitor your model's performance in realtime. diff --git a/docs/api-reference/operators/language/GuidelineAdherenceScore.md b/docs/api-reference/operators/language/GuidelineAdherenceScore.md new file mode 100644 index 00000000..345d3ca7 --- /dev/null +++ b/docs/api-reference/operators/language/GuidelineAdherenceScore.md @@ -0,0 +1 @@ +:::uptrain.operators.GuidelineAdherenceScore diff --git a/docs/api-reference/operators/language/ResponseCompletenessWrtContext.md b/docs/api-reference/operators/language/ResponseCompletenessWrtContext.md new file mode 100644 index 00000000..b5ef23f2 --- /dev/null +++ b/docs/api-reference/operators/language/ResponseCompletenessWrtContext.md @@ -0,0 +1 @@ +:::uptrain.operators.ResponseCompletenessWrtContext diff --git a/docs/getting-started/quickstart.mdx b/docs/getting-started/quickstart.mdx index 5a10f0f2..e0dbdfdc 100644 --- a/docs/getting-started/quickstart.mdx +++ b/docs/getting-started/quickstart.mdx @@ -17,6 +17,12 @@ Run the following commands in your terminal to install UpTrain: pip install uptrain ``` 
+### Import required libraries + +```python +from uptrain.framework import APIClient, Evals, CritiqueTone +import json +``` ### Create an UpTrain API Client @@ -29,7 +35,7 @@ client = APIClient(uptrain_api_key=UPTRAIN_API_KEY) ### Create your data -You can define your data as simple dictionary with the following keys: +You can define your data as a simple dictionary with the following keys: - `question`: The question you want to ask - `context`: The context relevant to the question @@ -64,13 +70,10 @@ Now that we have our data, we can log it and evaluate it using UpTrain. We use t You can find the list of all available evaluations [here](/key-components/evals). ```python -from uptrain.framework import APIClient, Evals, CritiqueTone -import json - results = client.log_and_evaluate( project_name="Sample-Project", data=data, - evals=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] + checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) ``` @@ -109,7 +112,7 @@ data = pd.DataFrame([{ results = client.evaluate_experiments( project_name="Sample-Experiment", data=data, - evals=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone()], + checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone()], exp_columns=['prompt_variation'] ) diff --git a/docs/tutorials/open-source-api-client.mdx b/docs/tutorials/open-source-evaluator.mdx similarity index 80% rename from docs/tutorials/open-source-api-client.mdx rename to docs/tutorials/open-source-evaluator.mdx index 21f96e61..ec035349 100644 --- a/docs/tutorials/open-source-api-client.mdx +++ b/docs/tutorials/open-source-evaluator.mdx @@ -1,9 +1,9 @@ --- -title: "Open Source API Client" +title: "Open Source Evaluator" description: "Get started with UpTrain in a few simple steps" --- -This tutorial will walk you through the process of 
getting started with UpTrain using our open source API client. You can use this API client to log and evaluate your data, and get results in a few simple steps. +This tutorial will walk you through the process of getting started with UpTrain using our open source Evaluator. You can use this Evaluator to evaluate your data and get results in a few simple steps. ### Get your OpenAI API key @@ -17,18 +17,21 @@ Run the following commands in your terminal to install UpTrain: pip install uptrain ``` -### Create an EvalLLM Client +### Create an EvalLLM Evaluator -Before we can start using UpTrain, we need to create an API client. You can do this by passing your API key to the `EvalLLM` constructor. +Before we can start using UpTrain, we need to create an EvalLLM Evaluator. You can do this by passing your OpenAI API key to the `EvalLLM` constructor. ```python -client = EvalLLM(openai_api_key=OPENAI_API_KEY) +from uptrain.framework import EvalLLM, Evals, CritiqueTone +import json + +eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY) ``` ### Create your data -You can define your data as simple dictionary with the following keys: +You can define your data as a simple dictionary with the following keys: - `question`: The question you want to ask - `context`: The context relevant to the question @@ -52,7 +55,7 @@ data = [{ ``` -### Log and evaluate your data +### Evaluate your data Now that we have our data, we can log it and evaluate it using UpTrain. We use the `evaluate` method to do this. This method takes the following arguments: @@ -62,10 +65,7 @@ Now that we have our data, we can log it and evaluate it using UpTrain. We use t You can find the list of all available evaluations [here](/key-components/evals). 
```python -from uptrain.framework import EvalLLM, Evals, CritiqueTone -import json - -results = client.evaluate( +results = eval_llm.evaluate( data=data, checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) diff --git a/examples/refactor/qna-logs-eval.ipynb b/examples/archive/refactor/qna-logs-eval.ipynb similarity index 100% rename from examples/refactor/qna-logs-eval.ipynb rename to examples/archive/refactor/qna-logs-eval.ipynb diff --git a/examples/refactor/quality-monitoring.ipynb b/examples/archive/refactor/quality-monitoring.ipynb similarity index 100% rename from examples/refactor/quality-monitoring.ipynb rename to examples/archive/refactor/quality-monitoring.ipynb diff --git a/examples/refactor/sql_test.py b/examples/archive/refactor/sql_test.py similarity index 100% rename from examples/refactor/sql_test.py rename to examples/archive/refactor/sql_test.py diff --git a/examples/refactor/test_docs.py b/examples/archive/refactor/test_docs.py similarity index 100% rename from examples/refactor/test_docs.py rename to examples/archive/refactor/test_docs.py diff --git a/examples/refactor/test_model_grading.py b/examples/archive/refactor/test_model_grading.py similarity index 100% rename from examples/refactor/test_model_grading.py rename to examples/archive/refactor/test_model_grading.py diff --git a/examples/refactor/test_w_client.py b/examples/archive/refactor/test_w_client.py similarity index 100% rename from examples/refactor/test_w_client.py rename to examples/archive/refactor/test_w_client.py diff --git a/examples/v0/README.md b/examples/archive/v0/README.md similarity index 100% rename from examples/v0/README.md rename to examples/archive/v0/README.md diff --git a/examples/v0/automated_finetuning/README.md b/examples/archive/v0/automated_finetuning/README.md similarity index 100% rename from examples/v0/automated_finetuning/README.md rename to examples/archive/v0/automated_finetuning/README.md 
diff --git a/examples/v0/automated_finetuning/bert_finetuning_ww.ipynb b/examples/archive/v0/automated_finetuning/bert_finetuning_ww.ipynb similarity index 100% rename from examples/v0/automated_finetuning/bert_finetuning_ww.ipynb rename to examples/archive/v0/automated_finetuning/bert_finetuning_ww.ipynb diff --git a/examples/v0/automated_finetuning/helper_funcs.py b/examples/archive/v0/automated_finetuning/helper_funcs.py similarity index 100% rename from examples/v0/automated_finetuning/helper_funcs.py rename to examples/archive/v0/automated_finetuning/helper_funcs.py diff --git a/examples/v0/conversation_summarization/README.md b/examples/archive/v0/conversation_summarization/README.md similarity index 100% rename from examples/v0/conversation_summarization/README.md rename to examples/archive/v0/conversation_summarization/README.md diff --git a/examples/v0/conversation_summarization/generate_output_and_embeddings.py b/examples/archive/v0/conversation_summarization/generate_output_and_embeddings.py similarity index 100% rename from examples/v0/conversation_summarization/generate_output_and_embeddings.py rename to examples/archive/v0/conversation_summarization/generate_output_and_embeddings.py diff --git a/examples/v0/conversation_summarization/grammar_check.ipynb b/examples/archive/v0/conversation_summarization/grammar_check.ipynb similarity index 100% rename from examples/v0/conversation_summarization/grammar_check.ipynb rename to examples/archive/v0/conversation_summarization/grammar_check.ipynb diff --git a/examples/v0/conversation_summarization/run.ipynb b/examples/archive/v0/conversation_summarization/run.ipynb similarity index 100% rename from examples/v0/conversation_summarization/run.ipynb rename to examples/archive/v0/conversation_summarization/run.ipynb diff --git a/examples/v0/fraud_detection/README.md b/examples/archive/v0/fraud_detection/README.md similarity index 100% rename from examples/v0/fraud_detection/README.md rename to 
examples/archive/v0/fraud_detection/README.md diff --git a/examples/v0/fraud_detection/helper_funcs.py b/examples/archive/v0/fraud_detection/helper_funcs.py similarity index 100% rename from examples/v0/fraud_detection/helper_funcs.py rename to examples/archive/v0/fraud_detection/helper_funcs.py diff --git a/examples/v0/fraud_detection/run.ipynb b/examples/archive/v0/fraud_detection/run.ipynb similarity index 100% rename from examples/v0/fraud_detection/run.ipynb rename to examples/archive/v0/fraud_detection/run.ipynb diff --git a/examples/v0/human_orientation_classification/README.md b/examples/archive/v0/human_orientation_classification/README.md similarity index 100% rename from examples/v0/human_orientation_classification/README.md rename to examples/archive/v0/human_orientation_classification/README.md diff --git a/examples/v0/human_orientation_classification/deepdive_examples/README.md b/examples/archive/v0/human_orientation_classification/deepdive_examples/README.md similarity index 100% rename from examples/v0/human_orientation_classification/deepdive_examples/README.md rename to examples/archive/v0/human_orientation_classification/deepdive_examples/README.md diff --git a/examples/v0/human_orientation_classification/helper_files/__init__.py b/examples/archive/v0/human_orientation_classification/helper_files/__init__.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/__init__.py rename to examples/archive/v0/human_orientation_classification/helper_files/__init__.py diff --git a/examples/v0/human_orientation_classification/helper_files/dataset.py b/examples/archive/v0/human_orientation_classification/helper_files/dataset.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/dataset.py rename to examples/archive/v0/human_orientation_classification/helper_files/dataset.py diff --git a/examples/v0/human_orientation_classification/helper_files/model_logistic_regression.py 
b/examples/archive/v0/human_orientation_classification/helper_files/model_logistic_regression.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/model_logistic_regression.py rename to examples/archive/v0/human_orientation_classification/helper_files/model_logistic_regression.py diff --git a/examples/v0/human_orientation_classification/helper_files/model_tensorflow.py b/examples/archive/v0/human_orientation_classification/helper_files/model_tensorflow.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/model_tensorflow.py rename to examples/archive/v0/human_orientation_classification/helper_files/model_tensorflow.py diff --git a/examples/v0/human_orientation_classification/helper_files/model_torch.py b/examples/archive/v0/human_orientation_classification/helper_files/model_torch.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/model_torch.py rename to examples/archive/v0/human_orientation_classification/helper_files/model_torch.py diff --git a/examples/v0/human_orientation_classification/helper_files/pushup_signal.py b/examples/archive/v0/human_orientation_classification/helper_files/pushup_signal.py similarity index 100% rename from examples/v0/human_orientation_classification/helper_files/pushup_signal.py rename to examples/archive/v0/human_orientation_classification/helper_files/pushup_signal.py diff --git a/examples/v0/human_orientation_classification/run.ipynb b/examples/archive/v0/human_orientation_classification/run.ipynb similarity index 100% rename from examples/v0/human_orientation_classification/run.ipynb rename to examples/archive/v0/human_orientation_classification/run.ipynb diff --git a/examples/v0/human_orientation_classification/run_background.py b/examples/archive/v0/human_orientation_classification/run_background.py similarity index 100% rename from examples/v0/human_orientation_classification/run_background.py rename 
to examples/archive/v0/human_orientation_classification/run_background.py diff --git a/examples/v0/integrations/test_bq_integration.py b/examples/archive/v0/integrations/test_bq_integration.py similarity index 100% rename from examples/v0/integrations/test_bq_integration.py rename to examples/archive/v0/integrations/test_bq_integration.py diff --git a/examples/v0/integrations/test_postgres_integration.py b/examples/archive/v0/integrations/test_postgres_integration.py similarity index 100% rename from examples/v0/integrations/test_postgres_integration.py rename to examples/archive/v0/integrations/test_postgres_integration.py diff --git a/examples/v0/masked_language_modeling/.gitignore b/examples/archive/v0/masked_language_modeling/.gitignore similarity index 100% rename from examples/v0/masked_language_modeling/.gitignore rename to examples/archive/v0/masked_language_modeling/.gitignore diff --git a/examples/v0/masked_language_modeling/README.md b/examples/archive/v0/masked_language_modeling/README.md similarity index 100% rename from examples/v0/masked_language_modeling/README.md rename to examples/archive/v0/masked_language_modeling/README.md diff --git a/examples/v0/masked_language_modeling/helper_funcs.py b/examples/archive/v0/masked_language_modeling/helper_funcs.py similarity index 100% rename from examples/v0/masked_language_modeling/helper_funcs.py rename to examples/archive/v0/masked_language_modeling/helper_funcs.py diff --git a/examples/v0/masked_language_modeling/model_constants.py b/examples/archive/v0/masked_language_modeling/model_constants.py similarity index 100% rename from examples/v0/masked_language_modeling/model_constants.py rename to examples/archive/v0/masked_language_modeling/model_constants.py diff --git a/examples/v0/masked_language_modeling/model_train.py b/examples/archive/v0/masked_language_modeling/model_train.py similarity index 100% rename from examples/v0/masked_language_modeling/model_train.py rename to 
examples/archive/v0/masked_language_modeling/model_train.py diff --git a/examples/v0/masked_language_modeling/run.ipynb b/examples/archive/v0/masked_language_modeling/run.ipynb similarity index 100% rename from examples/v0/masked_language_modeling/run.ipynb rename to examples/archive/v0/masked_language_modeling/run.ipynb diff --git a/examples/v0/ride_time_estimation/README.md b/examples/archive/v0/ride_time_estimation/README.md similarity index 100% rename from examples/v0/ride_time_estimation/README.md rename to examples/archive/v0/ride_time_estimation/README.md diff --git a/examples/v0/ride_time_estimation/helper_funcs.py b/examples/archive/v0/ride_time_estimation/helper_funcs.py similarity index 100% rename from examples/v0/ride_time_estimation/helper_funcs.py rename to examples/archive/v0/ride_time_estimation/helper_funcs.py diff --git a/examples/v0/ride_time_estimation/run.ipynb b/examples/archive/v0/ride_time_estimation/run.ipynb similarity index 100% rename from examples/v0/ride_time_estimation/run.ipynb rename to examples/archive/v0/ride_time_estimation/run.ipynb diff --git a/examples/v0/shopping_cart_recommendation/README.md b/examples/archive/v0/shopping_cart_recommendation/README.md similarity index 100% rename from examples/v0/shopping_cart_recommendation/README.md rename to examples/archive/v0/shopping_cart_recommendation/README.md diff --git a/examples/v0/shopping_cart_recommendation/helper_funcs.py b/examples/archive/v0/shopping_cart_recommendation/helper_funcs.py similarity index 100% rename from examples/v0/shopping_cart_recommendation/helper_funcs.py rename to examples/archive/v0/shopping_cart_recommendation/helper_funcs.py diff --git a/examples/v0/shopping_cart_recommendation/run.ipynb b/examples/archive/v0/shopping_cart_recommendation/run.ipynb similarity index 100% rename from examples/v0/shopping_cart_recommendation/run.ipynb rename to examples/archive/v0/shopping_cart_recommendation/run.ipynb diff --git a/examples/v0/speech_to_text/run.ipynb 
b/examples/archive/v0/speech_to_text/run.ipynb similarity index 100% rename from examples/v0/speech_to_text/run.ipynb rename to examples/archive/v0/speech_to_text/run.ipynb diff --git a/examples/v0/text_summarization/README.md b/examples/archive/v0/text_summarization/README.md similarity index 100% rename from examples/v0/text_summarization/README.md rename to examples/archive/v0/text_summarization/README.md diff --git a/examples/v0/text_summarization/helper_funcs.py b/examples/archive/v0/text_summarization/helper_funcs.py similarity index 100% rename from examples/v0/text_summarization/helper_funcs.py rename to examples/archive/v0/text_summarization/helper_funcs.py diff --git a/examples/v0/text_summarization/run.ipynb b/examples/archive/v0/text_summarization/run.ipynb similarity index 100% rename from examples/v0/text_summarization/run.ipynb rename to examples/archive/v0/text_summarization/run.ipynb diff --git a/examples/openai_evals_tutorial.ipynb b/examples/python/advanced/openai_evals_tutorial.ipynb similarity index 99% rename from examples/openai_evals_tutorial.ipynb rename to examples/python/advanced/openai_evals_tutorial.ipynb index d531c62b..35b76c26 100644 --- a/examples/openai_evals_tutorial.ipynb +++ b/examples/python/advanced/openai_evals_tutorial.ipynb @@ -346,7 +346,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/prompt_experiments_tutorial.ipynb b/examples/python/advanced/prompt_experiments_tutorial.ipynb similarity index 99% rename from examples/prompt_experiments_tutorial.ipynb rename to examples/python/advanced/prompt_experiments_tutorial.ipynb index 783c7f00..e76794c7 100644 --- a/examples/prompt_experiments_tutorial.ipynb +++ b/examples/python/advanced/prompt_experiments_tutorial.ipynb @@ -471,7 +471,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": 
"3.10.12" }, "vscode": { "interpreter": { diff --git a/examples/validation_tutorial.ipynb b/examples/python/advanced/validation_tutorial.ipynb similarity index 99% rename from examples/validation_tutorial.ipynb rename to examples/python/advanced/validation_tutorial.ipynb index 267bfc0a..03c420dd 100644 --- a/examples/validation_tutorial.ipynb +++ b/examples/python/advanced/validation_tutorial.ipynb @@ -809,7 +809,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/examples/python/basic/experiments_evaluation_tutorial.ipynb b/examples/python/basic/experiments_evaluation_tutorial.ipynb new file mode 100644 index 00000000..57f15693 --- /dev/null +++ b/examples/python/basic/experiments_evaluation_tutorial.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Experiments with UpTrain\n", + "\n", + "Experiments help you perform A/B testing with prompts, so you can compare and choose the options most suitable for you. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install UpTrain\n", + "\n", + "Run the following commands in your terminal to install UpTrain:\n", + "```bash\n", + "pip install uptrain\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from uptrain.framework import APIClient, Evals, CritiqueTone\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an UpTrain API Client\n", + "\n", + "Before we can start using UpTrain, we need to create an API client. 
You can do this by passing your API key to the `APIClient` constructor.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "UPTRAIN_API_KEY = \"up-***************\"\n", + "client = APIClient(uptrain_api_key=UPTRAIN_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create your data\n", + "\n", + "You can define your data as a simple dictionary with the following keys:\n", + "\n", + "- `question`: The question you want to ask\n", + "- `context`: The context relevant to the question\n", + "- `response`: The response to the question" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = [{\n", + " 'question': 'Which is the most popular global sport?',\n", + " 'context': \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.\",\n", + " 'response': \"1. 
The most popular global sport is determined by factors such as TV viewership, social media presence, number of participants, and economic impact. 2. Football is considered the most popular sport in the world, with events like the FIFA World Cup and star players like Ronaldo and Messi attracting over 4 billion followers. 3. Cricket is particularly popular in countries like India, Pakistan, Australia, and England, with events like the ICC Cricket World Cup and the Indian Premier League (IPL) having substantial viewership. 4. Basketball has gained global popularity, especially in the USA, Canada, China, and the Philippines, largely thanks to the NBA. 5. Tennis also has a significant global audience, with major tournaments like Wimbledon, the US Open, French Open, and Australian Open, and notable players like Roger Federer, Serena Williams, and Rafael Nadal. 6. Field Hockey enjoys popularity in countries like India, Netherlands, and Australia and has followers in many other parts of the world. So, the most popular global sport is football, followed by cricket, basketball, tennis, and field hockey, depending on the region and various factors.\",\n", + " 'prompt_variation': 'chain-of-thought'\n", + "},\n", + "{\n", + " 'question': 'Which is the most popular global sport?',\n", + " 'context': \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. 
Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.\",\n", + " 'response': \"- Which is the most popular global sport? - Popularity of sports can be measured in different ways: - TV viewership - Social media presence - Number of participants - Economic impact - Football: - FIFA World Cup - Ronaldo and Messi draw over 4 billion followers - Cricket: - Popular in India, Pakistan, Australia, and England - ICC Cricket World Cup - Indian Premier League (IPL) - Basketball: - NBA - Popularity in the USA, Canada, China, Philippines - Tennis: - Major tournaments: Wimbledon, US Open, French Open, Australian Open - Players: Roger Federer, Serena Williams, Rafael Nadal - Field Hockey: - Popular in India, Netherlands, Australia - Followers in many parts of the world In summary, football is the most popular global sport, followed by cricket, basketball, tennis, and field hockey, with variations in popularity depending on region and measurement criteria.\",\n", + " 'prompt_variation': 'tree-of-thought'\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiments\n", + "\n", + "Now that we have our data, we can perform experiments on it using UpTrain. We use the `evaluate_experiments` method to do this. This method takes the following arguments:\n", + "\n", + "- `project_name`: The name of your project\n", + "- `data`: The data you want to log and evaluate\n", + "- `checks`: The evaluations you want to perform on your data\n", + "- `exp_columns`: A list of all the columns that act as identifiers to indicate which experiment the row belongs to.
You can enter multiple column names here.\n", + "\n", + "You can find the list of all available evaluations [here](https://docs.uptrain.ai/key-components/evals).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = client.evaluate_experiments(\n", + " project_name=\"Sample-Experiment\",\n", + " data=data,\n", + " checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone()],\n", + " exp_columns=['prompt_variation']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get your results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(json.dumps(results, indent=3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/python/basic/managed_service_client_tutorial.ipynb b/examples/python/basic/managed_service_client_tutorial.ipynb new file mode 100644 index 00000000..55c9687e --- /dev/null +++ b/examples/python/basic/managed_service_client_tutorial.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using UpTrain with our Managed Service API Client\n", + "\n", + "You can use UpTrain's managed service to log and evaluate your LLM responses. Just provide your UpTrain API key (no need for OpenAI keys) and UpTrain manages running evaluations for you with real-time dashboards and deep insights." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install UpTrain\n", + "\n", + "Run the following commands in your terminal to install UpTrain:\n", + "```bash\n", + "pip install uptrain\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from uptrain.framework import APIClient, Evals, CritiqueTone\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an UpTrain API Client\n", + "\n", + "Before we can start using UpTrain, we need to create an API client. You can do this by passing your API key to the `APIClient` constructor.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "UPTRAIN_API_KEY = \"up-***************\"\n", + "client = APIClient(uptrain_api_key=UPTRAIN_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create your data\n", + "\n", + "You can define your data as a simple dictionary with the following keys:\n", + "\n", + "- `question`: The question you want to ask\n", + "- `context`: The context relevant to the question\n", + "- `response`: The response to the question" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data = [{\n", + " 'question': 'Which is the most popular global sport?',\n", + " 'context': \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. 
The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.\",\n", + " 'response': 'Football is the most popular sport with around 4 billion followers worldwide'\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Log and evaluate your data\n", + "\n", + "Now that we have our data, we can log it and evaluate it using UpTrain. We use the `log_and_evaluate` method to do this. This method takes the following arguments:\n", + "\n", + "- `project_name`: The name of your project\n", + "- `data`: The data you want to log and evaluate\n", + "- `checks`: The evaluations you want to perform on your data\n", + "\n", + "You can find the list of all available evaluations [here](https://docs.uptrain.ai/key-components/evals)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2023-09-29 19:25:43.139\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.remote\u001b[0m:\u001b[36mlog_and_evaluate\u001b[0m:\u001b[36m446\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain server\u001b[0m\n" + ] + } + ], + "source": [ + "results = client.log_and_evaluate(\n", + " project_name=\"Sample-Project\",\n", + " data=data,\n", + " checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona=\"teacher\")]\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get your results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " {\n", + " \"question\": \"Which is the most popular global sport?\",\n", + " \"context\": \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. 
It has a considerable following in many parts of the world.\",\n", + " \"response\": \"Football is the most popular sport with around 4 billion followers worldwide\",\n", + " \"score_context_relevance\": 1.0,\n", + " \"explanation_context_relevance\": \"1. The question asks for the most popular global sport.\\n2. The extracted context provides information about the popularity of various sports, including football, cricket, basketball, tennis, and field hockey.\\n3. The extracted context states that football is undoubtedly the world's most popular sport, with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi drawing a followership of more than 4 billion people.\\n4. Based on the information provided in the extracted context, it can be concluded that football is the most popular global sport.\\n5. Therefore, the extracted context can answer the given question completely.\\n\\nScore: 1.0\\n1.0\",\n", + " \"score_factual_accuracy\": 1.0,\n", + " \"explanation_factual_accuracy\": \"1. Football is the most popular sport.\\nArgument for yes: The context explicitly states that football is undoubtedly the world's most popular sport.\\nArgument for no: The context mentions other sports like cricket, basketball, and field hockey that are also popular, so it is not clear if football is the most popular.\\nJudgement: yes. The argument for yes is stronger as the context clearly states that football is the most popular sport.\\n2. It has around 4 billion followers worldwide.\\nArgument for yes: The context mentions that football has a followership of more than 4 billion people.\\nArgument for no: No arguments.\\nJudgement: yes. The argument for yes is stronger as the context explicitly states that football has more than 4 billion followers worldwide.\\n\",\n", + " \"score_response_relevance\": 0.0,\n", + " \"explanation_response_relevance\": \"1. Read the question: \\\"Which is the most popular global sport?\\\"\\n2. 
Read the response: \\\"Football is the most popular sport with around 4 billion followers worldwide.\\\"\\n3. Compare the response to the question.\\n4. The response directly answers the question by stating that football is the most popular global sport.\\n5. There is no additional irrelevant information in the response.\\n6. The response is concise and does not contain any unnecessary details.\\n7. The response is focused and does not include any logical fallacies, incorrect assumptions, or errors in reasoning.\\n8. Based on the above analysis, the generated answer has no additional irrelevant information.\\n\\nThe correct answer is C. The generated answer has no additional irrelevant information.\\n0.0\\n0.0\",\n", + " \"score_tone\": 0.4,\n", + " \"explanation_tone\": \"The tone is not appropriate for a teacher. It is a simple statement of fact without any context or explanation. A teacher would typically provide more information and engage the student in a discussion about the topic.\"\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "print(json.dumps(results, indent=3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/python/basic/open_source_evaluator_tutorial.ipynb b/examples/python/basic/open_source_evaluator_tutorial.ipynb new file mode 100644 index 00000000..329cb533 --- /dev/null +++ b/examples/python/basic/open_source_evaluator_tutorial.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using UpTrain with the Open-source EvalLLM Evaluator\n", + "\n", + "You can evaluate your 
responses via the open-source version by providing your OpenAI API key to run evaluations. UpTrain leverages a pipeline comprising GPT-3.5 calls for the same. Note that the evaluation pipeline runs on UpTrain's server but none of the data is logged." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install UpTrain\n", + "\n", + "Run the following commands in your terminal to install UpTrain:\n", + "```bash\n", + "pip install uptrain\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from uptrain.framework import EvalLLM, Evals, CritiqueTone\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get your OpenAI API key\n", + "\n", + "You can get your OpenAI API key [here](https://platform.openai.com/account/api-keys)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_API_KEY = \"sk-*****************\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an EvalLLM Evaluator\n", + "\n", + "Before we can start using UpTrain, we need to create an EvalLLM Evaluator. You can do this by passing your API key to the `EvalLLM` constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create your data\n", + "\n", + "You can define your data as a simple dictionary with the following keys:\n", + "\n", + "- `question`: The question you want to ask\n", + "- `context`: The context relevant to the question\n", + "- `response`: The response to the question" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data = [{\n", + " 'question': 'Which is the most popular global sport?',\n", + " 'context': \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. 
It has a considerable following in many parts of the world.\",\n", + " 'response': 'Football is the most popular sport with around 4 billion followers worldwide'\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate your data\n", + "\n", + "Now that we have our data, we can evaluate it using UpTrain. We use the `evaluate` method to do this. This method takes the following arguments:\n", + "\n", + "- `data`: The data you want to evaluate\n", + "- `checks`: The evaluations you want to perform on your data\n", + "\n", + "You can find the list of all available evaluations [here](https://docs.uptrain.ai/key-components/evals)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2023-09-29 19:17:50.503\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m95\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n", + "\u001b[32m2023-09-29 19:18:21.530\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m95\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n", + "\u001b[32m2023-09-29 19:18:47.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m95\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n" + ] + } + ], + "source": [ + "results = eval_llm.evaluate(\n", + " data=data,\n", + " checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona=\"teacher\")]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get your results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, +
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " {\n", + " \"question\": \"Which is the most popular global sport?\",\n", + " \"context\": \"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people. Cricket is particularly popular in countries like India, Pakistan, Australia, and England. The ICC Cricket World Cup and Indian Premier League (IPL) have substantial viewership. The NBA has made basketball popular worldwide, especially in countries like the USA, Canada, China, and the Philippines. Major tennis tournaments like Wimbledon, the US Open, French Open, and Australian Open have large global audiences. Players like Roger Federer, Serena Williams, and Rafael Nadal have boosted the sport's popularity. Field Hockey is very popular in countries like India, Netherlands, and Australia. It has a considerable following in many parts of the world.\",\n", + " \"response\": \"Football is the most popular sport with around 4 billion followers worldwide\",\n", + " \"score_context_relevance\": 1.0,\n", + " \"explanation_context_relevance\": \"1. The question asks for the most popular global sport.\\n2. The extracted context provides information about the popularity of various sports, including football, cricket, basketball, tennis, and field hockey.\\n3. The context states that football is undoubtedly the world's most popular sport, with major events like the FIFA World Cup and popular sports personalities like Ronaldo and Messi drawing a followership of more than 4 billion people.\\n4. 
The context also mentions the popularity of cricket in countries like India, Pakistan, Australia, and England, the popularity of basketball worldwide due to the NBA, the global audiences of major tennis tournaments, and the popularity of field hockey in countries like India, Netherlands, and Australia.\\n5. Based on the information provided in the context, it can be concluded that football is the most popular global sport.\\n6. Therefore, the extracted context can answer the given question completely.\\n\\nScore: 1.0\\n1.0\",\n", + " \"score_factual_accuracy\": 1.0,\n", + " \"explanation_factual_accuracy\": \"1. Football is the most popular sport.\\nArgument for yes: The context explicitly states that football is undoubtedly the world's most popular sport.\\nArgument for no: The context mentions other sports like cricket, basketball, and field hockey that are also popular, so it is not clear if football is the most popular.\\nJudgement: yes. The argument for yes is stronger as the context clearly states that football is the most popular sport.\\n2. It has around 4 billion followers worldwide.\\nArgument for yes: The context mentions that football has a followership of more than 4 billion people.\\nArgument for no: No arguments.\\nJudgement: yes. The argument for yes is stronger as the context explicitly states that football has more than 4 billion followers worldwide.\\n\",\n", + " \"score_response_relevance\": 1.0,\n", + " \"explanation_response_relevance\": \"1. The question asks for the most popular global sport.\\n2. The response states that football is the most popular sport with around 4 billion followers worldwide.\\n3. The response directly answers the question by providing the specific sport (football) and the estimated number of followers worldwide.\\n4. There is no additional irrelevant information in the response.\\n5. 
The response is concise and focused on the main topic without any unnecessary details.\\n\\nBased on the reasoning above, the correct answer is:\\n\\nC. The generated answer has no additional irrelevant information.\\n\\nScore: 1.0\\n1.0\",\n", + " \"score_tone\": 0.4,\n", + " \"explanation_tone\": \"The tone does not align well with the persona of a teacher. The response is factual and lacks the warmth and guidance that is typically expected from a teacher.\"\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "print(json.dumps(results, indent=3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/get_started.py b/get_started.py index 2fe8771c..349f68bc 100644 --- a/get_started.py +++ b/get_started.py @@ -9,9 +9,9 @@ 'response': 'Football is the most popular sport with around 4 billion followers worldwide' }] -client = EvalLLM(openai_api_key=OPENAI_API_KEY) +eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY) -results = client.evaluate( +results = eval_llm.evaluate( data=data, checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, CritiqueTone(persona="teacher")] ) diff --git a/pyproject.toml b/pyproject.toml index 905bb99d..30e8239c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"] [project] name = "uptrain" -version = "0.3.8" +version = "0.4.0" description = "UpTrain - tool to evaluate LLM applications on aspects like factual accuracy, response quality, retrieval quality, tonality, etc." readme = "README.md" maintainers = [{ name = "UpTrain AI Team", email = "uptrain.ai@gmail.com" }]