From e8ee779bc5bec73e494dfd53c7f5738a92700246 Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Tue, 24 Sep 2024 13:41:30 +0100 Subject: [PATCH 1/7] stepfunctions --- stepfunctions/.gitignore | 10 + stepfunctions/README.md | 120 +++++ .../INSTALLER | 1 + .../agent_evaluation-0.2.0.dist-info/LICENSE | 175 +++++++ .../agent_evaluation-0.2.0.dist-info/METADATA | 74 +++ .../agent_evaluation-0.2.0.dist-info/NOTICE | 1 + .../agent_evaluation-0.2.0.dist-info/RECORD | 87 ++++ .../REQUESTED | 0 .../agent_evaluation-0.2.0.dist-info/WHEEL | 5 + .../entry_points.txt | 2 + .../top_level.txt | 1 + stepfunctions/agenteval/__init__.py | 46 ++ stepfunctions/agenteval/cli.py | 109 +++++ stepfunctions/agenteval/conversation.py | 35 ++ stepfunctions/agenteval/defaults.py | 8 + .../agenteval/evaluators/__init__.py | 7 + .../agenteval/evaluators/base_evaluator.py | 139 ++++++ .../agenteval/evaluators/claude_3/__init__.py | 6 + .../evaluators/claude_3/evaluator.py | 244 ++++++++++ .../evaluators/claude_3/model_configs.py | 26 ++ .../claude_3/most_updated_prompt_2607.txt | 67 +++ .../agenteval/evaluators/evaluator_factory.py | 27 ++ stepfunctions/agenteval/hook.py | 33 ++ stepfunctions/agenteval/plan.py | 110 +++++ stepfunctions/agenteval/runner/__init__.py | 6 + stepfunctions/agenteval/runner/runner.py | 116 +++++ stepfunctions/agenteval/runner/summary.py | 30 ++ stepfunctions/agenteval/target_response.py | 15 + stepfunctions/agenteval/targets/__init__.py | 8 + .../agenteval/targets/base_target.py | 27 ++ .../targets/bedrock_agent/__init__.py | 3 + .../agenteval/targets/bedrock_agent/target.py | 41 ++ .../bedrock_knowledge_base/__init__.py | 3 + .../targets/bedrock_knowledge_base/target.py | 38 ++ .../agenteval/targets/boto3_target.py | 41 ++ .../agenteval/targets/q_business/__init__.py | 3 + .../agenteval/targets/q_business/target.py | 32 ++ .../targets/sagemaker_endpoint/__init__.py | 3 + .../targets/sagemaker_endpoint/target.py | 85 ++++ .../agenteval/targets/target_factory.py | 32 ++ 
.../claude_3/generate_evaluation.jinja | 13 + .../claude_3/generate_initial_prompt.jinja | 5 + .../claude_3/generate_test_status.jinja | 13 + .../claude_3/generate_user_response.jinja | 13 + .../claude_3/system/generate_evaluation.jinja | 12 + .../system/generate_initial_prompt.jinja | 13 + .../system/generate_test_status.jinja | 13 + .../system/generate_user_response.jinja | 15 + .../summary/agenteval_summary.md.jinja | 49 ++ stepfunctions/agenteval/test.py | 29 ++ stepfunctions/agenteval/test_result.py | 27 ++ stepfunctions/agenteval/trace.py | 72 +++ stepfunctions/agenteval/utils/__init__.py | 7 + stepfunctions/agenteval/utils/aws.py | 36 ++ stepfunctions/agenteval/utils/imports.py | 35 ++ stepfunctions/app.py | 28 ++ stepfunctions/cdk.json | 70 +++ .../prompts_scenarios.json | 158 +++++++ .../layers/agent-evaluation/requirements.txt | 1 + stepfunctions/layers/jinja2/requirements.txt | 1 + .../layers/pydantic/requirements.txt | 1 + stepfunctions/layers/pyyaml/requirements.txt | 1 + stepfunctions/requirements-dev.txt | 1 + stepfunctions/requirements.txt | 3 + stepfunctions/source.bat | 13 + stepfunctions/stepfunctions/__init__.py | 0 .../INSTALLER | 1 + .../agent_evaluation-0.2.0.dist-info/LICENSE | 175 +++++++ .../agent_evaluation-0.2.0.dist-info/METADATA | 74 +++ .../agent_evaluation-0.2.0.dist-info/NOTICE | 1 + .../agent_evaluation-0.2.0.dist-info/RECORD | 87 ++++ .../REQUESTED | 0 .../agent_evaluation-0.2.0.dist-info/WHEEL | 5 + .../entry_points.txt | 2 + .../top_level.txt | 1 + .../stepfunctions/agenteval/__init__.py | 46 ++ stepfunctions/stepfunctions/agenteval/cli.py | 109 +++++ .../stepfunctions/agenteval/conversation.py | 35 ++ .../stepfunctions/agenteval/defaults.py | 8 + .../agenteval/evaluators/__init__.py | 7 + .../agenteval/evaluators/base_evaluator.py | 139 ++++++ .../agenteval/evaluators/claude_3/__init__.py | 6 + .../evaluators/claude_3/evaluator.py | 244 ++++++++++ .../evaluators/claude_3/model_configs.py | 26 ++ 
.../claude_3/most_updated_prompt_2607.txt | 67 +++ .../agenteval/evaluators/evaluator_factory.py | 27 ++ stepfunctions/stepfunctions/agenteval/hook.py | 33 ++ stepfunctions/stepfunctions/agenteval/plan.py | 110 +++++ .../agenteval/runner/__init__.py | 6 + .../stepfunctions/agenteval/runner/runner.py | 116 +++++ .../stepfunctions/agenteval/runner/summary.py | 30 ++ .../agenteval/target_response.py | 15 + .../agenteval/targets/__init__.py | 8 + .../agenteval/targets/base_target.py | 27 ++ .../targets/bedrock_agent/__init__.py | 3 + .../agenteval/targets/bedrock_agent/target.py | 41 ++ .../bedrock_knowledge_base/__init__.py | 3 + .../targets/bedrock_knowledge_base/target.py | 38 ++ .../agenteval/targets/boto3_target.py | 41 ++ .../agenteval/targets/q_business/__init__.py | 3 + .../agenteval/targets/q_business/target.py | 32 ++ .../targets/sagemaker_endpoint/__init__.py | 3 + .../targets/sagemaker_endpoint/target.py | 85 ++++ .../agenteval/targets/target_factory.py | 32 ++ .../claude_3/generate_evaluation.jinja | 13 + .../claude_3/generate_initial_prompt.jinja | 5 + .../claude_3/generate_test_status.jinja | 13 + .../claude_3/generate_user_response.jinja | 13 + .../claude_3/system/generate_evaluation.jinja | 12 + .../system/generate_initial_prompt.jinja | 13 + .../system/generate_test_status.jinja | 13 + .../system/generate_user_response.jinja | 15 + .../summary/agenteval_summary.md.jinja | 49 ++ stepfunctions/stepfunctions/agenteval/test.py | 29 ++ .../stepfunctions/agenteval/test_result.py | 27 ++ .../stepfunctions/agenteval/trace.py | 72 +++ .../stepfunctions/agenteval/utils/__init__.py | 7 + .../stepfunctions/agenteval/utils/aws.py | 36 ++ .../stepfunctions/agenteval/utils/imports.py | 35 ++ .../.~c9_invoke_Zi2ZN1.py | 43 ++ .../functions/check_agent_status_1/index.py | 29 ++ .../.~c9_invoke_Zi2ZN1.py | 43 ++ .../functions/check_agent_status_2/index.py | 27 ++ .../functions/create_alias/index.py | 31 ++ .../functions/delete_alias/index.py | 26 ++ 
.../functions/generate_map/index.py | 47 ++ .../stepfunctions/functions/run_test/index.py | 163 +++++++ .../functions/update_bedrock_agent/index.py | 37 ++ stepfunctions/stepfunctions/layer/__init__.py | 1 + stepfunctions/stepfunctions/layer/layer.py | 60 +++ .../stepfunctions/stepfunctions_stack.py | 432 ++++++++++++++++++ stepfunctions/tests/__init__.py | 0 stepfunctions/tests/unit/__init__.py | 0 .../tests/unit/test_stepfunctions_stack.py | 15 + 134 files changed, 5437 insertions(+) create mode 100644 stepfunctions/.gitignore create mode 100644 stepfunctions/README.md create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt create mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt create mode 100644 stepfunctions/agenteval/__init__.py create mode 100644 stepfunctions/agenteval/cli.py create mode 100644 stepfunctions/agenteval/conversation.py create mode 100644 stepfunctions/agenteval/defaults.py create mode 100644 stepfunctions/agenteval/evaluators/__init__.py create mode 100644 stepfunctions/agenteval/evaluators/base_evaluator.py create mode 100644 stepfunctions/agenteval/evaluators/claude_3/__init__.py create mode 100644 stepfunctions/agenteval/evaluators/claude_3/evaluator.py create mode 100644 stepfunctions/agenteval/evaluators/claude_3/model_configs.py create mode 100644 stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt create mode 100644 
stepfunctions/agenteval/evaluators/evaluator_factory.py create mode 100644 stepfunctions/agenteval/hook.py create mode 100644 stepfunctions/agenteval/plan.py create mode 100644 stepfunctions/agenteval/runner/__init__.py create mode 100644 stepfunctions/agenteval/runner/runner.py create mode 100644 stepfunctions/agenteval/runner/summary.py create mode 100644 stepfunctions/agenteval/target_response.py create mode 100644 stepfunctions/agenteval/targets/__init__.py create mode 100644 stepfunctions/agenteval/targets/base_target.py create mode 100644 stepfunctions/agenteval/targets/bedrock_agent/__init__.py create mode 100644 stepfunctions/agenteval/targets/bedrock_agent/target.py create mode 100644 stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py create mode 100644 stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py create mode 100644 stepfunctions/agenteval/targets/boto3_target.py create mode 100644 stepfunctions/agenteval/targets/q_business/__init__.py create mode 100644 stepfunctions/agenteval/targets/q_business/target.py create mode 100644 stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py create mode 100644 stepfunctions/agenteval/targets/sagemaker_endpoint/target.py create mode 100644 stepfunctions/agenteval/targets/target_factory.py create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja create mode 100644 
stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja create mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja create mode 100644 stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja create mode 100644 stepfunctions/agenteval/test.py create mode 100644 stepfunctions/agenteval/test_result.py create mode 100644 stepfunctions/agenteval/trace.py create mode 100644 stepfunctions/agenteval/utils/__init__.py create mode 100644 stepfunctions/agenteval/utils/aws.py create mode 100644 stepfunctions/agenteval/utils/imports.py create mode 100644 stepfunctions/app.py create mode 100644 stepfunctions/cdk.json create mode 100644 stepfunctions/example_prompt_jsons/prompts_scenarios.json create mode 100644 stepfunctions/layers/agent-evaluation/requirements.txt create mode 100644 stepfunctions/layers/jinja2/requirements.txt create mode 100644 stepfunctions/layers/pydantic/requirements.txt create mode 100644 stepfunctions/layers/pyyaml/requirements.txt create mode 100644 stepfunctions/requirements-dev.txt create mode 100644 stepfunctions/requirements.txt create mode 100644 stepfunctions/source.bat create mode 100644 stepfunctions/stepfunctions/__init__.py create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL create mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt create mode 100644 
stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt create mode 100644 stepfunctions/stepfunctions/agenteval/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/cli.py create mode 100644 stepfunctions/stepfunctions/agenteval/conversation.py create mode 100644 stepfunctions/stepfunctions/agenteval/defaults.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/model_configs.py create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt create mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/evaluator_factory.py create mode 100644 stepfunctions/stepfunctions/agenteval/hook.py create mode 100644 stepfunctions/stepfunctions/agenteval/plan.py create mode 100644 stepfunctions/stepfunctions/agenteval/runner/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/runner/runner.py create mode 100644 stepfunctions/stepfunctions/agenteval/runner/summary.py create mode 100644 stepfunctions/stepfunctions/agenteval/target_response.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/base_target.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py create mode 100644 
stepfunctions/stepfunctions/agenteval/targets/boto3_target.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/q_business/target.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py create mode 100644 stepfunctions/stepfunctions/agenteval/targets/target_factory.py create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja create mode 100644 stepfunctions/stepfunctions/agenteval/test.py create mode 100644 stepfunctions/stepfunctions/agenteval/test_result.py create mode 100644 stepfunctions/stepfunctions/agenteval/trace.py create mode 100644 stepfunctions/stepfunctions/agenteval/utils/__init__.py create mode 100644 stepfunctions/stepfunctions/agenteval/utils/aws.py create mode 100644 stepfunctions/stepfunctions/agenteval/utils/imports.py create mode 100644 
stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py create mode 100644 stepfunctions/stepfunctions/functions/check_agent_status_1/index.py create mode 100644 stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py create mode 100644 stepfunctions/stepfunctions/functions/check_agent_status_2/index.py create mode 100644 stepfunctions/stepfunctions/functions/create_alias/index.py create mode 100644 stepfunctions/stepfunctions/functions/delete_alias/index.py create mode 100644 stepfunctions/stepfunctions/functions/generate_map/index.py create mode 100644 stepfunctions/stepfunctions/functions/run_test/index.py create mode 100644 stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py create mode 100644 stepfunctions/stepfunctions/layer/__init__.py create mode 100644 stepfunctions/stepfunctions/layer/layer.py create mode 100644 stepfunctions/stepfunctions/stepfunctions_stack.py create mode 100644 stepfunctions/tests/__init__.py create mode 100644 stepfunctions/tests/unit/__init__.py create mode 100644 stepfunctions/tests/unit/test_stepfunctions_stack.py diff --git a/stepfunctions/.gitignore b/stepfunctions/.gitignore new file mode 100644 index 0000000..37833f8 --- /dev/null +++ b/stepfunctions/.gitignore @@ -0,0 +1,10 @@ +*.swp +package-lock.json +__pycache__ +.pytest_cache +.venv +*.egg-info + +# CDK asset staging directory +.cdk.staging +cdk.out diff --git a/stepfunctions/README.md b/stepfunctions/README.md new file mode 100644 index 0000000..af346c3 --- /dev/null +++ b/stepfunctions/README.md @@ -0,0 +1,120 @@ +# Bedrock Agent Evaluation Framework + +This project implements an automated evaluation framework for Amazon Bedrock Agents using AWS CDK, Step Functions, and Lambda. + +## Overview + +The framework automates the process of updating Bedrock Agents with new prompts, creating aliases, running evaluation scenarios, and cleaning up resources. 
It uses AWS Step Functions to orchestrate the workflow and AWS Lambda functions to perform individual tasks. + +The example provided is for an energy chatbot use case. + +## Components + +1. **CDK Stack (StepfunctionsStack)**: Defines the infrastructure, including Lambda functions, Step Functions state machine, and associated IAM roles. + +2. **Lambda Functions**: + - `generate_map`: Generates evaluation scenarios from S3 input. + - `check_agent_status_1` and `check_agent_status_2`: Check the status of Bedrock Agents. + - `update_bedrock_agent`: Updates the Bedrock Agent with new instructions. + - `create_alias`: Creates an alias for the updated agent. + - `run_test`: Executes evaluation scenarios using the `agenteval` library. + - `delete_alias`: Removes the temporary alias after evaluation. + +3. **Step Functions State Machine**: Orchestrates the evaluation workflow, including agent updates, status checks, and scenario execution. + +4. **S3 Bucket**: Stores evaluation prompts and results. + +5. **EventBridge Rule**: Triggers the Step Functions workflow when new evaluation prompts are uploaded to S3. + +## Workflow + +1. New evaluation prompts are uploaded to the S3 bucket. +2. The EventBridge rule triggers the Step Functions state machine. +3. The state machine updates the Bedrock Agent with new instructions. +4. An alias is created for the updated agent. +5. Evaluation scenarios are executed using the `agenteval` library. +6. Results are stored in the S3 bucket. +7. The temporary alias is deleted. + +## Setup and Deployment + +1. Ensure you have the AWS CDK installed and configured. +2. Install project dependencies: + ``` + npm install + ``` +3. Deploy the stack: + ``` + cdk deploy + ``` + +## Usage + +To run an evaluation: + +1. Prepare an evaluation JSON file with prompts and customer profiles. +2. Upload the file to the S3 bucket in the `evaluation_prompts/` prefix. +3. The evaluation process will start automatically. +4. 
Results will be available in the S3 bucket under the `results/` prefix. + +## Notes + +- Ensure proper IAM permissions are set up for accessing Bedrock, S3, and other AWS services. +- The `agenteval` library is assumed to be provided as a custom Lambda layer. + + +# CDK instructions + +The `cdk.json` file tells the CDK Toolkit how to execute your app. + +This project is set up like a standard Python project. The initialization +process also creates a virtualenv within this project, stored under the `.venv` +directory. To create the virtualenv it assumes that there is a `python3` +(or `python` for Windows) executable in your path with access to the `venv` +package. If for any reason the automatic creation of the virtualenv fails, +you can create the virtualenv manually. + +To manually create a virtualenv on macOS and Linux: + +``` +$ python3 -m venv .venv +``` + +After the init process completes and the virtualenv is created, you can use the following +step to activate your virtualenv. + +``` +$ source .venv/bin/activate +``` + +If you are on a Windows platform, you would activate the virtualenv like this: + +``` +% .venv\Scripts\activate.bat +``` + +Once the virtualenv is activated, you can install the required dependencies. + +``` +$ pip install -r requirements.txt +``` + +At this point you can now synthesize the CloudFormation template for this code. + +``` +$ cdk synth +``` + +To add additional dependencies, for example other CDK libraries, just add +them to your `requirements.txt` file and rerun the `pip install -r requirements.txt` +command. + +## Useful commands + + * `cdk ls` list all stacks in the app + * `cdk synth` emits the synthesized CloudFormation template + * `cdk deploy` deploy this stack to your default AWS account/region + * `cdk diff` compare deployed stack with current state + * `cdk docs` open CDK documentation + +Enjoy! 
diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER new file mode 100644 index 0000000..a1b589e --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE new file mode 100644 index 0000000..67db858 --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE @@ -0,0 +1,175 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA new file mode 100644 index 0000000..1070391 --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA @@ -0,0 +1,74 @@ +Metadata-Version: 2.1 +Name: agent-evaluation +Version: 0.2.0 +Summary: A generative AI-powered framework for testing virtual agents. 
+Home-page: https://awslabs.github.io/agent-evaluation/ +Author: Amazon Web Services +Author-email: agent-evaluation-oss-core-team@amazon.com +License: Apache 2.0 +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: Topic :: Utilities +Classifier: Topic :: Software Development :: Testing +Classifier: License :: OSI Approved :: Apache Software License +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Requires-Python: >=3.9 +Description-Content-Type: text/markdown +License-File: LICENSE +License-File: NOTICE +Requires-Dist: pyyaml ~=6.0 +Requires-Dist: boto3 <2.0,>=1.34.20 +Requires-Dist: click ~=8.0 +Requires-Dist: pydantic <3.0,>=2.1.0 +Requires-Dist: rich <14.0,>=13.7.0 +Requires-Dist: jinja2 <4.0,>=3.1.3 +Requires-Dist: jsonpath-ng <2.0,>=1.6.1 +Provides-Extra: dev +Requires-Dist: flake8 ; extra == 'dev' +Requires-Dist: black ; extra == 'dev' +Requires-Dist: isort ; extra == 'dev' +Requires-Dist: pytest ; extra == 'dev' +Requires-Dist: pytest-cov ; extra == 'dev' +Requires-Dist: pytest-mock ; extra == 'dev' +Requires-Dist: mkdocs ; extra == 'dev' +Requires-Dist: mkdocs-material ; extra == 'dev' +Requires-Dist: mkdocstrings[python] ; extra == 'dev' +Requires-Dist: mkdocs-click ; extra == 'dev' +Requires-Dist: bandit ; extra == 'dev' +Requires-Dist: pip-audit ; extra == 'dev' + +![PyPI - Version](https://img.shields.io/pypi/v/agent-evaluation) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) +![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) +[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) +[![Code style: 
black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) + +# Agent Evaluation + +Agent Evaluation is a generative AI-powered framework for testing virtual agents. + +Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. + +## ✨ Key features + +- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. +- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. +- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. +- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. + +## 📚 Documentation + +To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) + +## 👏 Contributors + +Shout out to these awesome contributors: + + + + diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE new file mode 100644 index 0000000..616fc58 --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE @@ -0,0 +1 @@ +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD new file mode 100644 index 0000000..fcc2eac --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD @@ -0,0 +1,87 @@ +../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 +agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 +agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 +agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 +agent_evaluation-0.2.0.dist-info/RECORD,, +agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 +agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 +agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 +agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 +agenteval/__pycache__/__init__.cpython-311.pyc,, +agenteval/__pycache__/cli.cpython-311.pyc,, +agenteval/__pycache__/conversation.cpython-311.pyc,, +agenteval/__pycache__/defaults.cpython-311.pyc,, +agenteval/__pycache__/hook.cpython-311.pyc,, +agenteval/__pycache__/plan.cpython-311.pyc,, +agenteval/__pycache__/target_response.cpython-311.pyc,, +agenteval/__pycache__/test.cpython-311.pyc,, +agenteval/__pycache__/test_result.cpython-311.pyc,, +agenteval/__pycache__/trace.cpython-311.pyc,, +agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 +agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 +agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 
+agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 +agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, +agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, +agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, +agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 +agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 +agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, +agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, +agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, +agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 +agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 +agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 +agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 +agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 +agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 +agenteval/runner/__pycache__/__init__.cpython-311.pyc,, +agenteval/runner/__pycache__/runner.cpython-311.pyc,, +agenteval/runner/__pycache__/summary.cpython-311.pyc,, +agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 +agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 +agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 +agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 +agenteval/targets/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/__pycache__/base_target.cpython-311.pyc,, +agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, +agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, 
+agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 +agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 +agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, +agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 +agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 +agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, +agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 +agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 +agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 +agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, +agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 +agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 +agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, +agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 +agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 +agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 +agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 
+agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 +agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 +agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 +agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 +agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 +agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 +agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 +agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 +agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 +agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 +agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 +agenteval/utils/__pycache__/__init__.cpython-311.pyc,, +agenteval/utils/__pycache__/aws.cpython-311.pyc,, +agenteval/utils/__pycache__/imports.cpython-311.pyc,, +agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 +agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED new file mode 100644 index 0000000..e69de29 diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL new file mode 100644 index 0000000..bab98d6 --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel 
(0.43.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt new file mode 100644 index 0000000..6919bf1 --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +agenteval = agenteval.cli:cli diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt new file mode 100644 index 0000000..060c7ea --- /dev/null +++ b/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt @@ -0,0 +1 @@ +agenteval diff --git a/stepfunctions/agenteval/__init__.py b/stepfunctions/agenteval/__init__.py new file mode 100644 index 0000000..cd7bf51 --- /dev/null +++ b/stepfunctions/agenteval/__init__.py @@ -0,0 +1,46 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from importlib.metadata import version + +import logging +import os + +from jinja2 import Environment, PackageLoader, select_autoescape +from rich.logging import RichHandler + +from .hook import Hook +from .target_response import TargetResponse + +__all__ = ["Hook", "TargetResponse"] +__version__ = version("agent-evaluation") + + +_LOG_LEVEL_ENV = "LOG_LEVEL" + + +def configure_logger(): + # suppress logs from botocore + logging.getLogger("botocore").setLevel(logging.CRITICAL) + + # configure logging using rich + formatter = logging.Formatter("%(message)s", datefmt="[%X]") + handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True) + handler.setFormatter(formatter) + + logger = logging.getLogger(__name__) + + logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO)) + logger.addHandler(handler) + + +configure_logger() + +jinja_env = Environment( + loader=PackageLoader(__name__), + autoescape=select_autoescape( + disabled_extensions=["jinja"], + default_for_string=True, +
default=True, + ), +) diff --git a/stepfunctions/agenteval/cli.py b/stepfunctions/agenteval/cli.py new file mode 100644 index 0000000..940f621 --- /dev/null +++ b/stepfunctions/agenteval/cli.py @@ -0,0 +1,109 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +from typing import Optional + +import click + +from agenteval.plan import Plan +from agenteval.runner import Runner + +logger = logging.getLogger(__name__) + + +def validate_directory(directory): + if not os.path.isdir(directory): + raise NotADirectoryError(f"{directory} is not a directory") + if not os.access(directory, os.R_OK) or not os.access(directory, os.W_OK): + raise PermissionError(f"No read/write permissions for {directory}") + + +@click.group() +def cli(): + pass + + +@cli.command(help="Initialize a test plan.") +@click.option( + "--plan-dir", + type=str, + required=False, + help="The destination directory for storing the test plan. If unspecified, then the test plan is saved to the current working directory.", +) +def init(plan_dir: Optional[str]): + if plan_dir: + validate_directory(plan_dir) + try: + path = Plan.init_plan(plan_dir) + logger.info(f"[green]Test plan created at {path}") + + except FileExistsError as e: + logger.error(f"[red]{e}") + exit(1) + + +@cli.command(help="Run test plan.") +@click.option( + "--filter", + type=str, + required=False, + help="Specifies the test(s) to run. Multiple tests should be seperated using a comma. If unspecified, all tests from the test plan will be run.", +) +@click.option( + "--plan-dir", + type=str, + required=False, + help="The directory where the test plan is stored. 
If unspecified, then the current working directory is used.", +) +@click.option( + "--verbose", + is_flag=True, + type=bool, + default=False, + help="Controls the verbosity of the terminal logs.", +) +@click.option( + "--num-threads", + type=int, + required=False, + help="Number of threads (and thus tests) to run concurrently. If unspecified, number of threads will be capped at 45.", +) +@click.option( + "--work-dir", + type=str, + required=False, + help="The directory where the test result and trace will be generated. If unspecified, then the current working directory is used.", +) +def run( + filter: Optional[str], + plan_dir: Optional[str], + verbose: bool, + num_threads: Optional[int], + work_dir: Optional[str], +): + try: + plan = Plan.load(plan_dir, filter) + if work_dir: + validate_directory(work_dir) + runner = Runner( + plan, + verbose, + num_threads, + work_dir, + ) + num_failed = runner.run() + _num_failed_exit(num_failed) + + except Exception as e: + _exception_exit(e) + + +def _num_failed_exit(num_failed): + exit(1 if num_failed else 0) + + +def _exception_exit(e): + logger.exception(f"Error running test: {e}") + exit(1) diff --git a/stepfunctions/agenteval/conversation.py b/stepfunctions/agenteval/conversation.py new file mode 100644 index 0000000..59e4304 --- /dev/null +++ b/stepfunctions/agenteval/conversation.py @@ -0,0 +1,35 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +_USER = "USER" +_AGENT = "AGENT" +_START_TURN_COUNT = 0 + + +class Conversation: + """Captures the interaction between a user and an agent. + + Attributes: + messages (list): A list of tuples of the form (role, message). + turns (int): The number of turns in the conversation. 
+ """ + + def __init__(self): + self.messages = [] + self.turns = _START_TURN_COUNT + + def __iter__(self): + """Allow iteration over conversation messages.""" + return iter(self.messages) + + def add_turn(self, user_message: str, agent_response: str): + """Record a turn in the conversation. + + Args: + user_message (str): The users's message + agent_response (str): The agent's response to the user's message + + Increments the `turn` counter by `1`. + """ + self.messages.extend([(_USER, user_message), (_AGENT, agent_response)]) + self.turns += 1 diff --git a/stepfunctions/agenteval/defaults.py b/stepfunctions/agenteval/defaults.py new file mode 100644 index 0000000..929c675 --- /dev/null +++ b/stepfunctions/agenteval/defaults.py @@ -0,0 +1,8 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +MAX_TURNS = 2 + +# Default max number of threads not exceeding Bedrock service quota: +# https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html +MAX_NUM_THREADS = 45 diff --git a/stepfunctions/agenteval/evaluators/__init__.py b/stepfunctions/agenteval/evaluators/__init__.py new file mode 100644 index 0000000..8e52702 --- /dev/null +++ b/stepfunctions/agenteval/evaluators/__init__.py @@ -0,0 +1,7 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from .base_evaluator import BaseEvaluator +from .evaluator_factory import EvaluatorFactory + +__all__ = ["BaseEvaluator", "EvaluatorFactory"] diff --git a/stepfunctions/agenteval/evaluators/base_evaluator.py b/stepfunctions/agenteval/evaluators/base_evaluator.py new file mode 100644 index 0000000..e1bd4c9 --- /dev/null +++ b/stepfunctions/agenteval/evaluators/base_evaluator.py @@ -0,0 +1,139 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import json +from abc import ABC, abstractmethod +from typing import Optional + +from agenteval.conversation import Conversation +from agenteval.hook import Hook +from agenteval.targets import BaseTarget +from agenteval.test import Test +from agenteval.test_result import TestResult +from agenteval.trace import Trace +from agenteval.utils import create_boto3_client, import_class + +_DEFAULT_MAX_RETRY = 10 +_BOTO3_SERVICE_NAME = "bedrock-runtime" + + +class BaseEvaluator(ABC): + """The `BaseEvaluator` abstract base class defines the common interface for evaluator + classes. + + Attributes: + test (Test): The test case. + target (BaseTarget): The target agent being evaluated. + conversation (Conversation): Captures the interaction between a user and an agent. + trace (Trace): Captures steps during evaluation. + test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`. + input_token_count (int): Number of input tokens processed by the evaluator. + output_token_count (int): Number of output tokens generated by the evaluator. + model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, + then this will be set to the ARN of the provisioned throughput. + boto3_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime. + """ + + def __init__( + self, + test: Test, + target: BaseTarget, + work_dir: str, + model_id: str, + provisioned_throughput_arn: Optional[str] = None, + aws_profile: Optional[str] = None, + aws_region: Optional[str] = None, + endpoint_url: Optional[str] = None, + max_retry: int = _DEFAULT_MAX_RETRY, + ): + """Initialize the evaluator instance for a given `Test` and `Target`. + + Args: + test (Test): The test case. + target (BaseTarget): The target agent being evaluated. + work_dir (str): The work directory. + model_id (str): The ID of the Bedrock model used to run evaluation. 
+ provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput. + aws_profile (str, optional): The AWS profile name. + aws_region (str, optional): The AWS region. + endpoint_url (str, optional): The endpoint URL for the AWS service. + max_retry (int, optional): The maximum number of retry attempts. + """ + self.test = test + self.target = target + self.conversation = Conversation() + self.trace = Trace(work_dir=work_dir, test_name=test.name) + self.test_result = None + self.input_token_count = 0 + self.output_token_count = 0 + self.model_id = provisioned_throughput_arn or model_id + self.bedrock_runtime_client = create_boto3_client( + boto3_service_name=_BOTO3_SERVICE_NAME, + aws_profile=aws_profile, + aws_region=aws_region, + endpoint_url=endpoint_url, + max_retry=max_retry, + ) + + @abstractmethod + def evaluate(self) -> TestResult: + """Conduct a test. + + Returns: + TestResult: The result of the test. + """ + pass + + def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]: + if hook: + hook_cls = import_class(hook, parent_class=Hook) + return hook_cls + + def invoke_model(self, request_body: dict) -> dict: + """ + Invoke the Bedrock model using the `boto3_client`. This method will convert + a request dictionary to a JSON string before passing it to the `InvokeModel` API. + + Refer to the `boto3` documentation for more details. + + Args: + request_body (dict): The request payload as a dictionary. + + Returns: + dict: The response from the model invocation. 
+ + """ + response = self.bedrock_runtime_client.invoke_model( + modelId=self.model_id, body=json.dumps(request_body) + ) + + self._incr_token_counts(response) + + return response + + def _incr_token_counts(self, response: dict): + headers = response["ResponseMetadata"]["HTTPHeaders"] + + self.input_token_count += int( + headers.get("x-amzn-bedrock-input-token-count", 0) + ) + self.output_token_count += int( + headers.get("x-amzn-bedrock-output-token-count", 0) + ) + + def run(self) -> TestResult: + """ + Run the evaluator within a trace context manager and run hooks + if provided. + """ + + hook_cls = self._get_hook_cls(self.test.hook) + + with self.trace: + if hook_cls: + hook_cls.pre_evaluate(self.test, self.trace) + self.test_result = self.evaluate() + if hook_cls: + hook_cls.post_evaluate(self.test, self.test_result, self.trace) + + return self.test_result diff --git a/stepfunctions/agenteval/evaluators/claude_3/__init__.py b/stepfunctions/agenteval/evaluators/claude_3/__init__.py new file mode 100644 index 0000000..338be7d --- /dev/null +++ b/stepfunctions/agenteval/evaluators/claude_3/__init__.py @@ -0,0 +1,6 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from .evaluator import Claude3Evaluator + +__all__ = ["Claude3Evaluator"] diff --git a/stepfunctions/agenteval/evaluators/claude_3/evaluator.py b/stepfunctions/agenteval/evaluators/claude_3/evaluator.py new file mode 100644 index 0000000..cc8b3ae --- /dev/null +++ b/stepfunctions/agenteval/evaluators/claude_3/evaluator.py @@ -0,0 +1,244 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import json +import logging +import os +import re +from typing import Tuple + +from agenteval import jinja_env +from agenteval.evaluators import BaseEvaluator +from agenteval.evaluators.claude_3 import model_configs +from agenteval.test_result import TestResult + +logger = logging.getLogger(__name__) + +_PROMPT_TEMPLATE_ROOT = "evaluators/claude_3" +_SYSTEM_PROMPT_DIR = "system" +_PROMPT_TEMPLATE_NAMES = [ + "generate_initial_prompt", + "generate_user_response", + "generate_test_status", + "generate_evaluation", +] + +# enable backwards-compatible StrEnum +try: + from enum import StrEnum +except ImportError: + from enum import Enum + + class StrEnum(str, Enum): + pass + + +class TestStatusCategories(StrEnum): + ALL_STEPS_ATTEMPTED = "A" + NOT_ALL_STEPS_ATTEMPTED = "B" + + +class EvaluationCategories(StrEnum): + ALL_EXPECTED_RESULTS_OBSERVED = "A" + NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B" + + +class Results(StrEnum): + MAX_TURNS_REACHED = "Maximum turns reached." + ALL_EXPECTED_RESULTS_OBSERVED = ( + "All of the expected results can be observed in the conversation." + ) + NOT_ALL_EXPECTED_RESULTS_OBSERVED = ( + "Not all of the expected results can be observed in the conversation." 
+ ) + + +class Claude3Evaluator(BaseEvaluator): + def __init__( + self, + **kwargs, + ): + super().__init__(model_id=model_configs.MODEL_ID, **kwargs) + + self._prompt_template_map = { + name: { + "system": jinja_env.get_template( + os.path.join( + _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja" + ) + ), + "prompt": jinja_env.get_template( + os.path.join(_PROMPT_TEMPLATE_ROOT, f"{name}.jinja") + ), + } + for name in _PROMPT_TEMPLATE_NAMES + } + + @staticmethod + def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple: + content = [] + for e in element_names: + pattern = rf"<{e}>(.*?)</{e}>" + match = re.search(pattern, xml_data, re.DOTALL) + content.append(match.group(1).strip() if match else None) + return tuple(content) + + def _generate( + self, + system_prompt: str, + prompt: str, + output_xml_element: str, + ) -> Tuple: + # Build a fresh payload per call: REQUEST_BODY is a module-level dict shared by all evaluators, so it must not be mutated in place. + request_body = { + **model_configs.REQUEST_BODY, + "system": system_prompt, + "messages": [ + { + "role": model_configs.ROLE, + "content": [{"type": "text", "text": prompt}], + } + ], + } + + response = self.invoke_model(request_body=request_body) + response_body = response.get("body").read() + completion = json.loads(response_body)["content"][0]["text"] + + logger.debug( + f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}" + ) + + output, reasoning = self._extract_content_from_xml( + completion, [output_xml_element, "thinking"] + ) + + return output, reasoning + + def _generate_initial_prompt(self) -> str: + system_prompt = self._prompt_template_map["generate_initial_prompt"][ + "system" + ].render() + prompt = self._prompt_template_map["generate_initial_prompt"]["prompt"].render( + step=self.test.steps[0] + ) + + initial_prompt, reasoning = self._generate( + system_prompt=system_prompt, + prompt=prompt, + output_xml_element="initial_prompt", + ) + + self.trace.add_step( + system_prompt=system_prompt, + prompt=prompt, + initial_prompt=initial_prompt, + reasoning=reasoning, + ) + return initial_prompt + + def _generate_test_status(self) ->
str: + system_prompt = self._prompt_template_map["generate_test_status"][ + "system" + ].render() + prompt = self._prompt_template_map["generate_test_status"]["prompt"].render( + steps=self.test.steps, conversation=self.conversation + ) + test_status, reasoning = self._generate( + system_prompt=system_prompt, + prompt=prompt, + output_xml_element="category", + ) + self.trace.add_step( + system_prompt=system_prompt, + prompt=prompt, + test_status=test_status, + reasoning=reasoning, + ) + return test_status + + def _generate_evaluation(self) -> tuple[str, str]: + system_prompt = self._prompt_template_map["generate_evaluation"][ + "system" + ].render() + prompt = self._prompt_template_map["generate_evaluation"]["prompt"].render( + expected_results=self.test.expected_results, + conversation=self.conversation, + ) + + evaluation, reasoning = self._generate( + system_prompt=system_prompt, + prompt=prompt, + output_xml_element="category", + ) + self.trace.add_step( + system_prompt=system_prompt, + prompt=prompt, + evaluation=evaluation, + reasoning=reasoning, + ) + + return evaluation, reasoning + + def _generate_user_response(self) -> str: + system_prompt = self._prompt_template_map["generate_user_response"][ + "system" + ].render() + prompt = self._prompt_template_map["generate_user_response"]["prompt"].render( + steps=self.test.steps, conversation=self.conversation + ) + + user_response, reasoning = self._generate( + system_prompt=system_prompt, + prompt=prompt, + output_xml_element="user_response", + ) + + self.trace.add_step( + system_prompt=system_prompt, + prompt=prompt, + user_response=user_response, + reasoning=reasoning, + ) + return user_response + + def _invoke_target(self, user_input) -> str: + target_response = self.target.invoke(user_input) + self.trace.add_step(data=target_response.data) + + return target_response.response + + def evaluate(self) -> TestResult: + success = False + result = Results.MAX_TURNS_REACHED.value + reasoning = "" + + while 
self.conversation.turns < self.test.max_turns: + if self.conversation.turns == 0: + # start conversation + if self.test.initial_prompt: + user_input = self.test.initial_prompt + else: + user_input = self._generate_initial_prompt() + else: + # generate next user response + user_input = self._generate_user_response() + + # add turn to the conversation + self.conversation.add_turn(user_input, self._invoke_target(user_input)) + + # get test status + test_status = self._generate_test_status() + if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED: + # evaluate conversation + eval_category, reasoning = self._generate_evaluation() + if ( + eval_category + == EvaluationCategories.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value # noqa: W503 + ): + result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value + else: + result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value + success = True + + break + + return TestResult( + test_name=self.test.name, + success=success, + result=result, + reasoning=reasoning, + conversation=self.conversation, + ) diff --git a/stepfunctions/agenteval/evaluators/claude_3/model_configs.py b/stepfunctions/agenteval/evaluators/claude_3/model_configs.py new file mode 100644 index 0000000..e6bc2fc --- /dev/null +++ b/stepfunctions/agenteval/evaluators/claude_3/model_configs.py @@ -0,0 +1,26 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" +ANTHROPIC_VERSION = "bedrock-2023-05-31" +ROLE = "user" +MAX_TOKENS_TO_SAMPLE = 300 +TEMPERATURE = 0 +TOP_K = 250 +TOP_P = 1 +REQUEST_BODY = { + "anthropic_version": ANTHROPIC_VERSION, + "max_tokens": MAX_TOKENS_TO_SAMPLE, + "system": None, + "messages": [ + { + "role": ROLE, + "content": [ + {"type": "text", "text": None}, + ], + } + ], + "temperature": TEMPERATURE, + "top_p": TOP_P, + "top_k": TOP_K, +} diff --git a/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt new file mode 100644 index 0000000..fce3738 --- /dev/null +++ b/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt @@ -0,0 +1,67 @@ + +You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. + + + + + + + +You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to those already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation. + + + + +Your approach is to: +1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing.
+2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. + +Some example questions include: + + + +* Does the customer have a smart meter? +* Are they aware of Energy Hub? +* Are they on the right tariff? +* How many people are in their household? +* What is their current living situation (apartment, house, etc.)? + + + +Some examples of recommendations include: + + +* Smart meter installation for better usage monitoring +* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount + + + +Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. + + +Here is an example interaction: + + +A: how can I reduce my energy bill? + +B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So I will ask some questions to understand you better. Is that okay? + +A: Yes + +B: What kind of house do you live in and with how many people? + +A: I live in a one-bedroom apartment with my partner. + +B: Thank you, and how do you measure your energy use? + +A: I send meter readings. + +B: Okay, so to confirm you don’t have a smart meter? + +A: No + +B: My first recommendation would be a smart meter. A smart meter is a way to ensure that your energy readings are always up to date and can assist with your payment if you are overpaying at some points in the year. Would you like some more recommendations? +...
class EvaluatorFactory(BaseModel):
    """Build evaluator instances from the evaluator section of a test plan.

    Attributes:
        config: Evaluator settings. The ``"model"`` entry selects the evaluator
            class from ``_EVALUATOR_MAP``; every other entry is forwarded to
            that class's constructor as a keyword argument.
    """

    config: dict

    def create(
        self, test: Test, target: BaseTarget, work_dir: Optional[str]
    ) -> BaseEvaluator:
        """Instantiate the evaluator selected by ``config["model"]``.

        Args:
            test: The test case the evaluator will run.
            target: The target the evaluator will converse with.
            work_dir: Working directory handed to the evaluator.

        Returns:
            A new evaluator bound to ``test`` and ``target``.

        Raises:
            KeyError: If ``config`` has no ``"model"`` entry, or the model is
                not registered in ``_EVALUATOR_MAP``.
        """
        model = self.config["model"]
        if model not in _EVALUATOR_MAP:
            # Same exception type as the plain dict lookup would raise, but the
            # message now tells the user which values are accepted.
            supported = ", ".join(sorted(_EVALUATOR_MAP))
            raise KeyError(
                f"Unknown evaluator model {model!r}; supported models: {supported}"
            )

        evaluator_kwargs = {k: v for k, v in self.config.items() if k != "model"}
        return _EVALUATOR_MAP[model](
            test=test,
            target=target,
            work_dir=work_dir,
            **evaluator_kwargs,
        )
class Plan(BaseModel, validate_assignment=True, arbitrary_types_allowed=True):
    """A test plan: how to build evaluators and targets, and which tests to run.

    Attributes:
        evaluator_factory: Factory built from the plan's ``evaluator`` section.
        target_factory: Factory built from the plan's ``target`` section.
        tests: The test cases to run; names must be unique.
    """

    evaluator_factory: EvaluatorFactory
    target_factory: TargetFactory
    tests: list[Test]

    @model_validator(mode="after")
    def check_test_names_unique(self) -> Plan:
        """Reject plans in which two tests share a name."""
        names = [test.name for test in self.tests]

        if len(set(names)) != len(names):
            raise ValueError("Test names must be unique")

        return self

    @classmethod
    def load(cls, plan_dir: Optional[str], filter: str) -> Plan:
        """Load the plan file from a directory.

        Args:
            plan_dir: Directory containing the plan file; the current working
                directory is used when falsy.
            filter: Comma-separated test names to load; all tests are loaded
                when falsy.

        Returns:
            The loaded plan.

        Raises:
            ValueError: If the plan file does not contain a YAML mapping.
            KeyError: If a required section is missing or ``filter`` names a
                test that is not in the plan.
        """
        plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME)
        plan = cls._load_yaml(plan_path)

        # An empty or non-mapping YAML document would otherwise fail below with
        # an unhelpful "object is not subscriptable" error.
        if not isinstance(plan, dict):
            raise ValueError(f"Test plan at {plan_path} must be a YAML mapping")

        return cls(
            evaluator_factory=EvaluatorFactory(config=plan["evaluator"]),
            target_factory=TargetFactory(config=plan["target"]),
            tests=cls._load_tests(plan["tests"], filter),
        )

    @staticmethod
    def _load_yaml(path: str) -> dict:
        """Parse the YAML file at ``path`` with the safe loader."""
        with open(path, encoding="utf-8") as stream:
            return yaml.safe_load(stream)

    @staticmethod
    def _load_tests(test_config: dict[str, dict], filter: str) -> list[Test]:
        """Build ``Test`` objects from the plan's ``tests`` mapping.

        Args:
            test_config: Mapping of test name to that test's settings.
            filter: Comma-separated test names to keep; all tests when falsy.

        Raises:
            KeyError: If ``filter`` names a test that is not in ``test_config``.
        """
        names = Plan._parse_filter(filter) if filter else list(test_config)

        unknown = [name for name in names if name not in test_config]
        if unknown:
            raise KeyError(
                f"Unknown test name(s) in filter: {', '.join(unknown)}; "
                f"available tests: {', '.join(test_config)}"
            )

        tests = []
        for name in names:
            config = test_config[name]
            tests.append(
                Test(
                    name=name,
                    steps=config["steps"],
                    expected_results=config["expected_results"],
                    initial_prompt=config.get("initial_prompt"),
                    max_turns=config.get("max_turns", defaults.MAX_TURNS),
                    hook=config.get("hook"),
                )
            )

        return tests

    @staticmethod
    def _parse_filter(filter: str) -> list[str]:
        """Split a comma-separated filter into names, dropping empty entries.

        Empty entries arise from input such as ``"a,,b"`` or a trailing comma
        and can never match a test name.
        """
        return [name for name in (part.strip() for part in filter.split(",")) if name]

    @staticmethod
    def init_plan(plan_dir: Optional[str]) -> str:
        """Write the starter plan file and return its path.

        Args:
            plan_dir: Directory to create the plan in; the current working
                directory is used when falsy.

        Raises:
            FileExistsError: If a plan file already exists at that location.
        """
        plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME)

        # Exclusive-create mode makes "check and create" a single step, so an
        # existing plan can never be overwritten by a racing process.
        try:
            stream = open(plan_path, "x", encoding="utf-8")
        except FileExistsError:
            raise FileExistsError(f"Test plan already exists at {plan_path}") from None

        with stream:
            yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False)

        return plan_path
class Runner:
    """Run every test of a plan on a thread pool and report the outcome.

    Attributes:
        plan: The plan whose tests are run.
        work_dir: Directory for run artefacts (defaults to the current directory).
        num_tests: Number of tests in the plan.
        verbose: Whether per-test results and token counts are logged.
        num_threads: Size of the thread pool.
        results: Mapping of test name to its result (``None`` until finished).
        num_failed: Number of tests whose result reported ``success is False``.
        evaluator_input_token_counts: Input token count of each evaluator.
        evaluator_output_token_counts: Output token count of each evaluator.
    """

    def __init__(
        self,
        plan: Plan,
        verbose: bool,
        num_threads: Optional[int],
        work_dir: Optional[str],
    ):
        """
        Args:
            plan: The plan to run.
            verbose: Enable detailed logging at the end of the run.
            num_threads: Thread pool size; when falsy, the smaller of the number
                of tests and ``MAX_NUM_THREADS`` is used.
            work_dir: Working directory; the current directory when falsy.
        """
        # Local import so this block does not depend on the module's import list.
        import threading

        self.plan = plan
        self.work_dir = work_dir if work_dir else os.getcwd()
        self.num_tests = len(self.plan.tests)
        self.verbose = verbose
        self.num_threads = num_threads
        if not self.num_threads:
            self.num_threads = min(self.num_tests, MAX_NUM_THREADS)
        self.results = {test.name: None for test in self.plan.tests}
        self.num_failed = 0
        self.evaluator_input_token_counts = []
        self.evaluator_output_token_counts = []
        # Guards the counters and collections that worker threads update.
        self._lock = threading.Lock()

    def run(self) -> int:
        """Run all tests, write the markdown summary and return the failure count.

        Any exception raised by a test propagates to the caller.
        """
        self._log_run_start()

        self.start_time = time.time()
        with Progress(transient=True) as self.progress:
            self.tracker = self.progress.add_task("running...", total=self.num_tests)

            # Honour the configured pool size. ThreadPoolExecutor rejects
            # max_workers <= 0, which an empty plan would otherwise produce.
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=max(1, self.num_threads)
            ) as executor:
                futures = [
                    executor.submit(self.run_test, test) for test in self.plan.tests
                ]
                for future in concurrent.futures.as_completed(futures):
                    # result() re-raises whatever the worker raised.
                    future.result()

        self._log_run_end()

        create_markdown_summary(
            self.work_dir, self.plan.tests, list(self.results.values()), self.verbose
        )

        return self.num_failed

    def run_test(self, test):
        """Evaluate one test against a fresh target and record its result."""
        target = self.plan.target_factory.create()
        evaluator = self.plan.evaluator_factory.create(
            test=test,
            target=target,
            work_dir=self.work_dir,
        )

        result = evaluator.run()

        self.progress.update(self.tracker, advance=1)

        # "+= 1" is a read-modify-write and is not atomic across threads.
        with self._lock:
            if result.success is False:
                self.num_failed += 1
            self.results[test.name] = result
            self.evaluator_input_token_counts.append(evaluator.input_token_count)
            self.evaluator_output_token_counts.append(evaluator.output_token_count)

    def _log_run_start(self):
        logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.")

    def _log_run_end(self):
        self._log_pass_fail_count()
        logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.")
        if self.verbose:
            self._log_test_result()
            self._log_evaluator_token_io()

    def _log_test_result(self):
        """Log one PASSED/FAILED line per test."""
        for result in self.results.values():
            logger_func = logger.info if result.success else logger.error
            logger_func(
                f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}",
            )

    def _log_pass_fail_count(self):
        """Log the overall pass/fail tally (as an error when anything failed)."""
        passed_count = self.num_tests - self.num_failed
        status_str = (
            f"[red]{passed_count} passed, {self.num_failed} failed."
            if self.num_failed
            else f"[green]{self.num_tests} passed."
        )
        logger_func = logger.error if self.num_failed else logger.info
        logger_func(status_str)

    def _log_evaluator_token_io(self):
        """Log the total evaluator input and output token counts."""
        logger.info(
            f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}"
        )
        logger.info(
            f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}"
        )
def create_markdown_summary(
    work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool
) -> None:
    """Render the test summary template and write it into ``work_dir``.

    The output file is named after the template with its last extension
    removed.

    Args:
        work_dir: Directory the summary is written to.
        tests: The tests that were run.
        test_results: The results, in the same order as ``tests`` (the template
            pairs them with ``zip``).
        verbose: When true, log where the summary was written.
    """
    # Jinja template names always use "/" as the separator; os.path.join would
    # produce a backslash on Windows and the template would not be found.
    template = jinja_env.get_template(f"{_TEMPLATE_ROOT}/{_TEMPLATE_FILE_NAME}")

    summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0])

    rendered = template.render(tests=tests, results=test_results, zip=zip)

    # Explicit UTF-8: the rendered conversation may contain characters that
    # the platform's default encoding cannot represent.
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(rendered)

    if verbose:
        logger.info(f"Summary available at {summary_path}")
+# SPDX-License-Identifier: Apache-2.0 + +from .base_target import BaseTarget +from .boto3_target import Boto3Target +from .target_factory import TargetFactory + +__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/agenteval/targets/base_target.py new file mode 100644 index 0000000..f8fbaa8 --- /dev/null +++ b/stepfunctions/agenteval/targets/base_target.py @@ -0,0 +1,27 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from abc import ABC, abstractmethod + +from agenteval import TargetResponse + + +class BaseTarget(ABC): + """The `BaseTarget` abstract base class defines the common interface for target + classes. + """ + + @abstractmethod + def invoke(self, prompt: str) -> TargetResponse: + """Invoke the target with a prompt and return a response as a string. + + Args: + prompt: The prompt string to pass to the target. + + Returns: + A TargetResponse object containing the target's response string and + any trace data (if applicable). 
class BedrockAgentTarget(Boto3Target):
    """Target that talks to a Bedrock agent through ``invoke_agent``.

    A random session id is generated once per instance and sent with every
    request.
    """

    def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs):
        """
        Args:
            bedrock_agent_id: Sent as ``agentId``.
            bedrock_agent_alias_id: Sent as ``agentAliasId``.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)
        self._bedrock_agent_id = bedrock_agent_id
        self._bedrock_agent_alias_id = bedrock_agent_alias_id
        self._session_id: str = str(uuid.uuid4())

    def invoke(self, prompt: str) -> TargetResponse:
        """Send ``prompt`` to the agent and collect the streamed reply.

        Returns:
            A ``TargetResponse`` whose ``response`` is the decoded completion and
            whose ``data["bedrock_agent_trace"]`` lists the trace events.
        """
        args = {
            "agentId": self._bedrock_agent_id,
            "agentAliasId": self._bedrock_agent_alias_id,
            "sessionId": self._session_id,
            "inputText": prompt,
            "enableTrace": True,
        }

        response = self.boto3_client.invoke_agent(**args)

        payload_parts = []
        trace_data = []

        for event in response["completion"]:
            chunk = event.get("chunk")
            event_trace = event.get("trace")
            if chunk:
                payload_parts.append(chunk.get("bytes") or b"")
            if event_trace:
                trace_data.append(event_trace.get("trace"))

        # Decode once over the joined payload: decoding chunk by chunk would
        # fail if a multi-byte character were split across two chunks, and
        # repeated string concatenation is quadratic.
        completion = b"".join(payload_parts).decode()

        return TargetResponse(
            response=completion, data={"bedrock_agent_trace": trace_data}
        )
class BedrockKnowledgeBaseTarget(Boto3Target):
    """Target that queries a Bedrock knowledge base via ``retrieve_and_generate``.

    The session id returned by each call is remembered and sent with the next
    one.
    """

    def __init__(self, knowledge_base_id: str, model_id: str, **kwargs):
        """
        Args:
            knowledge_base_id: Sent as ``knowledgeBaseId``.
            model_id: Foundation model id, expanded into a model ARN using the
                client's region.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)
        region = self.boto3_client.meta.region_name
        self._knowledge_base_id = knowledge_base_id
        self._model_arn = f"arn:aws:bedrock:{region}::foundation-model/{model_id}"
        # No session exists until the first response supplies one.
        self._session_id: str = None

    def invoke(self, prompt: str) -> TargetResponse:
        """Ask the knowledge base ``prompt`` and return the text with citations."""
        knowledge_base_config = {
            "knowledgeBaseId": self._knowledge_base_id,
            "modelArn": self._model_arn,
        }
        request = {
            "input": {"text": prompt},
            "retrieveAndGenerateConfiguration": {
                "type": "KNOWLEDGE_BASE",
                "knowledgeBaseConfiguration": knowledge_base_config,
            },
        }
        if self._session_id:
            request["sessionId"] = self._session_id

        result = self.boto3_client.retrieve_and_generate(**request)

        answer = result["output"]["text"]
        cited = result["citations"]
        self._session_id = result["sessionId"]

        return TargetResponse(
            response=answer, data={"bedrock_knowledgebase_citations": cited}
        )
class Boto3Target(BaseTarget):
    """Base class for targets reached through a `boto3` client.

    Attributes:
        boto3_client (BaseClient): The `boto3` client built at construction.
    """

    def __init__(
        self,
        boto3_service_name: str,
        aws_profile: Optional[str] = None,
        aws_region: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        max_retry: int = _DEFAULT_MAX_RETRY,
    ):
        """
        Create the client for the given service.

        Args:
            boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`).
            aws_profile (str, optional): The AWS profile name.
            aws_region (str, optional): The AWS region.
            endpoint_url (str, optional): The endpoint URL for the AWS service.
            max_retry (int, optional): The maximum number of retry attempts.
        """
        client_options = {
            "boto3_service_name": boto3_service_name,
            "aws_profile": aws_profile,
            "aws_region": aws_region,
            "endpoint_url": endpoint_url,
            "max_retry": max_retry,
        }
        self.boto3_client = create_boto3_client(**client_options)
class SageMakerEndpointTarget(Boto3Target):
    """Target that invokes a SageMaker endpoint with a JSON request body.

    The prompt is written into the request body at ``input_path`` and the reply
    is read from the response body at ``output_path``; both are parsed with
    ``jsonpath_ng``.
    """

    def __init__(
        self,
        endpoint_name: str,
        request_body: dict,
        input_path: str,
        output_path: str,
        custom_attributes: Optional[str] = None,
        target_model: Optional[str] = None,
        target_variant: Optional[str] = None,
        target_container_hostname: Optional[str] = None,
        inference_component_name: Optional[str] = None,
        **kwargs
    ):
        """
        Args:
            endpoint_name: Sent as ``EndpointName``.
            request_body: Template for the JSON request body.
            input_path: JSONPath of the prompt inside the request body.
            output_path: JSONPath of the reply inside the response body.
            custom_attributes: Optional ``CustomAttributes`` value.
            target_model: Optional ``TargetModel`` value.
            target_variant: Optional ``TargetVariant`` value.
            target_container_hostname: Optional ``TargetContainerHostname`` value.
            inference_component_name: Optional ``InferenceComponentName`` value.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        # Local import so this block does not depend on the module's import list.
        import copy

        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)

        # Work on a private copy: the prompt is written into this structure on
        # every call, and the caller's dict may be shared by several targets.
        self._request_body = copy.deepcopy(request_body)
        self._input_jp_expr = parse(input_path)
        self._output_jp_expr = parse(output_path)

        self._args = self._create_base_args(
            endpoint_name,
            custom_attributes,
            target_model,
            target_variant,
            target_container_hostname,
            inference_component_name,
        )

    @staticmethod
    def _create_base_args(
        endpoint_name: str,
        custom_attributes: Optional[str],
        target_model: Optional[str],
        target_variant: Optional[str],
        target_container_hostname: Optional[str],
        inference_component_name: Optional[str],
    ) -> dict:
        """Build the fixed ``invoke_endpoint`` arguments, omitting unset options."""
        optional_args = {
            "CustomAttributes": custom_attributes,
            "TargetModel": target_model,
            "TargetVariant": target_variant,
            "TargetContainerHostname": target_container_hostname,
            "InferenceComponentName": inference_component_name,
        }

        args = {
            "EndpointName": endpoint_name,
            "ContentType": _CONTENT_TYPE,
            "Accept": _ACCEPT,
        }
        args.update(
            {key: value for key, value in optional_args.items() if value is not None}
        )

        return args

    def _update_request(self, prompt: str) -> None:
        """Write ``prompt`` into the request body and re-serialise it as ``Body``."""
        self._input_jp_expr.update(self._request_body, prompt)
        self._args["Body"] = json.dumps(self._request_body)

    def _query_response(self, response_body: dict) -> str:
        """Return the first value matched by the output path.

        Raises:
            IndexError: If the output path matches nothing in ``response_body``.
        """
        matches = self._output_jp_expr.find(response_body)
        if not matches:
            raise IndexError(
                "output_path did not match any value in the endpoint response"
            )
        return matches[0].value

    def invoke(self, prompt: str) -> TargetResponse:
        """Invoke the endpoint with ``prompt`` and return the extracted reply."""
        self._update_request(prompt)

        response = self.boto3_client.invoke_endpoint(**self._args)

        response_body = json.loads(response.get("Body").read())

        return TargetResponse(response=self._query_response(response_body))
class TargetFactory(BaseModel):
    """Build target instances from the target section of a test plan.

    Attributes:
        config: Target settings. ``"type"`` selects the target class; all other
            entries become constructor keyword arguments.
    """

    config: dict

    def create(self) -> BaseTarget:
        """Instantiate the configured target class."""
        target_cls = self._get_target_class()

        constructor_kwargs = {}
        for key, value in self.config.items():
            if key == "type":
                continue
            constructor_kwargs[key] = value

        return target_cls(**constructor_kwargs)

    def _get_target_class(self) -> type[BaseTarget]:
        """Resolve ``config["type"]`` to a class.

        Names registered in ``_TARGET_MAP`` map to built-in targets; anything
        else is passed to ``import_class`` with ``BaseTarget`` as the required
        parent class.
        """
        if self.config["type"] not in _TARGET_MAP:
            return import_class(self.config["type"], parent_class=BaseTarget)

        return _TARGET_MAP[self.config["type"]]
{{ result }} +{% endfor -%} + + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja new file mode 100644 index 0000000..832ba37 --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja @@ -0,0 +1,5 @@ +Here is the step: + + +{{ step }} + \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja new file mode 100644 index 0000000..79ad0df --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja @@ -0,0 +1,13 @@ +Here are the steps and conversation: + + +{% for step in steps -%} +{{ loop.index }}. {{ step }} +{% endfor -%} + + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja new file mode 100644 index 0000000..79ad0df --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja @@ -0,0 +1,13 @@ +Here are the steps and conversation: + + +{% for step in steps -%} +{{ loop.index }}. 
{{ step }} +{% endfor -%} + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja new file mode 100644 index 0000000..22cace3 --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja @@ -0,0 +1,12 @@ +You are a quality assurance engineer evaluating a conversation between a USER and an AGENT. + +Your job is to analyze the conversation in tags and a list of expected results +in tags. + +You will classify the conversation into the following categories: + +- A: All of the expected results can be observed in the conversation. +- B: Not all of the expected results can be observed in the conversation. + +Please think hard about the response in tags before providing only the category letter +within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja new file mode 100644 index 0000000..d0e8e23 --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja @@ -0,0 +1,13 @@ +You are role playing as a USER in a conversation with an AGENT. + +You will be given a step that is wrapped in tags. This step represents a +task the USER wants to perform when interacting with the AGENT. + +Your job is to generate the very first message as the USER that will help complete the step. + +Make sure this message is concise and to the point. + +Do not provide any information if it is expected that the AGENT will eventually ask for it. + +Please think hard about the response in tags before providing the message +within tags.
\ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja new file mode 100644 index 0000000..7bb8e6b --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja @@ -0,0 +1,13 @@ +You are a quality assurance engineer evaluating a conversation between a USER and an AGENT. + +You will be given an ordered list of steps wrapped in tags. Each step represents a task +that the USER wants to perform when interacting with the AGENT. + +Your job is to analyze the running conversation in tags and classify it into the following +categories: + +- A: The USER has attempted all the steps. +- B: The USER has not yet attempted all the steps. + +Please think hard about the response in tags before providing only the category letter +within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja new file mode 100644 index 0000000..e670420 --- /dev/null +++ b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja @@ -0,0 +1,15 @@ +You are role playing as a USER in a conversation with an AGENT. + +You will be given an ordered list of steps wrapped in tags. Each step represents +a task that the USER wants to perform when interacting with the AGENT. + +Using the list of steps, your job is to analyze the running conversation in the + tags and generate the next appropriate response as the USER. + +Do not include any information from a step unless the AGENT asks for it. + +If the AGENT was unable to help or did not understand the last request, just move on to +the next step. Do not attempt to rephrase the request in the next response as the USER.
+ +Please think hard about the response in tags before providing the response +within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja new file mode 100644 index 0000000..a624303 --- /dev/null +++ b/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja @@ -0,0 +1,49 @@ +# Test Summary +--- +This document provides a summary of the tests executed by Agent Evaluation. + +> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors. **Please thoroughly examine the results below prior to deciding whether to implement an agent.** +--- +## Tests +{% for test, result in zip(tests, results) -%} +- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) +{% endfor %} + +--- + + +{% for test, result in zip(tests, results) -%} +## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} + +**Steps** +{% for step in test.steps -%} +{{ loop.index }}. {{ step }} +{% endfor %} + +**Expected results** +{% for result in test.expected_results -%} +{{ loop.index }}. {{ result }} +{% endfor %} + +**Conversation** +``` +{% for sender, message in result.conversation -%} +[{{ sender }}] {{ message }} +{% endfor -%} +``` + +**Result** +{{ result.result }} + +**Reasoning** +``` +{{ result.reasoning }} +``` + +--- +{% endfor %} + + + + + diff --git a/stepfunctions/agenteval/test.py b/stepfunctions/agenteval/test.py new file mode 100644 index 0000000..695f2fe --- /dev/null +++ b/stepfunctions/agenteval/test.py @@ -0,0 +1,29 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
class Test(BaseModel, validate_assignment=True):
    """A test case for an agent.

    Attributes:
        name: Name of the test.
        steps: List of steps to perform for the test.
        expected_results: List of expected results for the test.
        initial_prompt: Optional initial prompt.
        max_turns: Maximum number of turns allowed for the test.
        hook: The module path to an evaluation hook.
    """

    # Do not collect as a test: the class name starts with "Test", which test
    # collectors (presumably pytest) would otherwise pick up.
    __test__ = False

    name: str
    steps: list[str]
    expected_results: list[str]
    initial_prompt: Optional[str] = None
    max_turns: int
    hook: Optional[str] = None
_TRACE_DIR = "agenteval_traces"


class Trace:
    """Capture the steps taken during an evaluation and persist them as JSON.

    Designed to be used as a context manager: entering the context records
    the start time; leaving it (normally or because of an exception) records
    the end time and writes the trace to
    ``<work_dir>/agenteval_traces/<test_name>.json``.

    Attributes:
        test_name (str): Name of the test.
        trace_dir (str): Directory to store the trace.
        start_time (datetime): Start time of the trace (UTC), `None` until
            the context is entered.
        end_time (datetime): End time of the trace (UTC), `None` until the
            context is exited.
        steps (list): List of steps in the trace, in insertion order.
    """

    def __init__(self, test_name: str, work_dir: str):
        """Initialize the trace handler.

        Args:
            test_name (str): Name of the test; also used as the stem of the
                trace file name.
            work_dir (str): Directory under which the trace directory is
                created.
        """
        self.test_name = test_name
        self.trace_dir = os.path.join(work_dir, _TRACE_DIR)
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None
        self.steps: list = []

    def __enter__(self):
        """Record the start time and return the trace itself."""
        self.start_time = datetime.now(timezone.utc)
        return self

    def __exit__(self, *exc):
        """Record the end time and dump the trace.

        Returns `None`, so an exception raised inside the ``with`` body is
        not suppressed; the trace is written either way.
        """
        self.end_time = datetime.now(timezone.utc)
        self._dump_trace()

    def _dump_trace(self):
        """Dump the trace to a JSON file, creating the directory if needed."""
        os.makedirs(self.trace_dir, exist_ok=True)

        # NOTE(review): `test_name` is used verbatim as the file stem; a name
        # containing path separators would escape `trace_dir` - confirm that
        # callers only pass plain names.
        trace_path = os.path.join(self.trace_dir, f"{self.test_name}.json")

        # Explicit encoding keeps the output independent of the platform's
        # default locale encoding.
        with open(trace_path, "w", encoding="utf-8") as f:
            # `default=str` stringifies datetimes and any other value that
            # is not natively JSON-serializable.
            json.dump(self._get_trace(), f, default=str)

    def _get_trace(self) -> dict:
        """Return the trace as a dictionary ready for JSON serialization."""
        return {
            "test_name": self.test_name,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "steps": self.steps,
        }

    def add_step(self, step_name: Optional[str] = None, **kwargs):
        """Add a step to the trace.

        Args:
            step_name (str, optional): The name of the step. Defaults to
                the name of the caller function.
            **kwargs: Extra key/value pairs stored on the step. They are
                applied after the built-in keys, so a ``timestamp`` keyword
                overrides the generated timestamp.
        """
        # `context=0` skips reading source lines for every frame; only the
        # caller's function name is needed.
        step_name = step_name or inspect.stack(context=0)[1].function
        step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name}
        step.update(kwargs)
        self.steps.append(step)
_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"]


def import_class(module_path: str, parent_class: Optional[type] = None) -> type:
    """Import a class from a dotted ``<module>.<ClassName>`` path.

    Args:
        module_path (str): Dotted path whose last component is the class
            name and whose remainder is the module to import. The final
            module component must end with one of the allowed suffixes
            (``_hook`` or ``_target``).
        parent_class (type, optional): When given, the imported class must
            be a subclass of it.

    Returns:
        type: The imported class.

    Raises:
        ValueError: If `module_path` has no ``.`` separator or the module
            name does not end with an allowed suffix.
        TypeError: If `parent_class` is given and the imported attribute is
            not a subclass of it.
    """
    name, separator, class_name = module_path.rpartition(".")
    if not separator:
        # Without this guard the failure is an opaque tuple-unpacking error.
        raise ValueError(
            f"Invalid module path: {module_path!r} "
            "(expected '<module>.<ClassName>')"
        )

    # make sure module name ends with one of the allowed suffixes; this runs
    # before the import so that arbitrary modules are never loaded
    _validate_module_name(name.rpartition(".")[2])

    module = import_module(name)
    cls = getattr(module, class_name)

    if parent_class is not None:
        # make sure the imported class is a subclass
        _validate_subclass(cls, parent_class)

    return cls


def _validate_module_name(name: str) -> None:
    """Raise `ValueError` unless `name` ends with an allowed suffix."""
    if not name.endswith(tuple(_ALLOWED_MODULE_NAME_SUFFIX)):
        raise ValueError(f"Invalid module name: {name}")


def _validate_subclass(child_class: type, parent_class: type) -> None:
    """Raise `TypeError` unless `child_class` is a subclass of `parent_class`."""
    if not isinstance(child_class, type):
        # `issubclass` would raise a TypeError here as well, but with a
        # message that does not name the offending object.
        raise TypeError(
            f"{child_class!r} is not a {parent_class.__name__} subclass"
        )
    if not issubclass(child_class, parent_class):
        raise TypeError(
            f"{child_class.__name__} is not a {parent_class.__name__} subclass"
        )
+ # Account/Region-dependent features and context lookups will not work, + # but a single synthesized template can be deployed anywhere. + + # Uncomment the next line to specialize this stack for the AWS Account + # and Region that are implied by the current CLI configuration. + + #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), + + # Uncomment the next line if you know exactly what Account and Region you + # want to deploy the stack to. */ + + env=cdk.Environment(region='us-east-1'), + + # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html + ) + +app.synth() diff --git a/stepfunctions/cdk.json b/stepfunctions/cdk.json new file mode 100644 index 0000000..5553cf4 --- /dev/null +++ b/stepfunctions/cdk.json @@ -0,0 +1,70 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__init__.py", + "**/__pycache__", + "tests" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, + "@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + 
"@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, + "@aws-cdk/aws-kms:aliasNameRef": true, + "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, + "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, + "@aws-cdk/aws-efs:denyAnonymousAccess": true, + "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, + "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, + "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, + "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, + "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, + "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, + "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, + "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, + "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, + "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, + "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, + "@aws-cdk/aws-eks:nodegroupNameAttribute": true, + "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, + 
"@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, + "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false, + "@aws-cdk/aws-s3:keepNotificationInImportedBucket": false + } +} diff --git a/stepfunctions/example_prompt_jsons/prompts_scenarios.json b/stepfunctions/example_prompt_jsons/prompts_scenarios.json new file mode 100644 index 0000000..936a47a --- /dev/null +++ b/stepfunctions/example_prompt_jsons/prompts_scenarios.json @@ -0,0 +1,158 @@ +{ "agent_id" : "ABCDEFGHIJ", + "agent_name": "agent_name", + "prompts": [ + { + "id":"1", + "prompt": "You are a customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. Your role is to:\n1) Create a profile of the user by asking a few clarifying questions\n2) Generate a personalised recommendation to the user to solve their problem\n\nYou are empathetic and compassionate. You approach each question with care. You understand that rising energy prices can be a source of strain.\n\nYou are pragmatic. Do not make information up, if you do not know the answer - please be honest.\nBe friendly. Keep a conversational tone. Ask the customer questions one at a time.\nOnly ask a maximum of five questions before giving your personalised recommendation." + }, + + { + "id":"2", + "prompt": "You are a seasoned energy consultant with two decades of experience working for top UK energy companies. Your task is to:\n1)Build a customer profile through a brief series of targeted questions\n2)Offer a tailored solution to address the customer's energy-related concerns\nApproach each interaction with empathy and understanding, recognising the potential stress caused by increasing energy costs. Maintain a practical mindset and prioritise accuracy over speculation. Engage in a friendly, conversatinal manner, posing questions one at a time. 
Limit your inquiry to no more than five questions before presenting your customised recommendation" + }, + { + "id":"3", + "prompt": "You're an expert energy consultant who has spent two decades with the UK's top energy providers. Your mission is to:\n1)Craft a user profile using a few well-chosen questions\n2)Deliver a custom-tailored recommendation to address the user's energy concerns\n\nApproach each interaction with genuine care and understanding, recognizing the potential anxiety surrounding energy costs. Be practical in your approach, and honest about any limitations in your knowledge. Maintain a friendly, conversational tone throughout, asking questions one at a time. Limit yourself to a maximum of five questions before presenting your personalized recommendation" + } + ], + "customer_profiles":[ + { + "id": 1, + "profile": "Single Professional", + "household_size": 1, + "demography": "30-year-old urban professional", + "appliances": [ + "Laptop", "smartphone", "LED TV", "microwave", + "energy-efficient washer and dryer", "smart thermostat", + "electric car charger" + ], + "energy_usage": "Moderate, primarily in the evening and weekends", + "tarrif": "standard variable", + "payment_type": "on demand" + }, + { + "id": 2, + "profile": "Young Couple", + "household_size": 2, + "demography": "25-35 years old, urban", + "appliances": [ + "Two laptops", "two smartphones", "LED TV", "dishwasher", + "energy-efficient refrigerator", "smart home devices", + "washer and dryer" + ], + "energy_usage": "Moderate, spread throughout the day", + "tarrif": "dual fuel", + "payment_type": "direct debit" + }, + { + "id": 3, + "profile": "Family with Young Children", + "household_size": 4, + "demography": "35-45 years old parents with two children under 10, suburban", + "appliances": [ + "Multiple TVs", "gaming console", "desktop computer", "laptops", + "smartphones", "refrigerator", "dishwasher", "washer", "dryer", + "electric oven", "microwave", "air conditioning" + ], + 
"energy_usage": "High, spread throughout the day, peaks in the evening", + "tarrif": "pre-payment", + "payment_type":"pay bill in full within 14 days" + }, + { + "id": 4, + "profile": "Single Senior Citizen", + "household_size": 1, + "demography": "70-year-old retired individual, rural", + "appliances": [ + "TV", "radio", "landline phone", "microwave", "refrigerator", + "washer", "dryer", "medical equipment (e.g., oxygen concentrator)" + ], + "energy_usage": "Low to moderate, primarily during the day", + "tarrif":"standard variable", + "payment_type":"pay bill in full by Direct Debit" + }, + { + "id": 5, + "profile": "Roommates Sharing Apartment", + "household_size": 3, + "demography": "20-30 years old, urban", + "appliances": [ + "Three laptops", "three smartphones", "shared TV", + "shared kitchen appliances (microwave, refrigerator, oven)", + "washer and dryer" + ], + "energy_usage": "Moderate to high, varies throughout the day", + "tarrif": "dual fuel", + "payment_type":"on demand" + }, + { + "id": 6, + "profile": "Large Family", + "household_size": 6, + "demography": "40-50 years old parents with four children, suburban", + "appliances": [ + "Multiple TVs", "gaming consoles", "multiple laptops and smartphones", + "refrigerator", "two dishwashers", "washer and dryer", "microwave", + "electric oven", "central air conditioning" + ], + "energy_usage": "Very high, continuous throughout the day", + "tarrif":"pre-payment", + "payment_type":"direct debit" + }, + { + "id": 7, + "profile": "Eco-Conscious Couple", + "household_size": 2, + "demography": "30-40 years old, urban", + "appliances": [ + "Energy-efficient appliances", "solar panels", "electric car charger", + "LED lighting", "smart home devices", "minimalistic approach to electronic devices" + ], + "energy_usage": "Low to moderate, optimized for energy savings", + "tarrif": "standard variable", + "payment_type":"pay bill in full within 14 days" + }, + { + "id": 8, + "profile": "Student Living Alone", + 
"household_size": 1, + "demography": "20-year-old college student, urban", + "appliances": [ + "Laptop", "smartphone", "small TV", "microwave", "mini-fridge", + "electric kettle", "shared washer and dryer in building" + ], + "energy_usage": "Low, primarily in the evening and weekends", + "tarrif": "dual fuel", + "payment_type":"pay bill in full by Direct Debit" + }, + { + "id": 9, + "profile": "Retired Couple", + "household_size": 2, + "demography": "65-75 years old, rural", + "appliances": [ + "TV", "desktop computer", "smartphones", "refrigerator", + "microwave", "washer and dryer", "space heaters", + "medical equipment (e.g., CPAP machine)" + ], + "energy_usage": "Moderate, primarily during the day", + "tarrif": "pre-payment", + "payment_type":"on demand" + }, + { + "id": 10, + "profile": "Home-Based Business Owner", + "household_size": 1, + "demography": "45-year-old entrepreneur, suburban", + "appliances": [ + "Desktop computer", "laptop", "multiple smartphones", "printer", + "fax machine", "energy-efficient refrigerator", "microwave", + "washer and dryer" + ], + "energy_usage": "High, continuous throughout the day", + "tarrif": "standard variable", + "payment_type":"direct debit" + } + + ] +} \ No newline at end of file diff --git a/stepfunctions/layers/agent-evaluation/requirements.txt b/stepfunctions/layers/agent-evaluation/requirements.txt new file mode 100644 index 0000000..8885a87 --- /dev/null +++ b/stepfunctions/layers/agent-evaluation/requirements.txt @@ -0,0 +1 @@ +agent-evaluation \ No newline at end of file diff --git a/stepfunctions/layers/jinja2/requirements.txt b/stepfunctions/layers/jinja2/requirements.txt new file mode 100644 index 0000000..1c579e7 --- /dev/null +++ b/stepfunctions/layers/jinja2/requirements.txt @@ -0,0 +1 @@ +jinja2 \ No newline at end of file diff --git a/stepfunctions/layers/pydantic/requirements.txt b/stepfunctions/layers/pydantic/requirements.txt new file mode 100644 index 0000000..59cc1e9 --- /dev/null +++ 
b/stepfunctions/layers/pydantic/requirements.txt @@ -0,0 +1 @@ +pydantic \ No newline at end of file diff --git a/stepfunctions/layers/pyyaml/requirements.txt b/stepfunctions/layers/pyyaml/requirements.txt new file mode 100644 index 0000000..4818cc5 --- /dev/null +++ b/stepfunctions/layers/pyyaml/requirements.txt @@ -0,0 +1 @@ +pyyaml \ No newline at end of file diff --git a/stepfunctions/requirements-dev.txt b/stepfunctions/requirements-dev.txt new file mode 100644 index 0000000..9270945 --- /dev/null +++ b/stepfunctions/requirements-dev.txt @@ -0,0 +1 @@ +pytest==6.2.5 diff --git a/stepfunctions/requirements.txt b/stepfunctions/requirements.txt new file mode 100644 index 0000000..c0a15e4 --- /dev/null +++ b/stepfunctions/requirements.txt @@ -0,0 +1,3 @@ +pathlib +aws-cdk-lib==2.155.0 +constructs>=10.0.0,<11.0.0 diff --git a/stepfunctions/source.bat b/stepfunctions/source.bat new file mode 100644 index 0000000..9e1a834 --- /dev/null +++ b/stepfunctions/source.bat @@ -0,0 +1,13 @@ +@echo off + +rem The sole purpose of this script is to make the command +rem +rem source .venv/bin/activate +rem +rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. +rem On Windows, this command just runs this batch file (the argument is ignored). +rem +rem Now we don't need to document a Windows command for activating a virtualenv. 
+ +echo Executing .venv\Scripts\activate.bat for you +.venv\Scripts\activate.bat diff --git a/stepfunctions/stepfunctions/__init__.py b/stepfunctions/stepfunctions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER new file mode 100644 index 0000000..a1b589e --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE new file mode 100644 index 0000000..67db858 --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE @@ -0,0 +1,175 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA new file mode 100644 index 0000000..1070391 --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA @@ -0,0 +1,74 @@ +Metadata-Version: 2.1 +Name: agent-evaluation +Version: 0.2.0 +Summary: A generative AI-powered framework for testing virtual agents. 
+Home-page: https://awslabs.github.io/agent-evaluation/ +Author: Amazon Web Services +Author-email: agent-evaluation-oss-core-team@amazon.com +License: Apache 2.0 +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: Topic :: Utilities +Classifier: Topic :: Software Development :: Testing +Classifier: License :: OSI Approved :: Apache Software License +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Requires-Python: >=3.9 +Description-Content-Type: text/markdown +License-File: LICENSE +License-File: NOTICE +Requires-Dist: pyyaml ~=6.0 +Requires-Dist: boto3 <2.0,>=1.34.20 +Requires-Dist: click ~=8.0 +Requires-Dist: pydantic <3.0,>=2.1.0 +Requires-Dist: rich <14.0,>=13.7.0 +Requires-Dist: jinja2 <4.0,>=3.1.3 +Requires-Dist: jsonpath-ng <2.0,>=1.6.1 +Provides-Extra: dev +Requires-Dist: flake8 ; extra == 'dev' +Requires-Dist: black ; extra == 'dev' +Requires-Dist: isort ; extra == 'dev' +Requires-Dist: pytest ; extra == 'dev' +Requires-Dist: pytest-cov ; extra == 'dev' +Requires-Dist: pytest-mock ; extra == 'dev' +Requires-Dist: mkdocs ; extra == 'dev' +Requires-Dist: mkdocs-material ; extra == 'dev' +Requires-Dist: mkdocstrings[python] ; extra == 'dev' +Requires-Dist: mkdocs-click ; extra == 'dev' +Requires-Dist: bandit ; extra == 'dev' +Requires-Dist: pip-audit ; extra == 'dev' + +![PyPI - Version](https://img.shields.io/pypi/v/agent-evaluation) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) +![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) +[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) +[![Code style: 
black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) + +# Agent Evaluation + +Agent Evaluation is a generative AI-powered framework for testing virtual agents. + +Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. + +## ✨ Key features + +- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. +- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. +- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. +- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. + +## 📚 Documentation + +To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) + +## 👏 Contributors + +Shout out to these awesome contributors: + + + + diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE new file mode 100644 index 0000000..616fc58 --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE @@ -0,0 +1 @@ +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD new file mode 100644 index 0000000..fcc2eac --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD @@ -0,0 +1,87 @@ +../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 +agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 +agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 +agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 +agent_evaluation-0.2.0.dist-info/RECORD,, +agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 +agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 +agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 +agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 +agenteval/__pycache__/__init__.cpython-311.pyc,, +agenteval/__pycache__/cli.cpython-311.pyc,, +agenteval/__pycache__/conversation.cpython-311.pyc,, +agenteval/__pycache__/defaults.cpython-311.pyc,, +agenteval/__pycache__/hook.cpython-311.pyc,, +agenteval/__pycache__/plan.cpython-311.pyc,, +agenteval/__pycache__/target_response.cpython-311.pyc,, +agenteval/__pycache__/test.cpython-311.pyc,, +agenteval/__pycache__/test_result.cpython-311.pyc,, +agenteval/__pycache__/trace.cpython-311.pyc,, +agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 +agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 
+agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 +agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 +agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, +agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, +agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, +agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 +agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 +agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, +agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, +agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, +agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 +agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 +agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 +agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 +agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 +agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 +agenteval/runner/__pycache__/__init__.cpython-311.pyc,, +agenteval/runner/__pycache__/runner.cpython-311.pyc,, +agenteval/runner/__pycache__/summary.cpython-311.pyc,, +agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 +agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 +agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 +agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 +agenteval/targets/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/__pycache__/base_target.cpython-311.pyc,, +agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, 
+agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, +agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 +agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 +agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, +agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 +agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 +agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, +agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 +agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 +agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 +agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, +agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 +agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 +agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, +agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, +agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 +agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 +agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 +agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 
+agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 +agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 +agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 +agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 +agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 +agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 +agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 +agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 +agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 +agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 +agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 +agenteval/utils/__pycache__/__init__.cpython-311.pyc,, +agenteval/utils/__pycache__/aws.cpython-311.pyc,, +agenteval/utils/__pycache__/imports.cpython-311.pyc,, +agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 +agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED new file mode 100644 index 0000000..e69de29 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL new file mode 100644 index 0000000..bab98d6 --- /dev/null +++ 
b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.43.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt new file mode 100644 index 0000000..6919bf1 --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +agenteval = agenteval.cli:cli diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt new file mode 100644 index 0000000..060c7ea --- /dev/null +++ b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt @@ -0,0 +1 @@ +agenteval diff --git a/stepfunctions/stepfunctions/agenteval/__init__.py b/stepfunctions/stepfunctions/agenteval/__init__.py new file mode 100644 index 0000000..cd7bf51 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/__init__.py @@ -0,0 +1,46 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
_LOG_LEVEL_ENV = "LOG_LEVEL"


def configure_logger():
    """Set up console logging for the package.

    Silences ``botocore`` below CRITICAL, then attaches a ``RichHandler``
    to this package's logger. The package log level is read from the
    ``LOG_LEVEL`` environment variable and falls back to ``logging.INFO``.
    """
    # suppress logs from botocore
    logging.getLogger("botocore").setLevel(logging.CRITICAL)

    # rich renders the level and traceback itself, so the formatter only
    # needs to emit the bare message
    rich_handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True)
    rich_handler.setFormatter(logging.Formatter("%(message)s", datefmt="[%X]"))

    package_logger = logging.getLogger(__name__)
    package_logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO))
    package_logger.addHandler(rich_handler)
def validate_directory(directory):
    """Check that *directory* exists and is both readable and writable.

    Args:
        directory: Path of the directory to check.

    Raises:
        NotADirectoryError: If the path is not an existing directory.
        PermissionError: If the directory is not readable and writable.
    """
    if not os.path.isdir(directory):
        raise NotADirectoryError(f"{directory} is not a directory")

    # a single access() call checks both permissions at once
    if not os.access(directory, os.R_OK | os.W_OK):
        raise PermissionError(f"No read/write permissions for {directory}")
_USER = "USER"
_AGENT = "AGENT"
_START_TURN_COUNT = 0


class Conversation:
    """Ordered record of the exchange between a user and an agent.

    Attributes:
        messages (list): ``(role, message)`` tuples in the order they were added.
        turns (int): The number of completed turns in the conversation.
    """

    def __init__(self):
        self.turns = _START_TURN_COUNT
        self.messages = []

    def __iter__(self):
        """Iterate over the recorded ``(role, message)`` tuples."""
        return iter(self.messages)

    def add_turn(self, user_message: str, agent_response: str):
        """Append one user/agent exchange and bump the turn counter by one.

        Args:
            user_message (str): The user's message.
            agent_response (str): The agent's response to the user's message.
        """
        self.messages.append((_USER, user_message))
        self.messages.append((_AGENT, agent_response))
        self.turns += 1
_DEFAULT_MAX_RETRY = 10
_BOTO3_SERVICE_NAME = "bedrock-runtime"


class BaseEvaluator(ABC):
    """Abstract base class defining the common interface for evaluators.

    Attributes:
        test (Test): The test case.
        target (BaseTarget): The target agent being evaluated.
        conversation (Conversation): Captures the interaction between a user and an agent.
        trace (Trace): Captures steps during evaluation.
        test_result (TestResult): The result of the test, set in `BaseEvaluator.run`.
        input_token_count (int): Running total of input tokens reported by the model.
        output_token_count (int): Running total of output tokens reported by the model.
        model_id (str): The ID of the Bedrock model used to run evaluation. If
            `provisioned_throughput_arn` is provided, it is used instead.
        bedrock_runtime_client (BaseClient): The client returned by
            `create_boto3_client` for the `bedrock-runtime` service.
    """

    def __init__(
        self,
        test: Test,
        target: BaseTarget,
        work_dir: str,
        model_id: str,
        provisioned_throughput_arn: Optional[str] = None,
        aws_profile: Optional[str] = None,
        aws_region: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        max_retry: int = _DEFAULT_MAX_RETRY,
    ):
        """Initialize the evaluator instance for a given `Test` and `Target`.

        Args:
            test (Test): The test case.
            target (BaseTarget): The target agent being evaluated.
            work_dir (str): The work directory.
            model_id (str): The ID of the Bedrock model used to run evaluation.
            provisioned_throughput_arn (str, optional): The ARN of the provisioned
                throughput. Takes precedence over `model_id` when given.
            aws_profile (str, optional): The AWS profile name.
            aws_region (str, optional): The AWS region.
            endpoint_url (str, optional): The endpoint URL for the AWS service.
            max_retry (int, optional): The maximum number of retry attempts.
        """
        self.test = test
        self.target = target
        self.conversation = Conversation()
        self.trace = Trace(work_dir=work_dir, test_name=test.name)
        self.test_result = None
        self.input_token_count = 0
        self.output_token_count = 0
        self.model_id = provisioned_throughput_arn or model_id
        self.bedrock_runtime_client = create_boto3_client(
            boto3_service_name=_BOTO3_SERVICE_NAME,
            aws_profile=aws_profile,
            aws_region=aws_region,
            endpoint_url=endpoint_url,
            max_retry=max_retry,
        )

    @abstractmethod
    def evaluate(self) -> TestResult:
        """Conduct a test.

        Returns:
            TestResult: The result of the test.
        """
        pass

    def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]:
        """Import the hook class named by `hook`, or return None when unset."""
        if not hook:
            return None
        return import_class(hook, parent_class=Hook)

    def invoke_model(self, request_body: dict) -> dict:
        """Invoke the Bedrock model through `bedrock_runtime_client`.

        The request dictionary is serialized to a JSON string before being
        passed to the `InvokeModel` API, and the token counters are updated
        from the response headers.

        Args:
            request_body (dict): The request payload as a dictionary.

        Returns:
            dict: The response from the model invocation.
        """
        payload = json.dumps(request_body)
        response = self.bedrock_runtime_client.invoke_model(
            modelId=self.model_id, body=payload
        )
        self._incr_token_counts(response)
        return response

    def _incr_token_counts(self, response: dict):
        """Add the token counts found in the response HTTP headers to the totals."""
        http_headers = response["ResponseMetadata"]["HTTPHeaders"]
        # a missing header counts as zero tokens
        input_tokens = http_headers.get("x-amzn-bedrock-input-token-count", 0)
        output_tokens = http_headers.get("x-amzn-bedrock-output-token-count", 0)

        self.input_token_count += int(input_tokens)
        self.output_token_count += int(output_tokens)

    def run(self) -> TestResult:
        """Run the evaluator within the trace context manager.

        When the test names a hook, its `pre_evaluate` is called before
        `evaluate` and its `post_evaluate` after it.

        Returns:
            TestResult: The result produced by `evaluate`.
        """
        hook_cls = self._get_hook_cls(self.test.hook)

        with self.trace:
            if hook_cls:
                hook_cls.pre_evaluate(self.test, self.trace)
            self.test_result = self.evaluate()
            if hook_cls:
                hook_cls.post_evaluate(self.test, self.test_result, self.trace)

        return self.test_result
logger = logging.getLogger(__name__)

_PROMPT_TEMPLATE_ROOT = "evaluators/claude_3"
_SYSTEM_PROMPT_DIR = "system"
_PROMPT_TEMPLATE_NAMES = [
    "generate_initial_prompt",
    "generate_user_response",
    "generate_test_status",
    "generate_evaluation",
]

# enable backwards-compatible StrEnum
try:
    from enum import StrEnum
except ImportError:
    from enum import Enum

    class StrEnum(str, Enum):
        pass


class TestStatusCategories(StrEnum):
    """Categories the model may return when asked for the test status."""

    ALL_STEPS_ATTEMPTED = "A"
    NOT_ALL_STEPS_ATTEMPTED = "B"


class EvaluationCategories(StrEnum):
    """Categories the model may return when asked to evaluate a conversation."""

    ALL_EXPECTED_RESULTS_OBSERVED = "A"
    NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B"


class Results(StrEnum):
    """Human-readable result messages stored on the `TestResult`."""

    MAX_TURNS_REACHED = "Maximum turns reached."
    ALL_EXPECTED_RESULTS_OBSERVED = (
        "All of the expected results can be observed in the conversation."
    )
    NOT_ALL_EXPECTED_RESULTS_OBSERVED = (
        "Not all of the expected results can be observed in the conversation."
    )


class Claude3Evaluator(BaseEvaluator):
    """Evaluator that drives and judges a conversation using a Claude 3 model."""

    def __init__(
        self,
        **kwargs,
    ):
        """Load the system/user prompt templates for every generation step.

        Args:
            **kwargs: Forwarded to `BaseEvaluator.__init__`; the model ID is
                fixed to `model_configs.MODEL_ID`.
        """
        super().__init__(model_id=model_configs.MODEL_ID, **kwargs)

        # Jinja template names always use forward slashes, regardless of the
        # host OS, so they are joined with "/" rather than os.path.join.
        self._prompt_template_map = {
            name: {
                "system": jinja_env.get_template(
                    f"{_PROMPT_TEMPLATE_ROOT}/{_SYSTEM_PROMPT_DIR}/{name}.jinja"
                ),
                "prompt": jinja_env.get_template(
                    f"{_PROMPT_TEMPLATE_ROOT}/{name}.jinja"
                ),
            }
            for name in _PROMPT_TEMPLATE_NAMES
        }

    @staticmethod
    def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple:
        """Pull the text of the first occurrence of each named element.

        Args:
            xml_data (str): Text containing `<name>...</name>` elements.
            element_names (list[str]): Element names to look up, in order.

        Returns:
            Tuple: One entry per requested name; the stripped element text,
            or None when the element is absent.
        """
        content = []
        for e in element_names:
            tag = re.escape(e)
            # The closing tag anchors the lazy group; without it the group
            # would always match the empty string.
            pattern = rf"<{tag}>(.*?)</{tag}>"
            match = re.search(pattern, xml_data, re.DOTALL)
            content.append(match.group(1).strip() if match else None)
        return tuple(content)

    def _generate(
        self,
        system_prompt: str,
        prompt: str,
        output_xml_element: str,
    ) -> tuple[str, str]:
        """Invoke the model and extract the requested element and its reasoning.

        Args:
            system_prompt (str): The rendered system prompt.
            prompt (str): The rendered user prompt.
            output_xml_element (str): Name of the XML element holding the output.

        Returns:
            tuple[str, str]: `(output, reasoning)`, where reasoning is the text
            of the `thinking` element. Either entry is None when the
            completion does not contain the element.
        """
        import copy  # local import: only needed to isolate the request body

        # Work on a private copy: REQUEST_BODY is module-level state shared by
        # every evaluator, and tests run concurrently in threads.
        request_body = copy.deepcopy(model_configs.REQUEST_BODY)
        request_body["system"] = system_prompt
        request_body["messages"][0]["content"][0]["text"] = prompt

        response = self.invoke_model(request_body=request_body)
        response_body = response.get("body").read()
        completion = json.loads(response_body)["content"][0]["text"]

        logger.debug(
            f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}"
        )

        output, reasoning = self._extract_content_from_xml(
            completion, [output_xml_element, "thinking"]
        )

        return output, reasoning

    def _generate_initial_prompt(self) -> str:
        """Ask the model for the first user message, based on the first step."""
        system_prompt = self._prompt_template_map["generate_initial_prompt"][
            "system"
        ].render()
        prompt = self._prompt_template_map["generate_initial_prompt"]["prompt"].render(
            step=self.test.steps[0]
        )

        initial_prompt, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="initial_prompt",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            initial_prompt=initial_prompt,
            reasoning=reasoning,
        )
        return initial_prompt

    def _generate_test_status(self) -> str:
        """Ask the model which test-status category the conversation is in."""
        system_prompt = self._prompt_template_map["generate_test_status"][
            "system"
        ].render()
        prompt = self._prompt_template_map["generate_test_status"]["prompt"].render(
            steps=self.test.steps, conversation=self.conversation
        )
        test_status, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="category",
        )
        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            test_status=test_status,
            reasoning=reasoning,
        )
        return test_status

    def _generate_evaluation(self) -> tuple[str, str]:
        """Ask the model to judge the conversation against the expected results.

        Returns:
            tuple[str, str]: The evaluation category and the model's reasoning.
        """
        system_prompt = self._prompt_template_map["generate_evaluation"][
            "system"
        ].render()
        prompt = self._prompt_template_map["generate_evaluation"]["prompt"].render(
            expected_results=self.test.expected_results,
            conversation=self.conversation,
        )

        evaluation, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="category",
        )
        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            evaluation=evaluation,
            reasoning=reasoning,
        )

        return evaluation, reasoning

    def _generate_user_response(self) -> str:
        """Ask the model for the next user message given the conversation so far."""
        system_prompt = self._prompt_template_map["generate_user_response"][
            "system"
        ].render()
        prompt = self._prompt_template_map["generate_user_response"]["prompt"].render(
            steps=self.test.steps, conversation=self.conversation
        )

        user_response, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="user_response",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            user_response=user_response,
            reasoning=reasoning,
        )
        return user_response

    def _invoke_target(self, user_input) -> str:
        """Send `user_input` to the target, trace its data and return its response."""
        target_response = self.target.invoke(user_input)
        self.trace.add_step(data=target_response.data)

        return target_response.response

    def evaluate(self) -> TestResult:
        """Converse with the target until all steps are attempted or turns run out.

        Returns:
            TestResult: Successful only when all steps were attempted and the
            evaluation category is not NOT_ALL_EXPECTED_RESULTS_OBSERVED.
            If the turn limit is hit first, the result is MAX_TURNS_REACHED.
        """
        success = False
        result = Results.MAX_TURNS_REACHED.value
        reasoning = ""

        while self.conversation.turns < self.test.max_turns:
            if self.conversation.turns == 0:
                # start conversation
                if self.test.initial_prompt:
                    user_input = self.test.initial_prompt
                else:
                    user_input = self._generate_initial_prompt()
            else:
                # generate next user response
                user_input = self._generate_user_response()

            # add turn to the conversation
            self.conversation.add_turn(user_input, self._invoke_target(user_input))

            # get test status
            test_status = self._generate_test_status()
            if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED:
                # evaluate conversation
                eval_category, reasoning = self._generate_evaluation()
                if (
                    eval_category
                    == EvaluationCategories.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value  # noqa: W503
                ):
                    result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value
                else:
                    result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value
                    success = True

                break

        return TestResult(
            test_name=self.test.name,
            success=success,
            result=result,
            reasoning=reasoning,
            conversation=self.conversation,
        )
+# SPDX-License-Identifier: Apache-2.0 + +MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" +ANTHROPIC_VERSION = "bedrock-2023-05-31" +ROLE = "user" +MAX_TOKENS_TO_SAMPLE = 300 +TEMPERATURE = 0 +TOP_K = 250 +TOP_P = 1 +REQUEST_BODY = { + "anthropic_version": ANTHROPIC_VERSION, + "max_tokens": MAX_TOKENS_TO_SAMPLE, + "system": None, + "messages": [ + { + "role": ROLE, + "content": [ + {"type": "text", "text": None}, + ], + } + ], + "temperature": TEMPERATURE, + "top_p": TOP_P, + "top_k": TOP_K, +} diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt new file mode 100644 index 0000000..fce3738 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt @@ -0,0 +1,67 @@ + +You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. + + + + + + + +You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to those already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation.
+ + + + +Your approach is to: +1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing. +2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. + +Some example questions include: + + + +* Does the customer have a smart meter? +* Are they aware of Energy Hub? +* Are they on the right tariff? +* How many people are in their household? +* What is their current living situation (apartment, house, etc.)? + + + +Some examples of recommendations include: + + +* Smart meter installation for better usage monitoring +* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount + + + +Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. + + +Here is an example interaction: + + +A: how can I reduce my energy bill? + +B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So I will ask some questions to understand you better. Is that okay? + +A: Yes + +B: What kind of house do you live in and with how many people? + +A: I live in a one-bedroom apartment with my partner. + +B: Thank you, and how do you measure your energy use? + +A: I send meter readings. + +B: Okay, so to confirm you don’t have a smart meter? + +A: No + +B: My first recommendation would be a smart meter.
_EVALUATOR_MAP = {
    "claude-3": Claude3Evaluator,
}


class EvaluatorFactory(BaseModel):
    """Builds evaluator instances from the `evaluator` section of a test plan.

    Attributes:
        config (dict): Evaluator configuration. The `model` key selects the
            evaluator class; all other keys are passed to its constructor.
    """

    config: dict

    def create(
        self, test: Test, target: BaseTarget, work_dir: Optional[str]
    ) -> BaseEvaluator:
        """Instantiate the evaluator selected by `config["model"]`.

        Args:
            test (Test): The test case.
            target (BaseTarget): The target agent being evaluated.
            work_dir (str, optional): The work directory.

        Returns:
            BaseEvaluator: The configured evaluator.

        Raises:
            KeyError: If `model` is missing from the config or is not a
                registered evaluator name.
        """
        # split the selector from the constructor arguments without touching
        # the stored config
        evaluator_kwargs = dict(self.config)
        model_name = evaluator_kwargs.pop("model")
        evaluator_cls = _EVALUATOR_MAP[model_name]

        return evaluator_cls(
            test=test,
            target=target,
            work_dir=work_dir,
            **evaluator_kwargs,
        )
class Plan(BaseModel, validate_assignment=True, arbitrary_types_allowed=True):
    """A test plan: the evaluator and target factories plus the tests to run.

    Attributes:
        evaluator_factory: Factory built from the plan's ``evaluator`` section.
        target_factory: Factory built from the plan's ``target`` section.
        tests: The test cases selected from the plan's ``tests`` section.
    """

    evaluator_factory: EvaluatorFactory
    target_factory: TargetFactory
    tests: list[Test]

    @model_validator(mode="after")
    def check_test_names_unique(self) -> Plan:
        """Reject plans in which two tests share the same name.

        Raises:
            ValueError: If any test name occurs more than once.
        """
        unique_names = {test.name for test in self.tests}

        if len(unique_names) != len(self.tests):
            raise ValueError("Test names must be unique")

        return self

    @classmethod
    def load(cls, plan_dir: Optional[str], filter: str) -> Plan:
        """Load the plan file found in a directory.

        Args:
            plan_dir: Directory containing the plan file. The current working
                directory is used when this is falsy.
            filter: Comma-separated names of the tests to load. Every test in
                the plan is loaded when this is falsy. (The name shadows the
                builtin but is kept because callers may pass it by keyword.)

        Returns:
            The plan described by the file.
        """
        plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME)
        plan = cls._load_yaml(plan_path)

        return cls(
            evaluator_factory=EvaluatorFactory(config=plan["evaluator"]),
            target_factory=TargetFactory(config=plan["target"]),
            tests=cls._load_tests(plan["tests"], filter),
        )

    @staticmethod
    def _load_yaml(path: str) -> dict:
        """Parse the YAML document at ``path`` with the safe loader."""
        # YAML documents are Unicode; do not depend on the platform's
        # default encoding.
        with open(path, encoding="utf-8") as stream:
            return yaml.safe_load(stream)

    @staticmethod
    def _load_tests(test_config: dict[str, dict], filter: str) -> list[Test]:
        """Build ``Test`` objects from the plan's ``tests`` mapping.

        Args:
            test_config: Mapping of test name to that test's settings.
            filter: Comma-separated names of the tests to build; all tests
                are built, in mapping order, when this is falsy.

        Returns:
            The selected tests, in the order they were requested.

        Raises:
            KeyError: If a requested test name is not defined in the plan.
        """
        names = Plan._parse_filter(filter) if filter else list(test_config)

        tests = []
        for name in names:
            try:
                config = test_config[name]
            except KeyError:
                # Same exception type as before, but say what went wrong.
                raise KeyError(f"Test '{name}' is not defined in the plan") from None

            tests.append(
                Test(
                    name=name,
                    steps=config["steps"],
                    expected_results=config["expected_results"],
                    initial_prompt=config.get("initial_prompt"),
                    max_turns=config.get("max_turns", defaults.MAX_TURNS),
                    hook=config.get("hook"),
                )
            )

        return tests

    @staticmethod
    def _parse_filter(filter: str) -> list[str]:
        """Split a comma-separated filter into stripped, non-empty names."""
        # Empty entries (e.g. from a trailing comma) are dropped; they can
        # never match a test name.
        stripped = (part.strip() for part in filter.split(","))
        return [name for name in stripped if name]

    @staticmethod
    def init_plan(plan_dir: Optional[str]) -> str:
        """Write a starter plan file and return its path.

        Args:
            plan_dir: Directory to create the plan in. The current working
                directory is used when this is falsy.

        Returns:
            The path of the newly written plan file.

        Raises:
            FileExistsError: If a plan file already exists at that path.
        """
        plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME)

        # Never overwrite a plan the user already has.
        if os.path.exists(plan_path):
            raise FileExistsError(f"Test plan already exists at {plan_path}")

        with open(plan_path, "w", encoding="utf-8") as stream:
            yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False)

        return plan_path
class Runner:
    """Runs every test of a plan on a thread pool and reports the outcome.

    Attributes:
        plan: The plan whose tests are executed.
        work_dir: Directory passed to evaluators and used for the summary.
        num_tests: Number of tests in the plan.
        verbose: Whether per-test results and token counts are logged.
        num_threads: Number of worker threads used by ``run``.
        results: Mapping of test name to its result (``None`` until the
            test has finished).
        num_failed: Number of tests whose result has ``success is False``.
        evaluator_input_token_counts: Input token count of each evaluator.
        evaluator_output_token_counts: Output token count of each evaluator.
    """

    def __init__(
        self,
        plan: Plan,
        verbose: bool,
        num_threads: Optional[int],
        work_dir: Optional[str],
    ):
        """
        Initialize the runner.

        Args:
            plan (Plan): The plan to execute.
            verbose (bool): Log per-test results and token usage.
            num_threads (int, optional): Worker thread count. When falsy, the
                smaller of the number of tests and ``MAX_NUM_THREADS`` is used.
            work_dir (str, optional): Working directory. Defaults to the
                current working directory.
        """
        # Imported here so the module-level import block is left untouched.
        import threading

        self.plan = plan
        self.work_dir = work_dir if work_dir else os.getcwd()
        self.num_tests = len(self.plan.tests)
        self.verbose = verbose
        self.num_threads = num_threads
        if not self.num_threads:
            self.num_threads = min(self.num_tests, MAX_NUM_THREADS)
        self.results = {test.name: None for test in self.plan.tests}
        self.num_failed = 0
        self.evaluator_input_token_counts = []
        self.evaluator_output_token_counts = []
        # Guards the counters above, which worker threads update.
        self._state_lock = threading.Lock()

    def run(self) -> int:
        """Run all tests, write the markdown summary and log the outcome.

        Returns:
            The number of failed tests.

        Raises:
            Exception: Whatever a test raised; the first failure seen while
                collecting results is re-raised.
        """
        self._log_run_start()

        self.start_time = time.time()
        with Progress(transient=True) as self.progress:
            self.tracker = self.progress.add_task("running...", total=self.num_tests)

            # Honour the configured thread count. ThreadPoolExecutor rejects
            # max_workers=0, which an empty plan would otherwise produce.
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=max(1, self.num_threads)
            ) as executor:
                futures = [
                    executor.submit(self.run_test, test) for test in self.plan.tests
                ]
                for future in concurrent.futures.as_completed(futures):
                    # result() re-raises any exception raised in the worker.
                    future.result()

        self._log_run_end()

        create_markdown_summary(
            self.work_dir, self.plan.tests, list(self.results.values()), self.verbose
        )

        return self.num_failed

    def run_test(self, test):
        """Evaluate one test and record its result and token usage.

        Args:
            test: The test case to run.
        """
        target = self.plan.target_factory.create()
        evaluator = self.plan.evaluator_factory.create(
            test=test,
            target=target,
            work_dir=self.work_dir,
        )

        result = evaluator.run()

        # Several worker threads reach this point concurrently; `+=` on an
        # attribute is a read-modify-write, so serialize the bookkeeping.
        with self._state_lock:
            if result.success is False:
                self.num_failed += 1
            self.results[test.name] = result
            self.evaluator_input_token_counts.append(evaluator.input_token_count)
            self.evaluator_output_token_counts.append(evaluator.output_token_count)

        self.progress.update(self.tracker, advance=1)

    def _log_run_start(self):
        """Log how many tests and threads the run uses."""
        logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.")

    def _log_run_end(self):
        """Log pass/fail counts, duration and, when verbose, the details."""
        self._log_pass_fail_count()
        logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.")
        if self.verbose:
            self._log_test_result()
            self._log_evaluator_token_io()

    def _log_test_result(self):
        """Log one PASSED/FAILED line per test."""
        for result in self.results.values():
            logger_func = logger.info if result.success else logger.error
            logger_func(
                f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}",
            )

    def _log_pass_fail_count(self):
        """Log the pass/fail totals, as an error if anything failed."""
        passed_count = self.num_tests - self.num_failed
        status_str = (
            f"[red]{passed_count} passed, {self.num_failed} failed."
            if self.num_failed
            else f"[green]{self.num_tests} passed."
        )
        logger_func = logger.error if self.num_failed else logger.info
        logger_func(status_str)

    def _log_evaluator_token_io(self):
        """Log the total input and output tokens used by the evaluators."""
        logger.info(
            f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}"
        )
        logger.info(
            f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}"
        )
def create_markdown_summary(
    work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool
):
    """Render the markdown test summary and write it into ``work_dir``.

    The output file is named after the template with its final extension
    removed.

    Args:
        work_dir: Directory the summary file is written to.
        tests: The tests that were run.
        test_results: One result per test, in the same order as ``tests``
            (the template pairs them with ``zip``).
        verbose: When true, log where the summary was written.
    """
    # Jinja template names are not filesystem paths: they always use "/" as
    # the separator, so os.path.join would break the lookup on Windows.
    template = jinja_env.get_template(f"{_TEMPLATE_ROOT}/{_TEMPLATE_FILE_NAME}")

    summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0])

    rendered = template.render(tests=tests, results=test_results, zip=zip)

    # Conversations may contain any Unicode text; do not rely on the
    # platform's default encoding. The file is only written, so "w" suffices.
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(rendered)

    if verbose:
        logger.info(f"Summary available at {summary_path}")
+# SPDX-License-Identifier: Apache-2.0 + +from .base_target import BaseTarget +from .boto3_target import Boto3Target +from .target_factory import TargetFactory + +__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/stepfunctions/agenteval/targets/base_target.py new file mode 100644 index 0000000..f8fbaa8 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/targets/base_target.py @@ -0,0 +1,27 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from abc import ABC, abstractmethod + +from agenteval import TargetResponse + + +class BaseTarget(ABC): + """The `BaseTarget` abstract base class defines the common interface for target + classes. + """ + + @abstractmethod + def invoke(self, prompt: str) -> TargetResponse: + """Invoke the target with a prompt and return a response as a string. + + Args: + prompt: The prompt string to pass to the target. + + Returns: + A TargetResponse object containing the target's response string and + any trace data (if applicable). 
class BedrockAgentTarget(Boto3Target):
    """Target that sends prompts to an agent through ``invoke_agent``.

    Each instance generates one random session id and reuses it for every
    invocation.
    """

    def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs):
        """
        Initialize the target.

        Args:
            bedrock_agent_id (str): Sent as ``agentId`` on every call.
            bedrock_agent_alias_id (str): Sent as ``agentAliasId`` on every call.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)
        self._bedrock_agent_id = bedrock_agent_id
        self._bedrock_agent_alias_id = bedrock_agent_alias_id
        self._session_id: str = str(uuid.uuid4())

    def invoke(self, prompt: str) -> TargetResponse:
        """Send ``prompt`` to the agent and collect the streamed reply.

        Args:
            prompt: The prompt string to pass to the agent.

        Returns:
            A TargetResponse whose ``response`` is the concatenated completion
            text and whose ``data["bedrock_agent_trace"]`` lists the trace
            entries found in the stream.
        """
        args = {
            "agentId": self._bedrock_agent_id,
            "agentAliasId": self._bedrock_agent_alias_id,
            "sessionId": self._session_id,
            "inputText": prompt,
            "enableTrace": True,
        }

        response = self.boto3_client.invoke_agent(**args)

        payload_parts = []
        trace_data = []

        for event in response["completion"]:
            chunk = event.get("chunk")
            event_trace = event.get("trace")
            if chunk:
                payload_parts.append(chunk.get("bytes") or b"")
            if event_trace:
                trace_data.append(event_trace.get("trace"))

        # Decode once over the joined bytes: decoding chunk by chunk would
        # fail if a multi-byte character were split across two chunks.
        completion = b"".join(payload_parts).decode()

        return TargetResponse(
            response=completion, data={"bedrock_agent_trace": trace_data}
        )
class BedrockKnowledgeBaseTarget(Boto3Target):
    """Target that queries a knowledge base through ``retrieve_and_generate``.

    The session id returned by each call is stored and sent with the next
    call.
    """

    def __init__(self, knowledge_base_id: str, model_id: str, **kwargs):
        """
        Initialize the target.

        Args:
            knowledge_base_id (str): Sent as ``knowledgeBaseId``.
            model_id (str): Foundation model id used to build the model ARN.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)
        region = self.boto3_client.meta.region_name
        self._knowledge_base_id = knowledge_base_id
        self._model_arn = f"arn:aws:bedrock:{region}::foundation-model/{model_id}"
        # No session exists until the first response supplies one.
        self._session_id = None

    def invoke(self, prompt: str) -> TargetResponse:
        """Ask the knowledge base a question.

        Args:
            prompt: The prompt string to send.

        Returns:
            A TargetResponse with the generated text and, under
            ``data["bedrock_knowledgebase_citations"]``, the response's
            citations.
        """
        kb_settings = {
            "knowledgeBaseId": self._knowledge_base_id,
            "modelArn": self._model_arn,
        }
        request = {
            "input": {"text": prompt},
            "retrieveAndGenerateConfiguration": {
                "type": "KNOWLEDGE_BASE",
                "knowledgeBaseConfiguration": kb_settings,
            },
        }
        # Only continue a session once a previous response has provided one.
        if self._session_id:
            request["sessionId"] = self._session_id

        reply = self.boto3_client.retrieve_and_generate(**request)

        answer = reply["output"]["text"]
        extra = {"bedrock_knowledgebase_citations": reply["citations"]}
        self._session_id = reply["sessionId"]

        return TargetResponse(response=answer, data=extra)
index 0000000..e47e8cb --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py @@ -0,0 +1,41 @@ +from typing import Optional + +from agenteval.targets import BaseTarget +from agenteval.utils import create_boto3_client + +_DEFAULT_MAX_RETRY = 10 + + +class Boto3Target(BaseTarget): + """A target that can be interfaced with via the `boto3` library. + + Attributes: + boto3_client (BaseClient): A `boto3` client. + """ + + def __init__( + self, + boto3_service_name: str, + aws_profile: Optional[str] = None, + aws_region: Optional[str] = None, + endpoint_url: Optional[str] = None, + max_retry: int = _DEFAULT_MAX_RETRY, + ): + """ + Initialize the AWS target. + + Args: + boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`). + aws_profile (str, optional): The AWS profile name. + aws_region (str, optional): The AWS region. + endpoint_url (str, optional): The endpoint URL for the AWS service. + max_retry (int, optional): The maximum number of retry attempts. 
class QBusinessTarget(Boto3Target):
    """Target that chats with an application through ``chat_sync``.

    The keyword arguments for ``chat_sync`` are kept in one dict that is
    updated after every call, so consecutive calls continue the same
    conversation.
    """

    def __init__(
        self,
        q_business_application_id: str,
        q_business_user_id: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the target.

        Args:
            q_business_application_id (str): Sent as ``applicationId``.
            q_business_user_id (str, optional): Sent as ``userId`` when truthy.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)

        self._chat_sync_args = {"applicationId": q_business_application_id}
        if q_business_user_id:
            self._chat_sync_args["userId"] = q_business_user_id

    def invoke(self, prompt: str) -> TargetResponse:
        """Send ``prompt`` as the next user message.

        Args:
            prompt: The prompt string to send.

        Returns:
            A TargetResponse holding the response's ``systemMessage``.
        """
        self._chat_sync_args["userMessage"] = prompt

        response = self.boto3_client.chat_sync(**self._chat_sync_args)

        # The conversation id is taken from the first response only and then
        # reused for every later call.
        if "conversationId" not in self._chat_sync_args:
            self._chat_sync_args["conversationId"] = response["conversationId"]

        # Each call is chained to the system message of the previous one.
        self._chat_sync_args["parentMessageId"] = response["systemMessageId"]

        return TargetResponse(response=response["systemMessage"])
class SageMakerEndpointTarget(Boto3Target):
    """Target that invokes an endpoint through ``invoke_endpoint``.

    The prompt is written into a JSON request body at ``input_path`` and the
    reply text is read from the JSON response body at ``output_path``; both
    paths are JSONPath expressions.
    """

    def __init__(
        self,
        endpoint_name: str,
        request_body: dict,
        input_path: str,
        output_path: str,
        custom_attributes: Optional[str] = None,
        target_model: Optional[str] = None,
        target_variant: Optional[str] = None,
        target_container_hostname: Optional[str] = None,
        inference_component_name: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the target.

        Args:
            endpoint_name (str): Sent as ``EndpointName``.
            request_body (dict): Template of the JSON request body.
            input_path (str): JSONPath of the field that receives the prompt.
            output_path (str): JSONPath of the field holding the reply.
            custom_attributes (str, optional): Sent as ``CustomAttributes``.
            target_model (str, optional): Sent as ``TargetModel``.
            target_variant (str, optional): Sent as ``TargetVariant``.
            target_container_hostname (str, optional): Sent as
                ``TargetContainerHostname``.
            inference_component_name (str, optional): Sent as
                ``InferenceComponentName``.
            **kwargs: Forwarded to ``Boto3Target``.
        """
        # Imported here so the module-level import block is left untouched.
        import copy

        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)

        # _update_request() writes the prompt into this structure in place.
        # Keep a private copy so the caller's dict (which may be shared by
        # several targets built from the same config) is never modified.
        self._request_body = copy.deepcopy(request_body)
        self._input_jp_expr = parse(input_path)
        self._output_jp_expr = parse(output_path)

        self._args = self._create_base_args(
            endpoint_name,
            custom_attributes,
            target_model,
            target_variant,
            target_container_hostname,
            inference_component_name,
        )

    @staticmethod
    def _create_base_args(
        endpoint_name: str,
        custom_attributes: Optional[str],
        target_model: Optional[str],
        target_variant: Optional[str],
        target_container_hostname: Optional[str],
        inference_component_name: Optional[str],
    ) -> dict:
        """Build the ``invoke_endpoint`` arguments that never change.

        Optional settings whose value is ``None`` are left out entirely.
        """
        args = {
            "EndpointName": endpoint_name,
            "ContentType": _CONTENT_TYPE,
            "Accept": _ACCEPT,
            **{
                key: value
                for key, value in {
                    "CustomAttributes": custom_attributes,
                    "TargetModel": target_model,
                    "TargetVariant": target_variant,
                    "TargetContainerHostname": target_container_hostname,
                    "InferenceComponentName": inference_component_name,
                }.items()
                if value is not None
            },
        }

        return args

    def _update_request(self, prompt: str) -> None:
        """Write ``prompt`` into the request body and re-serialize it."""
        self._input_jp_expr.update(self._request_body, prompt)
        self._args["Body"] = json.dumps(self._request_body)

    def _query_response(self, response_body: dict) -> str:
        """Return the first value matching the output path.

        Raises:
            IndexError: If the output path matches nothing in the response.
        """
        matches = self._output_jp_expr.find(response_body)
        if not matches:
            # Same exception type as an out-of-range index, with a message
            # that points at the actual cause.
            raise IndexError("output_path did not match anything in the response body")
        return matches[0].value

    def invoke(self, prompt: str) -> TargetResponse:
        """Invoke the endpoint with ``prompt``.

        Args:
            prompt: The prompt string to send.

        Returns:
            A TargetResponse holding the value found at the output path.
        """
        self._update_request(prompt)

        response = self.boto3_client.invoke_endpoint(**self._args)

        response_body = json.loads(response.get("Body").read())

        return TargetResponse(response=self._query_response(response_body))
_TARGET_MAP[self.config["type"]] + else: + target_cls = import_class(self.config["type"], parent_class=BaseTarget) + + return target_cls diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja new file mode 100644 index 0000000..9cd9dd4 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja @@ -0,0 +1,13 @@ +Here are the expected results and conversation: + + +{% for result in expected_results -%} +{{ loop.index }}. {{ result }} +{% endfor -%} + + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja new file mode 100644 index 0000000..832ba37 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja @@ -0,0 +1,5 @@ +Here is the step: + + +{{ step }} + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja new file mode 100644 index 0000000..79ad0df --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja @@ -0,0 +1,13 @@ +Here are the steps and conversation: + + +{% for step in steps -%} +{{ loop.index }}. 
{{ step }} +{% endfor -%} + + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja new file mode 100644 index 0000000..79ad0df --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja @@ -0,0 +1,13 @@ +Here are the steps and conversation: + + +{% for step in steps -%} +{{ loop.index }}. {{ step }} +{% endfor -%} + + + +{% for sender, message in conversation -%} +{{ sender }}: {{ message }} +{% endfor -%} + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja new file mode 100644 index 0000000..22cace3 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja @@ -0,0 +1,12 @@ +You are a quality assurance engineer evaluating a conversation between a USER and an AGENT. + +Your job is to analyze the conversation in tags and a list of expected results +in tags. + +You will classify the conversation into the following categories: + +- A: All of the expected results can be observed in the conversation. +- B: Not all of the expected results can be observed in the conversation. + +Please think hard about the response in tags before providing only the category letter +within tags.
\ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja new file mode 100644 index 0000000..d0e8e23 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja @@ -0,0 +1,13 @@ +You are role playing as a USER in a conversation with an AGENT. + +You will be given a step that is wrapped in tags. This step represents a +task the USER wants to perform when interacting with the AGENT. + +Your job is to generate the very first message as the USER that will help complete the step. + +Make sure this message is concise and to the point. + +Do not provide any information if it is expected that the AGENT will eventually ask for it. + +Please think hard about the response in tags before providing the message +within tags. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja new file mode 100644 index 0000000..7bb8e6b --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja @@ -0,0 +1,13 @@ +You are a quality assurance engineer evaluating a conversation between a USER and an AGENT. + +You will be given an ordered list of steps wrapped in tags. Each step represents a task +that the USER wants to perform when interacting with the AGENT. + +Your job is to analyze the running conversation in tags and classify it into the following +categories: + +- A: The USER has attempted all the steps. +- B: The USER has not yet attempted all the steps. + +Please think hard about the response in tags before providing only the category letter +within tags.
\ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja new file mode 100644 index 0000000..e670420 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja @@ -0,0 +1,15 @@ +You are role playing as a USER in a conversation with an AGENT. + +You will be given an ordered list of steps wrapped in tags. Each step represents +a task that the USER wants to perform when interacting with the AGENT. + +Using the list of steps, your job is to analyze the running conversation in the + tags and generate the next appropriate response as the USER. + +Do not include any information from a step unless the AGENT asks for it. + +If the AGENT was unable to help or did not understand the last request, just move on to +the next step. Do not attempt to rephrase the request in the next response as the USER. + +Please think hard about the response in tags before providing the response +within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja new file mode 100644 index 0000000..a624303 --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja @@ -0,0 +1,49 @@ +# Test Summary +--- +This document provides a summary of the tests executed by Agent Evaluation. + +> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors.
**Please thoroughly examine the results below prior to deciding whether to implement an agent.** +--- +## Tests +{% for test, result in zip(tests, results) -%} +- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) +{% endfor %} + +--- + + +{% for test, result in zip(tests, results) -%} +## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} + +**Steps** +{% for step in test.steps -%} +{{ loop.index }}. {{ step }} +{% endfor %} + +**Expected results** +{% for result in test.expected_results -%} +{{ loop.index }}. {{ result }} +{% endfor %} + +**Conversation** +``` +{% for sender, message in result.conversation -%} +[{{ sender }}] {{ message }} +{% endfor -%} +``` + +**Result** +{{ result.result }} + +**Reasoning** +``` +{{ result.reasoning }} +``` + +--- +{% endfor %} + + + + + diff --git a/stepfunctions/stepfunctions/agenteval/test.py b/stepfunctions/stepfunctions/agenteval/test.py new file mode 100644 index 0000000..695f2fe --- /dev/null +++ b/stepfunctions/stepfunctions/agenteval/test.py @@ -0,0 +1,29 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +from pydantic import BaseModel + + +class Test(BaseModel, validate_assignment=True): + """A test case for an agent. + + Attributes: + name: Name of the test. + steps: List of step to perform for the test. + expected_results: List of expected results for the test. + initial_prompt: Optional initial prompt. + max_turns: Maximum number of turns allowed for the test. + hook: The module path to an evaluation hook. 
class Trace:
    """Captures steps during evaluation.

    Used as a context manager: entering records the start time; exiting
    records the end time and writes the trace to
    ``<trace_dir>/<test_name>.json``.

    Attributes:
        test_name (str): Name of the test.
        trace_dir (str): Directory to store the trace.
        start_time (datetime): Start time of the trace.
        end_time (datetime): End time of the trace.
        steps (list): List of steps in the trace.
    """

    def __init__(self, test_name: str, work_dir: str):
        """
        Initialize the trace handler.

        Args:
            test_name (str): Name of the trace; also the stem of the JSON
                file name.
            work_dir (str): Directory under which the trace directory lives.
        """
        self.test_name = test_name
        self.trace_dir = os.path.join(work_dir, _TRACE_DIR)
        self.start_time = None
        self.end_time = None
        self.steps = []

    def __enter__(self):
        """Record the start time (UTC) and return the trace."""
        self.start_time = datetime.now(timezone.utc)
        return self

    def __exit__(self, *exc):
        """Record the end time (UTC) and write the trace to disk.

        Returns ``None``, so an exception raised inside the ``with`` block
        still propagates after the trace has been written.
        """
        self.end_time = datetime.now(timezone.utc)
        self._dump_trace()

    def _dump_trace(self):
        """Dump the trace to a JSON file."""

        os.makedirs(self.trace_dir, exist_ok=True)

        trace_path = os.path.join(self.trace_dir, f"{self.test_name}.json")
        with open(trace_path, "w", encoding="utf-8") as f:
            # default=str renders datetimes (and anything else json cannot
            # serialize natively) via str().
            json.dump(self._get_trace(), f, default=str)

    def _get_trace(self) -> dict:
        """Return the trace as a dict ready to be serialized."""
        return {
            "test_name": self.test_name,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "steps": self.steps,
        }

    def add_step(self, step_name: Optional[str] = None, **kwargs):
        """Add a step to the trace.

        Args:
            step_name (str, optional): The name of the step. Defaults to
                the name of the caller function
            **kwargs: Extra fields stored on the step. They are applied last,
                so a ``timestamp`` or ``step_name`` keyword replaces the
                generated value.
        """
        # context=0: only the caller's function name is needed, so skip
        # loading source lines for every frame.
        step_name = step_name or inspect.stack(0)[1].function
        step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name}
        step.update(kwargs)
        self.steps.append(step)
def create_boto3_client(
    boto3_service_name: str,
    aws_profile: Optional[str],
    aws_region: Optional[str],
    endpoint_url: Optional[str],
    max_retry: int,
) -> BaseClient:
    """Build a `boto3` client from a dedicated session.

    Every parameter is required; the ``Optional`` ones accept ``None``, which
    is passed through to `boto3` unchanged.

    Args:
        boto3_service_name (str): The `boto3` service name (e.g `"bedrock-runtime"`).
        aws_profile (str or None): The AWS profile name for the session.
        aws_region (str or None): The AWS region for the session.
        endpoint_url (str or None): The endpoint URL for the AWS service.
        max_retry (int): The maximum number of retry attempts; retries use
            the module-level retry mode.

    Returns:
        BaseClient: Client for ``boto3_service_name``.
    """
    retry_settings = {"max_attempts": max_retry, "mode": _RETRY_MODE}
    client_config = Config(retries=retry_settings)

    return boto3.Session(
        profile_name=aws_profile,
        region_name=aws_region,
    ).client(
        boto3_service_name,
        endpoint_url=endpoint_url,
        config=client_config,
    )
_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"]


def import_class(module_path: str, parent_class: Optional[type] = None) -> type:
    """Import a class from a dotted ``<module>.<ClassName>`` path.

    Args:
        module_path (str): Dotted path whose last component is the class name
            and whose remainder is the module to import.
        parent_class (type, optional): When given, the imported class must be
            a subclass of it.

    Returns:
        type: The imported class.

    Raises:
        ValueError: If ``module_path`` contains no dot, or the final module
            component does not end with one of the allowed suffixes.
        AttributeError: If the module has no attribute named like the class.
        TypeError: If ``parent_class`` is given and the imported object is not
            a subclass of it.
    """
    name, separator, class_name = module_path.rpartition(".")
    if not separator:
        # Without this guard the failure was an opaque tuple-unpacking error.
        raise ValueError(
            f"Invalid module path: {module_path!r} "
            "(expected '<module>.<ClassName>')"
        )

    # make sure the module name ends with one of the allowed suffixes
    _validate_module_name(name.rpartition(".")[2])

    module = import_module(name)
    cls = getattr(module, class_name)

    if parent_class:
        # make sure the imported class is a subclass
        _validate_subclass(cls, parent_class)

    return cls


def _validate_module_name(name: str) -> None:
    """Raise ``ValueError`` unless ``name`` ends with an allowed suffix."""
    if not name.endswith(tuple(_ALLOWED_MODULE_NAME_SUFFIX)):
        raise ValueError(f"Invalid module name: {name}")


def _validate_subclass(child_class: type, parent_class: type) -> None:
    """Raise ``TypeError`` unless ``child_class`` subclasses ``parent_class``."""
    # The isinstance guard keeps the error message uniform when the attribute
    # is not a class at all; issubclass() alone would raise its own TypeError
    # about "arg 1 must be a class".
    if not (isinstance(child_class, type) and issubclass(child_class, parent_class)):
        child_name = getattr(child_class, "__name__", repr(child_class))
        raise TypeError(f"{child_name} is not a {parent_class.__name__} subclass")
def handler(event, context):
    """Report the current status of the Bedrock agent named in the event.

    Args:
        event: Payload containing ``agent_id``.
        context: Lambda context (unused).

    Returns:
        dict: ``statusCode`` 200, the ``agent_id`` that was queried and the
        ``agent_status`` taken from the ``get_agent`` response.
    """
    agent_id = event["agent_id"]

    # Uses the module-level bedrock-agent client.
    agent = bedrock_agent.get_agent(agentId=agent_id)["agent"]

    result = {'statusCode': 200}
    result['agent_id'] = agent_id
    result['agent_status'] = agent["agentStatus"]
    return result
def handler(event, context):
    """Create a uniquely named alias for the agent updated earlier in the workflow.

    Args:
        event: Payload containing ``update_output.agentid``, ``prompt`` and
            ``scenarios``.
        context: Lambda context (unused).

    Returns:
        dict: ``prompt`` and ``scenarios`` passed through from the event, plus
        ``agent_id``, ``agent_alias_id`` and ``agent_alias_name`` read from the
        ``create_agent_alias`` response.
    """
    client = boto3.client('bedrock-agent')

    # A random UUID serves as the alias name.
    alias_name = str(uuid.uuid4())
    source_agent_id = event["update_output"]["agentid"]

    alias_resp = client.create_agent_alias(
        agentAliasName=alias_name,
        agentId=source_agent_id,
    )
    print(alias_resp)

    alias = alias_resp["agentAlias"]
    agent_id = alias["agentId"]
    agent_alias_id = alias["agentAliasId"]
    agent_alias_name = alias["agentAliasName"]

    return {
        'prompt': event['prompt'],
        'agent_id': agent_id,
        'agent_alias_id': agent_alias_id,
        'agent_alias_name': agent_alias_name,
        'scenarios': event['scenarios'],
    }
def handler(event, context):
    """Evaluate one customer scenario against a Bedrock agent alias.

    Builds an agent-evaluation plan for the scenario, runs it, and uploads the
    generated Markdown summary to
    ``results/<agent_alias_name>/<uuid>/results.md`` in the evaluation bucket.

    Args:
        event: Payload with ``scenario`` (``demography``, ``household_size``,
            ``appliances``, ``energy_usage``, ``tarrif`` or ``tariff``,
            ``payment_type``), ``agent_id``, ``agent_alias_id`` and
            ``agent_alias_name``.
        context: Lambda context (unused).

    Returns:
        dict: ``created_at``, ``finished_at``, ``target_type``, ``status``
        (``"completed"`` or ``"error"``) and ``test_passed_rate`` (``None``
        when the run failed). On failure an ``error`` key carries the
        exception message.

    Raises:
        KeyError: If a required event/scenario key or the
            ``EVALUATION_BUCKET`` environment variable is missing.
    """
    # Function-scope import: the file's import block does not provide logging.
    import logging

    logger = logging.getLogger(__name__)
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    scenario = event['scenario']
    agent_id = event['agent_id']
    agent_alias_id = event['agent_alias_id']
    agent_alias_name = event['agent_alias_name']
    bucket_name = os.environ["EVALUATION_BUCKET"]
    uid = uuid.uuid4()

    user_profile = {
        'demographic': scenario['demography'],
        'household_size': scenario['household_size'],
        'appliances': scenario['appliances'],
        'energy_usage': scenario['energy_usage'],
        # The input files spell the key 'tarrif'; accept the correct spelling
        # as well so a fixed input file keeps working.
        'tariff': scenario['tariff'] if 'tariff' in scenario else scenario['tarrif'],
        'payment_type': scenario['payment_type'],
    }

    profile_str = yaml.safe_dump(user_profile, default_flow_style=False, sort_keys=False)

    respond_step = 'Respond to the agents questions using the details in:'
    profiling_result = 'The agent asks the user questions to create a profile'

    yaml_data = {
        'evaluator': {
            'model': 'claude-3'
        },
        'target': {
            'type': 'bedrock-agent',
            'bedrock_agent_id': agent_id,
            'bedrock_agent_alias_id': agent_alias_id
        },
        'tests': {
            'provide recommendation to customer in need': {
                'profile': user_profile,
                'max_turns': 10,
                # One opening question followed by four "respond with the
                # profile" rounds.
                'steps': ['Ask the agent how you can reduce your energy bills']
                + [respond_step, profile_str] * 4,
                'expected_results': [profiling_result] * 4
                + ['The agent returns a recommendation'],
            }
        }
    }

    yaml_output = yaml.safe_dump(yaml_data, sort_keys=False, default_flow_style=False)

    # Per-invocation working directory: warm Lambda containers keep /tmp, so
    # fixed paths would let one run read another run's plan or summary.
    work_root = os.path.join("/tmp", f"agenteval-{uid}")
    plan_dir = os.path.join(work_root, "plan")
    result_dir = os.path.join(work_root, "results")

    status = "completed"
    test_passed_rate = None
    error_message = None

    try:
        os.makedirs(plan_dir, exist_ok=True)
        os.makedirs(result_dir, exist_ok=True)

        with open(os.path.join(plan_dir, "agenteval.yml"), "w") as file:
            file.write(yaml_output)

        # A malformed plan is a programming error; as before, it propagates so
        # the state machine's catch handler sees it.
        plan = Plan.load(plan_dir=plan_dir, filter=None)

        created_at = datetime.datetime.now().strftime(timestamp_format)

        runner = Runner(
            plan=plan,
            verbose=False,
            num_threads=None,
            work_dir=result_dir,
        )

        try:
            # Run synchronously. The previous background thread plus polling
            # loop could never observe an exception raised inside the thread
            # and then spun until the Lambda timed out.
            runner.run()

            test_passed_rate = (
                f"{runner.num_tests - runner.num_failed}/ {runner.num_tests}"
            )

            with open(os.path.join(result_dir, "agenteval_summary.md")) as f:
                result = f.read()

            s3_key = f"results/{agent_alias_name}/{uid}/results.md"
            s3_client.put_object(Bucket=bucket_name, Key=s3_key, Body=result)
        except Exception as e:
            # Report the failure in the payload instead of crashing, and keep
            # the traceback in the logs.
            logger.exception("Evaluation run failed for alias %s", agent_alias_name)
            status = "error"
            error_message = str(e)
    finally:
        # Free the limited /tmp space regardless of the outcome.
        shutil.rmtree(work_root, ignore_errors=True)

    finished_at = datetime.datetime.now().strftime(timestamp_format)

    response = {
        'created_at': created_at,
        'finished_at': finished_at,
        'target_type': yaml_data["target"]["type"],
        'status': status,
        'test_passed_rate': test_passed_rate,
    }
    if error_message is not None:
        response['error'] = error_message
    return response
def handler(event, context):
    """Apply a new instruction prompt to a Bedrock agent and prepare it.

    Args:
        event: Payload containing ``agent_id``, ``agent_name`` and ``prompt``.
        context: Lambda context (unused).

    Returns:
        dict: ``statusCode`` 200 and the updated agent's id under ``agentid``.
    """
    agent_id = event["agent_id"]
    agent_name = event["agent_name"]
    # Execution role ARN, read from the Lambda environment.
    agent_role = os.environ['AGENT_ROLE']

    update_request = {
        'agentId': agent_id,
        'agentName': agent_name,
        'agentResourceRoleArn': agent_role,
        'foundationModel': 'anthropic.claude-3-sonnet-20240229-v1:0',
        'instruction': event['prompt'],
    }

    client = boto3.client('bedrock-agent')
    client.update_agent(**update_request)
    # prepare_agent is requested immediately after the update.
    client.prepare_agent(agentId=agent_id)

    return {'statusCode': 200, 'agentid': agent_id}
architecture = _lambda.Architecture.X86_64
runtime = _lambda.Runtime.PYTHON_3_12


class StepfunctionsStack(Stack):
    """Step Functions workflow that evaluates prompt variants against a Bedrock agent.

    An "Object Created" event for a key starting with ``evaluation_prompts``
    in the evaluation bucket starts the state machine, which:

    1. loads the prompts and customer profiles (Generate Map State);
    2. for each prompt, one at a time: waits while the agent is UPDATING or
       VERSIONING, updates the agent, waits until it is PREPARED, creates an
       alias, runs one evaluation per scenario, then deletes the alias.
    """

    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        evaluation_bucket = cdk.aws_s3.Bucket(
            self,
            "EvaluationBucket",
            event_bridge_enabled=True,
        )

        agenteval_layer = Layer(
            self,
            "AgentEvalLayer",
            architecture=architecture,
            runtime=runtime,
            path=os.path.join(
                pathlib.Path(__file__).parent.resolve().parent,
                "layers",
                "agent-evaluation",
            ),
        )

        # Role handed to the agent by the update function.
        agent_role = iam.Role(
            self,
            "AgentRole",
            assumed_by=iam.ServicePrincipal("bedrock.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonBedrockFullAccess")
            ],
        )

        # ---- Lambda functions -------------------------------------------
        generate_map_function = self._python_function("GenerateMapFunction", "generate_map")
        get_status_function_1 = self._python_function("GetStatusFunction", "check_agent_status_1")
        update_agent_function = self._python_function("UpdateAgentFunction", "update_bedrock_agent")
        create_alias_function = self._python_function("CreateAliasFunction", "create_alias")
        get_status_function_2 = self._python_function("GetStatusFunction2", "check_agent_status_2")
        run_test_function = self._python_function(
            "RunTestFunction",
            "run_test",
            layers=[agenteval_layer.layer_version],
            environment={
                "EVALUATION_BUCKET": evaluation_bucket.bucket_name,
            },
        )
        delete_alias_function = self._python_function("DeleteAliasFunction", "delete_alias")

        update_agent_function.add_environment("AGENT_ROLE", agent_role.role_arn)

        # ---- Permissions ------------------------------------------------
        # S3 access is limited to the evaluation bucket: generate_map only
        # reads the uploaded scenario file and run_test only uploads results.
        evaluation_bucket.grant_read(generate_map_function)
        evaluation_bucket.grant_put(run_test_function)

        for bedrock_function in (
            get_status_function_1,
            get_status_function_2,
            update_agent_function,
            create_alias_function,
            run_test_function,
            delete_alias_function,
        ):
            self._allow_bedrock(bedrock_function)

        # The update function passes exactly one role to Bedrock.
        update_agent_function.add_to_role_policy(
            iam.PolicyStatement(
                actions=["iam:PassRole"],
                resources=[agent_role.role_arn],
            )
        )

        # ---- States -----------------------------------------------------
        generate_map_step = tasks.LambdaInvoke(
            self,
            "Generate Map State",
            lambda_function=generate_map_function,
            payload=sfn.TaskInput.from_json_path_at("$"),
            output_path=sfn.JsonPath.string_at("$.Payload"),
        )

        get_status_step_1 = tasks.LambdaInvoke(
            self,
            "Get Status 1",
            lambda_function=get_status_function_1,
            payload=sfn.TaskInput.from_json_path_at("$"),
            result_selector={
                "agentid": sfn.JsonPath.string_at("$.Payload.agent_id"),
                "agentstatus": sfn.JsonPath.string_at("$.Payload.agent_status"),
                "full_payload": sfn.JsonPath.string_at("$"),
            },
            result_path=sfn.JsonPath.string_at("$.status_output_1"),
        )

        update_agent_step = tasks.LambdaInvoke(
            self,
            "Update Agent",
            lambda_function=update_agent_function,
            payload=sfn.TaskInput.from_json_path_at("$"),
            result_path="$.update_output",
            result_selector={
                "agentid": sfn.JsonPath.string_at("$.Payload.agentid"),
            },
        )

        first_choice = sfn.Choice(self, "UpdateChoice1")
        # The agent cannot be updated while a previous change is in flight.
        condition1 = sfn.Condition.or_(
            sfn.Condition.string_equals("$.status_output_1.agentstatus", "UPDATING"),
            sfn.Condition.string_equals("$.status_output_1.agentstatus", "VERSIONING"),
        )
        wait_step = sfn.Wait(
            self,
            "Wait1",
            time=sfn.WaitTime.duration(Duration.seconds(30)),
        )

        create_alias_step = tasks.LambdaInvoke(
            self,
            "Create Alias",
            lambda_function=create_alias_function,
            payload=sfn.TaskInput.from_json_path_at("$"),
            output_path=sfn.JsonPath.string_at("$.Payload"),
        )

        get_status_step_2 = tasks.LambdaInvoke(
            self,
            "Get Status 2",
            lambda_function=get_status_function_2,
            payload=sfn.TaskInput.from_json_path_at("$"),
            result_selector={
                "agentid": sfn.JsonPath.string_at("$.Payload.agent_id"),
                "agentstatus": sfn.JsonPath.string_at("$.Payload.agent_status"),
                "full_payload": sfn.JsonPath.string_at("$"),
            },
            result_path="$.status_output_2",
        )

        second_choice = sfn.Choice(self, "UpdateChoice2")
        condition2 = sfn.Condition.not_(
            sfn.Condition.string_equals("$.status_output_2.agentstatus", "PREPARED"),
        )
        wait_step_2 = sfn.Wait(
            self,
            "Wait2",
            time=sfn.WaitTime.duration(Duration.seconds(30)),
        )

        # max_concurrency=1: an agent can only be updated one prompt at a time.
        agent_alias_map = sfn.Map(
            self,
            "Agent Alias Map",
            max_concurrency=1,
            items_path=sfn.JsonPath.string_at("$.body"),
            parameters={
                "agent_id": sfn.JsonPath.string_at("$.agent_id"),
                "agent_name": sfn.JsonPath.string_at("$.agent_name"),
                "prompt": sfn.JsonPath.string_at("$$.Map.Item.Value.prompt"),
                "scenarios": sfn.JsonPath.string_at("$$.Map.Item.Value.scenarios"),
            },
        )

        pass_step = sfn.Pass(self, "Pass State")

        run_test_step = tasks.LambdaInvoke(
            self,
            "Run Test",
            lambda_function=run_test_function,
            payload=sfn.TaskInput.from_json_path_at("$"),
            result_path="$.run_test",
        )

        # A failed evaluation must not abort the remaining scenarios.
        error_pass = sfn.Pass(self, "handle failure")
        run_test_step.add_catch(error_pass, result_path="$.error")

        test_map = sfn.Map(
            self,
            "Evaluation Map",
            items_path=sfn.JsonPath.string_at("$.scenarios"),
            parameters={
                "prompt": sfn.JsonPath.string_at("$.prompt"),
                "agent_id": sfn.JsonPath.string_at("$.agent_id"),
                "agent_alias_id": sfn.JsonPath.string_at("$.agent_alias_id"),
                "agent_alias_name": sfn.JsonPath.string_at("$.agent_alias_name"),
                "scenario": sfn.JsonPath.string_at("$$.Map.Item.Value"),
            },
            result_path="$.map_output",
        )

        delete_alias_step = tasks.LambdaInvoke(
            self,
            "Delete Alias",
            lambda_function=delete_alias_function,
            payload=sfn.TaskInput.from_json_path_at("$"),
            result_path="$.Payload",
        )

        # ---- Workflow definition ----------------------------------------
        test_map.iterator(run_test_step.next(pass_step))

        evaluate_with_alias = create_alias_step.next(test_map).next(delete_alias_step)

        # Poll until the updated agent reports PREPARED, then evaluate.
        await_prepared = get_status_step_2.next(
            second_choice.when(
                condition2, wait_step_2.next(get_status_step_2)
            ).otherwise(evaluate_with_alias)
        )

        # Poll while the agent is busy, then update it.
        map_definition = get_status_step_1.next(
            first_choice.when(
                condition1, wait_step.next(get_status_step_1)
            ).otherwise(update_agent_step.next(await_prepared))
        )

        agent_alias_map.iterator(map_definition)

        chain = generate_map_step.next(agent_alias_map)

        # No Bedrock policy is attached to the state machine role: every task
        # state is a Lambda invocation, and LambdaInvoke grants invoke access.
        evaluator_state_machine = sfn.StateMachine(
            self,
            "EvaluatorState",
            definition_body=sfn.DefinitionBody.from_chainable(chain),
        )

        # ---- Trigger ----------------------------------------------------
        on_put_rule = events.Rule(
            self,
            "InvokeState",
            event_pattern=events.EventPattern(
                source=["aws.s3"],
                detail_type=["Object Created"],
                detail={
                    "bucket": {"name": [evaluation_bucket.bucket_name]},
                    "object": {"key": [{"prefix": "evaluation_prompts"}]},
                },
            ),
        )
        on_put_rule.add_target(targets.SfnStateMachine(evaluator_state_machine))

    def _python_function(
        self, construct_id: str, source_dir: str, **kwargs
    ) -> _lambda.Function:
        """Create a Lambda function from ``functions/<source_dir>`` with the stack defaults.

        Args:
            construct_id: Construct id of the function within this stack.
            source_dir: Directory name under ``functions/`` holding ``index.py``.
            **kwargs: Extra ``aws_lambda.Function`` properties (e.g. ``layers``).
        """
        return _lambda.Function(
            self,
            construct_id,
            runtime=runtime,
            architecture=architecture,
            timeout=cdk.Duration.minutes(5),
            handler="index.handler",
            code=_lambda.Code.from_asset(
                os.path.join(
                    pathlib.Path(__file__).resolve().parent,
                    "functions",
                    source_dir,
                )
            ),
            **kwargs,
        )

    @staticmethod
    def _allow_bedrock(function: _lambda.Function) -> None:
        """Grant ``function`` access to Bedrock.

        NOTE(review): still a wildcard grant. Each handler appears to need
        only the action matching its API call (GetAgent, UpdateAgent and
        PrepareAgent, CreateAgentAlias, DeleteAgentAlias, and
        InvokeAgent/InvokeModel for the test runner) -- confirm against the
        Bedrock service authorization reference before narrowing.
        """
        function.add_to_role_policy(
            iam.PolicyStatement(
                actions=["bedrock:*"],
                resources=["*"],
            )
        )
To run these tests, uncomment this file along with the example +# resource in stepfunctions/stepfunctions_stack.py +def test_sqs_queue_created(): + app = core.App() + stack = StepfunctionsStack(app, "stepfunctions") + template = assertions.Template.from_stack(stack) + +# template.has_resource_properties("AWS::SQS::Queue", { +# "VisibilityTimeout": 300 +# }) From 6be2f9555933d8675ad3a4a52fcaacbbe4fbb67d Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Tue, 24 Sep 2024 13:46:41 +0100 Subject: [PATCH 2/7] feat: Add StepfunctionsStack for agent evaluation workflow - Implement StepfunctionsStack with AWS CDK - Add Lambda functions for various steps in the workflow: - generate_map - check_agent_status - create_alias - delete_alias - run_test - update_bedrock_agent - Create Step Functions state machine for agent evaluation process - Set up EventBridge rule to trigger workflow on S3 object creation - Configure necessary IAM roles and permissions - Implement error handling and retry logic in the state machine From adb8a16357c52ab56cd35970976d59f89500d1f0 Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Fri, 4 Oct 2024 16:46:40 +0100 Subject: [PATCH 3/7] fix: Resolved comments and performed linting checks --- .DS_Store | Bin 0 -> 8196 bytes demo/.DS_Store | Bin 0 -> 6148 bytes docs/.DS_Store | Bin 0 -> 6148 bytes src/.DS_Store | Bin 0 -> 6148 bytes stepfunctions/.DS_Store | Bin 0 -> 6148 bytes stepfunctions/.gitignore | 1 + .../INSTALLER | 1 - .../agent_evaluation-0.2.0.dist-info/LICENSE | 175 ------------- .../agent_evaluation-0.2.0.dist-info/METADATA | 74 ------ .../agent_evaluation-0.2.0.dist-info/NOTICE | 1 - .../agent_evaluation-0.2.0.dist-info/RECORD | 87 ------- .../REQUESTED | 0 .../agent_evaluation-0.2.0.dist-info/WHEEL | 5 - .../entry_points.txt | 2 - .../top_level.txt | 1 - stepfunctions/agenteval/__init__.py | 46 ---- stepfunctions/agenteval/cli.py | 109 -------- stepfunctions/agenteval/conversation.py | 35 --- stepfunctions/agenteval/defaults.py | 8 - 
.../agenteval/evaluators/__init__.py | 7 - .../agenteval/evaluators/base_evaluator.py | 139 ---------- .../agenteval/evaluators/claude_3/__init__.py | 6 - .../evaluators/claude_3/evaluator.py | 244 ------------------ .../evaluators/claude_3/model_configs.py | 26 -- .../claude_3/most_updated_prompt_2607.txt | 67 ----- .../agenteval/evaluators/evaluator_factory.py | 27 -- stepfunctions/agenteval/hook.py | 33 --- stepfunctions/agenteval/plan.py | 110 -------- stepfunctions/agenteval/runner/__init__.py | 6 - stepfunctions/agenteval/runner/runner.py | 116 --------- stepfunctions/agenteval/runner/summary.py | 30 --- stepfunctions/agenteval/target_response.py | 15 -- stepfunctions/agenteval/targets/__init__.py | 8 - .../agenteval/targets/base_target.py | 27 -- .../targets/bedrock_agent/__init__.py | 3 - .../agenteval/targets/bedrock_agent/target.py | 41 --- .../bedrock_knowledge_base/__init__.py | 3 - .../targets/bedrock_knowledge_base/target.py | 38 --- .../agenteval/targets/boto3_target.py | 41 --- .../agenteval/targets/q_business/__init__.py | 3 - .../agenteval/targets/q_business/target.py | 32 --- .../targets/sagemaker_endpoint/__init__.py | 3 - .../targets/sagemaker_endpoint/target.py | 85 ------ .../agenteval/targets/target_factory.py | 32 --- .../claude_3/generate_evaluation.jinja | 13 - .../claude_3/generate_initial_prompt.jinja | 5 - .../claude_3/generate_test_status.jinja | 13 - .../claude_3/generate_user_response.jinja | 13 - .../claude_3/system/generate_evaluation.jinja | 12 - .../system/generate_initial_prompt.jinja | 13 - .../system/generate_test_status.jinja | 13 - .../system/generate_user_response.jinja | 15 -- .../summary/agenteval_summary.md.jinja | 49 ---- stepfunctions/agenteval/test.py | 29 --- stepfunctions/agenteval/test_result.py | 27 -- stepfunctions/agenteval/trace.py | 72 ------ stepfunctions/agenteval/utils/__init__.py | 7 - stepfunctions/agenteval/utils/aws.py | 36 --- stepfunctions/agenteval/utils/imports.py | 35 --- stepfunctions/app.py | 2 
+- .../INSTALLER | 1 - .../agent_evaluation-0.2.0.dist-info/LICENSE | 175 ------------- .../agent_evaluation-0.2.0.dist-info/METADATA | 74 ------ .../agent_evaluation-0.2.0.dist-info/NOTICE | 1 - .../agent_evaluation-0.2.0.dist-info/RECORD | 87 ------- .../REQUESTED | 0 .../agent_evaluation-0.2.0.dist-info/WHEEL | 5 - .../entry_points.txt | 2 - .../top_level.txt | 1 - .../stepfunctions/agenteval/__init__.py | 46 ---- stepfunctions/stepfunctions/agenteval/cli.py | 109 -------- .../stepfunctions/agenteval/conversation.py | 35 --- .../stepfunctions/agenteval/defaults.py | 8 - .../agenteval/evaluators/__init__.py | 7 - .../agenteval/evaluators/base_evaluator.py | 139 ---------- .../agenteval/evaluators/claude_3/__init__.py | 6 - .../evaluators/claude_3/evaluator.py | 244 ------------------ .../evaluators/claude_3/model_configs.py | 26 -- .../claude_3/most_updated_prompt_2607.txt | 67 ----- .../agenteval/evaluators/evaluator_factory.py | 27 -- stepfunctions/stepfunctions/agenteval/hook.py | 33 --- stepfunctions/stepfunctions/agenteval/plan.py | 110 -------- .../agenteval/runner/__init__.py | 6 - .../stepfunctions/agenteval/runner/runner.py | 116 --------- .../stepfunctions/agenteval/runner/summary.py | 30 --- .../agenteval/target_response.py | 15 -- .../agenteval/targets/__init__.py | 8 - .../agenteval/targets/base_target.py | 27 -- .../targets/bedrock_agent/__init__.py | 3 - .../agenteval/targets/bedrock_agent/target.py | 41 --- .../bedrock_knowledge_base/__init__.py | 3 - .../targets/bedrock_knowledge_base/target.py | 38 --- .../agenteval/targets/boto3_target.py | 41 --- .../agenteval/targets/q_business/__init__.py | 3 - .../agenteval/targets/q_business/target.py | 32 --- .../targets/sagemaker_endpoint/__init__.py | 3 - .../targets/sagemaker_endpoint/target.py | 85 ------ .../agenteval/targets/target_factory.py | 32 --- .../claude_3/generate_evaluation.jinja | 13 - .../claude_3/generate_initial_prompt.jinja | 5 - .../claude_3/generate_test_status.jinja | 13 - 
.../claude_3/generate_user_response.jinja | 13 - .../claude_3/system/generate_evaluation.jinja | 12 - .../system/generate_initial_prompt.jinja | 13 - .../system/generate_test_status.jinja | 13 - .../system/generate_user_response.jinja | 15 -- .../summary/agenteval_summary.md.jinja | 49 ---- stepfunctions/stepfunctions/agenteval/test.py | 29 --- .../stepfunctions/agenteval/test_result.py | 27 -- .../stepfunctions/agenteval/trace.py | 72 ------ .../stepfunctions/agenteval/utils/__init__.py | 7 - .../stepfunctions/agenteval/utils/aws.py | 36 --- .../stepfunctions/agenteval/utils/imports.py | 35 --- .../.~c9_invoke_Zi2ZN1.py | 43 --- .../.~c9_invoke_Zi2ZN1.py | 43 --- 115 files changed, 2 insertions(+), 4163 deletions(-) create mode 100644 .DS_Store create mode 100644 demo/.DS_Store create mode 100644 docs/.DS_Store create mode 100644 src/.DS_Store create mode 100644 stepfunctions/.DS_Store delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt delete mode 100644 stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt delete mode 100644 stepfunctions/agenteval/__init__.py delete mode 100644 stepfunctions/agenteval/cli.py delete mode 100644 stepfunctions/agenteval/conversation.py delete mode 100644 stepfunctions/agenteval/defaults.py delete mode 100644 stepfunctions/agenteval/evaluators/__init__.py delete mode 100644 stepfunctions/agenteval/evaluators/base_evaluator.py delete mode 100644 
stepfunctions/agenteval/evaluators/claude_3/__init__.py delete mode 100644 stepfunctions/agenteval/evaluators/claude_3/evaluator.py delete mode 100644 stepfunctions/agenteval/evaluators/claude_3/model_configs.py delete mode 100644 stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt delete mode 100644 stepfunctions/agenteval/evaluators/evaluator_factory.py delete mode 100644 stepfunctions/agenteval/hook.py delete mode 100644 stepfunctions/agenteval/plan.py delete mode 100644 stepfunctions/agenteval/runner/__init__.py delete mode 100644 stepfunctions/agenteval/runner/runner.py delete mode 100644 stepfunctions/agenteval/runner/summary.py delete mode 100644 stepfunctions/agenteval/target_response.py delete mode 100644 stepfunctions/agenteval/targets/__init__.py delete mode 100644 stepfunctions/agenteval/targets/base_target.py delete mode 100644 stepfunctions/agenteval/targets/bedrock_agent/__init__.py delete mode 100644 stepfunctions/agenteval/targets/bedrock_agent/target.py delete mode 100644 stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py delete mode 100644 stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py delete mode 100644 stepfunctions/agenteval/targets/boto3_target.py delete mode 100644 stepfunctions/agenteval/targets/q_business/__init__.py delete mode 100644 stepfunctions/agenteval/targets/q_business/target.py delete mode 100644 stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py delete mode 100644 stepfunctions/agenteval/targets/sagemaker_endpoint/target.py delete mode 100644 stepfunctions/agenteval/targets/target_factory.py delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja delete mode 100644 
stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja delete mode 100644 stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja delete mode 100644 stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja delete mode 100644 stepfunctions/agenteval/test.py delete mode 100644 stepfunctions/agenteval/test_result.py delete mode 100644 stepfunctions/agenteval/trace.py delete mode 100644 stepfunctions/agenteval/utils/__init__.py delete mode 100644 stepfunctions/agenteval/utils/aws.py delete mode 100644 stepfunctions/agenteval/utils/imports.py delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt delete mode 100644 stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt delete mode 100644 stepfunctions/stepfunctions/agenteval/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/cli.py delete mode 100644 stepfunctions/stepfunctions/agenteval/conversation.py delete mode 100644 
stepfunctions/stepfunctions/agenteval/defaults.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/model_configs.py delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt delete mode 100644 stepfunctions/stepfunctions/agenteval/evaluators/evaluator_factory.py delete mode 100644 stepfunctions/stepfunctions/agenteval/hook.py delete mode 100644 stepfunctions/stepfunctions/agenteval/plan.py delete mode 100644 stepfunctions/stepfunctions/agenteval/runner/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/runner/runner.py delete mode 100644 stepfunctions/stepfunctions/agenteval/runner/summary.py delete mode 100644 stepfunctions/stepfunctions/agenteval/target_response.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/base_target.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/boto3_target.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/q_business/target.py delete mode 100644 
stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py delete mode 100644 stepfunctions/stepfunctions/agenteval/targets/target_factory.py delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja delete mode 100644 stepfunctions/stepfunctions/agenteval/test.py delete mode 100644 stepfunctions/stepfunctions/agenteval/test_result.py delete mode 100644 stepfunctions/stepfunctions/agenteval/trace.py delete mode 100644 stepfunctions/stepfunctions/agenteval/utils/__init__.py delete mode 100644 stepfunctions/stepfunctions/agenteval/utils/aws.py delete mode 100644 stepfunctions/stepfunctions/agenteval/utils/imports.py delete mode 100644 stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py delete mode 100644 stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 
0000000000000000000000000000000000000000..31c8f3d565bd06fb012adf4c4f64e9484137247e GIT binary patch literal 8196 zcmeHMzl#$=6n>M98qk9it~f<;LGh|PJxdE&&zzMY608pK^2>{oy$zRKVIy1F`Y%{m z*;v|$1{Dhn6)O=Nu@M9jODn(mkG2hofv+OBJ+6kN|Kph7&gHLpx z&M$BnTeRV%+~q^L%*x$Rl*x|qedA6luqdr4APQs^(0BJDEzt(ure5m){`U6|ZVRJw zxz>(q=qtVY^y1_F>u)Zm?TuV}r^UVvyze8F4h-^Ai?*qj%1$$ttOw_C?(7%uGMVg^;$x0CngO2<-K8Eisf+d|)v1cwqGp=g@kij6_ z2m5S_e0&x=i%iYWz63qYZ;R?_RUcf%9Gr#oJa+HhpU3t$nOKK=Dr-&U+y4QBVn^#; z--LR%@O#|EiuQ0SM58*Mm^*Sg-<$JVc%7UTIpcHPri+L30F}-6xg8xs#va+R}kdx z)|v1EdVJJ%R9&=RY|(_E(Js2*T*raQABK4Loh@(}TZ9GWpMMDGQCHsoMtW)XumXPp DhbvsD literal 0 HcmV?d00001 diff --git a/demo/.DS_Store b/demo/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fc99d1ff0b1477d9be91a94e43d8c93ba25abbc7 GIT binary patch literal 6148 zcmeHLy-EW?5S}%OhS;R?r@Kb16jKOM2%OeNY7v`} zBTI;k&v6Vj@W6#OoJXR~fKk9G@Y@uicQ*&?(8o8n2g>{FwhGmT?{y{NMpB&TJ5Js8 z>*&vRMlY$yz2}|e*ygumyQgY>3#Wt(@*E<^4h&UxD_*6jdsR+PyT#GvKIf$5d;Ww; ze1uwm?eTW#G$1NFWOAG{x zbYODx!DMA7cPNZk$N4QD4$Ky4dZU0*Kvy6oPV=<>SCaStdXhOa3K#|cl>#j7RGl(j zl3ZIO7e{NYh3yC%8N~$xc?x!NIpznn6x06-+|Xyy0irE25Qrma=7#{6!E{D}A64KB Dv*^o{ literal 0 HcmV?d00001 diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..96146a4ad277ff431179e98a7a82cb486675fbe5 GIT binary patch literal 6148 zcmeHLzfS@&6n>RUL>Oe$xSS3qj>Lc9B)FM4urcxDP9On~fRMQ8E&eC|5zY?$AC7Kr zu70m=j+VQ_Nn=bO>HB)GukFXzTuUJ$(O86aq8bsEQ5d}nT#j)&myGS`$N^CB7~?^` z)sH9BOr{Gs1)KtZQ2~B;J?hg4CA8jO5q+3#lepRJjpH6-<@dw4((Bdg!fp?@woiw~ z-XIcmr3yu~q%MtV0e?p^&1i0Mp#AHjJd@AEY04*e%t^fF_Fb;UQ|^nj2WCrfo#8X0 z5VZ}K;utJ*vP_O=?UZ9~=9=O5%v^pmaJrEUBf7;(Mz)IdBKc*qOn$3JgI`YOn&0+J zn_mYLQ8Lo<)Sy%jT0XpPw?E*=*&iLg$?VP64MtssQf~J_=)?F;OVD4mf!Q0D1^Z zLtXBnK%WD^Kx3j15tvY2fvT&}R}7)LBkt?GKx3j%btfTb#yEOrp>HTc&W^Y*(@6vh zop%a21#AUM>aoV>|7G$0-|pm|oB~dPf2Dx%nyqF7UrC>>nJ>p@t&h?{VdK0+q3i;u jk7HTkqnQ6!kcKv&E5JZwq7V_7`yn7@aLy_4s|tJpmb~Cn literal 0 HcmV?d00001 diff --git a/src/.DS_Store 
b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fa26f4d7cc7e170664efb0420068e4ac7e31253e GIT binary patch literal 6148 zcmeHK!AiqG5S@)hQoQtN@sb}<=pV!qFN&b_EY@0$g{0}B^_-vZBGzH{Pl3tNYfvRw79EgHpX_STj59F?pprA#dEt=HNTVNy*JITC0<^9?Eh=a z+WxlBcz-6wfH7bUe3t>#Y?g3G(0XIQ7%&Fb4Dk0MLSu{-OF{qXK zTk_cB`hWEK{lD&HPsV^T@UIwfon({@k&@QdMsi$hBeXFZ3)_`~YZnZBDTXbV;$1W^ Z&__G~#)_pNG!XkE;AybV82D8N-T{L(ZGiv) literal 0 HcmV?d00001 diff --git a/stepfunctions/.DS_Store b/stepfunctions/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..6f6161b8788a1e59e7fef2dbc9786f1245bb967c GIT binary patch literal 6148 zcmeHLJ#SMn6g^IpHUc44Dgy%ut72e6>dN$5g{>W`PE4UqN-0h9qI^_^>5Uz_Mg0+U z>VH7$z`)3gm>3z5*l2sMZ8i2wQUXI&Az#Tp&-T5~=N|dlaR8{5etij;2T*1aEZ$)? z#$;T|h!tGV1EMnT*l+ACZPe0cd&D<(L!HQf(ZlW-#nrb**L(XM_o~(3zbwaV-tRM>E+XJx(!#%F*h>z%6*BuH z9QFBa|2#S0wQInO_xX(X2Q_#rQ0D2|?{JahO@;2u;~gq77B(CBsn>2HOng&G4abR nnh;j*I5rP*6wCh##yFEVigs*Q3WmmtTyxP literal 0 HcmV?d00001 diff --git a/stepfunctions/.gitignore b/stepfunctions/.gitignore index 37833f8..c5aed14 100644 --- a/stepfunctions/.gitignore +++ b/stepfunctions/.gitignore @@ -4,6 +4,7 @@ __pycache__ .pytest_cache .venv *.egg-info +*.dist-info # CDK asset staging directory .cdk.staging diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE deleted file mode 100644 index 67db858..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE +++ /dev/null @@ -1,175 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA deleted file mode 100644 index 1070391..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA +++ /dev/null @@ -1,74 +0,0 @@ -Metadata-Version: 2.1 -Name: agent-evaluation -Version: 0.2.0 -Summary: A generative AI-powered framework for testing virtual agents. -Home-page: https://awslabs.github.io/agent-evaluation/ -Author: Amazon Web Services -Author-email: agent-evaluation-oss-core-team@amazon.com -License: Apache 2.0 -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: Topic :: Utilities -Classifier: Topic :: Software Development :: Testing -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3.11 -Classifier: Programming Language :: Python :: 3.12 -Requires-Python: >=3.9 -Description-Content-Type: text/markdown -License-File: LICENSE -License-File: NOTICE -Requires-Dist: pyyaml ~=6.0 -Requires-Dist: boto3 <2.0,>=1.34.20 -Requires-Dist: click ~=8.0 -Requires-Dist: pydantic <3.0,>=2.1.0 -Requires-Dist: rich <14.0,>=13.7.0 -Requires-Dist: jinja2 <4.0,>=3.1.3 -Requires-Dist: jsonpath-ng <2.0,>=1.6.1 -Provides-Extra: dev -Requires-Dist: flake8 ; extra == 'dev' -Requires-Dist: black ; extra == 'dev' -Requires-Dist: isort ; extra == 'dev' -Requires-Dist: pytest ; extra == 'dev' -Requires-Dist: pytest-cov ; extra == 'dev' -Requires-Dist: pytest-mock ; extra == 'dev' -Requires-Dist: mkdocs ; extra == 'dev' -Requires-Dist: mkdocs-material ; extra == 'dev' -Requires-Dist: mkdocstrings[python] ; extra == 'dev' -Requires-Dist: mkdocs-click ; extra == 'dev' -Requires-Dist: bandit ; extra == 'dev' -Requires-Dist: pip-audit ; extra == 'dev' - -![PyPI - 
Version](https://img.shields.io/pypi/v/agent-evaluation) -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) -![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) -[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) - -# Agent Evaluation - -Agent Evaluation is a generative AI-powered framework for testing virtual agents. - -Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. - -## ✨ Key features - -- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. -- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. -- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. -- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. - -## 📚 Documentation - -To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). 
To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) - -## 👏 Contributors - -Shout out to these awesome contributors: - - - - diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE deleted file mode 100644 index 616fc58..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE +++ /dev/null @@ -1 +0,0 @@ -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD deleted file mode 100644 index fcc2eac..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD +++ /dev/null @@ -1,87 +0,0 @@ -../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 -agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 -agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 -agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 -agent_evaluation-0.2.0.dist-info/RECORD,, -agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 -agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 -agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 -agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 -agenteval/__pycache__/__init__.cpython-311.pyc,, -agenteval/__pycache__/cli.cpython-311.pyc,, -agenteval/__pycache__/conversation.cpython-311.pyc,, -agenteval/__pycache__/defaults.cpython-311.pyc,, -agenteval/__pycache__/hook.cpython-311.pyc,, 
-agenteval/__pycache__/plan.cpython-311.pyc,, -agenteval/__pycache__/target_response.cpython-311.pyc,, -agenteval/__pycache__/test.cpython-311.pyc,, -agenteval/__pycache__/test_result.cpython-311.pyc,, -agenteval/__pycache__/trace.cpython-311.pyc,, -agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 -agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 -agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 -agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 -agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, -agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, -agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 -agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 -agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, -agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 -agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 -agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 -agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 -agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 -agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 -agenteval/runner/__pycache__/__init__.cpython-311.pyc,, -agenteval/runner/__pycache__/runner.cpython-311.pyc,, -agenteval/runner/__pycache__/summary.cpython-311.pyc,, -agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 
-agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 -agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 -agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 -agenteval/targets/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/__pycache__/base_target.cpython-311.pyc,, -agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, -agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, -agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 -agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 -agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 -agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 -agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 -agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 -agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 -agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, -agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 -agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 -agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, 
-agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 -agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 -agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 -agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 -agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 -agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 -agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 -agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 -agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 -agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 -agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 -agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 -agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 -agenteval/utils/__pycache__/__init__.cpython-311.pyc,, -agenteval/utils/__pycache__/aws.cpython-311.pyc,, -agenteval/utils/__pycache__/imports.cpython-311.pyc,, -agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 -agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git 
a/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL deleted file mode 100644 index bab98d6..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.43.0) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt deleted file mode 100644 index 6919bf1..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -agenteval = agenteval.cli:cli diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt deleted file mode 100644 index 060c7ea..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -agenteval diff --git a/stepfunctions/agenteval/__init__.py b/stepfunctions/agenteval/__init__.py deleted file mode 100644 index cd7bf51..0000000 --- a/stepfunctions/agenteval/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from importlib.metadata import version - -import logging -import os - -from jinja2 import Environment, PackageLoader, select_autoescape -from rich.logging import RichHandler - -from .hook import Hook -from .target_response import TargetResponse - -__all__ = ["Hook", "TargetResponse"] -__version__ = version("agent-evaluation") - - -_LOG_LEVEL_ENV = "LOG_LEVEL" - - -def configure_logger(): - # supress logs from botocore - logging.getLogger("botocore").setLevel(logging.CRITICAL) - - # configure logging using rich - formatter = logging.Formatter("%(message)s", datefmt="[%X]") - handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True) - handler.setFormatter(formatter) - - logger = logging.getLogger(__name__) - - logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO)) - logger.addHandler(handler) - - -configure_logger() - -jinja_env = Environment( - loader=PackageLoader(__name__), - autoescape=select_autoescape( - disabled_extensions=["jinja"], - default_for_string=True, - default=True, - ), -) diff --git a/stepfunctions/agenteval/cli.py b/stepfunctions/agenteval/cli.py deleted file mode 100644 index 940f621..0000000 --- a/stepfunctions/agenteval/cli.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -from typing import Optional - -import click - -from agenteval.plan import Plan -from agenteval.runner import Runner - -logger = logging.getLogger(__name__) - - -def validate_directory(directory): - if not os.path.isdir(directory): - raise NotADirectoryError(f"{directory} is not a directory") - if not os.access(directory, os.R_OK) or not os.access(directory, os.W_OK): - raise PermissionError(f"No read/write permissions for {directory}") - - -@click.group() -def cli(): - pass - - -@cli.command(help="Initialize a test plan.") -@click.option( - "--plan-dir", - type=str, - required=False, - help="The destination directory for storing the test plan. If unspecified, then the test plan is saved to the current working directory.", -) -def init(plan_dir: Optional[str]): - if plan_dir: - validate_directory(plan_dir) - try: - path = Plan.init_plan(plan_dir) - logger.info(f"[green]Test plan created at {path}") - - except FileExistsError as e: - logger.error(f"[red]{e}") - exit(1) - - -@cli.command(help="Run test plan.") -@click.option( - "--filter", - type=str, - required=False, - help="Specifies the test(s) to run. Multiple tests should be seperated using a comma. If unspecified, all tests from the test plan will be run.", -) -@click.option( - "--plan-dir", - type=str, - required=False, - help="The directory where the test plan is stored. If unspecified, then the current working directory is used.", -) -@click.option( - "--verbose", - is_flag=True, - type=bool, - default=False, - help="Controls the verbosity of the terminal logs.", -) -@click.option( - "--num-threads", - type=int, - required=False, - help="Number of threads (and thus tests) to run concurrently. If unspecified, number of threads will be capped at 45.", -) -@click.option( - "--work-dir", - type=str, - required=False, - help="The directory where the test result and trace will be generated. 
If unspecified, then the current working directory is used.", -) -def run( - filter: Optional[str], - plan_dir: Optional[str], - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], -): - try: - plan = Plan.load(plan_dir, filter) - if work_dir: - validate_directory(work_dir) - runner = Runner( - plan, - verbose, - num_threads, - work_dir, - ) - num_failed = runner.run() - _num_failed_exit(num_failed) - - except Exception as e: - _exception_exit(e) - - -def _num_failed_exit(num_failed): - exit(1 if num_failed else 0) - - -def _exception_exit(e): - logger.exception(f"Error running test: {e}") - exit(1) diff --git a/stepfunctions/agenteval/conversation.py b/stepfunctions/agenteval/conversation.py deleted file mode 100644 index 59e4304..0000000 --- a/stepfunctions/agenteval/conversation.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -_USER = "USER" -_AGENT = "AGENT" -_START_TURN_COUNT = 0 - - -class Conversation: - """Captures the interaction between a user and an agent. - - Attributes: - messages (list): A list of tuples of the form (role, message). - turns (int): The number of turns in the conversation. - """ - - def __init__(self): - self.messages = [] - self.turns = _START_TURN_COUNT - - def __iter__(self): - """Allow iteration over conversation messages.""" - return iter(self.messages) - - def add_turn(self, user_message: str, agent_response: str): - """Record a turn in the conversation. - - Args: - user_message (str): The users's message - agent_response (str): The agent's response to the user's message - - Increments the `turn` counter by `1`. 
- """ - self.messages.extend([(_USER, user_message), (_AGENT, agent_response)]) - self.turns += 1 diff --git a/stepfunctions/agenteval/defaults.py b/stepfunctions/agenteval/defaults.py deleted file mode 100644 index 929c675..0000000 --- a/stepfunctions/agenteval/defaults.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MAX_TURNS = 2 - -# Default max number of threads not exceeding Bedrock service quota: -# https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html -MAX_NUM_THREADS = 45 diff --git a/stepfunctions/agenteval/evaluators/__init__.py b/stepfunctions/agenteval/evaluators/__init__.py deleted file mode 100644 index 8e52702..0000000 --- a/stepfunctions/agenteval/evaluators/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_evaluator import BaseEvaluator -from .evaluator_factory import EvaluatorFactory - -__all__ = ["BaseEvaluator", "EvaluatorFactory"] diff --git a/stepfunctions/agenteval/evaluators/base_evaluator.py b/stepfunctions/agenteval/evaluators/base_evaluator.py deleted file mode 100644 index e1bd4c9..0000000 --- a/stepfunctions/agenteval/evaluators/base_evaluator.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import json -from abc import ABC, abstractmethod -from typing import Optional - -from agenteval.conversation import Conversation -from agenteval.hook import Hook -from agenteval.targets import BaseTarget -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace -from agenteval.utils import create_boto3_client, import_class - -_DEFAULT_MAX_RETRY = 10 -_BOTO3_SERVICE_NAME = "bedrock-runtime" - - -class BaseEvaluator(ABC): - """The `BaseEvaluator` abstract base class defines the common interface for evaluator - classes. - - Attributes: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - conversation (Conversation): Captures the interaction between a user and an agent. - trace (Trace): Captures steps during evaluation. - test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`. - input_token_count (int): Number of input tokens processed by the evaluator. - output_token_count (int): Number of output tokens generated by the evaluator. - model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, - then this will be set to the ARN of the provisioned throughput. - boto3_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime. - """ - - def __init__( - self, - test: Test, - target: BaseTarget, - work_dir: str, - model_id: str, - provisioned_throughput_arn: Optional[str] = None, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """Initialize the evaluator instance for a given `Test` and `Target`. - - Args: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - work_dir (str): The work directory. - model_id (str): The ID of the Bedrock model used to run evaluation. 
- provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput. - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - self.test = test - self.target = target - self.conversation = Conversation() - self.trace = Trace(work_dir=work_dir, test_name=test.name) - self.test_result = None - self.input_token_count = 0 - self.output_token_count = 0 - self.model_id = provisioned_throughput_arn or model_id - self.bedrock_runtime_client = create_boto3_client( - boto3_service_name=_BOTO3_SERVICE_NAME, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) - - @abstractmethod - def evaluate(self) -> TestResult: - """Conduct a test. - - Returns: - TestResult: The result of the test. - """ - pass - - def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]: - if hook: - hook_cls = import_class(hook, parent_class=Hook) - return hook_cls - - def invoke_model(self, request_body: dict) -> dict: - """ - Invoke the Bedrock model using the `boto3_client`. This method will convert - a request dictionary to a JSON string before passing it to the `InvokeModel` API. - - Refer to the `boto3` documentation for more details. - - Args: - request_body (dict): The request payload as a dictionary. - - Returns: - dict: The response from the model invocation. 
- - """ - response = self.bedrock_runtime_client.invoke_model( - modelId=self.model_id, body=json.dumps(request_body) - ) - - self._incr_token_counts(response) - - return response - - def _incr_token_counts(self, response: dict): - headers = response["ResponseMetadata"]["HTTPHeaders"] - - self.input_token_count += int( - headers.get("x-amzn-bedrock-input-token-count", 0) - ) - self.output_token_count += int( - headers.get("x-amzn-bedrock-output-token-count", 0) - ) - - def run(self) -> TestResult: - """ - Run the evaluator within a trace context manager and run hooks - if provided. - """ - - hook_cls = self._get_hook_cls(self.test.hook) - - with self.trace: - if hook_cls: - hook_cls.pre_evaluate(self.test, self.trace) - self.test_result = self.evaluate() - if hook_cls: - hook_cls.post_evaluate(self.test, self.test_result, self.trace) - - return self.test_result diff --git a/stepfunctions/agenteval/evaluators/claude_3/__init__.py b/stepfunctions/agenteval/evaluators/claude_3/__init__.py deleted file mode 100644 index 338be7d..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .evaluator import Claude3Evaluator - -__all__ = ["Claude3Evaluator"] diff --git a/stepfunctions/agenteval/evaluators/claude_3/evaluator.py b/stepfunctions/agenteval/evaluators/claude_3/evaluator.py deleted file mode 100644 index cc8b3ae..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/evaluator.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import json -import logging -import os -import re -from typing import Tuple - -from agenteval import jinja_env -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import model_configs -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_PROMPT_TEMPLATE_ROOT = "evaluators/claude_3" -_SYSTEM_PROMPT_DIR = "system" -_PROMPT_TEMPLATE_NAMES = [ - "generate_initial_prompt", - "generate_user_response", - "generate_test_status", - "generate_evaluation", -] - -# enable backwards-compatible StrEnum -try: - from enum import StrEnum -except ImportError: - from enum import Enum - - class StrEnum(str, Enum): - pass - - -class TestStatusCategories(StrEnum): - ALL_STEPS_ATTEMPTED = "A" - NOT_ALL_STEPS_ATTEMPTED = "B" - - -class EvaluationCategories(StrEnum): - ALL_EXPECTED_RESULTS_OBSERVED = "A" - NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B" - - -class Results(StrEnum): - MAX_TURNS_REACHED = "Maximum turns reached." - ALL_EXPECTED_RESULTS_OBSERVED = ( - "All of the expected results can be observed in the conversation." - ) - NOT_ALL_EXPECTED_RESULTS_OBSERVED = ( - "Not all of the expected results can be observed in the conversation." 
- ) - - -class Claude3Evaluator(BaseEvaluator): - def __init__( - self, - **kwargs, - ): - super().__init__(model_id=model_configs.MODEL_ID, **kwargs) - - self._prompt_template_map = { - name: { - "system": jinja_env.get_template( - os.path.join( - _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja" - ) - ), - "prompt": jinja_env.get_template( - os.path.join(_PROMPT_TEMPLATE_ROOT, f"{name}.jinja") - ), - } - for name in _PROMPT_TEMPLATE_NAMES - } - - @staticmethod - def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple: - content = [] - for e in element_names: - pattern = rf"<{e}>(.*?)" - match = re.search(pattern, xml_data, re.DOTALL) - content.append(match.group(1).strip() if match else None) - return tuple(content) - - def _generate( - self, - system_prompt: str, - prompt: str, - output_xml_element: str, - ) -> str: - request_body = model_configs.REQUEST_BODY - request_body["system"] = system_prompt - request_body["messages"][0]["content"][0]["text"] = prompt - - response = self.invoke_model(request_body=request_body) - response_body = response.get("body").read() - completion = json.loads(response_body)["content"][0]["text"] - - logger.debug( - f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}" - ) - - output, reasoning = self._extract_content_from_xml( - completion, [output_xml_element, "thinking"] - ) - - return output, reasoning - - def _generate_initial_prompt(self) -> str: - system_prompt = self._prompt_template_map["generate_initial_prompt"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_initial_prompt"]["prompt"].render( - step=self.test.steps[0] - ) - - initial_prompt, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="initial_prompt", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - initial_prompt=initial_prompt, - reasoning=reasoning, - ) - return initial_prompt - - def _generate_test_status(self) -> 
str: - system_prompt = self._prompt_template_map["generate_test_status"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_test_status"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - test_status, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - test_status=test_status, - reasoning=reasoning, - ) - return test_status - - def _generate_evaluation(self) -> tuple[str, str]: - system_prompt = self._prompt_template_map["generate_evaluation"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_evaluation"]["prompt"].render( - expected_results=self.test.expected_results, - conversation=self.conversation, - ) - - evaluation, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - evaluation=evaluation, - reasoning=reasoning, - ) - - return evaluation, reasoning - - def _generate_user_response(self) -> str: - system_prompt = self._prompt_template_map["generate_user_response"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_user_response"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - - user_response, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="user_response", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - user_response=user_response, - reasoning=reasoning, - ) - return user_response - - def _invoke_target(self, user_input) -> str: - target_response = self.target.invoke(user_input) - self.trace.add_step(data=target_response.data) - - return target_response.response - - def evaluate(self) -> TestResult: - success = False - result = Results.MAX_TURNS_REACHED.value - reasoning = "" - - while 
self.conversation.turns < self.test.max_turns: - if self.conversation.turns == 0: - # start conversation - if self.test.initial_prompt: - user_input = self.test.initial_prompt - else: - user_input = self._generate_initial_prompt() - else: - # generate next user response - user_input = self._generate_user_response() - - # add turn to the conversation - self.conversation.add_turn(user_input, self._invoke_target(user_input)) - - # get test status - test_status = self._generate_test_status() - if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED: - # evaluate conversation - eval_category, reasoning = self._generate_evaluation() - if ( - eval_category - == EvaluationCategories.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value # noqa: W503 - ): - result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value - else: - result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value - success = True - - break - - return TestResult( - test_name=self.test.name, - success=success, - result=result, - reasoning=reasoning, - conversation=self.conversation, - ) diff --git a/stepfunctions/agenteval/evaluators/claude_3/model_configs.py b/stepfunctions/agenteval/evaluators/claude_3/model_configs.py deleted file mode 100644 index e6bc2fc..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/model_configs.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" -ANTHROPIC_VERSION = "bedrock-2023-05-31" -ROLE = "user" -MAX_TOKENS_TO_SAMPLE = 300 -TEMPERATURE = 0 -TOP_K = 250 -TOP_P = 1 -REQUEST_BODY = { - "anthropic_version": ANTHROPIC_VERSION, - "max_tokens": MAX_TOKENS_TO_SAMPLE, - "system": None, - "messages": [ - { - "role": ROLE, - "content": [ - {"type": "text", "text": None}, - ], - } - ], - "temperature": TEMPERATURE, - "top_p": TOP_P, - "top_k": TOP_K, -} diff --git a/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt deleted file mode 100644 index fce3738..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt +++ /dev/null @@ -1,67 +0,0 @@ - -You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. - - - - - - - -You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to hose already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation. - - - - -Your approach is to: -1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing. 
-2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. - -Some example questions include: - - - -* Does the customer have a smart meter? -* Are they aware of Energy Hub? -* Are they on the right tariff? -* How many people are in their household? -* What is their current living situation (apartment, house, etc.)? - - - -Some examples of recommendations include: - - -* Smart meter installation for better usage monitoring -* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount - - - -Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. - - -Here is an example interaction: - - -A: how can I reduce my energy bill? - -B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So will ask some questions to understand you better. Is that okay? - -A: Yes - -B: What kind of house do you live in and with how many people? - -A: I live in a one-bedroom apartment with my partner? - -B: Thank you, and how do you measure your energy use? - -A: I send meter readings? - -B: Okay, so to confirm you don’t have a smart meter? - -A: No - -B: My first recommendation would be a smart meter. A smart meter is a way to ensure that your energy readings are always up to date and can assist with your payment if you are overpaying at some points in the year. Would you like some more recommendations? -... 
-[continues dialogue to gather more details if required and then provide a personalized recommendation] - - diff --git a/stepfunctions/agenteval/evaluators/evaluator_factory.py b/stepfunctions/agenteval/evaluators/evaluator_factory.py deleted file mode 100644 index d42f8e3..0000000 --- a/stepfunctions/agenteval/evaluators/evaluator_factory.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import Claude3Evaluator -from agenteval.targets import BaseTarget -from agenteval.test import Test - -_EVALUATOR_MAP = { - "claude-3": Claude3Evaluator, -} - - -class EvaluatorFactory(BaseModel): - config: dict - - def create( - self, test: Test, target: BaseTarget, work_dir: Optional[str] - ) -> BaseEvaluator: - evaluator_cls = _EVALUATOR_MAP[self.config["model"]] - return evaluator_cls( - test=test, - target=target, - work_dir=work_dir, - **{k: v for k, v in self.config.items() if k != "model"} - ) diff --git a/stepfunctions/agenteval/hook.py b/stepfunctions/agenteval/hook.py deleted file mode 100644 index a1386e6..0000000 --- a/stepfunctions/agenteval/hook.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace - - -class Hook: - """An evaluation hook.""" - - def pre_evaluate(test: Test, trace: Trace) -> None: - """ - Method called before evaluation. Can be used to perform any setup tasks. - - Args: - test (Test): The test case. - trace (Trace): Captures steps during evaluation. - """ - pass - - def post_evaluate(test: Test, test_result: TestResult, trace: Trace) -> None: - """ - Method called after evaluation. This may be used to perform integration testing - or clean up tasks. - - Args: - test (Test): The test case. 
- test_result (TestResult): The result of the test, which can be overriden - by updating the attributes of this object. - trace (Trace): Captures steps during evaluation. - """ - pass diff --git a/stepfunctions/agenteval/plan.py b/stepfunctions/agenteval/plan.py deleted file mode 100644 index 73a3107..0000000 --- a/stepfunctions/agenteval/plan.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import logging -import os -import sys -from typing import Optional - -import yaml -from pydantic import BaseModel, model_validator - -from agenteval import defaults -from agenteval.evaluators import EvaluatorFactory -from agenteval.targets import TargetFactory -from agenteval.test import Test - -_PLAN_FILE_NAME = "agenteval.yml" - -_INIT_PLAN = { - "evaluator": {"model": "claude-3"}, - "target": { - "type": "bedrock-agent", - "bedrock_agent_id": None, - "bedrock_agent_alias_id": None, - }, - "tests": { - "retrieve_missing_documents": { - "steps": ["Ask agent for a list of missing documents for claim-006."], - "expected_results": ["The agent returns a list of missing documents."], - } - }, -} - - -sys.path.append(".") -logger = logging.getLogger(__name__) - - -class Plan(BaseModel, validate_assignment=True, arbitrary_types_allowed=True): - evaluator_factory: EvaluatorFactory - target_factory: TargetFactory - tests: list[Test] - - @model_validator(mode="after") - def check_test_names_unique(self) -> Plan: - unique_names = len(set(test.name for test in self.tests)) - - if unique_names != len(self.tests): - raise ValueError("Test names must be unique") - - return self - - @classmethod - def load(cls, plan_dir: Optional[str], filter: str) -> Plan: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - plan = cls._load_yaml(plan_path) - - return cls( - evaluator_factory=EvaluatorFactory(config=plan["evaluator"]), - 
target_factory=TargetFactory(config=plan["target"]), - tests=cls._load_tests(plan["tests"], filter), - ) - - @staticmethod - def _load_yaml(path: str) -> dict: - with open(path) as stream: - return yaml.safe_load(stream) - - @staticmethod - def _load_tests(test_config: list[dict], filter: str) -> list[Test]: - tests = [] - - if filter: - names = Plan._parse_filter(filter) - else: - names = test_config.keys() - - for name in names: - config = test_config[name] - tests.append( - Test( - name=name, - steps=config["steps"], - expected_results=config["expected_results"], - initial_prompt=config.get("initial_prompt"), - max_turns=config.get("max_turns", defaults.MAX_TURNS), - hook=config.get("hook"), - ) - ) - - return tests - - @staticmethod - def _parse_filter(filter: str) -> list[str]: - return [n.strip() for n in filter.split(",")] - - @staticmethod - def init_plan(plan_dir: Optional[str]) -> str: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - - # check if plan exists - if os.path.exists(plan_path): - raise FileExistsError(f"Test plan already exists at {plan_path}") - - with open(plan_path, "w") as stream: - yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False) - - return plan_path diff --git a/stepfunctions/agenteval/runner/__init__.py b/stepfunctions/agenteval/runner/__init__.py deleted file mode 100644 index 32377b3..0000000 --- a/stepfunctions/agenteval/runner/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .runner import Runner - -__all__ = ["Runner"] diff --git a/stepfunctions/agenteval/runner/runner.py b/stepfunctions/agenteval/runner/runner.py deleted file mode 100644 index c3e0803..0000000 --- a/stepfunctions/agenteval/runner/runner.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import concurrent.futures -import logging -import os -import time -from typing import Optional - -from rich.progress import Progress - -from agenteval.defaults import MAX_NUM_THREADS -from agenteval.plan import Plan -from agenteval.runner.summary import create_markdown_summary - -logger = logging.getLogger(__name__) - - -class Runner: - def __init__( - self, - plan: Plan, - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], - ): - self.plan = plan - self.work_dir = work_dir if work_dir else os.getcwd() - self.num_tests = len(self.plan.tests) - self.verbose = verbose - self.num_threads = num_threads - if not self.num_threads: - self.num_threads = min(self.num_tests, MAX_NUM_THREADS) - self.results = {test.name: None for test in self.plan.tests} - self.num_failed = 0 - self.evaluator_input_token_counts = [] - self.evaluator_output_token_counts = [] - - def run(self) -> int: - self._log_run_start() - - self.start_time = time.time() - with Progress(transient=True) as self.progress: - self.tracker = self.progress.add_task("running...", total=self.num_tests) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.num_tests - ) as executor: - futures = [ - executor.submit(self.run_test, test) for test in self.plan.tests - ] - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - raise e - - self._log_run_end() - - create_markdown_summary( - self.work_dir, self.plan.tests, list(self.results.values()), self.verbose - ) - - return self.num_failed - - def run_test(self, test): - target = self.plan.target_factory.create() - evaluator = self.plan.evaluator_factory.create( - test=test, - target=target, - work_dir=self.work_dir, - ) - - result = evaluator.run() - if result.success is False: - self.num_failed += 1 - - self.progress.update(self.tracker, advance=1) - self.results[test.name] = result - 
self.evaluator_input_token_counts.append(evaluator.input_token_count) - self.evaluator_output_token_counts.append(evaluator.output_token_count) - - def _log_run_start(self): - logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.") - - def _log_run_end(self): - self._log_pass_fail_count() - logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.") - if self.verbose: - self._log_test_result() - self._log_evaluator_token_io() - - def _log_test_result(self): - for _, result in self.results.items(): - logger_func = logger.info if result.success else logger.error - logger_func( - f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}", - ) - - def _log_pass_fail_count(self): - passed_count = self.num_tests - self.num_failed - status_str = ( - f"[red]{passed_count} passed, {self.num_failed} failed." - if self.num_failed - else f"[green]{self.num_tests} passed." - ) - logger_func = logger.error if self.num_failed else logger.info - logger_func(status_str) - - def _log_evaluator_token_io(self): - logger.info( - f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}" - ) - logger.info( - f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}" - ) diff --git a/stepfunctions/agenteval/runner/summary.py b/stepfunctions/agenteval/runner/summary.py deleted file mode 100644 index 1abfaad..0000000 --- a/stepfunctions/agenteval/runner/summary.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import logging -import os - -from agenteval import jinja_env -from agenteval.test import Test -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_TEMPLATE_ROOT = "summary" -_TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja" - - -def create_markdown_summary( - work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool -): - template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME)) - - summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0]) - - rendered = template.render(tests=tests, results=test_results, zip=zip) - - with open(summary_path, "w+") as f: - f.write(rendered) - - if verbose: - logger.info(f"Summary available at {summary_path}") diff --git a/stepfunctions/agenteval/target_response.py b/stepfunctions/agenteval/target_response.py deleted file mode 100644 index 417543f..0000000 --- a/stepfunctions/agenteval/target_response.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class TargetResponse(BaseModel): - """A target's response. - - Attributes: - response: The response string. - data: Additional data (if applicable). - """ - - response: str - data: Optional[dict] = None diff --git a/stepfunctions/agenteval/targets/__init__.py b/stepfunctions/agenteval/targets/__init__.py deleted file mode 100644 index 910e303..0000000 --- a/stepfunctions/agenteval/targets/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from .base_target import BaseTarget -from .boto3_target import Boto3Target -from .target_factory import TargetFactory - -__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/agenteval/targets/base_target.py deleted file mode 100644 index f8fbaa8..0000000 --- a/stepfunctions/agenteval/targets/base_target.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from agenteval import TargetResponse - - -class BaseTarget(ABC): - """The `BaseTarget` abstract base class defines the common interface for target - classes. - """ - - @abstractmethod - def invoke(self, prompt: str) -> TargetResponse: - """Invoke the target with a prompt and return a response as a string. - - Args: - prompt: The prompt string to pass to the target. - - Returns: - A TargetResponse object containing the target's response string and - any trace data (if applicable). 
- """ - pass diff --git a/stepfunctions/agenteval/targets/bedrock_agent/__init__.py b/stepfunctions/agenteval/targets/bedrock_agent/__init__.py deleted file mode 100644 index 4d393ff..0000000 --- a/stepfunctions/agenteval/targets/bedrock_agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockAgentTarget - -__all__ = ["BedrockAgentTarget"] diff --git a/stepfunctions/agenteval/targets/bedrock_agent/target.py b/stepfunctions/agenteval/targets/bedrock_agent/target.py deleted file mode 100644 index f7e6f9c..0000000 --- a/stepfunctions/agenteval/targets/bedrock_agent/target.py +++ /dev/null @@ -1,41 +0,0 @@ -import uuid - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockAgentTarget(Boto3Target): - def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - self._bedrock_agent_id = bedrock_agent_id - self._bedrock_agent_alias_id = bedrock_agent_alias_id - self._session_id: str = str(uuid.uuid4()) - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "agentId": self._bedrock_agent_id, - "agentAliasId": self._bedrock_agent_alias_id, - "sessionId": self._session_id, - "inputText": prompt, - "enableTrace": True, - } - - response = self.boto3_client.invoke_agent(**args) - - stream = response["completion"] - completion = "" - trace_data = [] - - for event in stream: - chunk = event.get("chunk") - event_trace = event.get("trace") - if chunk: - completion += chunk.get("bytes").decode() - if event_trace: - trace_data.append(event_trace.get("trace")) - - return TargetResponse( - response=completion, data={"bedrock_agent_trace": trace_data} - ) diff --git a/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py b/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py deleted file mode 100644 index d56ea6f..0000000 --- 
a/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockKnowledgeBaseTarget - -__all__ = ["BedrockKnowledgeBaseTarget"] diff --git a/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py b/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py deleted file mode 100644 index a9491e2..0000000 --- a/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py +++ /dev/null @@ -1,38 +0,0 @@ -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockKnowledgeBaseTarget(Boto3Target): - def __init__(self, knowledge_base_id: str, model_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - aws_region = self.boto3_client.meta.region_name - self._knowledge_base_id = knowledge_base_id - self._model_arn = f"arn:aws:bedrock:{aws_region}::foundation-model/{model_id}" - self._session_id: str = None - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "input": { - "text": prompt, - }, - "retrieveAndGenerateConfiguration": { - "type": "KNOWLEDGE_BASE", - "knowledgeBaseConfiguration": { - "knowledgeBaseId": self._knowledge_base_id, - "modelArn": self._model_arn, - }, - }, - } - if self._session_id: - args["sessionId"] = self._session_id - - response = self.boto3_client.retrieve_and_generate(**args) - generated_text = response["output"]["text"] - citations = response["citations"] - self._session_id = response["sessionId"] - - return TargetResponse( - response=generated_text, data={"bedrock_knowledgebase_citations": citations} - ) diff --git a/stepfunctions/agenteval/targets/boto3_target.py b/stepfunctions/agenteval/targets/boto3_target.py deleted file mode 100644 index e47e8cb..0000000 --- a/stepfunctions/agenteval/targets/boto3_target.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Optional - -from agenteval.targets import BaseTarget -from agenteval.utils 
import create_boto3_client - -_DEFAULT_MAX_RETRY = 10 - - -class Boto3Target(BaseTarget): - """A target that can be interfaced with via the `boto3` library. - - Attributes: - boto3_client (BaseClient): A `boto3` client. - """ - - def __init__( - self, - boto3_service_name: str, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """ - Initialize the AWS target. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - - self.boto3_client = create_boto3_client( - boto3_service_name=boto3_service_name, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) diff --git a/stepfunctions/agenteval/targets/q_business/__init__.py b/stepfunctions/agenteval/targets/q_business/__init__.py deleted file mode 100644 index 3f621e5..0000000 --- a/stepfunctions/agenteval/targets/q_business/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import QBusinessTarget - -__all__ = ["QBusinessTarget"] diff --git a/stepfunctions/agenteval/targets/q_business/target.py b/stepfunctions/agenteval/targets/q_business/target.py deleted file mode 100644 index 8fd59be..0000000 --- a/stepfunctions/agenteval/targets/q_business/target.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Optional - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "qbusiness" - - -class QBusinessTarget(Boto3Target): - def __init__( - self, - q_business_application_id: str, - q_business_user_id: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._chat_sync_args = 
{"applicationId": q_business_application_id} - if q_business_user_id: - self._chat_sync_args["userId"] = q_business_user_id - - def invoke(self, prompt: str) -> str: - self._chat_sync_args["userMessage"] = prompt - - response = self.boto3_client.chat_sync(**self._chat_sync_args) - - if "conversationId" not in self._chat_sync_args: - self._chat_sync_args["conversationId"] = response["conversationId"] - - self._chat_sync_args["parentMessageId"] = response["systemMessageId"] - - return TargetResponse(response=response["systemMessage"]) diff --git a/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py b/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py deleted file mode 100644 index 8c9adc2..0000000 --- a/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import SageMakerEndpointTarget - -__all__ = ["SageMakerEndpointTarget"] diff --git a/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py b/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py deleted file mode 100644 index 74d2056..0000000 --- a/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py +++ /dev/null @@ -1,85 +0,0 @@ -import json -from typing import Optional - -from jsonpath_ng import parse - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "sagemaker-runtime" -_CONTENT_TYPE = "application/json" -_ACCEPT = "application/json" - - -class SageMakerEndpointTarget(Boto3Target): - def __init__( - self, - endpoint_name: str, - request_body: dict, - input_path: str, - output_path: str, - custom_attributes: Optional[str] = None, - target_model: Optional[str] = None, - target_variant: Optional[str] = None, - target_container_hostname: Optional[str] = None, - inference_component_name: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._request_body = request_body - self._input_jp_expr = parse(input_path) - 
self._output_jp_expr = parse(output_path) - - self._args = self._create_base_args( - endpoint_name, - custom_attributes, - target_model, - target_variant, - target_container_hostname, - inference_component_name, - ) - - @staticmethod - def _create_base_args( - endpoint_name: str, - custom_attributes: Optional[str], - target_model: Optional[str], - target_variant: Optional[str], - target_container_hostname: Optional[str], - inference_component_name: Optional[str], - ): - args = { - "EndpointName": endpoint_name, - "ContentType": _CONTENT_TYPE, - "Accept": _ACCEPT, - **{ - key: value - for key, value in { - "CustomAttributes": custom_attributes, - "TargetModel": target_model, - "TargetVariant": target_variant, - "TargetContainerHostname": target_container_hostname, - "InferenceComponentName": inference_component_name, - }.items() - if value is not None - }, - } - - return args - - def _update_request(self, prompt: str): - self._input_jp_expr.update(self._request_body, prompt) - self._args["Body"] = json.dumps(self._request_body) - - def _query_response(self, response_body: dict) -> str: - return self._output_jp_expr.find(response_body)[0].value - - def invoke(self, prompt: str) -> str: - self._update_request(prompt) - - response = self.boto3_client.invoke_endpoint(**self._args) - - response_body = json.loads(response.get("Body").read()) - - return TargetResponse(response=self._query_response(response_body)) diff --git a/stepfunctions/agenteval/targets/target_factory.py b/stepfunctions/agenteval/targets/target_factory.py deleted file mode 100644 index a8e7e9c..0000000 --- a/stepfunctions/agenteval/targets/target_factory.py +++ /dev/null @@ -1,32 +0,0 @@ -from pydantic import BaseModel - -from agenteval.targets import BaseTarget -from agenteval.targets.bedrock_agent import BedrockAgentTarget -from agenteval.targets.bedrock_knowledge_base import BedrockKnowledgeBaseTarget -from agenteval.targets.q_business import QBusinessTarget -from 
agenteval.targets.sagemaker_endpoint import SageMakerEndpointTarget -from agenteval.utils import import_class - -_TARGET_MAP = { - "bedrock-agent": BedrockAgentTarget, - "q-business": QBusinessTarget, - "sagemaker-endpoint": SageMakerEndpointTarget, - "bedrock-knowledgebase": BedrockKnowledgeBaseTarget, -} - - -class TargetFactory(BaseModel): - config: dict - - def create(self) -> BaseTarget: - target_cls = self._get_target_class() - - return target_cls(**{k: v for k, v in self.config.items() if k != "type"}) - - def _get_target_class(self) -> type[BaseTarget]: - if self.config["type"] in _TARGET_MAP: - target_cls = _TARGET_MAP[self.config["type"]] - else: - target_cls = import_class(self.config["type"], parent_class=BaseTarget) - - return target_cls diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja deleted file mode 100644 index 9cd9dd4..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the expected results and conversation: - - -{% for result in expected_results -%} -{{ loop.index }}. 
{{ result }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja deleted file mode 100644 index 832ba37..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja +++ /dev/null @@ -1,5 +0,0 @@ -Here is the step: - - -{{ step }} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. 
{{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja deleted file mode 100644 index 22cace3..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja +++ /dev/null @@ -1,12 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -Your job is to analyze the conversation in tags and a list of expected results -in tags. - -You will classify the the conversation into the following categories: - -- A: All of the expected results can be observed in the conversation. -- B: Not all of the expected results can be observed in the conversation. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja deleted file mode 100644 index d0e8e23..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given a step that is wrapped in tags. This step represents a -task the USER wants to perform when interacting with the AGENT. - -Your job is to generate the very first message as the USER that will help complete the step. - -Make sure this message is concise and to the point. - -Do not provide any information if it is expected that the AGENT will eventually ask for it. - -Please think hard about the response in tags before providing the message -within tags. 
\ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja deleted file mode 100644 index 7bb8e6b..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents a task -that the USER wants to perform when interacting with the AGENT. - -Your job is analyze the running conversation in tags and classify it into the following -categories: - -- A: The USER has attempted all the steps. -- B: The USER has not yet attempted all the steps. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja deleted file mode 100644 index e670420..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja +++ /dev/null @@ -1,15 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents -a task that the USER wants to perform when interacting with the AGENT. - -Using the list of steps, your job is analyze the running conversation in the - tags and generate the next appropriate response as the USER. - -Do not include any information from a step unless the AGENT asks for it. - -If the AGENT was unable to help or did not understand the last request, just move on to -the next step. Do not attempt to rephrase the request in the next response as the USER. 
- -Please think hard about the response in tags before providing the response -within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja deleted file mode 100644 index a624303..0000000 --- a/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja +++ /dev/null @@ -1,49 +0,0 @@ -# Test Summary ---- -This document provides a summary of the tests executed by Agent Evaluation. - -> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors. **Please thoroughly examine the results below prior to deciding whether to implement an agent.** ---- -## Tests -{% for test, result in zip(tests, results) -%} -- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) -{% endfor %} - ---- - - -{% for test, result in zip(tests, results) -%} -## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} - -**Steps** -{% for step in test.steps -%} -{{ loop.index }}. {{ step }} -{% endfor %} - -**Expected results** -{% for result in test.expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor %} - -**Conversation** -``` -{% for sender, message in result.conversation -%} -[{{ sender }}] {{ message }} -{% endfor -%} -``` - -**Result** -{{ result.result }} - -**Reasoning** -``` -{{ result.reasoning }} -``` - ---- -{% endfor %} - - - - - diff --git a/stepfunctions/agenteval/test.py b/stepfunctions/agenteval/test.py deleted file mode 100644 index 695f2fe..0000000 --- a/stepfunctions/agenteval/test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -from pydantic import BaseModel - - -class Test(BaseModel, validate_assignment=True): - """A test case for an agent. - - Attributes: - name: Name of the test. - steps: List of step to perform for the test. - expected_results: List of expected results for the test. - initial_prompt: Optional initial prompt. - max_turns: Maximum number of turns allowed for the test. - hook: The module path to an evaluation hook. - """ - - # do not collect as a test - __test__ = False - - name: str - steps: list[str] - expected_results: list[str] - initial_prompt: Optional[str] = None - max_turns: int - hook: Optional[str] = None diff --git a/stepfunctions/agenteval/test_result.py b/stepfunctions/agenteval/test_result.py deleted file mode 100644 index 5258aef..0000000 --- a/stepfunctions/agenteval/test_result.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from pydantic import BaseModel - -from agenteval.conversation import Conversation - - -class TestResult(BaseModel, arbitrary_types_allowed=True): - """The result of a test. - - Attributes: - test_name: Name of the test. - result: Description of the test result. - reasoning: The rationale for the test result. - success: `True` if the test passed, otherwise `False`. - conversation: Captures the interaction between a user and an agent. - """ - - # do not collect as a test - __test__ = False - - test_name: str - result: str - reasoning: str - success: bool - conversation: Conversation diff --git a/stepfunctions/agenteval/trace.py b/stepfunctions/agenteval/trace.py deleted file mode 100644 index 25d477a..0000000 --- a/stepfunctions/agenteval/trace.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import inspect -import json -import os -from datetime import datetime, timezone -from typing import Optional - -_TRACE_DIR = "agenteval_traces" - - -class Trace: - """Captures steps during evaluation. - - Attributes: - test_name (str): Name of the test. - trace_dir (str): Directory to store the trace. - start_time (datetime): Start time of the trace. - end_time (datetime): End time of the trace. - steps (list): List of steps in the trace. - - """ - - def __init__(self, test_name: str, work_dir: str): - """ - Initialize the trace handler. - - Args: - test_name (str): Name of the trace - """ - self.test_name = test_name - self.trace_dir = os.path.join(work_dir, _TRACE_DIR) - self.start_time = None - self.end_time = None - self.steps = [] - - def __enter__(self): - self.start_time = datetime.now(timezone.utc) - return self - - def __exit__(self, *exc): - self.end_time = datetime.now(timezone.utc) - self._dump_trace() - - def _dump_trace(self): - """Dump the trace to a JSON file.""" - - os.makedirs(self.trace_dir, exist_ok=True) - - with open(os.path.join(self.trace_dir, f"{self.test_name}.json"), "w") as f: - json.dump(self._get_trace(), f, default=str) - - def _get_trace(self) -> str: - return { - "test_name": self.test_name, - "start_time": self.start_time, - "end_time": self.end_time, - "steps": self.steps, - } - - def add_step(self, step_name: Optional[str] = None, **kwargs): - """Add a step to the trace. - - Args: - step_name (str, optional): The name of the step. 
Defaults to - the name of the caller function - """ - step_name = step_name or inspect.stack()[1].function - step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name} - step.update(kwargs) - self.steps.append(step) diff --git a/stepfunctions/agenteval/utils/__init__.py b/stepfunctions/agenteval/utils/__init__.py deleted file mode 100644 index 5f80a10..0000000 --- a/stepfunctions/agenteval/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .aws import create_boto3_client -from .imports import import_class - -__all__ = ["import_class", "create_boto3_client"] diff --git a/stepfunctions/agenteval/utils/aws.py b/stepfunctions/agenteval/utils/aws.py deleted file mode 100644 index 4d5d4dd..0000000 --- a/stepfunctions/agenteval/utils/aws.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import boto3 -from botocore.client import BaseClient -from botocore.config import Config - -_RETRY_MODE = "adaptive" - - -def create_boto3_client( - boto3_service_name: str, - aws_profile: Optional[str], - aws_region: Optional[str], - endpoint_url: Optional[str], - max_retry: int, -) -> BaseClient: - """Create a `boto3` client. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. 
- - Returns: - BaseClient - """ - - config = Config(retries={"max_attempts": max_retry, "mode": _RETRY_MODE}) - - session = boto3.Session(profile_name=aws_profile, region_name=aws_region) - return session.client(boto3_service_name, endpoint_url=endpoint_url, config=config) diff --git a/stepfunctions/agenteval/utils/imports.py b/stepfunctions/agenteval/utils/imports.py deleted file mode 100644 index f0e2685..0000000 --- a/stepfunctions/agenteval/utils/imports.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from importlib import import_module -from typing import Optional - -_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"] - - -def import_class(module_path: str, parent_class: Optional[type] = None) -> type: - name, class_name = module_path.rsplit(".", 1) - - # make sure module name starts with one of the allowed suffixes - _validate_module_name(name.split(".")[-1]) - - module = import_module(name) - cls = getattr(module, class_name) - - if parent_class: - # make sure the imported class is a subclass - _validate_subclass(cls, parent_class) - - return cls - - -def _validate_module_name(name: str) -> None: - if not any(name.endswith(suffix) for suffix in _ALLOWED_MODULE_NAME_SUFFIX): - raise ValueError(f"Invalid module name: {name}") - - -def _validate_subclass(child_class: type, parent_class: type) -> None: - if not issubclass(child_class, parent_class): - raise TypeError( - f"{child_class.__name__} is not a {parent_class.__name__} subclass" - ) diff --git a/stepfunctions/app.py b/stepfunctions/app.py index 5751ac6..bf4ff67 100644 --- a/stepfunctions/app.py +++ b/stepfunctions/app.py @@ -20,7 +20,7 @@ # Uncomment the next line if you know exactly what Account and Region you # want to deploy the stack to. 
*/ - env=cdk.Environment(region='us-east-1'), + # env=cdk.Environment(region='us-east-1'), # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html ) diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE deleted file mode 100644 index 67db858..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE +++ /dev/null @@ -1,175 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA deleted file mode 100644 index 1070391..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA +++ /dev/null @@ -1,74 +0,0 @@ -Metadata-Version: 2.1 -Name: agent-evaluation -Version: 0.2.0 -Summary: A generative AI-powered framework for testing virtual agents. 
-Home-page: https://awslabs.github.io/agent-evaluation/ -Author: Amazon Web Services -Author-email: agent-evaluation-oss-core-team@amazon.com -License: Apache 2.0 -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: Topic :: Utilities -Classifier: Topic :: Software Development :: Testing -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3.11 -Classifier: Programming Language :: Python :: 3.12 -Requires-Python: >=3.9 -Description-Content-Type: text/markdown -License-File: LICENSE -License-File: NOTICE -Requires-Dist: pyyaml ~=6.0 -Requires-Dist: boto3 <2.0,>=1.34.20 -Requires-Dist: click ~=8.0 -Requires-Dist: pydantic <3.0,>=2.1.0 -Requires-Dist: rich <14.0,>=13.7.0 -Requires-Dist: jinja2 <4.0,>=3.1.3 -Requires-Dist: jsonpath-ng <2.0,>=1.6.1 -Provides-Extra: dev -Requires-Dist: flake8 ; extra == 'dev' -Requires-Dist: black ; extra == 'dev' -Requires-Dist: isort ; extra == 'dev' -Requires-Dist: pytest ; extra == 'dev' -Requires-Dist: pytest-cov ; extra == 'dev' -Requires-Dist: pytest-mock ; extra == 'dev' -Requires-Dist: mkdocs ; extra == 'dev' -Requires-Dist: mkdocs-material ; extra == 'dev' -Requires-Dist: mkdocstrings[python] ; extra == 'dev' -Requires-Dist: mkdocs-click ; extra == 'dev' -Requires-Dist: bandit ; extra == 'dev' -Requires-Dist: pip-audit ; extra == 'dev' - -![PyPI - Version](https://img.shields.io/pypi/v/agent-evaluation) -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) -![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) -[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) -[![Code style: 
black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) - -# Agent Evaluation - -Agent Evaluation is a generative AI-powered framework for testing virtual agents. - -Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. - -## ✨ Key features - -- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. -- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. -- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. -- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. - -## 📚 Documentation - -To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) - -## 👏 Contributors - -Shout out to these awesome contributors: - - - - diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE deleted file mode 100644 index 616fc58..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE +++ /dev/null @@ -1 +0,0 @@ -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD deleted file mode 100644 index fcc2eac..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD +++ /dev/null @@ -1,87 +0,0 @@ -../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 -agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 -agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 -agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 -agent_evaluation-0.2.0.dist-info/RECORD,, -agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 -agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 -agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 -agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 -agenteval/__pycache__/__init__.cpython-311.pyc,, -agenteval/__pycache__/cli.cpython-311.pyc,, -agenteval/__pycache__/conversation.cpython-311.pyc,, -agenteval/__pycache__/defaults.cpython-311.pyc,, -agenteval/__pycache__/hook.cpython-311.pyc,, -agenteval/__pycache__/plan.cpython-311.pyc,, -agenteval/__pycache__/target_response.cpython-311.pyc,, -agenteval/__pycache__/test.cpython-311.pyc,, -agenteval/__pycache__/test_result.cpython-311.pyc,, -agenteval/__pycache__/trace.cpython-311.pyc,, -agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 -agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 
-agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 -agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 -agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, -agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, -agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 -agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 -agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, -agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 -agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 -agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 -agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 -agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 -agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 -agenteval/runner/__pycache__/__init__.cpython-311.pyc,, -agenteval/runner/__pycache__/runner.cpython-311.pyc,, -agenteval/runner/__pycache__/summary.cpython-311.pyc,, -agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 -agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 -agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 -agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 -agenteval/targets/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/__pycache__/base_target.cpython-311.pyc,, -agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, 
-agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, -agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 -agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 -agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 -agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 -agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 -agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 -agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 -agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, -agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 -agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 -agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 -agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 -agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 -agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 
-agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 -agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 -agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 -agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 -agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 -agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 -agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 -agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 -agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 -agenteval/utils/__pycache__/__init__.cpython-311.pyc,, -agenteval/utils/__pycache__/aws.cpython-311.pyc,, -agenteval/utils/__pycache__/imports.cpython-311.pyc,, -agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 -agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL deleted file mode 100644 index bab98d6..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL 
+++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.43.0) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt deleted file mode 100644 index 6919bf1..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -agenteval = agenteval.cli:cli diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt deleted file mode 100644 index 060c7ea..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -agenteval diff --git a/stepfunctions/stepfunctions/agenteval/__init__.py b/stepfunctions/stepfunctions/agenteval/__init__.py deleted file mode 100644 index cd7bf51..0000000 --- a/stepfunctions/stepfunctions/agenteval/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from importlib.metadata import version - -import logging -import os - -from jinja2 import Environment, PackageLoader, select_autoescape -from rich.logging import RichHandler - -from .hook import Hook -from .target_response import TargetResponse - -__all__ = ["Hook", "TargetResponse"] -__version__ = version("agent-evaluation") - - -_LOG_LEVEL_ENV = "LOG_LEVEL" - - -def configure_logger(): - # supress logs from botocore - logging.getLogger("botocore").setLevel(logging.CRITICAL) - - # configure logging using rich - formatter = logging.Formatter("%(message)s", datefmt="[%X]") - handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True) - handler.setFormatter(formatter) - - logger = logging.getLogger(__name__) - - logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO)) - logger.addHandler(handler) - - -configure_logger() - -jinja_env = Environment( - loader=PackageLoader(__name__), - autoescape=select_autoescape( - disabled_extensions=["jinja"], - default_for_string=True, - default=True, - ), -) diff --git a/stepfunctions/stepfunctions/agenteval/cli.py b/stepfunctions/stepfunctions/agenteval/cli.py deleted file mode 100644 index 940f621..0000000 --- a/stepfunctions/stepfunctions/agenteval/cli.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -from typing import Optional - -import click - -from agenteval.plan import Plan -from agenteval.runner import Runner - -logger = logging.getLogger(__name__) - - -def validate_directory(directory): - if not os.path.isdir(directory): - raise NotADirectoryError(f"{directory} is not a directory") - if not os.access(directory, os.R_OK) or not os.access(directory, os.W_OK): - raise PermissionError(f"No read/write permissions for {directory}") - - -@click.group() -def cli(): - pass - - -@cli.command(help="Initialize a test plan.") -@click.option( - "--plan-dir", - type=str, - required=False, - help="The destination directory for storing the test plan. If unspecified, then the test plan is saved to the current working directory.", -) -def init(plan_dir: Optional[str]): - if plan_dir: - validate_directory(plan_dir) - try: - path = Plan.init_plan(plan_dir) - logger.info(f"[green]Test plan created at {path}") - - except FileExistsError as e: - logger.error(f"[red]{e}") - exit(1) - - -@cli.command(help="Run test plan.") -@click.option( - "--filter", - type=str, - required=False, - help="Specifies the test(s) to run. Multiple tests should be seperated using a comma. If unspecified, all tests from the test plan will be run.", -) -@click.option( - "--plan-dir", - type=str, - required=False, - help="The directory where the test plan is stored. If unspecified, then the current working directory is used.", -) -@click.option( - "--verbose", - is_flag=True, - type=bool, - default=False, - help="Controls the verbosity of the terminal logs.", -) -@click.option( - "--num-threads", - type=int, - required=False, - help="Number of threads (and thus tests) to run concurrently. If unspecified, number of threads will be capped at 45.", -) -@click.option( - "--work-dir", - type=str, - required=False, - help="The directory where the test result and trace will be generated. 
If unspecified, then the current working directory is used.", -) -def run( - filter: Optional[str], - plan_dir: Optional[str], - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], -): - try: - plan = Plan.load(plan_dir, filter) - if work_dir: - validate_directory(work_dir) - runner = Runner( - plan, - verbose, - num_threads, - work_dir, - ) - num_failed = runner.run() - _num_failed_exit(num_failed) - - except Exception as e: - _exception_exit(e) - - -def _num_failed_exit(num_failed): - exit(1 if num_failed else 0) - - -def _exception_exit(e): - logger.exception(f"Error running test: {e}") - exit(1) diff --git a/stepfunctions/stepfunctions/agenteval/conversation.py b/stepfunctions/stepfunctions/agenteval/conversation.py deleted file mode 100644 index 59e4304..0000000 --- a/stepfunctions/stepfunctions/agenteval/conversation.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -_USER = "USER" -_AGENT = "AGENT" -_START_TURN_COUNT = 0 - - -class Conversation: - """Captures the interaction between a user and an agent. - - Attributes: - messages (list): A list of tuples of the form (role, message). - turns (int): The number of turns in the conversation. - """ - - def __init__(self): - self.messages = [] - self.turns = _START_TURN_COUNT - - def __iter__(self): - """Allow iteration over conversation messages.""" - return iter(self.messages) - - def add_turn(self, user_message: str, agent_response: str): - """Record a turn in the conversation. - - Args: - user_message (str): The users's message - agent_response (str): The agent's response to the user's message - - Increments the `turn` counter by `1`. 
- """ - self.messages.extend([(_USER, user_message), (_AGENT, agent_response)]) - self.turns += 1 diff --git a/stepfunctions/stepfunctions/agenteval/defaults.py b/stepfunctions/stepfunctions/agenteval/defaults.py deleted file mode 100644 index 929c675..0000000 --- a/stepfunctions/stepfunctions/agenteval/defaults.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MAX_TURNS = 2 - -# Default max number of threads not exceeding Bedrock service quota: -# https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html -MAX_NUM_THREADS = 45 diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py b/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py deleted file mode 100644 index 8e52702..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_evaluator import BaseEvaluator -from .evaluator_factory import EvaluatorFactory - -__all__ = ["BaseEvaluator", "EvaluatorFactory"] diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py b/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py deleted file mode 100644 index e1bd4c9..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import json -from abc import ABC, abstractmethod -from typing import Optional - -from agenteval.conversation import Conversation -from agenteval.hook import Hook -from agenteval.targets import BaseTarget -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace -from agenteval.utils import create_boto3_client, import_class - -_DEFAULT_MAX_RETRY = 10 -_BOTO3_SERVICE_NAME = "bedrock-runtime" - - -class BaseEvaluator(ABC): - """The `BaseEvaluator` abstract base class defines the common interface for evaluator - classes. - - Attributes: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - conversation (Conversation): Captures the interaction between a user and an agent. - trace (Trace): Captures steps during evaluation. - test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`. - input_token_count (int): Number of input tokens processed by the evaluator. - output_token_count (int): Number of output tokens generated by the evaluator. - model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, - then this will be set to the ARN of the provisioned throughput. - boto3_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime. - """ - - def __init__( - self, - test: Test, - target: BaseTarget, - work_dir: str, - model_id: str, - provisioned_throughput_arn: Optional[str] = None, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """Initialize the evaluator instance for a given `Test` and `Target`. - - Args: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - work_dir (str): The work directory. - model_id (str): The ID of the Bedrock model used to run evaluation. 
- provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput. - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - self.test = test - self.target = target - self.conversation = Conversation() - self.trace = Trace(work_dir=work_dir, test_name=test.name) - self.test_result = None - self.input_token_count = 0 - self.output_token_count = 0 - self.model_id = provisioned_throughput_arn or model_id - self.bedrock_runtime_client = create_boto3_client( - boto3_service_name=_BOTO3_SERVICE_NAME, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) - - @abstractmethod - def evaluate(self) -> TestResult: - """Conduct a test. - - Returns: - TestResult: The result of the test. - """ - pass - - def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]: - if hook: - hook_cls = import_class(hook, parent_class=Hook) - return hook_cls - - def invoke_model(self, request_body: dict) -> dict: - """ - Invoke the Bedrock model using the `boto3_client`. This method will convert - a request dictionary to a JSON string before passing it to the `InvokeModel` API. - - Refer to the `boto3` documentation for more details. - - Args: - request_body (dict): The request payload as a dictionary. - - Returns: - dict: The response from the model invocation. 
- - """ - response = self.bedrock_runtime_client.invoke_model( - modelId=self.model_id, body=json.dumps(request_body) - ) - - self._incr_token_counts(response) - - return response - - def _incr_token_counts(self, response: dict): - headers = response["ResponseMetadata"]["HTTPHeaders"] - - self.input_token_count += int( - headers.get("x-amzn-bedrock-input-token-count", 0) - ) - self.output_token_count += int( - headers.get("x-amzn-bedrock-output-token-count", 0) - ) - - def run(self) -> TestResult: - """ - Run the evaluator within a trace context manager and run hooks - if provided. - """ - - hook_cls = self._get_hook_cls(self.test.hook) - - with self.trace: - if hook_cls: - hook_cls.pre_evaluate(self.test, self.trace) - self.test_result = self.evaluate() - if hook_cls: - hook_cls.post_evaluate(self.test, self.test_result, self.trace) - - return self.test_result diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py deleted file mode 100644 index 338be7d..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .evaluator import Claude3Evaluator - -__all__ = ["Claude3Evaluator"] diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py deleted file mode 100644 index cc8b3ae..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# enable backwards-compatible StrEnum
try:
    from enum import StrEnum
except ImportError:  # enum.StrEnum is not available on this interpreter
    from enum import Enum

    class StrEnum(str, Enum):
        """Minimal stand-in for ``enum.StrEnum`` on interpreters that lack it."""

        def __str__(self) -> str:
            # enum.StrEnum renders a member as its value; a plain (str, Enum)
            # mix-in would render "ClassName.MEMBER" instead, so str() and
            # f-strings would differ between interpreter versions.
            return str(self.value)


class TestStatusCategories(StrEnum):
    """Category letters compared against the evaluator's test-status answer."""

    ALL_STEPS_ATTEMPTED = "A"
    NOT_ALL_STEPS_ATTEMPTED = "B"


class EvaluationCategories(StrEnum):
    """Category letters compared against the evaluator's final verdict."""

    ALL_EXPECTED_RESULTS_OBSERVED = "A"
    NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B"


class Results(StrEnum):
    """Human-readable result messages reported in the test result."""

    MAX_TURNS_REACHED = "Maximum turns reached."
    ALL_EXPECTED_RESULTS_OBSERVED = (
        "All of the expected results can be observed in the conversation."
    )
    NOT_ALL_EXPECTED_RESULTS_OBSERVED = (
        "Not all of the expected results can be observed in the conversation."
    )
class Claude3Evaluator(BaseEvaluator):
    """Evaluator that uses Claude 3 to play the user and to judge the conversation.

    For every name in ``_PROMPT_TEMPLATE_NAMES`` a system template and a prompt
    template are loaded. Each generation step renders both, invokes the model
    and reads the answer (plus the model's ``thinking``) from XML elements in
    the completion.
    """

    def __init__(
        self,
        **kwargs,
    ):
        """Initialize the evaluator and load its prompt templates.

        Args:
            **kwargs: Forwarded unchanged to ``BaseEvaluator.__init__``.
        """
        super().__init__(model_id=model_configs.MODEL_ID, **kwargs)

        # Template names are joined with "/" rather than os.path.join:
        # Jinja2 template names always use forward slashes, so an
        # OS-specific separator breaks the lookup on Windows.
        # NOTE(review): assumes jinja_env is a jinja2.Environment - confirm.
        self._prompt_template_map = {
            name: {
                "system": jinja_env.get_template(
                    f"{_PROMPT_TEMPLATE_ROOT}/{_SYSTEM_PROMPT_DIR}/{name}.jinja"
                ),
                "prompt": jinja_env.get_template(
                    f"{_PROMPT_TEMPLATE_ROOT}/{name}.jinja"
                ),
            }
            for name in _PROMPT_TEMPLATE_NAMES
        }

    @staticmethod
    def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple:
        """Return the stripped text of the first occurrence of each element.

        Args:
            xml_data (str): Text that may contain ``<name>...</name>`` elements.
            element_names (list[str]): Element names to look for, in order.

        Returns:
            Tuple: One entry per requested name; ``None`` where the element
            is not present.
        """
        content = []
        for e in element_names:
            tag = re.escape(e)
            # The closing tag must be part of the pattern: without it the lazy
            # group matches the empty string and every element reads as "".
            pattern = rf"<{tag}>(.*?)</{tag}>"
            match = re.search(pattern, xml_data, re.DOTALL)
            content.append(match.group(1).strip() if match else None)
        return tuple(content)

    def _generate(
        self,
        system_prompt: str,
        prompt: str,
        output_xml_element: str,
    ) -> Tuple:
        """Invoke the model once and extract the requested element.

        Args:
            system_prompt (str): Rendered system prompt.
            prompt (str): Rendered user prompt.
            output_xml_element (str): Name of the element holding the answer.

        Returns:
            Tuple: ``(output, reasoning)`` where ``reasoning`` is the content
            of the ``thinking`` element; either may be ``None`` when the
            element is missing from the completion.
        """
        # Imported locally so the module's import block does not have to change.
        import copy

        # Work on a private copy: REQUEST_BODY is a module-level dict shared by
        # every evaluator, and the runner executes evaluators on several threads.
        request_body = copy.deepcopy(model_configs.REQUEST_BODY)
        request_body["system"] = system_prompt
        request_body["messages"][0]["content"][0]["text"] = prompt

        response = self.invoke_model(request_body=request_body)
        response_body = response.get("body").read()
        completion = json.loads(response_body)["content"][0]["text"]

        logger.debug(
            f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}"
        )

        output, reasoning = self._extract_content_from_xml(
            completion, [output_xml_element, "thinking"]
        )

        return output, reasoning

    def _generate_initial_prompt(self) -> str:
        """Generate the first user message from the first test step."""
        templates = self._prompt_template_map["generate_initial_prompt"]
        system_prompt = templates["system"].render()
        prompt = templates["prompt"].render(step=self.test.steps[0])

        initial_prompt, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="initial_prompt",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            initial_prompt=initial_prompt,
            reasoning=reasoning,
        )
        return initial_prompt

    def _generate_test_status(self) -> str:
        """Ask the model to categorise the test status from the steps and conversation."""
        templates = self._prompt_template_map["generate_test_status"]
        system_prompt = templates["system"].render()
        prompt = templates["prompt"].render(
            steps=self.test.steps, conversation=self.conversation
        )

        test_status, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="category",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            test_status=test_status,
            reasoning=reasoning,
        )
        return test_status

    def _generate_evaluation(self) -> tuple[str, str]:
        """Ask the model to judge the conversation against the expected results.

        Returns:
            tuple[str, str]: The evaluation category and the model's reasoning.
        """
        templates = self._prompt_template_map["generate_evaluation"]
        system_prompt = templates["system"].render()
        prompt = templates["prompt"].render(
            expected_results=self.test.expected_results,
            conversation=self.conversation,
        )

        evaluation, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="category",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            evaluation=evaluation,
            reasoning=reasoning,
        )

        return evaluation, reasoning

    def _generate_user_response(self) -> str:
        """Generate the next user message from the steps and the conversation so far."""
        templates = self._prompt_template_map["generate_user_response"]
        system_prompt = templates["system"].render()
        prompt = templates["prompt"].render(
            steps=self.test.steps, conversation=self.conversation
        )

        user_response, reasoning = self._generate(
            system_prompt=system_prompt,
            prompt=prompt,
            output_xml_element="user_response",
        )

        self.trace.add_step(
            system_prompt=system_prompt,
            prompt=prompt,
            user_response=user_response,
            reasoning=reasoning,
        )
        return user_response

    def _invoke_target(self, user_input) -> str:
        """Send ``user_input`` to the target, trace its data and return its reply."""
        target_response = self.target.invoke(user_input)
        self.trace.add_step(data=target_response.data)

        return target_response.response

    def evaluate(self) -> TestResult:
        """Converse with the target until the test can be judged or turns run out.

        The loop adds one turn per iteration. After each turn the model is
        asked for the test status; once that status equals
        ``TestStatusCategories.ALL_STEPS_ATTEMPTED`` the conversation is
        evaluated and the loop ends. If ``test.max_turns`` is reached first,
        the test fails with ``Results.MAX_TURNS_REACHED``.

        Returns:
            TestResult: The outcome, including the reasoning and conversation.
        """
        success = False
        result = Results.MAX_TURNS_REACHED.value
        reasoning = ""

        while self.conversation.turns < self.test.max_turns:
            if self.conversation.turns == 0:
                # start conversation
                if self.test.initial_prompt:
                    user_input = self.test.initial_prompt
                else:
                    user_input = self._generate_initial_prompt()
            else:
                # generate next user response
                user_input = self._generate_user_response()

            # add turn to the conversation
            self.conversation.add_turn(user_input, self._invoke_target(user_input))

            # get test status
            test_status = self._generate_test_status()
            if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED:
                # evaluate conversation
                eval_category, reasoning = self._generate_evaluation()
                # Pass only on an explicit "all observed" verdict, so that a
                # missing or unrecognised category is never counted as success.
                if (
                    eval_category
                    == EvaluationCategories.ALL_EXPECTED_RESULTS_OBSERVED.value
                ):
                    result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value
                    success = True
                else:
                    result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value

                break

        return TestResult(
            test_name=self.test.name,
            success=success,
            result=result,
            reasoning=reasoning,
            conversation=self.conversation,
        )
-# SPDX-License-Identifier: Apache-2.0 - -MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" -ANTHROPIC_VERSION = "bedrock-2023-05-31" -ROLE = "user" -MAX_TOKENS_TO_SAMPLE = 300 -TEMPERATURE = 0 -TOP_K = 250 -TOP_P = 1 -REQUEST_BODY = { - "anthropic_version": ANTHROPIC_VERSION, - "max_tokens": MAX_TOKENS_TO_SAMPLE, - "system": None, - "messages": [ - { - "role": ROLE, - "content": [ - {"type": "text", "text": None}, - ], - } - ], - "temperature": TEMPERATURE, - "top_p": TOP_P, - "top_k": TOP_K, -} diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt deleted file mode 100644 index fce3738..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt +++ /dev/null @@ -1,67 +0,0 @@ - -You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. - - - - - - - -You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to those already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation.
- - - - -Your approach is to: -1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing. -2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. - -Some example questions include: - - - -* Does the customer have a smart meter? -* Are they aware of Energy Hub? -* Are they on the right tariff? -* How many people are in their household? -* What is their current living situation (apartment, house, etc.)? - - - -Some examples of recommendations include: - - -* Smart meter installation for better usage monitoring -* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount - - - -Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. - - -Here is an example interaction: - - -A: how can I reduce my energy bill? - -B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So will ask some questions to understand you better. Is that okay? - -A: Yes - -B: What kind of house do you live in and with how many people? - -A: I live in a one-bedroom apartment with my partner? - -B: Thank you, and how do you measure your energy use? - -A: I send meter readings? - -B: Okay, so to confirm you don’t have a smart meter? - -A: No - -B: My first recommendation would be a smart meter. 
class Hook:
    """An evaluation hook.

    Subclass this and override ``pre_evaluate`` and/or ``post_evaluate``.
    Neither method takes ``self``; they are declared as static methods so they
    behave the same whether they are called on the class or on an instance.
    """

    @staticmethod
    def pre_evaluate(test: Test, trace: Trace) -> None:
        """
        Method called before evaluation. Can be used to perform any setup tasks.

        Args:
            test (Test): The test case.
            trace (Trace): Captures steps during evaluation.
        """
        pass

    @staticmethod
    def post_evaluate(test: Test, test_result: TestResult, trace: Trace) -> None:
        """
        Method called after evaluation. This may be used to perform integration testing
        or clean up tasks.

        Args:
            test (Test): The test case.
            test_result (TestResult): The result of the test, which can be overridden
                by updating the attributes of this object.
            trace (Trace): Captures steps during evaluation.
        """
        pass
ValueError("Test names must be unique") - - return self - - @classmethod - def load(cls, plan_dir: Optional[str], filter: str) -> Plan: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - plan = cls._load_yaml(plan_path) - - return cls( - evaluator_factory=EvaluatorFactory(config=plan["evaluator"]), - target_factory=TargetFactory(config=plan["target"]), - tests=cls._load_tests(plan["tests"], filter), - ) - - @staticmethod - def _load_yaml(path: str) -> dict: - with open(path) as stream: - return yaml.safe_load(stream) - - @staticmethod - def _load_tests(test_config: list[dict], filter: str) -> list[Test]: - tests = [] - - if filter: - names = Plan._parse_filter(filter) - else: - names = test_config.keys() - - for name in names: - config = test_config[name] - tests.append( - Test( - name=name, - steps=config["steps"], - expected_results=config["expected_results"], - initial_prompt=config.get("initial_prompt"), - max_turns=config.get("max_turns", defaults.MAX_TURNS), - hook=config.get("hook"), - ) - ) - - return tests - - @staticmethod - def _parse_filter(filter: str) -> list[str]: - return [n.strip() for n in filter.split(",")] - - @staticmethod - def init_plan(plan_dir: Optional[str]) -> str: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - - # check if plan exists - if os.path.exists(plan_path): - raise FileExistsError(f"Test plan already exists at {plan_path}") - - with open(plan_path, "w") as stream: - yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False) - - return plan_path diff --git a/stepfunctions/stepfunctions/agenteval/runner/__init__.py b/stepfunctions/stepfunctions/agenteval/runner/__init__.py deleted file mode 100644 index 32377b3..0000000 --- a/stepfunctions/stepfunctions/agenteval/runner/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
class Runner:
    """Run every test of a plan on a thread pool and report the outcome."""

    def __init__(
        self,
        plan: Plan,
        verbose: bool,
        num_threads: Optional[int],
        work_dir: Optional[str],
    ):
        """Initialize the runner.

        Args:
            plan (Plan): The plan whose tests are run.
            verbose (bool): Whether to log per-test results, token usage and
                the summary location.
            num_threads (Optional[int]): Number of worker threads. When falsy,
                the smaller of the number of tests and ``MAX_NUM_THREADS``.
            work_dir (Optional[str]): Directory passed to the evaluators and
                used for the summary; the current directory when falsy.
        """
        # Imported here so the module's import block does not have to change.
        import threading

        self.plan = plan
        self.work_dir = work_dir if work_dir else os.getcwd()
        self.num_tests = len(self.plan.tests)
        self.verbose = verbose
        self.num_threads = num_threads
        if not self.num_threads:
            self.num_threads = min(self.num_tests, MAX_NUM_THREADS)
        self.results = {test.name: None for test in self.plan.tests}
        self.num_failed = 0
        self.evaluator_input_token_counts = []
        self.evaluator_output_token_counts = []
        # Guards the counters and result containers that worker threads update.
        self._lock = threading.Lock()

    def _worker_count(self) -> int:
        """Return the thread-pool size: ``num_threads``, but never below one."""
        # ThreadPoolExecutor rejects max_workers <= 0, which an empty plan
        # would otherwise produce.
        return max(1, self.num_threads)

    def run(self) -> int:
        """Run all tests, write the markdown summary and return the failure count.

        Returns:
            int: The number of tests whose result has ``success`` set to ``False``.

        Raises:
            Exception: Any exception raised while running a test is re-raised.
        """
        self._log_run_start()

        self.start_time = time.time()
        with Progress(transient=True) as self.progress:
            self.tracker = self.progress.add_task("running...", total=self.num_tests)

            # Size the pool from num_threads (the value logged at start-up),
            # not from the number of tests.
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self._worker_count()
            ) as executor:
                futures = [
                    executor.submit(self.run_test, test) for test in self.plan.tests
                ]
                for future in concurrent.futures.as_completed(futures):
                    # result() re-raises whatever the worker raised.
                    future.result()

        self._log_run_end()

        create_markdown_summary(
            self.work_dir, self.plan.tests, list(self.results.values()), self.verbose
        )

        return self.num_failed

    def run_test(self, test):
        """Create a target and an evaluator for ``test``, run it and record the result."""
        target = self.plan.target_factory.create()
        evaluator = self.plan.evaluator_factory.create(
            test=test,
            target=target,
            work_dir=self.work_dir,
        )

        result = evaluator.run()

        self.progress.update(self.tracker, advance=1)

        # This method runs on several threads at once; "+=" on a shared
        # counter is not atomic, so all shared state is updated under the lock.
        with self._lock:
            if result.success is False:
                self.num_failed += 1
            self.results[test.name] = result
            self.evaluator_input_token_counts.append(evaluator.input_token_count)
            self.evaluator_output_token_counts.append(evaluator.output_token_count)

    def _log_run_start(self):
        logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.")

    def _log_run_end(self):
        self._log_pass_fail_count()
        logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.")
        if self.verbose:
            self._log_test_result()
            self._log_evaluator_token_io()

    def _log_test_result(self):
        for result in self.results.values():
            logger_func = logger.info if result.success else logger.error
            logger_func(
                f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}",
            )

    def _log_pass_fail_count(self):
        passed_count = self.num_tests - self.num_failed
        status_str = (
            f"[red]{passed_count} passed, {self.num_failed} failed."
            if self.num_failed
            else f"[green]{self.num_tests} passed."
        )
        logger_func = logger.error if self.num_failed else logger.info
        logger_func(status_str)

    def _log_evaluator_token_io(self):
        logger.info(
            f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}"
        )
        logger.info(
            f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}"
        )
def create_markdown_summary(
    work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool
):
    """Render the summary template and write it into ``work_dir``.

    The output file is named after the template with its last extension
    removed.

    Args:
        work_dir (str): Directory the summary file is written to.
        tests (list[Test]): The tests, passed to the template as ``tests``.
        test_results (list[TestResult]): The results, passed to the template
            as ``results``.
        verbose (bool): Whether to log the location of the summary.
    """
    # Template names are joined with "/" rather than os.path.join: Jinja2
    # template names always use forward slashes, so an OS-specific separator
    # breaks the lookup on Windows.
    # NOTE(review): assumes jinja_env is a jinja2.Environment - confirm.
    template = jinja_env.get_template(f"{_TEMPLATE_ROOT}/{_TEMPLATE_FILE_NAME}")

    summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0])

    rendered = template.render(tests=tests, results=test_results, zip=zip)

    # Explicit encoding: conversation text may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(rendered)

    if verbose:
        logger.info(f"Summary available at {summary_path}")
-# SPDX-License-Identifier: Apache-2.0 - -from .base_target import BaseTarget -from .boto3_target import Boto3Target -from .target_factory import TargetFactory - -__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/stepfunctions/agenteval/targets/base_target.py deleted file mode 100644 index f8fbaa8..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/base_target.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from agenteval import TargetResponse - - -class BaseTarget(ABC): - """The `BaseTarget` abstract base class defines the common interface for target - classes. - """ - - @abstractmethod - def invoke(self, prompt: str) -> TargetResponse: - """Invoke the target with a prompt and return a response as a string. - - Args: - prompt: The prompt string to pass to the target. - - Returns: - A TargetResponse object containing the target's response string and - any trace data (if applicable). 
- """ - pass diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py deleted file mode 100644 index 4d393ff..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockAgentTarget - -__all__ = ["BedrockAgentTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py deleted file mode 100644 index f7e6f9c..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py +++ /dev/null @@ -1,41 +0,0 @@ -import uuid - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockAgentTarget(Boto3Target): - def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - self._bedrock_agent_id = bedrock_agent_id - self._bedrock_agent_alias_id = bedrock_agent_alias_id - self._session_id: str = str(uuid.uuid4()) - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "agentId": self._bedrock_agent_id, - "agentAliasId": self._bedrock_agent_alias_id, - "sessionId": self._session_id, - "inputText": prompt, - "enableTrace": True, - } - - response = self.boto3_client.invoke_agent(**args) - - stream = response["completion"] - completion = "" - trace_data = [] - - for event in stream: - chunk = event.get("chunk") - event_trace = event.get("trace") - if chunk: - completion += chunk.get("bytes").decode() - if event_trace: - trace_data.append(event_trace.get("trace")) - - return TargetResponse( - response=completion, data={"bedrock_agent_trace": trace_data} - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py 
b/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py deleted file mode 100644 index d56ea6f..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockKnowledgeBaseTarget - -__all__ = ["BedrockKnowledgeBaseTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py deleted file mode 100644 index a9491e2..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py +++ /dev/null @@ -1,38 +0,0 @@ -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockKnowledgeBaseTarget(Boto3Target): - def __init__(self, knowledge_base_id: str, model_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - aws_region = self.boto3_client.meta.region_name - self._knowledge_base_id = knowledge_base_id - self._model_arn = f"arn:aws:bedrock:{aws_region}::foundation-model/{model_id}" - self._session_id: str = None - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "input": { - "text": prompt, - }, - "retrieveAndGenerateConfiguration": { - "type": "KNOWLEDGE_BASE", - "knowledgeBaseConfiguration": { - "knowledgeBaseId": self._knowledge_base_id, - "modelArn": self._model_arn, - }, - }, - } - if self._session_id: - args["sessionId"] = self._session_id - - response = self.boto3_client.retrieve_and_generate(**args) - generated_text = response["output"]["text"] - citations = response["citations"] - self._session_id = response["sessionId"] - - return TargetResponse( - response=generated_text, data={"bedrock_knowledgebase_citations": citations} - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py b/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py deleted file 
mode 100644 index e47e8cb..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Optional - -from agenteval.targets import BaseTarget -from agenteval.utils import create_boto3_client - -_DEFAULT_MAX_RETRY = 10 - - -class Boto3Target(BaseTarget): - """A target that can be interfaced with via the `boto3` library. - - Attributes: - boto3_client (BaseClient): A `boto3` client. - """ - - def __init__( - self, - boto3_service_name: str, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """ - Initialize the AWS target. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. 
- """ - - self.boto3_client = create_boto3_client( - boto3_service_name=boto3_service_name, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py deleted file mode 100644 index 3f621e5..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import QBusinessTarget - -__all__ = ["QBusinessTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py b/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py deleted file mode 100644 index 8fd59be..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Optional - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "qbusiness" - - -class QBusinessTarget(Boto3Target): - def __init__( - self, - q_business_application_id: str, - q_business_user_id: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._chat_sync_args = {"applicationId": q_business_application_id} - if q_business_user_id: - self._chat_sync_args["userId"] = q_business_user_id - - def invoke(self, prompt: str) -> str: - self._chat_sync_args["userMessage"] = prompt - - response = self.boto3_client.chat_sync(**self._chat_sync_args) - - if "conversationId" not in self._chat_sync_args: - self._chat_sync_args["conversationId"] = response["conversationId"] - - self._chat_sync_args["parentMessageId"] = response["systemMessageId"] - - return TargetResponse(response=response["systemMessage"]) diff --git a/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py deleted file mode 
class SageMakerEndpointTarget(Boto3Target):
    """Target that invokes a SageMaker endpoint with a JSON request body.

    The prompt is written into the request body at ``input_path`` and the
    reply is read from the JSON response at ``output_path``; both are
    JSONPath expressions.
    """

    def __init__(
        self,
        endpoint_name: str,
        request_body: dict,
        input_path: str,
        output_path: str,
        custom_attributes: Optional[str] = None,
        target_model: Optional[str] = None,
        target_variant: Optional[str] = None,
        target_container_hostname: Optional[str] = None,
        inference_component_name: Optional[str] = None,
        **kwargs
    ):
        """Initialize the target.

        Args:
            endpoint_name (str): Passed to ``invoke_endpoint`` as ``EndpointName``.
            request_body (dict): Template of the JSON request body.
            input_path (str): JSONPath of the field that receives the prompt.
            output_path (str): JSONPath of the field holding the reply.
            custom_attributes (Optional[str]): Sent as ``CustomAttributes`` if set.
            target_model (Optional[str]): Sent as ``TargetModel`` if set.
            target_variant (Optional[str]): Sent as ``TargetVariant`` if set.
            target_container_hostname (Optional[str]): Sent as
                ``TargetContainerHostname`` if set.
            inference_component_name (Optional[str]): Sent as
                ``InferenceComponentName`` if set.
            **kwargs: Forwarded to ``Boto3Target.__init__``.
        """
        # Imported here so the module's import block does not have to change.
        import copy

        super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs)

        # Keep a private copy: the prompt is written into this dict on every
        # invocation, and the dict handed in comes from the shared target
        # configuration, so it is the same object for every target created
        # from that configuration.
        self._request_body = copy.deepcopy(request_body)
        self._output_path = output_path
        self._input_jp_expr = parse(input_path)
        self._output_jp_expr = parse(output_path)

        self._args = self._create_base_args(
            endpoint_name,
            custom_attributes,
            target_model,
            target_variant,
            target_container_hostname,
            inference_component_name,
        )

    @staticmethod
    def _create_base_args(
        endpoint_name: str,
        custom_attributes: Optional[str],
        target_model: Optional[str],
        target_variant: Optional[str],
        target_container_hostname: Optional[str],
        inference_component_name: Optional[str],
    ):
        """Build the ``invoke_endpoint`` arguments that do not change between calls.

        Optional arguments whose value is ``None`` are left out.
        """
        optional_args = {
            "CustomAttributes": custom_attributes,
            "TargetModel": target_model,
            "TargetVariant": target_variant,
            "TargetContainerHostname": target_container_hostname,
            "InferenceComponentName": inference_component_name,
        }

        args = {
            "EndpointName": endpoint_name,
            "ContentType": _CONTENT_TYPE,
            "Accept": _ACCEPT,
            **{
                key: value
                for key, value in optional_args.items()
                if value is not None
            },
        }

        return args

    def _update_request(self, prompt: str):
        """Write ``prompt`` into the request body and serialize it as ``Body``."""
        self._input_jp_expr.update(self._request_body, prompt)
        self._args["Body"] = json.dumps(self._request_body)

    def _query_response(self, response_body: dict) -> str:
        """Return the value of the first ``output_path`` match in ``response_body``.

        Raises:
            IndexError: If ``output_path`` matches nothing in the response.
        """
        matches = self._output_jp_expr.find(response_body)
        if not matches:
            # Same exception type as the unguarded [0] lookup, with a message
            # that names the configuration value at fault.
            raise IndexError(
                f"output_path {self._output_path!r} matched nothing in the "
                "endpoint response"
            )
        return matches[0].value

    def invoke(self, prompt: str) -> TargetResponse:
        """Invoke the endpoint with ``prompt`` and return its reply.

        Args:
            prompt (str): The prompt written into the request body.

        Returns:
            TargetResponse: Wraps the value found at ``output_path``.
        """
        self._update_request(prompt)

        response = self.boto3_client.invoke_endpoint(**self._args)

        response_body = json.loads(response.get("Body").read())

        return TargetResponse(response=self._query_response(response_body))
_TARGET_MAP[self.config["type"]] - else: - target_cls = import_class(self.config["type"], parent_class=BaseTarget) - - return target_cls diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja deleted file mode 100644 index 9cd9dd4..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the expected results and conversation: - - -{% for result in expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja deleted file mode 100644 index 832ba37..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja +++ /dev/null @@ -1,5 +0,0 @@ -Here is the step: - - -{{ step }} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. 
{{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja deleted file mode 100644 index 22cace3..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja +++ /dev/null @@ -1,12 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -Your job is to analyze the conversation in tags and a list of expected results -in tags. - -You will classify the the conversation into the following categories: - -- A: All of the expected results can be observed in the conversation. -- B: Not all of the expected results can be observed in the conversation. - -Please think hard about the response in tags before providing only the category letter -within tags. 
\ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja deleted file mode 100644 index d0e8e23..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given a step that is wrapped in tags. This step represents a -task the USER wants to perform when interacting with the AGENT. - -Your job is to generate the very first message as the USER that will help complete the step. - -Make sure this message is concise and to the point. - -Do not provide any information if it is expected that the AGENT will eventually ask for it. - -Please think hard about the response in tags before providing the message -within tags. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja deleted file mode 100644 index 7bb8e6b..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents a task -that the USER wants to perform when interacting with the AGENT. - -Your job is analyze the running conversation in tags and classify it into the following -categories: - -- A: The USER has attempted all the steps. -- B: The USER has not yet attempted all the steps. - -Please think hard about the response in tags before providing only the category letter -within tags. 
\ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja deleted file mode 100644 index e670420..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja +++ /dev/null @@ -1,15 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents -a task that the USER wants to perform when interacting with the AGENT. - -Using the list of steps, your job is analyze the running conversation in the - tags and generate the next appropriate response as the USER. - -Do not include any information from a step unless the AGENT asks for it. - -If the AGENT was unable to help or did not understand the last request, just move on to -the next step. Do not attempt to rephrase the request in the next response as the USER. - -Please think hard about the response in tags before providing the response -within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja deleted file mode 100644 index a624303..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja +++ /dev/null @@ -1,49 +0,0 @@ -# Test Summary ---- -This document provides a summary of the tests executed by Agent Evaluation. - -> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors. 
**Please thoroughly examine the results below prior to deciding whether to implement an agent.** ---- -## Tests -{% for test, result in zip(tests, results) -%} -- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) -{% endfor %} - ---- - - -{% for test, result in zip(tests, results) -%} -## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} - -**Steps** -{% for step in test.steps -%} -{{ loop.index }}. {{ step }} -{% endfor %} - -**Expected results** -{% for result in test.expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor %} - -**Conversation** -``` -{% for sender, message in result.conversation -%} -[{{ sender }}] {{ message }} -{% endfor -%} -``` - -**Result** -{{ result.result }} - -**Reasoning** -``` -{{ result.reasoning }} -``` - ---- -{% endfor %} - - - - - diff --git a/stepfunctions/stepfunctions/agenteval/test.py b/stepfunctions/stepfunctions/agenteval/test.py deleted file mode 100644 index 695f2fe..0000000 --- a/stepfunctions/stepfunctions/agenteval/test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -from pydantic import BaseModel - - -class Test(BaseModel, validate_assignment=True): - """A test case for an agent. - - Attributes: - name: Name of the test. - steps: List of step to perform for the test. - expected_results: List of expected results for the test. - initial_prompt: Optional initial prompt. - max_turns: Maximum number of turns allowed for the test. - hook: The module path to an evaluation hook. 
- """ - - # do not collect as a test - __test__ = False - - name: str - steps: list[str] - expected_results: list[str] - initial_prompt: Optional[str] = None - max_turns: int - hook: Optional[str] = None diff --git a/stepfunctions/stepfunctions/agenteval/test_result.py b/stepfunctions/stepfunctions/agenteval/test_result.py deleted file mode 100644 index 5258aef..0000000 --- a/stepfunctions/stepfunctions/agenteval/test_result.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from pydantic import BaseModel - -from agenteval.conversation import Conversation - - -class TestResult(BaseModel, arbitrary_types_allowed=True): - """The result of a test. - - Attributes: - test_name: Name of the test. - result: Description of the test result. - reasoning: The rationale for the test result. - success: `True` if the test passed, otherwise `False`. - conversation: Captures the interaction between a user and an agent. - """ - - # do not collect as a test - __test__ = False - - test_name: str - result: str - reasoning: str - success: bool - conversation: Conversation diff --git a/stepfunctions/stepfunctions/agenteval/trace.py b/stepfunctions/stepfunctions/agenteval/trace.py deleted file mode 100644 index 25d477a..0000000 --- a/stepfunctions/stepfunctions/agenteval/trace.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import inspect -import json -import os -from datetime import datetime, timezone -from typing import Optional - -_TRACE_DIR = "agenteval_traces" - - -class Trace: - """Captures steps during evaluation. - - Attributes: - test_name (str): Name of the test. - trace_dir (str): Directory to store the trace. - start_time (datetime): Start time of the trace. - end_time (datetime): End time of the trace. - steps (list): List of steps in the trace. 
- - """ - - def __init__(self, test_name: str, work_dir: str): - """ - Initialize the trace handler. - - Args: - test_name (str): Name of the trace - """ - self.test_name = test_name - self.trace_dir = os.path.join(work_dir, _TRACE_DIR) - self.start_time = None - self.end_time = None - self.steps = [] - - def __enter__(self): - self.start_time = datetime.now(timezone.utc) - return self - - def __exit__(self, *exc): - self.end_time = datetime.now(timezone.utc) - self._dump_trace() - - def _dump_trace(self): - """Dump the trace to a JSON file.""" - - os.makedirs(self.trace_dir, exist_ok=True) - - with open(os.path.join(self.trace_dir, f"{self.test_name}.json"), "w") as f: - json.dump(self._get_trace(), f, default=str) - - def _get_trace(self) -> str: - return { - "test_name": self.test_name, - "start_time": self.start_time, - "end_time": self.end_time, - "steps": self.steps, - } - - def add_step(self, step_name: Optional[str] = None, **kwargs): - """Add a step to the trace. - - Args: - step_name (str, optional): The name of the step. Defaults to - the name of the caller function - """ - step_name = step_name or inspect.stack()[1].function - step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name} - step.update(kwargs) - self.steps.append(step) diff --git a/stepfunctions/stepfunctions/agenteval/utils/__init__.py b/stepfunctions/stepfunctions/agenteval/utils/__init__.py deleted file mode 100644 index 5f80a10..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from .aws import create_boto3_client -from .imports import import_class - -__all__ = ["import_class", "create_boto3_client"] diff --git a/stepfunctions/stepfunctions/agenteval/utils/aws.py b/stepfunctions/stepfunctions/agenteval/utils/aws.py deleted file mode 100644 index 4d5d4dd..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/aws.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import boto3 -from botocore.client import BaseClient -from botocore.config import Config - -_RETRY_MODE = "adaptive" - - -def create_boto3_client( - boto3_service_name: str, - aws_profile: Optional[str], - aws_region: Optional[str], - endpoint_url: Optional[str], - max_retry: int, -) -> BaseClient: - """Create a `boto3` client. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - - Returns: - BaseClient - """ - - config = Config(retries={"max_attempts": max_retry, "mode": _RETRY_MODE}) - - session = boto3.Session(profile_name=aws_profile, region_name=aws_region) - return session.client(boto3_service_name, endpoint_url=endpoint_url, config=config) diff --git a/stepfunctions/stepfunctions/agenteval/utils/imports.py b/stepfunctions/stepfunctions/agenteval/utils/imports.py deleted file mode 100644 index f0e2685..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/imports.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from importlib import import_module -from typing import Optional - -_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"] - - -def import_class(module_path: str, parent_class: Optional[type] = None) -> type: - name, class_name = module_path.rsplit(".", 1) - - # make sure module name starts with one of the allowed suffixes - _validate_module_name(name.split(".")[-1]) - - module = import_module(name) - cls = getattr(module, class_name) - - if parent_class: - # make sure the imported class is a subclass - _validate_subclass(cls, parent_class) - - return cls - - -def _validate_module_name(name: str) -> None: - if not any(name.endswith(suffix) for suffix in _ALLOWED_MODULE_NAME_SUFFIX): - raise ValueError(f"Invalid module name: {name}") - - -def _validate_subclass(child_class: type, parent_class: type) -> None: - if not issubclass(child_class, parent_class): - raise TypeError( - f"{child_class.__name__} is not a {parent_class.__name__} subclass" - ) diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py b/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py deleted file mode 100644 index 8f677c7..0000000 --- a/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py +++ /dev/null @@ -1,43 +0,0 @@ -import boto3 -import json -import os - -s3_client = boto3.client('s3') -bedrock_agent = boto3.client('bedrock-agent') - -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() -def handler(event, context) - - agent_id = event["agent_id"] - - response = bedrock_agent.get_agent( - agentId='string' - ) - - agent_status = response["Agent"]["AgentStatus"] - - return { - 'statusCode': 200, - 'agent_id': agent_id, - 'agent_status': agent_status, - 'agent_name': text["agent_name"], - 'body': scenarios - } - - - - - - - - - - - - - - - diff --git 
a/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py b/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py deleted file mode 100644 index 8f677c7..0000000 --- a/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py +++ /dev/null @@ -1,43 +0,0 @@ -import boto3 -import json -import os - -s3_client = boto3.client('s3') -bedrock_agent = boto3.client('bedrock-agent') - -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() -def handler(event, context) - - agent_id = event["agent_id"] - - response = bedrock_agent.get_agent( - agentId='string' - ) - - agent_status = response["Agent"]["AgentStatus"] - - return { - 'statusCode': 200, - 'agent_id': agent_id, - 'agent_status': agent_status, - 'agent_name': text["agent_name"], - 'body': scenarios - } - - - - - - - - - - - - - - - From 3c3a51b260cd26a415fcebb463edd999a9d398b6 Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Fri, 4 Oct 2024 16:49:24 +0100 Subject: [PATCH 4/7] fix: added requirements to main dir to pass test --- requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 199e458..c368cd1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ click~=8.0 pydantic>=2.1.0,<3.0 rich>=13.7.0,<14.0 jinja2>=3.1.3,<4.0 -jsonpath-ng>=1.6.1,<2.0 \ No newline at end of file +jsonpath-ng>=1.6.1,<2.0 +pathlib +aws-cdk-lib==2.155.0 +constructs>=10.0.0,<11.0.0 \ No newline at end of file From 2c9d51539a25995b69a282bc720f24090a9670ef Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Fri, 4 Oct 2024 17:24:24 +0100 Subject: [PATCH 5/7] fix: gitignore update --- .gitignore | 1 + stepfunctions/.DS_Store | Bin 6148 -> 6148 bytes stepfunctions/.gitignore | 1 + 3 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d903b0..96dc6e1 100644 --- a/.gitignore +++ b/.gitignore @@ -170,3 +170,4 @@ cython_debug/ 
local-test local-test/* +.DS_Store diff --git a/stepfunctions/.DS_Store b/stepfunctions/.DS_Store index 6f6161b8788a1e59e7fef2dbc9786f1245bb967c..2e447912a5c766e17495b85d8858cc8b852abdbd 100644 GIT binary patch delta 31 ncmZoMXfc@J&nU7nU^g?P$YdUt=FRe~FPJ7aOxVoM@s}R}q2vmO delta 170 zcmZoMXfc@J&nU4mU^g?P#AF_p=6ZI9WQG)mYz94se1=knlALtI;N<+=0tO&px)!Je zq|j94=DWBg<>V&;WjOx#Og&W;c+3%-suUtr6=d)ttYc2zu(_M%4bx_Jj=%f>g)J=g diff --git a/stepfunctions/.gitignore b/stepfunctions/.gitignore index c5aed14..e9071d0 100644 --- a/stepfunctions/.gitignore +++ b/stepfunctions/.gitignore @@ -9,3 +9,4 @@ __pycache__ # CDK asset staging directory .cdk.staging cdk.out +.DS_Store From 8f202750277b6477df124c86f7be01671cff3bac Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Fri, 4 Oct 2024 18:04:50 +0100 Subject: [PATCH 6/7] fix: lambda powertools, comments removed --- .../aws-lambda-powertools/requirements.txt | 1 + .../functions/check_agent_status_1/index.py | 18 ++++---- .../functions/check_agent_status_2/index.py | 18 ++++---- .../functions/create_alias/index.py | 15 ++++--- .../functions/delete_alias/index.py | 19 +++++---- .../functions/generate_map/index.py | 24 +++++------ .../stepfunctions/functions/run_test/index.py | 28 ++----------- .../functions/update_bedrock_agent/index.py | 37 ++++++++++------ .../stepfunctions/stepfunctions_stack.py | 42 +++++++++++-------- .../tests/unit/test_stepfunctions_stack.py | 9 ---- 10 files changed, 107 insertions(+), 104 deletions(-) create mode 100644 stepfunctions/layers/aws-lambda-powertools/requirements.txt diff --git a/stepfunctions/layers/aws-lambda-powertools/requirements.txt b/stepfunctions/layers/aws-lambda-powertools/requirements.txt new file mode 100644 index 0000000..ed0b171 --- /dev/null +++ b/stepfunctions/layers/aws-lambda-powertools/requirements.txt @@ -0,0 +1 @@ +aws-lambda-powertools \ No newline at end of file diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py 
b/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py index d84319a..513205a 100644 --- a/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py +++ b/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py @@ -5,18 +5,21 @@ s3_client = boto3.client('s3') bedrock_agent = boto3.client('bedrock-agent') -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() +from aws_lambda_powertools import Logger +logger = Logger() def handler(event, context): agent_id = event["agent_id"] + + try: - response = bedrock_agent.get_agent( - agentId=agent_id - ) + response = bedrock_agent.get_agent( + agentId=agent_id + ) + logger.info(f"Getting agent response: {response}") + except Exception as e: + logger.error(f"Error getting agent: {e}") agent_status = response["agent"]["agentStatus"] @@ -27,3 +30,4 @@ def handler(event, context): 'agent_id': agent_id, 'agent_status': agent_status } + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py b/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py index eec7d52..692c418 100644 --- a/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py +++ b/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py @@ -5,18 +5,20 @@ s3_client = boto3.client('s3') bedrock_agent = boto3.client('bedrock-agent') -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() +from aws_lambda_powertools import Logger +logger = Logger() def handler(event, context): agent_id = event["update_output"]["agentid"] - - response = bedrock_agent.get_agent( - agentId=agent_id - ) + + try: + response = bedrock_agent.get_agent( + agentId=agent_id + ) + logger.info(f"Getting agent response: {response}") + except Exception as e: + logger.error(f"Erorr getting agent: {e}") agent_status = response["agent"]["agentStatus"] diff --git 
a/stepfunctions/stepfunctions/functions/create_alias/index.py b/stepfunctions/stepfunctions/functions/create_alias/index.py index e2cada8..b8cb2cf 100644 --- a/stepfunctions/stepfunctions/functions/create_alias/index.py +++ b/stepfunctions/stepfunctions/functions/create_alias/index.py @@ -1,7 +1,9 @@ import json import boto3 import uuid +from aws_lambda_powertools import Logger +logger = Logger() def handler(event, context): @@ -10,12 +12,15 @@ def handler(event, context): agent_alias = str(uuid.uuid4()) agent_id = event["update_output"]["agentid"] - alias_resp = bedrock_agent.create_agent_alias( - agentAliasName=agent_alias, - agentId=agent_id - ) + try: + alias_resp = bedrock_agent.create_agent_alias( + agentAliasName=agent_alias, + agentId=agent_id + ) + logger.info(f"Create Alias Response: {alias_resp}") - print(alias_resp) + except Exception as e: + logger.error(f"Error creating alias: {e}") agent_id = alias_resp["agentAlias"]["agentId"] diff --git a/stepfunctions/stepfunctions/functions/delete_alias/index.py b/stepfunctions/stepfunctions/functions/delete_alias/index.py index 99b2a7a..980ca5e 100644 --- a/stepfunctions/stepfunctions/functions/delete_alias/index.py +++ b/stepfunctions/stepfunctions/functions/delete_alias/index.py @@ -2,22 +2,27 @@ import boto3 import uuid import os +from aws_lambda_powertools import Logger + +logger = Logger() def handler(event, context): - # TODO implement - #pass in from step function but for now agent_id = event["agent_id"] agent_alias_id = event["agent_alias_id"] bedrock_agent = boto3.client('bedrock-agent') - response = bedrock_agent.delete_agent_alias( - agentAliasId=agent_alias_id, - agentId=agent_id -) - + try: + response = bedrock_agent.delete_agent_alias( + agentAliasId=agent_alias_id, + agentId=agent_id + ) + logger.info(f"Delete response: {response}") + + except Exception as e: + logger.error(f"Error preparing agent : {e}") return { 'statusCode': 200, diff --git 
a/stepfunctions/stepfunctions/functions/generate_map/index.py b/stepfunctions/stepfunctions/functions/generate_map/index.py index 45b05de..886f5a4 100644 --- a/stepfunctions/stepfunctions/functions/generate_map/index.py +++ b/stepfunctions/stepfunctions/functions/generate_map/index.py @@ -4,29 +4,27 @@ s3_client = boto3.client('s3') -# from aws_lambda_powertools import Logger, Tracer +from aws_lambda_powertools import Logger, Tracer -# AWS_REGION = os.environ["AWS_REGION"] -# AUTOMATION_STATE_MACHINE_ARN = os.environ["AUTOMATION_STATE_MACHINE_ARN"] - - - -# tracer = Tracer() -# logger = Logger() +logger = Logger() def handler(event, context): bucket = event["detail"]["bucket"]["name"] key = event["detail"]["object"]["key"] - - scenario_json = s3_client.get_object(Bucket=bucket, Key=key) - text = json.loads(scenario_json["Body"].read()) - print(text) + try: + scenario_json = s3_client.get_object(Bucket=bucket, Key=key) + text = json.loads(scenario_json["Body"].read()) + logger.info(text) + except Exception as e: + logger.error(f"Error getting object: {e}") + + prompts = text['prompts'] profiles = text['customer_profiles'] - + # Generate scenarios scenarios = [] diff --git a/stepfunctions/stepfunctions/functions/run_test/index.py b/stepfunctions/stepfunctions/functions/run_test/index.py index 21219ed..b6b0a6d 100644 --- a/stepfunctions/stepfunctions/functions/run_test/index.py +++ b/stepfunctions/stepfunctions/functions/run_test/index.py @@ -37,21 +37,14 @@ def handler(event, context): yaml_data = { 'evaluator': { - 'model': 'claude-3' + 'model': 'claude-3', + 'region': 'us-east-1' }, 'target': { 'type': 'bedrock-agent', 'bedrock_agent_id': agent_id, 'bedrock_agent_alias_id': agent_alias_id }, - # 'user_profile':{ - # 'demographic': scenario['demography'], - # 'household_size': scenario['household_size'], - # 'appliances': scenario['appliances'], - # 'energy_usage': scenario['energy_usage'], - # 'tariff': scenario['tarrif'], - # 'payment_type': 
scenario['payment_type'] - # }, 'tests': { 'provide recommendation to customer in need': { 'profile': user_profile, @@ -134,7 +127,6 @@ def handler(event, context): s3_key = f"results/{agent_alias_name}/{uid}/results.md" s3_client.put_object(Bucket=bucket_name, Key=s3_key, Body=result) - # print('reached_here 2') except Exception as e: status = "error" @@ -146,18 +138,4 @@ def handler(event, context): 'status': status, 'test_passed_rate':test_passed_rate } - - # finally: - # shutil.copy(local_yaml_path, test_result_dir) - - # insert_result(created_at, finished_at, yaml_data["target"]["type"], status, test_passed_rate) - # # For this example, we'll just return it - - # #plan is made - - - - # return { - # 'statusCode': 200, - # 'body': 'success' - # } \ No newline at end of file + \ No newline at end of file diff --git a/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py b/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py index 19876f9..f3843e0 100644 --- a/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py +++ b/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py @@ -2,32 +2,43 @@ import boto3 import uuid import os +from aws_lambda_powertools import Logger, Tracer +logger = Logger() + +@logger.inject_lambda_context def handler(event, context): - # TODO implement - #pass in from step function but for now - agent_id = event["agent_id"] agent_name=event["agent_name"] agent_role = os.environ['AGENT_ROLE'] - # role = "arn:aws:iam::905418302891:role/service-role/AmazonBedrockExecutionRoleForAgents_LED91O3XKK" model = 'anthropic.claude-3-sonnet-20240229-v1:0' instruction = event['prompt'] bedrock_agent = boto3.client('bedrock-agent') - update_resp = bedrock_agent.update_agent( - agentId=agent_id, - agentName=agent_name, - agentResourceRoleArn=agent_role, - foundationModel=model, - instruction=instruction, - - ) - prep_resp = bedrock_agent.prepare_agent(agentId=agent_id) + logger.info("Updating Agent") + 
try: + + update_resp = bedrock_agent.update_agent( + agentId=agent_id, + agentName=agent_name, + agentResourceRoleArn=agent_role, + foundationModel=model, + instruction=instruction, + + ) + logger.info(f"Update agent response: {update_resp}") + except Exception as e: + logger.error(f"Error updating agent: {e}") + + try: + prep_resp = bedrock_agent.prepare_agent(agentId=agent_id) + logger.info(f"Prepaing Agent response: {prep_resp}") + except Exception as e: + logger.error(f"Error preparing agent : {e}") return { diff --git a/stepfunctions/stepfunctions/stepfunctions_stack.py b/stepfunctions/stepfunctions/stepfunctions_stack.py index 4917d05..934b3e0 100644 --- a/stepfunctions/stepfunctions/stepfunctions_stack.py +++ b/stepfunctions/stepfunctions/stepfunctions_stack.py @@ -29,6 +29,18 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "EvaluationBucket", event_bridge_enabled=True ) + + powertools_layer = Layer( + self, + "PowertoolsLayer", + architecture=architecture, + runtime=runtime, + path=os.path.join( + pathlib.Path(__file__).parent.resolve().parent, + "layers", + "aws-lambda-powertools" + ) + ) agenteval_layer = Layer( self, @@ -55,7 +67,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "generate_map", ) - ) + ), + layers=[powertools_layer.layer_version] ) generate_map_step = tasks.LambdaInvoke( @@ -80,7 +93,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "check_agent_status_1", ) - ) + ), + layers=[powertools_layer.layer_version] ) get_status_step_1 = tasks.LambdaInvoke( @@ -117,7 +131,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "update_bedrock_agent", ) - ) + ), + layers=[powertools_layer.layer_version] ) @@ -147,8 +162,6 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "Wait1", time=sfn.WaitTime.duration(Duration.seconds(30)) ) - - # first_choice_def = 
first_choice.when(condition1, wait_step.next(get_status_step_1).next(first_choice)).otherwise(update_agent_step).afterwards() create_alias_function = _lambda.Function( @@ -164,7 +177,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "create_alias", ) - ) + ), + layers=[powertools_layer.layer_version] ) create_alias_step = tasks.LambdaInvoke( @@ -189,7 +203,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "check_agent_status_2", ) - ) + ), + layers=[powertools_layer.layer_version] ) get_status_step_2 = tasks.LambdaInvoke( @@ -292,7 +307,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: "functions", "delete_alias", ) - ) + ), + layers=[powertools_layer.layer_version] ) delete_alias_function.add_to_role_policy( @@ -421,12 +437,4 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: actions=["bedrock:*","iam:PassRole","iam:ListRoles"], resources=["*"], ) - ) - - # The code that defines your stack goes here - - # example resource - # queue = sqs.Queue( - # self, "StepfunctionsQueue", - # visibility_timeout=Duration.seconds(300), - # ) + ) \ No newline at end of file diff --git a/stepfunctions/tests/unit/test_stepfunctions_stack.py b/stepfunctions/tests/unit/test_stepfunctions_stack.py index e213d46..ce36343 100644 --- a/stepfunctions/tests/unit/test_stepfunctions_stack.py +++ b/stepfunctions/tests/unit/test_stepfunctions_stack.py @@ -3,13 +3,4 @@ from stepfunctions.stepfunctions_stack import StepfunctionsStack -# example tests. 
To run these tests, uncomment this file along with the example -# resource in stepfunctions/stepfunctions_stack.py -def test_sqs_queue_created(): - app = core.App() - stack = StepfunctionsStack(app, "stepfunctions") - template = assertions.Template.from_stack(stack) -# template.has_resource_properties("AWS::SQS::Queue", { -# "VisibilityTimeout": 300 -# }) From ae74430634ade6c07a5add964b4b6c37a008a9e3 Mon Sep 17 00:00:00 2001 From: Ife Ojomo Date: Fri, 4 Oct 2024 18:16:49 +0100 Subject: [PATCH 7/7] fix: improved logging --- .../stepfunctions/functions/check_agent_status_1/index.py | 8 ++++---- .../stepfunctions/functions/check_agent_status_2/index.py | 5 ++++- .../stepfunctions/functions/create_alias/index.py | 1 + .../stepfunctions/functions/delete_alias/index.py | 6 +++--- .../stepfunctions/functions/generate_map/index.py | 3 ++- .../stepfunctions/functions/update_bedrock_agent/index.py | 5 +++-- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py b/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py index 513205a..3a5ee67 100644 --- a/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py +++ b/stepfunctions/stepfunctions/functions/check_agent_status_1/index.py @@ -12,15 +12,15 @@ def handler(event, context): agent_id = event["agent_id"] + logger.info("Getting agent status") try: - response = bedrock_agent.get_agent( agentId=agent_id ) - logger.info(f"Getting agent response: {response}") + agent_status = response["agent"]["agentStatus"] + logger.info(f"Agent status: {agent_status}") except Exception as e: - logger.error(f"Error getting agent: {e}") - + logger.error(f"Error getting agent status: {e}") agent_status = response["agent"]["agentStatus"] diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py b/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py index 692c418..eb3fd07 100644 --- 
a/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py +++ b/stepfunctions/stepfunctions/functions/check_agent_status_2/index.py @@ -12,11 +12,14 @@ def handler(event, context): agent_id = event["update_output"]["agentid"] + logger.info("Getting agent status") try: response = bedrock_agent.get_agent( agentId=agent_id ) - logger.info(f"Getting agent response: {response}") + agent_status = response["agent"]["agentStatus"] + logger.info(f"Agent status: {agent_status}") + except Exception as e: logger.error(f"Erorr getting agent: {e}") diff --git a/stepfunctions/stepfunctions/functions/create_alias/index.py b/stepfunctions/stepfunctions/functions/create_alias/index.py index b8cb2cf..a75b644 100644 --- a/stepfunctions/stepfunctions/functions/create_alias/index.py +++ b/stepfunctions/stepfunctions/functions/create_alias/index.py @@ -12,6 +12,7 @@ def handler(event, context): agent_alias = str(uuid.uuid4()) agent_id = event["update_output"]["agentid"] + logger.info("Creating Agent Alias") try: alias_resp = bedrock_agent.create_agent_alias( agentAliasName=agent_alias, diff --git a/stepfunctions/stepfunctions/functions/delete_alias/index.py b/stepfunctions/stepfunctions/functions/delete_alias/index.py index 980ca5e..0e5c87b 100644 --- a/stepfunctions/stepfunctions/functions/delete_alias/index.py +++ b/stepfunctions/stepfunctions/functions/delete_alias/index.py @@ -13,16 +13,16 @@ def handler(event, context): agent_alias_id = event["agent_alias_id"] bedrock_agent = boto3.client('bedrock-agent') - + logger.info("Deleting Agent Alias") try: response = bedrock_agent.delete_agent_alias( agentAliasId=agent_alias_id, agentId=agent_id ) - logger.info(f"Delete response: {response}") + logger.info(f"Delete alias response: {response}") except Exception as e: - logger.error(f"Error preparing agent : {e}") + logger.error(f"Error deleting agent alias : {e}") return { 'statusCode': 200, diff --git a/stepfunctions/stepfunctions/functions/generate_map/index.py 
b/stepfunctions/stepfunctions/functions/generate_map/index.py index 886f5a4..873b7e7 100644 --- a/stepfunctions/stepfunctions/functions/generate_map/index.py +++ b/stepfunctions/stepfunctions/functions/generate_map/index.py @@ -4,7 +4,7 @@ s3_client = boto3.client('s3') -from aws_lambda_powertools import Logger, Tracer +from aws_lambda_powertools import Logger logger = Logger() @@ -13,6 +13,7 @@ def handler(event, context): bucket = event["detail"]["bucket"]["name"] key = event["detail"]["object"]["key"] + logger.info("Fetching scenarios") try: scenario_json = s3_client.get_object(Bucket=bucket, Key=key) text = json.loads(scenario_json["Body"].read()) diff --git a/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py b/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py index f3843e0..3ba810c 100644 --- a/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py +++ b/stepfunctions/stepfunctions/functions/update_bedrock_agent/index.py @@ -2,7 +2,7 @@ import boto3 import uuid import os -from aws_lambda_powertools import Logger, Tracer +from aws_lambda_powertools import Logger logger = Logger() @@ -33,7 +33,8 @@ def handler(event, context): logger.info(f"Update agent response: {update_resp}") except Exception as e: logger.error(f"Error updating agent: {e}") - + + logger.info("Preparing Agent") try: prep_resp = bedrock_agent.prepare_agent(agentId=agent_id) logger.info(f"Prepaing Agent response: {prep_resp}")