From 875dd92909c53e704d30d63b8a271eeae60d6781 Mon Sep 17 00:00:00 2001 From: Thomas Schaffter Date: Wed, 1 Nov 2023 03:10:21 +0000 Subject: [PATCH] Work now in Python script --- .vscode/settings.json | 2 + .../notebook/challenge_headlines.json | 28 ++ .../notebooks/challenge-headline-llm.ipynb | 312 +++++++++--------- .../src/challenge_headline/__init__.py | 0 .../challenge_headline_llm.py | 134 ++++++++ .../notebook/src/utils/__init__.py | 0 .../notebook/src/utils/bedrock.py | 80 +++++ .../notebook/src/utils/print_ww.py | 21 ++ 8 files changed, 425 insertions(+), 152 deletions(-) create mode 100644 apps/openchallenges/notebook/challenge_headlines.json create mode 100644 apps/openchallenges/notebook/src/challenge_headline/__init__.py create mode 100644 apps/openchallenges/notebook/src/challenge_headline/challenge_headline_llm.py create mode 100644 apps/openchallenges/notebook/src/utils/__init__.py create mode 100644 apps/openchallenges/notebook/src/utils/bedrock.py create mode 100644 apps/openchallenges/notebook/src/utils/print_ww.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 25e0c3d68e..03e0ffd14e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,6 +12,8 @@ "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, "editor.tabSize": 4 + // This settings still works even if VS Code does not recognize it. + // "notebook.formatOnSave.enabled": true }, "[json][jsonc]": { "editor.defaultFormatter": "esbenp.prettier-vscode", diff --git a/apps/openchallenges/notebook/challenge_headlines.json b/apps/openchallenges/notebook/challenge_headlines.json new file mode 100644 index 0000000000..ae0e40f347 --- /dev/null +++ b/apps/openchallenges/notebook/challenge_headlines.json @@ -0,0 +1,28 @@ +[ + { + "id": 279, + "slug": "niddk-central-repository-data-centric-challenge", + "name": "NIDDK Central Repository Data-Centric Challenge", + "headline": "Enhancing NIDDK datasets for future Artificial Intelligence (AI) applications.", + "headline_alternatives": [ + "1. Challenge Seeks AI Solutions to Standardize NIDDK Data", + "2. Data Challenge Aims to Ready NIDDK Data for AI Discovery ", + "3. Making NIDDK Data AI-Ready Via Data Standardization Challenge", + "4. Challenge Invites AI to Standardize NIDDK Data for Reuse", + "5. Data Challenge Seeks AI Tools to Improve NIDDK Data Reuse" + ] + }, + { + "id": 278, + "slug": "qbi-hackathon", + "name": "QBI hackathon", + "headline": "A 48-hour event connecting the Bay Area developer community with scientists ...", + "headline_alternatives": [ + "1. Hackathon connects developers and scientists to advance biomedical research ", + "2. 48-hour hackathon applies AI to biomedical data ", + "3. Developers and scientists collaborate on biomedical challenges", + "4. Hackathon pushes science ahead with latest algorithms ", + "5. Event connects Bay Area developers and scientists" + ] + } +] \ No newline at end of file diff --git a/apps/openchallenges/notebook/notebooks/challenge-headline-llm.ipynb b/apps/openchallenges/notebook/notebooks/challenge-headline-llm.ipynb index 207f6b89a0..3dad5722ce 100644 --- a/apps/openchallenges/notebook/notebooks/challenge-headline-llm.ipynb +++ b/apps/openchallenges/notebook/notebooks/challenge-headline-llm.ipynb @@ -7,7 +7,7 @@ "tags": [] }, "source": [ - "# Generate Challenge Headlines" + "# Generate Challenge Headlines with AWS Bedrock" ] }, { @@ -43,6 +43,45 @@ "- Access to OpenAI API" ] }, + { + "cell_type": "markdown", + "id": "2dd21cb4", + "metadata": {}, + "source": [ + "## Preparation" + ] + }, + { + "cell_type": "markdown", + "id": "c74bd100", + "metadata": {}, + "source": [ + "Load config file `.env`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b9a13bdf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, { "cell_type": "markdown", "id": "c90710c2-f053-44ae-a3c2-610eecff9073", @@ -53,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "f37a60d9", "metadata": {}, "outputs": [], @@ -65,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "8fc3ac4c-2ceb-4bbc-bdb8-3bb8be08dfc6", "metadata": {}, "outputs": [], @@ -78,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "b2f952f5-9140-4702-8a96-3457ca4df841", "metadata": {}, "outputs": [], @@ -99,158 +138,48 @@ ] }, { - "cell_type": "code", - "execution_count": 4, - "id": "6d590b17", + "cell_type": "markdown", + "id": "1ff9d4b9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" + "## Prepare the challenge descriptions" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "488632df", + "execution_count": 20, + "id": "2fcad35d", "metadata": {}, "outputs": [], "source": [ - "import openai" + "import requests\n", + "from bs4 import BeautifulSoup" ] }, { - "cell_type": "code", - "execution_count": 6, - "id": "9158b9b9", + "cell_type": "markdown", + "id": "49d20733", "metadata": {}, - "outputs": [], "source": [ - "# Source: https://medium.com/muthoni-wanyoike/implementing-text-summarization-using-openais-gpt-3-api-dcd6be4f6933\n", - "def split_text(text):\n", - " max_chunk_size = 2048\n", - " chunks = []\n", - " current_chunk = \"\"\n", - " for sentence in text.split(\".\"):\n", - " if len(current_chunk) + len(sentence) < max_chunk_size:\n", - " current_chunk += sentence + \".\"\n", - " else:\n", - " chunks.append(current_chunk.strip())\n", - " current_chunk = sentence + \".\"\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " return chunks" + "TODO" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "1ea1b66b", + "execution_count": null, + "id": "eb6c1ada", "metadata": {}, "outputs": [], - "source": [ - "def generate_challenge_headline(text):\n", - " prompt=(\n", - " \"Please generate five headlines that have a maximum ten words from the following \"\n", - " \"challenge description. The headline must summarize the goal of the challenge. \"\n", - " f\"Description: \\n{text}\"\n", - " )\n", - " response = openai.ChatCompletion.create(\n", - " model=\"gpt-3.5-turbo\",\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": prompt},\n", - " ],\n", - " max_tokens=1024,\n", - " temperature=0.5\n", - " )\n", - " return response['choices'][0]['message']['content']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d0c0b308-0b58-44a7-8ff6-4987dfbccb17", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "('1. NIDDK Data Centric Challenge: Enhancing Repository for AI Research\\n'\n", - " '2. Unlocking Insights: NIDDK Challenge Improves Data Quality for AI\\n'\n", - " \"3. NIDDK-CR's Data Centric Challenge: Advancing AI-driven Discovery\\n\"\n", - " '4. Bridging the Gap: NIDDK Challenge Boosts Data Collaboration for AI\\n'\n", - " '5. NIDDK Repository Challenge: Making Research Data FAIR for AI')\n" - ] - } - ], "source": [ "challenge = challenges[0]\n", - "result = generate_challenge_headline(challenge.description)\n", - "pprint(result)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "f2bd77de", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": 279,\n", - " \"slug\": \"niddk-central-repository-data-centric-challenge\",\n", - " \"name\": \"NIDDK Central Repository Data-Centric Challenge\",\n", - " \"headline\": \"Enhancing NIDDK datasets for future Artificial Intelligence (AI) applications.\",\n", - " \"headline_alternatives\": [\n", - " \"1. NIDDK Data Centric Challenge: Enhancing Repository for AI Research\",\n", - " \"2. Unlocking Insights: NIDDK Challenge Improves Data Quality for AI\",\n", - " \"3. NIDDK-CR's Data Centric Challenge: Advancing AI-driven Discovery\",\n", - " \"4. Bridging the Gap: NIDDK Challenge Boosts Data Collaboration for AI\",\n", - " \"5. NIDDK Repository Challenge: Making Research Data FAIR for AI\"\n", - " ]\n", - "}\n" - ] - } - ], - "source": [ - "from itertools import compress\n", - "import json\n", "\n", - "raw_headlines = result.splitlines()\n", + "response = requests.get(challenge.website_url)\n", "\n", - "def is_raw_headline(raw_headline):\n", - " prefixes = (\"1. \", \"2. \", \"3. \", \"4. \", \"5. \")\n", - " return raw_headline.startswith(prefixes)\n", "\n", - "headlines = list(compress(raw_headlines, map(is_raw_headline, raw_headlines)))\n", "\n", - "obj = {\n", - " \"id\": challenge.id,\n", - " \"slug\": challenge.slug,\n", - " \"name\": challenge.name,\n", - " \"headline\": challenge.headline,\n", - " \"headline_alternatives\": headlines\n", - "}\n", - "json_str = json.dumps(obj, indent=2)\n", "\n", - "print(json_str)" + "\n", + "pprint(response)" ] }, { @@ -258,7 +187,7 @@ "id": "14ba8e14", "metadata": {}, "source": [ - "## Generating challenge headlines with AWS LLM" + "## Generate the headlines with AWS Bedrock" ] }, { @@ -338,25 +267,17 @@ " )\n" ] }, - { - "cell_type": "markdown", - "id": "83d704d1", - "metadata": {}, - "source": [ - "Call API and output results" - ] - }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "451eca6a", "metadata": {}, "outputs": [], "source": [ - "def generate_challenge_headline(text):\n", + "def generate_challenge_headlines(text, num_headlines):\n", " prompt=(\n", - " \"Please generate five headlines that have a maximum ten words from the following \"\n", - " \"challenge description. The headline must summarize the goal of the challenge. \"\n", + " f\"Please generate {num_headlines} headlines that have a maximum ten words from the \"\n", + " \"following challenge description. The headline must summarize the goal of the challenge. \"\n", " f\"Description: \\n{text}\"\n", " )\n", " response = Bedrock(model_id = \"anthropic.claude-v2\",\n", @@ -366,6 +287,21 @@ " return response" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1a397957", + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import compress\n", + "import json\n", + "\n", + "def is_raw_headline(raw_headline):\n", + " prefixes = (\"1. \", \"2. \", \"3. \", \"4. \", \"5. \")\n", + " return raw_headline.startswith(prefixes)" + ] + }, { "cell_type": "markdown", "id": "903a6de3", @@ -380,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "b0c4c192", "metadata": {}, "outputs": [ @@ -391,31 +327,103 @@ "(' Here are 5 headlines with a maximum of 10 words summarizing the goal of the '\n", " 'challenge:\\n'\n", " '\\n'\n", - " '1. Challenge Seeks to Standardize Data for AI Discovery\\n'\n", + " '1. Challenge Seeks to Improve Data for AI Discovery\\n'\n", " '\\n'\n", - " '2. Improve Data Quality for AI Research, Says NIDDK Challenge \\n'\n", + " '2. Data Challenge Aims to Augment Repository for AI Use \\n'\n", " '\\n'\n", - " '3. NIDDK Launches Data Challenge to Boost AI Reuse\\n'\n", + " '3. Challenge Targets Data Standardization for AI Research\\n'\n", " '\\n'\n", - " '4. Challenge Aims to Ready Data for AI Insights\\n'\n", + " '4. Competition Focuses on Making Data AI Ready \\n'\n", " '\\n'\n", - " '5. Data Challenge Targets Interoperability for AI')\n" + " '5. Challenge Works to Ready Data for AI Insights')\n" ] } ], "source": [ "challenge = challenges[0]\n", - "result = generate_challenge_headline(challenge.description)\n", - "pprint(result)\n" + "response = generate_challenge_headlines(challenge.description, 5)\n", + "\n", + "pprint(response)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": 279,\n", + " \"slug\": \"niddk-central-repository-data-centric-challenge\",\n", + " \"name\": \"NIDDK Central Repository Data-Centric Challenge\",\n", + " \"headline\": \"Enhancing NIDDK datasets for future Artificial Intelligence (AI) applications.\",\n", + " \"headline_alternatives\": [\n", + " \"1. Challenge Seeks to Improve Data for AI Discovery\",\n", + " \"2. Data Challenge Aims to Augment Repository for AI Use \",\n", + " \"3. Challenge Targets Data Standardization for AI Research\",\n", + " \"4. Competition Focuses on Making Data AI Ready \",\n", + " \"5. Challenge Works to Ready Data for AI Insights\"\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "raw_headlines = response.splitlines()\n", + "headlines = list(compress(raw_headlines, map(is_raw_headline, raw_headlines)))\n", + "\n", + "obj = {\n", + " \"id\": challenge.id,\n", + " \"slug\": challenge.slug,\n", + " \"name\": challenge.name,\n", + " \"headline\": challenge.headline,\n", + " \"headline_alternatives\": headlines\n", + "}\n", + "json_str = json.dumps(obj, indent=2)\n", + "\n", + "print(json_str)" + ] + }, + { + "cell_type": "markdown", "id": "12d3b54d", "metadata": {}, + "source": [ + "### Output challenge headlines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4496fd1d", + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from itertools import compress\n", + "import json\n", + "\n", + "raw_headlines = result.splitlines()\n", + "\n", + "def is_raw_headline(raw_headline):\n", + " prefixes = (\"1. \", \"2. \", \"3. \", \"4. \", \"5. \")\n", + " return raw_headline.startswith(prefixes)\n", + "\n", + "headlines = list(compress(raw_headlines, map(is_raw_headline, raw_headlines)))\n", + "\n", + "obj = {\n", + " \"id\": challenge.id,\n", + " \"slug\": challenge.slug,\n", + " \"name\": challenge.name,\n", + " \"headline\": challenge.headline,\n", + " \"headline_alternatives\": headlines\n", + "}\n", + "json_str = json.dumps(obj, indent=2)\n", + "\n", + "print(json_str)" + ] } ], "metadata": { diff --git a/apps/openchallenges/notebook/src/challenge_headline/__init__.py b/apps/openchallenges/notebook/src/challenge_headline/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/openchallenges/notebook/src/challenge_headline/challenge_headline_llm.py b/apps/openchallenges/notebook/src/challenge_headline/challenge_headline_llm.py new file mode 100644 index 0000000000..9806b571ef --- /dev/null +++ b/apps/openchallenges/notebook/src/challenge_headline/challenge_headline_llm.py @@ -0,0 +1,134 @@ +# Requirements +# +# - Login with AWS: aws --profile cnb sso login + +from dotenv import load_dotenv + +import openchallenges_client +from pprint import pprint +from openchallenges_client.api import challenge_api + +load_dotenv() + +# List challenges from OC.io + +# See configuration.py for a list of all supported configuration parameters. +configuration = openchallenges_client.Configuration( + host="https://openchallenges.io/api/v1" +) + +# Enter a context with an instance of the API client +challenges = [] +with openchallenges_client.ApiClient(configuration) as api_client: + api_instance = challenge_api.ChallengeApi(api_client) + + query = openchallenges_client.ChallengeSearchQuery(page_number=1, page_size=1000) + pprint(query) + + try: + # Get the first page of the list of challenges + page = api_instance.list_challenges(query) + pprint(page.size) + challenges.extend(page.challenges) + except openchallenges_client.ApiException as e: + print("Exception when calling ChallengeApi->list_challenges: %s\n" % e) + +# Sort challenge by ID +# challenges.sort(key=lambda challenge: challenge.id, reverse=False) +# pprint(challenges[:2]) + +print(len(challenges)) + +import sys + +sys.exit() + +# GENERATE THE HEADLINES WITH AWS BEDROCK + +# Configure the Bedrock client + +import json +import os +import sys + +import boto3 +import botocore + +module_path = "src" +sys.path.append(os.path.abspath(module_path)) +from utils import bedrock, print_ww + +os.environ["AWS_DEFAULT_REGION"] = "us-east-1" +os.environ["AWS_PROFILE"] = "cnb" + +boto3_bedrock = bedrock.get_bedrock_client( + assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None), + region=os.environ.get("AWS_DEFAULT_REGION", None), +) + +# Configure base model options + +from langchain.llms.bedrock import Bedrock + +inference_modifier = { + "max_tokens_to_sample": 6000, + "temperature": 0.6, + "top_k": 250, + "top_p": 1, + "stop_sequences": ["\n\nHuman"], +} + +textgen_llm = Bedrock( + model_id="anthropic.claude-v2", + client=boto3_bedrock, + model_kwargs=inference_modifier, +) + + +def generate_challenge_headlines(text, num_headlines): + prompt = ( + f"Please generate {num_headlines} headlines that have a maximum ten words from the " + "following challenge description. The headline must summarize the goal of the challenge. " + f"Description: \n{text}" + ) + response = Bedrock( + model_id="anthropic.claude-v2", + client=boto3_bedrock, + model_kwargs=inference_modifier, + )(prompt) + return response + + +from itertools import compress +import json + + +def is_raw_headline(raw_headline): + prefixes = ("1. ", "2. ", "3. ", "4. ", "5. ") + return raw_headline.startswith(prefixes) + + +def process_challenge(challenge): + print(f"Processing challenge ID {challenge.id}: {challenge.name}") + response = generate_challenge_headlines(challenge.description, 5) + + raw_headlines = response.splitlines() + headlines = list(compress(raw_headlines, map(is_raw_headline, raw_headlines))) + + obj = { + "id": challenge.id, + "slug": challenge.slug, + "name": challenge.name, + "headline": challenge.headline, + "headline_alternatives": headlines, + } + return obj + + +challenge_headlines = list(map(process_challenge, challenges[:2])) + + +# SAVE OUTPUT TO FILE + +with open("challenge_headlines.json", "w") as f: + json.dump(challenge_headlines, f, indent=2) diff --git a/apps/openchallenges/notebook/src/utils/__init__.py b/apps/openchallenges/notebook/src/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/openchallenges/notebook/src/utils/bedrock.py b/apps/openchallenges/notebook/src/utils/bedrock.py new file mode 100644 index 0000000000..b959e1009a --- /dev/null +++ b/apps/openchallenges/notebook/src/utils/bedrock.py @@ -0,0 +1,80 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +"""Helper utilities for working with Amazon Bedrock from Python notebooks""" +# Python Built-Ins: +import os +from typing import Optional + +# External Dependencies: +import boto3 +from botocore.config import Config + + +def get_bedrock_client( + assumed_role: Optional[str] = None, + region: Optional[str] = None, + runtime: Optional[bool] = True, +): + """Create a boto3 client for Amazon Bedrock, with optional configuration overrides + + Parameters + ---------- + assumed_role : + Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not + specified, the current active credentials will be used. + region : + Optional name of the AWS Region in which the service should be called (e.g. "us-east-1"). + If not specified, AWS_REGION or AWS_DEFAULT_REGION environment variable will be used. + runtime : + Optional choice of getting different client to perform operations with the Amazon Bedrock service. + """ + if region is None: + target_region = os.environ.get( + "AWS_REGION", os.environ.get("AWS_DEFAULT_REGION") + ) + else: + target_region = region + + print(f"Create new client\n Using region: {target_region}") + session_kwargs = {"region_name": target_region} + client_kwargs = {**session_kwargs} + + profile_name = os.environ.get("AWS_PROFILE") + if profile_name: + print(f" Using profile: {profile_name}") + session_kwargs["profile_name"] = profile_name + + retry_config = Config( + region_name=target_region, + retries={ + "max_attempts": 10, + "mode": "standard", + }, + ) + session = boto3.Session(**session_kwargs) + + if assumed_role: + print(f" Using role: {assumed_role}", end="") + sts = session.client("sts") + response = sts.assume_role( + RoleArn=str(assumed_role), RoleSessionName="langchain-llm-1" + ) + print(" ... successful!") + client_kwargs["aws_access_key_id"] = response["Credentials"]["AccessKeyId"] + client_kwargs["aws_secret_access_key"] = response["Credentials"][ + "SecretAccessKey" + ] + client_kwargs["aws_session_token"] = response["Credentials"]["SessionToken"] + + if runtime: + service_name = "bedrock-runtime" + else: + service_name = "bedrock" + + bedrock_client = session.client( + service_name=service_name, config=retry_config, **client_kwargs + ) + + print("boto3 Bedrock client successfully created!") + print(bedrock_client._endpoint) + return bedrock_client diff --git a/apps/openchallenges/notebook/src/utils/print_ww.py b/apps/openchallenges/notebook/src/utils/print_ww.py new file mode 100644 index 0000000000..b03ad2c10a --- /dev/null +++ b/apps/openchallenges/notebook/src/utils/print_ww.py @@ -0,0 +1,21 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +"""General helper utilities the workshop notebooks""" +# Python Built-Ins: +from io import StringIO +import sys +import textwrap + + +def print_ww(*args, width: int = 100, **kwargs): + """Like print(), but wraps output to `width` characters (default 100)""" + buffer = StringIO() + try: + _stdout = sys.stdout + sys.stdout = buffer + print(*args, **kwargs) + output = buffer.getvalue() + finally: + sys.stdout = _stdout + for line in output.splitlines(): + print("\n".join(textwrap.wrap(line, width=width)))