Skip to content

Commit

Permalink
Generate 5 headlines for a challenge
Browse files Browse the repository at this point in the history
  • Loading branch information
tschaffter committed Oct 24, 2023
1 parent 9110fc4 commit 50eab17
Show file tree
Hide file tree
Showing 3 changed files with 528 additions and 16 deletions.
131 changes: 116 additions & 15 deletions apps/openchallenges/notebook/notebooks/openai-challenge-headline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -100,33 +100,134 @@
},
{
"cell_type": "code",
"execution_count": 24,
"id": "d0c0b308-0b58-44a7-8ff6-4987dfbccb17",
"execution_count": 44,
"id": "6d590b17",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Challenge(id=279, slug='niddk-central-repository-data-centric-challenge', name='NIDDK Central Repository Data-Centric Challenge', headline='Enhancing NIDDK datasets for future Artificial Intelligence (AI) applications.', description='The National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK) Central Repository (https://repository.niddk.nih.gov/home/) is conducting a Data Centric Challenge aimed at augmenting existing Repository data for future secondary research including data-driven discovery by artificial intelligence (AI) researchers. The NIDDK Central Repository (NIDDK-CR) program strives to increase the utilization and impact of the resources under its guardianship. However, lack of standardization and consistent metadata within and across studies limit the ability of secondary researchers to easily combine datasets from related studies to generate new insights using data science methods. In the fall of 2021, the NIDDK-CR began implementing approaches to augment data quality to improve AI-readiness by making research data FAIR (findable, accessible, interoperable, and reusable) via a small pilot project utilizing Natural Language Processing (NLP) to tag study variables. 
In 2022, the NIDD...', doi='', status=<ChallengeStatus.ACTIVE: 'active'>, difficulty=<ChallengeDifficulty.INTERMEDIATE: 'intermediate'>, platform=SimpleChallengePlatform(id=14, slug='other', name='Other'), website_url='https://www.challenge.gov/?challenge=niddk-central-repository-data-centric-challenge', avatar_url='', incentives=[<ChallengeIncentive.PUBLICATION: 'publication'>, <ChallengeIncentive.SPEAKING_ENGAGEMENT: 'speaking_engagement'>, <ChallengeIncentive.OTHER: 'other'>], submission_types=[<ChallengeSubmissionType.PREDICTION_FILE: 'prediction_file'>, <ChallengeSubmissionType.NOTEBOOK: 'notebook'>], input_data_types=[], start_date=datetime.date(2023, 9, 20), end_date=datetime.date(2023, 11, 3), starred_count=0, created_at=datetime.datetime(2023, 10, 18, 16, 58, 17, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2023, 10, 18, 20, 52, 49, tzinfo=datetime.timezone.utc))"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"'sk-********  [REDACTED: OpenAI API key was committed in this cell output — the key is compromised and must be revoked/rotated immediately]'\n"
]
}
],
"source": [
"challenge = challenges[0]\n",
"challenge"
"from dotenv import dotenv_values\n",
"\n",
"config = dotenv_values(\"../.env\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "488632df",
"metadata": {},
"outputs": [],
"source": [
"import openai\n",
"\n",
"openai.api_key = config['OPENAI_API_KEY']"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "9158b9b9",
"metadata": {},
"outputs": [],
"source": [
"# Source: https://medium.com/muthoni-wanyoike/implementing-text-summarization-using-openais-gpt-3-api-dcd6be4f6933\n",
"def split_text(text, max_chunk_size=2048):\n",
"    \"\"\"Split ``text`` into chunks of at most ``max_chunk_size`` characters.\n",
"\n",
"    Splitting happens on '.' so chunks end on (approximate) sentence\n",
"    boundaries. NOTE: this is naive — abbreviations such as 'e.g.' also\n",
"    split, which is acceptable for prompt-chunking purposes.\n",
"\n",
"    Args:\n",
"        text: The text to split.\n",
"        max_chunk_size: Maximum size of each chunk in characters\n",
"            (default 2048, the value previously hard-coded).\n",
"\n",
"    Returns:\n",
"        A list of non-empty, stripped text chunks.\n",
"    \"\"\"\n",
"    chunks = []\n",
"    current_chunk = \"\"\n",
"    for sentence in text.split(\".\"):\n",
"        if not sentence:\n",
"            # Skip empty fragments (e.g. from a trailing '.'), which\n",
"            # previously appended a stray extra '.' to the last chunk.\n",
"            continue\n",
"        # '+ 1' accounts for the '.' re-appended below; the original\n",
"        # check ignored it, so chunks could exceed max_chunk_size.\n",
"        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:\n",
"            current_chunk += sentence + \".\"\n",
"        else:\n",
"            if current_chunk:\n",
"                chunks.append(current_chunk.strip())\n",
"            current_chunk = sentence + \".\"\n",
"    if current_chunk:\n",
"        chunks.append(current_chunk.strip())\n",
"    return chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfaaa94a",
"execution_count": 60,
"id": "0116c3f8",
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Source: https://medium.com/muthoni-wanyoike/implementing-text-summarization-using-openais-gpt-3-api-dcd6be4f6933\n",
"# Legacy implementation kept for reference; superseded by\n",
"# generate_challenge_headline below, which uses the chat API.\n",
"# NOTE(review): 'engine'-based openai.Completion with the bare 'davinci'\n",
"# model is a deprecated (pre-1.0) openai-python API surface.\n",
"# NOTE(review): one headline is requested PER chunk and the results are\n",
"# joined with spaces, so long descriptions yield several concatenated\n",
"# headlines rather than one — presumably why this path was replaced.\n",
"def generate_challenge_headline_legacy(text):\n",
" input_chunks = split_text(text)\n",
" output_chunks = []\n",
" for chunk in input_chunks:\n",
" response = openai.Completion.create(\n",
" engine=\"davinci\",\n",
" prompt=(f\"Please generate a headline with maximum ten words from the following challenge description:\\n{chunk}\"),\n",
" temperature=0.5,\n",
" max_tokens=1024,\n",
" n = 1,\n",
" stop=None\n",
" )\n",
" # Take the first (only, n=1) completion for this chunk.\n",
" summary = response.choices[0].text.strip()\n",
" output_chunks.append(summary)\n",
" return \" \".join(output_chunks)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "1ea1b66b",
"metadata": {},
"outputs": [],
"source": [
"def generate_challenge_headline(text, num_headlines=5, model=\"gpt-3.5-turbo\"):\n",
"    \"\"\"Generate candidate headlines for a challenge description.\n",
"\n",
"    Sends the full description in a single chat request (no chunking,\n",
"    unlike the legacy completion-based implementation).\n",
"\n",
"    Args:\n",
"        text: The challenge description to summarize.\n",
"        num_headlines: Number of headline candidates to request\n",
"            (default 5, the value previously hard-coded in the prompt).\n",
"        model: Chat model name (default \"gpt-3.5-turbo\").\n",
"\n",
"    Returns:\n",
"        The raw assistant message content — a numbered list of headlines\n",
"        as a single string.\n",
"    \"\"\"\n",
"    prompt = (\n",
"        f\"Please generate {num_headlines} headlines that have a maximum ten words from the following \"\n",
"        \"challenge description. The headline must summarize the goal of the challenge. \"\n",
"        f\"Description: \\n{text}\"\n",
"    )\n",
"    response = openai.ChatCompletion.create(\n",
"        model=model,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"            {\"role\": \"user\", \"content\": prompt},\n",
"        ],\n",
"        max_tokens=1024,\n",
"        # Moderate temperature: some variety across the candidates while\n",
"        # staying close to the description.\n",
"        temperature=0.5,\n",
"    )\n",
"    return response['choices'][0]['message']['content']"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "d0c0b308-0b58-44a7-8ff6-4987dfbccb17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('1. \"NIDDK Data Centric Challenge: Enhancing Repository Data for AI '\n",
" 'Research\"\\n'\n",
" '2. \"Improving Data Quality for AI-Driven Discoveries: NIDDK Challenge\"\\n'\n",
" '3. \"Unlocking Insights: NIDDK Challenge to Combine Datasets for AI '\n",
" 'Research\"\\n'\n",
" '4. \"NIDDK-CR Pilot Project: Making Research Data FAIR for AI\"\\n'\n",
" '5. \"NLP Tagging Study Variables: NIDDK Challenge for Data Standardization\"')\n"
]
}
],
"source": [
"challenge = challenges[0]\n",
"result = generate_challenge_headline(challenge.description)\n",
"pprint(result)\n"
]
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 50eab17

Please sign in to comment.