From 4b35acf5ee0cd1e99e33e564891e299d144e39d4 Mon Sep 17 00:00:00 2001
From: Chau Pham
Date: Sun, 10 Nov 2024 13:11:19 -0500
Subject: [PATCH] website release

---
 README.md                       |   2 +-
 assets/{ => img}/pipeline.png   | Bin
 assets/styles/main.css          | 217 ++++++++++
 demo.ipynb                      |   4 +-
 index.html                      | 676 ++++++++++++++++++++++++++++++++
 setup.py                        |   4 +-
 topicgpt_python/correction.py   |  13 +-
 topicgpt_python/generation_1.py |  16 +
 topicgpt_python/generation_2.py |   3 +
 9 files changed, 929 insertions(+), 6 deletions(-)
 rename assets/{ => img}/pipeline.png (100%)
 create mode 100644 assets/styles/main.css
 create mode 100644 index.html

diff --git a/README.md b/README.md
index 691bb82..8f55e36 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This repository contains scripts and prompts for our paper ["TopicGPT: Topic Mod
 - `assign_topics` assigns the generated topics to the input text, along with a quote that supports the assignment.
 - `correct_topics` corrects the generated topics by reprompting the model so that the final topic assignment is grounded in the topic list.
 
-![TopicGPT Pipeline Overview](assets/pipeline.png)
+![TopicGPT Pipeline Overview](assets/img/pipeline.png)
 
 ## 📣 Updates
 - [11/09/24] Python package `topicgpt_python` is released! You can install it via `pip install topicgpt_python`. We support OpenAI API, Vertex AI, and vLLM (requires GPUs for inference). See [PyPI](https://pypi.org/project/topicgpt-python/).

diff --git a/assets/pipeline.png b/assets/img/pipeline.png
similarity index 100%
rename from assets/pipeline.png
rename to assets/img/pipeline.png

diff --git a/assets/styles/main.css b/assets/styles/main.css
new file mode 100644
index 0000000..4c16983
--- /dev/null
+++ b/assets/styles/main.css
@@ -0,0 +1,217 @@
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');
+
+html {
+    font-style: normal;
+    font-weight: normal;
+    -ms-text-size-adjust: 100%;
+    -webkit-text-size-adjust: 100%;
+}
+
+body {
+    max-width: 60em;
+    margin: 2em auto;
+    padding: 0em 1em;
+    font-family: 'Inter', sans-serif;
+}
+
+h1 {
+    font-size: 40px;
+    color: #09194f;
+    padding-top: 15px;
+    font-weight: 700;
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+}
+
+@media (max-width: 1000px) {
+    .responsive-header {
+        max-width: 100%;
+        height: auto;
+        padding-bottom: 20px;
+    }
+
+    .responsive-img {
+        max-width: 70%;
+        height: auto;
+        padding-bottom: 20px;
+    }
+
+    .responsive-icon {
+        max-width: 20%;
+        height: auto;
+    }
+}
+
+.title {
+    font-size: 20px;
+    text-align: center;
+    background-color: #f9ce00;
+    padding: 5px 10px;
+    display: inline-block;
+    font-weight: 600;
+}
+
+.link-body {
+    color: #00818a;
+    font-size: 16px;
+}
+
+.link-body:hover {
+    padding: 0 0 5px;
+    background-color: #fff48f;
+    text-decoration: none;
+}
+
+.highlighted {
+    font-weight: bold;
+}
+
+p {
+    font-size: 16px;
+    color: rgb(48, 47, 47);
+    margin: 0 0 1em 0;
+    line-height: 21px;
+    font-family: 'Inter', sans-serif;
+    font-weight: 500;
+}
+
+b {
+    color: #09194f;
+}
+
+figcaption {
+    font-size: 13px;
+    font-style: italic;
+    text-align: center;
+}
+
+.dropping {
+    font-family: 'Menlo', monospace;
+    font-size: 10px;
+    width: 100%;
+    white-space: pre-wrap;
+    word-wrap: break-word;
+    overflow-wrap: break-word;
+    text-align: left;
+}
+
+.publication_block {
+    padding-top: 10px;
+    padding-bottom: 10px;
+}
+
+.hidden {
+    display: none;
+}
+
+.unhidden {
+    display: table;
+    position: relative;
+}
+
+.texttt {
+    font-family: 'Menlo', monospace;
+}
+
+.left-column {
+    width: 40%;
+    border-right: 1px solid #ccc;
+    padding: 10px;
+}
+
+.right-column {
+    width: 60%;
+    padding: 10px;
+}
+
+.image-container {
+    display: flex;
+    padding-top: 10px;
+    max-width: 50%;
+    gap: 20px;
+    justify-content: center;
+}
+
+.nord-dark-mode {
+    background-color: #2E3440;
+    color: #D8DEE9;
+}
+
+.nord-dark-mode a {
+    color: #88C0D0;
+}
+
+.nord-dark-mode a:hover {
+    padding: 0 0 5px;
+    background-color: #fff48f;
+    color: #00818a;
+}
+
+.nord-dark-mode hr {
+    border-color: #4C566A;
+}
+
+.nord-dark-mode .container {
+    border-color: #4C566A;
+}
+
+.nord-dark-mode p {
+    color: #D8DEE9;
+}
+
+.nord-dark-mode b {
+    color: #D8DEE9;
+}
+
+.nord-dark-mode h1 {
+    color: #D8DEE9;
+}
+
+.title {
+    color: black;
+}
+
+.dark-mode-toggle {
+    position: static;
+    cursor: pointer;
+    width: 16px;
+    height: 16px;
+}
+
+.nord-dark-mode .bibtex {
+    color: #D8DEE9;
+}
+
+code {
+    font-family: 'Menlo', monospace;
+    font-size: 14px;
+    background-color: #f7f7f7;
+    padding: 5px;
+    border-radius: 5px;
+    color: #d63333;
+    white-space: pre-wrap;
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+}
+
+.code-block {
+    overflow-x: auto;
+    padding: 1em;
+    background-color: #f7f7f7;
+    border-radius: 5px;
+}
+
+pre {
+    overflow-x: auto;
+    white-space: pre-wrap;
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+    font-family: 'Menlo', monospace;
+    font-size: 14px;
+    padding: 0;
+    margin: 0;
+    color: #d63333;
+}

diff --git a/demo.ipynb b/demo.ipynb
index 0a738eb..2ecff0f 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -13,7 +13,7 @@
     "- `assignment` assigns the generated topics to the input text, along with a quote that supports the assignment.\n",
     "- `correction` corrects the generated topics by reprompting the model so that the topic assignment is grounded in the topic list. \n",
     "\n",
-    "![topicgpt_python](assets/pipeline.png)"
+    "![topicgpt_python](assets/img/pipeline.png)"
    ]
  },
  {
@@ -242,7 +242,7 @@
    "source": [
     "### Topic Refinement\n",
     "If topics are generated by a weaker model, there sometimes exist irrelevant or redundant topics. This module: \n",
-    "- Merges similar topics using Cosine Similarity scores. \n",
+    "- Merges similar topics.\n",
     "- Removes overly specific or redundant topics that occur < 1% of the time (you can skip this by setting `remove` to False in `config.yml`).\n",
     "- Expect the refined topics in `data/output/{data_name}/refinement_1.md` and `data/output/{data_name}/refinement_1.jsonl`. If nothing happens, it means that the topic list is coherent.\n",
     "- If you're unsatisfied with the refined topics, call the function again and set `refined_again` to True in the function call. "

diff --git a/index.html b/index.html
new file mode 100644
index 0000000..033896f
--- /dev/null
+++ b/index.html
@@ -0,0 +1,676 @@
+TopicGPT (NAACL'24)
+
+TopicGPT: A Prompt-based Topic Modeling Framework
+
+Chau Minh Pham🔍, Alexander Hoyle🔦, Simeng Sun🔍,
+Philip Resnik🔦, Mohit Iyyer🔍
+
+🔍University of Massachusetts Amherst
+🔦University of Maryland College Park
+
+[Paper] [Code]
+
+TLDR;
+
+1. We introduce TopicGPT, a prompt-based framework that uses large
+language models (LLMs) to uncover latent topics in a text collection.
+TopicGPT generates interpretable topics, dispensing with ambiguous
+bags of words in favor of topics with natural language labels and
+associated free-form descriptions.
+
+2. TopicGPT works in three main stages:
+  • Generation: It generates high-level topics using a prompt-based approach.
+  • Refinement: It refines the topics by merging similar ones and removing outliers.
+  • Assignment: It assigns topics to documents with supporting quotes.
+
+3. TopicGPT produces topics that align better with human
+categorizations than competing methods do: it achieves a harmonic
+mean purity of 0.74 against human-annotated Wikipedia topics, compared
+to 0.64 for the strongest baseline.
+
+Data Preparation
+
+1. Prepare your .jsonl data file in the following format (a short
+script for producing such a file appears after these steps):
+
+  {
+      "id": "IDs (optional)",
+      "text": "Documents",
+      "label": "Ground-truth labels (optional)"
+  }
+
+2. Place your data file in the data/input directory. A
+sample data file, data/input/sample.jsonl, is provided to
+help you debug the code.
+
+3. You can download the raw datasets used in the paper (Bills and
+Wiki) from the following link: Dataset Link.
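+
+As an illustration, here is a minimal sketch that writes documents in
+the format above to data/input/sample.jsonl; the example texts and
+labels are invented for demonstration:
+
+  import json
+
+  # Hypothetical documents; "id" and "label" are optional fields.
+  docs = [
+      {"id": "0", "text": "A bill to expand rural broadband access.", "label": "Technology"},
+      {"id": "1", "text": "A bill to adjust crop insurance subsidies.", "label": "Agriculture"},
+  ]
+
+  # One JSON object per line, as expected by the pipeline.
+  with open("data/input/sample.jsonl", "w") as f:
+      for doc in docs:
+          f.write(json.dumps(doc) + "\n")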

+Setting up
+
+Check out demo.ipynb for a complete pipeline and more
+detailed instructions. We advise trying a subset with more affordable
+(or open-source) models before scaling to the full dataset.
+
+Metric calculation functions are available in
+topicgpt_python.metrics to evaluate topic alignment
+with ground-truth labels (Adjusted Rand Index, Harmonic Purity,
+Normalized Mutual Information); a sketch of what these metrics
+compute follows the steps below.
+
+Our package supports OpenAI API, Google Cloud Vertex AI API, and vLLM
+inference. vLLM requires GPUs to run. Please refer to
+OpenAI API pricing or Vertex AI pricing for cost details.
+
+1. Make a new Python 3.9+ environment using virtualenv or conda.
+
+2. Install the required packages:
+
+  pip install --upgrade topicgpt_python
+
+3. Set environment variables:
+
+  export OPENAI_API_KEY={your_openai_api_key}
+  export VERTEX_PROJECT={your_vertex_project}
+  export VERTEX_LOCATION={your_vertex_location}
+
+4. Define I/O paths in config.yml.
+
+5. Run the following code snippet to load the configuration file:
+
+  from topicgpt_python import *
+  import yaml
+
+  with open("config.yml", "r") as f:
+      config = yaml.safe_load(f)
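+
+The exact call signatures in topicgpt_python.metrics are documented in
+the package itself; purely as an illustration of what these metrics
+compute, the sketch below reproduces them with scikit-learn and a
+hand-rolled purity function over two invented label lists:
+
+  from collections import Counter
+  from statistics import harmonic_mean
+  from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
+
+  # Hypothetical ground-truth labels and predicted topic assignments.
+  truth = ["Health", "Health", "Trade", "Trade", "Education"]
+  pred = ["Medicine", "Medicine", "Commerce", "Education", "Education"]
+
+  def purity(gold, assigned):
+      # Majority gold label per predicted cluster, summed and normalized.
+      clusters = {}
+      for g, a in zip(gold, assigned):
+          clusters.setdefault(a, []).append(g)
+      return sum(Counter(v).most_common(1)[0][1] for v in clusters.values()) / len(gold)
+
+  print("ARI:", adjusted_rand_score(truth, pred))
+  print("NMI:", normalized_mutual_info_score(truth, pred))
+  # Harmonic purity: harmonic mean of purity and inverse purity.
+  print("Harmonic purity:", harmonic_mean([purity(truth, pred), purity(pred, truth)]))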
+Generating Topics
+
+Function: generate_topic_lvl1
+Generate high-level topics.
+
+  generate_topic_lvl1(
+    api, model, data, prompt_file, seed_file, out_file, topic_file, verbose
+  )
+
+Function: generate_topic_lvl2
+Generate subtopics for each top-level topic.
+
+  generate_topic_lvl2(
+    api, model, seed_file, data, prompt_file, out_file, topic_file, verbose
+  )
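+
+For example, a level-1 generation call might look like the following.
+The argument values are hypothetical; substitute the paths defined in
+your config.yml:
+
+  from topicgpt_python import generate_topic_lvl1
+
+  # Hypothetical paths for illustration only.
+  generate_topic_lvl1(
+      api="openai",
+      model="gpt-4",
+      data="data/input/sample.jsonl",
+      prompt_file="prompt/generation_1.txt",
+      seed_file="prompt/seed_1.md",
+      out_file="data/output/sample/generation_1.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      verbose=True,
+  )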
+Refining Topics
+
+If topics are generated by a weaker model, the list sometimes contains
+irrelevant or redundant topics. This module:
+  • Merges similar topics.
+  • Removes overly specific or redundant topics.
+
+Function: refine_topics
+Refine topics by merging and updating them based on the API response.
+
+  refine_topics(
+    api, model, prompt_file, generation_file, topic_file, out_file,
+    updated_file, verbose, remove, mapping_file, refined_again
+  )
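+
+A refinement call might look like this; the paths are hypothetical,
+and the remove and refined_again flags mirror the options described in
+demo.ipynb:
+
+  from topicgpt_python import refine_topics
+
+  # Hypothetical paths for illustration only.
+  refine_topics(
+      api="openai",
+      model="gpt-4",
+      prompt_file="prompt/refinement.txt",
+      generation_file="data/output/sample/generation_1.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      out_file="data/output/sample/refinement_1.md",
+      updated_file="data/output/sample/refinement_1.jsonl",
+      verbose=True,
+      remove=False,          # keep rare topics instead of dropping them
+      mapping_file="data/output/sample/mapping.json",
+      refined_again=False,   # set True to refine the refined list again
+  )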
+Assigning Topics
+
+Function: assign_topics
+Assign topics to a list of documents.
+
+  assign_topics(
+    api, model, data, prompt_file, out_file, topic_file, verbose
+  )
+
+Function: correct_topics
+Main function to parse, correct, and save topic assignments.
+
+  correct_topics(
+    api, model, data_path, prompt_path, topic_path, output_path, verbose
+  )
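+
+Assignment and correction chain naturally: assign first, then reprompt
+to ground any stray assignments in the topic list. The paths below are
+hypothetical:
+
+  from topicgpt_python import assign_topics, correct_topics
+
+  # Hypothetical paths for illustration only.
+  assign_topics(
+      api="openai",
+      model="gpt-4",
+      data="data/input/sample.jsonl",
+      prompt_file="prompt/assignment.txt",
+      out_file="data/output/sample/assignment.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      verbose=True,
+  )
+  correct_topics(
+      api="openai",
+      model="gpt-4",
+      data_path="data/output/sample/assignment.jsonl",
+      prompt_path="prompt/correction.txt",
+      topic_path="data/output/sample/generation_1.md",
+      output_path="data/output/sample/assignment_corrected.jsonl",
+      verbose=True,
+  )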
+Citation
+
+  @misc{pham2024topicgptpromptbasedtopicmodeling,
+    title={TopicGPT: A Prompt-based Topic Modeling Framework},
+    author={Chau Minh Pham and Alexander Hoyle and Simeng Sun and Philip Resnik and Mohit Iyyer},
+    year={2024},
+    eprint={2311.01449},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2311.01449},
+  }

diff --git a/setup.py b/setup.py
index 5fbc9d1..184501b 100644
--- a/setup.py
+++ b/setup.py
@@ -6,12 +6,12 @@ with open("README.md", "r") as fh:
     long_description = fh.read()
 
 long_description = long_description.replace(
-    "![TopicGPT Pipeline Overview](pipeline.png)", ""
+    "![TopicGPT Pipeline Overview](assets/img/pipeline.png)", ""
 )
 
 setup(
     name="topicgpt_python",
-    version="0.2.0",
+    version="0.2.1",
     packages=find_packages(),
     install_requires=requirements,
     author="Chau Minh Pham",

diff --git a/topicgpt_python/correction.py b/topicgpt_python/correction.py
index 2915e31..30b605c 100644
--- a/topicgpt_python/correction.py
+++ b/topicgpt_python/correction.py
@@ -186,7 +186,18 @@ def correct_batch(
 def correct_topics(
     api, model, data_path, prompt_path, topic_path, output_path, verbose=False
 ):
-    """Main function to parse, correct, and save topic assignments."""
+    """
+    Main function to parse, correct, and save topic assignments.
+
+    Parameters:
+    - api: API type (e.g., 'openai', 'vertex', 'vllm')
+    - model: Model name (e.g., 'gpt-4')
+    - data_path: Path to data file
+    - prompt_path: Path to prompt file
+    - topic_path: Path to topic file
+    - output_path: Path to save corrected output
+    - verbose: Whether to print verbose output
+    """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0
     context_len = (

diff --git a/topicgpt_python/generation_1.py b/topicgpt_python/generation_1.py
index c7794c4..4036000 100644
--- a/topicgpt_python/generation_1.py
+++ b/topicgpt_python/generation_1.py
@@ -153,6 +153,22 @@ def generate_topics(
 def generate_topic_lvl1(
     api, model, data, prompt_file, seed_file, out_file, topic_file, verbose
 ):
+    """
+    Generate high-level topics.
+
+    Parameters:
+    - api (str): API to use ('openai', 'vertex', 'vllm')
+    - model (str): Model to use
+    - data (str): Data file
+    - prompt_file (str): File to read prompts from
+    - seed_file (str): Markdown file to read seed topics from
+    - out_file (str): File to write results to
+    - topic_file (str): File to write topics to
+    - verbose (bool): Whether to print out results
+
+    Returns:
+    - topics_root (TopicTree): Root node of the topic tree
+    """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0

diff --git a/topicgpt_python/generation_2.py b/topicgpt_python/generation_2.py
index d5bb5bf..dbb64c1 100644
--- a/topicgpt_python/generation_2.py
+++ b/topicgpt_python/generation_2.py
@@ -250,6 +250,8 @@ def generate_topic_lvl2(
     - out_file: Output result file
     - topic_file: Output topics file
     - verbose: Enable verbose output
+
+    Returns: Root node of the topic tree
     """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0
@@ -295,6 +297,7 @@ def generate_topic_lvl2(
     pd.DataFrame({"text": docs, "topics": res}).to_json(
         out_file, orient="records", lines=True
     )
+    return topics_root
 
 
 if __name__ == "__main__":
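
With generate_topic_lvl2 now returning the topic tree, callers can capture
it directly. A hypothetical call, with invented paths and the argument
order shown on the project page above:

  # Hypothetical paths for illustration only.
  topics_root = generate_topic_lvl2(
      api="openai",
      model="gpt-4",
      seed_file="data/output/sample/generation_1.md",
      data="data/output/sample/generation_1.jsonl",
      prompt_file="prompt/generation_2.txt",
      out_file="data/output/sample/generation_2.jsonl",
      topic_file="data/output/sample/generation_2.md",
      verbose=True,
  )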
+ + + + + + + + diff --git a/setup.py b/setup.py index 5fbc9d1..184501b 100644 --- a/setup.py +++ b/setup.py @@ -6,12 +6,12 @@ with open("README.md", "r") as fh: long_description = fh.read() long_description = long_description.replace( - "![TopicGPT Pipeline Overview](pipeline.png)", "" + "![TopicGPT Pipeline Overview](assets/img/pipeline.png)", "" ) setup( name="topicgpt_python", - version="0.2.0", + version="0.2.1", packages=find_packages(), install_requires=requirements, author="Chau Minh Pham", diff --git a/topicgpt_python/correction.py b/topicgpt_python/correction.py index 2915e31..30b605c 100644 --- a/topicgpt_python/correction.py +++ b/topicgpt_python/correction.py @@ -186,7 +186,18 @@ def correct_batch( def correct_topics( api, model, data_path, prompt_path, topic_path, output_path, verbose=False ): - """Main function to parse, correct, and save topic assignments.""" + """ + Main function to parse, correct, and save topic assignments. + + Parameters: + - api: API type (e.g., 'openai', 'vertex', 'vllm') + - model: Model name (e.g., 'gpt-4') + - data_path: Path to data file + - prompt_path: Path to prompt file + - topic_path: Path to topic file + - output_path: Path to save corrected output + - verbose: Print verbose output + """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 context_len = ( diff --git a/topicgpt_python/generation_1.py b/topicgpt_python/generation_1.py index c7794c4..4036000 100644 --- a/topicgpt_python/generation_1.py +++ b/topicgpt_python/generation_1.py @@ -153,6 +153,22 @@ def generate_topics( def generate_topic_lvl1( api, model, data, prompt_file, seed_file, out_file, topic_file, verbose ): + """ + Generate high-level topics + + Parameters: + - api (str): API to use ('openai', 'vertex', 'vllm') + - model (str): Model to use + - data (str): Data file + - prompt_file (str): File to read prompts from + - seed_file (str): Markdown file to read seed topics from + - out_file (str): File to write results to + - topic_file (str): File to write topics to + - verbose (bool): Whether to print out results + + Returns: + - topics_root (TopicTree): Root node of the topic tree + """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 diff --git a/topicgpt_python/generation_2.py b/topicgpt_python/generation_2.py index d5bb5bf..dbb64c1 100644 --- a/topicgpt_python/generation_2.py +++ b/topicgpt_python/generation_2.py @@ -250,6 +250,8 @@ def generate_topic_lvl2( - out_file: Output result file - topic_file: Output topics file - verbose: Enable verbose output + + Returns: Root node of the topic tree """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 @@ -295,6 +297,7 @@ def generate_topic_lvl2( pd.DataFrame({"text": docs, "topics": res}).to_json( out_file, orient="records", lines=True ) + return topics_root if __name__ == "__main__":