From 4b35acf5ee0cd1e99e33e564891e299d144e39d4 Mon Sep 17 00:00:00 2001
From: Chau Pham
Date: Sun, 10 Nov 2024 13:11:19 -0500
Subject: [PATCH] website release

---
 README.md                       |   2 +-
 assets/{ => img}/pipeline.png   | Bin
 assets/styles/main.css          | 217 ++++++++++
 demo.ipynb                      |   4 +-
 index.html                      | 676 ++++++++++++++++++++++++++++++++
 setup.py                        |   4 +-
 topicgpt_python/correction.py   |  13 +-
 topicgpt_python/generation_1.py |  16 +
 topicgpt_python/generation_2.py |   3 +
 9 files changed, 929 insertions(+), 6 deletions(-)
 rename assets/{ => img}/pipeline.png (100%)
 create mode 100644 assets/styles/main.css
 create mode 100644 index.html

diff --git a/README.md b/README.md
index 691bb82..8f55e36 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This repository contains scripts and prompts for our paper ["TopicGPT: Topic Mod
 - `assign_topics` assigns the generated topics to the input text, along with a quote that supports the assignment.
 - `correct_topics` corrects the generated topics by reprompting the model so that the final topic assignment is grounded in the topic list.
 
-![TopicGPT Pipeline Overview](assets/pipeline.png)
+![TopicGPT Pipeline Overview](assets/img/pipeline.png)
 
 ## 📣 Updates
 - [11/09/24] Python package `topicgpt_python` is released! You can install it via `pip install topicgpt_python`. We support OpenAI API, Vertex AI, and vLLM (requires GPUs for inference). See [PyPI](https://pypi.org/project/topicgpt-python/).

diff --git a/assets/pipeline.png b/assets/img/pipeline.png
similarity index 100%
rename from assets/pipeline.png
rename to assets/img/pipeline.png

diff --git a/assets/styles/main.css b/assets/styles/main.css
new file mode 100644
index 0000000..4c16983
--- /dev/null
+++ b/assets/styles/main.css
@@ -0,0 +1,217 @@
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');
+
+html {
+    font-style: normal;
+    font-weight: normal;
+    -ms-text-size-adjust: 100%;
+    -webkit-text-size-adjust: 100%;
+}
+
+body {
+    max-width: 60em;
+    margin: 2em auto;
+    padding: 0em 1em;
+    font-family: 'Inter', sans-serif;
+}
+
+h1 {
+    font-size: 40px;
+    color: #09194f;
+    padding-top: 15px;
+    font-weight: 700;
+}
+
+img {
+    max-width: 100%;
+    height: auto;
+}
+
+@media (max-width: 1000px) {
+    .responsive-header {
+        max-width: 100%;
+        height: auto;
+        padding-bottom: 20px;
+    }
+
+    .responsive-img {
+        max-width: 70%;
+        height: auto;
+        padding-bottom: 20px;
+    }
+
+    .responsive-icon {
+        max-width: 20%;
+        height: auto;
+    }
+}
+
+.title {
+    font-size: 20px;
+    text-align: center;
+    background-color: #f9ce00;
+    padding: 5px 10px;
+    display: inline-block;
+    font-weight: 600;
+}
+
+.link-body {
+    color: #00818a;
+    font-size: 16px;
+}
+
+.link-body:hover {
+    padding: 0 0 5px;
+    background-color: #fff48f;
+    text-decoration: none;
+}
+
+.highlighted {
+    font-weight: bold;
+}
+
+p {
+    font-size: 16px;
+    color: rgb(48, 47, 47);
+    margin: 0 0 1em 0;
+    line-height: 21px;
+    font-family: 'Inter', sans-serif;
+    font-weight: 500;
+}
+
+b {
+    color: #09194f;
+}
+
+figcaption {
+    font-size: 13px;
+    font-style: italic;
+    text-align: center;
+}
+
+.dropping {
+    font-family: 'Menlo', monospace;
+    font-size: 10px;
+    width: 100%;
+    white-space: pre-wrap;
+    word-wrap: break-word;
+    overflow-wrap: break-word;
+    text-align: left;
+}
+
+.publication_block {
+    padding-top: 10px;
+    padding-bottom: 10px;
+}
+
+.hidden {
+    display: none;
+}
+
+.unhidden {
+    display: table;
+    position: relative;
+}
+
+.texttt {
+    font-family: 'Menlo', monospace;
+}
+
+.left-column {
+    width: 40%;
+    border-right: 1px solid #ccc;
+    padding: 10px;
+}
+
+.right-column {
+    width: 60%;
+    padding: 10px;
+}
+
+.image-container {
+    display: flex;
+    padding-top: 10px;
+    max-width: 50%;
+    gap: 20px;
+    justify-content: center;
+}
+
+.nord-dark-mode {
+    background-color: #2E3440;
+    color: #D8DEE9;
+}
+
+.nord-dark-mode a {
+    color: #88C0D0;
+}
+
+.nord-dark-mode a:hover {
+    padding: 0 0 5px;
+    background-color: #fff48f;
+    color: #00818a;
+}
+
+.nord-dark-mode hr {
+    border-color: #4C566A;
+}
+
+.nord-dark-mode .container {
+    border-color: #4C566A;
+}
+
+.nord-dark-mode p {
+    color: #D8DEE9;
+}
+
+.nord-dark-mode b {
+    color: #D8DEE9;
+}
+
+.nord-dark-mode h1 {
+    color: #D8DEE9;
+}
+
+.title {
+    color: black;
+}
+
+.dark-mode-toggle {
+    position: static;
+    cursor: pointer;
+    width: 16px;
+    height: 16px;
+}
+
+.nord-dark-mode .bibtex {
+    color: #D8DEE9;
+}
+
+code {
+    font-family: 'Menlo', monospace;
+    font-size: 14px;
+    background-color: #f7f7f7;
+    padding: 5px;
+    border-radius: 5px;
+    color: #d63333;
+    white-space: pre-wrap;
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+}
+
+.code-block {
+    overflow-x: auto;
+    padding: 1em;
+    background-color: #f7f7f7;
+    border-radius: 5px;
+}
+
+pre {
+    overflow-x: auto;
+    white-space: pre-wrap;
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+    font-family: 'Menlo', monospace;
+    font-size: 14px;
+    padding: 0;
+    margin: 0;
+    color: #d63333;
+}

diff --git a/demo.ipynb b/demo.ipynb
index 0a738eb..2ecff0f 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -13,7 +13,7 @@
     "- `assignment` assigns the generated topics to the input text, along with a quote that supports the assignment.\n",
     "- `correction` corrects the generated topics by reprompting the model so that the topic assignment is grounded in the topic list. \n",
     "\n",
-    "![topicgpt_python](assets/pipeline.png)"
+    "![topicgpt_python](assets/img/pipeline.png)"
    ]
  },
  {
@@ -242,7 +242,7 @@
    "source": [
     "### Topic Refinement\n",
     "If topics are generated by a weaker model, there sometimes exist irrelevant or redundant topics. This module: \n",
-    "- Merges similar topics using Cosine Similarity scores. \n",
+    "- Merges similar topics.\n",
     "- Removes overly specific or redundant topics that occur < 1% of the time (you can skip this by setting `remove` to False in `config.yml`).\n",
     "- Expect the refined topics in `data/output/{data_name}/refinement_1.md` and `data/output/{data_name}/refinement_1.jsonl`. If nothing happens, it means that the topic list is coherent.\n",
     "- If you're unsatisfied with the refined topics, call the function again and set `refined_again` to True in the function call. "

diff --git a/index.html b/index.html
new file mode 100644
index 0000000..033896f
--- /dev/null
+++ b/index.html
@@ -0,0 +1,676 @@
+TopicGPT (NAACL'24)
+
+TopicGPT: A Prompt-based Topic Modeling Framework
+
+Chau Minh Pham🔍, Alexander Hoyle🔦, Simeng Sun🔍,
+Philip Resnik🔦, Mohit Iyyer🔍
+
+🔍University of Massachusetts Amherst
+🔦University of Maryland College Park
+
+[Paper] [Code]
+
+TLDR;
+
+1. We introduce TopicGPT, a prompt-based framework that uses large
+language models (LLMs) to uncover latent topics in a text collection.
+TopicGPT generates interpretable topics, dispensing with ambiguous
+bags of words in favor of topics with natural language labels and
+associated free-form descriptions.
+
+2. TopicGPT works in three main stages:
+  • Generation: It generates high-level topics using a prompt-based approach.
+  • Refinement: It refines the topics by merging similar ones and removing outliers.
+  • Assignment: It assigns topics to documents with supporting quotes.
+
+3. TopicGPT produces topics that align better with human
+categorizations than competing methods do: it achieves a harmonic
+mean purity of 0.74 against human-annotated Wikipedia topics, compared
+to 0.64 for the strongest baseline.
+
+Data Preparation
+
+1. Prepare your .jsonl data file in the following format (a short
+script for producing such a file appears after these steps):
+
+  {
+      "id": "IDs (optional)",
+      "text": "Documents",
+      "label": "Ground-truth labels (optional)"
+  }
+
+2. Place your data file in the data/input directory. A
+sample data file, data/input/sample.jsonl, is provided to
+help you debug the code.
+
+3. You can download the raw datasets used in the paper (Bills and
+Wiki) from the following link: Dataset Link.
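+
+As an illustration, here is a minimal sketch that writes documents in
+the format above to data/input/sample.jsonl; the example texts and
+labels are invented for demonstration:
+
+  import json
+
+  # Hypothetical documents; "id" and "label" are optional fields.
+  docs = [
+      {"id": "0", "text": "A bill to expand rural broadband access.", "label": "Technology"},
+      {"id": "1", "text": "A bill to adjust crop insurance subsidies.", "label": "Agriculture"},
+  ]
+
+  # One JSON object per line, as expected by the pipeline.
+  with open("data/input/sample.jsonl", "w") as f:
+      for doc in docs:
+          f.write(json.dumps(doc) + "\n")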

+Setting up
+
+Check out demo.ipynb for a complete pipeline and more
+detailed instructions. We advise trying a subset with more affordable
+(or open-source) models before scaling to the full dataset.
+
+Metric calculation functions are available in
+topicgpt_python.metrics to evaluate topic alignment
+with ground-truth labels (Adjusted Rand Index, Harmonic Purity,
+Normalized Mutual Information); a sketch of what these metrics
+compute follows the steps below.
+
+Our package supports OpenAI API, Google Cloud Vertex AI API, and vLLM
+inference. vLLM requires GPUs to run. Please refer to
+OpenAI API pricing or Vertex AI pricing for cost details.
+
+1. Make a new Python 3.9+ environment using virtualenv or conda.
+
+2. Install the required packages:
+
+  pip install --upgrade topicgpt_python
+
+3. Set environment variables:
+
+  export OPENAI_API_KEY={your_openai_api_key}
+  export VERTEX_PROJECT={your_vertex_project}
+  export VERTEX_LOCATION={your_vertex_location}
+
+4. Define I/O paths in config.yml.
+
+5. Run the following code snippet to load the configuration file:
+
+  from topicgpt_python import *
+  import yaml
+
+  with open("config.yml", "r") as f:
+      config = yaml.safe_load(f)
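+
+The exact call signatures in topicgpt_python.metrics are documented in
+the package itself; purely as an illustration of what these metrics
+compute, the sketch below reproduces them with scikit-learn and a
+hand-rolled purity function over two invented label lists:
+
+  from collections import Counter
+  from statistics import harmonic_mean
+  from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
+
+  # Hypothetical ground-truth labels and predicted topic assignments.
+  truth = ["Health", "Health", "Trade", "Trade", "Education"]
+  pred = ["Medicine", "Medicine", "Commerce", "Education", "Education"]
+
+  def purity(gold, assigned):
+      # Majority gold label per predicted cluster, summed and normalized.
+      clusters = {}
+      for g, a in zip(gold, assigned):
+          clusters.setdefault(a, []).append(g)
+      return sum(Counter(v).most_common(1)[0][1] for v in clusters.values()) / len(gold)
+
+  print("ARI:", adjusted_rand_score(truth, pred))
+  print("NMI:", normalized_mutual_info_score(truth, pred))
+  # Harmonic purity: harmonic mean of purity and inverse purity.
+  print("Harmonic purity:", harmonic_mean([purity(truth, pred), purity(pred, truth)]))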
+Generating Topics
+
+Function: generate_topic_lvl1
+Generate high-level topics.
+
+  generate_topic_lvl1(
+    api, model, data, prompt_file, seed_file, out_file, topic_file, verbose
+  )
+
+Function: generate_topic_lvl2
+Generate subtopics for each top-level topic.
+
+  generate_topic_lvl2(
+    api, model, seed_file, data, prompt_file, out_file, topic_file, verbose
+  )
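+
+For example, a level-1 generation call might look like the following.
+The argument values are hypothetical; substitute the paths defined in
+your config.yml:
+
+  from topicgpt_python import generate_topic_lvl1
+
+  # Hypothetical paths for illustration only.
+  generate_topic_lvl1(
+      api="openai",
+      model="gpt-4",
+      data="data/input/sample.jsonl",
+      prompt_file="prompt/generation_1.txt",
+      seed_file="prompt/seed_1.md",
+      out_file="data/output/sample/generation_1.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      verbose=True,
+  )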
+Refining Topics
+
+If topics are generated by a weaker model, the list sometimes contains
+irrelevant or redundant topics. This module:
+  • Merges similar topics.
+  • Removes overly specific or redundant topics.
+
+Function: refine_topics
+Refine topics by merging and updating them based on the API response.
+
+  refine_topics(
+    api, model, prompt_file, generation_file, topic_file, out_file,
+    updated_file, verbose, remove, mapping_file, refined_again
+  )
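+
+A refinement call might look like this; the paths are hypothetical,
+and the remove and refined_again flags mirror the options described in
+demo.ipynb:
+
+  from topicgpt_python import refine_topics
+
+  # Hypothetical paths for illustration only.
+  refine_topics(
+      api="openai",
+      model="gpt-4",
+      prompt_file="prompt/refinement.txt",
+      generation_file="data/output/sample/generation_1.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      out_file="data/output/sample/refinement_1.md",
+      updated_file="data/output/sample/refinement_1.jsonl",
+      verbose=True,
+      remove=False,          # keep rare topics instead of dropping them
+      mapping_file="data/output/sample/mapping.json",
+      refined_again=False,   # set True to refine the refined list again
+  )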
+Assigning Topics
+
+Function: assign_topics
+Assign topics to a list of documents.
+
+  assign_topics(
+    api, model, data, prompt_file, out_file, topic_file, verbose
+  )
+
+Function: correct_topics
+Main function to parse, correct, and save topic assignments.
+
+  correct_topics(
+    api, model, data_path, prompt_path, topic_path, output_path, verbose
+  )
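+
+Assignment and correction chain naturally: assign first, then reprompt
+to ground any stray assignments in the topic list. The paths below are
+hypothetical:
+
+  from topicgpt_python import assign_topics, correct_topics
+
+  # Hypothetical paths for illustration only.
+  assign_topics(
+      api="openai",
+      model="gpt-4",
+      data="data/input/sample.jsonl",
+      prompt_file="prompt/assignment.txt",
+      out_file="data/output/sample/assignment.jsonl",
+      topic_file="data/output/sample/generation_1.md",
+      verbose=True,
+  )
+  correct_topics(
+      api="openai",
+      model="gpt-4",
+      data_path="data/output/sample/assignment.jsonl",
+      prompt_path="prompt/correction.txt",
+      topic_path="data/output/sample/generation_1.md",
+      output_path="data/output/sample/assignment_corrected.jsonl",
+      verbose=True,
+  )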
+Citation
+
+  @misc{pham2024topicgptpromptbasedtopicmodeling,
+    title={TopicGPT: A Prompt-based Topic Modeling Framework},
+    author={Chau Minh Pham and Alexander Hoyle and Simeng Sun and Philip Resnik and Mohit Iyyer},
+    year={2024},
+    eprint={2311.01449},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2311.01449},
+  }

diff --git a/setup.py b/setup.py
index 5fbc9d1..184501b 100644
--- a/setup.py
+++ b/setup.py
@@ -6,12 +6,12 @@ with open("README.md", "r") as fh:
     long_description = fh.read()
 
 long_description = long_description.replace(
-    "![TopicGPT Pipeline Overview](pipeline.png)", ""
+    "![TopicGPT Pipeline Overview](assets/img/pipeline.png)", ""
 )
 
 setup(
     name="topicgpt_python",
-    version="0.2.0",
+    version="0.2.1",
     packages=find_packages(),
     install_requires=requirements,
     author="Chau Minh Pham",

diff --git a/topicgpt_python/correction.py b/topicgpt_python/correction.py
index 2915e31..30b605c 100644
--- a/topicgpt_python/correction.py
+++ b/topicgpt_python/correction.py
@@ -186,7 +186,18 @@ def correct_batch(
 def correct_topics(
     api, model, data_path, prompt_path, topic_path, output_path, verbose=False
 ):
-    """Main function to parse, correct, and save topic assignments."""
+    """
+    Main function to parse, correct, and save topic assignments.
+
+    Parameters:
+    - api: API type (e.g., 'openai', 'vertex', 'vllm')
+    - model: Model name (e.g., 'gpt-4')
+    - data_path: Path to data file
+    - prompt_path: Path to prompt file
+    - topic_path: Path to topic file
+    - output_path: Path to save corrected output
+    - verbose: Whether to print verbose output
+    """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0
     context_len = (

diff --git a/topicgpt_python/generation_1.py b/topicgpt_python/generation_1.py
index c7794c4..4036000 100644
--- a/topicgpt_python/generation_1.py
+++ b/topicgpt_python/generation_1.py
@@ -153,6 +153,22 @@ def generate_topics(
 def generate_topic_lvl1(
     api, model, data, prompt_file, seed_file, out_file, topic_file, verbose
 ):
+    """
+    Generate high-level topics.
+
+    Parameters:
+    - api (str): API to use ('openai', 'vertex', 'vllm')
+    - model (str): Model to use
+    - data (str): Data file
+    - prompt_file (str): File to read prompts from
+    - seed_file (str): Markdown file to read seed topics from
+    - out_file (str): File to write results to
+    - topic_file (str): File to write topics to
+    - verbose (bool): Whether to print out results
+
+    Returns:
+    - topics_root (TopicTree): Root node of the topic tree
+    """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0

diff --git a/topicgpt_python/generation_2.py b/topicgpt_python/generation_2.py
index d5bb5bf..dbb64c1 100644
--- a/topicgpt_python/generation_2.py
+++ b/topicgpt_python/generation_2.py
@@ -250,6 +250,8 @@ def generate_topic_lvl2(
     - out_file: Output result file
     - topic_file: Output topics file
     - verbose: Enable verbose output
+
+    Returns: Root node of the topic tree
     """
     api_client = APIClient(api=api, model=model)
     max_tokens, temperature, top_p = 1000, 0.0, 1.0
@@ -295,6 +297,7 @@ def generate_topic_lvl2(
     pd.DataFrame({"text": docs, "topics": res}).to_json(
         out_file, orient="records", lines=True
     )
+    return topics_root
 
 
 if __name__ == "__main__":
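
With generate_topic_lvl2 now returning the topic tree, callers can capture
it directly. A hypothetical call, with invented paths and the argument
order shown on the project page above:

  # Hypothetical paths for illustration only.
  topics_root = generate_topic_lvl2(
      api="openai",
      model="gpt-4",
      seed_file="data/output/sample/generation_1.md",
      data="data/output/sample/generation_1.jsonl",
      prompt_file="prompt/generation_2.txt",
      out_file="data/output/sample/generation_2.jsonl",
      topic_file="data/output/sample/generation_2.md",
      verbose=True,
  )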
+ + + + + + + + diff --git a/setup.py b/setup.py index 5fbc9d1..184501b 100644 --- a/setup.py +++ b/setup.py @@ -6,12 +6,12 @@ with open("README.md", "r") as fh: long_description = fh.read() long_description = long_description.replace( - "![TopicGPT Pipeline Overview](pipeline.png)", "" + "![TopicGPT Pipeline Overview](assets/img/pipeline.png)", "" ) setup( name="topicgpt_python", - version="0.2.0", + version="0.2.1", packages=find_packages(), install_requires=requirements, author="Chau Minh Pham", diff --git a/topicgpt_python/correction.py b/topicgpt_python/correction.py index 2915e31..30b605c 100644 --- a/topicgpt_python/correction.py +++ b/topicgpt_python/correction.py @@ -186,7 +186,18 @@ def correct_batch( def correct_topics( api, model, data_path, prompt_path, topic_path, output_path, verbose=False ): - """Main function to parse, correct, and save topic assignments.""" + """ + Main function to parse, correct, and save topic assignments. + + Parameters: + - api: API type (e.g., 'openai', 'vertex', 'vllm') + - model: Model name (e.g., 'gpt-4') + - data_path: Path to data file + - prompt_path: Path to prompt file + - topic_path: Path to topic file + - output_path: Path to save corrected output + - verbose: Print verbose output + """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 context_len = ( diff --git a/topicgpt_python/generation_1.py b/topicgpt_python/generation_1.py index c7794c4..4036000 100644 --- a/topicgpt_python/generation_1.py +++ b/topicgpt_python/generation_1.py @@ -153,6 +153,22 @@ def generate_topics( def generate_topic_lvl1( api, model, data, prompt_file, seed_file, out_file, topic_file, verbose ): + """ + Generate high-level topics + + Parameters: + - api (str): API to use ('openai', 'vertex', 'vllm') + - model (str): Model to use + - data (str): Data file + - prompt_file (str): File to read prompts from + - seed_file (str): Markdown file to read seed topics from + - out_file (str): File to write results to + - topic_file (str): File to write topics to + - verbose (bool): Whether to print out results + + Returns: + - topics_root (TopicTree): Root node of the topic tree + """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 diff --git a/topicgpt_python/generation_2.py b/topicgpt_python/generation_2.py index d5bb5bf..dbb64c1 100644 --- a/topicgpt_python/generation_2.py +++ b/topicgpt_python/generation_2.py @@ -250,6 +250,8 @@ def generate_topic_lvl2( - out_file: Output result file - topic_file: Output topics file - verbose: Enable verbose output + + Returns: Root node of the topic tree """ api_client = APIClient(api=api, model=model) max_tokens, temperature, top_p = 1000, 0.0, 1.0 @@ -295,6 +297,7 @@ def generate_topic_lvl2( pd.DataFrame({"text": docs, "topics": res}).to_json( out_file, orient="records", lines=True ) + return topics_root if __name__ == "__main__":