From e813d2ef342406cfb1d55efa9e42cd6ff7f532eb Mon Sep 17 00:00:00 2001
From: Remy Liu <36778645+RemyLau@users.noreply.github.com>
Date: Tue, 11 Jul 2023 11:54:22 -0400
Subject: [PATCH] Created using Colaboratory

---
 tutorials/basic_tutorial.ipynb | 1932 ++++++++------------------------
 1 file changed, 481 insertions(+), 1451 deletions(-)
diff --git a/tutorials/basic_tutorial.ipynb b/tutorials/basic_tutorial.ipynb
index 2e623df3..025fc4c8 100644
--- a/tutorials/basic_tutorial.ipynb
+++ b/tutorials/basic_tutorial.ipynb
@@ -4,7 +4,8 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyO107Ua39a7xOmb/P+xC24j",
+      "toc_visible": true,
+      "authorship_tag": "ABX9TyM/72QVmPpoW9JPrZYT0/P3",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -26,39 +27,57 @@
         "<a href=\"https://colab.research.google.com/github/krishnanlab/obnb/blob/tutorial/tutorials/basic_tutorial.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Basic Tutorial for the Open Biomedical Network Benchmark package"
+      ],
+      "metadata": {
+        "id": "Ba_AaNS7Stg8"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## 1. Installation\n",
+        "\n",
+        "Installation can be easily done via `pip`.\n",
+        "\n",
+        "via PyPI (released or pre-release versions)\n",
+        "```bash\n",
+        "pip install obnb\n",
+        "```\n",
+        "\n",
+        "or via GitHub (latest dev version)\n",
+        "```bash\n",
+        "pip install git+https://github.com/krishnanlab/obnb\n",
+        "```"
+      ],
+      "metadata": {
+        "id": "pv7SYyrlTKl4"
+      }
+    },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "fypP0bMZ-Wsu",
-        "outputId": "f4bf8c41-0632-42dc-b2bc-7035757928ac"
+        "id": "fypP0bMZ-Wsu"
       },
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
-            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
-            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.0/59.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.7/112.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
-            "  Building wheel for obnb (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
-            "  Building wheel for littleutils (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
-          ]
-        }
-      ],
+      "outputs": [],
       "source": [
+        "# Install latest dev version of OBNB\n",
         "!pip install -q git+https://github.com/krishnanlab/obnb"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Check if the package is installed successfully"
+      ],
+      "metadata": {
+        "id": "WefaXPkqUS6e"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
@@ -66,22 +85,47 @@
         "print(f\"Installed obnb {obnb.__version__}\")"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "XJbywsXU-6zH",
-        "outputId": "3e561b06-0f81-4545-887f-ec71f7165cc8"
+        "id": "XJbywsXU-6zH"
       },
-      "execution_count": 2,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Installed obnb 0.1.1-dev\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import obnb.ext.pecanpy\n",
+        "print(f\"Extension for PecanPy installed: {obnb.ext.pecanpy}\")"
+      ],
+      "metadata": {
+        "id": "_ZYMxfgfUZFe"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## 2. Data downloading and processing"
+      ],
+      "metadata": {
+        "id": "oZsfNaHqVaQu"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "First, load the `obnb.data` module that contains \"recipes\" for processing\n",
+        "different selections of biological networks and gene annotation data.\n",
+        "\n",
+        "We also need to specify (1) the path to which the data will be saved, and more\n",
+        "importantly, (2) the **version** of the data we want to retrieve. The version\n",
+        "option allows for flexible data retrieval (either retrieve data from source, or\n",
+        "retrieve from processed data archive) and also enables reproduction of the\n",
+        "downstream analysis."
+      ],
+      "metadata": {
+        "id": "bDx-hDiTVsM4"
+      }
     },
     {
       "cell_type": "code",
@@ -89,1507 +133,465 @@
         "import obnb.data\n",
         "import yaml\n",
         "\n",
+        "# Where do we want to save the data and related files to\n",
         "root = \"datasets\"\n",
+        "\n",
+        "# What version of the pre-processed data to download\n",
         "data_version = \"obnbdata-0.1.0\"\n",
-        "lsc = obnb.data.DisGeNET(root, version=data_version)"
+        "# data_version = \"latest\"  # download data from source and process from scratch\n",
+        "# data_version = \"current\"  # use the latest archived data version"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "s1apiAau_GDY",
-        "outputId": "95ac9e0b-52ab-42db-fc1b-dba9f7691827"
+        "id": "s1apiAau_GDY"
       },
-      "execution_count": 3,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "[INFO][2023-07-10 23:25:35,227][base][download_archive] Loading DisGeNET (version='obnbdata-0.1.0')...\n",
-            "[INFO][2023-07-10 23:25:35,229][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/DisGeNET.zip\n",
-            "[INFO][2023-07-10 23:25:35,232][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/DisGeNET.zip\n",
-            "100%|██████████| 219k/219k [00:00<00:00, 687kB/s]\n",
-            "[INFO][2023-07-10 23:25:36,852][download][download_unzip] Download completed, start unpacking...\n",
-            "[INFO][2023-07-10 23:25:36,864][download][download_unzip] Done extracting\n",
-            "[INFO][2023-07-10 23:25:36,869][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/.cache.zip\n",
-            "[INFO][2023-07-10 23:25:36,871][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/.cache.zip\n",
-            "100%|██████████| 24.7M/24.7M [00:01<00:00, 13.0MB/s]\n",
-            "[INFO][2023-07-10 23:25:39,817][download][download_unzip] Download completed, start unpacking...\n",
-            "[INFO][2023-07-10 23:25:41,874][download][download_unzip] Done extracting\n",
-            "[INFO][2023-07-10 23:25:41,902][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 2.1. Biological networks"
+      ],
+      "metadata": {
+        "id": "8YF_zoqBWOzV"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Let's start with an example of obtaining the `BioPlex` network, which is a\n",
+        "protein-protein interaction (PPI) network that is constructed via AP-MS on\n",
+        "human cell-lines ([HEK293T](https://www.synthego.com/hek293) and\n",
+        "[HCT116](https://imanislife.com/collections/cell-lines/hct116-cells/)).\n",
+        "Check out other available options for processed biomedical networks on the OBNB\n",
+        "benchmark\n",
+        "[README](https://github.com/krishnanlab/obnbench#data-stats-obnbdata-010-) page.\n",
+        "\n",
+        "[1] Huttlin, Edward L., et al. \"The BioPlex network: a systematic exploration of the human interactome.\" Cell 162.2 (2015): 425-440.\n",
+        "\n",
+        "[2] Huttlin, Edward L., et al. \"Dual proteome-scale networks reveal cell-specific remodeling of the human interactome.\" Cell 184.11 (2021): 3022-3040."
+      ],
+      "metadata": {
+        "id": "7idT6WBxXR29"
+      }
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "# Download network from archive\n",
+        "g = obnb.data.BioPlex(root, version=data_version)"
+      ],
       "metadata": {
-        "id": "dHZSPRK0_xYz"
+        "id": "-Wsdv0VmWVfr"
       },
-      "execution_count": 3,
+      "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "print(yaml.dump(lsc.to_config()))"
+        "# Once downloaded, it can be used in future access without redownloading\n",
+        "g = obnb.data.BioPlex(root, version=data_version)"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "UEZQ-LJ5_vJO",
-        "outputId": "a88f919b-7eab-4f94-cbfb-ba91a283ac5e"
+        "id": "ovT8pvzbWVdR"
       },
-      "execution_count": 4,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "data_module: DisGeNET\n",
-            "data_module_params:\n",
-            "  gene_id_converter: HumanEntrez\n",
-            "  pre_transform:\n",
-            "  - LabelsetRangeFilterSize:\n",
-            "      max_val: '600'\n",
-            "      min_val: None\n",
-            "  - LabelsetNonRedFilter:\n",
-            "      thresholds: (0.5, 0.7)\n",
-            "  - LabelsetRangeFilterSize:\n",
-            "      max_val: None\n",
-            "      min_val: '10'\n",
-            "  version: obnbdata-0.1.0\n",
-            "package_version: 0.1.1-dev\n",
-            "processed_time: '2023-07-10 23:15:57'\n",
-            "\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "lsc.to_df()"
+        "# You can also force redownloading the data by specifying redownload=True\n",
+        "g = obnb.data.BioPlex(root, version=data_version, redownload=True)"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 748
-        },
-        "id": "bxYbfhya_GAj",
-        "outputId": "c3f222d2-c853-4023-901c-9726ac62bed4"
+        "id": "KDFC5JnyWVOb"
       },
-      "execution_count": 5,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "               Name                                        Info  Size       0  \\\n",
-              "0     MONDO:0000004                adrenocortical insufficiency    44   23530   \n",
-              "1     MONDO:0021034                            genetic alopecia    76  340419   \n",
-              "2     MONDO:0000009  inherited bleeding disorder, platelet-type    37    5328   \n",
-              "3     MONDO:0002243                         hemorrhagic disease   135    7450   \n",
-              "4     MONDO:0002245                      blood platelet disease   329    5698   \n",
-              "...             ...                                         ...   ...     ...   \n",
-              "1035  MONDO:0044976      obsolete disease of catalytic activity    11    2936   \n",
-              "1036  MONDO:0100130   adult acute respiratory distress syndrome    55    6347   \n",
-              "1037  MONDO:0100431                       migraine without aura    19     796   \n",
-              "1038  MONDO:0100459                                 azoospermia    95    5889   \n",
-              "1039  MONDO:0100471                        vitamin D deficiency    13    6197   \n",
-              "\n",
-              "           1       2       3          4       5       6  ...   584   585  \\\n",
-              "0       2737   55699    3284       1585    1589   50940  ...  None  None   \n",
-              "1       5894    6635   92344      10913    4289   22808  ...  None  None   \n",
-              "2       2533  342618    6916       2531    6915   80739  ...  None  None   \n",
-              "3     342618    4618    6916       2531     421  196527  ...  None  None   \n",
-              "4       7706   55135    2475     342618   79053  374569  ...  None  None   \n",
-              "...      ...     ...     ...        ...     ...     ...  ...   ...   ...   \n",
-              "1035    2539    2729    2937        226    2023    3098  ...  None  None   \n",
-              "1036    1906  407055  442911       5685  406953     210  ...  None  None   \n",
-              "1037   79783    1909    4209  101929660   79054    1740  ...  None  None   \n",
-              "1038    3077    4952    9085       2488   84464    6660  ...  None  None   \n",
-              "1039    3508   10939    9772       4036   84617    7421  ...  None  None   \n",
-              "\n",
-              "       586   587   588   589   590   591   592   593  \n",
-              "0     None  None  None  None  None  None  None  None  \n",
-              "1     None  None  None  None  None  None  None  None  \n",
-              "2     None  None  None  None  None  None  None  None  \n",
-              "3     None  None  None  None  None  None  None  None  \n",
-              "4     None  None  None  None  None  None  None  None  \n",
-              "...    ...   ...   ...   ...   ...   ...   ...   ...  \n",
-              "1035  None  None  None  None  None  None  None  None  \n",
-              "1036  None  None  None  None  None  None  None  None  \n",
-              "1037  None  None  None  None  None  None  None  None  \n",
-              "1038  None  None  None  None  None  None  None  None  \n",
-              "1039  None  None  None  None  None  None  None  None  \n",
-              "\n",
-              "[1040 rows x 597 columns]"
-            ],
-            "text/html": [
-              "\n",
-              "\n",
-              "  <div id=\"df-4cd2dadf-1e49-4c47-acf1-d646d70cdea8\">\n",
-              "    <div class=\"colab-df-container\">\n",
-              "      <div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>Name</th>\n",
-              "      <th>Info</th>\n",
-              "      <th>Size</th>\n",
-              "      <th>0</th>\n",
-              "      <th>1</th>\n",
-              "      <th>2</th>\n",
-              "      <th>3</th>\n",
-              "      <th>4</th>\n",
-              "      <th>5</th>\n",
-              "      <th>6</th>\n",
-              "      <th>...</th>\n",
-              "      <th>584</th>\n",
-              "      <th>585</th>\n",
-              "      <th>586</th>\n",
-              "      <th>587</th>\n",
-              "      <th>588</th>\n",
-              "      <th>589</th>\n",
-              "      <th>590</th>\n",
-              "      <th>591</th>\n",
-              "      <th>592</th>\n",
-              "      <th>593</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>0</th>\n",
-              "      <td>MONDO:0000004</td>\n",
-              "      <td>adrenocortical insufficiency</td>\n",
-              "      <td>44</td>\n",
-              "      <td>23530</td>\n",
-              "      <td>2737</td>\n",
-              "      <td>55699</td>\n",
-              "      <td>3284</td>\n",
-              "      <td>1585</td>\n",
-              "      <td>1589</td>\n",
-              "      <td>50940</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1</th>\n",
-              "      <td>MONDO:0021034</td>\n",
-              "      <td>genetic alopecia</td>\n",
-              "      <td>76</td>\n",
-              "      <td>340419</td>\n",
-              "      <td>5894</td>\n",
-              "      <td>6635</td>\n",
-              "      <td>92344</td>\n",
-              "      <td>10913</td>\n",
-              "      <td>4289</td>\n",
-              "      <td>22808</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>2</th>\n",
-              "      <td>MONDO:0000009</td>\n",
-              "      <td>inherited bleeding disorder, platelet-type</td>\n",
-              "      <td>37</td>\n",
-              "      <td>5328</td>\n",
-              "      <td>2533</td>\n",
-              "      <td>342618</td>\n",
-              "      <td>6916</td>\n",
-              "      <td>2531</td>\n",
-              "      <td>6915</td>\n",
-              "      <td>80739</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>3</th>\n",
-              "      <td>MONDO:0002243</td>\n",
-              "      <td>hemorrhagic disease</td>\n",
-              "      <td>135</td>\n",
-              "      <td>7450</td>\n",
-              "      <td>342618</td>\n",
-              "      <td>4618</td>\n",
-              "      <td>6916</td>\n",
-              "      <td>2531</td>\n",
-              "      <td>421</td>\n",
-              "      <td>196527</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>4</th>\n",
-              "      <td>MONDO:0002245</td>\n",
-              "      <td>blood platelet disease</td>\n",
-              "      <td>329</td>\n",
-              "      <td>5698</td>\n",
-              "      <td>7706</td>\n",
-              "      <td>55135</td>\n",
-              "      <td>2475</td>\n",
-              "      <td>342618</td>\n",
-              "      <td>79053</td>\n",
-              "      <td>374569</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>...</th>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1035</th>\n",
-              "      <td>MONDO:0044976</td>\n",
-              "      <td>obsolete disease of catalytic activity</td>\n",
-              "      <td>11</td>\n",
-              "      <td>2936</td>\n",
-              "      <td>2539</td>\n",
-              "      <td>2729</td>\n",
-              "      <td>2937</td>\n",
-              "      <td>226</td>\n",
-              "      <td>2023</td>\n",
-              "      <td>3098</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1036</th>\n",
-              "      <td>MONDO:0100130</td>\n",
-              "      <td>adult acute respiratory distress syndrome</td>\n",
-              "      <td>55</td>\n",
-              "      <td>6347</td>\n",
-              "      <td>1906</td>\n",
-              "      <td>407055</td>\n",
-              "      <td>442911</td>\n",
-              "      <td>5685</td>\n",
-              "      <td>406953</td>\n",
-              "      <td>210</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1037</th>\n",
-              "      <td>MONDO:0100431</td>\n",
-              "      <td>migraine without aura</td>\n",
-              "      <td>19</td>\n",
-              "      <td>796</td>\n",
-              "      <td>79783</td>\n",
-              "      <td>1909</td>\n",
-              "      <td>4209</td>\n",
-              "      <td>101929660</td>\n",
-              "      <td>79054</td>\n",
-              "      <td>1740</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1038</th>\n",
-              "      <td>MONDO:0100459</td>\n",
-              "      <td>azoospermia</td>\n",
-              "      <td>95</td>\n",
-              "      <td>5889</td>\n",
-              "      <td>3077</td>\n",
-              "      <td>4952</td>\n",
-              "      <td>9085</td>\n",
-              "      <td>2488</td>\n",
-              "      <td>84464</td>\n",
-              "      <td>6660</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>1039</th>\n",
-              "      <td>MONDO:0100471</td>\n",
-              "      <td>vitamin D deficiency</td>\n",
-              "      <td>13</td>\n",
-              "      <td>6197</td>\n",
-              "      <td>3508</td>\n",
-              "      <td>10939</td>\n",
-              "      <td>9772</td>\n",
-              "      <td>4036</td>\n",
-              "      <td>84617</td>\n",
-              "      <td>7421</td>\n",
-              "      <td>...</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "      <td>None</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "<p>1040 rows × 597 columns</p>\n",
-              "</div>\n",
-              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4cd2dadf-1e49-4c47-acf1-d646d70cdea8')\"\n",
-              "              title=\"Convert this dataframe to an interactive table.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "       width=\"24px\">\n",
-              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
-              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
-              "  </svg>\n",
-              "      </button>\n",
-              "\n",
-              "\n",
-              "\n",
-              "    <div id=\"df-f0cde8ea-c4f7-445b-bd84-fa771ea1baf1\">\n",
-              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-f0cde8ea-c4f7-445b-bd84-fa771ea1baf1')\"\n",
-              "              title=\"Suggest charts.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "     width=\"24px\">\n",
-              "    <g>\n",
-              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
-              "    </g>\n",
-              "</svg>\n",
-              "      </button>\n",
-              "    </div>\n",
-              "\n",
-              "<style>\n",
-              "  .colab-df-quickchart {\n",
-              "    background-color: #E8F0FE;\n",
-              "    border: none;\n",
-              "    border-radius: 50%;\n",
-              "    cursor: pointer;\n",
-              "    display: none;\n",
-              "    fill: #1967D2;\n",
-              "    height: 32px;\n",
-              "    padding: 0 0 0 0;\n",
-              "    width: 32px;\n",
-              "  }\n",
-              "\n",
-              "  .colab-df-quickchart:hover {\n",
-              "    background-color: #E2EBFA;\n",
-              "    box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "    fill: #174EA6;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart {\n",
-              "    background-color: #3B4455;\n",
-              "    fill: #D2E3FC;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart:hover {\n",
-              "    background-color: #434B5C;\n",
-              "    box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "    filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "    fill: #FFFFFF;\n",
-              "  }\n",
-              "</style>\n",
-              "\n",
-              "    <script>\n",
-              "      async function quickchart(key) {\n",
-              "        const containerElement = document.querySelector('#' + key);\n",
-              "        const charts = await google.colab.kernel.invokeFunction(\n",
-              "            'suggestCharts', [key], {});\n",
-              "      }\n",
-              "    </script>\n",
-              "\n",
-              "      <script>\n",
-              "\n",
-              "function displayQuickchartButton(domScope) {\n",
-              "  let quickchartButtonEl =\n",
-              "    domScope.querySelector('#df-f0cde8ea-c4f7-445b-bd84-fa771ea1baf1 button.colab-df-quickchart');\n",
-              "  quickchartButtonEl.style.display =\n",
-              "    google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "}\n",
-              "\n",
-              "        displayQuickchartButton(document);\n",
-              "      </script>\n",
-              "      <style>\n",
-              "    .colab-df-container {\n",
-              "      display:flex;\n",
-              "      flex-wrap:wrap;\n",
-              "      gap: 12px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert {\n",
-              "      background-color: #E8F0FE;\n",
-              "      border: none;\n",
-              "      border-radius: 50%;\n",
-              "      cursor: pointer;\n",
-              "      display: none;\n",
-              "      fill: #1967D2;\n",
-              "      height: 32px;\n",
-              "      padding: 0 0 0 0;\n",
-              "      width: 32px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert:hover {\n",
-              "      background-color: #E2EBFA;\n",
-              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "      fill: #174EA6;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert {\n",
-              "      background-color: #3B4455;\n",
-              "      fill: #D2E3FC;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert:hover {\n",
-              "      background-color: #434B5C;\n",
-              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "      fill: #FFFFFF;\n",
-              "    }\n",
-              "  </style>\n",
-              "\n",
-              "      <script>\n",
-              "        const buttonEl =\n",
-              "          document.querySelector('#df-4cd2dadf-1e49-4c47-acf1-d646d70cdea8 button.colab-df-convert');\n",
-              "        buttonEl.style.display =\n",
-              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "\n",
-              "        async function convertToInteractive(key) {\n",
-              "          const element = document.querySelector('#df-4cd2dadf-1e49-4c47-acf1-d646d70cdea8');\n",
-              "          const dataTable =\n",
-              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
-              "                                                     [key], {});\n",
-              "          if (!dataTable) return;\n",
-              "\n",
-              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
-              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
-              "            + ' to learn more about interactive tables.';\n",
-              "          element.innerHTML = '';\n",
-              "          dataTable['output_type'] = 'display_data';\n",
-              "          await google.colab.output.renderOutput(dataTable, element);\n",
-              "          const docLink = document.createElement('div');\n",
-              "          docLink.innerHTML = docLinkHtml;\n",
-              "          element.appendChild(docLink);\n",
-              "        }\n",
-              "      </script>\n",
-              "    </div>\n",
-              "  </div>\n"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 5
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "You can also check out more information about the processing done for this\n",
+        "network by looking into the config."
+      ],
+      "metadata": {
+        "id": "z8qGrmgTbRzW"
+      }
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "print(yaml.dump(g.to_config()))"
+      ],
       "metadata": {
-        "id": "nnH4lXpm_F9g"
+        "id": "4B7NKw55bQ-k"
       },
-      "execution_count": 10,
+      "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The gene IDs in the network can be accessed via the `node_ids` attribute, which\n",
+        "are [Entrez](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1761442/) gene IDs by\n",
+        "default."
+      ],
+      "metadata": {
+        "id": "5G5tGtPSb_Ob"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
-        "g = obnb.data.BioGRID(root, version=data_version)"
+        "print(f\"The first gene in the network is {g.node_ids[0]!r}\")\n",
+        "print(f\"The second gene in the network is {g.node_ids[1]!r}\")\n",
+        "print(f\"The third gene in the network is {g.node_ids[2]!r}\")"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "ui07Xv9tBh3E",
-        "outputId": "9f425f89-bc48-4215-962c-bb849eee1100"
+        "id": "gQrZ6besb5E_"
       },
-      "execution_count": 6,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "[INFO][2023-07-10 23:16:03,767][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n",
-            "[INFO][2023-07-10 23:16:03,770][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n",
-            "[INFO][2023-07-10 23:16:03,773][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n",
-            "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.3MB/s]\n",
-            "[INFO][2023-07-10 23:16:07,863][download][download_unzip] Download completed, start unpacking...\n",
-            "[INFO][2023-07-10 23:16:10,818][download][download_unzip] Done extracting\n",
-            "[INFO][2023-07-10 23:16:10,825][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The graph object `g` is an instance of the `obnb.graph.SparseGraph` class.\n",
+        "It can easily be converted into a dense adjacency matrix via the `to_adjmat` method."
+      ],
+      "metadata": {
+        "id": "yRbbhwpTbgwz"
+      }
     },
     {
       "cell_type": "code",
       "source": [
-        "g = obnb.data.BioGRID(root, version=data_version)"
+        "adj = g.to_adjmat()\n",
+        "adj"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "lLqZ9f_-BoOt",
-        "outputId": "8aa93790-55c3-419f-d001-71a157b4caaa"
+        "id": "BjX_K65nbfvV"
       },
-      "execution_count": 7,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "[INFO][2023-07-10 23:16:12,911][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 2.2. Gene annotations"
+      ],
+      "metadata": {
+        "id": "IUeQvMf4WWM7"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Setting up gene annotation tasks is a tedious process that includes:\n",
+        "\n",
+        "1. Obtain annotations for gene-term associations and convert gene identifiers\n",
+        "   to the desired type.\n",
+        "1. Obtain and construct the ontology graph that represents the relationships\n",
+        "   among different terms.\n",
+        "1. Propagate the gene-term annotations upward along the ontology graph.\n",
+        "1. Extract non-redundant representative gene sets (terms) from the propagated\n",
+        "   annotations.\n",
+        "\n",
+        "\n",
+        "Here, we use the [DisGeNET](https://www.disgenet.org/) disease gene annotations\n",
+        "with [MONDO](https://mondo.monarchinitiative.org/) disease ontology as an\n",
+        "example to set up the DisGeNET gene set collection.\n",
+        "\n",
+        "[3] Piñero, Janet, et al. \"DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants.\" Nucleic acids research (2016): gkw943.\n",
+        "\n",
+        "[4] Vasilevsky, Nicole A., et al. \"Mondo: Unifying diseases for the world, by the world.\" medRxiv (2022): 2022-04."
+      ],
+      "metadata": {
+        "id": "RXFunr8jfgA-"
+      }
     },
     {
       "cell_type": "code",
       "source": [
-        "print(yaml.dump(g.to_config()))"
+        "# Download annotations and ontology from archive\n",
+        "gsc = obnb.data.DisGeNET(root, version=data_version)"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "1-vLqR-H_F6h",
-        "outputId": "ba4aee5c-9a99-48b3-b519-7e37c6fd211a"
+        "id": "HGtLoOl8WNfh"
       },
-      "execution_count": 8,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "data_module: BioGRID\n",
-            "data_module_params:\n",
-            "  cx_kwargs:\n",
-            "    interaction_types:\n",
-            "    - interacts-with\n",
-            "    node_id_entry: r\n",
-            "    node_id_prefix: ncbigene\n",
-            "  cx_uuid: 36f7d8fd-23dc-11e8-b939-0ac135e8bacf\n",
-            "  directed: false\n",
-            "  gene_id_converter: HumanEntrez\n",
-            "  largest_comp: true\n",
-            "  version: obnbdata-0.1.0\n",
-            "  weighted: false\n",
-            "package_version: 0.1.1-dev\n",
-            "processed_time: '2023-07-10 23:16:15'\n",
-            "\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "adj = g.to_adjmat()\n",
-        "print(adj)"
+        "# Again, once downloaded and processed, it can be used in the future\n",
+        "gsc = obnb.data.DisGeNET(root, version=data_version)"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "-QtlyfJQ-6wT",
-        "outputId": "ee33b889-5206-4e21-9a7d-44f33ceb25be"
+        "id": "PN2yNtAgjGSm"
       },
-      "execution_count": 9,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "[[0. 1. 0. ... 0. 0. 0.]\n",
-            " [1. 0. 0. ... 0. 0. 0.]\n",
-            " [0. 0. 0. ... 0. 0. 0.]\n",
-            " ...\n",
-            " [0. 0. 0. ... 0. 0. 0.]\n",
-            " [0. 0. 0. ... 0. 0. 0.]\n",
-            " [0. 0. 0. ... 0. 0. 0.]]\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Processing config can be inspected in a similar fashion as before"
+      ],
+      "metadata": {
+        "id": "aHvUg8NOjU8N"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print(yaml.dump(gsc.to_config()))"
+      ],
+      "metadata": {
+        "id": "UEZQ-LJ5_vJO"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "obnb.label.LabelsetCollection"
+      ],
+      "metadata": {
+        "id": "TvKsu8rejken"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The `gsc` object is an instance of the `obnb.label.LabelsetCollection` class.\n",
+        "You can also convert it to a\n",
+        "[GMT](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29)-like\n",
+        "dataframe by calling the `to_df` method.\n",
+        "\n",
+        "The resulting dataframe is a table where the first three columns correspond to\n",
+        "the term ID, term info, and the number of genes associated with this term after\n",
+        "the processing. The rest of the columns are gene IDs that are associated with a\n",
+        "particular term, padded with `None`s."
+      ],
+      "metadata": {
+        "id": "jHAIQT6ujeKX"
+      }
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "gsc.to_df()"
+      ],
       "metadata": {
-        "id": "cmvRjsv6-6pj"
+        "id": "bxYbfhya_GAj"
       },
       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## 3. Constructing dataset"
+      ],
+      "metadata": {
+        "id": "t7aARNyJlEBk"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 3.1. The hard way: consolidate the network with gene set collection and combine into a dataset\n",
+        "\n",
+        "- Pros: Flexible filtering and dataset construction to help investigate specific\n",
+        "  biological questions.\n",
+        "- Cons: Many steps involved to filter and pre-process."
+      ],
+      "metadata": {
+        "id": "nVOZQA6OlcLu"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
-        "from obnb import OpenBiomedNetBench\n",
+        "from obnb.label import filters\n",
+        "from obnb.label.split import RatioPartition\n",
+        "from obnb.util.converter import GenePropertyConverter\n",
         "\n",
-        "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n",
-        "                             version=data_version, graph_as_feature=True, use_dense_graph=True)"
+        "\n",
+        "# Load PubMed count gene property converter\n",
+        "pubmedcnt_converter = GenePropertyConverter(root, name=\"PubMedCount\")\n",
+        "\n",
+        "# 6/2/2 study-bias holdout split for genes\n",
+        "splitter = RatioPartition(0.6, 0.2, 0.2, ascending=False,\n",
+        "                          property_converter=pubmedcnt_converter)\n",
+        "\n",
+        "# Apply filters to the gene set collection\n",
+        "gsc_filtered = gsc.apply(\n",
+        "    filters.Compose(\n",
+        "        # Only use genes that are present in the network\n",
+        "        filters.EntityExistenceFilter(list(g.node_ids), log_level=\"INFO\",),\n",
+        "        # Remove any labelsets with fewer than 50 network genes\n",
+        "        filters.LabelsetRangeFilterSize(min_val=50, log_level=\"INFO\",),\n",
+        "        # Make sure each split has at least 10 positive examples\n",
+        "        filters.LabelsetRangeFilterSplit(min_val=10, splitter=splitter, log_level=\"INFO\",),\n",
+        "        log_level=\"INFO\",\n",
+        "    ),\n",
+        ")"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "F6Di1ZJ--6md",
-        "outputId": "ff9a0237-7b48-478f-a8c3-885a12af6c6d"
+        "id": "NOVBeVkPlOIX"
       },
-      "execution_count": 4,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "[INFO][2023-07-10 23:25:45,356][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n",
-            "[INFO][2023-07-10 23:25:45,363][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n",
-            "[INFO][2023-07-10 23:25:45,368][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n",
-            "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.9MB/s]\n",
-            "[INFO][2023-07-10 23:25:49,102][download][download_unzip] Download completed, start unpacking...\n",
-            "[INFO][2023-07-10 23:25:52,276][download][download_unzip] Done extracting\n",
-            "[INFO][2023-07-10 23:25:52,283][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n",
-            "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n",
-            "[INFO][2023-07-10 23:26:03,783][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n",
-            "[INFO][2023-07-10 23:26:04,977][base][_apply_transform] Before transformation:\n",
-            "Number of labelsets: 1040\n",
-            "max: 594\n",
-            "min: 10\n",
-            "med: 36.00\n",
-            "avg: 85.61\n",
-            "std: 120.19\n",
-            "\n",
-            "[INFO][2023-07-10 23:26:04,980][base][_apply_transform] Applying transformation:\n",
-            "Composition of filters:\n",
-            "\t- EntityExistenceFilter(remove_specified=False)\n",
-            "\t- LabelsetRangeFilterSize(min_val=50, max_val=None)\n",
-            "\t- LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True)\n",
-            "\t- NegativeGeneratorHypergeom(p_thresh=0.05)\n",
-            "EntityExistenceFilter(remove_specified=False): 100%|██████████| 9427/9427 [00:07<00:00, 1226.87it/s]\n",
-            "[INFO][obnb.Compose][__call__] Number of labelsets: 1040\n",
-            "max: 571\n",
-            "min: 4\n",
-            "med: 35.00\n",
-            "avg: 81.62\n",
-            "std: 114.00\n",
-            "\n",
-            "LabelsetRangeFilterSize(min_val=50, max_val=None): 100%|██████████| 1040/1040 [00:00<00:00, 3544.88it/s]\n",
-            "[INFO][obnb.Compose][__call__] Number of labelsets: 406\n",
-            "max: 571\n",
-            "min: 50\n",
-            "med: 118.00\n",
-            "avg: 174.50\n",
-            "std: 137.68\n",
-            "\n",
-            "LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True): 100%|██████████| 406/406 [00:44<00:00,  9.20it/s]\n",
-            "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n",
-            "max: 571\n",
-            "min: 50\n",
-            "med: 159.00\n",
-            "avg: 208.26\n",
-            "std: 143.10\n",
-            "\n",
-            "Computing hypergeometric p-value matrix: 100%|██████████| 46360/46360 [00:54<00:00, 857.21it/s] \n",
-            "NegativeGeneratorHypergeom(p_thresh=0.05): 100%|██████████| 305/305 [00:02<00:00, 113.19it/s]\n",
-            "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n",
-            "max: 571\n",
-            "min: 50\n",
-            "med: 159.00\n",
-            "avg: 208.26\n",
-            "std: 143.10\n",
-            "\n",
-            "[INFO][2023-07-10 23:27:53,980][base][_apply_transform] After transformation:\n",
-            "Number of labelsets: 305\n",
-            "max: 571\n",
-            "min: 50\n",
-            "med: 159.00\n",
-            "avg: 208.26\n",
-            "std: 143.10\n",
-            "\n",
-            "[INFO][2023-07-10 23:27:53,998][base][_apply_transform] Saved cache transformation to datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n"
-          ]
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n",
-        "                             version=data_version, graph_as_feature=True, use_dense_graph=True)"
+        "# Combine into an OBNB dataset object\n",
+        "dataset = obnb.Dataset(\n",
+        "    graph=g,\n",
+        "    feature=g.to_dense_graph().to_feature(),\n",
+        "    label=gsc_filtered,\n",
+        "    splitter=splitter,\n",
+        ")"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "_I775-DC-6jL",
-        "outputId": "daaa0cbc-8f3b-4927-f954-3963c9730dd9"
+        "id": "SU4bL7WGlOFG"
       },
       "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "[INFO][2023-07-10 23:19:55,923][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n",
-            "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n",
-            "[INFO][2023-07-10 23:20:09,055][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n",
-            "[INFO][2023-07-10 23:20:12,024][base][_apply_transform] Loading cached transformed data from datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n",
-            "[INFO][2023-07-10 23:20:12,028][base][load_processed_data] Load processed file datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n"
-          ]
-        }
-      ]
+      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "from obnb.model_trainer import LabelPropagationTrainer\n",
-        "from obnb.model.label_propagation import OneHopPropagation\n",
-        "\n",
-        "mdl = OneHopPropagation()\n",
-        "trainer = LabelPropagationTrainer()\n",
+        "dataset.graph"
+      ],
+      "metadata": {
+        "id": "HYDWpfnNlOBb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dataset.label"
+      ],
+      "metadata": {
+        "id": "UhKG5PFalN6_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 3.2. The easy way: OBNB default dataset construction\n",
         "\n",
-        "results = trainer.fit_and_eval(mdl, dataset)"
+        "- Pros: Easy to construct the dataset as it abstracts away many common steps.\n",
+        "- Cons: Less flexible and hard to construct specialized datasets."
       ],
       "metadata": {
-        "id": "QlCizk9x-6gD"
+        "id": "sk3HPD3JlXJe"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dataset = obnb.OpenBiomedNetBench(\n",
+        "    root=root,\n",
+        "    graph_name=\"BioPlex\",\n",
+        "    label_name=\"DisGeNET\",\n",
+        "    version=data_version,\n",
+        "    graph_as_feature=True,\n",
+        "    use_dense_graph=True,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "F6Di1ZJ--6md"
       },
-      "execution_count": 5,
+      "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Similar to all previously shown cases, the dataset has a built-in cache utility\n",
+        "# to help speed up data loading after the first instantiation.\n",
+        "dataset = obnb.OpenBiomedNetBench(\n",
+        "    root=root,\n",
+        "    graph_name=\"BioPlex\",\n",
+        "    label_name=\"DisGeNET\",\n",
+        "    version=data_version,\n",
+        "    graph_as_feature=True,\n",
+        "    use_dense_graph=True,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "_I775-DC-6jL"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## 4. Simple model evaluation using the dataset and the built-in trainer"
+      ],
+      "metadata": {
+        "id": "DgV8pJf9otkk"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 4.1. Label propagation"
+      ],
+      "metadata": {
+        "id": "3RDHlp18pP0B"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
         "import pandas as pd\n",
         "\n",
-        "df = pd.DataFrame(results, index=dataset.label.label_ids)\n",
-        "df"
+        "from obnb.model_trainer import LabelPropagationTrainer\n",
+        "from obnb.model.label_propagation import OneHopPropagation"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 424
-        },
-        "id": "NBtgy76xIoc_",
-        "outputId": "2c3e0472-c679-41d4-d293-c46b2c1d8059"
+        "id": "QlCizk9x-6gD"
       },
-      "execution_count": 10,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "               train_apop  val_apop  test_apop  train_auroc  val_auroc  \\\n",
-              "MONDO:0021034    1.843529  2.163573   0.033863     0.675152   0.483754   \n",
-              "MONDO:0002243    1.813295  0.303094   1.002558     0.701548   0.516716   \n",
-              "MONDO:0002245    0.809497  0.620750   0.206753     0.637431   0.557898   \n",
-              "MONDO:0001703    0.778886  1.511866   4.148121     0.548935   0.588175   \n",
-              "MONDO:0013099    1.914333  2.262900   0.493989     0.646922   0.647186   \n",
-              "...                   ...       ...        ...          ...        ...   \n",
-              "MONDO:0100284    0.603496  0.189524   0.000000     0.573916   0.527694   \n",
-              "MONDO:0020019    1.583002  0.591055   0.879580     0.681023   0.518388   \n",
-              "MONDO:0021002    1.211121  1.055366   1.127546     0.628159   0.585140   \n",
-              "MONDO:0021017    2.228560  1.146228   0.000000     0.520709   0.528579   \n",
-              "MONDO:0100459    3.250616  3.966312   0.060178     0.692115   0.708832   \n",
-              "\n",
-              "               test_auroc  \n",
-              "MONDO:0021034    0.472385  \n",
-              "MONDO:0002243    0.595784  \n",
-              "MONDO:0002245    0.560433  \n",
-              "MONDO:0001703    0.497549  \n",
-              "MONDO:0013099    0.532789  \n",
-              "...                   ...  \n",
-              "MONDO:0100284    0.413082  \n",
-              "MONDO:0020019    0.598177  \n",
-              "MONDO:0021002    0.629074  \n",
-              "MONDO:0021017    0.462455  \n",
-              "MONDO:0100459    0.517378  \n",
-              "\n",
-              "[305 rows x 6 columns]"
-            ],
-            "text/html": [
-              "\n",
-              "\n",
-              "  <div id=\"df-2e61da5a-8ac2-4f5c-8a70-b20197d3e132\">\n",
-              "    <div class=\"colab-df-container\">\n",
-              "      <div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>train_apop</th>\n",
-              "      <th>val_apop</th>\n",
-              "      <th>test_apop</th>\n",
-              "      <th>train_auroc</th>\n",
-              "      <th>val_auroc</th>\n",
-              "      <th>test_auroc</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0021034</th>\n",
-              "      <td>1.843529</td>\n",
-              "      <td>2.163573</td>\n",
-              "      <td>0.033863</td>\n",
-              "      <td>0.675152</td>\n",
-              "      <td>0.483754</td>\n",
-              "      <td>0.472385</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0002243</th>\n",
-              "      <td>1.813295</td>\n",
-              "      <td>0.303094</td>\n",
-              "      <td>1.002558</td>\n",
-              "      <td>0.701548</td>\n",
-              "      <td>0.516716</td>\n",
-              "      <td>0.595784</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0002245</th>\n",
-              "      <td>0.809497</td>\n",
-              "      <td>0.620750</td>\n",
-              "      <td>0.206753</td>\n",
-              "      <td>0.637431</td>\n",
-              "      <td>0.557898</td>\n",
-              "      <td>0.560433</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0001703</th>\n",
-              "      <td>0.778886</td>\n",
-              "      <td>1.511866</td>\n",
-              "      <td>4.148121</td>\n",
-              "      <td>0.548935</td>\n",
-              "      <td>0.588175</td>\n",
-              "      <td>0.497549</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0013099</th>\n",
-              "      <td>1.914333</td>\n",
-              "      <td>2.262900</td>\n",
-              "      <td>0.493989</td>\n",
-              "      <td>0.646922</td>\n",
-              "      <td>0.647186</td>\n",
-              "      <td>0.532789</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>...</th>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "      <td>...</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0100284</th>\n",
-              "      <td>0.603496</td>\n",
-              "      <td>0.189524</td>\n",
-              "      <td>0.000000</td>\n",
-              "      <td>0.573916</td>\n",
-              "      <td>0.527694</td>\n",
-              "      <td>0.413082</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0020019</th>\n",
-              "      <td>1.583002</td>\n",
-              "      <td>0.591055</td>\n",
-              "      <td>0.879580</td>\n",
-              "      <td>0.681023</td>\n",
-              "      <td>0.518388</td>\n",
-              "      <td>0.598177</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0021002</th>\n",
-              "      <td>1.211121</td>\n",
-              "      <td>1.055366</td>\n",
-              "      <td>1.127546</td>\n",
-              "      <td>0.628159</td>\n",
-              "      <td>0.585140</td>\n",
-              "      <td>0.629074</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0021017</th>\n",
-              "      <td>2.228560</td>\n",
-              "      <td>1.146228</td>\n",
-              "      <td>0.000000</td>\n",
-              "      <td>0.520709</td>\n",
-              "      <td>0.528579</td>\n",
-              "      <td>0.462455</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>MONDO:0100459</th>\n",
-              "      <td>3.250616</td>\n",
-              "      <td>3.966312</td>\n",
-              "      <td>0.060178</td>\n",
-              "      <td>0.692115</td>\n",
-              "      <td>0.708832</td>\n",
-              "      <td>0.517378</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "<p>305 rows × 6 columns</p>\n",
-              "</div>\n",
-              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2e61da5a-8ac2-4f5c-8a70-b20197d3e132')\"\n",
-              "              title=\"Convert this dataframe to an interactive table.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "       width=\"24px\">\n",
-              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
-              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
-              "  </svg>\n",
-              "      </button>\n",
-              "\n",
-              "\n",
-              "\n",
-              "    <div id=\"df-30de502d-6582-4034-a2c8-456d846e0bb1\">\n",
-              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-30de502d-6582-4034-a2c8-456d846e0bb1')\"\n",
-              "              title=\"Suggest charts.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "     width=\"24px\">\n",
-              "    <g>\n",
-              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
-              "    </g>\n",
-              "</svg>\n",
-              "      </button>\n",
-              "    </div>\n",
-              "\n",
-              "<style>\n",
-              "  .colab-df-quickchart {\n",
-              "    background-color: #E8F0FE;\n",
-              "    border: none;\n",
-              "    border-radius: 50%;\n",
-              "    cursor: pointer;\n",
-              "    display: none;\n",
-              "    fill: #1967D2;\n",
-              "    height: 32px;\n",
-              "    padding: 0 0 0 0;\n",
-              "    width: 32px;\n",
-              "  }\n",
-              "\n",
-              "  .colab-df-quickchart:hover {\n",
-              "    background-color: #E2EBFA;\n",
-              "    box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "    fill: #174EA6;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart {\n",
-              "    background-color: #3B4455;\n",
-              "    fill: #D2E3FC;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart:hover {\n",
-              "    background-color: #434B5C;\n",
-              "    box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "    filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "    fill: #FFFFFF;\n",
-              "  }\n",
-              "</style>\n",
-              "\n",
-              "    <script>\n",
-              "      async function quickchart(key) {\n",
-              "        const containerElement = document.querySelector('#' + key);\n",
-              "        const charts = await google.colab.kernel.invokeFunction(\n",
-              "            'suggestCharts', [key], {});\n",
-              "      }\n",
-              "    </script>\n",
-              "\n",
-              "      <script>\n",
-              "\n",
-              "function displayQuickchartButton(domScope) {\n",
-              "  let quickchartButtonEl =\n",
-              "    domScope.querySelector('#df-30de502d-6582-4034-a2c8-456d846e0bb1 button.colab-df-quickchart');\n",
-              "  quickchartButtonEl.style.display =\n",
-              "    google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "}\n",
-              "\n",
-              "        displayQuickchartButton(document);\n",
-              "      </script>\n",
-              "      <style>\n",
-              "    .colab-df-container {\n",
-              "      display:flex;\n",
-              "      flex-wrap:wrap;\n",
-              "      gap: 12px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert {\n",
-              "      background-color: #E8F0FE;\n",
-              "      border: none;\n",
-              "      border-radius: 50%;\n",
-              "      cursor: pointer;\n",
-              "      display: none;\n",
-              "      fill: #1967D2;\n",
-              "      height: 32px;\n",
-              "      padding: 0 0 0 0;\n",
-              "      width: 32px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert:hover {\n",
-              "      background-color: #E2EBFA;\n",
-              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "      fill: #174EA6;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert {\n",
-              "      background-color: #3B4455;\n",
-              "      fill: #D2E3FC;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert:hover {\n",
-              "      background-color: #434B5C;\n",
-              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "      fill: #FFFFFF;\n",
-              "    }\n",
-              "  </style>\n",
-              "\n",
-              "      <script>\n",
-              "        const buttonEl =\n",
-              "          document.querySelector('#df-2e61da5a-8ac2-4f5c-8a70-b20197d3e132 button.colab-df-convert');\n",
-              "        buttonEl.style.display =\n",
-              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "\n",
-              "        async function convertToInteractive(key) {\n",
-              "          const element = document.querySelector('#df-2e61da5a-8ac2-4f5c-8a70-b20197d3e132');\n",
-              "          const dataTable =\n",
-              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
-              "                                                     [key], {});\n",
-              "          if (!dataTable) return;\n",
-              "\n",
-              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
-              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
-              "            + ' to learn more about interactive tables.';\n",
-              "          element.innerHTML = '';\n",
-              "          dataTable['output_type'] = 'display_data';\n",
-              "          await google.colab.output.renderOutput(dataTable, element);\n",
-              "          const docLink = document.createElement('div');\n",
-              "          docLink.innerHTML = docLinkHtml;\n",
-              "          element.appendChild(docLink);\n",
-              "        }\n",
-              "      </script>\n",
-              "    </div>\n",
-              "  </div>\n"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 10
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
-        "df.describe()"
+        "lp_mdl = OneHopPropagation()\n",
+        "lp_trainer = LabelPropagationTrainer()"
       ],
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 300
-        },
-        "id": "np234taeKVjQ",
-        "outputId": "19e843c3-b927-4c82-ca9e-b42aea000202"
+        "id": "JyrNWeZ4pDA5"
       },
-      "execution_count": 11,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "       train_apop    val_apop   test_apop  train_auroc   val_auroc  test_auroc\n",
-              "count  305.000000  305.000000  305.000000   305.000000  305.000000  305.000000\n",
-              "mean     1.152127    1.003746    0.819258     0.623729    0.561479    0.521614\n",
-              "std      0.753213    1.065416    1.116516     0.065599    0.082058    0.063061\n",
-              "min      0.001297   -0.320246   -0.213516     0.485241    0.351099    0.375165\n",
-              "25%      0.646890    0.255761    0.053465     0.582988    0.506204    0.473407\n",
-              "50%      0.993217    0.623817    0.392785     0.620002    0.551109    0.521458\n",
-              "75%      1.507760    1.465971    1.137018     0.659771    0.596552    0.560724\n",
-              "max      5.851295    6.370345    6.111766     0.965775    0.951942    0.794405"
-            ],
-            "text/html": [
-              "\n",
-              "\n",
-              "  <div id=\"df-bb7ed080-4e5b-45f1-a904-96f54ad26681\">\n",
-              "    <div class=\"colab-df-container\">\n",
-              "      <div>\n",
-              "<style scoped>\n",
-              "    .dataframe tbody tr th:only-of-type {\n",
-              "        vertical-align: middle;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe tbody tr th {\n",
-              "        vertical-align: top;\n",
-              "    }\n",
-              "\n",
-              "    .dataframe thead th {\n",
-              "        text-align: right;\n",
-              "    }\n",
-              "</style>\n",
-              "<table border=\"1\" class=\"dataframe\">\n",
-              "  <thead>\n",
-              "    <tr style=\"text-align: right;\">\n",
-              "      <th></th>\n",
-              "      <th>train_apop</th>\n",
-              "      <th>val_apop</th>\n",
-              "      <th>test_apop</th>\n",
-              "      <th>train_auroc</th>\n",
-              "      <th>val_auroc</th>\n",
-              "      <th>test_auroc</th>\n",
-              "    </tr>\n",
-              "  </thead>\n",
-              "  <tbody>\n",
-              "    <tr>\n",
-              "      <th>count</th>\n",
-              "      <td>305.000000</td>\n",
-              "      <td>305.000000</td>\n",
-              "      <td>305.000000</td>\n",
-              "      <td>305.000000</td>\n",
-              "      <td>305.000000</td>\n",
-              "      <td>305.000000</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>mean</th>\n",
-              "      <td>1.152127</td>\n",
-              "      <td>1.003746</td>\n",
-              "      <td>0.819258</td>\n",
-              "      <td>0.623729</td>\n",
-              "      <td>0.561479</td>\n",
-              "      <td>0.521614</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>std</th>\n",
-              "      <td>0.753213</td>\n",
-              "      <td>1.065416</td>\n",
-              "      <td>1.116516</td>\n",
-              "      <td>0.065599</td>\n",
-              "      <td>0.082058</td>\n",
-              "      <td>0.063061</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>min</th>\n",
-              "      <td>0.001297</td>\n",
-              "      <td>-0.320246</td>\n",
-              "      <td>-0.213516</td>\n",
-              "      <td>0.485241</td>\n",
-              "      <td>0.351099</td>\n",
-              "      <td>0.375165</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>25%</th>\n",
-              "      <td>0.646890</td>\n",
-              "      <td>0.255761</td>\n",
-              "      <td>0.053465</td>\n",
-              "      <td>0.582988</td>\n",
-              "      <td>0.506204</td>\n",
-              "      <td>0.473407</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>50%</th>\n",
-              "      <td>0.993217</td>\n",
-              "      <td>0.623817</td>\n",
-              "      <td>0.392785</td>\n",
-              "      <td>0.620002</td>\n",
-              "      <td>0.551109</td>\n",
-              "      <td>0.521458</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>75%</th>\n",
-              "      <td>1.507760</td>\n",
-              "      <td>1.465971</td>\n",
-              "      <td>1.137018</td>\n",
-              "      <td>0.659771</td>\n",
-              "      <td>0.596552</td>\n",
-              "      <td>0.560724</td>\n",
-              "    </tr>\n",
-              "    <tr>\n",
-              "      <th>max</th>\n",
-              "      <td>5.851295</td>\n",
-              "      <td>6.370345</td>\n",
-              "      <td>6.111766</td>\n",
-              "      <td>0.965775</td>\n",
-              "      <td>0.951942</td>\n",
-              "      <td>0.794405</td>\n",
-              "    </tr>\n",
-              "  </tbody>\n",
-              "</table>\n",
-              "</div>\n",
-              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-bb7ed080-4e5b-45f1-a904-96f54ad26681')\"\n",
-              "              title=\"Convert this dataframe to an interactive table.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "       width=\"24px\">\n",
-              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
-              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
-              "  </svg>\n",
-              "      </button>\n",
-              "\n",
-              "\n",
-              "\n",
-              "    <div id=\"df-5d515117-d4cc-4a36-88bf-1e06feab09c6\">\n",
-              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-5d515117-d4cc-4a36-88bf-1e06feab09c6')\"\n",
-              "              title=\"Suggest charts.\"\n",
-              "              style=\"display:none;\">\n",
-              "\n",
-              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
-              "     width=\"24px\">\n",
-              "    <g>\n",
-              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
-              "    </g>\n",
-              "</svg>\n",
-              "      </button>\n",
-              "    </div>\n",
-              "\n",
-              "<style>\n",
-              "  .colab-df-quickchart {\n",
-              "    background-color: #E8F0FE;\n",
-              "    border: none;\n",
-              "    border-radius: 50%;\n",
-              "    cursor: pointer;\n",
-              "    display: none;\n",
-              "    fill: #1967D2;\n",
-              "    height: 32px;\n",
-              "    padding: 0 0 0 0;\n",
-              "    width: 32px;\n",
-              "  }\n",
-              "\n",
-              "  .colab-df-quickchart:hover {\n",
-              "    background-color: #E2EBFA;\n",
-              "    box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "    fill: #174EA6;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart {\n",
-              "    background-color: #3B4455;\n",
-              "    fill: #D2E3FC;\n",
-              "  }\n",
-              "\n",
-              "  [theme=dark] .colab-df-quickchart:hover {\n",
-              "    background-color: #434B5C;\n",
-              "    box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "    filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "    fill: #FFFFFF;\n",
-              "  }\n",
-              "</style>\n",
-              "\n",
-              "    <script>\n",
-              "      async function quickchart(key) {\n",
-              "        const containerElement = document.querySelector('#' + key);\n",
-              "        const charts = await google.colab.kernel.invokeFunction(\n",
-              "            'suggestCharts', [key], {});\n",
-              "      }\n",
-              "    </script>\n",
-              "\n",
-              "      <script>\n",
-              "\n",
-              "function displayQuickchartButton(domScope) {\n",
-              "  let quickchartButtonEl =\n",
-              "    domScope.querySelector('#df-5d515117-d4cc-4a36-88bf-1e06feab09c6 button.colab-df-quickchart');\n",
-              "  quickchartButtonEl.style.display =\n",
-              "    google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "}\n",
-              "\n",
-              "        displayQuickchartButton(document);\n",
-              "      </script>\n",
-              "      <style>\n",
-              "    .colab-df-container {\n",
-              "      display:flex;\n",
-              "      flex-wrap:wrap;\n",
-              "      gap: 12px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert {\n",
-              "      background-color: #E8F0FE;\n",
-              "      border: none;\n",
-              "      border-radius: 50%;\n",
-              "      cursor: pointer;\n",
-              "      display: none;\n",
-              "      fill: #1967D2;\n",
-              "      height: 32px;\n",
-              "      padding: 0 0 0 0;\n",
-              "      width: 32px;\n",
-              "    }\n",
-              "\n",
-              "    .colab-df-convert:hover {\n",
-              "      background-color: #E2EBFA;\n",
-              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
-              "      fill: #174EA6;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert {\n",
-              "      background-color: #3B4455;\n",
-              "      fill: #D2E3FC;\n",
-              "    }\n",
-              "\n",
-              "    [theme=dark] .colab-df-convert:hover {\n",
-              "      background-color: #434B5C;\n",
-              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
-              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
-              "      fill: #FFFFFF;\n",
-              "    }\n",
-              "  </style>\n",
-              "\n",
-              "      <script>\n",
-              "        const buttonEl =\n",
-              "          document.querySelector('#df-bb7ed080-4e5b-45f1-a904-96f54ad26681 button.colab-df-convert');\n",
-              "        buttonEl.style.display =\n",
-              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
-              "\n",
-              "        async function convertToInteractive(key) {\n",
-              "          const element = document.querySelector('#df-bb7ed080-4e5b-45f1-a904-96f54ad26681');\n",
-              "          const dataTable =\n",
-              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
-              "                                                     [key], {});\n",
-              "          if (!dataTable) return;\n",
-              "\n",
-              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
-              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
-              "            + ' to learn more about interactive tables.';\n",
-              "          element.innerHTML = '';\n",
-              "          dataTable['output_type'] = 'display_data';\n",
-              "          await google.colab.output.renderOutput(dataTable, element);\n",
-              "          const docLink = document.createElement('div');\n",
-              "          docLink.innerHTML = docLinkHtml;\n",
-              "          element.appendChild(docLink);\n",
-              "        }\n",
-              "      </script>\n",
-              "    </div>\n",
-              "  </div>\n"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 11
-        }
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "lp_results = lp_trainer.fit_and_eval(lp_mdl, dataset)"
+      ],
       "metadata": {
-        "id": "3I3Z0WJyKVQP"
+        "id": "5sWjyXv1pFuP"
       },
       "execution_count": null,
       "outputs": []
@@ -1597,73 +599,101 @@
     {
       "cell_type": "code",
       "source": [
-        "from sklearn.linear_model import LogisticRegression\n",
-        "from obnb.model_trainer import SupervisedLearningTrainer\n",
-        "\n",
-        "mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n",
-        "trainer = SupervisedLearningTrainer()\n",
-        "\n",
-        "results2 = trainer.fit_and_eval(mdl, dataset)"
+        "lp_df = pd.DataFrame(lp_results, index=dataset.label.label_ids)\n",
+        "lp_df"
       ],
       "metadata": {
-        "id": "GdeNbEDz-6cx"
+        "id": "NBtgy76xIoc_"
       },
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "lp_df.describe()"
+      ],
       "metadata": {
-        "id": "-NJKfTpx-6Z4"
+        "id": "np234taeKVjQ"
       },
       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 4.2. Supervised learning"
+      ],
+      "metadata": {
+        "id": "hkVYJQE8pR9F"
+      }
+    },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from obnb.model_trainer import SupervisedLearningTrainer"
+      ],
       "metadata": {
-        "id": "3uHHEcsx-6Wy"
+        "id": "GdeNbEDz-6cx"
       },
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "sl_mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n",
+        "sl_trainer = SupervisedLearningTrainer()"
+      ],
       "metadata": {
-        "id": "-14ui8Jt-6Tf"
+        "id": "JXWOi3hGpfIG"
       },
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "sl_results = sl_trainer.fit_and_eval(sl_mdl, dataset)"
+      ],
       "metadata": {
-        "id": "poGro_Qo-6Qz"
+        "id": "FTj2l-9ipj-4"
       },
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "sl_df = pd.DataFrame(sl_results, index=dataset.label.label_ids)\n",
+        "sl_df"
+      ],
       "metadata": {
-        "id": "Hn4bHRIg-6Nz"
+        "id": "A4OZoPsipaiw"
       },
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [],
+      "source": [
+        "sl_df.describe()"
+      ],
       "metadata": {
-        "id": "v3FTLk__-5zs"
+        "id": "lxHqDjPupcT0"
       },
       "execution_count": null,
       "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### 4.3. GNN (coming soon)"
+      ],
+      "metadata": {
+        "id": "Qf6Z7iBfpVfZ"
+      }
     }
   ]
 }
\ No newline at end of file

	Name	Info	Size	0	1	2	3	4	5	6	...	584	585	586	587	588	589	590	591	592	593
0	MONDO:0000004	adrenocortical insufficiency	44	23530	2737	55699	3284	1585	1589	50940	...	None	None	None	None	None	None	None	None	None	None
1	MONDO:0021034	genetic alopecia	76	340419	5894	6635	92344	10913	4289	22808	...	None	None	None	None	None	None	None	None	None	None
2	MONDO:0000009	inherited bleeding disorder, platelet-type	37	5328	2533	342618	6916	2531	6915	80739	...	None	None	None	None	None	None	None	None	None	None
3	MONDO:0002243	hemorrhagic disease	135	7450	342618	4618	6916	2531	421	196527	...	None	None	None	None	None	None	None	None	None	None
4	MONDO:0002245	blood platelet disease	329	5698	7706	55135	2475	342618	79053	374569	...	None	None	None	None	None	None	None	None	None	None
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1035	MONDO:0044976	obsolete disease of catalytic activity	11	2936	2539	2729	2937	226	2023	3098	...	None	None	None	None	None	None	None	None	None	None
1036	MONDO:0100130	adult acute respiratory distress syndrome	55	6347	1906	407055	442911	5685	406953	210	...	None	None	None	None	None	None	None	None	None	None
1037	MONDO:0100431	migraine without aura	19	796	79783	1909	4209	101929660	79054	1740	...	None	None	None	None	None	None	None	None	None	None
1038	MONDO:0100459	azoospermia	95	5889	3077	4952	9085	2488	84464	6660	...	None	None	None	None	None	None	None	None	None	None
1039	MONDO:0100471	vitamin D deficiency	13	6197	3508	10939	9772	4036	84617	7421	...	None	None	None	None	None	None	None	None	None	None
	train_apop	val_apop	test_apop	train_auroc	val_auroc	test_auroc
MONDO:0021034	1.843529	2.163573	0.033863	0.675152	0.483754	0.472385
MONDO:0002243	1.813295	0.303094	1.002558	0.701548	0.516716	0.595784
MONDO:0002245	0.809497	0.620750	0.206753	0.637431	0.557898	0.560433
MONDO:0001703	0.778886	1.511866	4.148121	0.548935	0.588175	0.497549
MONDO:0013099	1.914333	2.262900	0.493989	0.646922	0.647186	0.532789
...	...	...	...	...	...	...
MONDO:0100284	0.603496	0.189524	0.000000	0.573916	0.527694	0.413082
MONDO:0020019	1.583002	0.591055	0.879580	0.681023	0.518388	0.598177
MONDO:0021002	1.211121	1.055366	1.127546	0.628159	0.585140	0.629074
MONDO:0021017	2.228560	1.146228	0.000000	0.520709	0.528579	0.462455
MONDO:0100459	3.250616	3.966312	0.060178	0.692115	0.708832	0.517378
	train_apop	val_apop	test_apop	train_auroc	val_auroc	test_auroc
count	305.000000	305.000000	305.000000	305.000000	305.000000	305.000000
mean	1.152127	1.003746	0.819258	0.623729	0.561479	0.521614
std	0.753213	1.065416	1.116516	0.065599	0.082058	0.063061
min	0.001297	-0.320246	-0.213516	0.485241	0.351099	0.375165
25%	0.646890	0.255761	0.053465	0.582988	0.506204	0.473407
50%	0.993217	0.623817	0.392785	0.620002	0.551109	0.521458
75%	1.507760	1.465971	1.137018	0.659771	0.596552	0.560724
max	5.851295	6.370345	6.111766	0.965775	0.951942	0.794405