From e813d2ef342406cfb1d55efa9e42cd6ff7f532eb Mon Sep 17 00:00:00 2001
From: Remy Liu <36778645+RemyLau@users.noreply.github.com>
Date: Tue, 11 Jul 2023 11:54:22 -0400
Subject: [PATCH] Created using Colaboratory
---
tutorials/basic_tutorial.ipynb | 1932 ++++++++------------------------
1 file changed, 481 insertions(+), 1451 deletions(-)
diff --git a/tutorials/basic_tutorial.ipynb b/tutorials/basic_tutorial.ipynb
index 2e623df3..025fc4c8 100644
--- a/tutorials/basic_tutorial.ipynb
+++ b/tutorials/basic_tutorial.ipynb
@@ -4,7 +4,8 @@
"metadata": {
"colab": {
"provenance": [],
- "authorship_tag": "ABX9TyO107Ua39a7xOmb/P+xC24j",
+ "toc_visible": true,
+ "authorship_tag": "ABX9TyM/72QVmPpoW9JPrZYT0/P3",
"include_colab_link": true
},
"kernelspec": {
@@ -26,39 +27,57 @@
""
]
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Basic Tutorial for the Open Biomedical Network Benchmark package"
+ ],
+ "metadata": {
+ "id": "Ba_AaNS7Stg8"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Installation\n",
+ "\n",
+ "Installation can be easily done via `pip`.\n",
+ "\n",
+ "via PyPI (released or pre-release versions)\n",
+ "```bash\n",
+ "pip install obnb\n",
+ "```\n",
+ "\n",
+ "or via GitHub (latest dev version)\n",
+ "```bash\n",
+ "pip install git+https://github.com/krishnanlab/obnb\n",
+ "```"
+ ],
+ "metadata": {
+ "id": "pv7SYyrlTKl4"
+ }
+ },
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "fypP0bMZ-Wsu",
- "outputId": "f4bf8c41-0632-42dc-b2bc-7035757928ac"
+ "id": "fypP0bMZ-Wsu"
},
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
- " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
- " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.0/59.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.7/112.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Building wheel for obnb (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
- " Building wheel for littleutils (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
+ "# Install latest dev version of OBNB\n",
"!pip install -q git+https://github.com/krishnanlab/obnb"
]
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Check if the package is installed successfully"
+ ],
+ "metadata": {
+ "id": "WefaXPkqUS6e"
+ }
+ },
{
"cell_type": "code",
"source": [
@@ -66,22 +85,47 @@
"print(f\"Installed obnb {obnb.__version__}\")"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "XJbywsXU-6zH",
- "outputId": "3e561b06-0f81-4545-887f-ec71f7165cc8"
+ "id": "XJbywsXU-6zH"
},
- "execution_count": 2,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Installed obnb 0.1.1-dev\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import obnb.ext.pecanpy\n",
+ "print(f\"Extension for PecanPy installed: {obnb.ext.pecanpy}\")"
+ ],
+ "metadata": {
+ "id": "_ZYMxfgfUZFe"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Data downloading and processing"
+ ],
+ "metadata": {
+ "id": "oZsfNaHqVaQu"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "First, load the `obnb.data` module that contains \"recipes\" for processing\n",
+ "different selections of biological networks and gene annotation data.\n",
+ "\n",
+ "We also need to specify (1) the path to which the data will be saved, and more\n",
+ "importantly, (2) the **version** of the data we want to retrieve. The version\n",
+ "option allows for flexible data retrieval (either retrieve data from source, or\n",
+ "retrieve from processed data archive) and also enables reproduction of the\n",
+ "downstream analysis."
+ ],
+ "metadata": {
+ "id": "bDx-hDiTVsM4"
+ }
},
{
"cell_type": "code",
@@ -89,1507 +133,465 @@
"import obnb.data\n",
"import yaml\n",
"\n",
+ "# Where do we want to save the data and related files to\n",
"root = \"datasets\"\n",
+ "\n",
+ "# What version of the pre-processed data to download\n",
"data_version = \"obnbdata-0.1.0\"\n",
- "lsc = obnb.data.DisGeNET(root, version=data_version)"
+ "# data_version = \"latest\" # download data from source and process from scratch\n",
+ "# data_version = \"current\" # use the latest archived data version"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "s1apiAau_GDY",
- "outputId": "95ac9e0b-52ab-42db-fc1b-dba9f7691827"
+ "id": "s1apiAau_GDY"
},
- "execution_count": 3,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[INFO][2023-07-10 23:25:35,227][base][download_archive] Loading DisGeNET (version='obnbdata-0.1.0')...\n",
- "[INFO][2023-07-10 23:25:35,229][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/DisGeNET.zip\n",
- "[INFO][2023-07-10 23:25:35,232][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/DisGeNET.zip\n",
- "100%|██████████| 219k/219k [00:00<00:00, 687kB/s]\n",
- "[INFO][2023-07-10 23:25:36,852][download][download_unzip] Download completed, start unpacking...\n",
- "[INFO][2023-07-10 23:25:36,864][download][download_unzip] Done extracting\n",
- "[INFO][2023-07-10 23:25:36,869][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/.cache.zip\n",
- "[INFO][2023-07-10 23:25:36,871][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/.cache.zip\n",
- "100%|██████████| 24.7M/24.7M [00:01<00:00, 13.0MB/s]\n",
- "[INFO][2023-07-10 23:25:39,817][download][download_unzip] Download completed, start unpacking...\n",
- "[INFO][2023-07-10 23:25:41,874][download][download_unzip] Done extracting\n",
- "[INFO][2023-07-10 23:25:41,902][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 2.1. Biological networks"
+ ],
+ "metadata": {
+ "id": "8YF_zoqBWOzV"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Let's start with an example of obtaining the `BioPlex` network, which is a\n",
+ "protein-protein interaction (PPI) network that is constructed via AP-MS on\n",
+ "human cell-lines ([HEK293T](https://www.synthego.com/hek293) and\n",
+ "[HCT116](https://imanislife.com/collections/cell-lines/hct116-cells/)).\n",
+ "Check out other available options for processed biomedical networks on the OBNB\n",
+ "benchmark\n",
+ "[README](https://github.com/krishnanlab/obnbench#data-stats-obnbdata-010-) page.\n",
+ "\n",
+ "[1] Huttlin, Edward L., et al. \"The BioPlex network: a systematic exploration of the human interactome.\" Cell 162.2 (2015): 425-440.\n",
+ "\n",
+ "[2] Huttlin, Edward L., et al. \"Dual proteome-scale networks reveal cell-specific remodeling of the human interactome.\" Cell 184.11 (2021): 3022-3040."
+ ],
+ "metadata": {
+ "id": "7idT6WBxXR29"
+ }
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "# Download network from archive\n",
+ "g = obnb.data.BioPlex(root, version=data_version)"
+ ],
"metadata": {
- "id": "dHZSPRK0_xYz"
+ "id": "-Wsdv0VmWVfr"
},
- "execution_count": 3,
+ "execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
- "print(yaml.dump(lsc.to_config()))"
+ "# Once downloaded, it can be used in future access without redownloading\n",
+ "g = obnb.data.BioPlex(root, version=data_version)"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "UEZQ-LJ5_vJO",
- "outputId": "a88f919b-7eab-4f94-cbfb-ba91a283ac5e"
+ "id": "ovT8pvzbWVdR"
},
- "execution_count": 4,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "data_module: DisGeNET\n",
- "data_module_params:\n",
- " gene_id_converter: HumanEntrez\n",
- " pre_transform:\n",
- " - LabelsetRangeFilterSize:\n",
- " max_val: '600'\n",
- " min_val: None\n",
- " - LabelsetNonRedFilter:\n",
- " thresholds: (0.5, 0.7)\n",
- " - LabelsetRangeFilterSize:\n",
- " max_val: None\n",
- " min_val: '10'\n",
- " version: obnbdata-0.1.0\n",
- "package_version: 0.1.1-dev\n",
- "processed_time: '2023-07-10 23:15:57'\n",
- "\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
},
{
"cell_type": "code",
"source": [
- "lsc.to_df()"
+ "# You can also force redownloading the data by specifying redownload=True\n",
+ "g = obnb.data.BioPlex(root, version=data_version, redownload=True)"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 748
- },
- "id": "bxYbfhya_GAj",
- "outputId": "c3f222d2-c853-4023-901c-9726ac62bed4"
+ "id": "KDFC5JnyWVOb"
},
- "execution_count": 5,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " Name Info Size 0 \\\n",
- "0 MONDO:0000004 adrenocortical insufficiency 44 23530 \n",
- "1 MONDO:0021034 genetic alopecia 76 340419 \n",
- "2 MONDO:0000009 inherited bleeding disorder, platelet-type 37 5328 \n",
- "3 MONDO:0002243 hemorrhagic disease 135 7450 \n",
- "4 MONDO:0002245 blood platelet disease 329 5698 \n",
- "... ... ... ... ... \n",
- "1035 MONDO:0044976 obsolete disease of catalytic activity 11 2936 \n",
- "1036 MONDO:0100130 adult acute respiratory distress syndrome 55 6347 \n",
- "1037 MONDO:0100431 migraine without aura 19 796 \n",
- "1038 MONDO:0100459 azoospermia 95 5889 \n",
- "1039 MONDO:0100471 vitamin D deficiency 13 6197 \n",
- "\n",
- " 1 2 3 4 5 6 ... 584 585 \\\n",
- "0 2737 55699 3284 1585 1589 50940 ... None None \n",
- "1 5894 6635 92344 10913 4289 22808 ... None None \n",
- "2 2533 342618 6916 2531 6915 80739 ... None None \n",
- "3 342618 4618 6916 2531 421 196527 ... None None \n",
- "4 7706 55135 2475 342618 79053 374569 ... None None \n",
- "... ... ... ... ... ... ... ... ... ... \n",
- "1035 2539 2729 2937 226 2023 3098 ... None None \n",
- "1036 1906 407055 442911 5685 406953 210 ... None None \n",
- "1037 79783 1909 4209 101929660 79054 1740 ... None None \n",
- "1038 3077 4952 9085 2488 84464 6660 ... None None \n",
- "1039 3508 10939 9772 4036 84617 7421 ... None None \n",
- "\n",
- " 586 587 588 589 590 591 592 593 \n",
- "0 None None None None None None None None \n",
- "1 None None None None None None None None \n",
- "2 None None None None None None None None \n",
- "3 None None None None None None None None \n",
- "4 None None None None None None None None \n",
- "... ... ... ... ... ... ... ... ... \n",
- "1035 None None None None None None None None \n",
- "1036 None None None None None None None None \n",
- "1037 None None None None None None None None \n",
- "1038 None None None None None None None None \n",
- "1039 None None None None None None None None \n",
- "\n",
- "[1040 rows x 597 columns]"
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Name | \n",
- " Info | \n",
- " Size | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- " 3 | \n",
- " 4 | \n",
- " 5 | \n",
- " 6 | \n",
- " ... | \n",
- " 584 | \n",
- " 585 | \n",
- " 586 | \n",
- " 587 | \n",
- " 588 | \n",
- " 589 | \n",
- " 590 | \n",
- " 591 | \n",
- " 592 | \n",
- " 593 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " MONDO:0000004 | \n",
- " adrenocortical insufficiency | \n",
- " 44 | \n",
- " 23530 | \n",
- " 2737 | \n",
- " 55699 | \n",
- " 3284 | \n",
- " 1585 | \n",
- " 1589 | \n",
- " 50940 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " MONDO:0021034 | \n",
- " genetic alopecia | \n",
- " 76 | \n",
- " 340419 | \n",
- " 5894 | \n",
- " 6635 | \n",
- " 92344 | \n",
- " 10913 | \n",
- " 4289 | \n",
- " 22808 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " MONDO:0000009 | \n",
- " inherited bleeding disorder, platelet-type | \n",
- " 37 | \n",
- " 5328 | \n",
- " 2533 | \n",
- " 342618 | \n",
- " 6916 | \n",
- " 2531 | \n",
- " 6915 | \n",
- " 80739 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " MONDO:0002243 | \n",
- " hemorrhagic disease | \n",
- " 135 | \n",
- " 7450 | \n",
- " 342618 | \n",
- " 4618 | \n",
- " 6916 | \n",
- " 2531 | \n",
- " 421 | \n",
- " 196527 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " MONDO:0002245 | \n",
- " blood platelet disease | \n",
- " 329 | \n",
- " 5698 | \n",
- " 7706 | \n",
- " 55135 | \n",
- " 2475 | \n",
- " 342618 | \n",
- " 79053 | \n",
- " 374569 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1035 | \n",
- " MONDO:0044976 | \n",
- " obsolete disease of catalytic activity | \n",
- " 11 | \n",
- " 2936 | \n",
- " 2539 | \n",
- " 2729 | \n",
- " 2937 | \n",
- " 226 | \n",
- " 2023 | \n",
- " 3098 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1036 | \n",
- " MONDO:0100130 | \n",
- " adult acute respiratory distress syndrome | \n",
- " 55 | \n",
- " 6347 | \n",
- " 1906 | \n",
- " 407055 | \n",
- " 442911 | \n",
- " 5685 | \n",
- " 406953 | \n",
- " 210 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1037 | \n",
- " MONDO:0100431 | \n",
- " migraine without aura | \n",
- " 19 | \n",
- " 796 | \n",
- " 79783 | \n",
- " 1909 | \n",
- " 4209 | \n",
- " 101929660 | \n",
- " 79054 | \n",
- " 1740 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1038 | \n",
- " MONDO:0100459 | \n",
- " azoospermia | \n",
- " 95 | \n",
- " 5889 | \n",
- " 3077 | \n",
- " 4952 | \n",
- " 9085 | \n",
- " 2488 | \n",
- " 84464 | \n",
- " 6660 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- " 1039 | \n",
- " MONDO:0100471 | \n",
- " vitamin D deficiency | \n",
- " 13 | \n",
- " 6197 | \n",
- " 3508 | \n",
- " 10939 | \n",
- " 9772 | \n",
- " 4036 | \n",
- " 84617 | \n",
- " 7421 | \n",
- " ... | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- " None | \n",
- "
\n",
- " \n",
- "
\n",
- "
1040 rows × 597 columns
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n"
- ]
- },
- "metadata": {},
- "execution_count": 5
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You can also check out more information about the processing done for this\n",
+ "network by looking into the config."
+ ],
+ "metadata": {
+ "id": "z8qGrmgTbRzW"
+ }
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "print(yaml.dump(g.to_config()))"
+ ],
"metadata": {
- "id": "nnH4lXpm_F9g"
+ "id": "4B7NKw55bQ-k"
},
- "execution_count": 10,
+ "execution_count": null,
"outputs": []
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The gene IDs in the network can be accessed via the `node_ids` attribute, which\n",
+ "are [Entrez](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1761442/) gene IDs by\n",
+ "default."
+ ],
+ "metadata": {
+ "id": "5G5tGtPSb_Ob"
+ }
+ },
{
"cell_type": "code",
"source": [
- "g = obnb.data.BioGRID(root, version=data_version)"
+ "print(f\"The first gene in the network is {g.node_ids[0]!r}\")\n",
+ "print(f\"The second gene in the network is {g.node_ids[1]!r}\")\n",
+ "print(f\"The third gene in the network is {g.node_ids[2]!r}\")"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "ui07Xv9tBh3E",
- "outputId": "9f425f89-bc48-4215-962c-bb849eee1100"
+ "id": "gQrZ6besb5E_"
},
- "execution_count": 6,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[INFO][2023-07-10 23:16:03,767][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n",
- "[INFO][2023-07-10 23:16:03,770][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n",
- "[INFO][2023-07-10 23:16:03,773][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n",
- "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.3MB/s]\n",
- "[INFO][2023-07-10 23:16:07,863][download][download_unzip] Download completed, start unpacking...\n",
- "[INFO][2023-07-10 23:16:10,818][download][download_unzip] Done extracting\n",
- "[INFO][2023-07-10 23:16:10,825][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The graph `g` object is an instance of the `obnb.graph.SparseGraph` object.\n",
+ "But it could be easily converted into a dense adjacency matrix via `to_adjmat`"
+ ],
+ "metadata": {
+ "id": "yRbbhwpTbgwz"
+ }
},
{
"cell_type": "code",
"source": [
- "g = obnb.data.BioGRID(root, version=data_version)"
+ "adj = g.to_adjmat()\n",
+ "adj"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "lLqZ9f_-BoOt",
- "outputId": "8aa93790-55c3-419f-d001-71a157b4caaa"
+ "id": "BjX_K65nbfvV"
},
- "execution_count": 7,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[INFO][2023-07-10 23:16:12,911][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 2.2. Gene annotations"
+ ],
+ "metadata": {
+ "id": "IUeQvMf4WWM7"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Setting up gene annotation tasks is a tedious process that includes:\n",
+ "\n",
+ "1. Obtain annotations for gene-term associations and convert gene identifier to\n",
+ " the desired option.\n",
+ "1. Obtain and construct ontology graph that represents the relationships among\n",
+ " different terms.\n",
+ "1. Propagate the gene-term annotations upward along the ontology graph.\n",
+ "1. Extract non-redundant representative gene sets (terms) from the propagated\n",
+ " annotations.\n",
+ "\n",
+ "\n",
+ "Here, we use the [DisGeNET](https://www.disgenet.org/) disease gene annotations\n",
+ "with [MONDO](https://mondo.monarchinitiative.org/) disease ontology as an\n",
+ "example to set up the DisGeNET gene set collection.\n",
+ "\n",
+ "[3] Piñero, Janet, et al. \"DisGeNET: a comprehensive platform integrating information on human disease-associated genes and variants.\" Nucleic acids research (2016): gkw943.\n",
+ "\n",
+ "[4] Vasilevsky, Nicole A., et al. \"Mondo: Unifying diseases for the world, by the world.\" medRxiv (2022): 2022-04."
+ ],
+ "metadata": {
+ "id": "RXFunr8jfgA-"
+ }
},
{
"cell_type": "code",
"source": [
- "print(yaml.dump(g.to_config()))"
+ "# Download annotations and ontology from archive\n",
+ "gsc = obnb.data.DisGeNET(root, version=data_version)"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "1-vLqR-H_F6h",
- "outputId": "ba4aee5c-9a99-48b3-b519-7e37c6fd211a"
+ "id": "HGtLoOl8WNfh"
},
- "execution_count": 8,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "data_module: BioGRID\n",
- "data_module_params:\n",
- " cx_kwargs:\n",
- " interaction_types:\n",
- " - interacts-with\n",
- " node_id_entry: r\n",
- " node_id_prefix: ncbigene\n",
- " cx_uuid: 36f7d8fd-23dc-11e8-b939-0ac135e8bacf\n",
- " directed: false\n",
- " gene_id_converter: HumanEntrez\n",
- " largest_comp: true\n",
- " version: obnbdata-0.1.0\n",
- " weighted: false\n",
- "package_version: 0.1.1-dev\n",
- "processed_time: '2023-07-10 23:16:15'\n",
- "\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
},
{
"cell_type": "code",
"source": [
- "adj = g.to_adjmat()\n",
- "print(adj)"
+ "# Again, once downloaded and processed, it can be used in the future\n",
+ "gsc = obnb.data.DisGeNET(root, version=data_version)"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "-QtlyfJQ-6wT",
- "outputId": "ee33b889-5206-4e21-9a7d-44f33ceb25be"
+ "id": "PN2yNtAgjGSm"
},
- "execution_count": 9,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "[[0. 1. 0. ... 0. 0. 0.]\n",
- " [1. 0. 0. ... 0. 0. 0.]\n",
- " [0. 0. 0. ... 0. 0. 0.]\n",
- " ...\n",
- " [0. 0. 0. ... 0. 0. 0.]\n",
- " [0. 0. 0. ... 0. 0. 0.]\n",
- " [0. 0. 0. ... 0. 0. 0.]]\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Processing config can be inspected in a similar fashion as before"
+ ],
+ "metadata": {
+ "id": "aHvUg8NOjU8N"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(yaml.dump(gsc.to_config()))"
+ ],
+ "metadata": {
+ "id": "UEZQ-LJ5_vJO"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "obnb.label.LabelsetCollection"
+ ],
+ "metadata": {
+ "id": "TvKsu8rejken"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The `gsc` object is an instance of the `obnb.label.LabelsetCollection` object.\n",
+ "You can also convert it to a\n",
+ "[GMT](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29)-like\n",
+ "dataframe by calling the `to_df` method.\n",
+ "\n",
+ "The resulting dataframe is a table where the first three columns correspond to\n",
+ "the term ID, term info, and the number of genes associated with this term after\n",
+ "the processing. The rest of the columns are gene IDs that are associated with a\n",
+ "particular term, padded with `None`s."
+ ],
+ "metadata": {
+ "id": "jHAIQT6ujeKX"
+ }
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "gsc.to_df()"
+ ],
"metadata": {
- "id": "cmvRjsv6-6pj"
+ "id": "bxYbfhya_GAj"
},
"execution_count": null,
"outputs": []
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Constructing dataset"
+ ],
+ "metadata": {
+ "id": "t7aARNyJlEBk"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 3.1. The hard way: consolidate the network with gene set collection and combine into a dataset\n",
+ "\n",
+ "- Pros: Flexible filtering and dataset construction to help investigate specific\n",
+ " biological questions.\n",
+ "- Cons: Many steps involved to filter and pre-process."
+ ],
+ "metadata": {
+ "id": "nVOZQA6OlcLu"
+ }
+ },
{
"cell_type": "code",
"source": [
- "from obnb import OpenBiomedNetBench\n",
+ "from obnb.label import filters\n",
+ "from obnb.label.split import RatioPartition\n",
+ "from obnb.util.converter import GenePropertyConverter\n",
"\n",
- "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n",
- " version=data_version, graph_as_feature=True, use_dense_graph=True)"
+ "\n",
+ "# Load PubMed count gene property converter\n",
+ "pubmedcnt_converter = GenePropertyConverter(root, name=\"PubMedCount\")\n",
+ "\n",
+ "# 6/2/2 study-bias holdout split for genes\n",
+ "splitter = RatioPartition(0.6, 0.2, 0.2, ascending=False,\n",
+ " property_converter=pubmedcnt_converter)\n",
+ "\n",
+ "# Apply filters to the gene set collection\n",
+ "gsc_filtered = gsc.apply(\n",
+ " filters.Compose(\n",
+ " # Only use genes that are present in the network\n",
+ " filters.EntityExistenceFilter(list(g.node_ids), log_level=\"INFO\",),\n",
+ " # Remove any labelsets with less than 50 network genes\n",
+ " filters.LabelsetRangeFilterSize(min_val=50, log_level=\"INFO\",),\n",
+ " # Make sure each split has at least 10 positive examples\n",
+ " filters.LabelsetRangeFilterSplit(min_val=10, splitter=splitter, log_level=\"INFO\",),\n",
+ " log_level=\"INFO\",\n",
+ " ),\n",
+ ")"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "F6Di1ZJ--6md",
- "outputId": "ff9a0237-7b48-478f-a8c3-885a12af6c6d"
+ "id": "NOVBeVkPlOIX"
},
- "execution_count": 4,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[INFO][2023-07-10 23:25:45,356][base][download_archive] Loading BioGRID (version='obnbdata-0.1.0')...\n",
- "[INFO][2023-07-10 23:25:45,363][download][get_data_url] Download URL: https://zenodo.org/record/8045270/files/BioGRID.zip\n",
- "[INFO][2023-07-10 23:25:45,368][download][download_unzip] Downloading zip archive from https://zenodo.org/record/8045270/files/BioGRID.zip\n",
- "100%|██████████| 39.3M/39.3M [00:02<00:00, 14.9MB/s]\n",
- "[INFO][2023-07-10 23:25:49,102][download][download_unzip] Download completed, start unpacking...\n",
- "[INFO][2023-07-10 23:25:52,276][download][download_unzip] Done extracting\n",
- "[INFO][2023-07-10 23:25:52,283][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n",
- "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n",
- "[INFO][2023-07-10 23:26:03,783][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n",
- "[INFO][2023-07-10 23:26:04,977][base][_apply_transform] Before transformation:\n",
- "Number of labelsets: 1040\n",
- "max: 594\n",
- "min: 10\n",
- "med: 36.00\n",
- "avg: 85.61\n",
- "std: 120.19\n",
- "\n",
- "[INFO][2023-07-10 23:26:04,980][base][_apply_transform] Applying transformation:\n",
- "Composition of filters:\n",
- "\t- EntityExistenceFilter(remove_specified=False)\n",
- "\t- LabelsetRangeFilterSize(min_val=50, max_val=None)\n",
- "\t- LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True)\n",
- "\t- NegativeGeneratorHypergeom(p_thresh=0.05)\n",
- "EntityExistenceFilter(remove_specified=False): 100%|██████████| 9427/9427 [00:07<00:00, 1226.87it/s]\n",
- "[INFO][obnb.Compose][__call__] Number of labelsets: 1040\n",
- "max: 571\n",
- "min: 4\n",
- "med: 35.00\n",
- "avg: 81.62\n",
- "std: 114.00\n",
- "\n",
- "LabelsetRangeFilterSize(min_val=50, max_val=None): 100%|██████████| 1040/1040 [00:00<00:00, 3544.88it/s]\n",
- "[INFO][obnb.Compose][__call__] Number of labelsets: 406\n",
- "max: 571\n",
- "min: 50\n",
- "med: 118.00\n",
- "avg: 174.50\n",
- "std: 137.68\n",
- "\n",
- "LabelsetRangeFilterSplit(splitter=RatioPartition(property_converter=GenePropertyConverter(name='PubMedCount'), ascending=False, ratios=(0.6, 0.2, 0.2)), min_val=5, count_negatives=True): 100%|██████████| 406/406 [00:44<00:00, 9.20it/s]\n",
- "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n",
- "max: 571\n",
- "min: 50\n",
- "med: 159.00\n",
- "avg: 208.26\n",
- "std: 143.10\n",
- "\n",
- "Computing hypergeometric p-value matrix: 100%|██████████| 46360/46360 [00:54<00:00, 857.21it/s] \n",
- "NegativeGeneratorHypergeom(p_thresh=0.05): 100%|██████████| 305/305 [00:02<00:00, 113.19it/s]\n",
- "[INFO][obnb.Compose][__call__] Number of labelsets: 305\n",
- "max: 571\n",
- "min: 50\n",
- "med: 159.00\n",
- "avg: 208.26\n",
- "std: 143.10\n",
- "\n",
- "[INFO][2023-07-10 23:27:53,980][base][_apply_transform] After transformation:\n",
- "Number of labelsets: 305\n",
- "max: 571\n",
- "min: 50\n",
- "med: 159.00\n",
- "avg: 208.26\n",
- "std: 143.10\n",
- "\n",
- "[INFO][2023-07-10 23:27:53,998][base][_apply_transform] Saved cache transformation to datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n"
- ]
- }
- ]
+ "execution_count": null,
+ "outputs": []
},
{
"cell_type": "code",
"source": [
- "dataset = OpenBiomedNetBench(root=root, graph_name=\"BioGRID\", label_name=\"DisGeNET\",\n",
- " version=data_version, graph_as_feature=True, use_dense_graph=True)"
+ "# Combine into an OBNB dataset object\n",
+ "dataset = obnb.Dataset(\n",
+ " graph=g,\n",
+ " feature=g.to_dense_graph().to_feature(),\n",
+ " label=gsc_filtered,\n",
+ " splitter=splitter,\n",
+ ")"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "_I775-DC-6jL",
- "outputId": "daaa0cbc-8f3b-4927-f954-3963c9730dd9"
+ "id": "SU4bL7WGlOFG"
},
"execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[INFO][2023-07-10 23:19:55,923][base][load_processed_data] Load processed file datasets/BioGRID/processed/data.npz\n",
- "[INFO][obnb.GenePropertyConverter][_load_cache] Loaded gene conversion cache datasets/.cache/geneprop_convert-PubMedCount.json\n",
- "[INFO][2023-07-10 23:20:09,055][base][load_processed_data] Load processed file datasets/DisGeNET/processed/data.gmt\n",
- "[INFO][2023-07-10 23:20:12,024][base][_apply_transform] Loading cached transformed data from datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n",
- "[INFO][2023-07-10 23:20:12,028][base][load_processed_data] Load processed file datasets/DisGeNET/processed/.cache/3f21b172b9a43412952d179a1e7d9f3f/data.gmt\n"
- ]
- }
- ]
+ "outputs": []
},
{
"cell_type": "code",
"source": [
- "from obnb.model_trainer import LabelPropagationTrainer\n",
- "from obnb.model.label_propagation import OneHopPropagation\n",
- "\n",
- "mdl = OneHopPropagation()\n",
- "trainer = LabelPropagationTrainer()\n",
+ "dataset.graph"
+ ],
+ "metadata": {
+ "id": "HYDWpfnNlOBb"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "dataset.label"
+ ],
+ "metadata": {
+ "id": "UhKG5PFalN6_"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 3.2. The easy way: OBNB default dataset construction\n",
"\n",
- "results = trainer.fit_and_eval(mdl, dataset)"
+ "- Pros: Easy to construct the dataset as it abstracts away a lot of common steps.\n",
+ "- Cons: Less flexible and hard to construct specialized datasets."
],
"metadata": {
- "id": "QlCizk9x-6gD"
+ "id": "sk3HPD3JlXJe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "dataset = obnb.OpenBiomedNetBench(\n",
+ " root=root,\n",
+ " graph_name=\"BioPlex\",\n",
+ " label_name=\"DisGeNET\",\n",
+ " version=data_version,\n",
+ " graph_as_feature=True,\n",
+ " use_dense_graph=True,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "F6Di1ZJ--6md"
},
- "execution_count": 5,
+ "execution_count": null,
"outputs": []
},
+ {
+ "cell_type": "code",
+ "source": [
+ "# As in all previously shown cases, the dataset has a built-in cache utility\n",
+ "# to help speed up data loading after the first instantiation.\n",
+ "dataset = obnb.OpenBiomedNetBench(\n",
+ " root=root,\n",
+ " graph_name=\"BioPlex\",\n",
+ " label_name=\"DisGeNET\",\n",
+ " version=data_version,\n",
+ " graph_as_feature=True,\n",
+ " use_dense_graph=True,\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "_I775-DC-6jL"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 4. Simple model evaluation using the dataset and the built-in trainer"
+ ],
+ "metadata": {
+ "id": "DgV8pJf9otkk"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 4.1. Label propagation"
+ ],
+ "metadata": {
+ "id": "3RDHlp18pP0B"
+ }
+ },
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
- "df = pd.DataFrame(results, index=dataset.label.label_ids)\n",
- "df"
+ "from obnb.model_trainer import LabelPropagationTrainer\n",
+ "from obnb.model.label_propagation import OneHopPropagation"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 424
- },
- "id": "NBtgy76xIoc_",
- "outputId": "2c3e0472-c679-41d4-d293-c46b2c1d8059"
+ "id": "QlCizk9x-6gD"
},
- "execution_count": 10,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " train_apop val_apop test_apop train_auroc val_auroc \\\n",
- "MONDO:0021034 1.843529 2.163573 0.033863 0.675152 0.483754 \n",
- "MONDO:0002243 1.813295 0.303094 1.002558 0.701548 0.516716 \n",
- "MONDO:0002245 0.809497 0.620750 0.206753 0.637431 0.557898 \n",
- "MONDO:0001703 0.778886 1.511866 4.148121 0.548935 0.588175 \n",
- "MONDO:0013099 1.914333 2.262900 0.493989 0.646922 0.647186 \n",
- "... ... ... ... ... ... \n",
- "MONDO:0100284 0.603496 0.189524 0.000000 0.573916 0.527694 \n",
- "MONDO:0020019 1.583002 0.591055 0.879580 0.681023 0.518388 \n",
- "MONDO:0021002 1.211121 1.055366 1.127546 0.628159 0.585140 \n",
- "MONDO:0021017 2.228560 1.146228 0.000000 0.520709 0.528579 \n",
- "MONDO:0100459 3.250616 3.966312 0.060178 0.692115 0.708832 \n",
- "\n",
- " test_auroc \n",
- "MONDO:0021034 0.472385 \n",
- "MONDO:0002243 0.595784 \n",
- "MONDO:0002245 0.560433 \n",
- "MONDO:0001703 0.497549 \n",
- "MONDO:0013099 0.532789 \n",
- "... ... \n",
- "MONDO:0100284 0.413082 \n",
- "MONDO:0020019 0.598177 \n",
- "MONDO:0021002 0.629074 \n",
- "MONDO:0021017 0.462455 \n",
- "MONDO:0100459 0.517378 \n",
- "\n",
- "[305 rows x 6 columns]"
- ],
- "text/html": [
- "\n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " train_apop | \n",
- " val_apop | \n",
- " test_apop | \n",
- " train_auroc | \n",
- " val_auroc | \n",
- " test_auroc | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " MONDO:0021034 | \n",
- " 1.843529 | \n",
- " 2.163573 | \n",
- " 0.033863 | \n",
- " 0.675152 | \n",
- " 0.483754 | \n",
- " 0.472385 | \n",
- "
\n",
- " \n",
- " MONDO:0002243 | \n",
- " 1.813295 | \n",
- " 0.303094 | \n",
- " 1.002558 | \n",
- " 0.701548 | \n",
- " 0.516716 | \n",
- " 0.595784 | \n",
- "
\n",
- " \n",
- " MONDO:0002245 | \n",
- " 0.809497 | \n",
- " 0.620750 | \n",
- " 0.206753 | \n",
- " 0.637431 | \n",
- " 0.557898 | \n",
- " 0.560433 | \n",
- "
\n",
- " \n",
- " MONDO:0001703 | \n",
- " 0.778886 | \n",
- " 1.511866 | \n",
- " 4.148121 | \n",
- " 0.548935 | \n",
- " 0.588175 | \n",
- " 0.497549 | \n",
- "
\n",
- " \n",
- " MONDO:0013099 | \n",
- " 1.914333 | \n",
- " 2.262900 | \n",
- " 0.493989 | \n",
- " 0.646922 | \n",
- " 0.647186 | \n",
- " 0.532789 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " MONDO:0100284 | \n",
- " 0.603496 | \n",
- " 0.189524 | \n",
- " 0.000000 | \n",
- " 0.573916 | \n",
- " 0.527694 | \n",
- " 0.413082 | \n",
- "
\n",
- " \n",
- " MONDO:0020019 | \n",
- " 1.583002 | \n",
- " 0.591055 | \n",
- " 0.879580 | \n",
- " 0.681023 | \n",
- " 0.518388 | \n",
- " 0.598177 | \n",
- "
\n",
- " \n",
- " MONDO:0021002 | \n",
- " 1.211121 | \n",
- " 1.055366 | \n",
- " 1.127546 | \n",
- " 0.628159 | \n",
- " 0.585140 | \n",
- " 0.629074 | \n",
- "
\n",
- " \n",
- " MONDO:0021017 | \n",
- " 2.228560 | \n",
- " 1.146228 | \n",
- " 0.000000 | \n",
- " 0.520709 | \n",
- " 0.528579 | \n",
- " 0.462455 | \n",
- "
\n",
- " \n",
- " MONDO:0100459 | \n",
- " 3.250616 | \n",
- " 3.966312 | \n",
- " 0.060178 | \n",
- " 0.692115 | \n",
- " 0.708832 | \n",
- " 0.517378 | \n",
- "
\n",
- " \n",
- "
\n",
- "
305 rows × 6 columns
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n"
- ]
- },
- "metadata": {},
- "execution_count": 10
- }
- ]
+ "execution_count": null,
+ "outputs": []
},
{
"cell_type": "code",
"source": [
- "df.describe()"
+ "lp_mdl = OneHopPropagation()\n",
+ "lp_trainer = LabelPropagationTrainer()"
],
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 300
- },
- "id": "np234taeKVjQ",
- "outputId": "19e843c3-b927-4c82-ca9e-b42aea000202"
+ "id": "JyrNWeZ4pDA5"
},
- "execution_count": 11,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " train_apop val_apop test_apop train_auroc val_auroc test_auroc\n",
- "count 305.000000 305.000000 305.000000 305.000000 305.000000 305.000000\n",
- "mean 1.152127 1.003746 0.819258 0.623729 0.561479 0.521614\n",
- "std 0.753213 1.065416 1.116516 0.065599 0.082058 0.063061\n",
- "min 0.001297 -0.320246 -0.213516 0.485241 0.351099 0.375165\n",
- "25% 0.646890 0.255761 0.053465 0.582988 0.506204 0.473407\n",
- "50% 0.993217 0.623817 0.392785 0.620002 0.551109 0.521458\n",
- "75% 1.507760 1.465971 1.137018 0.659771 0.596552 0.560724\n",
- "max 5.851295 6.370345 6.111766 0.965775 0.951942 0.794405"
- ],
- "text/html": [
- "\n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " train_apop | \n",
- " val_apop | \n",
- " test_apop | \n",
- " train_auroc | \n",
- " val_auroc | \n",
- " test_auroc | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 305.000000 | \n",
- " 305.000000 | \n",
- " 305.000000 | \n",
- " 305.000000 | \n",
- " 305.000000 | \n",
- " 305.000000 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " 1.152127 | \n",
- " 1.003746 | \n",
- " 0.819258 | \n",
- " 0.623729 | \n",
- " 0.561479 | \n",
- " 0.521614 | \n",
- "
\n",
- " \n",
- " std | \n",
- " 0.753213 | \n",
- " 1.065416 | \n",
- " 1.116516 | \n",
- " 0.065599 | \n",
- " 0.082058 | \n",
- " 0.063061 | \n",
- "
\n",
- " \n",
- " min | \n",
- " 0.001297 | \n",
- " -0.320246 | \n",
- " -0.213516 | \n",
- " 0.485241 | \n",
- " 0.351099 | \n",
- " 0.375165 | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " 0.646890 | \n",
- " 0.255761 | \n",
- " 0.053465 | \n",
- " 0.582988 | \n",
- " 0.506204 | \n",
- " 0.473407 | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " 0.993217 | \n",
- " 0.623817 | \n",
- " 0.392785 | \n",
- " 0.620002 | \n",
- " 0.551109 | \n",
- " 0.521458 | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " 1.507760 | \n",
- " 1.465971 | \n",
- " 1.137018 | \n",
- " 0.659771 | \n",
- " 0.596552 | \n",
- " 0.560724 | \n",
- "
\n",
- " \n",
- " max | \n",
- " 5.851295 | \n",
- " 6.370345 | \n",
- " 6.111766 | \n",
- " 0.965775 | \n",
- " 0.951942 | \n",
- " 0.794405 | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- "
\n",
- "
\n",
- "
\n",
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n"
- ]
- },
- "metadata": {},
- "execution_count": 11
- }
- ]
+ "execution_count": null,
+ "outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "lp_results = lp_trainer.fit_and_eval(lp_mdl, dataset)"
+ ],
"metadata": {
- "id": "3I3Z0WJyKVQP"
+ "id": "5sWjyXv1pFuP"
},
"execution_count": null,
"outputs": []
@@ -1597,73 +599,101 @@
{
"cell_type": "code",
"source": [
- "from sklearn.linear_model import LogisticRegression\n",
- "from obnb.model_trainer import SupervisedLearningTrainer\n",
- "\n",
- "mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n",
- "trainer = SupervisedLearningTrainer()\n",
- "\n",
- "results2 = trainer.fit_and_eval(mdl, dataset)"
+ "lp_df = pd.DataFrame(lp_results, index=dataset.label.label_ids)\n",
+ "lp_df"
],
"metadata": {
- "id": "GdeNbEDz-6cx"
+ "id": "NBtgy76xIoc_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "lp_df.describe()"
+ ],
"metadata": {
- "id": "-NJKfTpx-6Z4"
+ "id": "np234taeKVjQ"
},
"execution_count": null,
"outputs": []
},
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 4.2. Supervised learning"
+ ],
+ "metadata": {
+ "id": "hkVYJQE8pR9F"
+ }
+ },
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from obnb.model_trainer import SupervisedLearningTrainer"
+ ],
"metadata": {
- "id": "3uHHEcsx-6Wy"
+ "id": "GdeNbEDz-6cx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "sl_mdl = LogisticRegression(penalty=\"l2\", solver=\"lbfgs\")\n",
+ "sl_trainer = SupervisedLearningTrainer()"
+ ],
"metadata": {
- "id": "-14ui8Jt-6Tf"
+ "id": "JXWOi3hGpfIG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "sl_results = sl_trainer.fit_and_eval(sl_mdl, dataset)"
+ ],
"metadata": {
- "id": "poGro_Qo-6Qz"
+ "id": "FTj2l-9ipj-4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "sl_df = pd.DataFrame(sl_results, index=dataset.label.label_ids)\n",
+ "sl_df"
+ ],
"metadata": {
- "id": "Hn4bHRIg-6Nz"
+ "id": "A4OZoPsipaiw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
- "source": [],
+ "source": [
+ "sl_df.describe()"
+ ],
"metadata": {
- "id": "v3FTLk__-5zs"
+ "id": "lxHqDjPupcT0"
},
"execution_count": null,
"outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 4.3. GNN (coming soon)"
+ ],
+ "metadata": {
+ "id": "Qf6Z7iBfpVfZ"
+ }
}
]
}
\ No newline at end of file