4dn-dcic · RahiNav · Feb 29, 2024 · Mar 25, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,13 @@ Change Log
 ------
 
 
+3.0.3
+=====
+
+
+* Added new wrangling notebook 16_check_publication_in_GEO.ipynb to check for associated GEO datasets for a given list of PMIDs.
+
+
 3.0.2
 =====
 

diff --git a/notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb b/notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb
@@ -0,0 +1,142 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d4da4f53-c565-4f8f-8c06-fe4884db3a52",
+   "metadata": {},
+   "source": [
+    "### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS\n",
+    "\n",
+    "#### For a given PMID, provides associated GEO dataset ID(s) (GSEXXX)\n",
+    "\n",
+    "#### Protocol:\n",
+    "\n",
+    "#### Register for an account with NCBI and copy the API key shown under NCBI account > NCBI Account Settings\n",
+    "#### Provide a file with a list of publication IDs (PMIDs), one per line - just the numbers, no prefix.\n",
+    "\n",
+    "#### NOTE:\n",
+    "#### Requires the Biopython module\n",
+    "#### Entrez allows a maximum of 10 requests per second if an API key is provided; otherwise the limit is 3 requests per second."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62ec2562-0707-4d78-9f59-bedb6c1812cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Bio import Entrez\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d66e411-3c92-4855-a8a7-877b473edf20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up NCBI creds\n",
+    "\n",
+    "Entrez.email = \"\"  # Email associated with your NCBI account\n",
+    "Entrez.api_key = \"\"  # API key can be copied from your NCBI account > NCBI Account Settings\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fed6fe3-b282-43bc-862d-bbbb677165a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_gse(pmid_accession_list):\n",
+    "    \"\"\"Print the GEO accession(s) linked to each PMID.\n",
+    "\n",
+    "    For every PMID, one '<pmid> <accession>' line is printed per linked GEO\n",
+    "    DataSets (GDS) record, or '<pmid> Not (yet) in GEO' when none is linked.\n",
+    "\n",
+    "    Args:\n",
+    "        pmid_accession_list: iterable of PMIDs (numbers only, no prefix).\n",
+    "    \"\"\"\n",
+    "    for pmid in pmid_accession_list:\n",
+    "        # elink returns GDS UIDs, which are different from GSE accessions\n",
+    "        handle_gds_uid = Entrez.elink(dbfrom=\"pubmed\", id=pmid, db=\"gds\")\n",
+    "        record_gds_uid = Entrez.read(handle_gds_uid)\n",
+    "        handle_gds_uid.close()\n",
+    "\n",
+    "        # LinkSetDb is empty when no GEO dataset (GDS) is linked to the PMID\n",
+    "        link_sets = record_gds_uid[0][\"LinkSetDb\"]\n",
+    "        gds_uids = [link['Id'] for link in link_sets[0]['Link']] if link_sets else []\n",
+    "\n",
+    "        # Decide per PMID: a flag shared across iterations would stay set after the\n",
+    "        # first miss and re-print later PMIDs with a stale accession\n",
+    "        if not gds_uids:\n",
+    "            print(pmid, \"Not (yet) in GEO\")\n",
+    "            time.sleep(1)\n",
+    "            continue\n",
+    "\n",
+    "        for gds_uid in gds_uids:\n",
+    "            # esummary converts the GDS UID into its accession\n",
+    "            handle2_gse_acc = Entrez.esummary(db=\"gds\", id=gds_uid)\n",
+    "            record2_gse_acc = Entrez.read(handle2_gse_acc)\n",
+    "            handle2_gse_acc.close()\n",
+    "            print(pmid, record2_gse_acc[0]['Accession'])\n",
+    "            time.sleep(1)  # throttle requests to stay under the Entrez rate limit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e45d94eb-0e06-491c-b05c-39589e682fd8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# File with a list of PMIDs - one on each line.\n",
+    "\n",
+    "input_file = ''\n",
+    "chunks = 5       # Number of PMIDs handed to get_gse() per batch\n",
+    "\n",
+    "with open(input_file) as file1:\n",
+    "    # Strip surrounding whitespace and skip blank lines so that an empty ID\n",
+    "    # (e.g. from a trailing newline in the file) is never sent to Entrez\n",
+    "    all_pmids = [line.strip() for line in file1 if line.strip()]\n",
+    "\n",
+    "# Query the PMIDs in batches of `chunks`, pausing between batches\n",
+    "for count in range(0, len(all_pmids), chunks):\n",
+    "    pmid_accession_list = all_pmids[count:count + chunks]\n",
+    "    get_gse(pmid_accession_list)\n",
+    "    time.sleep(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "257f63a4-f5e4-4d37-9c4b-68e17ff0634b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "20d91ac981d81ffaf62d1b59390659afd05fbcd41a0bec0e13249d20dc131a1e"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicwrangling"
-version = "3.0.2"
+version = "3.0.3"
 description = "Scripts and Jupyter notebooks for 4DN wrangling"
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"