Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new wrangler notebook #112

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ Change Log
------


3.0.3
=====


* Added new wrangling notebook 16_check_publication_in_GEO.ipynb to check for associated GEO datasets for a given list of PMIDs.


3.0.2
=====

Expand Down
142 changes: 142 additions & 0 deletions notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d4da4f53-c565-4f8f-8c06-fe4884db3a52",
"metadata": {},
"source": [
"### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS\n",
"\n",
"#### For each PMID in the input list, prints the associated GEO dataset accession(s) (GSEXXX)\n",
"\n",
"#### Protocol:\n",
"\n",
"#### Register for an NCBI account and copy the API key shown under NCBI account > NCBI Account Settings\n",
"#### Provide a file listing the publications' PMIDs, one per line: numbers only, no prefix.\n",
"\n",
"#### NOTE:\n",
"#### Requires the Biopython package (provides the Bio.Entrez module)\n",
"#### Entrez allows at most 10 requests per second when an API key is provided; otherwise the limit is 3 requests per second."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62ec2562-0707-4d78-9f59-bedb6c1812cc",
"metadata": {},
"outputs": [],
"source": [
"from Bio import Entrez\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d66e411-3c92-4855-a8a7-877b473edf20",
"metadata": {},
"outputs": [],
"source": [
"# Set up NCBI creds\n",
"\n",
"Entrez.email = \"\" # Email associated with your NCBI account\n",
"Entrez.api_key = \"\" # API key can be copied from your NCBI account > NCBI Account Settings\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fed6fe3-b282-43bc-862d-bbbb677165a3",
"metadata": {},
"outputs": [],
"source": [
"def get_gse(pmid_accession_list):\n",
" no_gse = False\n",
" \n",
" for pmid in pmid_accession_list:\n",
" handle_gds_uid = Entrez.elink(dbfrom=\"pubmed\", id=pmid, db=\"gds\") #get GDS UID (which is different from GSE accession)\n",
" record_gds_uid = Entrez.read(handle_gds_uid)\n",
" handle_gds_uid.close()\n",
" if len(record_gds_uid[0][\"LinkSetDb\"]) == 0: #If there is no GEO datasets (GDS) with the PMID\n",
" no_gse = True\n",
" gse_accession = \"Not (yet) in GEO\"\n",
" elif len(record_gds_uid[0]['LinkSetDb'][0]['Link']) >= 1: #If PMID is associated with GEO Datasets (GDS)\n",
" gds_uids = []\n",
" list_gds_acc = record_gds_uid[0]['LinkSetDb'][0]['Link']\n",
" for gse_acc in list_gds_acc:\n",
" gds_uids.append(gse_acc['Id'])\n",
" for ids in gds_uids:\n",
" handle2_gse_acc = Entrez.esummary(db=\"gds\", id=ids) # convert GDS UID to GSE accession\n",
" record2_gse_acc = Entrez.read(handle2_gse_acc)\n",
" handle2_gse_acc.close()\n",
" gse_accession = record2_gse_acc[0]['Accession']\n",
" print(pmid,gse_accession)\n",
" time.sleep(1) \n",
" \n",
" if no_gse:\n",
" print(pmid, gse_accession)\n",
" time.sleep(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e45d94eb-0e06-491c-b05c-39589e682fd8",
"metadata": {},
"outputs": [],
"source": [
"# File with a list of PMIDs - one on each line.\n",
"\n",
"input_file = ''\n",
"chunks = 5 #Splits the PMID list into specified no. of values per list\n",
"\n",
"with open(input_file) as file1:\n",
"\n",
" all_pmids = []\n",
" for pmid in file1:\n",
" pmid = pmid.rstrip()\n",
" all_pmids.append(str(pmid))\n",
"\n",
"for count in range(0,len(all_pmids),chunks):\n",
" pmid_accession_list = all_pmids[count:count+chunks]\n",
" get_gse(pmid_accession_list)\n",
" time.sleep(2)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "257f63a4-f5e4-4d37-9c4b-68e17ff0634b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"vscode": {
"interpreter": {
"hash": "20d91ac981d81ffaf62d1b59390659afd05fbcd41a0bec0e13249d20dc131a1e"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicwrangling"
version = "3.0.2"
version = "3.0.3"
description = "Scripts and Jupyter notebooks for 4DN wrangling"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
Loading