Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added function to convert one NCBI Gene tsv file to a Python module. #231

Merged
merged 1 commit into from
Nov 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions goatools/cli/ncbi_gene_results_to_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@
__author__ = "DV Klopfenstein"

import os
import sys
from sys import stdout
import re
import datetime
import collections as cx
from argparse import ArgumentParser
from goatools.parsers.ncbi_gene_file_reader import NCBIgeneFileReader


# pylint: disable=too-few-public-methods
def ncbi_tsv_to_py(fin_tsv, fout_py=None, prt=stdout):
"""Read a NCBI Gene file. Write data into one Python module per gene file"""
obj = NCBIgeneToPythonCli()
obj.ncbi_tsv_to_py(fin_tsv, fout_py, prt)

class NCBIgeneToPythonCli:
"""Read a NCBI Gene gene_result.txt file and write a Python module."""

Expand All @@ -26,8 +30,7 @@ class NCBIgeneToPythonCli:
'-o', '--outfile',
help='Write current citation report to an ASCII text file.')


def cli(self, prt=sys.stdout):
def cli(self, prt=stdout):
"""Command-line interface to print specified GO Terms from the DAG source ."""
args = self.argparser.parse_args()
# Aggregate all NCBI Gene data into a single output file
Expand All @@ -36,17 +39,21 @@ def cli(self, prt=sys.stdout):
return
self.tsv_to_py_each(args.NCBI_gene_tsv, args.outfile, prt)

def tsv_to_py(self, fin_tsv, fout_py=None, prt=sys.stdout):
def tsv_to_py(self, fin_tsv, fout_py=None, prt=stdout):
"""Read each NCBI Gene files. Write data into one Python module per gene file"""
self.tsv_to_py_each([fin_tsv], fout_py, prt)

def tsv_to_py_each(self, fin_tsvs, fout_py=None, prt=sys.stdout):
def tsv_to_py_each(self, fin_tsvs, fout_py=None, prt=stdout):
"""Read each NCBI Gene files. Write data into one Python module per gene file"""
in_outs = self._get_io_filenames(fin_tsvs, fout_py)
for fin_tsv, fo_py in in_outs:
nts = NCBIgeneFileReader(fin_tsv).get_nts()
geneid2nt = self._get_geneid2nt(nts)
self._wrpy_ncbi_gene_nts(fo_py, geneid2nt, prt)
self.ncbi_tsv_to_py(fin_tsv, fo_py, prt)

def ncbi_tsv_to_py(self, fin_tsv, fout_py=None, prt=stdout):
"""Read a NCBI Gene file. Write data into one Python module per gene file"""
nts = NCBIgeneFileReader(fin_tsv).get_nts()
geneid2nt = self._get_geneid2nt(nts)
self._wrpy_ncbi_gene_nts(fout_py, geneid2nt, prt)

def _get_io_filenames(self, fin_tsvs, fout_py):
"""Get one output file for each input file"""
Expand Down Expand Up @@ -74,7 +81,7 @@ def _get_foutpy(basename, cnt):
return '{F}.py'.format(F=basename)
return '{F}{N}.py'.format(F=basename, N=cnt)

def tsv_to_py_all(self, fin_tsvs, fout_py=None, prt=sys.stdout):
def tsv_to_py_all(self, fin_tsvs, fout_py=None, prt=stdout):
"""Read all NCBI Gene files. Write all data into one Python module"""
nts = self._read_tsvs_all(fin_tsvs)
geneid2nt = self._get_geneid2nt(nts)
Expand Down
14 changes: 7 additions & 7 deletions notebooks/annotation_coverage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
}
],
"source": [
"# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz\n",
"# wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz\n",
"from goatools.base import download_ncbi_associations\n",
"gene2go = download_ncbi_associations()"
]
Expand All @@ -54,8 +54,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"HMS:0:00:07.574562 323,107 annotations, 19,649 genes, 18,246 GOs, 1 taxids READ: gene2go \n",
"HMS:0:00:03.391011 101,655 annotations, 13,617 genes, 8,512 GOs, 1 taxids READ: gene2go \n"
"HMS:0:00:07.954549 331,423 annotations, 20,689 genes, 18,627 GOs, 1 taxids READ: gene2go \n",
"HMS:0:00:04.919504 104,435 annotations, 13,813 genes, 8,683 GOs, 1 taxids READ: gene2go \n"
]
}
],
Expand All @@ -82,7 +82,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"HMS:0:00:06.648749 424,762 annotations, 33,266 genes, 19,699 GOs, 2 taxids READ: gene2go \n"
"HMS:0:00:12.061128 435,858 annotations, 34,502 genes, 20,059 GOs, 2 taxids READ: gene2go \n"
]
}
],
Expand Down Expand Up @@ -111,7 +111,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"19,919 human genes\n",
"19,658 human genes\n",
"13,968 fly genes\n"
]
}
Expand Down Expand Up @@ -182,8 +182,8 @@
"text": [
" taxid GOs GeneIDs Coverage\n",
"------ ------ ------- ----------------------\n",
" 9606 18,103 18,598 93% GO coverage of 19,919 protein-coding genes\n",
" 7227 8,417 10,660 76% GO coverage of 13,968 protein-coding genes\n"
" 9606 18,470 18,725 95% GO coverage of 19,658 protein-coding genes\n",
" 7227 8,587 10,952 78% GO coverage of 13,968 protein-coding genes\n"
]
}
],
Expand Down
44 changes: 37 additions & 7 deletions notebooks/background_genes_ncbi.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,51 @@
"![](images/dnld_mouse_pcd_genes.png)\n",
"\n",
"## 2) Convert NCBI Gene tab separated values (tsv) file to a Python module\n",
"Use the command line or a Python script to convert an NCBI Gene tsv file to a Python module.\n",
"\n",
"**A GOA Tools Python script will convert a NCBI Gene tsv file to a Python module:**\n",
"### 2a) Run a script from the command line\n",
"```\n",
"$ scripts/ncbi_gene_results_to_python.py gene_result.txt -o genes_ncbi_10090_proteincoding.py\n",
" 26,386 lines READ: gene_result.txt\n",
" 26,376 geneids WROTE: genes_ncbi_10090_proteincoding.py\n",
"```\n",
"\n",
"### 2b) Run a function from inside your Python script"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 26,386 lines READ: gene_result.txt\n",
" 26,376 geneids WROTE: genes_ncbi_10090_proteincoding.py\n"
]
}
],
"source": [
"from goatools.cli.ncbi_gene_results_to_python import ncbi_tsv_to_py\n",
"\n",
"ncbi_tsv = 'gene_result.txt'\n",
"output_py = 'genes_ncbi_10090_proteincoding.py'\n",
"ncbi_tsv_to_py(ncbi_tsv, output_py)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3) Explore NCBI gene data\n",
"### 3a) Import NCBI data from new NCBI gene Python module"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -51,7 +81,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"scrolled": true
},
Expand Down Expand Up @@ -99,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -125,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -173,7 +203,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -199,7 +229,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down
Loading