tanghaibao · dvklopfenstein · Nov 7, 2021 · Nov 7, 2021
diff --git a/goatools/cli/ncbi_gene_results_to_python.py b/goatools/cli/ncbi_gene_results_to_python.py
@@ -6,15 +6,19 @@
 __author__ = "DV Klopfenstein"
 
 import os
-import sys
+from sys import stdout
 import re
 import datetime
 import collections as cx
 from argparse import ArgumentParser
 from goatools.parsers.ncbi_gene_file_reader import NCBIgeneFileReader
 
 
-# pylint: disable=too-few-public-methods
+def ncbi_tsv_to_py(fin_tsv, fout_py=None, prt=stdout):
+    """Read one NCBI Gene tsv file. Write its data into one Python module"""
+    obj = NCBIgeneToPythonCli()
+    obj.ncbi_tsv_to_py(fin_tsv, fout_py, prt)
+
 class NCBIgeneToPythonCli:
     """Read a NCBI Gene gene_result.txt file and write a Python module."""
 
@@ -26,8 +30,7 @@ class NCBIgeneToPythonCli:
         '-o', '--outfile',
         help='Write current citation report to an ASCII text file.')
 
-
-    def cli(self, prt=sys.stdout):
+    def cli(self, prt=stdout):
         """Command-line interface to print specified GO Terms from the DAG source ."""
         args = self.argparser.parse_args()
         # Aggregate all NCBI Gene data into a single output file
@@ -36,17 +39,21 @@ def cli(self, prt=sys.stdout):
             return
         self.tsv_to_py_each(args.NCBI_gene_tsv, args.outfile, prt)
 
-    def tsv_to_py(self, fin_tsv, fout_py=None, prt=sys.stdout):
+    def tsv_to_py(self, fin_tsv, fout_py=None, prt=stdout):
         """Read each NCBI Gene files. Write data into one Python module per gene file"""
         self.tsv_to_py_each([fin_tsv], fout_py, prt)
 
-    def tsv_to_py_each(self, fin_tsvs, fout_py=None, prt=sys.stdout):
+    def tsv_to_py_each(self, fin_tsvs, fout_py=None, prt=stdout):
         """Read each NCBI Gene files. Write data into one Python module per gene file"""
         in_outs = self._get_io_filenames(fin_tsvs, fout_py)
         for fin_tsv, fo_py in in_outs:
-            nts = NCBIgeneFileReader(fin_tsv).get_nts()
-            geneid2nt = self._get_geneid2nt(nts)
-            self._wrpy_ncbi_gene_nts(fo_py, geneid2nt, prt)
+            self.ncbi_tsv_to_py(fin_tsv, fo_py, prt)
+
+    def ncbi_tsv_to_py(self, fin_tsv, fout_py=None, prt=stdout):
+        """Read one NCBI Gene tsv file. Write its data into one Python module"""
+        nts = NCBIgeneFileReader(fin_tsv).get_nts()
+        geneid2nt = self._get_geneid2nt(nts)
+        self._wrpy_ncbi_gene_nts(fout_py, geneid2nt, prt)
 
     def _get_io_filenames(self, fin_tsvs, fout_py):
         """Get one output file for each input file"""
@@ -74,7 +81,7 @@ def _get_foutpy(basename, cnt):
             return '{F}.py'.format(F=basename)
         return '{F}{N}.py'.format(F=basename, N=cnt)
 
-    def tsv_to_py_all(self, fin_tsvs, fout_py=None, prt=sys.stdout):
+    def tsv_to_py_all(self, fin_tsvs, fout_py=None, prt=stdout):
         """Read all NCBI Gene files. Write all data into one Python module"""
         nts = self._read_tsvs_all(fin_tsvs)
         geneid2nt = self._get_geneid2nt(nts)

diff --git a/notebooks/annotation_coverage.ipynb b/notebooks/annotation_coverage.ipynb
@@ -31,7 +31,7 @@
     }
    ],
    "source": [
-    "# Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz\n",
+    "# wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz\n",
     "from goatools.base import download_ncbi_associations\n",
     "gene2go = download_ncbi_associations()"
    ]
@@ -54,8 +54,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "HMS:0:00:07.574562 323,107 annotations, 19,649 genes, 18,246 GOs, 1 taxids READ: gene2go \n",
-      "HMS:0:00:03.391011 101,655 annotations, 13,617 genes,  8,512 GOs, 1 taxids READ: gene2go \n"
+      "HMS:0:00:07.954549 331,423 annotations, 20,689 genes, 18,627 GOs, 1 taxids READ: gene2go \n",
+      "HMS:0:00:04.919504 104,435 annotations, 13,813 genes,  8,683 GOs, 1 taxids READ: gene2go \n"
      ]
     }
    ],
@@ -82,7 +82,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "HMS:0:00:06.648749 424,762 annotations, 33,266 genes, 19,699 GOs, 2 taxids READ: gene2go \n"
+      "HMS:0:00:12.061128 435,858 annotations, 34,502 genes, 20,059 GOs, 2 taxids READ: gene2go \n"
      ]
     }
    ],
@@ -111,7 +111,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "19,919 human genes\n",
+      "19,658 human genes\n",
       "13,968 fly genes\n"
      ]
     }
@@ -182,8 +182,8 @@
      "text": [
       " taxid    GOs GeneIDs Coverage\n",
       "------ ------ ------- ----------------------\n",
-      "  9606 18,103  18,598  93% GO coverage of 19,919 protein-coding genes\n",
-      "  7227  8,417  10,660  76% GO coverage of 13,968 protein-coding genes\n"
+      "  9606 18,470  18,725  95% GO coverage of 19,658 protein-coding genes\n",
+      "  7227  8,587  10,952  78% GO coverage of 13,968 protein-coding genes\n"
      ]
     }
    ],

diff --git a/notebooks/background_genes_ncbi.ipynb b/notebooks/background_genes_ncbi.ipynb
@@ -21,21 +21,51 @@
     "![](images/dnld_mouse_pcd_genes.png)\n",
     "\n",
     "## 2) Convert NCBI Gene tab separated values (tsv) file to a Python module\n",
+    "Use the command line or a Python script to convert an NCBI Gene tsv file to a Python module.\n",
     "\n",
-    "**A GOA Tools Python script will convert a NCBI Gene tsv file to a Python module:**\n",
+    "### 2a) Run a script from the command line\n",
     "```\n",
     "$ scripts/ncbi_gene_results_to_python.py gene_result.txt -o genes_ncbi_10090_proteincoding.py\n",
     "      26,386 lines READ:  gene_result.txt\n",
     "      26,376 geneids WROTE: genes_ncbi_10090_proteincoding.py\n",
     "```\n",
     "\n",
+    "### 2b) Run a function from inside your Python script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "      26,386 lines READ:  gene_result.txt\n",
+      "      26,376 geneids WROTE: genes_ncbi_10090_proteincoding.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "from goatools.cli.ncbi_gene_results_to_python import ncbi_tsv_to_py\n",
+    "\n",
+    "ncbi_tsv = 'gene_result.txt'\n",
+    "output_py = 'genes_ncbi_10090_proteincoding.py'\n",
+    "ncbi_tsv_to_py(ncbi_tsv, output_py)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## 3) Explore NCBI gene data\n",
     "### 3a) Import NCBI data from new NCBI gene Python module"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {
     "scrolled": true
    },
@@ -99,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -125,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -173,7 +203,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -199,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {