Create data/diseases.tsv with summary info

Supersedes cognoma#45
dhimmel · Apr 13, 2018 · 9f9f675 · 9f9f675
1 parent 2924585
commit 9f9f675
Show file tree

Hide file tree

Showing 3 changed files with 180 additions and 6 deletions.
diff --git a/2.TCGA-process.ipynb b/2.TCGA-process.ipynb
@@ -1724,6 +1724,110 @@
     "y_gene_df.head(2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Cancer type (disease) stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>acronym</th>\n",
+       "      <th>disease</th>\n",
+       "      <th>n_samples</th>\n",
+       "      <th>n_clinical_samples</th>\n",
+       "      <th>n_expression_samples</th>\n",
+       "      <th>n_mutation_samples</th>\n",
+       "      <th>median_mutations</th>\n",
+       "      <th>mean_mutations</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ACC</td>\n",
+       "      <td>adrenocortical cancer</td>\n",
+       "      <td>79</td>\n",
+       "      <td>92</td>\n",
+       "      <td>79</td>\n",
+       "      <td>92</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>68.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>BLCA</td>\n",
+       "      <td>bladder urothelial carcinoma</td>\n",
+       "      <td>403</td>\n",
+       "      <td>409</td>\n",
+       "      <td>405</td>\n",
+       "      <td>407</td>\n",
+       "      <td>164.0</td>\n",
+       "      <td>228.168734</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  acronym                       disease  n_samples  n_clinical_samples  \\\n",
+       "0     ACC         adrenocortical cancer         79                  92   \n",
+       "1    BLCA  bladder urothelial carcinoma        403                 409   \n",
+       "\n",
+       "   n_expression_samples  n_mutation_samples  median_mutations  mean_mutations  \n",
+       "0                    79                  92              27.0       68.000000  \n",
+       "1                   405                 407             164.0      228.168734  "
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n",
+    "\n",
+    "def get_cancer_count_column(sample_ids):\n",
+    "    \"\"\"\n",
+    "    sample_ids is an iterable of sample IDs (coerced to a pandas.Series)\n",
+    "    \"\"\"\n",
+    "    sample_ids = pandas.Series(sample_ids)\n",
+    "    aconyms = sample_ids.map(sample_to_acronym)\n",
+    "    counter = collections.Counter(aconyms)\n",
+    "    counts = disease_df.acronym.map(counter)\n",
+    "    return counts.fillna(0).astype(int)\n",
+    "\n",
+    "# Compute number of samples per disease (cancer type)\n",
+    "disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n",
+    "disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n",
+    "disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n",
+    "disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n",
+    "\n",
+    "# Compute n_mutation summaries for samples in the aligned set\n",
+    "acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n",
+    "groups = y_df.sum(axis='columns').groupby(acronyms)\n",
+    "disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n",
+    "disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n",
+    "\n",
+    "# Export to TSV\n",
+    "path = os.path.join('data', 'diseases.tsv')\n",
+    "disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n",
+    "\n",
+    "disease_df.head(2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1735,7 +1839,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1745,7 +1849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1765,7 +1869,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [

diff --git a/data/diseases.tsv b/data/diseases.tsv
@@ -0,0 +1,34 @@
+acronym	disease	n_samples	n_clinical_samples	n_expression_samples	n_mutation_samples	median_mutations	mean_mutations
+ACC	adrenocortical cancer	79	92	79	92	27.0	68.0
+BLCA	bladder urothelial carcinoma	403	409	405	407	164.0	228.2
+BRCA	breast invasive carcinoma	787	1092	1090	789	37.0	76.5
+CESC	cervical & endocervical cancer	286	307	304	289	87.0	181.9
+CHOL	cholangiocarcinoma	36	36	36	36	36.0	52.2
+COAD	colon adenocarcinoma	287	457	449	288	122.0	444.0
+DLBC	diffuse large B-cell lymphoma	37	48	48	37	108.0	122.6
+ESCA	esophageal carcinoma	183	185	184	184	106.0	139.2
+GBM	glioblastoma multiforme	149	588	153	311	48.0	111.7
+HNSC	head & neck squamous cell carcinoma	499	528	520	507	102.0	142.2
+KICH	kidney chromophobe	66	66	66	66	21.0	33.2
+KIRC	kidney clear cell carcinoma	366	536	533	368	54.0	56.3
+KIRP	kidney papillary cell carcinoma	280	291	290	281	58.5	59.0
+LAML	acute myeloid leukemia	0	200	173	0		
+LGG	brain lower grade glioma	510	515	515	510	29.0	45.8
+LIHC	liver hepatocellular carcinoma	357	376	370	362	84.0	102.2
+LUAD	lung adenocarcinoma	509	519	515	513	189.0	263.1
+LUSC	lung squamous cell carcinoma	477	504	501	480	217.0	257.3
+MESO	mesothelioma	81	87	87	81	28.0	31.5
+OV	ovarian serous cystadenocarcinoma	14	537	295	62	67.0	70.4
+PAAD	pancreatic adenocarcinoma	170	185	178	177	35.5	88.0
+PCPG	pheochromocytoma & paraganglioma	179	179	179	179	9.0	10.4
+PRAD	prostate adenocarcinoma	494	498	497	495	27.0	41.0
+READ	rectum adenocarcinoma	89	166	159	89	95.0	230.8
+SARC	sarcoma	234	261	259	236	38.5	60.0
+SKCM	skin cutaneous melanoma	103	108	103	104	235.0	342.1
+STAD	stomach adenocarcinoma	412	443	415	439	114.0	324.0
+TGCT	testicular germ cell tumor	129	134	134	129	14.0	16.2
+THCA	thyroid carcinoma	490	507	505	492	10.0	11.4
+THYM	thymoma	118	124	120	122	13.0	21.0
+UCEC	uterine corpus endometrioid carcinoma	436	547	532	447	80.0	804.6
+UCS	uterine carcinosarcoma	57	57	57	57	49.0	120.2
+UVM	uveal melanoma	80	80	80	80	12.0	17.2
diff --git a/scripts/2.TCGA-process.py b/scripts/2.TCGA-process.py
@@ -444,18 +444,54 @@
 y_gene_df.head(2)
 
 
+# ### Cancer type (disease) stats
+
+# In[36]:
+
+
+# Lookup from sample_id to acronym, built from the clinmat_df columns
+sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))
+
+def get_cancer_count_column(sample_ids):
+    """
+    Count the samples in `sample_ids` for each acronym in `disease_df`.
+
+    sample_ids is any iterable of sample IDs; it is coerced to a
+    pandas.Series. Each ID is translated through `sample_to_acronym`, the
+    resulting acronyms are tallied, and the tally is looked up for every
+    value of `disease_df.acronym`.
+
+    Returns the looked-up counts as an integer pandas.Series, with missing
+    values filled with 0.
+    """
+    sample_ids = pandas.Series(sample_ids)
+    # IDs absent from sample_to_acronym presumably become NaN here and so
+    # never match a disease acronym -- TODO confirm
+    aconyms = sample_ids.map(sample_to_acronym)
+    counter = collections.Counter(aconyms)
+    counts = disease_df.acronym.map(counter)
+    return counts.fillna(0).astype(int)
+
+# Compute number of samples per disease (cancer type)
+# One count column per source of sample IDs
+disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)
+disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)
+disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)
+disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)
+
+# Compute n_mutation summaries for samples in the aligned set
+# Row sums of y_df are grouped by the acronym of each row's sample
+acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))
+groups = y_df.sum(axis='columns').groupby(acronyms)
+# Acronyms without a group get no value (LAML is blank in data/diseases.tsv)
+disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))
+disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))
+
+# Export to TSV
+# float_format='%.1f' writes float columns with one decimal place
+path = os.path.join('data', 'diseases.tsv')
+disease_df.to_csv(path, sep='\t', float_format='%.1f', index=False)
+
+disease_df.head(2)
+
+
 # ### Export matrices to TSVs
 # 
 # Matrices are saved as sample × gene TSVs. Subsetted matrices are also exported to allow users to quickly explore small portions of the dataset.
 
-# In[36]:
+# In[37]:
 
 
 path = os.path.join('data', 'samples.tsv')
 sample_df.to_csv(path, sep='\t', float_format='%.0f', index=False)
 
 
-# In[37]:
+# In[38]:
 
 
 def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
@@ -472,7 +508,7 @@ def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
     )
 
 
-# In[38]:
+# In[39]:
 
 
 tsv_args = {'sep': '\t', 'float_format': '%.3g'}