cognoma · dhimmel · Apr 16, 2018 · Apr 12, 2018 · Apr 12, 2018 · Apr 13, 2018
diff --git a/2.TCGA-process.ipynb b/2.TCGA-process.ipynb
@@ -931,7 +931,8 @@
    ],
    "source": [
     "# Number of samples with at least one mutation\n",
-    "snp_mutation_df.sample_id.nunique()"
+    "samples_with_mutation_calls = sorted(set(snp_mutation_df.sample_id))\n",
+    "len(samples_with_mutation_calls)"
    ]
   },
   {
@@ -1211,7 +1212,7 @@
     {
      "data": {
       "text/plain": [
-       "(9093, 20224)"
+       "(9104, 20224)"
       ]
      },
      "execution_count": 22,
@@ -1226,6 +1227,7 @@
     "                 columns='entrez_gene_id',\n",
     "                 values='count',\n",
     "                 fill_value=0)\n",
+    "    .reindex(samples_with_mutation_calls, fill_value=0)\n",
     "    .astype(bool).astype(int)\n",
     ")\n",
     "gene_mutation_mat_df.columns = gene_mutation_mat_df.columns.astype(str)\n",
@@ -1514,7 +1516,7 @@
     "\n",
     "Find samples with both mutation and expression data.\n",
     "\n",
-    "We assume that if a sample was not in the `MC3` data, it was not assayed for mutation. Hence, zero-mutation cancers are excluded even if they have mutation data."
+    "We assume that if a sample was not in the `MC3` data, it was not assayed for mutation ([more info](https://github.com/cognoma/cancer-data/issues/43#issuecomment-380957274))."
    ]
   },
   {
@@ -1525,7 +1527,7 @@
     {
      "data": {
       "text/plain": [
-       "8388"
+       "8397"
       ]
      },
      "execution_count": 31,
@@ -1597,17 +1599,17 @@
        "      <td>A1BG</td>\n",
        "      <td>alpha-1-B glycoprotein</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>6.540659</td>\n",
-       "      <td>2.308643</td>\n",
+       "      <td>6.540815</td>\n",
+       "      <td>2.307743</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>100</th>\n",
        "      <td>100</td>\n",
        "      <td>ADA</td>\n",
        "      <td>adenosine deaminase</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>7.445200</td>\n",
-       "      <td>1.538660</td>\n",
+       "      <td>7.444250</td>\n",
+       "      <td>1.538879</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1621,8 +1623,8 @@
        "\n",
        "        mean_expression  stdev_expression  \n",
        "sample                                     \n",
-       "1              6.540659          2.308643  \n",
-       "100            7.445200          1.538660  "
+       "1              6.540815          2.307743  \n",
+       "100            7.444250          1.538879  "
       ]
      },
      "execution_count": 34,
@@ -1679,7 +1681,7 @@
        "      <td>alpha-1-B glycoprotein</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>54</td>\n",
-       "      <td>0.006438</td>\n",
+       "      <td>0.006431</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -1688,7 +1690,7 @@
        "      <td>alpha-2-macroglobulin</td>\n",
        "      <td>protein-coding</td>\n",
        "      <td>198</td>\n",
-       "      <td>0.023605</td>\n",
+       "      <td>0.023580</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1702,8 +1704,8 @@
        "\n",
        "                n_mutations  mutation_freq  \n",
        "entrez_gene_id                              \n",
-       "1                        54       0.006438  \n",
-       "2                       198       0.023605  "
+       "1                        54       0.006431  \n",
+       "2                       198       0.023580  "
       ]
      },
      "execution_count": 35,
@@ -1722,6 +1724,110 @@
     "y_gene_df.head(2)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Cancer type (disease) stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>acronym</th>\n",
+       "      <th>disease</th>\n",
+       "      <th>n_samples</th>\n",
+       "      <th>n_clinical_samples</th>\n",
+       "      <th>n_expression_samples</th>\n",
+       "      <th>n_mutation_samples</th>\n",
+       "      <th>median_mutations</th>\n",
+       "      <th>mean_mutations</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ACC</td>\n",
+       "      <td>adrenocortical cancer</td>\n",
+       "      <td>79</td>\n",
+       "      <td>92</td>\n",
+       "      <td>79</td>\n",
+       "      <td>92</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>68.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>BLCA</td>\n",
+       "      <td>bladder urothelial carcinoma</td>\n",
+       "      <td>403</td>\n",
+       "      <td>409</td>\n",
+       "      <td>405</td>\n",
+       "      <td>407</td>\n",
+       "      <td>164.0</td>\n",
+       "      <td>228.168734</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  acronym                       disease  n_samples  n_clinical_samples  \\\n",
+       "0     ACC         adrenocortical cancer         79                  92   \n",
+       "1    BLCA  bladder urothelial carcinoma        403                 409   \n",
+       "\n",
+       "   n_expression_samples  n_mutation_samples  median_mutations  mean_mutations  \n",
+       "0                    79                  92              27.0       68.000000  \n",
+       "1                   405                 407             164.0      228.168734  "
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n",
+    "\n",
+    "def get_cancer_count_column(sample_ids):\n",
+    "    \"\"\"\n",
+    "    Count samples per disease, aligned to the rows of disease_df.\n",
+    "    sample_ids: iterable of sample IDs, e.g. a pandas.Series or index.\n",
+    "    \"\"\"\n",
+    "    acronyms = pandas.Series(sample_ids).map(sample_to_acronym)\n",
+    "    acronym_counts = collections.Counter(acronyms)\n",
+    "    n_per_disease = disease_df.acronym.map(acronym_counts)\n",
+    "    return n_per_disease.fillna(0).astype(int)\n",
+    "\n",
+    "# Compute number of samples per disease (cancer type)\n",
+    "disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n",
+    "disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n",
+    "disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n",
+    "disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n",
+    "\n",
+    "# Compute n_mutation summaries for samples in the aligned set\n",
+    "acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n",
+    "groups = y_df.sum(axis='columns').groupby(acronyms)\n",
+    "disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n",
+    "disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n",
+    "\n",
+    "# Export to TSV\n",
+    "path = os.path.join('data', 'diseases.tsv')\n",
+    "disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n",
+    "\n",
+    "disease_df.head(2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1733,7 +1839,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1743,7 +1849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1763,7 +1869,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [