Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retain zero-mutation samples #44

Merged
merged 4 commits into from
Apr 16, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 123 additions & 17 deletions 2.TCGA-process.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,8 @@
],
"source": [
"# Number of samples with at least one mutation\n",
"snp_mutation_df.sample_id.nunique()"
"samples_with_mutation_calls = sorted(set(snp_mutation_df.sample_id))\n",
"len(samples_with_mutation_calls)"
]
},
{
Expand Down Expand Up @@ -1211,7 +1212,7 @@
{
"data": {
"text/plain": [
"(9093, 20224)"
"(9104, 20224)"
]
},
"execution_count": 22,
Expand All @@ -1226,6 +1227,7 @@
" columns='entrez_gene_id',\n",
" values='count',\n",
" fill_value=0)\n",
" .reindex(samples_with_mutation_calls, fill_value=0)\n",
" .astype(bool).astype(int)\n",
")\n",
"gene_mutation_mat_df.columns = gene_mutation_mat_df.columns.astype(str)\n",
Expand Down Expand Up @@ -1514,7 +1516,7 @@
"\n",
"Find samples with both mutation and expression data.\n",
"\n",
"We assume that if a sample was not in the `MC3` data, it was not assayed for mutation. Hence, zero-mutation cancers are excluded even if they have mutation data."
"We assume that if a sample was not in the `MC3` data, it was not assayed for mutation ([more info](https://github.com/cognoma/cancer-data/issues/43#issuecomment-380957274))."
]
},
{
Expand All @@ -1525,7 +1527,7 @@
{
"data": {
"text/plain": [
"8388"
"8397"
]
},
"execution_count": 31,
Expand Down Expand Up @@ -1597,17 +1599,17 @@
" <td>A1BG</td>\n",
" <td>alpha-1-B glycoprotein</td>\n",
" <td>protein-coding</td>\n",
" <td>6.540659</td>\n",
" <td>2.308643</td>\n",
" <td>6.540815</td>\n",
" <td>2.307743</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>100</td>\n",
" <td>ADA</td>\n",
" <td>adenosine deaminase</td>\n",
" <td>protein-coding</td>\n",
" <td>7.445200</td>\n",
" <td>1.538660</td>\n",
" <td>7.444250</td>\n",
" <td>1.538879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -1621,8 +1623,8 @@
"\n",
" mean_expression stdev_expression \n",
"sample \n",
"1 6.540659 2.308643 \n",
"100 7.445200 1.538660 "
"1 6.540815 2.307743 \n",
"100 7.444250 1.538879 "
]
},
"execution_count": 34,
Expand Down Expand Up @@ -1679,7 +1681,7 @@
" <td>alpha-1-B glycoprotein</td>\n",
" <td>protein-coding</td>\n",
" <td>54</td>\n",
" <td>0.006438</td>\n",
" <td>0.006431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
Expand All @@ -1688,7 +1690,7 @@
" <td>alpha-2-macroglobulin</td>\n",
" <td>protein-coding</td>\n",
" <td>198</td>\n",
" <td>0.023605</td>\n",
" <td>0.023580</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -1702,8 +1704,8 @@
"\n",
" n_mutations mutation_freq \n",
"entrez_gene_id \n",
"1 54 0.006438 \n",
"2 198 0.023605 "
"1 54 0.006431 \n",
"2 198 0.023580 "
]
},
"execution_count": 35,
Expand All @@ -1722,6 +1724,110 @@
"y_gene_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancer type (disease) stats"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>acronym</th>\n",
" <th>disease</th>\n",
" <th>n_samples</th>\n",
" <th>n_clinical_samples</th>\n",
" <th>n_expression_samples</th>\n",
" <th>n_mutation_samples</th>\n",
" <th>median_mutations</th>\n",
" <th>mean_mutations</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACC</td>\n",
" <td>adrenocortical cancer</td>\n",
" <td>79</td>\n",
" <td>92</td>\n",
" <td>79</td>\n",
" <td>92</td>\n",
" <td>27.0</td>\n",
" <td>68.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BLCA</td>\n",
" <td>bladder urothelial carcinoma</td>\n",
" <td>403</td>\n",
" <td>409</td>\n",
" <td>405</td>\n",
" <td>407</td>\n",
" <td>164.0</td>\n",
" <td>228.168734</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" acronym disease n_samples n_clinical_samples \\\n",
"0 ACC adrenocortical cancer 79 92 \n",
"1 BLCA bladder urothelial carcinoma 403 409 \n",
"\n",
" n_expression_samples n_mutation_samples median_mutations mean_mutations \n",
"0 79 92 27.0 68.000000 \n",
"1 405 407 164.0 228.168734 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n",
"\n",
"def get_cancer_count_column(sample_ids):\n",
" \"\"\"\n",
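" Count samples per disease_df acronym; sample_ids is any sequence of sample IDs accepted by pandas.Series (e.g. a pandas.Series or a DataFrame index).\n",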
" \"\"\"\n",
" sample_ids = pandas.Series(sample_ids)\n",
" aconyms = sample_ids.map(sample_to_acronym)\n",
" counter = collections.Counter(aconyms)\n",
" counts = disease_df.acronym.map(counter)\n",
" return counts.fillna(0).astype(int)\n",
"\n",
"# Compute number of samples per disease (cancer type)\n",
"disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n",
"disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n",
"disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n",
"disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n",
"\n",
"# Compute n_mutation summaries for samples in the aligned set\n",
"acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n",
"groups = y_df.sum(axis='columns').groupby(acronyms)\n",
"disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n",
"disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n",
"\n",
"# Export to TSV\n",
"path = os.path.join('data', 'diseases.tsv')\n",
"disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n",
"\n",
"disease_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -1733,7 +1839,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -1743,7 +1849,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -1763,7 +1869,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
Expand Down
Loading