Skip to content

Commit

Permalink
Create data/diseases.tsv with summary info
Browse files Browse the repository at this point in the history
Supersedes cognoma#45
  • Loading branch information
dhimmel committed Apr 13, 2018
1 parent 2924585 commit 9f9f675
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 6 deletions.
110 changes: 107 additions & 3 deletions 2.TCGA-process.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1724,6 +1724,110 @@
"y_gene_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cancer type (disease) stats"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>acronym</th>\n",
" <th>disease</th>\n",
" <th>n_samples</th>\n",
" <th>n_clinical_samples</th>\n",
" <th>n_expression_samples</th>\n",
" <th>n_mutation_samples</th>\n",
" <th>median_mutations</th>\n",
" <th>mean_mutations</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACC</td>\n",
" <td>adrenocortical cancer</td>\n",
" <td>79</td>\n",
" <td>92</td>\n",
" <td>79</td>\n",
" <td>92</td>\n",
" <td>27.0</td>\n",
" <td>68.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BLCA</td>\n",
" <td>bladder urothelial carcinoma</td>\n",
" <td>403</td>\n",
" <td>409</td>\n",
" <td>405</td>\n",
" <td>407</td>\n",
" <td>164.0</td>\n",
" <td>228.168734</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" acronym disease n_samples n_clinical_samples \\\n",
"0 ACC adrenocortical cancer 79 92 \n",
"1 BLCA bladder urothelial carcinoma 403 409 \n",
"\n",
" n_expression_samples n_mutation_samples median_mutations mean_mutations \n",
"0 79 92 27.0 68.000000 \n",
"1 405 407 164.0 228.168734 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n",
"\n",
"def get_cancer_count_column(sample_ids):\n",
" \"\"\"\n",
" sample_ids is a pandas.Series\n",
" \"\"\"\n",
" sample_ids = pandas.Series(sample_ids)\n",
" aconyms = sample_ids.map(sample_to_acronym)\n",
" counter = collections.Counter(aconyms)\n",
" counts = disease_df.acronym.map(counter)\n",
" return counts.fillna(0).astype(int)\n",
"\n",
"# Compute nubmer of samples per disease (cancer type)\n",
"disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n",
"disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n",
"disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n",
"disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n",
"\n",
"# Compute n_mutation summaries for samples in the aligned set\n",
"acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n",
"groups = y_df.sum(axis='columns').groupby(acronyms)\n",
"disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n",
"disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n",
"\n",
"# Export to TSV\n",
"path = os.path.join('data', 'diseases.tsv')\n",
"disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n",
"\n",
"disease_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -1735,7 +1839,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -1745,7 +1849,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -1765,7 +1869,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
Expand Down
34 changes: 34 additions & 0 deletions data/diseases.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
acronym disease n_samples n_clinical_samples n_expression_samples n_mutation_samples median_mutations mean_mutations
ACC adrenocortical cancer 79 92 79 92 27.0 68.0
BLCA bladder urothelial carcinoma 403 409 405 407 164.0 228.2
BRCA breast invasive carcinoma 787 1092 1090 789 37.0 76.5
CESC cervical & endocervical cancer 286 307 304 289 87.0 181.9
CHOL cholangiocarcinoma 36 36 36 36 36.0 52.2
COAD colon adenocarcinoma 287 457 449 288 122.0 444.0
DLBC diffuse large B-cell lymphoma 37 48 48 37 108.0 122.6
ESCA esophageal carcinoma 183 185 184 184 106.0 139.2
GBM glioblastoma multiforme 149 588 153 311 48.0 111.7
HNSC head & neck squamous cell carcinoma 499 528 520 507 102.0 142.2
KICH kidney chromophobe 66 66 66 66 21.0 33.2
KIRC kidney clear cell carcinoma 366 536 533 368 54.0 56.3
KIRP kidney papillary cell carcinoma 280 291 290 281 58.5 59.0
LAML acute myeloid leukemia 0 200 173 0
LGG brain lower grade glioma 510 515 515 510 29.0 45.8
LIHC liver hepatocellular carcinoma 357 376 370 362 84.0 102.2
LUAD lung adenocarcinoma 509 519 515 513 189.0 263.1
LUSC lung squamous cell carcinoma 477 504 501 480 217.0 257.3
MESO mesothelioma 81 87 87 81 28.0 31.5
OV ovarian serous cystadenocarcinoma 14 537 295 62 67.0 70.4
PAAD pancreatic adenocarcinoma 170 185 178 177 35.5 88.0
PCPG pheochromocytoma & paraganglioma 179 179 179 179 9.0 10.4
PRAD prostate adenocarcinoma 494 498 497 495 27.0 41.0
READ rectum adenocarcinoma 89 166 159 89 95.0 230.8
SARC sarcoma 234 261 259 236 38.5 60.0
SKCM skin cutaneous melanoma 103 108 103 104 235.0 342.1
STAD stomach adenocarcinoma 412 443 415 439 114.0 324.0
TGCT testicular germ cell tumor 129 134 134 129 14.0 16.2
THCA thyroid carcinoma 490 507 505 492 10.0 11.4
THYM thymoma 118 124 120 122 13.0 21.0
UCEC uterine corpus endometrioid carcinoma 436 547 532 447 80.0 804.6
UCS uterine carcinosarcoma 57 57 57 57 49.0 120.2
UVM uveal melanoma 80 80 80 80 12.0 17.2
42 changes: 39 additions & 3 deletions scripts/2.TCGA-process.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,18 +444,54 @@
y_gene_df.head(2)


# ### Cancer type (disease) stats

# In[36]:


# Lookup table from sample_id to cancer-type acronym, built from the clinical matrix
sample_to_acronym = dict(zip(clinmat_df['sample_id'], clinmat_df['acronym']))

def get_cancer_count_column(sample_ids):
    """
    Count samples per cancer type, aligned to the rows of disease_df.

    sample_ids holds sample IDs in any form pandas.Series accepts (callers
    pass both a pandas.Series and a DataFrame index). Each ID is translated
    to an acronym through sample_to_acronym, occurrences of each acronym are
    tallied, and the tallies are looked up for every disease_df.acronym.

    Returns an integer pandas.Series with one count per row of disease_df.
    """
    acronym_per_sample = pandas.Series(sample_ids).map(sample_to_acronym)
    acronym_tally = collections.Counter(acronym_per_sample)
    per_disease = disease_df.acronym.map(acronym_tally).fillna(0)
    return per_disease.astype(int)

# Compute number of samples per disease (cancer type) for each dataset.
# Columns are added in this order so the exported TSV keeps its layout.
for count_column, count_sample_ids in [
    ('n_samples', sample_df.sample_id),
    ('n_clinical_samples', clinmat_df.sample_id),
    ('n_expression_samples', expr_df.index),
    ('n_mutation_samples', gene_mutation_mat_df.index),
]:
    disease_df[count_column] = get_cancer_count_column(count_sample_ids)

# Compute n_mutation summaries for samples in the aligned set:
# total each row of y_df, then group the totals by the sample's acronym.
acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))
mutations_per_sample = y_df.sum(axis='columns')
groups = mutations_per_sample.groupby(acronyms)
median_by_acronym = groups.median().to_dict()
mean_by_acronym = groups.mean().to_dict()
disease_df['median_mutations'] = disease_df.acronym.map(median_by_acronym)
disease_df['mean_mutations'] = disease_df.acronym.map(mean_by_acronym)

# Export to TSV (floats written with one decimal place, no index column)
path = os.path.join('data', 'diseases.tsv')
disease_df.to_csv(path, index=False, sep='\t', float_format='%.1f')

disease_df.head(2)


# ### Export matrices to TSVs
#
# Matrices are saved as sample × gene TSVs. Subsetted matrices are also exported to allow users to quickly explore small portions of the dataset.

# In[36]:
# In[37]:


path = os.path.join('data', 'samples.tsv')
sample_df.to_csv(path, sep='\t', float_format='%.0f', index=False)


# In[37]:
# In[38]:


def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
Expand All @@ -472,7 +508,7 @@ def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
)


# In[38]:
# In[39]:


tsv_args = {'sep': '\t', 'float_format': '%.3g'}
Expand Down

0 comments on commit 9f9f675

Please sign in to comment.