diff --git a/2.TCGA-process.ipynb b/2.TCGA-process.ipynb index 29a519a..3d06975 100755 --- a/2.TCGA-process.ipynb +++ b/2.TCGA-process.ipynb @@ -1724,6 +1724,110 @@ "y_gene_df.head(2)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cancer type (disease) stats" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
acronymdiseasen_samplesn_clinical_samplesn_expression_samplesn_mutation_samplesmedian_mutationsmean_mutations
0ACCadrenocortical cancer7992799227.068.000000
1BLCAbladder urothelial carcinoma403409405407164.0228.168734
\n", + "
" + ], + "text/plain": [ + " acronym disease n_samples n_clinical_samples \\\n", + "0 ACC adrenocortical cancer 79 92 \n", + "1 BLCA bladder urothelial carcinoma 403 409 \n", + "\n", + " n_expression_samples n_mutation_samples median_mutations mean_mutations \n", + "0 79 92 27.0 68.000000 \n", + "1 405 407 164.0 228.168734 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n", + "\n", + "def get_cancer_count_column(sample_ids):\n", + " \"\"\"\n", + " sample_ids is a pandas.Series\n", + " \"\"\"\n", + " sample_ids = pandas.Series(sample_ids)\n", + " aconyms = sample_ids.map(sample_to_acronym)\n", + " counter = collections.Counter(aconyms)\n", + " counts = disease_df.acronym.map(counter)\n", + " return counts.fillna(0).astype(int)\n", + "\n", + "# Compute number of samples per disease (cancer type)\n", + "disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n", + "disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n", + "disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n", + "disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n", + "\n", + "# Compute n_mutation summaries for samples in the aligned set\n", + "acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n", + "groups = y_df.sum(axis='columns').groupby(acronyms)\n", + "disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n", + "disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n", + "\n", + "# Export to TSV\n", + "path = os.path.join('data', 'diseases.tsv')\n", + "disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n", + "\n", + "disease_df.head(2)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1735,7 +1839,7 @@ }, { "cell_type": "code", - "execution_count": 36, + 
"execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1745,7 +1849,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1765,7 +1869,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ diff --git a/data/diseases.tsv b/data/diseases.tsv new file mode 100644 index 0000000..5faf7da --- /dev/null +++ b/data/diseases.tsv @@ -0,0 +1,34 @@ +acronym disease n_samples n_clinical_samples n_expression_samples n_mutation_samples median_mutations mean_mutations +ACC adrenocortical cancer 79 92 79 92 27.0 68.0 +BLCA bladder urothelial carcinoma 403 409 405 407 164.0 228.2 +BRCA breast invasive carcinoma 787 1092 1090 789 37.0 76.5 +CESC cervical & endocervical cancer 286 307 304 289 87.0 181.9 +CHOL cholangiocarcinoma 36 36 36 36 36.0 52.2 +COAD colon adenocarcinoma 287 457 449 288 122.0 444.0 +DLBC diffuse large B-cell lymphoma 37 48 48 37 108.0 122.6 +ESCA esophageal carcinoma 183 185 184 184 106.0 139.2 +GBM glioblastoma multiforme 149 588 153 311 48.0 111.7 +HNSC head & neck squamous cell carcinoma 499 528 520 507 102.0 142.2 +KICH kidney chromophobe 66 66 66 66 21.0 33.2 +KIRC kidney clear cell carcinoma 366 536 533 368 54.0 56.3 +KIRP kidney papillary cell carcinoma 280 291 290 281 58.5 59.0 +LAML acute myeloid leukemia 0 200 173 0 +LGG brain lower grade glioma 510 515 515 510 29.0 45.8 +LIHC liver hepatocellular carcinoma 357 376 370 362 84.0 102.2 +LUAD lung adenocarcinoma 509 519 515 513 189.0 263.1 +LUSC lung squamous cell carcinoma 477 504 501 480 217.0 257.3 +MESO mesothelioma 81 87 87 81 28.0 31.5 +OV ovarian serous cystadenocarcinoma 14 537 295 62 67.0 70.4 +PAAD pancreatic adenocarcinoma 170 185 178 177 35.5 88.0 +PCPG pheochromocytoma & paraganglioma 179 179 179 179 9.0 10.4 +PRAD prostate adenocarcinoma 494 498 497 495 27.0 41.0 +READ rectum adenocarcinoma 89 166 159 89 95.0 230.8 +SARC sarcoma 234 261 
259 236 38.5 60.0 +SKCM skin cutaneous melanoma 103 108 103 104 235.0 342.1 +STAD stomach adenocarcinoma 412 443 415 439 114.0 324.0 +TGCT testicular germ cell tumor 129 134 134 129 14.0 16.2 +THCA thyroid carcinoma 490 507 505 492 10.0 11.4 +THYM thymoma 118 124 120 122 13.0 21.0 +UCEC uterine corpus endometrioid carcinoma 436 547 532 447 80.0 804.6 +UCS uterine carcinosarcoma 57 57 57 57 49.0 120.2 +UVM uveal melanoma 80 80 80 80 12.0 17.2 diff --git a/scripts/2.TCGA-process.py b/scripts/2.TCGA-process.py index 65ae5fd..4a3ff72 100644 --- a/scripts/2.TCGA-process.py +++ b/scripts/2.TCGA-process.py @@ -444,18 +444,54 @@ y_gene_df.head(2) +# ### Cancer type (disease) stats + +# In[36]: + + +sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym)) + +def get_cancer_count_column(sample_ids): + """ + sample_ids is a pandas.Series + """ + sample_ids = pandas.Series(sample_ids) + aconyms = sample_ids.map(sample_to_acronym) + counter = collections.Counter(aconyms) + counts = disease_df.acronym.map(counter) + return counts.fillna(0).astype(int) + +# Compute number of samples per disease (cancer type) +disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id) +disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id) +disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index) +disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index) + +# Compute n_mutation summaries for samples in the aligned set +acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym)) +groups = y_df.sum(axis='columns').groupby(acronyms) +disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median())) +disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean())) + +# Export to TSV +path = os.path.join('data', 'diseases.tsv') +disease_df.to_csv(path, sep='\t', float_format='%.1f', index=False) + +disease_df.head(2) + + # ### Export matrices to TSVs # # Matrices are saved 
as sample × gene TSVs. Subsetted matrices are also exported to allow users to quickly explore small portions of the dataset. -# In[36]: +# In[37]: path = os.path.join('data', 'samples.tsv') sample_df.to_csv(path, sep='\t', float_format='%.0f', index=False) -# In[37]: +# In[38]: def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0): @@ -472,7 +508,7 @@ def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0): ) -# In[38]: +# In[39]: tsv_args = {'sep': '\t', 'float_format': '%.3g'}