diff --git a/2.TCGA-process.ipynb b/2.TCGA-process.ipynb
index 29a519a..3d06975 100755
--- a/2.TCGA-process.ipynb
+++ b/2.TCGA-process.ipynb
@@ -1724,6 +1724,110 @@
"y_gene_df.head(2)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Cancer type (disease) stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " acronym | \n",
+ " disease | \n",
+ " n_samples | \n",
+ " n_clinical_samples | \n",
+ " n_expression_samples | \n",
+ " n_mutation_samples | \n",
+ " median_mutations | \n",
+ " mean_mutations | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ACC | \n",
+ " adrenocortical cancer | \n",
+ " 79 | \n",
+ " 92 | \n",
+ " 79 | \n",
+ " 92 | \n",
+ " 27.0 | \n",
+ " 68.000000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " BLCA | \n",
+ " bladder urothelial carcinoma | \n",
+ " 403 | \n",
+ " 409 | \n",
+ " 405 | \n",
+ " 407 | \n",
+ " 164.0 | \n",
+ " 228.168734 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " acronym disease n_samples n_clinical_samples \\\n",
+ "0 ACC adrenocortical cancer 79 92 \n",
+ "1 BLCA bladder urothelial carcinoma 403 409 \n",
+ "\n",
+ " n_expression_samples n_mutation_samples median_mutations mean_mutations \n",
+ "0 79 92 27.0 68.000000 \n",
+ "1 405 407 164.0 228.168734 "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))\n",
+ "\n",
+ "def get_cancer_count_column(sample_ids):\n",
+ " \"\"\"\n",
+ " sample_ids is any sequence of sample IDs; it is coerced to a pandas.Series\n",
+ " \"\"\"\n",
+ " sample_ids = pandas.Series(sample_ids)\n",
+ " aconyms = sample_ids.map(sample_to_acronym)\n",
+ " counter = collections.Counter(aconyms)\n",
+ " counts = disease_df.acronym.map(counter)\n",
+ " return counts.fillna(0).astype(int)\n",
+ "\n",
+ "# Compute number of samples per disease (cancer type)\n",
+ "disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)\n",
+ "disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)\n",
+ "disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)\n",
+ "disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)\n",
+ "\n",
+ "# Compute n_mutation summaries for samples in the aligned set\n",
+ "acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))\n",
+ "groups = y_df.sum(axis='columns').groupby(acronyms)\n",
+ "disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))\n",
+ "disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))\n",
+ "\n",
+ "# Export to TSV\n",
+ "path = os.path.join('data', 'diseases.tsv')\n",
+ "disease_df.to_csv(path, sep='\\t', float_format='%.1f', index=False)\n",
+ "\n",
+ "disease_df.head(2)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1735,7 +1839,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@@ -1745,7 +1849,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@@ -1765,7 +1869,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
diff --git a/data/diseases.tsv b/data/diseases.tsv
new file mode 100644
index 0000000..5faf7da
--- /dev/null
+++ b/data/diseases.tsv
@@ -0,0 +1,34 @@
+acronym disease n_samples n_clinical_samples n_expression_samples n_mutation_samples median_mutations mean_mutations
+ACC adrenocortical cancer 79 92 79 92 27.0 68.0
+BLCA bladder urothelial carcinoma 403 409 405 407 164.0 228.2
+BRCA breast invasive carcinoma 787 1092 1090 789 37.0 76.5
+CESC cervical & endocervical cancer 286 307 304 289 87.0 181.9
+CHOL cholangiocarcinoma 36 36 36 36 36.0 52.2
+COAD colon adenocarcinoma 287 457 449 288 122.0 444.0
+DLBC diffuse large B-cell lymphoma 37 48 48 37 108.0 122.6
+ESCA esophageal carcinoma 183 185 184 184 106.0 139.2
+GBM glioblastoma multiforme 149 588 153 311 48.0 111.7
+HNSC head & neck squamous cell carcinoma 499 528 520 507 102.0 142.2
+KICH kidney chromophobe 66 66 66 66 21.0 33.2
+KIRC kidney clear cell carcinoma 366 536 533 368 54.0 56.3
+KIRP kidney papillary cell carcinoma 280 291 290 281 58.5 59.0
+LAML acute myeloid leukemia 0 200 173 0
+LGG brain lower grade glioma 510 515 515 510 29.0 45.8
+LIHC liver hepatocellular carcinoma 357 376 370 362 84.0 102.2
+LUAD lung adenocarcinoma 509 519 515 513 189.0 263.1
+LUSC lung squamous cell carcinoma 477 504 501 480 217.0 257.3
+MESO mesothelioma 81 87 87 81 28.0 31.5
+OV ovarian serous cystadenocarcinoma 14 537 295 62 67.0 70.4
+PAAD pancreatic adenocarcinoma 170 185 178 177 35.5 88.0
+PCPG pheochromocytoma & paraganglioma 179 179 179 179 9.0 10.4
+PRAD prostate adenocarcinoma 494 498 497 495 27.0 41.0
+READ rectum adenocarcinoma 89 166 159 89 95.0 230.8
+SARC sarcoma 234 261 259 236 38.5 60.0
+SKCM skin cutaneous melanoma 103 108 103 104 235.0 342.1
+STAD stomach adenocarcinoma 412 443 415 439 114.0 324.0
+TGCT testicular germ cell tumor 129 134 134 129 14.0 16.2
+THCA thyroid carcinoma 490 507 505 492 10.0 11.4
+THYM thymoma 118 124 120 122 13.0 21.0
+UCEC uterine corpus endometrioid carcinoma 436 547 532 447 80.0 804.6
+UCS uterine carcinosarcoma 57 57 57 57 49.0 120.2
+UVM uveal melanoma 80 80 80 80 12.0 17.2
diff --git a/scripts/2.TCGA-process.py b/scripts/2.TCGA-process.py
index 65ae5fd..4a3ff72 100644
--- a/scripts/2.TCGA-process.py
+++ b/scripts/2.TCGA-process.py
@@ -444,18 +444,54 @@
y_gene_df.head(2)
+# ### Cancer type (disease) stats
+
+# In[36]:
+
+
+sample_to_acronym = dict(zip(clinmat_df.sample_id, clinmat_df.acronym))  # lookup: sample_id -> acronym
+
+def get_cancer_count_column(sample_ids):
+ """
+ sample_ids is a pandas.Series
+ """
+ sample_ids = pandas.Series(sample_ids)
+ aconyms = sample_ids.map(sample_to_acronym)
+ counter = collections.Counter(aconyms)
+ counts = disease_df.acronym.map(counter)
+ return counts.fillna(0).astype(int)
+
+# Compute number of samples per disease (cancer type); each line adds one count column
+disease_df['n_samples'] = get_cancer_count_column(sample_df.sample_id)
+disease_df['n_clinical_samples'] = get_cancer_count_column(clinmat_df.sample_id)
+disease_df['n_expression_samples'] = get_cancer_count_column(expr_df.index)
+disease_df['n_mutation_samples'] = get_cancer_count_column(gene_mutation_mat_df.index)
+
+# Compute n_mutation summaries for samples in the aligned set (presumably y_df is sample × gene — confirm)
+acronyms = list(pandas.Series(y_df.index).map(sample_to_acronym))  # one acronym per y_df row
+groups = y_df.sum(axis='columns').groupby(acronyms)  # per-row totals, grouped by acronym
+disease_df['median_mutations'] = disease_df.acronym.map(dict(groups.median()))  # acronyms missing from groups -> NaN
+disease_df['mean_mutations'] = disease_df.acronym.map(dict(groups.mean()))
+
+# Export to TSV; float columns are written with one decimal place
+path = os.path.join('data', 'diseases.tsv')
+disease_df.to_csv(path, sep='\t', float_format='%.1f', index=False)
+
+disease_df.head(2)
+
+
# ### Export matrices to TSVs
#
# Matrices are saved as sample × gene TSVs. Subsetted matrices are also exported to allow users to quickly explore small portions of the dataset.
-# In[36]:
+# In[37]:
path = os.path.join('data', 'samples.tsv')
sample_df.to_csv(path, sep='\t', float_format='%.0f', index=False)
-# In[37]:
+# In[38]:
def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
@@ -472,7 +508,7 @@ def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
)
-# In[38]:
+# In[39]:
tsv_args = {'sep': '\t', 'float_format': '%.3g'}