biof309 · dewanr2 · Feb 28, 2020 · Apr 23, 2020 · Apr 23, 2020 · Apr 23, 2020
diff --git a/README.md b/README.md
@@ -1,3 +1,18 @@
-# project_spring_2020
+# Final Project
+BIOF 309 Spring 2020 <br>
+**Author:** Ramita Karra <br>
+**Last edited:** 05-08-2020 <br>
 
-[![CircleCI](https://circleci.com/gh/biof309/project_spring_2020/tree/master.svg?style=shield)](https://circleci.com/gh/biof309/project_spring_2020/tree/master)
+## Project Description
+The aim of this project is to produce a script that will process raw tab-delimited output returned from the ExpansionHunter-Targeted software tool, used for making sequence-graph-based predictions of repeat lengths for known genetic repeat loci. The ultimate goal is to clean, compile, and process data for many different loci into a summary table, and to provide visualizations pertinent to the functional relevance of this data (i.e. number of samples containing repeat numbers above the pathogenic threshold for each gene).  
+
+More information on ExpansionHunter can be found [here](https://academic.oup.com/bioinformatics/article/35/22/4754/5499079). 
+
+## Required Input
+The raw data consists of a directory of all .txt files returned by ExpansionHunter-Targeted for multiple genes from a sample set.
+## Project Details
+- Exploratory data analysis
+- Input data from separate text files and merge into a single master dataframe based on sample ID
+- Create dictionary for each gene, containing pathogenic repeat threshold values
+- Determine summary statistics for each gene (total number of samples, samples with pathogenic range repeats)
+- Create visualization to better understand data
diff --git a/project_spring_2020/.ipynb_checkpoints/Final_Project_Workbook-checkpoint.ipynb b/project_spring_2020/.ipynb_checkpoints/Final_Project_Workbook-checkpoint.ipynb
@@ -0,0 +1,372 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Final Project Workbook\n",
+    "### BIOF 309 Spring 2020 "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Author:** Ramita Karra <br>\n",
+    "**Last edited:** 04-23-2020"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/Users/dewanr2/Documents/GitHub/project_spring_2020/project_spring_2020'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pwd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Input files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create a list of file names, consisting of all .txt files in the directory containing raw data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Assign variable to directory containing raw ExpansionHunter - Targeted data\n",
+    "directory_name = '/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'\n",
+    "\n",
+    "file_list = []\n",
+    "\n",
+    "# Check to make sure that only '.txt' files are being appended to list\n",
+    "for filename in os.listdir(directory_name):\n",
+    "    if filename.endswith('.txt'):\n",
+    "        file_list.append(filename)\n",
+    "    else:\n",
+    "        print('Found non .txt file in directory: ' + filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Import one of the files for exploratory data analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change into directory containing raw data\n",
+    "\n",
+    "os.chdir(directory_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['Cohort', 'SampleID', 'chr', 'pos', 'INFO',\n",
+      "       'GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC', 'REPCN:ATXN7_GCC_allele1',\n",
+      "       'REPCN:ATXN7_GCC_allele2'],\n",
+      "      dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import first file in file_list \n",
+    "test_df = pd.read_csv(file_list[0], sep='\\t')\n",
+    "\n",
+    "# Examine columns\n",
+    "print(test_df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['SampleID', 'chr', 'pos', 'min', 'max'], dtype='object')\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 1938 entries, 0 to 1937\n",
+      "Data columns (total 5 columns):\n",
+      "SampleID    1938 non-null object\n",
+      "chr         1938 non-null object\n",
+      "pos         1938 non-null int64\n",
+      "min         1938 non-null int64\n",
+      "max         1938 non-null int64\n",
+      "dtypes: int64(3), object(2)\n",
+      "memory usage: 75.8+ KB\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create 'min' and 'max' columns for minimum and maximum allele repeat values respectively\n",
+    "test_df['min'] = test_df[['REPCN:ATXN7_GCC_allele1','REPCN:ATXN7_GCC_allele2']].min(axis=1)\n",
+    "test_df['max'] = test_df[['REPCN:ATXN7_GCC_allele1','REPCN:ATXN7_GCC_allele2']].max(axis=1)\n",
+    "\n",
+    "# Select only desired columns\n",
+    "test_df = test_df[['SampleID','chr','pos','min','max']]\n",
+    "\n",
+    "print(test_df.columns)\n",
+    "print(test_df.info())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*EDA suggests that, when importing the dataframes, the following steps need to be applied:*\n",
+    "- evaluate data to create 'min' and 'max' columns\n",
+    "- rename 'max' column with gene name\n",
+    "- select only desired columns: 'SampleID','chr','pos','max'\n",
+    "\n",
+    "*EDA did not show any null entries for this dataframe, but files should be imported with a default null value in case not all genes were evaluated for all samples.*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Import all files from the list of file names created earlier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/Users/dewanr2/Documents/Ramitas_Docs/NIH_Classes/BIOF309/ExpansionHunterTargeted'"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Confirm that current directory contains raw data\n",
+    "\n",
+    "pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create list of dataframes with columns: 'SampleID','chr','pos', plus one column named after the gene holding the larger of the two allele repeat counts\n",
+    "\n",
+    "df_list = []\n",
+    "\n",
+    "for filename in file_list:\n",
+    "    \n",
+    "    # Get gene name\n",
+    "    if filename.startswith(\"ExpansionHunterTargeted.ftd\"):\n",
+    "        gene = filename.replace(\"ExpansionHunterTargeted.ftd.\",\"\", 1)\n",
+    "        if gene.endswith(\".txt\"):\n",
+    "            gene = gene.replace(\".txt\",\"\",1)\n",
+    "        else:\n",
+    "            print(\"filename does not contain suffix\")\n",
+    "    else:\n",
+    "        print(\"filename does not contain prefix\")\n",
+    "    \n",
+    "    # Import and format df\n",
+    "    df_temp = pd.read_csv(filename, sep='\\t')\n",
+    "    df_temp.rename(columns={ df_temp.columns[6]: \"allele1\" }, inplace = True)\n",
+    "    df_temp.rename(columns={ df_temp.columns[7]: \"allele2\" }, inplace = True)\n",
+    "    df_temp['min'] = df_temp[['allele1','allele2']].min(axis=1)\n",
+    "    df_temp[gene] = df_temp[['allele1','allele2']].max(axis=1)\n",
+    "    df_temp = df_temp[['SampleID','chr','pos',gene]]\n",
+    "    df_list.append(df_temp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create dictionary with keys as genes, subdictionaries as key:value pairs for \"chr\", \"pos\"\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Get chromosome\n",
+    "# Get gene position"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SampleID</th>\n",
+       "      <th>chr</th>\n",
+       "      <th>pos</th>\n",
+       "      <th>ATXN7.GCC</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>RES04914</td>\n",
+       "      <td>chr3</td>\n",
+       "      <td>63912714</td>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>RES08323</td>\n",
+       "      <td>chr3</td>\n",
+       "      <td>63912714</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>RES04107</td>\n",
+       "      <td>chr3</td>\n",
+       "      <td>63912714</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>RES05106</td>\n",
+       "      <td>chr3</td>\n",
+       "      <td>63912714</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>RES04513</td>\n",
+       "      <td>chr3</td>\n",
+       "      <td>63912714</td>\n",
+       "      <td>13</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   SampleID   chr       pos  ATXN7.GCC\n",
+       "0  RES04914  chr3  63912714         16\n",
+       "1  RES08323  chr3  63912714         15\n",
+       "2  RES04107  chr3  63912714         14\n",
+       "3  RES05106  chr3  63912714         13\n",
+       "4  RES04513  chr3  63912714         13"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(df_list[0].head())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/project_spring_2020/BIOF309_RDK/EHT_RDK/__init__.py b/project_spring_2020/BIOF309_RDK/EHT_RDK/__init__.py