-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
63 lines (46 loc) · 2.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import re
import seaborn as sns
from tabulate import tabulate
# Path to the raw genotype export: comma-separated, with '#' comment lines.
fname = "data/result.txt"

# Plot styling. `set_palette` installs the palette as the active colour
# cycle; `color_palette` on its own merely returns the colours, so calling
# it without using the result has no effect.
sns.set_style('darkgrid')
sns.set_palette('Spectral')

# The chromosome column is read as text because it can contain the labels
# 'X', 'MT' and 'Y' next to numeric ones; it is converted to integers below.
data = pd.read_csv(
    fname,
    sep=',',
    dtype={'rsid': 'str', 'chromosome': 'object',
           'position': 'int', 'genotype': 'str'},
    comment='#',
)

# Normalise a first-column header spelled ' rsid' (leading space) so that
# later lookups and the merge key can use plain 'rsid'. `data` keeps the
# frame exactly as it was read.
df = data.rename(columns={' rsid': 'rsid'})

# Integer code -> chromosome label. Codes 23, 24 and 25 stand in for the
# non-numeric chromosomes so the column can be stored as integers.
chromosome_dict = {1: '1', 2: '2', 3: '3', 4: '4', 5: '5',
                   6: '6', 7: '7', 8: '8', 9: '9', 10: '10',
                   11: '11', 12: '12', 13: '13', 14: '14',
                   15: '15', 16: '16', 17: '17', 18: '18',
                   19: '19', 20: '20', 21: '21', 22: '22',
                   23: 'X', 24: 'MT', 25: 'Y'}

# Inverse lookup for the non-numeric labels only ('X' -> '23', ...), derived
# from chromosome_dict so encoding and decoding cannot drift apart.
_label_to_code = {
    label: str(code)
    for code, label in chromosome_dict.items()
    if not label.isdigit()
}

# Whole-value replacement followed by a single vectorised cast, instead of
# several per-row regex passes. Any label that is neither numeric nor in the
# lookup makes `astype(int)` raise, which surfaces unexpected input early.
df['chromosome'] = (
    df['chromosome'].str.strip().replace(_label_to_code).astype(int)
)
# How many SNPs are there per chromosome?
# The result is a Series indexed by the integer chromosome code. A Series has
# no `columns`, so no column names are assigned here.
rsid_per_chromosome_series = df.groupby('chromosome')['rsid'].count()

# Draw the bars with the original chromosome labels (X, MT, Y) instead of the
# numeric stand-ins. Only the plotted copy is relabelled; the Series itself
# keeps its integer index, and codes missing from chromosome_dict are left
# unchanged by `rename`.
rsid_per_chromosome_series.rename(index=chromosome_dict).plot.barh(
    figsize=(16, 9), fontsize=15
)
# NOTE: nothing in this script displays or saves the figure; when running
# outside an interactive session, call matplotlib.pyplot.show() (or save the
# figure) here.
# SNPedia annotations, one row per (SNP, genotype) entry.
snp_df = pd.read_csv('data/uniq_snips.csv')

# The 'rsid' column carries the genotype as a trailing "(A;G)"-style suffix.
# Collapse everything up to and including that suffix into the two allele
# letters, e.g. "...(A;G)" -> "AG". Values without such a suffix pass through
# unchanged.
snp_df['genotype'] = snp_df['rsid'].str.replace(
    r'.*([AGCT]);([AGCT])\)', r'\1\2', regex=True
)

# Rename positionally: this assumes the CSV has exactly four columns in the
# order rsid, magnitude, repute, summary (genotype was appended above).
new_cols = ['rsid', 'magnitude', 'repute', 'summary', 'genotype']
snp_df.columns = new_cols

# Lower-case the ids and strip the "(a;g)" suffix so they can be matched
# against the 'rsid' column of the genotype data.
snp_df['rsid'] = snp_df['rsid'].str.lower().str.replace(
    r'([a-z]{1,}[\d]+)\([agct];[agct]\)', r'\1', regex=True
)

# Snapshot of the rows with missing annotations, taken before the gaps are
# filled below: rows lacking a repute first, then rows lacking a summary,
# with rows that appear in both listed once.
null_repute = snp_df[snp_df['repute'].isnull()]
null_summaries = snp_df[snp_df['summary'].isnull()]
null_repute_and_summaries = (
    pd.concat([null_repute, null_summaries])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Assign the filled columns back explicitly. `snp_df[col].fillna(...,
# inplace=True)` operates on a temporary object; pandas deprecates that
# pattern and, with Copy-on-Write enabled, it leaves snp_df untouched.
snp_df['repute'] = snp_df['repute'].fillna('Neutral')
snp_df['summary'] = snp_df['summary'].fillna('None')
# Keep only the SNPs whose id and genotype occur in both tables. Any other
# column name shared by the two frames is suffixed with its origin.
new_df = pd.merge(
    snp_df,
    df,
    how='inner',
    on=['rsid', 'genotype'],
    suffixes=('_SNPedia', '_myDNA'),
)

# Split the matches by SNPedia's repute label.
good_genes = new_df.loc[new_df['repute'].eq('Good')]
bad_genes = new_df.loc[new_df['repute'].eq('Bad')]

# 4 is the threshold for "worth your time" given by SNPedia
interesting_genes = new_df.loc[new_df['magnitude'].gt(4)]

# Print one SNPedia page link per "Bad" match, each followed by a blank line,
# and then the matching rows as a plain-text table.
base_url = 'https://www.snpedia.com/index.php/'
gene_urls = [base_url + snp_id for snp_id in bad_genes['rsid']]
for url in gene_urls:
    print(url, '\n')

print(tabulate(bad_genes))