-
Notifications
You must be signed in to change notification settings - Fork 0
/
sqlite_herring_v1.py
164 lines (140 loc) · 4.91 KB
/
sqlite_herring_v1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
import pandas as pd
import sqlite3
import sys
conn = sqlite3.connect("herring_v1.db") # creates db file
c = conn.cursor() # cursor c
# Create tables
try:
c.executescript(
"""DROP TABLE IF EXISTS Samples;
DROP TABLE IF EXISTS Genotypes;
DROP TABLE IF EXISTS Storage_info;
DROP TABLE IF EXISTS Genetic_markers;
DROP TABLE IF EXISTS Species_id_codes;
DROP TABLE IF EXISTS Hafro_samples_info;
CREATE TABLE Samples(
Sample_id text PRIMARY KEY,
Short_sid char(12) UNIQUE, --for input into genepop max 12 char
Species_id_code int REFERENCES Species_id_codes(Species_id_code),
Pop_name text,
Maturity text, --"spawning" "feeding"
FOREIGN KEY(Sample_id) REFERENCES Hafro_samples_info(Sample_id),
FOREIGN KEY(Sample_id) REFERENCES Storage_info(Sample_id)
);
CREATE TABLE Genotypes(
Sample_id text REFERENCES Samples(Sample_id),
Marker_id text REFERENCES Genetic_markers(Marker_id),
Gt text,
PRIMARY KEY(Sample_id, Marker_id)
);
CREATE TABLE Storage_info(
Sample_id text,
Box text,
Plate_id text,
Well text,
Sample_type text, --"tissue", "scales", "otholit", "DNA"
Matis_id int PRIMARY KEY
);
CREATE TABLE Genetic_markers(
Marker_id text PRIMARY KEY,
Short_mid text UNIQUE,
Possible_genotypes text,
Marker_type text, --affy
Probe_sequence text, --{"forw":"ATCGCWACTW", "rev":"TGTRWCTG"}
Marker_info text --"dinuclotype", "trinuclotide", "A/T"
--Pcr_program text
);
CREATE TABLE Species_id_codes(
Species_id_code int PRIMARY KEY,
Species_latin text,
Species_english text,
Species_icelandic text
);
CREATE TABLE Hafro_samples_info(
Sample_id text PRIMARY KEY,
Date_time_collect date,
Year_sampled int,
Latitute real,
Longitute real,
Cruise_id text,
Station text,
Age int,
Weight real,
Maturity int,
Length int,
Sex int,
Ship_id int,
Location text,
Sample_type text --"spawning", "feeding"
)"""
)
except sqlite3.Error as err:
print(err)
raise # reraise after print for Python to handle
conn.commit()
def check_table_exists(tbl):
"""Check whether table tbl already exists from creation of database, otherwise it would create
an additional one from the csv indexes"""
c.execute(
"SELECT name FROM sqlite_master WHERE type='table'"
) # select all tables' names
names = c.fetchall() # put all these tables' names into a variable
print(f"Tables in the database are {names}")
if (tbl,) in names:
pass
else:
sys.exit(f"Something wrong with table {tbl}, it doesn't already exist")
return None
def csv_into_table(filename, table):
"""Add csv into the database, csv should have columns named as the table in database,
give table argument as a string"""
try:
with conn:
csv_file = pd.read_csv(
filename, sep=";", encoding="latin1"
) # latin1 Western Europe
csv_file.to_sql(table, conn, if_exists="append", index=False)
except sqlite3.Error:
raise
return None
def load_data(filenm, tb):
check_table_exists(tb)
csv_into_table(filenm, tb)
return None
def get_gt_csv(file):
"""Queries and reshape to output a csv containing genotypes at each marker for
each individual with its short_id and population"""
query = """SELECT Samples.Sample_id, Samples.Short_sid, Samples.Pop_name,
Genotypes.Marker_id, Genotypes.Gt
FROM Samples, Genotypes
WHERE Samples.Sample_id = Genotypes.Sample_id"""
try:
with conn:
df = pd.read_sql_query(query, conn)
except sqlite3.Error:
raise
df2 = df.pivot_table( # long to wide format
index=["Sample_id", "Short_sid", "Pop_name"],
columns="Marker_id",
values="Gt", # str so need to adjust aggfunc
aggfunc=lambda x: " ".join(
x
), # if several values for same index, will concatenate with space
)
df2.to_csv(file, sep=";")
print(
f"Pulling out all genotypes from all individuals x markers into csv {file}..."
)
return None
load_data("/mnt/c/Users/alexa/Matis/SQL/results/04_06_20/HerSNP_long.csv", "Genotypes")
load_data(
"/mnt/c/Users/alexa/Matis/SQL/data/herring/Species_id_codes.csv", "Species_id_codes"
)
load_data("/mnt/c/Users/alexa/Matis/SQL/data/herring/Samples.csv", "Samples")
load_data(
"/mnt/c/Users/alexa/Matis/SQL/data/herring/Hafro_samples_info.csv",
"Hafro_samples_info",
)
get_gt_csv("test_gt.csv")
conn.close()