-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_VOCs
44 lines (36 loc) · 2.16 KB
/
filter_VOCs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env bash
#Outputs a list of VOCs and a lookup table of the matching LIMS IDs
#You need: *metadata.tsv (download "Input for Augur Pipeline" from GISAID.org) and LIMS.tsv (3 columns: the first is the LIMS ID, the third is the external strain name)
#Outputs are VOCs_output.tsv (all VOCs and their metadata) and lookup_output.tsv (LIMS rows matched to external strain names, so that you can fill in column 2 of VOCs_output.tsv)
#FILTER THE METADATA TSV FILE
#Filter out only the VOCs and put in a new file called VOCs.tsv, with the columns in a particular order

#Print the VOC rows of the given metadata file(s) (or stdin) as 8 tab-separated
#columns: metadata $27, blank (LIMS ID, filled in later by hand), $1 (external
#strain name), $5, $19 (lineage), blank (WHO designation), $21, $22.
#OFS is a tab: writing print $a, "\t", $b would join the pieces with the default
#OFS (a space) and pad every field with spaces.
#Lineage patterns are anchored with the dots escaped so that only B.1.617.2,
#AY.*, B.1.1.529 and BA.* are selected; one pass means no row is emitted twice.
select_vocs() {
  awk -F'\t' -v OFS='\t' '
    $19 == "B.1.617.2" || $19 ~ /^AY\./ || $19 == "B.1.1.529" || $19 ~ /^BA\./ {
      print $27, "", $1, $5, $19, "", $21, $22
    }
  ' "$@"
}

#Add WHO designations
#Copy 8-column rows (files or stdin), writing "Delta" or "Omicron" into column 6
#according to the lineage in column 5. Both designations are produced by the same
#command so one cannot overwrite the output of the other.
add_who_designations() {
  awk -F'\t' -v OFS='\t' '
    {
      if ($5 == "B.1.617.2" || $5 ~ /^AY\./) who = "Delta"
      else if ($5 == "B.1.1.529" || $5 ~ /^BA\./) who = "Omicron"
      else next
      print $1, $2, $3, $4, $5, who, $7, $8
    }
  ' "$@"
}

#The ./ prefix keeps a file name starting with "-" from being read as an option
select_vocs ./*metadata.tsv > VOCs.tsv
add_who_designations VOCs.tsv > VOCs2.tsv
#Take external strain names from VOCs.tsv and put in separate file lookup.tsv
awk -F'\t' '{print $3}' VOCs.tsv > lookup.tsv
#trim function
#Joins all arguments into one string, strips whitespace from both ends and
#prints the result followed by a newline.
trim() {
  local text="$*"
  # everything before the first non-whitespace character
  local leading="${text%%[![:space:]]*}"
  text="${text#"$leading"}"
  # everything after the last non-whitespace character
  local trailing="${text##*[![:space:]]}"
  text="${text%"$trailing"}"
  printf '%s\n' "$text"
}
#Remove any white space in lookup.tsv
#Read names from stdin and print each one trimmed, one per line.
#Names are passed to trim quoted so they are not word-split or glob-expanded.
#Blank lines are skipped: an empty pattern given to grep -f matches every line.
clean_lookup_names() {
  local line
  # the || test keeps a final line that has no trailing newline
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ "$line" == *[![:space:]]* ]] || continue
    trim "$line"
  done
}

#Print the tab-separated file $1 sorted by column 3 only, compared as text.
#-t sets the tab as field separator and -k3,3 ends the key at column 3; a
#numeric (-n) comparison would treat every non-numeric name as 0.
sort_by_strain() {
  sort -t $'\t' -k3,3 -- "$1"
}

#lookup2.tsv is written with > so output from an earlier run is not kept
clean_lookup_names < lookup.tsv > lookup2.tsv
#Show any rows in the reference table that matches an item in the lookup list
grep -F -f lookup2.tsv -- LIMS.tsv > lookup3.tsv
#Sort output files by external strain name (column 3)
sort_by_strain VOCs2.tsv > VOCs_output.tsv
sort_by_strain lookup3.tsv > lookup_output.tsv
#Remove unneeded files
rm -f -- VOCs.tsv VOCs2.tsv lookup.tsv lookup2.tsv lookup3.tsv