-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-CleanData.R
164 lines (140 loc) · 9.25 KB
/
01-CleanData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Introduction ------------------------------------------------------------
#
# 2018-02-14: The data supplied by AP were inconsistent with the database
# entries. This was due in part to the fact that some isolates had
# their identifiers changed in the Steadman lab database and others
# had perished after genotyping and were discarded. We now have two
# data sets that we will use to update the data to use for the
# analysis.
#
# Setup -------------------------------------------------------------------
library("poppr")
library("tidyverse")
library("readxl")
library("lubridate")
if (!interactive()) options(width = 200)
enc <- getOption("encoding")
options(encoding = "iso-8859-1")
# Merging Data ------------------------------------------------------------
genotypes <- read.genalex(here::here("data/data.csv"), ploidy = 1) %>%
genind2df() %>%
rownames_to_column("AP-GenoID") %>%
as_tibble()
genotypes
#summarizing number per isolates per population
genotypes.summary.Brazil <- genotypes %>% group_by( pop) %>% summarize(Number_isolates=n())
save(list = ls(all=TRUE), file= "data/genotypes.summary.Brazil.Rdata")
rm(list = ls())
# reading in the excel sheet has its own problems since the date column contains
# part dates and part text and they get screwed up no matter what you do. The
# way I've dealt with this: import as dates and then convert what didn't parse
# into the number of days since 1899-12-30
metadata <- read_excel(here::here("data/MasterGenoMCGDataBrazilPaper2018.xlsx"),
col_types = "text",
na = c("NA", "")) %>%
mutate(date = as.Date(parse_date_time(`JRS-Collection Date`, c("mdy", "y")))) %>%
mutate(date = case_when(
is.na(date) ~ as.Date("1899-12-30") + days(as.integer(`JRS-Collection Date`)),
TRUE ~ date
))
metadata
full_data <- left_join(metadata, genotypes, by = "AP-GenoID")
# Data Cleaning -----------------------------------------------------------
#
# Because there are discrepancies between the locations, we will rely on the
# location information from the JRS database. Unfortunately, there is no
# consistent pattern in naming, so will will manually create a table from the
# data and use that to match genotypes. Those without region names will have the
# country or state name in place.
full_data %>%
select(`JRS-Geographical Location`, pop) %>%
distinct() %>%
mutate(reglen = max(nchar(`JRS-Geographical Location`), na.rm = TRUE) - nchar(`JRS-Geographical Location`)) %>%
filter(!is.na(`JRS-Geographical Location`)) %>%
rowwise() %>%
mutate(trail = paste(rep(" ", reglen), collapse = "")) %>%
glue::glue_data("'{`JRS-Geographical Location`}'{trail} ~ '{pop}',")
full_data <- mutate(full_data,
continent_country_state_region = case_when(
# Because there is only one point from CO in the data, we will compress both
# CO and NE into a single region called "Midwest"
`JRS-Geographical Location` == 'Greeley, CO' ~ 'North America_United States_Midwest_Greeley, CO',
`JRS-Geographical Location` == 'Ithaca, NE' ~ 'North America_United States_Midwest_Ithaca, NE',
`JRS-Geographical Location` == 'Platte Co., NE' ~ 'North America_United States_Midwest_Platte Co., NE',
`JRS-Geographical Location` == 'Tekamah, Burt Co., NE' ~ 'North America_United States_Midwest_Tekamah, NE',
`JRS-Geographical Location` == 'Saunders Co., NE' ~ 'North America_United States_Midwest_Saunders Co., NE',
`JRS-Geographical Location` == 'Bellwood, NE' ~ 'North America_United States_Midwest_Bellwood, NE',
`JRS-Geographical Location` == 'Herman, NE' ~ 'North America_United States_Midwest_Herman, NE',
`JRS-Geographical Location` == 'Ord, NE' ~ 'North America_United States_Midwest_Ord, NE',
`JRS-Geographical Location` == 'UNL PN, Lincoln, NE' ~ 'North America_United States_Midwest_Lincoln, NE',
`JRS-Geographical Location` == 'Mead, Nebraska' ~ 'North America_United States_Midwest_Mead, NE',
`JRS-Geographical Location` == 'Ewing, NE' ~ 'North America_United States_Midwest_Ewing, NE',
`JRS-Geographical Location` == 'Auburn, NE' ~ 'North America_United States_Midwest_Auburn, NE',
`JRS-Geographical Location` == 'Argentina' ~ 'South America_Argentina_Argentina_Argentina',
`JRS-Geographical Location` == 'Rio Verde/GO, Brazil' ~ 'South America_Brazil_Goiás_Rio Verde',
`JRS-Geographical Location` == 'Campo Mourão/PR, Brazil' ~ 'South America_Brazil_Paraná_Campo Mourão',
`JRS-Geographical Location` == 'São Miguel do Passo Quatro/GO' ~ 'South America_Brazil_Goiás_São Miguel do Passo',
`JRS-Geographical Location` == 'Pinhão/PR, Brazil' ~ 'South America_Brazil_Paraná_Pinhão', # Note: in Anthony's data, one isolate labeled as South America_Brazil_Goiás
`JRS-Geographical Location` == 'Formoso, GO, Brazil' ~ 'South America_Brazil_Goiás_Formoso',
`JRS-Geographical Location` == 'Guarapuava, PR, Brazil' ~ 'South America_Brazil_Paraná_Guarapuava',
`JRS-Geographical Location` == 'Luiz Eduardo Magalhães/BA, Brazil' ~ 'South America_Brazil_Bahia_Bahia',
`JRS-Geographical Location` == 'Mauá da Serra/PR, Brazil' ~ 'South America_Brazil_Paraná_Mauá da Serra',
`JRS-Geographical Location` == 'Nᾶo me Toque/Rio Grande do Sul, Brazil' ~ 'South America_Brazil_Rio Grande do Sul_Não me Toque',
`JRS-Geographical Location` == 'Sᾶo Desidério/Bahia, Brazil' ~ 'South America_Brazil_Bahia_São Desidério',
`JRS-Geographical Location` == 'Jataí/GO, Brazil' ~ 'South America_Brazil_Goiás_Jataí',
`JRS-Geographical Location` == 'Cristalina/GO, Brazil' ~ 'South America_Brazil_Goiás_Cristalina',
`JRS-Geographical Location` == 'Formosa/GO, Brazil' ~ 'South America_Brazil_Goiás_Formosa',
`JRS-Geographical Location` == 'Sudeste/GO, Brazil' ~ 'South America_Brazil_Goiás_Sudeste',
`JRS-Geographical Location` == 'Uberlândia/MG, Brazil' ~ 'South America_Brazil_Minas Gerais_Uberlândia',
`JRS-Geographical Location` == 'Correntina/BA, Brazil' ~ 'South America_Brazil_Bahia_Correntina',
`JRS-Geographical Location` == 'Bahia, Brazil' ~ 'South America_Brazil_Bahia_Bahia',
`JRS-Geographical Location` == 'Vacaria/RS, Brazil' ~ 'South America_Brazil_Rio Grande do Sul_Vacaria',
`JRS-Geographical Location` == 'Coxilha/RS, Brazil' ~ 'South America_Brazil_Rio Grande do Sul_Coxilha',
`JRS-Geographical Location` == 'Faxinal/PR, Brazil' ~ 'South America_Brazil_Paraná_Faxinal',
`JRS-Geographical Location` == 'Chapadão do Sul/MS, Brazil' ~ 'South America_Brazil_Mato Grosso do Sul_Chapadão do Sul',
TRUE ~ `JRS-Geographical Location`
))
# Saving Cleaned Data as CSV -----------------------------------------------
#
# Now that the data are cleaned, I will save the important bits for reproduction
# as both a CSV and a genclone object.
clean_data <- full_data %>%
select(GenoID = `AP-GenoID`,
MCG,
Year = date,
continent_country_state_region,
matches("\\d-\\d")) %>%
mutate(Year = year(Year)) %>%
filter(!is.na(continent_country_state_region)) %>% # remove isolate that has no info
separate(continent_country_state_region,
c("Continent", "Country", "Population", "Subpop"),
sep = "_") %>%
write_csv(path = here::here("data/clean-genotypes.csv"))
# Head off any encoding issues
readLines(here::here("data/clean-genotypes.csv")) %>%
iconv(from = "UTF-8", to = "ISO-8859-1") %>%
writeLines(con = here::here("data/clean-genotypes.csv"))
print(clean_data, n = 100)
# Converting to genclone, adding Repeat Lengths, Palette ------------------
gid <- df2genind(select(clean_data, matches("\\d-\\d")),
ploidy = 1,
ind.names = clean_data$GenoID,
strata = select(clean_data,
GenoID, Continent, Country, Population, Subpop, MCG, Year)) %>%
as.genclone() %>%
setPop(~Population)
# This is a color-blind friendly palette
other(gid)$palette <- c("Midwest" = "#000000",
"Argentina" = "#F0E442", # "#E69F00",
"Bahia" = "#56B4E9",
"Goiás" = "#009E73",
"Mato Grosso do Sul" = "#E69F00",
"Minas Gerais" = "#0072B2",
"Paraná" = "#D55E00",
"Rio Grande do Sul" = "#CC79A7")
# These are the repeat lengths that we are correcting to avoid rounding errors
(other(gid)$REPLEN <- fix_replen(gid, c(2, 6, 2, 2, 2, 2, 4, 4, 4, 4, 3)))
write_rds(gid, path = here::here("data/full-genclone-object.rds"))
# Session Information -----------------------------------------------------
sessioninfo::session_info()
options(encoding = enc)