-
Notifications
You must be signed in to change notification settings - Fork 1
/
liberia_data_munging.R
126 lines (118 loc) · 4.23 KB
/
liberia_data_munging.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Show the starting working directory (auto-printed when run at top level).
getwd()

# Folder where all the csv files are stored.
path <- "C:/Users/user/Documents/liberia_data"
# The rest of the script reads and writes files relative to this directory.
setwd(path)

# Only pick up CSV inputs, and skip this script's own output file so that a
# re-run does not try to ingest the previously written result.
filenames <- setdiff(
  list.files(pattern = "\\.csv$", ignore.case = TRUE),
  "liberia_clean.csv"
)

# Row counts of the files that share the standard layout; a 32-row file is
# trimmed back to 31 rows by dropping its last row.
standard_row_counts <- c(31L, 32L)

# Turn every standard-layout file into a single wide row (date + one column
# per entry of `Variable`). Files with any other row count are reported and
# skipped (returned as NULL).
rows <- lapply(filenames, function(filename) {
  temp_df <- read.csv(filename, header = TRUE, stringsAsFactors = FALSE)
  n_rows <- nrow(temp_df)

  if (!(n_rows %in% standard_row_counts)) {
    # only one file with nrow == 39 is expected to end up here
    print(filename)
    print(n_rows)
    print("Needs to be manually modified")
    return(NULL)
  }

  if (n_rows == 32) {
    temp_df <- temp_df[-32, ]
  }

  date <- temp_df[1, 1]          # first cell of the first column
  temp_df <- temp_df[, -1]       # drop that first column
  column_names <- temp_df$Variable
  # only looking at National - 2nd column; t() of a vector gives one row
  wide <- as.data.frame(t(temp_df[, 2]))
  colnames(wide) <- column_names
  cbind(date = date, wide[1, ])
})

# Bind all rows in one call instead of growing `df` inside the loop.
rows <- unname(Filter(Negate(is.null), rows))
df <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()
rm(rows) # for memory purposes, garbage cleaning
# One file has different column names from the rest, so it is reshaped on its
# own and then merged into df by column name.
filename <- "2014-10-04-v142.csv"
special_raw <- read.csv(filename, header = TRUE, stringsAsFactors = FALSE)

date <- special_raw[1, 1]            # first cell of the first column
special_long <- special_raw[, -1]    # everything except that first column
column_names <- special_long$Variable

# only looking at National - 2nd column; transposing yields a single row
special_row <- as.data.frame(t(special_long[, 2]))
colnames(special_row) <- column_names
special_row <- special_row[1, ]
special_row <- cbind(date, special_row) # add date as first column

# smartbind: rbind of data frames even if the column names don't match;
# columns missing from one side are filled with NA.
library("gtools")
df <- smartbind(df, special_row)
rm(special_raw, special_long, special_row) # for memory purposes

# remove last 5 extra columns (keep the first 32)
df <- df[, 1:32]
rownames(df) <- NULL
#convert to date
## ** How to use select_formats **
## By default, %Y has precedence over %y:
# parse_date_time(c("27-09-13", "27-09-2013"), "dmy")
## [1] "13-09-27 UTC" "2013-09-27 UTC"
## To give priority to the %y format, define your own select_formats function:
# Pick one format from a named vector of candidate formats.
#
# trained: a vector whose names are strptime-style format strings.
# Returns the name of the highest-scoring format (first one on ties).
my_select <- function(trained) {
  fmt_names <- names(trained)
  # Number of "%" specifiers in each format: strip everything that is not "%".
  spec_count <- nchar(gsub("[^%]", "", fmt_names))
  # Formats containing a two-digit year (%y) get a 1.5 bonus so they outrank
  # otherwise equal formats.
  short_year_bonus <- grepl("%y", fmt_names) * 1.5
  best <- which.max(spec_count + short_year_bonus)
  names(trained[best])
}
# Parse the date strings using month-day-year orders; my_select decides which
# of the candidate formats is used.
library(lubridate)
df[["date"]] <- parse_date_time(
  as.character(df[["date"]]),
  orders = "mdy",
  select_formats = my_select
)

# Sort rows by date, then write the result to the working directory.
df <- df[order(df[["date"]]), ]
write.csv(df, file = "liberia_clean.csv")
#rm(list=setdiff(ls(), "df")) #remove all variables except df
#Function from http://stackoverflow.com/questions/3402371/rbind-different-number-of-columns
### combines data frames (like rbind) but by matching column names
# columns without matches in the other data frame are still combined
# but with NA in the rows corresponding to the data frame without
# the variable
# A warning is issued if there is a type mismatch between columns of
# the same name and an attempt is made to combine the columns
# combineByName <- function(A,B) {
# a.names <- names(A)
# b.names <- names(B)
# all.names <- union(a.names,b.names)
# print(paste("Number of columns:",length(all.names)))
# a.type <- NULL
# for (i in 1:ncol(A)) {
# a.type[i] <- typeof(A[,i])
# }
# b.type <- NULL
# for (i in 1:ncol(B)) {
# b.type[i] <- typeof(B[,i])
# }
# a_b.names <- names(A)[!names(A)%in%names(B)]
# b_a.names <- names(B)[!names(B)%in%names(A)]
# if (length(a_b.names)>0 | length(b_a.names)>0){
# print("Columns in data frame A but not in data frame B:")
# print(a_b.names)
# print("Columns in data frame B but not in data frame A:")
# print(b_a.names)
# } else if(a.names==b.names & a.type==b.type){
# C <- rbind(A,B)
# return(C)
# }
# C <- list()
# for(i in 1:length(all.names)) {
# l.a <- all.names[i]%in%a.names
# pos.a <- match(all.names[i],a.names)
# typ.a <- a.type[pos.a]
# l.b <- all.names[i]%in%b.names
# pos.b <- match(all.names[i],b.names)
# typ.b <- b.type[pos.b]
# if(l.a & l.b) {
# if(typ.a==typ.b) {
# vec <- c(A[,pos.a],B[,pos.b])
# } else {
# warning(c("Type mismatch in variable named: ",all.names[i],"\n"))
# vec <- try(c(A[,pos.a],B[,pos.b]))
# }
# } else if (l.a) {
# vec <- c(A[,pos.a],rep(NA,nrow(B)))
# } else {
# vec <- c(rep(NA,nrow(A)),B[,pos.b])
# }
# C[[i]] <- vec
# }
# names(C) <- all.names
# C <- as.data.frame(C)
# return(C)
# }