-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
105 lines (75 loc) · 3.25 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
## run_analysis.R
## Getting and Cleaning Data Course Project
## Ramon Perez Hernandez
# **********
# * TASK 1 *
# **********
# "Merge the training and the test sets to create one data set"
# Download the data archive and extract it into the working directory.
# The two checks are deliberately independent: the archive is fetched only
# when "data.zip" is missing, and it is unpacked whenever the
# "UCI HAR Dataset" folder is missing -- including the case where the archive
# is already present from an earlier run but was never extracted.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
name <- "data.zip"
if (!file.exists(name)) {
  # Use R's default download method rather than forcing "curl", which needs an
  # external curl binary; mode = "wb" writes the archive in binary mode.
  download.file(url, destfile = name, mode = "wb")
}
if (!file.exists("UCI HAR Dataset")) {
  unzip(name)
}
# Build the merged data frame. For each split the three files below are read
# and bound side by side, in this order:
#   1. subject_<split>.txt -- subject who performed the activity
#   2. y_<split>.txt       -- activity
#   3. X_<split>.txt       -- measures
train_files <- c("UCI HAR Dataset/train/subject_train.txt",
                 "UCI HAR Dataset/train/y_train.txt",
                 "UCI HAR Dataset/train/X_train.txt")
test_files <- c("UCI HAR Dataset/test/subject_test.txt",
                "UCI HAR Dataset/test/y_test.txt",
                "UCI HAR Dataset/test/X_test.txt")
# Read every file of a split, then column-bind the resulting data frames.
train_df <- do.call(cbind, lapply(train_files, read.table))
test_df <- do.call(cbind, lapply(test_files, read.table))
# Stack the train rows on top of the test rows.
df <- rbind(train_df, test_df)
# **********
# * TASK 2 *
# **********
# "Extract only the measurements on the mean and standard deviation for each measurement"
# features.txt holds the names of the measures stored in X_train.txt and
# X_test.txt; keep its second column (V2) as a character vector.
feat_names <- as.character(read.table("UCI HAR Dataset/features.txt")$V2)
# Flag the names containing the literal text "mean()" or "std()".
is_mean_or_std <- grepl("mean\\(\\)|std\\(\\)", feat_names)
# Shift the matching indices by 2: columns 1 and 2 of df are the subject and
# the activity, so feature i sits in column i + 2.
positions <- which(is_mean_or_std) + 2
# Keep the subject, the activity and the selected measure columns.
df <- df[, c(1:2, positions)]
# **********
# * TASK 3 *
# **********
# "Use descriptive activity names to name the activities in the data set"
# Read activity_labels.txt: column V1 is the activity code used in df's second
# column and column V2 is the matching descriptive name.
act_labels <- read.table("UCI HAR Dataset/activity_labels.txt")
act_names <- as.character(act_labels$V2)
# Convert df's second column into a factor, pairing each code with its own
# name explicitly. Assigning the names to the levels by position would depend
# on the file being sorted by code and on every code occurring in the data.
df[, 2] <- factor(df[, 2], levels = act_labels$V1, labels = act_names)
# **********
# * TASK 4 *
# **********
# "Appropriately label the data set with descriptive variable names"
# Columns 1 and 2 are named "subject" and "activity". The remaining columns
# take the feature names they were selected by; "positions" is offset by 2
# relative to feat_names, hence the subtraction.
selected_features <- feat_names[positions - 2]
names(df) <- c("subject", "activity", selected_features)
# **********
# * TASK 5 *
# **********
# "From the data set in step 4, creates a second, independent tidy data set
# with the average of each variable for each activity and each subject"
# dplyr provides group_by/summarise/across, used to average every measure
# column within each (subject, activity) pair.
library(dplyr)
# across(everything(), mean) replaces the deprecated summarise_each(funs(mean))
# and requires dplyr >= 1.0.0; grouping columns are excluded automatically.
# .groups = "drop" returns an ungrouped result.
tidy_df <- df %>%
  group_by(subject, activity) %>%
  summarise(across(everything(), mean), .groups = "drop")
# Prefix every averaged measure name with "MEAN-".
colnames(tidy_df) <- c("subject", "activity",
                       paste0("MEAN-", feat_names[positions - 2]))
# Save tidy_df into the "tidy_df.txt" file, without row names.
write.table(tidy_df, "tidy_df.txt", row.names = FALSE)