-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
82 lines (66 loc) · 3.75 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(dplyr)
# Loads one raw data set and enriches it with identifying columns.
#
# Arguments:
#   dataSetFilename         - path of the raw measurements file (the file
#                             carries no column names of its own)
#   variableNamesVector     - column names to assign to the measurements
#   activityIDsFilename     - path of the file holding the activity id column
#   activityNamesTable      - lookup table "activity id" ==> "activity name";
#                             its id column must be named "ID"
#   subjectIDColumnFilename - path of the file holding the subject id column
#
# Returns a table made of ACTIVITY_ID, SUBJECT_ID, the named measurement
# columns and whatever further columns activityNamesTable contributes
# (ACTIVITY_NAME when the table is built the way this script builds it).
readEnrichedDataSet <- function(dataSetFilename,
                                variableNamesVector,
                                activityIDsFilename,
                                activityNamesTable,
                                subjectIDColumnFilename) {
  activities <- read.table(activityIDsFilename, col.names = "ACTIVITY_ID")
  subjects <- read.table(subjectIDColumnFilename, col.names = "SUBJECT_ID")

  # read.table is slow but is used on purpose: fread was tried here and
  # crashed with SIGSEGV.
  measurements <- read.table(dataSetFilename, col.names = variableNamesVector)

  # The id columns are attached positionally, so row i of every file is
  # taken to describe the same observation.
  enriched <- cbind(activities, subjects, measurements)

  # Translate activity ids into names; the lookup column is called "ID".
  merge(enriched, activityNamesTable, by.x = "ACTIVITY_ID", by.y = "ID")
}
# Narrows a data set down to SUBJECT_ID, ACTIVITY_NAME and the measurement
# columns whose names contain "mean" or "std" (matched case-sensitively).
# Columns whose names contain "meanFreq" are left out.
# In the returned table "std" is spelled out as "standard_deviation" and
# "mean" as "mean_value" in the column names.
extractMeasuresOfInterest <- function(dataSet) {
  selected <- select(
    dataSet,
    one_of("SUBJECT_ID", "ACTIVITY_NAME"),
    matches("(std|mean)", ignore.case = FALSE),
    -contains("meanFreq")
  )

  # sub() rewrites only the first occurrence within each column name.
  readableNames <- sub("std", "standard_deviation", names(selected))
  readableNames <- sub("mean", "mean_value", readableNames)
  names(selected) <- readableNames

  selected
}
# Collapses the data set to one row per (SUBJECT_ID, ACTIVITY_NAME) pair.
# Every remaining column is replaced by its mean() within the group.
# The SQL equivalent (mean corresponds to avg) would be:
#   SELECT SUBJECT_ID, ACTIVITY_NAME, avg(col1), avg(col2), ..., avg(colN)
#   FROM dataSet
#   GROUP BY SUBJECT_ID, ACTIVITY_NAME
collapseToGroups <- function(dataSet) {
  # Everything except the two grouping columns gets averaged.
  measurements <- select(dataSet, -SUBJECT_ID, -ACTIVITY_NAME)

  # Naming the grouping list gives the first two result columns readable
  # names instead of aggregate()'s default Group.1 / Group.2.
  groupKeys <- list(
    SUBJECT_ID = dataSet$SUBJECT_ID,
    ACTIVITY_NAME = dataSet$ACTIVITY_NAME
  )

  aggregate(measurements, by = groupKeys, FUN = mean)
}
# ---- Input locations (relative to the working directory) ----
TEST_DATASET_FILE <- "UCI HAR Dataset/test/X_test.txt"
TEST_ACTIVITY_IDS_FILE <- "UCI HAR Dataset/test/y_test.txt"
TEST_SUBJECTID_FILENAME <- "UCI HAR Dataset/test/subject_test.txt"
TRAIN_DATASET_FILE <- "UCI HAR Dataset/train/X_train.txt"
TRAIN_ACTIVITY_IDS_FILE <- "UCI HAR Dataset/train/y_train.txt"
TRAIN_SUBJECTID_FILENAME <- "UCI HAR Dataset/train/subject_train.txt"
VARIABLES_NAMES_FILE <- "UCI HAR Dataset/features.txt"
ACTIVITY_NAMES_FILE <- "UCI HAR Dataset/activity_labels.txt"

# ---- Lookup tables ----
# Activity id ==> activity name; the id column is named "ID" because
# readEnrichedDataSet merges on that name.
activityNamesTable <- read.table(ACTIVITY_NAMES_FILE, col.names = c("ID", "ACTIVITY_NAME"))

# The variable names are taken from the second column of the features file.
# "()" is stripped and "-" becomes "_" before the names are used as column
# names.
variableNames <- read.table(VARIABLES_NAMES_FILE, as.is = TRUE)[, 2]
variableNames <- gsub("\\(\\)", "", variableNames)
variableNames <- gsub("-", "_", variableNames)

# ---- Load both partitions ----
cat("reading test dataset\n")
testDataSet <- readEnrichedDataSet(TEST_DATASET_FILE, variableNames, TEST_ACTIVITY_IDS_FILE, activityNamesTable, TEST_SUBJECTID_FILENAME)
cat("reading train dataset\n")
trainDataSet <- readEnrichedDataSet(TRAIN_DATASET_FILE, variableNames, TRAIN_ACTIVITY_IDS_FILE, activityNamesTable, TRAIN_SUBJECTID_FILENAME)

# ---- Combine, summarise, write ----
cat("joining datasets into big one\n")
tidyDataSet <- rbind(extractMeasuresOfInterest(testDataSet), extractMeasuresOfInterest(trainDataSet))
cat("grouping by subject and activity, calculating mean values\n")
tidyDataSet <- collapseToGroups(tidyDataSet)
cat("writing sanitized data into UCI_HAR_tidied.txt\n")
# The argument is spelled out in full (row.names) rather than relying on
# partial argument matching of "row.name".
write.table(tidyDataSet, file = "UCI_HAR_tidied.txt", row.names = FALSE)