forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
PA1_template.Rmd
76 lines (61 loc) · 2.3 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Reproducible Research: Peer Assessment 1
## Loading and preprocessing the data
```{r}
# lubridate supplies ymd(), used below to parse the date column.
library(lubridate)
# Read activity.csv straight out of the zip archive, without extracting it.
data <- read.csv(unz("activity.zip", "activity.csv"))
# Replace the raw `date` column with the result of ymd() on it.
data$date <- ymd(data$date)
```
## What is mean total number of steps taken per day?
```{r}
library(plyr)
# Total steps per day, computed only from intervals that were actually
# recorded. Summing with na.rm = TRUE instead would report a total of 0 for
# any day whose readings are all missing (the sum of an empty set is 0),
# which adds spurious zero-step days and drags the mean and median down.
dataPerDay <- ddply(
  data[!is.na(data$steps), ],
  .(date),
  summarize,
  stepsSum = sum(steps)
)
# Distribution of the daily totals, followed by their mean and median.
hist(
  dataPerDay$stepsSum,
  main = "Total number of steps taken per day",
  xlab = "Steps per day"
)
mean(dataPerDay$stepsSum)
median(dataPerDay$stepsSum)
```
## What is the average daily activity pattern?
### Plot
```{r}
# Mean number of steps for each interval, taken across all days; missing
# readings are left out of the average.
dataPerInterval <- ddply(
  .data = data,
  .variables = .(interval),
  .fun = summarize,
  stepsMean = mean(steps, na.rm = TRUE)
)
# Line plot of the per-interval mean against the interval identifier.
plot(dataPerInterval$interval, dataPerInterval$stepsMean, type = "l")
```
### Interval with maximum steps
```{r}
# Interval whose mean step count is the largest. which.max() locates the
# (first) maximum directly, avoiding an `==` comparison between floating-point
# means, and the column is selected by name rather than by position.
dataPerInterval$interval[which.max(dataPerInterval$stepsMean)]
```
## Imputing missing values
How many NAs are there?
```{r}
# Count the rows whose `steps` value is missing.
sum(is.na(data[["steps"]]))
```
We will use the mean of the interval to impute the missing value.
```{r}
library(ggplot2)

# Attach each interval's mean (stepsMean) to every observation of that
# interval so it can serve as the fill-in value.
imputedData <- merge(data, dataPerInterval, by = "interval")
# Substitute NAs with the mean for the respective interval.
isMissing <- is.na(imputedData$steps)
imputedData$steps[isMissing] <- imputedData$stepsMean[isMissing]
# Remove the helper column by name; dropping it by position would silently
# remove the wrong column if the column layout ever changed.
imputedData$stepsMean <- NULL
imputedDataPerDay <- ddply(
  imputedData,
  .(date),
  summarize,
  stepsSum = sum(steps, na.rm = TRUE)
)

# Prepare data for visual comparison: stack both sets of daily totals and
# tag every row with the data set it came from.
imputedDataPerDayComparison <- imputedDataPerDay
imputedDataPerDayComparison$kind <- "imputed"
dataPerDayComparison <- dataPerDay
dataPerDayComparison$kind <- "original"
comparison <- rbind(imputedDataPerDayComparison, dataPerDayComparison)

# position = "identity" overlays the two histograms. With the default
# (stacked) position the bars of one data set sit on top of the other's,
# so their heights cannot be compared directly.
ggplot(comparison, aes(x = stepsSum, fill = kind)) +
  geom_histogram(
    alpha = 0.2,
    position = "identity",
    binwidth = diff(range(comparison$stepsSum)) / 10
  )

# Mean and median of the daily totals, imputed vs. original.
c(
  imputed = mean(imputedDataPerDay$stepsSum),
  original = mean(dataPerDay$stepsSum)
)
c(
  imputed = median(imputedDataPerDay$stepsSum),
  original = median(dataPerDay$stepsSum)
)
```
## Are there differences in activity patterns between weekdays and weekends?
```{r}
# Label every observation as "Weekday" or "Weekend". The day of the week is
# read from the numeric POSIXlt `wday` field (0 = Sunday, 6 = Saturday)
# rather than from weekdays(), whose day names depend on the session locale
# and would never match 'Saturday'/'Sunday' in a non-English locale.
isWeekend <- as.POSIXlt(imputedData$date)$wday %in% c(0L, 6L)
imputedData$day <- factor(
  ifelse(isWeekend, "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
# Mean steps per interval, computed separately for weekdays and weekends.
imputedDataPerInterval <- ddply(
  imputedData,
  .(interval, day),
  summarize,
  steps = mean(steps)
)
# One line per day type so the two activity patterns can be compared.
ggplot(imputedDataPerInterval, aes(x = interval, y = steps, color = day)) +
  geom_line()
```