-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
91 lines (76 loc) · 4.27 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
library(reshape2)
# Check to see whether the dataset exists, and if not; download it.
dataFile <- "./data/getdata-projectfiles-UCI HAR Dataset.zip"
if (!file.exists(dataFile))
{
suppressWarnings(dir.create("./data"))
fileURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
download.file(fileURL, destfile = dataFile, method = 'curl')
}
# read the contents of the compressed file into a list. In lieu of extracting
# the contents of the compressed file, the list will serve as an index to the
dataFileContents <- unzip(dataFile, list = TRUE)
# extract the README.txt and features_info.txt
unzip(dataFile, dataFileContents[4, 1], exdir = './data')
unzip(dataFile, dataFileContents[3, 1], exdir = './data')
# read the training activity labels, subjects and data.
trainingLabels <- read.table(unz(dataFile, dataFileContents[32, 1]),
stringsAsFactors = FALSE)
trainingSet <- read.table(unz(dataFile, dataFileContents[31, 1]),
stringsAsFactors = FALSE)
trainingSubject <- read.table(unz(dataFile, dataFileContents[30, 1]),
stringsAsFactors = FALSE)
# read the testing activity labels, subjects and data.
testingLabels <- read.table(unz(dataFile, dataFileContents[18, 1]),
stringsAsFactors = FALSE)
testingSet <- read.table(unz(dataFile, dataFileContents[17, 1]),
stringsAsFactors = FALSE)
testingSubject <- read.table(unz(dataFile, dataFileContents[16, 1]),
stringsAsFactors = FALSE)
# read the activity names and feature labels
activityLabels <- read.table(unz(dataFile, dataFileContents[1, 1]),
stringsAsFactors = FALSE)
featuresLabels <- read.table(unz(dataFile, dataFileContents[2, 1]),
stringsAsFactors = FALSE)
# close any open connections
closeAllConnections()
# add the subject and activityCode columns to their respective datasets. The
# activityCode is stored as a factor
trainingSet$subject <- trainingSubject$V1
trainingSet$activityCode <- as.factor(trainingLabels$V1)
testingSet$subject <- testingSubject$V1
testingSet$activityCode <- as.factor(testingLabels$V1)
# merge the testing and training datasets, convert the subject to a factor.
mergedSet <- rbind(trainingSet, testingSet)
mergedSet$subject <- as.factor(mergedSet$subject)
# label the 'feature' variables
colnames(mergedSet)[1:561] <- featuresLabels$V2
# apply column names to activity labels, format the activity text, and map to
# their corresponding values in the mergedSet
activityLabels$V2 <- tolower(activityLabels$V2)
activityLabels$V2 <- gsub("(_.)", "\\U\\1", activityLabels$V2, perl = TRUE)
activityLabels$V2 <- gsub("_", "", activityLabels$V2)
colnames(activityLabels) <- c("activityCode", "activity")
mergedSet$activityCode <- plyr::mapvalues(mergedSet$activityCode,
from = c(activityLabels$activityCode),
to = c(activityLabels$activity))
names(mergedSet)[names(mergedSet) == 'activityCode'] <- 'activity'
# create an index of the columns to be kept, and trim the dataset. Note that
# (1) only column names that match mean() and std() are kept.
# (2) clean names remove special characters, and the clean names are re-written
# as the variable names.
patterns <- c("^subject$", "^activity$", "\\bmean()\\b", "\\bstd()\\b")
index <- grep(paste(patterns, collapse = "|"), colnames(mergedSet))
trimmedSet <- mergedSet[index]
cleanNames <- gsub("(-.)", "\\U\\1", colnames(trimmedSet), perl = TRUE)
cleanNames <- gsub("-|\\(\\)", "", cleanNames)
colnames(trimmedSet) <- cleanNames
# (1) melt the trimmed data into a long set about id variables subject and activity.
# (2) calculate the mean and cast the melted data into a wide set similar to the
# original input data
# (3) order the data by subject and then activity.
# (4) write the data to a file named tidySet.txt
meltedSet <- melt(trimmedSet, id.vars = c("subject", "activity"))
castedSet <- dcast(meltedSet, subject + activity ~ variable, mean)
tidySet <- castedSet[order(castedSet$subject, castedSet$activity), ]
write.table(tidySet, file = "./tidySet.txt", row.names = FALSE)