knitr::opts_chunk$set(echo = TRUE, warning = F, message = F)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
d = read.csv('ProstateData.csv')
str(d)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
Categorical
d = d %>%
mutate(svi = factor(svi, label=c("svi0", "svi1"))) # converting categoorical to nominal
str(d)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : Factor w/ 2 levels "svi0","svi1": 1 1 1 1 1 1 1 1 1 1 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
contrasts(d$svi)
## svi1
## svi0 0
## svi1 1
d %>%
ggplot(aes(sample = lpsa)) + geom_qq()

d %>%
select(-svi, -train) %>%
cor()
## lcavol lweight age lbph lcp gleason
## lcavol 1.0000000 0.28052138 0.2249999 0.027349703 0.675310484 0.43241706
## lweight 0.2805214 1.00000000 0.3479691 0.442264399 0.164537142 0.05688209
## age 0.2249999 0.34796911 1.0000000 0.350185896 0.127667752 0.26889160
## lbph 0.0273497 0.44226440 0.3501859 1.000000000 -0.006999431 0.07782045
## lcp 0.6753105 0.16453714 0.1276678 -0.006999431 1.000000000 0.51483006
## gleason 0.4324171 0.05688209 0.2688916 0.077820447 0.514830063 1.00000000
## pgg45 0.4336522 0.10735379 0.2761124 0.078460018 0.631528246 0.75190451
## lpsa 0.7344603 0.43331938 0.1695928 0.179809404 0.548813175 0.36898681
## pgg45 lpsa
## lcavol 0.43365225 0.7344603
## lweight 0.10735379 0.4333194
## age 0.27611245 0.1695928
## lbph 0.07846002 0.1798094
## lcp 0.63152825 0.5488132
## gleason 0.75190451 0.3689868
## pgg45 1.00000000 0.4223159
## lpsa 0.42231586 1.0000000
d %>%
ggplot(aes(x=svi, y=lpsa, fill = svi)) + geom_boxplot()

library(broom)
d %>%
do(tidy(t.test(lpsa~svi, data=.))) %>%
select(p.value)
## # A tibble: 1 × 1
## p.value
## <dbl>
## 1 0.0000000788
trainD = d %>%
filter(train == T) %>%
select(-train)
testD = d %>%
filter(train == F) %>%
select(-train)
library(leaps)
model = regsubsets(lpsa ~ . , data = trainD, method = "exhaustive")
summary(model)
## Subset selection object
## Call: regsubsets.formula(lpsa ~ ., data = trainD, method = "exhaustive")
## 8 Variables (and intercept)
## Forced in Forced out
## lcavol FALSE FALSE
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svisvi1 FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## lcavol lweight age lbph svisvi1 lcp gleason pgg45
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " "*" " " " " " "
## 4 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 5 ( 1 ) "*" "*" " " "*" "*" " " " " "*"
## 6 ( 1 ) "*" "*" " " "*" "*" "*" " " "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
summary(model)$adjr2
## [1] 0.5304013 0.6027172 0.6201758 0.6371877 0.6396181 0.6510880 0.6579833
## [8] 0.6522155
summary(model)$rss
## [1] 44.52858 37.09185 34.90775 32.81499 32.06945 30.53978 29.43730 29.42638
modelSum=summary(model)
which.max(modelSum$adjr2)
## [1] 7
coef(model, which.max(modelSum$adjr2))
## (Intercept) lcavol lweight age lbph svisvi1
## 0.259061747 0.573930391 0.619208833 -0.019479879 0.144426474 0.741781258
## lcp pgg45
## -0.205416986 0.008944996