knitr::opts_chunk$set(echo = TRUE, warning = F, message = F)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
d = read.csv('ProstateData.csv')
str(d)
## 'data.frame':    97 obs. of  10 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
##  $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...

Categorical

d = d %>%
  mutate(svi = factor(svi, label=c("svi0", "svi1"))) # converting categoorical to nominal  
str(d)
## 'data.frame':    97 obs. of  10 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : Factor w/ 2 levels "svi0","svi1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
##  $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
contrasts(d$svi)
##      svi1
## svi0    0
## svi1    1
d %>%
  ggplot(aes(sample = lpsa)) + geom_qq()

d %>%
  select(-svi, -train) %>%
  cor()
##            lcavol    lweight       age         lbph          lcp    gleason
## lcavol  1.0000000 0.28052138 0.2249999  0.027349703  0.675310484 0.43241706
## lweight 0.2805214 1.00000000 0.3479691  0.442264399  0.164537142 0.05688209
## age     0.2249999 0.34796911 1.0000000  0.350185896  0.127667752 0.26889160
## lbph    0.0273497 0.44226440 0.3501859  1.000000000 -0.006999431 0.07782045
## lcp     0.6753105 0.16453714 0.1276678 -0.006999431  1.000000000 0.51483006
## gleason 0.4324171 0.05688209 0.2688916  0.077820447  0.514830063 1.00000000
## pgg45   0.4336522 0.10735379 0.2761124  0.078460018  0.631528246 0.75190451
## lpsa    0.7344603 0.43331938 0.1695928  0.179809404  0.548813175 0.36898681
##              pgg45      lpsa
## lcavol  0.43365225 0.7344603
## lweight 0.10735379 0.4333194
## age     0.27611245 0.1695928
## lbph    0.07846002 0.1798094
## lcp     0.63152825 0.5488132
## gleason 0.75190451 0.3689868
## pgg45   1.00000000 0.4223159
## lpsa    0.42231586 1.0000000
d %>%
  ggplot(aes(x=svi, y=lpsa, fill = svi)) + geom_boxplot()

library(broom)
d %>% 
  do(tidy(t.test(lpsa~svi, data=.))) %>%
  select(p.value)
## # A tibble: 1 × 1
##        p.value
##          <dbl>
## 1 0.0000000788
trainD = d %>%
  filter(train == T) %>%
  select(-train)

testD = d %>%
  filter(train == F) %>%
  select(-train)
library(leaps)

model = regsubsets(lpsa ~ . , data = trainD, method = "exhaustive")
summary(model)
## Subset selection object
## Call: regsubsets.formula(lpsa ~ ., data = trainD, method = "exhaustive")
## 8 Variables  (and intercept)
##         Forced in Forced out
## lcavol      FALSE      FALSE
## lweight     FALSE      FALSE
## age         FALSE      FALSE
## lbph        FALSE      FALSE
## svisvi1     FALSE      FALSE
## lcp         FALSE      FALSE
## gleason     FALSE      FALSE
## pgg45       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          lcavol lweight age lbph svisvi1 lcp gleason pgg45
## 1  ( 1 ) "*"    " "     " " " "  " "     " " " "     " "  
## 2  ( 1 ) "*"    "*"     " " " "  " "     " " " "     " "  
## 3  ( 1 ) "*"    "*"     " " " "  "*"     " " " "     " "  
## 4  ( 1 ) "*"    "*"     " " "*"  "*"     " " " "     " "  
## 5  ( 1 ) "*"    "*"     " " "*"  "*"     " " " "     "*"  
## 6  ( 1 ) "*"    "*"     " " "*"  "*"     "*" " "     "*"  
## 7  ( 1 ) "*"    "*"     "*" "*"  "*"     "*" " "     "*"  
## 8  ( 1 ) "*"    "*"     "*" "*"  "*"     "*" "*"     "*"
summary(model)$adjr2
## [1] 0.5304013 0.6027172 0.6201758 0.6371877 0.6396181 0.6510880 0.6579833
## [8] 0.6522155
summary(model)$rss
## [1] 44.52858 37.09185 34.90775 32.81499 32.06945 30.53978 29.43730 29.42638
modelSum=summary(model)
which.max(modelSum$adjr2)
## [1] 7
coef(model, which.max(modelSum$adjr2))
##  (Intercept)       lcavol      lweight          age         lbph      svisvi1 
##  0.259061747  0.573930391  0.619208833 -0.019479879  0.144426474  0.741781258 
##          lcp        pgg45 
## -0.205416986  0.008944996