Tests for Proportions & Logistic Regression

# Install and load libraries.
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(plyr)
# Import data from 'homework5data.xlsx' using xlsx function.
pain.data <- read.xlsx("pain.xlsx", sheetName = "pain", header = TRUE)
head(pain.data) ; tail(pain.data)
##       Treatment Age Severe Pain
## 1 New Treatment  29      1    1
## 2 New Treatment  39      0    0
## 3 New Treatment  43      0    0
## 4 New Treatment  27      0    1
## 5 New Treatment  16      1    1
## 6 New Treatment  39      0    0
##     Treatment Age Severe Pain
## 195   Placebo  57      0    0
## 196   Placebo  66      1    0
## 197   Placebo  56      0    0
## 198   Placebo  50      0    0
## 199   Placebo  39      0    1
## 200   Placebo  49      0    0
# View summary of pain.data
summary(pain.data)
##          Treatment        Age            Severe           Pain     
##  New Treatment:100   Min.   :10.00   Min.   :0.000   Min.   :0.00  
##  Placebo      :100   1st Qu.:38.75   1st Qu.:0.000   1st Qu.:0.00  
##                      Median :48.00   Median :0.000   Median :0.00  
##                      Mean   :47.60   Mean   :0.275   Mean   :0.26  
##                      3rd Qu.:56.00   3rd Qu.:1.000   3rd Qu.:1.00  
##                      Max.   :81.00   Max.   :1.000   Max.   :1.00
str(pain.data)
## 'data.frame':    200 obs. of  4 variables:
##  $ Treatment: Factor w/ 2 levels "New Treatment",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age      : num  29 39 43 27 16 39 51 35 39 52 ...
##  $ Severe   : num  1 0 0 0 1 0 1 0 1 0 ...
##  $ Pain     : num  1 0 0 1 1 0 0 0 0 0 ...
#-------------
# Question 1.
#-------------
# Summarize data relating to pain by treatment type.
ddply(pain.data, "Treatment", summarise, 
      N = length(Pain),
      Mild_to_No = length(Pain[Pain == 0]),
      Mod_to_Sev = length(Pain[Pain == 1]),
      P_Mild_to_No = Mild_to_No / N,
      P_Mod_to_Sev = Mod_to_Sev / N)
##       Treatment   N Mild_to_No Mod_to_Sev P_Mild_to_No P_Mod_to_Sev
## 1 New Treatment 100         83         17         0.83         0.17
## 2       Placebo 100         65         35         0.65         0.35
# Generate a mosaic plot for pain score by treatment type.
x <- matrix(table(pain.data$Treatment, pain.data$Pain), ncol = 2, byrow = TRUE)
colnames(x) <- c("New Treatment", "Placebo")
rownames(x) <- c("Mild-to-None (0)", "Moderate-to-Severe (1)")
mosaicplot(x, dir = c("h", "v"),
           main = "Mosaic Plot for Treatment vs. Pain",
           xlab = "Treatment Type", ylab = "Pain Score", type = "n",
           col = c("steelblue1", "slateblue3"))

#-------------
# Question 2.
#-------------
# Determine test statistic to compare against.
qnorm(0.975)
## [1] 1.959964
# Test moderate-to-severe pain between treatment groups.
prop.test(c(17, 35), c(100, 100), alternative = "two.sided",
          conf.level = 0.95, correct = FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(17, 35) out of c(100, 100)
## X-squared = 8.42, df = 1, p-value = 0.003711
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.29899419 -0.06100581
## sample estimates:
## prop 1 prop 2 
##   0.17   0.35
#-------------
# Question 3.
#-------------
# Perform logistic regression with treatment as only explantory variable.
# Control new treatment variable for regression.
pain.data$Treatment_New <- ifelse(pain.data$Treatment == "New Treatment", 1, 0)
m1 <- glm(pain.data$Pain ~ pain.data$Treatment_New, family = binomial)
summary(m1)
## 
## Call:
## glm(formula = pain.data$Pain ~ pain.data$Treatment_New, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9282  -0.9282  -0.6105   1.4490   1.8825  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)              -0.6190     0.2097  -2.953  0.00315 **
## pain.data$Treatment_New  -0.9666     0.3389  -2.852  0.00434 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 229.22  on 199  degrees of freedom
## Residual deviance: 220.67  on 198  degrees of freedom
## AIC: 224.67
## 
## Number of Fisher Scoring iterations: 4
# Control placebo treatement variable for regression.
pain.data$Treatment_Placebo <- ifelse(pain.data$Treatment == "Placebo", 1, 0)
m2 <- glm(pain.data$Pain ~ pain.data$Treatment_Placebo, family = binomial)
summary(m2)
## 
## Call:
## glm(formula = pain.data$Pain ~ pain.data$Treatment_Placebo, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9282  -0.9282  -0.6105   1.4490   1.8825  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.5856     0.2662  -5.956 2.58e-09 ***
## pain.data$Treatment_Placebo   0.9666     0.3389   2.852  0.00434 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 229.22  on 199  degrees of freedom
## Residual deviance: 220.67  on 198  degrees of freedom
## AIC: 224.67
## 
## Number of Fisher Scoring iterations: 4
# Calculate odds ratio for beta coefficient (new treatment).
exp(-0.9666)
## [1] 0.3803741
# Calculate odds ratio for beta coefficient (placebo treatment).
exp(0.9666)
## [1] 2.628991
# Determine 95% confidence intervals for treatments.
exp(cbind(OR = coef(m1), confint.default(m1)))
##                                OR     2.5 %    97.5 %
## (Intercept)             0.5384615 0.3570215 0.8121103
## pain.data$Treatment_New 0.3803787 0.1957834 0.7390203
exp(cbind(OR = coef(m2), confint.default(m2)))
##                                    OR     2.5 %    97.5 %
## (Intercept)                 0.2048193 0.1215531 0.3451243
## pain.data$Treatment_Placebo 2.6289593 1.3531428 5.1076849
# Calculate c-statistic using roc function.
#install.packages("pROC", dependencies = TRUE)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
pain.data$prob1 <- predict(m1, type = c("response"))
pain.data$prob1
##   [1] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [15] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [29] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [43] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [57] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [71] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [85] 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17 0.17
##  [99] 0.17 0.17 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [113] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [127] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [141] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [155] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [169] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [183] 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35 0.35
## [197] 0.35 0.35 0.35 0.35
roc(pain.data$Pain ~ pain.data$prob1)
## 
## Call:
## roc.formula(formula = pain.data$Pain ~ pain.data$prob1)
## 
## Data: pain.data$prob1 in 148 controls (pain.data$Pain 0) < 52 cases (pain.data$Pain 1).
## Area under the curve: 0.6169
#-------------
# Question 4.
#-------------
# Perform multiple logistic regression to predict pain from treatment, age, severity.
attach(pain.data)
n1 <- glm(Pain ~ Treatment + Age + Severe, family = "binomial") ; n1
## 
## Call:  glm(formula = Pain ~ Treatment + Age + Severe, family = "binomial")
## 
## Coefficients:
##      (Intercept)  TreatmentPlacebo               Age            Severe  
##           5.5703            2.7612           -0.1933            1.0561  
## 
## Degrees of Freedom: 199 Total (i.e. Null);  196 Residual
## Null Deviance:       229.2 
## Residual Deviance: 136.5     AIC: 144.5
# Perform multiple logistic regression to predict pain using treatment dummy, age, severity.
n2 <- glm(Pain ~ Treatment_New + Age + Severe, family = "binomial") ; n2
## 
## Call:  glm(formula = Pain ~ Treatment_New + Age + Severe, family = "binomial")
## 
## Coefficients:
##   (Intercept)  Treatment_New            Age         Severe  
##        8.3315        -2.7612        -0.1933         1.0561  
## 
## Degrees of Freedom: 199 Total (i.e. Null);  196 Residual
## Null Deviance:       229.2 
## Residual Deviance: 136.5     AIC: 144.5
# Calculate odds ratios given the beta coefficients from regression equation.
exp(n1$coefficients[[2]])
## [1] 15.81889
exp(n1$coefficients[[3]])
## [1] 0.8242507
exp(n1$coefficients[[4]])
## [1] 2.875093
# Calculate odds ratio for age for a 10-yr increase.
exp(n1$coefficients[[3]])
## [1] 0.8242507
exp(n1$coefficients[[3]]*10)
## [1] 0.1447416
# Calculate odds ratios given the beta coefficients from regression equation (with Treatment_New).
exp(n2$coefficients[[2]])
## [1] 0.06321554
exp(n2$coefficients[[3]])
## [1] 0.8242507
exp(n2$coefficients[[4]])
## [1] 2.875093
# Calculate odds ratio for age for a 10-yr increase (with Treatment_ New).
exp(n2$coefficients[[3]])
## [1] 0.8242507
exp(n2$coefficients[[3]]*10)
## [1] 0.1447416
# Calculate c-statistic using roc function.
pain.data$prob2 <- predict(n1, type = c("response"))
roc(pain.data$Pain ~ pain.data$prob2)
## 
## Call:
## roc.formula(formula = pain.data$Pain ~ pain.data$prob2)
## 
## Data: pain.data$prob2 in 148 controls (pain.data$Pain 0) < 52 cases (pain.data$Pain 1).
## Area under the curve: 0.8905
# Display roc curve.
g <- roc(pain.data$Pain ~ pain.data$prob2)
plot(1-g$specificities, g$sensitivities, type = "l",
     xlab = "1 - Specificity", ylab = "Sensitivity",
     main = "ROC Curve for Model",
     col = "steelblue", lwd = 3)
abline(a = 0, b = 1, col = "gray")
mtext("Area Under the Curve = 0.8905", side = 3, line = 0.65)
grid()

# Display two ROC curves together.
g1 <- roc(pain.data$Pain ~ pain.data$prob1)
g2 <- roc(pain.data$Pain ~ pain.data$prob2)
plot(1-g1$specificities, g1$sensitivities, type = "l",
     xlab = "1 - Specificity", ylab = "Sensitivity",
     main = "ROC Curves for Both Models",
     col = "red", lwd = 3)
par(new = TRUE)
plot(1-g2$specificities, g2$sensitivities, type = "l",
     xlab = "", ylab = "",
     col = "steelblue", lwd = 3)
abline(a = 0, b = 1, col = "gray")
grid()
legend(x = "bottomright", c("~Treatment", "~Treatment+Age+Severe"),
       col = c("red", "steelblue"), lwd = c(2, 2))

# Optional: Look at correlations between variables.
cov2cor(vcov(m1))
##                         (Intercept) pain.data$Treatment_New
## (Intercept)               1.0000000              -0.6187081
## pain.data$Treatment_New  -0.6187081               1.0000000
cov2cor(vcov(n2))
##               (Intercept) Treatment_New        Age     Severe
## (Intercept)     1.0000000    -0.6991441 -0.9784557  0.2274324
## Treatment_New  -0.6991441     1.0000000  0.6275680 -0.2919393
## Age            -0.9784557     0.6275680  1.0000000 -0.3100057
## Severe          0.2274324    -0.2919393 -0.3100057  1.0000000