Jeromy Anglim's Blog: Psychology and Statistics


Wednesday, December 4, 2013

Using R to replicate common SPSS multiple regression output

The following post replicates some of the standard output you might get from a multiple regression analysis in SPSS. A copy of the code in RMarkdown format is available on github. The post was motivated by this previous post that discussed using R to teach psychology students statistics.
library(foreign)  # read.spss
library(psych)  # describe 
library(Hmisc)  # rcorr 
library(QuantPsyc)  # lm.beta
library(car)  # vif, durbinWatsonTest
library(MASS)  # studres
library(lmSupport)  #lm.sumSquares
library(perturb)  # colldiag
In order to emulate SPSS output, it is necessary to install several add-on packages. The above library commands load the packages into your R workspace. I've highlighted in the comment the names of the functions that are used in this script.
You may not have the above packages installed. If not, run commands like:
  • install.packages('foreign')
  • install.packages('psych')
  • etc.
for each of the above packages not installed or use the “packages” tab in RStudio to install.
Note also that much of this analysis could be performed using Rcommander using a more SPSS-style GUI environment.

Import and prepare data

cars_raw <- read.spss("cars.sav", to.data.frame = TRUE)
# get rid of missing data listwise
cars <- na.omit(cars_raw[, c("accel", "mpg", "engine", "horse", "weight")])
Ensure that cars.sav is the working directory.

Quick look at data

# note the need to deal with missing data
psych::describe(cars_raw)
##             var   n    mean     sd  median trimmed    mad    min     max
## mpg           1 398   23.51   7.82   23.00   23.06   8.90   9.00   46.60
## engine        2 406  194.04 105.21  148.50  183.75  86.73   4.00  455.00
## horse         3 400  104.83  38.52   95.00  100.36  29.65  46.00  230.00
## weight        4 406 2969.56 849.83 2811.00 2913.97 947.38 732.00 5140.00
## accel         5 406   15.50   2.82   15.50   15.45   2.59   8.00   24.80
## year*         6 405    6.94   3.74    7.00    6.93   4.45   1.00   13.00
## origin*       7 405    1.57   0.80    1.00    1.46   0.00   1.00    3.00
## cylinder*     8 405    3.20   1.33    2.00    3.14   0.00   1.00    5.00
## filter_.*     9 398    1.73   0.44    2.00    1.79   0.00   1.00    2.00
## weightKG     10 406 1346.97 385.48 1275.05 1321.75 429.72 332.03 2331.46
## engineLitre  11 406    3.19   1.73    2.44    3.02   1.42   0.07    7.47
##               range  skew kurtosis    se
## mpg           37.60  0.45    -0.53  0.39
## engine       451.00  0.69    -0.81  5.22
## horse        184.00  1.04     0.55  1.93
## weight      4408.00  0.46    -0.77 42.18
## accel         16.80  0.21     0.35  0.14
## year*         12.00  0.02    -1.21  0.19
## origin*        2.00  0.92    -0.81  0.04
## cylinder*      4.00  0.27    -1.69  0.07
## filter_.*      1.00 -1.04    -0.92  0.02
## weightKG    1999.43  0.46    -0.77 19.13
## engineLitre    7.41  0.69    -0.81  0.09

dim(cars)
## [1] 392   5
head(cars)
##   accel mpg engine horse weight
## 1  12.0  18    307   130   3504
## 2  11.5  15    350   165   3693
## 3  11.0  18    318   150   3436
## 4  12.0  16    304   150   3433
## 5  10.5  17    302   140   3449
## 6  10.0  15    429   198   4341
str(cars)
## 'data.frame':    392 obs. of  5 variables:
##  $ accel : num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ mpg   : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ engine: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horse : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight: num  3504 3693 3436 3433 3449 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:14] 11 12 13 14 15 18 39 40 134 338 ...
##   .. ..- attr(*, "names")= chr [1:14] "11" "12" "13" "14" ...

Fit model

fit <- lm(accel ~ mpg + engine + horse + weight, data = cars)

Descriptive Statistics

# Descriptive statistics
psych::describe(cars)
##        var   n    mean     sd  median trimmed    mad min    max  range
## accel    1 392   15.52   2.78   15.50   15.46   2.52   8   24.8   16.8
## mpg      2 392   23.45   7.81   22.75   22.99   8.60   9   46.6   37.6
## engine   3 392  193.65 104.94  148.50  183.15  86.73   4  455.0  451.0
## horse    4 392  104.21  38.23   93.00   99.61  28.17  46  230.0  184.0
## weight   5 392 2967.38 852.29 2797.50 2909.64 945.90 732 5140.0 4408.0
##        skew kurtosis    se
## accel  0.27     0.43  0.14
## mpg    0.45    -0.54  0.39
## engine 0.69    -0.77  5.30
## horse  1.09     0.71  1.93
## weight 0.48    -0.76 43.05

# correlations
cor(cars)
##          accel     mpg  engine   horse  weight
## accel   1.0000  0.4375 -0.5298 -0.6936 -0.4013
## mpg     0.4375  1.0000 -0.7893 -0.7713 -0.8072
## engine -0.5298 -0.7893  1.0000  0.8959  0.9339
## horse  -0.6936 -0.7713  0.8959  1.0000  0.8572
## weight -0.4013 -0.8072  0.9339  0.8572  1.0000
rcorr(as.matrix(cars))  # include sig test for all correlations
##        accel   mpg engine horse weight
## accel   1.00  0.44  -0.53 -0.69  -0.40
## mpg     0.44  1.00  -0.79 -0.77  -0.81
## engine -0.53 -0.79   1.00  0.90   0.93
## horse  -0.69 -0.77   0.90  1.00   0.86
## weight -0.40 -0.81   0.93  0.86   1.00
## 
## n= 392 
## 
## 
## P
##        accel mpg engine horse weight
## accel         0   0      0     0    
## mpg     0         0      0     0    
## engine  0     0          0     0    
## horse   0     0   0            0    
## weight  0     0   0      0
# scatterplot matrix if you want
pairs.panels(cars)

Summary of model

# r-square, adjusted r-square, std. error of estimate, overall ANOVA, df, p,
# unstandardised coefficients, sig tests
summary(fit)
## 
## Call:
## lm(formula = accel ~ mpg + engine + horse + weight, data = cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.177 -1.023 -0.184  0.936  6.873 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.980778   0.977425   17.37   <2e-16 ***
## mpg          0.007476   0.019298    0.39   0.6987    
## engine      -0.008230   0.002674   -3.08   0.0022 ** 
## horse       -0.087169   0.005204  -16.75   <2e-16 ***
## weight       0.003046   0.000297   10.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.7 on 387 degrees of freedom
## Multiple R-squared:  0.631,  Adjusted R-squared:  0.627 
## F-statistic:  166 on 4 and 387 DF,  p-value: <2e-16
### additional info in terms of sums of squares
anova(fit)
## Analysis of Variance Table
## 
## Response: accel
##            Df Sum Sq Mean Sq F value Pr(>F)    
## mpg         1    577     577   200.8 <2e-16 ***
## engine      1    272     272    94.7 <2e-16 ***
## horse       1    753     753   261.8 <2e-16 ***
## weight      1    302     302   104.9 <2e-16 ***
## Residuals 387   1113       3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# 95% confidence intervals (defaults to 95%)
confint(fit)
##                 2.5 %    97.5 %
## (Intercept) 15.059049 18.902506
## mpg         -0.030466  0.045418
## engine      -0.013488 -0.002972
## horse       -0.097401 -0.076938
## weight       0.002461  0.003630
# but can specify different confidence intervals
confint(fit, level = 0.99)
##                 0.5 %    99.5 %
## (Intercept) 14.450621 19.510934
## mpg         -0.042478  0.057430
## engine      -0.015153 -0.001308
## horse       -0.100641 -0.073698
## weight       0.002276  0.003816

# standardised coefficients
lm.beta(fit)
##      mpg   engine    horse   weight 
##  0.02101 -0.31093 -1.19988  0.93456

# or you could do it manually
zcars <- data.frame(scale(cars))  # make all variables z-scores
zfit <- lm(accel ~ mpg + engine + horse + weight, data = zcars)
coef(zfit)[-1]
##      mpg   engine    horse   weight 
##  0.02101 -0.31093 -1.19988  0.93456

# correlations: zero-order, semi-partial, partial obscure function seems to
# do it
sqrt(lm.sumSquares(fit)[, c(2, 3)])
##              dR-sqr pEta-sqr
## (Intercept) 0.53638   0.6620
## mpg         0.01000   0.0200
## engine      0.09487   0.1546
## horse       0.51711   0.6483
## weight      0.31623   0.4617
## Error (SSE)      NA       NA
## Total (SST)      NA       NA

# or use own function
cor_lm <- function(fit) {
    dv <- names(fit$model)[1]
    dv_data <- fit$model[, dv]
    ivs <- names(fit$model)[-1]
    iv_data <- fit$model[, ivs]
    x <- fit$model
    x_omit <- lapply(ivs, function(X) x[, c(dv, setdiff(ivs, X))])
    names(x_omit) <- ivs
    lapply(x_omit, head)
    fits_omit <- lapply(x_omit, function(X) lm(as.formula(paste(dv, "~ .")), 
        data = X))
    resid_omit <- sapply(fits_omit, resid)
    iv_omit <- lapply(ivs, function(X) lm(as.formula(paste(X, "~ .")), data = iv_data))
    resid_iv_omit <- sapply(iv_omit, resid)

    results <- sapply(seq(ivs), function(i) c(zeroorder = cor(iv_data[, i], 
        dv_data), partial = cor(resid_iv_omit[, i], resid_omit[, i]), semipartial = cor(resid_iv_omit[, 
        i], dv_data)))
    results <- data.frame(results)

    names(results) <- ivs
    results <- data.frame(t(results))
    results
}

round(cor_lm(fit), 3)
##        zeroorder partial semipartial
## mpg        0.438   0.020       0.012
## engine    -0.530  -0.155      -0.095
## horse     -0.694  -0.648      -0.517
## weight    -0.401   0.462       0.316

Assumption testing

# Durbin Watson test
durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1           0.136         1.721   0.004
##  Alternative hypothesis: rho != 0

# vif
vif(fit)
##    mpg engine  horse weight 
##  3.085 10.709  5.383  8.736

# tolerance
1/vif(fit)
##     mpg  engine   horse  weight 
## 0.32415 0.09338 0.18576 0.11447

# collinearity diagnostics
colldiag(fit)
## Condition
## Index    Variance Decomposition Proportions
##           intercept mpg   engine horse weight
## 1   1.000 0.000     0.001 0.001  0.001 0.000 
## 2   3.623 0.002     0.051 0.016  0.005 0.001 
## 3  16.214 0.006     0.066 0.365  0.763 0.019 
## 4  18.519 0.127     0.431 0.243  0.152 0.227 
## 5  32.892 0.865     0.451 0.375  0.079 0.753

# residual statistics
rfit <- data.frame(predicted = predict(fit), residuals = resid(fit), studentised_residuals = studres(fit))
psych::describe(rfit)
##                       var   n  mean   sd median trimmed  mad   min   max
## predicted               1 392 15.52 2.21  16.11   15.80 1.40  3.13 20.06
## residuals               2 392  0.00 1.69  -0.18   -0.11 1.39 -4.18  6.87
## studentised_residuals   3 392  0.00 1.01  -0.11   -0.07 0.82 -2.49  4.47
##                       range  skew kurtosis   se
## predicted             16.93 -1.61     4.10 0.11
## residuals             11.05  0.75     1.10 0.09
## studentised_residuals  6.95  0.81     1.38 0.05

# distribution of standarised residuals
zresid <- scale(resid(fit))
hist(zresid)


# or add normal curve http://www.statmethods.net/graphs/density.html
hist_with_normal_curve <- function(x, breaks = 24) {
    h <- hist(zresid, breaks = breaks, col = "lightblue")
    xfit <- seq(min(x), max(x), length = 40)
    yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
    yfit <- yfit * diff(h$mids[1:2]) * length(x)
    lines(xfit, yfit, lwd = 2)
}
hist_with_normal_curve(zresid)


# normality of residuals
qqnorm(zresid)
abline(a = 0, b = 1)


# plot predicted by residual
plot(predict(fit), resid(fit))


# plot dependent by residual
plot(cars$accel, resid(fit))