Table of contents
  1. Multivariate outliers: Leverage
    1. Mahalanobis distances
    2. Robust Mahalanobis distances
  2. Multivariate outliers: Influence
    1. General Cook’s distance
    2. Influence on goodness of fit
    3. Influence on model fit: cfi

Import our dataset:

#import data
dat <- read.table("uk_ipip300_data1.csv", sep = ";", header = TRUE) 
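
A quick sanity check of the imported data helps catch delimiter or coding problems early; a minimal sketch using base R only:

# inspect dimensions, variable names, and per-column missingness
dim(dat)
head(names(dat))
colSums(is.na(dat))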

Multivariate outliers: Leverage

Use a smaller data frame to evaluate the Mahalanobis distances (MD) of the selected variables:

extra.labels <- c("Opene_1", "Opene_2", "Opene_3", "Opene_4", 
                  "Agree_1", "Agree_2", "Agree_3", "Agree_4", 
                  "Neuro_1", "Neuro_2", "Neuro_3", "Neuro_4")
dat.extra <- na.omit(dat[,extra.labels])

Mahalanobis distances

Test for outliers (leverage) using Mahalanobis distances. The default cutoff corresponds to 1 - 0.025; here we choose the stricter threshold of 0.001:

library(performance)
md <- check_outliers(dat.extra, threshold  = stats::qchisq(p = 1 - 0.001, df = ncol(dat.extra)))
print(md)
## Warning: 11 outliers detected (cases 16, 144, 179, 240, 255, 268, 357, 447, 473, 508, 579).
plot(md)
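
For reference, the same classical distances can be computed by hand with base R's stats::mahalanobis() and compared against the chi-square cutoff used above; a minimal sketch, assuming dat.extra contains complete cases (ensured by na.omit above):

# classical Mahalanobis distances from the sample mean and covariance
ctr <- colMeans(dat.extra)
cov.m <- cov(dat.extra)
d2 <- stats::mahalanobis(dat.extra, center = ctr, cov = cov.m)

# chi-square cutoff at p = .001 with df = number of variables
cutoff <- stats::qchisq(1 - 0.001, df = ncol(dat.extra))
which(d2 > cutoff)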


Show outliers:

mdf <- as.data.frame(md)
head(mdf[order(mdf$Distance_Mahalanobis, decreasing = TRUE), ], 15)
##     Distance_Mahalanobis Outlier_Mahalanobis Outlier
## 447             57.89132                   1       1
## 473             42.44906                   1       1
## 179             41.48188                   1       1
## 508             40.46880                   1       1
## 144             38.68100                   1       1
## 268             36.73672                   1       1
## 579             35.28286                   1       1
## 16              35.05630                   1       1
## 357             34.45752                   1       1
## 240             33.90330                   1       1
## 255             33.40577                   1       1
## 27              31.81040                   0       0
## 487             30.71935                   0       0
## 482             30.00463                   0       0
## 201             29.29412                   0       0
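
If the flagged cases are to be excluded, the indicator column can be used directly; a sketch, assuming the row order of mdf matches dat.extra (as returned by as.data.frame(md)):

# drop the cases flagged as Mahalanobis outliers
dat.noout <- dat.extra[mdf$Outlier_Mahalanobis == 0, ]
nrow(dat.extra) - nrow(dat.noout)  # number of removed cases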

Robust Mahalanobis distances

Alternatively, use robust Mahalanobis distances (leverage):

library(faoutlier)
md <- robustMD(dat.extra)
print(md)
##          mah p  sig
## 447 92.82780 0 ****
## 473 70.84139 0 ****
## 508 70.37589 0 ****
## 144 64.47126 0 ****
## 179 63.23098 0 ****
## 16  59.64405 0 ****
## 268 56.53132 0 ****
## 357 56.20606 0 ****
## 579 55.48568 0 ****
## 126 53.95579 0 ****
plot(md)
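
Robust distances of this kind are typically based on a high-breakdown estimate of location and scatter such as the minimum covariance determinant (MCD). A minimal sketch with MASS::cov.rob(), shown for illustration and not necessarily matching faoutlier's exact implementation:

library(MASS)

# MCD estimate of center and covariance, then Mahalanobis distances
rob <- cov.rob(dat.extra, method = "mcd")
d2.rob <- stats::mahalanobis(dat.extra, center = rob$center, cov = rob$cov)
head(sort(d2.rob, decreasing = TRUE), 10)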

Multivariate outliers: Influence

Define a basic lavaan CFA model with three factors:

model <- '
  op =~ Opene_1 + Opene_2 + Opene_3 + Opene_4
  ag =~ Agree_1 + Agree_2 + Agree_3 + Agree_4
  ne =~ Neuro_1 + Neuro_2 + Neuro_3 + Neuro_4'
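
Before turning to the influence diagnostics, it can help to fit this model once and note the baseline fit. A minimal sketch with lavaan's cfa(), using orthogonal = TRUE to match the gCD() call below (all other settings at their defaults):

library(lavaan)

# fit the CFA once and inspect selected fit measures
fit <- cfa(model, data = dat, orthogonal = TRUE)
fitMeasures(fit, c("chisq", "df", "cfi", "rmsea"))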

General Cook’s distance

Screen for influential cases using the generalized Cook's distance:

library(faoutlier)
gCDresult2 <- gCD(dat, model, orthogonal=TRUE)
print(gCDresult2)
##           gCD
## 460 1.4663159
## 186 0.8598830
## 487 0.7766408
## 150 0.7658846
## 524 0.7618273
## 369 0.7451415
## 277 0.6661814
## 596 0.6387400
## 502 0.6071973
## 130 0.5580723
plot(gCDresult2)
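
Conceptually, the generalized Cook's distance compares the model's parameter estimates with and without a given case, weighted by the inverse covariance matrix of the estimates. A minimal manual sketch for a single case (case 460 from the output above); the exact weighting used by faoutlier may differ:

library(lavaan)

# refit the model with and without case 460
fit.all <- cfa(model, data = dat, orthogonal = TRUE)
fit.loo <- cfa(model, data = dat[-460, ], orthogonal = TRUE)

# (theta_all - theta_loo)' V^(-1) (theta_all - theta_loo)
d <- coef(fit.all) - coef(fit.loo)
as.numeric(t(d) %*% solve(vcov(fit.loo)) %*% d)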

Influence on goodness of fit

Screen for each case's influence on the model's overall goodness of fit:

go <- GOF(dat, model)
print(go)
##          GOF
## 460 10.25004
## 502 -9.46973
## 596 -5.65248
## 228 -5.38852
## 597 -5.02171
## 315  0.00000
## 332  0.00000
## 429  0.00000
## 483  0.00000
## 501  0.00000
## 511  0.00000
## 578  0.00000
plot(go)
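
The same idea can be checked by hand for a single case: refit the model with and without the case and compare the chi-square statistics. A sketch for case 460, noting that the sign convention may differ from the GOF output above:

library(lavaan)

# change in model chi-square when case 460 is removed
chisq.all <- fitMeasures(cfa(model, data = dat), "chisq")
chisq.loo <- fitMeasures(cfa(model, data = dat[-460, ]), "chisq")
chisq.all - chisq.loo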

Influence on model fit: cfi

Screen for case influence on the CFI. Alternatively, other fit indices such as “rmsea” or “tli” can be used:

library(influence.SEM)
FI <- fitinfluence("cfi",model,dat)

Graphically explore the cases that influence model fit:

explore.influence(FI$Dind$cfi)

Print the cases with the largest negative impact on the CFI:

head(FI$Dind[order(FI$Dind$cfi, decreasing = FALSE), ], 10)
##     case          cfi
## 596  596 -0.006000970
## 502  502 -0.005959073
## 228  228 -0.004250393
## 597  597 -0.004050729
## 364  364 -0.004039169
## 46    46 -0.003472951
## 126  126 -0.003437482
## 121  121 -0.003288414
## 182  182 -0.003179929
## 23    23 -0.003021831

Potentially remove case no. 596, as it has the largest negative impact on the CFI:

dat.d <- dat[-c(596), ]
# alternatively, use subset() if an id variable is available
# dat.d <- subset(dat, id != 596)

We see that the CFI has improved:

# case-influence analysis on the reduced data indicates improved overall fit (CFI)
FI.d <- fitinfluence("cfi", model, dat.d)
print(FI.d$Oind)
##       cfi 
## 0.8579951
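
For comparison, the baseline CFI on the full data is already available from the first fitinfluence() run above, stored in FI$Oind:

# overall CFI on the full data, for comparison with FI.d$Oind
print(FI$Oind)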