Ch09 Graphics and Data Quality

9.2 Missing values
9.3 Outliers
9.4 Modelling and testing for data quality

 

Missing values | 178

# Load the CHAIN data set from the mi package and draw its missing-value
# pattern plot without axis label or title.
data(CHAIN, package = "mi")
# par() returns the previous settings; keep them so the tighter margins apply
# to this plot only and later base-graphics plots get the old margins back.
old_par <- par(mar = c(1.1, 4.1, 1.1, 2.1))
mi::missing.pattern.plot(CHAIN, y.order = TRUE, xlab = "", main = "")
par(old_par)

 

179

# Missing-value display for CHAIN (loaded earlier in the script).
# NOTE(review): visna() is not namespaced here; presumably it comes from the
# extracat package, which must already be attached -- confirm.
visna(CHAIN, sort = "b")

 

180

# Load oly12 from VGAMdata and make a copy, oly12a, whose column names are
# abbreviated (minlength 3) before passing it to visna().
data(oly12, package = "VGAMdata")
oly12a <- setNames(oly12, abbreviate(names(oly12), minlength = 3))
visna(oly12a, sort = "b")

 

181

# Load freetrade from Amelia and add land1: country as a factor whose levels
# are ordered by the number of missing tariff values per country.
data(freetrade, package = "Amelia")
count_na <- function(v) sum(is.na(v))
freetrade$land1 <- with(freetrade, reorder(country, tariff, count_na))
# Tabulate the missing-tariff counts by land1 and year and plot the table.
fluctile(xtabs(is.na(tariff) ~ land1 + year, data = freetrade))

 

182

# Load Pima.tr2 from MASS and show its missing-value display.
data(Pima.tr2, package = "MASS")
visna(Pima.tr2, sort = "b")

 

Outliers | 184

# Three views of US10, arranged as two plots on top and one underneath.
# NOTE(review): US10 is not created anywhere in this file; it is assumed to
# exist in the workspace already, with MurderRate, TheftRate and
# VehicleTheftRate columns -- confirm where it is defined.

# Boxplot of MurderRate; the constant x value "var" puts all cases in one box.
a <- ggplot(US10, aes("var", MurderRate)) +
  geom_boxplot() +
  xlab("") +
  ylab("Murder rate per 100,000 population")

# Scatterplot of the two theft rates.
b <- ggplot(US10, aes(TheftRate, VehicleTheftRate)) +
  geom_point() +
  xlab("Theft rate per 100,000 population") +
  ylab("Vehicle theft rate per 100,000 population")

# Parallel coordinate plot of columns 2 to 10 with both axis titles removed.
# Stored as `pcp` rather than `c` so that base::c() is not shadowed by a plot
# object in the workspace.
pcp <- ggparcoord(data = US10, columns = 2:10, scale = "uniminmax") +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# Top row: a and b side by side (width ratio 1:4); bottom row: pcp.
grid.arrange(
  arrangeGrob(a, b, ncol = 2, widths = c(1, 4)),
  pcp,
  nrow = 2
)

 

187

library(tidyr)
# Keep carat and the columns depth through z of diamonds, then stack them into
# long form: dX holds the variable name and dV the value.
# NOTE(review): select() is assumed to be dplyr's, attached elsewhere -- confirm.
diam_vars <- select(diamonds, carat, depth:z)
diam1 <- gather(diam_vars, dX, dV, carat:z)

# One boxplot per variable, each in its own facet with a free y scale; the
# constant x value "dX" gives a single box per facet and the x breaks are
# switched off.
ggplot(diam1, aes("dX", dV)) +
  geom_boxplot() +
  facet_wrap(~dX, scales = "free_y", nrow = 1) +
  xlab("") +
  ylab("") +
  scale_x_discrete(breaks = NULL)

 

188

# Scatterplot of z against y for all diamonds, labelled width and depth.
a2 <- ggplot(diamonds, aes(y, z)) +
  geom_point() +
  xlab("width") +
  ylab("depth")

# Same plot restricted to 2 < y < 11 and 1 < z < 7.
# NOTE(review): filter() is assumed to be dplyr's (not stats::filter) -- confirm
# that dplyr is attached before this snippet runs.
d2 <- filter(diamonds, y > 2, y < 11, z > 1, z < 7)
b2 <- ggplot(d2, aes(y, z)) +
  geom_point() +
  xlab("width") +
  ylab("depth")

# Full data on the left, restricted data on the right.
grid.arrange(a2, b2, ncol = 2)

 

189

# Load olives from extracat and plot palmitic against oleic, with red 2-d
# density contours (bins = 4) and a default smoother on top of the points.
data(olives, package = "extracat")
ggplot(olives, aes(oleic, palmitic)) +
  geom_point() +
  geom_density2d(bins = 4, col = "red") +
  geom_smooth()

 

190

# Load Boston from MASS.
data(Boston, package = "MASS")

# Boxplot of ptratio; the constant x value "var" gives a single box and the
# x breaks are switched off.
a <- ggplot(Boston, aes("var", ptratio)) +
  geom_boxplot() +
  xlab("") +
  ylab("Pupil-teacher ratio") +
  scale_x_discrete(breaks = NULL)

# Indicator pt1: 1 where ptratio is below 13, otherwise 0.
Boston$pt1 <- ifelse(Boston$ptratio < 13, 1, 0)
# Row order that puts the pt1 == 0 cases first and the pt1 == 1 cases last.
oc <- order(Boston$pt1)

# Parallel coordinate plot of columns 1 to 14, grouped by pt1, with both axis
# titles removed.
b <- ggparcoord(
  data = Boston[oc, ],
  columns = 1:14,
  scale = "uniminmax",
  groupColumn = "pt1"
) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# Boxplot and parallel coordinate plot side by side, width ratio 1:4.
grid.arrange(a, b, nrow = 1, widths = c(1, 4))

191

# Boxplot of Sepal.Width for all of iris together; the constant x value puts
# every case in one box and the x breaks are switched off.
a <- ggplot(iris, aes("boxplot for all", Sepal.Width)) +
  xlab("") +
  geom_boxplot() +
  scale_x_discrete(breaks = NULL)

# Boxplots of Sepal.Width split by Species.
b <- ggplot(iris, aes(Species, Sepal.Width)) +
  geom_boxplot() +
  xlab("")

# Overall boxplot on the left, per-species boxplots on the right (1:2 widths).
grid.arrange(a, b, nrow = 1, widths = c(1, 2))

Modelling and testing for data quality | 196

# Load mercer.wheat.uniformity from agridat and plot grain against straw.
data(mercer.wheat.uniformity, package = "agridat")
ggplot(mercer.wheat.uniformity, aes(straw, grain)) +
  geom_point()