9.2 Missing values
9.3 Outliers
9.4 Modelling and testing for data quality
Missing values | 178
# Missing-value pattern plot for the CHAIN data set shipped with the mi package.
data(CHAIN, package = "mi")

# par() returns the previous settings; keep them so the narrower margins used
# for this base-graphics plot do not leak into later plots.
old_par <- par(mar = c(1.1, 4.1, 1.1, 2.1))
mi::missing.pattern.plot(CHAIN, y.order = TRUE, xlab = "", main = "")
par(old_par)
179
# Missing-value pattern display of CHAIN via visna().
# NOTE(review): sort = "b" presumably sorts both rows and columns of the
# pattern matrix -- confirm against the extracat documentation.
visna(CHAIN, sort = "b")
180
# Missing-value patterns in the oly12 data set from VGAMdata.
data(oly12, package = "VGAMdata")

# Work on a copy whose column names are abbreviated (minimum length 3) so the
# labels take less room in the display; oly12 itself is left untouched.
oly12a <- oly12
names(oly12a) <- abbreviate(names(oly12), minlength = 3)

visna(oly12a, sort = "b")
181
# Missing tariff values by country and year in the freetrade data (Amelia).
data(freetrade, package = "Amelia")

# land1: country as a factor whose levels are ordered by the number of
# missing tariff values recorded for that country.
freetrade <- within(
  freetrade,
  land1 <- reorder(country, tariff, function(x) sum(is.na(x)))
)

# xtabs() sums the logical is.na(tariff) within each land1 x year cell, i.e.
# it counts the missing tariff entries per country and year.
fluctile(xtabs(is.na(tariff) ~ land1 + year, data = freetrade))
182
# Missing-value patterns in the Pima.tr2 data set from MASS.
data(Pima.tr2, package = "MASS")
visna(Pima.tr2, sort = "b")
Outliers | 184
# Three views of the US10 data: a boxplot of MurderRate, a scatterplot of
# TheftRate against VehicleTheftRate, and a parallel coordinate plot of
# columns 2 to 10. US10 is not created here; it must already exist.

# The constant string "var" as x puts all observations into a single box.
a <- ggplot(US10, aes("var", MurderRate)) +
  geom_boxplot() +
  xlab("") +
  ylab("Murder rate per 100,000 population")

b <- ggplot(US10, aes(TheftRate, VehicleTheftRate)) +
  geom_point() +
  xlab("Theft rate per 100,000 population") +
  ylab("Vehicle theft rate per 100,000 population")

# Named pcp rather than c so that base::c() is not shadowed by a plot object.
pcp <- ggparcoord(data = US10, columns = 2:10, scale = "uniminmax") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

# Top row: boxplot and scatterplot at widths 1:4; bottom row: the parallel
# coordinate plot across the full width.
grid.arrange(arrangeGrob(a, b, ncol = 2, widths = c(1, 4)), pcp, nrow = 2)
187
# One boxplot per numeric diamonds variable, each in its own facet.
library(tidyr)

# Reshape to long form: dX holds the variable name, dV its value.
# pivot_longer() replaces the superseded gather(); the row order differs but
# the set of (dX, dV) pairs is the same, so the plot is unaffected.
diam1 <- diamonds %>%
  select(carat, depth:z) %>%
  pivot_longer(cols = carat:z, names_to = "dX", values_to = "dV")

# The constant string "dX" as x yields a single box per facet; the facets
# (one per variable, free y scales) do the actual splitting.
ggplot(diam1, aes("dX", dV)) +
  geom_boxplot() +
  facet_wrap(~dX, scales = "free_y", nrow = 1) +
  xlab("") +
  ylab("") +
  scale_x_discrete(breaks = NULL)
188
# Scatterplots of diamonds y against z (labelled width and depth): all cases
# on the left, and only the cases inside the filter limits on the right.
a2 <- ggplot(diamonds, aes(y, z)) +
  geom_point() +
  xlab("width") +
  ylab("depth")

# Keep only rows with 2 < y < 11 and 1 < z < 7.
d2 <- filter(diamonds, y > 2 & y < 11 & z > 1 & z < 7)

b2 <- ggplot(d2, aes(y, z)) +
  geom_point() +
  xlab("width") +
  ylab("depth")

grid.arrange(a2, b2, ncol = 2)
189
# Scatterplot of palmitic against oleic for the olives data (extracat), with
# red 2-d density contours (bins = 4) and a smoother drawn on top.
data(olives, package = "extracat")

ggplot(data = olives, aes(x = oleic, y = palmitic)) +
  geom_point() +
  geom_density2d(bins = 4, col = "red") +
  geom_smooth()
190
# Boston housing data (MASS): boxplot of ptratio next to a parallel coordinate
# plot in which the cases with ptratio < 13 are coloured as their own group.
data(Boston, package = "MASS")

# The constant string "var" as x puts all observations into a single box.
a <- ggplot(Boston, aes("var", ptratio)) +
  geom_boxplot() +
  xlab("") +
  ylab("Pupil-teacher ratio") +
  scale_x_discrete(breaks = NULL)

# pt1 flags the cases with ptratio below 13 (1) against the rest (0).
Boston <- within(Boston, pt1 <- ifelse(ptratio < 13, 1, 0))

# Sort rows by pt1 so the flagged cases come last.
# NOTE(review): presumably so they are drawn last, on top of the others --
# confirm against the rendered plot.
oc <- order(Boston$pt1)

b <- ggparcoord(
  data = Boston[oc, ],
  columns = 1:14,
  scale = "uniminmax",
  groupColumn = "pt1"
) +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

grid.arrange(a, b, nrow = 1, widths = c(1, 4))
191
# Sepal.Width in iris: one boxplot for all cases next to boxplots by Species.

# The constant string as x puts all observations into a single box.
a <- ggplot(iris, aes("boxplot for all", Sepal.Width)) +
  xlab("") +
  geom_boxplot() +
  scale_x_discrete(breaks = NULL)

b <- ggplot(iris, aes(Species, Sepal.Width)) +
  geom_boxplot() +
  xlab("")

grid.arrange(a, b, nrow = 1, widths = c(1, 2))
Modelling and testing for data quality | 196
# Scatterplot of grain against straw for the mercer.wheat.uniformity data
# set from agridat.
data(mercer.wheat.uniformity, package = "agridat")

ggplot(mercer.wheat.uniformity, aes(straw, grain)) +
  geom_point()