# Ch03 Examining continuous variables
#
# 3.1 Introduction
# 3.3 Looking for features
# 3.4 Comparing distributions by subgroups

 

# Introduction | 28

# Load btw2009 from flexclust and add Linke2: LINKE2 expressed as a
# percentage of valid2 (labelled below as Die Linke voter support).
data(btw2009, package = "flexclust")
btw2009 <- within(btw2009, Linke2 <- 100 * LINKE2 / valid2)

# Histogram of Linke2 in bins of width 1.
# geom_histogram() is the layer that bins a continuous variable; geom_bar()
# counts distinct x values and does not accept a binwidth.
ggplot(btw2009, aes(Linke2)) +
  geom_histogram(binwidth = 1, fill = "mediumpurple") +
  ylab("") +
  xlab("Percentage voter support for Die Linke in 2009")

 

# Looking for features | 30

# Base-graphics histograms of the child and parent columns of galton
# (UsingR), drawn in a 1 x 2 layout with a shared x-axis label.
data(galton, package = "UsingR")
ht <- "height (in)"
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
hist(galton$child, col = "green", main = "Children", xlab = ht)
hist(galton$parent, col = "blue", main = "Parents", xlab = ht)

 

# 31

# The same two galton variables drawn with MASS::truehist() using a bin
# width (h) of 0.1, again in a 1 x 2 layout.
par(mfrow = c(1, 2), mar = c(3.1, 4.1, 1.1, 2.1))
# with() is kept so the plotted argument is the bare column name, which
# truehist() uses for its default axis label.
with(galton, MASS::truehist(child, h = 0.1))
with(galton, MASS::truehist(parent, h = 0.1))

 

# 33

# Histograms (bin width 1) of child and parent heights from galton on
# identical x and y limits, each with a red vertical line at that
# variable's median, drawn together by grid.arrange().
# geom_histogram() is used because geom_bar() counts distinct x values and
# does not accept a binwidth.
c1 <- ggplot(galton, aes(child)) +
  geom_histogram(binwidth = 1) +
  xlim(60, 75) + ylim(0, 225) + ylab("") +
  geom_vline(xintercept = median(galton$child), col = "red")
p1 <- ggplot(galton, aes(parent)) +
  geom_histogram(binwidth = 1) +
  xlim(60, 75) + ylim(0, 225) + ylab("") +
  geom_vline(xintercept = median(galton$parent), col = "red")
grid.arrange(c1, p1)

 

# 34

# Density-scaled histograms (bin width 1) of sheight and fheight from
# father.son (UsingR), each overlaid with geom_density(), on identical axis
# limits and arranged in a single row.
data(father.son, package = "UsingR")
c2 <- ggplot(father.son, aes(x = sheight)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlim(58, 80) +
  ylim(0, 0.16) +
  labs(x = "ht (inches)", y = "", title = "Sons")
p2 <- ggplot(father.son, aes(x = fheight)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlim(58, 80) +
  ylim(0, 0.16) +
  labs(x = "ht (inches)", y = "", title = "Fathers")
grid.arrange(c2, p2, nrow = 1)

 

# 35

# Normal Q-Q plots, each with its reference line, for the sons' and the
# fathers' heights on a common y range. No par() call is made here, so the
# panels follow whatever layout is currently in effect.
qqnorm(father.son$sheight, main = "Sons", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$sheight)

qqnorm(father.son$fheight, main = "Fathers", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$fheight)

 

# 36

# Single horizontal boxplot of the time column of MASS::hills, with the
# value axis limited to 0-220 and filled points (pch 16) for outliers.
par(mfrow = c(1, 1), mar = c(3.1, 4.1, 1.1, 2.1))
boxplot(MASS::hills$time, horizontal = TRUE, pch = 16, ylim = c(0, 220))

 

# 37

# Histogram of medv from MASS::Boston with ggplot2's default binning.
# geom_histogram() is used because geom_bar() draws one bar per distinct
# value rather than binning a continuous variable.
ggplot(MASS::Boston, aes(medv)) +
  geom_histogram() +
  ylab("") +
  xlab("Median housing value (thousands of dollars)")

 

# 38

# Reshape the Boston columns crim through medv into long form (variable
# name in BosVars, value in BosValues) and draw one default histogram per
# variable, each facet with its own free axis scales.
library(tidyr)
B2 <- gather(MASS::Boston, key = BosVars, value = BosValues, crim:medv)
ggplot(B2, aes(x = BosValues)) +
  geom_histogram() +
  facet_wrap(~ BosVars, scales = "free") +
  xlab("") +
  ylab("")

 

# 41

# Density-scaled histogram of thickness from Hidalgo1872 (MMST) in bins of
# width 0.001, overlaid with two density estimates:
#   - density() with its default settings (solid black, lwd 2)
#   - KernSmooth::bkde() with the bandwidth chosen by dpik() (red, lty 5)
library(KernSmooth)
data(Hidalgo1872, package = "MMST")
par(las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
# with() is kept so hist() sees the bare column name for its default x label.
with(Hidalgo1872, {
  hist(thickness,
       breaks = seq(0.055, 0.135, 0.001),
       freq = FALSE,
       main = "",
       col = "bisque2",
       ylab = "")
  lines(density(thickness), lwd = 2)
  lines(bkde(thickness, bandwidth = dpik(thickness)),
        col = "red", lty = 5, lwd = 2)
})

 

# 42

# Histogram of movie lengths with ggplot2's default binning.
# geom_histogram() is used because geom_bar() draws one bar per distinct
# length rather than binning a continuous variable.
# NOTE(review): `movies` is not created in the visible code; current ggplot2
# releases no longer bundle it (it lives in ggplot2movies) -- confirm it is
# loaded before this runs.
ggplot(movies, aes(length)) + geom_histogram() + ylab("")

 

# 43

# One boxplot of movie lengths. The constant "var" is a placeholder x value
# so that all rows fall into a single box; its axis title and breaks are
# blanked, and coord_flip() lays the box out horizontally.
ggplot(movies, aes(x = "var", y = length)) +
  geom_boxplot() +
  scale_x_discrete(breaks = NULL) +
  xlab("") +
  coord_flip()

 

# 44

# Histogram of movie lengths in one-minute bins, with the x scale limited
# to 0-180.
ggplot(movies, aes(x = length)) +
  geom_histogram(binwidth = 1) +
  xlim(0, 180) +
  labs(x = "Movie lengths in minutes", y = "")

 

# Comparing distributions by subgroups | 45

# Copy state into a new Bundesland column and overwrite its levels, by
# position, with two-letter codes; then draw one Linke2 boxplot per
# Bundesland with box widths varying by group size (varwidth).
# NOTE(review): the codes are matched purely by level order -- confirm the
# 16 levels of `state` come in this order.
bundesland_codes <- c("BW", "BY", "BE", "BB", "HB", "HH", "HE", "MV",
                      "NI", "NW", "RP", "SL", "SN", "ST", "SH", "TH")
btw2009$Bundesland <- btw2009$state
levels(btw2009$Bundesland) <- bundesland_codes
ggplot(btw2009, aes(x = Bundesland, y = Linke2)) +
  geom_boxplot(varwidth = TRUE) +
  ylab("")