Senior Theses

Case study: senior theses

theses <- read.csv("sample-theses.csv")
str(theses)

## 'data.frame':    339 obs. of  2 variables:
##  $ year     : int  1970 1992 1991 1978 2016 1949 2007 2007 1973 1940 ...
##  $ checkouts: int  3 0 1 3 1 2 0 1 1 0 ...

Question 1: What is the average number of times a thesis is checked out?

ggplot(theses, aes(x = checkouts)) +
  geom_bar()

theses %>%
  summarize(mean(checkouts),
            median(checkouts))

##   mean(checkouts) median(checkouts)
## 1        1.740413                 1

Question 1: What is the average number of times a thesis is checked out?

Confidence interval on one mean

\[ \bar{x} \pm t^* \times s/\sqrt n \]

stats <- theses %>%
  summarize(n     = n(),
            x_bar = mean(checkouts),
            s     = sd(checkouts))
stats$x_bar + c(1, -1) * qt(.025, df = stats$n - 1) * stats$s / sqrt(stats$n)

## [1] 1.498635 1.982191

Question 2: Is the distribution of theses uniform in time?

ggplot(theses, aes(x = year)) +
  geom_histogram()

Question 2: Is the distribution of theses uniform in time?

Chi-squared goodness of fit test

bd <- data.frame(year_binned = cut(theses$year, 
                                            seq(from = 1920, to = 2020, by = 10)))
ggplot(bd, aes(x = year_binned)) +
  geom_bar() +
  geom_hline(yintercept = stats$n/10, 
             color = "goldenrod")

Question 2: Is the distribution of theses uniform in time?

Chi-squared goodness of fit test

Question 2: Is the distribution of theses uniform in time?

Chi-squared goodness of fit test

(obs <- table(bd$year_binned))

## 
## (1.92e+03,1.93e+03] (1.93e+03,1.94e+03] (1.94e+03,1.95e+03] 
##                   2                   9                   9 
## (1.95e+03,1.96e+03] (1.96e+03,1.97e+03] (1.97e+03,1.98e+03] 
##                  19                  27                  40 
## (1.98e+03,1.99e+03]    (1.99e+03,2e+03]    (2e+03,2.01e+03] 
##                  59                  62                  72 
## (2.01e+03,2.02e+03] 
##                  40

(exp <- length(bd$year_binned)/10)

## [1] 33.9

(chisq_obs <- sum((obs - exp)^2/exp))

## [1] 161.4425

Question 2: Is the distribution of theses uniform in time?

Chi-squared goodness of fit test

pchisq(chisq_obs, df = length(obs) - 1, lower.tail = FALSE)

## [1] 3.724151e-30

Question 3: What is the relationship between the age of a thesis and the number of checkouts?

head(theses)

##   year checkouts
## 1 1970         3
## 2 1992         0
## 3 1991         1
## 4 1978         3
## 5 2016         1
## 6 1949         2

theses <- theses %>%
  mutate(age = 2018 - year)
head(theses)

##   year checkouts age
## 1 1970         3  48
## 2 1992         0  26
## 3 1991         1  27
## 4 1978         3  40
## 5 2016         1   2
## 6 1949         2  69

Question 3: What is the relationship between the age of a thesis and the number of checkouts?

ggplot(theses, aes(x = age, y = checkouts)) +
  geom_jitter()

Question 3: What is the relationship between the age of a thesis and the number of checkouts?

m1 <- lm(checkouts ~ age, data = theses)
summary(m1)

## 
## Call:
## lm(formula = checkouts ~ age, data = theses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9378 -1.6323 -0.7545  0.4899 14.3270 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.944591   0.220012   8.839   <2e-16 ***
## age         -0.006789   0.006069  -1.119    0.264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.262 on 337 degrees of freedom
## Multiple R-squared:  0.0037, Adjusted R-squared:  0.0007439 
## F-statistic: 1.252 on 1 and 337 DF,  p-value: 0.264

Question 3: What is the relationship between the age of a thesis and the number of checkouts?

ggplot(theses, aes(x = age, y = checkouts)) +
  geom_jitter() +
  stat_smooth(method = "lm", se = FALSE)

Poisson Regression

m2 <- glm(checkouts ~ age, data = theses, family = "poisson")
summary(m2)

## 
## Call:
## glm(formula = checkouts ~ age, family = "poisson", data = theses)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9749  -1.8039  -0.6166   0.3767   6.6111  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.671912   0.072573   9.258   <2e-16 ***
## age         -0.004025   0.002099  -1.918   0.0552 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 848.84  on 338  degrees of freedom
## Residual deviance: 845.08  on 337  degrees of freedom
## AIC: 1429.4
## 
## Number of Fisher Scoring iterations: 6

ggplot(theses, aes(x = age, y = checkouts)) +
  geom_jitter() +
  stat_function(fun = function(age) {exp(coef(m2)[1] + coef(m2)[2] * age)},
                color = "red")

t2 <- theses %>%
  filter(year > 1994)
m2 <- glm(checkouts ~ age, data = t2, family = "poisson")
summary(m2)

## 
## Call:
## glm(formula = checkouts ~ age, family = "poisson", data = t2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4893  -1.3885  -0.3515   0.4554   4.2511  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.22104    0.16217  -1.363    0.173    
## age          0.06145    0.01060   5.796  6.8e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 298.81  on 144  degrees of freedom
## Residual deviance: 264.10  on 143  degrees of freedom
## AIC: 541.4
## 
## Number of Fisher Scoring iterations: 5

qplot(x = age, y = checkouts, data = t2, geom = "jitter") +
  stat_function(fun = function(age) {exp(coef(m2)[1] + coef(m2)[2] * age)},
                color = "red")

Ground truth

From the library's database…

theses_lib <- read.csv("library_theses.csv")
head(theses_lib)

##   year checkouts
## 1 1916         1
## 2 1917         2
## 3 1917         0
## 4 1918         0
## 5 1920         0
## 6 1920         0

Question 1: What is the average number of times a thesis is checked out?

qplot(x = checkouts, data = theses_lib, geom = "bar")

theses_lib %>%
  summarize(mean(checkouts),
            median(checkouts))

##   mean(checkouts) median(checkouts)
## 1        1.332394                 0

Question 2: Is the distribution of theses uniform in time?

qplot(x = year, data = theses_lib, geom = "histogram")

Question 3: What is the relationship between the age of a thesis and the number of checkouts?

theses_lib <- theses_lib %>%
  mutate(age = 2018 - year)
qplot(x = age, y = checkouts, data = theses_lib, geom = "jitter")

Poisson Regression

t3 <- theses_lib %>%
  filter(year > 1994)
m3 <- glm(checkouts ~ age, data = t3, family = "poisson")
summary(m3)

## 
## Call:
## glm(formula = checkouts ~ age, family = "poisson", data = t3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7209  -1.4842  -0.3773   0.6152   6.6148  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.158623   0.022659   -7.00 2.55e-12 ***
## age          0.063801   0.001443   44.21  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16368  on 6552  degrees of freedom
## Residual deviance: 14343  on 6551  degrees of freedom
## AIC: 26682
## 
## Number of Fisher Scoring iterations: 5

qplot(x = age, y = checkouts, data = t3, geom = "jitter") +
  stat_function(fun = function(age) {exp(coef(m3)[1] + coef(m3)[2] * age)},
                color = "red")

How'd we do?

coef(m2) # our data

## (Intercept)         age 
## -0.22103967  0.06145069

coef(m3) # library's data

## (Intercept)         age 
## -0.15862299  0.06380102