theses <- read.csv("sample-theses.csv") str(theses)
## 'data.frame': 339 obs. of 2 variables: ## $ year : int 1970 1992 1991 1978 2016 1949 2007 2007 1973 1940 ... ## $ checkouts: int 3 0 1 3 1 2 0 1 1 0 ...
theses <- read.csv("sample-theses.csv") str(theses)
## 'data.frame': 339 obs. of 2 variables: ## $ year : int 1970 1992 1991 1978 2016 1949 2007 2007 1973 1940 ... ## $ checkouts: int 3 0 1 3 1 2 0 1 1 0 ...
Question 1: What is the average number of times a thesis is checked out?
ggplot(theses, aes(x = checkouts)) + geom_bar()
theses %>% summarize(mean(checkouts), median(checkouts))
## mean(checkouts) median(checkouts) ## 1 1.740413 1
Question 1: What is the average number of times a thesis is checked out?
Confidence interval on one mean
\[ \bar{x} \pm t^* \times s/\sqrt n \]
stats <- theses %>% summarize(n = n(), x_bar = mean(checkouts), s = sd(checkouts)) stats$x_bar + c(1, -1) * qt(.025, df = stats$n - 1) * stats$s / sqrt(stats$n)
## [1] 1.498635 1.982191
Question 2: Is the distribution of theses uniform in time?
ggplot(theses, aes(x = year)) + geom_histogram()
Question 2: Is the distribution of theses uniform in time?
Chi-squared goodness of fit test
bd <- data.frame(year_binned = cut(theses$year, seq(from = 1920, to = 2020, by = 10))) ggplot(bd, aes(x = year_binned)) + geom_bar() + geom_hline(yintercept = stats$n/10, color = "goldenrod")
Question 2: Is the distribution of theses uniform in time?
Chi-squared goodness of fit test
Question 2: Is the distribution of theses uniform in time?
Chi-squared goodness of fit test
(obs <- table(bd$year_binned))
## ## (1.92e+03,1.93e+03] (1.93e+03,1.94e+03] (1.94e+03,1.95e+03] ## 2 9 9 ## (1.95e+03,1.96e+03] (1.96e+03,1.97e+03] (1.97e+03,1.98e+03] ## 19 27 40 ## (1.98e+03,1.99e+03] (1.99e+03,2e+03] (2e+03,2.01e+03] ## 59 62 72 ## (2.01e+03,2.02e+03] ## 40
(exp <- length(bd$year_binned)/10)
## [1] 33.9
(chisq_obs <- sum((obs - exp)^2/exp))
## [1] 161.4425
Question 2: Is the distribution of theses uniform in time?
Chi-squared goodness of fit test
pchisq(chisq_obs, df = length(obs) - 1, lower.tail = FALSE)
## [1] 3.724151e-30
Question 3: What is the relationship between the age of a thesis and the number of checkouts?
head(theses)
## year checkouts ## 1 1970 3 ## 2 1992 0 ## 3 1991 1 ## 4 1978 3 ## 5 2016 1 ## 6 1949 2
theses <- theses %>% mutate(age = 2018 - year) head(theses)
## year checkouts age ## 1 1970 3 48 ## 2 1992 0 26 ## 3 1991 1 27 ## 4 1978 3 40 ## 5 2016 1 2 ## 6 1949 2 69
Question 3: What is the relationship between the age of a thesis and the number of checkouts?
ggplot(theses, aes(x = age, y = checkouts)) + geom_jitter()
Question 3: What is the relationship between the age of a thesis and the number of checkouts?
m1 <- lm(checkouts ~ age, data = theses) summary(m1)
## ## Call: ## lm(formula = checkouts ~ age, data = theses) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.9378 -1.6323 -0.7545 0.4899 14.3270 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.944591 0.220012 8.839 <2e-16 *** ## age -0.006789 0.006069 -1.119 0.264 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 2.262 on 337 degrees of freedom ## Multiple R-squared: 0.0037, Adjusted R-squared: 0.0007439 ## F-statistic: 1.252 on 1 and 337 DF, p-value: 0.264
Question 3: What is the relationship between the age of a thesis and the number of checkouts?
ggplot(theses, aes(x = age, y = checkouts)) + geom_jitter() + stat_smooth(method = "lm", se = FALSE)
m2 <- glm(checkouts ~ age, data = theses, family = "poisson") summary(m2)
## ## Call: ## glm(formula = checkouts ~ age, family = "poisson", data = theses) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.9749 -1.8039 -0.6166 0.3767 6.6111 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.671912 0.072573 9.258 <2e-16 *** ## age -0.004025 0.002099 -1.918 0.0552 . ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for poisson family taken to be 1) ## ## Null deviance: 848.84 on 338 degrees of freedom ## Residual deviance: 845.08 on 337 degrees of freedom ## AIC: 1429.4 ## ## Number of Fisher Scoring iterations: 6
ggplot(theses, aes(x = age, y = checkouts)) + geom_jitter() + stat_function(fun = function(age) {exp(coef(m2)[1] + coef(m2)[2] * age)}, color = "red")
t2 <- theses %>% filter(year > 1994) m2 <- glm(checkouts ~ age, data = t2, family = "poisson") summary(m2)
## ## Call: ## glm(formula = checkouts ~ age, family = "poisson", data = t2) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.4893 -1.3885 -0.3515 0.4554 4.2511 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -0.22104 0.16217 -1.363 0.173 ## age 0.06145 0.01060 5.796 6.8e-09 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for poisson family taken to be 1) ## ## Null deviance: 298.81 on 144 degrees of freedom ## Residual deviance: 264.10 on 143 degrees of freedom ## AIC: 541.4 ## ## Number of Fisher Scoring iterations: 5
qplot(x = age, y = checkouts, data = t2, geom = "jitter") + stat_function(fun = function(age) {exp(coef(m2)[1] + coef(m2)[2] * age)}, color = "red")
theses_lib <- read.csv("library_theses.csv") head(theses_lib)
## year checkouts ## 1 1916 1 ## 2 1917 2 ## 3 1917 0 ## 4 1918 0 ## 5 1920 0 ## 6 1920 0
Question 1: What is the average number of times a thesis is checked out?
qplot(x = checkouts, data = theses_lib, geom = "bar")
theses_lib %>% summarize(mean(checkouts), median(checkouts))
## mean(checkouts) median(checkouts) ## 1 1.332394 0
Question 2: Is the distribution of theses uniform in time?
qplot(x = year, data = theses_lib, geom = "histogram")
Question 3: What is the relationship between the age of a thesis and the number of checkouts?
theses_lib <- theses_lib %>% mutate(age = 2018 - year) qplot(x = age, y = checkouts, data = theses_lib, geom = "jitter")
t3 <- theses_lib %>% filter(year > 1994) m3 <- glm(checkouts ~ age, data = t3, family = "poisson") summary(m3)
## ## Call: ## glm(formula = checkouts ~ age, family = "poisson", data = t3) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.7209 -1.4842 -0.3773 0.6152 6.6148 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -0.158623 0.022659 -7.00 2.55e-12 *** ## age 0.063801 0.001443 44.21 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for poisson family taken to be 1) ## ## Null deviance: 16368 on 6552 degrees of freedom ## Residual deviance: 14343 on 6551 degrees of freedom ## AIC: 26682 ## ## Number of Fisher Scoring iterations: 5
qplot(x = age, y = checkouts, data = t3, geom = "jitter") + stat_function(fun = function(age) {exp(coef(m3)[1] + coef(m3)[2] * age)}, color = "red")
coef(m2) # our data
## (Intercept) age ## -0.22103967 0.06145069
coef(m3) # library's data
## (Intercept) age ## -0.15862299 0.06380102