using Turing
using DataFrames
using CSV
using Random
using Distributions
using StatisticalRethinking
using StatisticalRethinking: link
using StatisticalRethinkingPlots
using ParetoSmooth
using StatsPlots
using StatsBase
using FreqTables
using Logging
default(label=false);
#Logging.disable_logging(Logging.Warn);
Code 12.1
# define alias for Beta(α, β), see: https://en.wikipedia.org/wiki/Beta_distribution#Mean_and_sample_size
Beta2(μ, ν) = Beta(μ*ν, (1-μ)*ν)
BetaBinomial2(n, μ, ν) = BetaBinomial(n, μ*ν, (1-μ)*ν)
p̄ = 0.5
θ = 5
plot(Beta2(p̄, θ), xlab="probability", ylab="Density")
Code 12.2
d = DataFrame(CSV.File("data/UCBadmit.csv", skipto=2,
header=[:id, :dept, :gender, :admit, :reject, :applications]))
d.gid = @. ifelse(d.gender == "male", 1, 2)
@model function m12_1(A, N, gid)
a ~ MvNormal([0, 0], 1.5)
p̄ = @. logistic(a[gid])
ϕ ~ Exponential(1)
θ = ϕ + 2
@. A ~ BetaBinomial2(N, p̄, θ)
end
Random.seed!(1)
m12_1_ch = sample(m12_1(d.admit, d.applications, d.gid), NUTS(), 1000)
m12_1_df = DataFrame(m12_1_ch);
Code 12.3
m12_1_df.θ = m12_1_df.ϕ .+ 2
m12_1_df.da = m12_1_df."a[1]" .- m12_1_df."a[2]"
precis(m12_1_df)
Code 12.4
gid = 2
p̄ = m12_1_df[:, "a[$gid]"] .|> logistic |> mean
θ = mean(m12_1_df.θ)
plot(Beta2(p̄, θ), lw=2, c=:black, ylab="Density",
xlab="probability admit", ylims=(0, 3))
for (a, θ) ∈ first(zip(m12_1_df[:, "a[$gid]"], m12_1_df.θ), 50)
plot!(Beta2(logistic(a), θ), c=:black, alpha=0.2)
end
gender = gid == 2 ? "female" : "male"
title!("distribution of $gender admission rates")
Code 12.5
Random.seed!(1)
adm_rate = d.admit ./ d.applications
fun = (r, (N, gid)) -> begin
p̄ = logistic(get(r, "a[$gid]", 0))
rand(BetaBinomial2(N, p̄, r.θ))
end
pred_adm = link(m12_1_df, fun, zip(d.applications, d.gid))
pred_rates = pred_adm ./ d.applications
μ_adm = mean.(pred_rates)
σ = std.(pred_rates) ./ 2
ci_adm = PI.(pred_rates)
scatter(adm_rate, xlab="case", ylab="A", title="Posterior validation check")
scatter!(μ_adm, mc=:white, yerr=σ)
scatter!(first.(ci_adm), shape=:cross, c=:black)
scatter!(last.(ci_adm), shape=:cross, c=:black)
Code 12.6
d = DataFrame(CSV.File("data/Kline.csv"))
d.P = standardize(ZScoreTransform, log.(d.population))
d.contact_id = ifelse.(d.contact .== "high", 2, 1)
@model function m12_2(T, P, cid)
g ~ Exponential()
ϕ ~ Exponential()
a ~ MvNormal([1,1], 1)
b₁ ~ Exponential()
b₂ ~ Exponential()
b = [b₁, b₂]
λ = @. exp(a[cid])*(P^b[cid]) / g
p = 1/(ϕ+1)
r = λ/ϕ
clamp!(r, 0.01, Inf)
p = clamp(p, 0.01, 1)
@. T ~ NegativeBinomial(r, p)
end
Random.seed!(1)
m12_2_ch = sample(m12_2(d.total_tools, d.population, d.contact_id), NUTS(), 1000)
m12_2_df = DataFrame(m12_2_ch)
precis(m12_2_df)
Code 12.7
# define parameters
prob_drink = 0.2 # 20% of days
rate_work = 1 # average 1 manuscript per day
# sample one year of production
N = 365
# simulate days monks drink
Random.seed!(365)
drink = rand(Binomial(1, prob_drink), N)
# simulate manuscripts completed
y = (1 .- drink).*rand(Poisson(rate_work), N);
Code 12.8
p = histogram(y, xlab="manuscripts completed", ylab="Frequency")
zeros_drink = sum(drink)
zeros_work = sum(@. (y == 0) & (drink == 0))
bar!([0], [zeros_work], bar_width=0.3)
p
Code 12.9
# Based on this discussion
# https://github.com/StatisticalRethinkingJulia/SR2TuringPluto.jl/issues/1
import Distributions: logpdf, rand
struct ZIPoisson{T1,T2} <: DiscreteUnivariateDistribution
λ::T1
w::T2
end
function logpdf(d::ZIPoisson, y::Int)
if y == 0
logsumexp([log(d.w), log(1 - d.w) - d.λ])
else
log(1 - d.w) + logpdf(Poisson(d.λ), y)
end
end
function rand(d::ZIPoisson)
rand() <= d.w ? 0 : rand(Poisson(d.λ))
end
rand(d::ZIPoisson, N::Int) = map(_->rand(d), 1:N)
@model function m12_3(y)
ap ~ Normal(-1.5, 1)
al ~ Normal(1, 0.5)
λ = exp(al)
p = logistic(ap)
y .~ ZIPoisson(λ, p)
end
m12_3_ch = sample(m12_3(y), NUTS(), 1000)
m12_3_df = DataFrame(m12_3_ch)
precis(m12_3_df)
Code 12.10
m12_3_df.ap .|> logistic |> mean,
exp.(m12_3_df.al) |> mean
Code 12.12
d = DataFrame(CSV.File("data/Trolley.csv"))
describe(d)
Code 12.13
histogram(d.response, xlab="response")
Code 12.14
pr_k = counts(d.response) / length(d.response)
cum_pr_k = cumsum(pr_k)
plot(cum_pr_k, m=:o, xlab="response", ylab="cumulative proportion")
Code 12.15
round.(logit.(cum_pr_k), digits=2)
Code 12.16
@model function m12_4(R)
# to ensure sorted cutpoints, use deltas
Δ_cutpoints ~ filldist(Exponential(), 6)
cutpoints = -2 .+ cumsum(Δ_cutpoints)
R .~ OrderedLogistic(0, cutpoints)
end
Code 12.18
Random.seed!(1)
m12_4_ch = sample(m12_4(d.response), NUTS(), 1000)
m12_4_df = DataFrame(m12_4_ch)
precis(m12_4_df)
cutpoints = -2 .+ cumsum(mean.(eachcol(m12_4_df[!,r"Δ.*"])))
Code 12.19
round.(logistic.(cutpoints), digits=3)
Code 12.20
pk = pdf.(OrderedLogistic(0, cutpoints), 1:7)
round.(pk, digits=2)
Code 12.21
sum(pk.*(1:7))
Code 12.22
pk = pdf.(OrderedLogistic(0, cutpoints .- 0.5), 1:7)
round.(pk, digits=2)
Code 12.23
sum(pk.*(1:7))
Code 12.24
@model function m12_5(R, A, I, C)
# to ensure sorted cutpoints, use deltas
Δ_cutpoints ~ filldist(Exponential(), 6)
cutpoints = -3 .+ cumsum(Δ_cutpoints)
bA ~ Normal(0, 0.5)
bI ~ Normal(0, 0.5)
bC ~ Normal(0, 0.5)
bIA ~ Normal(0, 0.5)
bIC ~ Normal(0, 0.5)
BI = @. bI + bIA*A + bIC*C
phi = @. bA*A + bC*C + BI*I
for i in eachindex(R)
R[i] ~ OrderedLogistic(phi[i], cutpoints)
end
end
Random.seed!(2)
m12_5_ch = sample(m12_5(d.response, d.action, d.intention, d.contact), NUTS(), 1000)
m12_5_df = DataFrame(m12_5_ch)
precis(m12_5_df)
#cutpoints = -2 .+ cumsum(mean.(eachcol(m12_5_df[!,r"Δ.*"])))
Code 12.25
coeftab_plot(m12_5_df, pars=[:bIC, :bIA, :bC, :bI, :bA])
Code 12.26
(not needed, in fact)
#p = plot(xlab="intention", ylab="probability", xlim=(0, 1), ylim=(0, 1))
Code 12.27
kA = 0 # value for action
kC = 0 # value for contact
kI = 0:1 # values for intention to calculate over
rI_to_phi = (r, i) -> begin
BI = r.bI + r.bIA*kA + r.bIC*kC
r.bA*kA + r.bC*kC + BI*i
end
phi = link(m12_5_df, rI_to_phi, kI);
Code 12.28
p = plot(xlab="intention", ylab="probability", xlim=(0, 1), ylim=(0, 1), title="action=$kA, contact=$kC")
for ri in 1:50
r = m12_5_df[ri,:]
cutpoints = -3 .+ cumsum(r[r"Δ.*"])
pk1 = cumsum(pdf.(OrderedLogistic(phi[1][ri], cutpoints), 1:6))
pk2 = cumsum(pdf.(OrderedLogistic(phi[2][ri], cutpoints), 1:6))
for i ∈ 1:6
plot!(kI, [pk1[i], pk2[i]], c=:gray, alpha=0.2)
end
end
p
Code 12.29
kA = 0
kC = 1
kI = 0:1
rI_to_dist = (r, i) -> begin
BI = r.bI + r.bIA*kA + r.bIC*kC
phi = r.bA*kA + r.bC*kC + BI*i
cutpoints = -3 .+ cumsum(r[r"Δ.*"])
OrderedLogistic(phi, cutpoints)
end
Random.seed!(1)
s = simulate(m12_5_df, rI_to_dist, kI)
histogram(map(first, s), bar_width=0.5, label="I=0")
histogram!(map(last, s), bar_width=0.2, label="I=1")
Code 12.30
d = DataFrame(CSV.File("data/Trolley.csv"))
levels(d.edu)
Code 12.31
edu_l = Dict{String, Int}(
"Bachelor's Degree" => 6,
"Elementary School" => 1,
"Graduate Degree" => 8,
"High School Graduate" => 4,
"Master's Degree" => 7,
"Middle School" => 2,
"Some College" => 5,
"Some High School" => 3,
)
d.edu_new = map(s -> edu_l[s], d.edu);
Code 12.32
Random.seed!(1805)
delta = rand(Dirichlet(7, 2), 10)
Code 12.33
h = 3
p = plot(xlab="index", ylab="probability")
for (idx, col) ∈ enumerate(eachcol(delta))
plot!(col, c=:black, alpha=0.7, lw=idx == h ? 3 : 1, m= idx == h ? :v : :o)
end
p
Code 12.34
Could take 20-30 minutes...
@model function m12_6(R, action, intention, contact, E)
delta ~ Dirichlet(7, 2)
pushfirst!(delta, 0.0)
bA ~ Normal()
bI ~ Normal()
bC ~ Normal()
bE ~ Normal()
# sum all education's deltas
sE = sum.(map(i -> delta[1:i], E))
phi = @. bE*sE + bA*action + bI*intention + bC*contact
# use same cutpoints as before
Δ_cutpoints ~ filldist(Exponential(), 6)
cutpoints = -3 .+ cumsum(Δ_cutpoints)
for i ∈ eachindex(R)
R[i] ~ OrderedLogistic(phi[i], cutpoints)
end
end
model = m12_6(d.response, d.action, d.intention, d.contact, d.edu_new)
m12_6_ch = sample(model, NUTS(), 1000);
Code 12.35
m12_6_df = DataFrame(m12_6_ch)
precis(m12_6_df)
Code 12.36
delta_labels = ["Elem","MidSch","SHS","HSG","SCol","Bach","Mast","Grad"]
df = m12_6_df[!,r"delta.*"]
corrplot(Matrix(df); seriestype=:scatter, bins=30, grid=false, ms=0.1, ma=0.8,
size=(1000,800), label=delta_labels)
Code 12.37
d.edu_norm = standardize(UnitRangeTransform, Float64.(d.edu_new))
@model function m12_7(R, action, intention, contact, edu_norm)
bA ~ Normal()
bI ~ Normal()
bC ~ Normal()
bE ~ Normal()
phi = @. bE*edu_norm + bA*action + bI*intention + bC*contact
# use same cutpoints as before
Δ_cutpoints ~ filldist(Exponential(), 6)
cutpoints = -3 .+ cumsum(Δ_cutpoints)
for i ∈ eachindex(R)
R[i] ~ OrderedLogistic(phi[i], cutpoints)
end
end
model = m12_7(d.response, d.action, d.intention, d.contact, d.edu_norm)
m12_7_ch = sample(model, NUTS(), 1000)
m12_7_df = DataFrame(m12_7_ch)
precis(m12_7_df)