Replicating the results of Rust (1987) using the Rust (1987) nested fixed point (NFP) and Hotz-Miller (1993) CCP approaches
using Statistics, LinearAlgebra, GLM
using Random, Distributions, StatsBase
using DataFrames, DataFramesMeta, ShiftedArrays
using Optim, NLopt, NLSolversBase
using Plots
using DelimitedFiles, CSVFiles
Threads.nthreads()
Create a function that computes logsumexp while avoiding overflow
function logsumexp(mat)
    # subtract the column max before exponentiating so that exp() cannot overflow
    A = maximum(mat, dims = 1);
    out = log.(sum(exp.(mat .- A), dims = 1)) .+ A # for matrices, sums over actions (dims = 1) for each state
    return out
end
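To see the overflow protection at work, compare against the naive formula on inputs large enough to overflow exp (a small illustrative example):
v = [1000.0 1001.0; 999.0 998.0]
log.(sum(exp.(v), dims = 1)) # naive version overflows to [Inf Inf]
logsumexp(v)                 # ≈ [1000.31 1001.05]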
Some one-time cleaning code used to reshape the raw Rust data files (kept commented out for reference)
# df1 = readdlm("g870.csv", ',', Float64);
# df2 = readdlm("rt50.csv", ',', Float64);
# df3 = readdlm("t8h203.csv", ',', Float64);
# df8 = readdlm("a530875.csv", ',', Float64);
# df1 = reshape(df1, 36, 15)'
# writedlm("g870.csv", df1, ",")
# df2 = reshape(df2,60,4)'
# writedlm("rt50.csv", df2, ",")
# df3 = reshape(df3, 81, 48)'
# writedlm("t8h203.csv", df3, ",")
# df8 = reshape(df8, 128, 37)'
# writedlm("a530875.csv", df8, ",")
# load the dataframe
# - df1.csv are groups 1-3 in Rust's paper
# - df2.csv is group 4
# - df3.csv are groups 1-4
df = load("df3.csv") |> DataFrame;
# compute empirical distribution of state transitions, using the discretized states
# Note that the assumption in Rust's paper is that the transition is at most 2 states
# - first obtain the transition probability from choosing action 0 (no replace)
J = 2; # number of choices
B = 90; # number of states
β = .9999; # discount factor
θ = [3; 10]; # initial guess of [θ_1, RC]
θ_3 = zeros(3, J)
trans0 = @linq df |>
    where(:i .== 0) |>
    transform(p = :delta_x .== 0, q = :delta_x .== 1);
p = mean(trans0.p);
q = mean(trans0.q);
θ_3[:, 1] = [p; q; 1 - p - q];
# - obtain transition probabilities from choosing action 1 (replace)
trans1 = @linq df |>
    where(:i .== 1) |>
    transform(p = :delta_x .== 0, q = :delta_x .== 1);
p1 = mean(trans1.p);
q1 = mean(trans1.q);
θ_3[:, 2] = [p1; q1; 1 - p1 - q1];
# outer loop - partial likelihood taking the state transition parameters
# - θ_3 (embedded in the transition matrices G) as given.
# θ: structural parameters of interest to search over. In Rust's case this is
# - θ_1 and RC
# data: data
# B: number of states (90)
# G: B x B x J array of state transition matrices implied by θ_3
# J: number of actions (2)
# β: discount factor
function compute_ll(θ, data, B, G, J, β)
    # we only need the state and action from the dataset
    x = data.x
    i = data.i
    # given θ, iterate the inner loop
    EV = compute_EV(θ, B, G, J, β)
    maxEV = maximum(EV)
    # initialize cond. choice prob.
    CCP = zeros(J, B)
    u = compute_utility(θ, B, J);
    # use logit formula to calculate conditional choice prob
    for y = 1:B
        v = u[:, y] .+ β .* EV[:, y] .- maxEV
        CCP[1, y] = exp(v[1]) / sum(exp.(v))
    end
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2, :] = 1 .- CCP[1, :]
    # compute the partial log likelihood
    ll = 0.0
    for n = 1:length(x)
        x_t = x[n]
        i_t = i[n]
        ll = ll + log(CCP[i_t + 1, x_t + 1])
    end
    return -ll
end
Note that because of the discreteness of our state space, the equation of interest (4.14) can be simplified:

$$
EV(x, i) = \int_y \log\Big[\sum_j \exp\big(u(y, j; \theta_1) + \beta EV(y, j)\big)\Big]\, p(dy \mid x, i; \theta_3)
$$

$$
= p \int_x^{x + 5000} \log\Big[\sum_j \exp\big(u(y, j; \theta_1) + \beta EV(y, j)\big)\Big]\, dy + q \int_{x + 5000}^{x + 10000} \log\Big[\cdots\Big]\, dy + (1 - p - q) \int_{x + 10000}^{\infty} \log\Big[\cdots\Big]\, dy
$$

But notice that the integrals over $dy$ don't really matter, because we have discretized the states into 5000-mile bins. So we can replace $y$ with the current state $x$ plus however many discrete states we transition (as implied by the transition probabilities) and drop the integral.
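Concretely, the recursion the code below implements can be written (with $x' = x$ if $i = 0$, the no-replace action, and $x' = 0$ if $i = 1$):

$$
EV(x, i) = p\,\ell(x') + q\,\ell(x' + 1) + (1 - p - q)\,\ell(x' + 2), \qquad \ell(y) = \log\sum_j \exp\big(u(y, j; \theta_1) + \beta EV(y, j)\big)
$$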
# computes the inner fixed point for EV
# θ: vector of structural parameters [θ_1, RC] (the transitions enter through G)
# B: number of states (90) (note that they are indexed 1:90, but the realizations are 0:89)
# G: B x B x J array of state transition matrices
# J: number of actions (2)
# β: discount factor
function compute_EV(θ, B, G, J, β)
    tol = 100
    u = compute_utility(θ, B, J) # flow utilities do not change across iterations
    # initial guess of EV
    EV_old = zeros(J, B)
    while tol > 1e-4 # good enough tolerance level, set 1e-6 if desired
        EV_new = zeros(J, B)
        for j = 1:J # iterate over actions
            # logsumexp(...) is a 1 x B row over next states y; multiplying by
            # G[:, :, j]' integrates over p(dy | x, j)
            EV_new[j, :] = logsumexp(u + β * EV_old) * G[:, :, j]'
        end
        tol = maximum(abs.(EV_new - EV_old))
        EV_old = EV_new
    end
    return EV_old
end
We let $c(x, \theta_1) = 0.001\,\theta_1 x$ in order to attempt to replicate Table IX
# θ: params
# B: number of states; realizations are in {0, ..., 89}
# J: number of choices
# out: J x B matrix of utilities, each row corresponding to a specific choice
function compute_utility(θ, B, J)
    θ_1 = θ[1] # since cost fn only has 1 param
    R = θ[2]
    u = zeros(J, B)
    # let c(x, θ_1) = 0.001θ_1x
    for x = 0:B-1
        for i = 0:J-1
            u[i + 1, x + 1] = i * -R - (1 - i) * 0.001 * θ_1 * x
        end
    end
    return u
end
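A quick spot check at the initial guess $\theta = [3; 10]$: at state $x = 30$ the maintenance cost is $0.001 \cdot 3 \cdot 30 = 0.09$, while replacing costs $RC = 10$:
u_check = compute_utility(θ, B, J)
u_check[:, 31] # state x = 30 lives in column 31; expect ≈ [-0.09, -10.0]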
# create the transition matrices
function build_transition_mx(θ_3, B, J)
    G = zeros(B, B, J);
    for x = 1:B
        # replacing (action 1) resets the state, so transitions start from state 0
        G[x, 1:3, 2] = θ_3[:, 2]
        # not replacing (action 0) moves the state forward 0, 1, or 2 bins
        for y = x:x+2
            G[x, minimum([y, B]), 1] = G[x, minimum([y, B]), 1] + θ_3[y - x + 1, 1]
        end
    end
    return G
end
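Sanity check: by construction every row of each transition matrix should sum to one, since the minimum() call piles overflow mass onto the last state:
Gtest = build_transition_mx(θ_3, B, J) # using the θ_3 estimated above
all(isapprox.(sum(Gtest, dims = 2), 1.0)) # expect: true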
We have already obtained the $\theta_3$ parameters from the data, so the next step is to take those as given and maximize the partial log-likelihood

$$
\sum_{t=1}^{T} \log \Pr(i_t \mid x_t; \theta)
$$

given the observed actions $i_t$ and states $x_t$ from the data. To do this, we compute the conditional choice probabilities implied by the NFP algorithm and then search over the parameters $\theta_1, RC$ for the values whose CCPs maximize the likelihood.
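Before optimizing, a cheap sanity check is a one-off evaluation of the objective at the initial guess (reusing the objects defined above):
compute_ll(θ, df, B, build_transition_mx(θ_3, B, J), J, β) # negative partial log-likelihood at θ = [3; 10]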
# a general framework if you want to do multiple optimization runs
G = build_transition_mx(θ_3, B, J)
nruns = 1;
res = zeros(length(θ), nruns)
minf = zeros(nruns)
opt = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt,1e-6)
maxeval!(opt, 10000)
for run = 1:nruns
    if run >= 2
        θ = res[:, run - 1]
    end
    min_objective!(opt, (vars, grad) -> compute_ll(vars, df, B, G, J, β))
    @time (minf[run], res[:, run], ret) = NLopt.optimize(opt, θ)
end
res
Compare the results to Table IX, column 3; they seem pretty close.
Now we want to optimize the full likelihood function, starting at our estimates of $\theta_1, RC, \theta_3$, in order to obtain efficient estimates of these parameters (optional; one can stop at the partial likelihood)
function compute_ll_full(θ, data, B, J, β)
    x = data.x
    i = data.i
    delta_x = data.delta_x
    θ_1 = θ[1:2]; # structural parameters [θ_1, RC]
    θ_3 = zeros(3, J) # three transition probabilities per action
    θ_3[:, 1] = [θ[3:4]; 1 - sum(θ[3:4])] # obtain state transitions given choice i = 0
    θ_3[:, 2] = [θ[5:6]; 1 - sum(θ[5:6])] # " i = 1
    # if the optimizer wants negative probabilities, throw a large objective function value
    if sum(θ_3 .< 0) >= 1
        return 1e10
    end
    # build the transition matrix implied by the parameters
    G = build_transition_mx(θ_3, B, J)
    CCP = zeros(J, B)
    # compute inner loop
    EV = compute_EV(θ_1, B, G, J, β)
    maxEV = maximum(EV)
    u = compute_utility(θ_1, B, J);
    # use logit formula to calculate conditional choice prob
    for y = 1:B
        v = u[:, y] .+ β .* EV[:, y] .- maxEV
        CCP[1, y] = exp(v[1]) / sum(exp.(v))
    end
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2, :] = 1 .- CCP[1, :]
    # compute full log likelihood function
    ll = 0.0
    for n = 1:length(x)
        x_t = x[n]
        i_t = i[n]
        x_tp1 = delta_x[n] # state transitions
        # minimum is because we cap our state transitions at 2
        ll = ll + log(CCP[i_t + 1, x_t + 1]) + log(θ_3[minimum([x_tp1, 2]) + 1, i_t + 1])
    end
    return -ll
end
θ_3hat = reshape(θ_3[1:2, :], 4) # [p, q] for action 0 followed by [p, q] for action 1
θ_new = [res[:, end]; θ_3hat] # start from the partial-likelihood estimates
opt2 = Opt(:LN_NELDERMEAD, length(θ_new))
ftol_rel!(opt2,1e-6)
maxeval!(opt2, 10000)
min_objective!(opt2, (vars, y) -> compute_ll_full(vars, df, B, J, β))
@time (minf2,minx2,ret2) = NLopt.optimize(opt2, θ_new)
Pretty close to the actual Rust results. Note that the results from the partial likelihood are even closer. Differences between this dataset and Rust's might explain why the results differ slightly; Rust has slightly fewer observations.
We will estimate the full empirical transition matrix from the data (rather than reusing the parametric $\theta_3$ transitions), simply by counting the observed transitions for each state/action pair.
Assume that if we never observe a given state/action pairing, then the bus stays in that state if $j = 0$, or goes to state 0 if $j = 1$.
# empirical transition probabilities
f_hat = zeros(B, B, J)
tmp = by(df, :id, xp = :x => Base.Fix2(lead, 1))
df.xp = tmp.xp
for j = 1:J
    df_j = @linq df |>
        where(:i .== j - 1)
    df_j = @linq df_j |>
        where(.!ismissing.(:xp))
    for x = 0:B-1
        N = sum(df_j.x .== x);
        # if don't replace, cannot transition backwards in state. If do replace, can transition backwards
        for xp = (x * (j % 2)):B-1
            df_j2 = @linq df_j |>
                where(:x .== x, :xp .== xp)
            num = length(df_j2.x)
            if N != 0
                f_hat[x + 1, xp + 1, j] = num / N;
            elseif (xp == x) && (j == 1)
                f_hat[x + 1, xp + 1, j] = 1;
            elseif (xp == 0) && (j == 2)
                f_hat[x + 1, xp + 1, j] = 1;
            else
                f_hat[x + 1, xp + 1, j] = 0;
            end
        end
    end
end
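As a quick check, each row of f_hat is an estimated distribution over next states, so the row sums should all be close to one:
println(extrema(sum(f_hat, dims = 2))) # expect both extrema ≈ 1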
# empirical CCP; use a probit so that no state is assigned a zero choice probability
# df.x2 = df.x.^2
# df.x3 = df.x.^3
CCP_probit = glm(@formula(i ~ x), df, Binomial(), ProbitLink())
CCP_hat = zeros(J,B)
pred_ccp_x = DataFrame(x = 0:B-1, x2 = (0:B-1).^2, x3 = (0:B-1).^3)
CCP_hat[2, :] = predict(CCP_probit, pred_ccp_x);
CCP_hat[1, :] = 1 .- CCP_hat[2, :];
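It is worth eyeballing the smoothed replacement hazard (using the Plots package loaded at the top):
plot(0:B-1, CCP_hat[2, :], xlabel = "state x", ylabel = "Pr(replace | x)", legend = false) # replacement probability should rise with mileage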
The next step is to simulate the choice-specific value function. That is, we want to simulate:

$$
V^s(i_t, x_t; \theta) = E_{\epsilon_{i_t, t}}\big[V(i_t, x_t, \epsilon_t; \theta)\big] = -c(x_t, i_t; \theta_1) + \beta E_{\epsilon}\big[u(x_{t+1}, i_{t+1}; \theta) + \epsilon_{i_{t+1}, t+1} + \beta E_{\epsilon}[\dots]\big]
$$

Note that the type 1 extreme value assumption allows us to simplify the expectation: $E[\epsilon \mid i_t, x_t] = \gamma - \log(\Pr(i_t \mid x_t))$, where $\gamma \approx 0.5772$ is Euler's constant. We draw $S$ simulations each of length $T$, and then compute the conditional choice probabilities implied by the simulated choice-specific value functions.
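Concretely, each simulated path starting from $(x, i)$ contributes the discounted sum that the backwards recursion in the code below computes (the $t = 1$ term enters without the $E[\epsilon]$ correction):

$$
\hat V^s(i, x; \theta) = u(x, i; \theta) + \sum_{t=2}^{T} \beta^{t-1}\Big[u(x_t, i_t; \theta) + \gamma - \log \hat P(i_t \mid x_t)\Big]
$$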
S = 100; # number of sims
T = 100; # time length of sims
function simulate_V(θ, f_hat, CCP_hat, B, J, S, T)
    # we'll eventually take the mean across the third dimension of the simulated V's
    V_sim = zeros(J, B, S)
    u = compute_utility(θ, B, J)
    Threads.@threads for s = 1:S
        Random.seed!(29489 + s);
        for j = 0:J-1
            for x = 0:B-1
                path = zeros(Int64, T, 2) # T x 2 matrix of the paths that states and choices will follow
                path[1, :] = [x j] # initial starting point
                for t = 2:T
                    # obtain previous time period state and action
                    x_m1 = path[t - 1, 1]
                    j_m1 = path[t - 1, 2];
                    # simulate this period's state and action
                    xp = sample(0:B-1, Weights(f_hat[x_m1 + 1, :, j_m1 + 1]))
                    jp = sample(0:J-1, Weights(CCP_hat[:, xp + 1]))
                    # add to path
                    path[t, :] = [xp jp]
                end
                # backwards iterate on path to compute V_sim
                for t = T:-1:2
                    x_t = path[t, 1];
                    j_t = path[t, 2];
                    # recursively compute flow utility + E(ϵ) + β * future util
                    V_sim[j + 1, x + 1, s] = u[j_t + 1, x_t + 1] + .5772 - log(CCP_hat[j_t + 1, x_t + 1]) + β * V_sim[j + 1, x + 1, s]
                end
                # in period 1, we do not add E[ϵ]
                V_sim[j + 1, x + 1, s] = u[j + 1, x + 1] + β * V_sim[j + 1, x + 1, s]
            end
        end
    end
    # take the mean over the simulations
    V_sim = mean(V_sim, dims = 3)
    return V_sim
end
function Hotz_Miller_obj(θ, f_hat, CCP_hat, B, J, S, T, W)
    # compute the simulated V given θ
    V_sim = simulate_V(θ, f_hat, CCP_hat, B, J, S, T)
    maxV = maximum(V_sim);
    CCP = zeros(J, B)
    for x = 1:B
        denom = sum(exp.(V_sim[:, x] .- maxV), dims = 1)
        CCP[1, x] = exp.(V_sim[1, x] .- maxV) ./ denom[1]
    end
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2, :] = 1 .- CCP[1, :]
    # find difference between computed CCP and "empirical" CCP
    g = CCP[1, :] - CCP_hat[1, :]
    # weighting matrix is some arbitrary PSD matrix;
    # I weight states with no observations much less
    obj = g' * W * g
    return obj
end
opt3 = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt3,1e-6)
maxeval!(opt3, 10000)
# W = Matrix{Float64}(I, B, B)
W = Diagonal([fill(1, 80); fill(0.02, B - 80)])
min_objective!(opt3, (vars, y) -> Hotz_Miller_obj(vars, f_hat, CCP_hat, B, J, S, T, W))
@time (minf3,minx3,ret3) = NLopt.optimize(opt3, θ)
Note that Hotz-Miller runs about 15x faster! The actual parameter estimates are a little different, but within a reasonable ±20% range.