This code is a summer self-exercise written in preparation for graduate school.
Some helpful sources:
Note: the code was run on a fairly budget laptop (i5-8250U, 8GB RAM), so performance should be better on faster machines.
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import scipy
from numba import jit, njit, prange
import time
import multiprocessing as mp
import pickle
from sklearn.linear_model import LinearRegression
import pyblp
Dataframe cleaning is a lot more concise in R...
library(magrittr)
df <- cars %>%
  dplyr::mutate(ln_hpwt = log(hpwt),
                ln_space = log(space),
                ln_mpg = log(mpg),
                ln_mpd = log(mpd),
                ln_price = log(price),
                trend = market + 70,
                cons = 1) %>%
  dplyr::group_by(model_year) %>%
  dplyr::mutate(s_0 = log(1 - sum(share))) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(s_i = log(share)) %>%
  dplyr::mutate(dif = s_i - s_0,
                dif_2 = log(share) - log(share_out),
                ln_price = log(price)) %>%
  dplyr::arrange(market, firmid)
# Read the dataframe
df = pd.read_csv("../data/blp_1995_data.csv")
df = df.drop(df.columns[0], axis = 1)
# python code for cleaning
df[["ln_hpwt", "ln_space", "ln_mpg", "ln_mpd", "ln_price"]] = \
df[["hpwt", "space", "mpg", "mpd", "price"]].apply(lambda x: np.log(x))
# instrument change
df["trend"] = df.market.map(lambda x: x + 70) # use with non-pyblp instruments
# df["trend"] = df.market
df["cons"] = 1
df["s_0"] = np.log(1 - df.share.groupby(df["model_year"]).transform("sum"))
df["s_i"] = np.log(df.share)
df["dif"] = df.s_i - df.s_0
df["dif_2"] = np.log(df.share) - np.log(df.share_out)
df["ln_price"] = np.log(df.price)
df.head()
# this is here because we may want to use their instruments
product_data = pd.read_csv(pyblp.data.BLP_PRODUCTS_LOCATION)
# demand variables
X = df[["cons", "hpwt", "air", "mpd", "space"]].to_numpy()
# supply variables
W = df[["cons", "ln_hpwt", "air", "ln_mpg", "ln_space", "trend"]].to_numpy()
# price
p = df.price.values
# initial delta_0 estimate: log(share) - log(share outside good)
delta_0 = df.dif_2.values
# number of goods per market
J = df.groupby("year").cons.sum().values
# number of draws per market
N = 500
# number of markets
T = len(J)
# Estimated log income means for years 1971 - 1990
incomeMeans = [2.01156, 2.06526, 2.07843, 2.05775, 2.02915, 2.05346, 2.06745,
2.09805, 2.10404, 2.07208, 2.06019, 2.06561, 2.07672, 2.10437, 2.12608, 2.16426,
2.18071, 2.18856, 2.21250, 2.18377]
# standard deviation of log income, taken as given from BLP (1995)
sigma_v = 1.72
# number of characteristics with random coefficients: per Table IV these are
# the constant, hp/wt, air, mp$, and size; the price coefficient (the sixth
# element of theta_2) is handled separately, entering utility as alpha / income
k = 5
# market identifier for each observation (length J x T)
markets = df.market.values
# unique markets
marks = np.unique(df.market)
# firms
firms = np.reshape(df.firmid.values, (-1,1))
# define a wrapper class so the converged delta can be stored and reused
# across objective function calls (a warm start for the contraction mapping)
class delta:
def __init__(self, delta):
self.delta = delta
# initialize a delta object using the delta_0 values
d = delta(delta_0)
Even though the original BLP (1995) paper used the same draws in every market, I decided that drawing separately for each market is a better idea than reusing one set of draws for all markets.
# set seed
np.random.seed(135567)
m_t = np.repeat(incomeMeans, N)
# matrix of simulated draws
# number of rows = number of simulations x number of markets
# treat the last column as the price/income draw
# note that BLP treat the markets as containing the same individuals across all years (Nevo 2000)
# different draws for each market
V = np.reshape(np.random.standard_normal((k + 1) * N * T), (T * N, k + 1))
# # same draws for each market
# V = pd.read_csv("../v_ik.csv", header = None).to_numpy()
# V = np.reshape(np.random.standard_normal((k + 1) * N), (N, k + 1))
# income if we have different draws per market
y_it = np.exp(m_t + sigma_v * V[:, k]).reshape(T,N).T
# draws for income if same draws in every market
# y_it = np.exp(m_t + sigma_v * np.tile(V[:, k], len(incomeMeans))).reshape(T,N).T
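As a quick sanity check on the construction (a minimal sketch using the objects above): the reshape(T, N).T makes rows index individuals and columns index markets, and since log income is normal with mean m_t, the simulated median income in each year should sit near exp(m_t).
# y_it[i, t] is the income of simulated individual i in market/year t
assert y_it.shape == (N, T)
# the median of a lognormal is exp(mu); for 1971, exp(2.01156) is roughly 7.5
print(np.median(y_it[:, 0]))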
If the same draws are used in each market, remove the "N * t + i" indexing in the utility loop below and index the draws with "i" alone.
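Concretely, the draw indexing inside the utility loop below changes as follows (a sketch; v is the matrix of draws, i the individual, and t the market):
# market-specific draws (the default here): V has N * T rows, and individual
# i in market t uses row N * t + i:
#     v[N * t + i, 0] * theta_2[0] * x[tj, 0] + ...
# shared draws: V has only N rows, and individual i reuses row i in every market:
#     v[i, 0] * theta_2[0] * x[tj, 0] + ...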
# the loops that calculate utility live in a separate function so that they
# can be compiled with numba and run in parallel
@jit(nopython = True, parallel = True)
def util_iter(out, x, v, p, y, delta, theta_2, J, T, N):
# first iterate over the individuals
for i in prange(N):
# iterator through t and j
tj = 0
        # iterate over the markets; numba parallelizes only the outermost
        # prange, so the inner loops use plain range (the sequential tj
        # counter would not survive nested parallelism)
        for t in range(T):
# market size of market t
mktSize = J[t]
# income for individual i in market t
y_im = y[i, t]
            # iterate over the goods in a particular market
            for j in range(mktSize):
out[tj, i] = delta[tj] + \
v[N * t + i, 0] * theta_2[0] * x[tj, 0] + \
v[N * t + i, 1] * theta_2[1] * x[tj, 1] + \
v[N * t + i, 2] * theta_2[2] * x[tj, 2] + \
v[N * t + i, 3] * theta_2[3] * x[tj, 3] + \
v[N * t + i, 4] * theta_2[4] * x[tj, 4] - \
theta_2[5] / y_im * p[tj]
tj += 1
return out
# computes indirect utility given parameters
# x: matrix of demand characteristics
# v: monte carlo draws of N simulations
# p: price vector
# y: income of individuals
# delta: guess for the mean utility
# theta_2: non-linear params (sigma - can think of as stdev's)
# J: vector of number of goods per market
# T: number of markets
# N: number of simulations
@jit
def compute_indirect_utility(x, v, p, y, delta, theta_2, J, T, N):
# make sure theta_2 are positive
theta_2 = np.abs(theta_2)
# output matrix
out = np.zeros((sum(J), N))
# call the iteration function to calculate utilities
out = util_iter(out, x, v, p, y, delta, theta_2, J, T, N)
return out
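As a quick smoke test (a sketch, using the BLP (1995) starting values that reappear further below), the output should have one row per product-market pair and one column per simulated individual:
# evaluate utilities once at the BLP (1995) starting values
theta_test = np.array([3.612, 4.628, 1.818, 1.050, 2.056, 43.501])
u_test = compute_indirect_utility(X, V, p, y_it, delta_0, theta_test, J, T, N)
print(u_test.shape)  # (sum(J), N)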
# computes the implied shares of goods in each market given inputs
# same inputs as above function
@jit
def compute_share(x, v, p, y, delta, theta_2, J, T, N):
q = np.zeros((np.sum(J), N))
# obtain vector of indirect utilities
u = compute_indirect_utility(x, v, p, y, delta, theta_2, J, T, N)
# exponentiate the utilities
exp_u = np.exp(u)
# pointer to first good in the market
first_good = 0
for t in range(T):
# market size of market t
mktSize = J[t]
# calculate the numerator of the share eq
numer = exp_u[first_good:first_good + mktSize,:]
# calculate the denom of the share eq
denom = 1 + numer.sum(axis = 0)
# calculate the quantity each indv purchases of each good in each market
q[first_good:first_good + mktSize,:] = numer/denom
first_good += mktSize
    # to obtain shares, assume that each simulation carries the same weight.
    # this averages across each row, which computes the share of each
    # good in each market.
s = np.matmul(q, np.repeat(1/N, N))
return [q,s]
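Because the outside good contributes the 1 in the denominator of the share equation, the predicted inside shares in each market must sum to strictly less than one. A quick check, reusing theta_test from the smoke test above:
q_test, s_test = compute_share(X, V, p, y_it, delta_0, theta_test, J, T, N)
first_good = 0
for t in range(T):
    # the remaining share in each market goes to the outside good
    assert s_test[first_good:first_good + J[t]].sum() < 1
    first_good += J[t]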
@jit
def solve_delta(s, x, v, p, y, delta, theta_2, J, T, N, tol):
    # initialize the convergence error above the tolerance
    eps = 10
# renaming delta as delta^r
delta_old = delta
while eps > tol:
# Aviv's step 1: obtain predicted shares and quantities
q_s = compute_share(x, v, p, y, delta_old,
theta_2, J, T, N)
# extract the shares
sigma_jt = q_s[1]
# step 2: use contraction mapping to find delta
delta_new = delta_old + np.log(s/sigma_jt)
# update tolerance
eps = np.max(np.abs(delta_new - delta_old))
delta_old = delta_new.copy()
return delta_old
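For reference, this loop is the BLP contraction mapping: writing the observed shares as $s$ and the model-predicted shares as $\sigma(\delta, \theta_2)$, each pass computes
$$\delta^{r+1} = \delta^{r} + \ln s - \ln \sigma(\delta^{r}, \theta_2)$$
and iterates until the sup-norm of the change falls below tol.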
# calculates the marginal costs given probabilities and shares
# q_s: output of compute share, a list of probabilities matrix(q) and shares vector(s)
# firms: vector of firms operating in each market (length is JxT)
# marks: vector of unique markets (length T)
# markets: vector indicating observation in which market (length JxT)
@jit
def calc_mc(q_s, firms, p, y, alpha, J, T, N, marks, markets):
# declare output vector
out = np.zeros((np.sum(J)))
# make sure the value of alpha is positive
alpha = np.abs(alpha)
# read in quantities
q = q_s[0]
# read in shares
s = q_s[1].reshape((-1,1))
# reshape some vectors into column vectors
p = p.reshape((-1,1))
# iterate over markets
for m in marks:
# obtain list of firms operating in that market/year
firm_yr = firms[markets == m]
# obtain list of prices of goods in that market/year
price = p[markets == m]
# J_t x J_t block matrix of 1's indicating goods belonging to same firm
# in that market/year
# Also known as the ownership matrix
same_firm = np.equal(firm_yr, np.transpose(firm_yr))
# obtain matrix of probabilities for all simulations for goods in that
# market/year
yr = q[markets == m,:]
# obtain number of goods in that market
nobs = np.size(yr, 0)
# this is the omega matrix initializing
grad = np.zeros((nobs, nobs))
        # calculate the omega matrix by iterating through all individuals
# Omega matrix is cross-price deriv element-wise product with
# ownership matrix
for i in range(N):
yr_i = yr[:, i].reshape((-1, 1))
grad += alpha / y[i, m - 1] * same_firm * 1/N * \
(yr_i @ yr_i.T - np.diag(yr[:,i]))
# Omega matrix actually requires negative cross price derivs
subMatrix = -grad
# now obtain the marginal costs
b = np.linalg.inv(subMatrix) @ s[markets == m]
mc = price - b
mc[mc < 0] = .001
# update entries in the output vector
out[markets == m] = mc.flatten()
return out
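The matrix inversion above undoes the multiproduct Bertrand pricing first-order condition: each firm $f$ chooses the prices of its products so that, for every good $j$ it owns,
$$s_j + \sum_{k \in \mathcal{F}_f} (p_k - mc_k) \frac{\partial s_k}{\partial p_j} = 0,$$
so stacking the goods in a market gives $mc = p - \Omega^{-1} s$, where $\Omega_{jk} = -\partial s_k / \partial p_j$ when $j$ and $k$ share an owner and $0$ otherwise. That is exactly the price - b computation in the loop.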
# generate the instruments used in the BLP paper
# To be honest: I have no idea what's going on here.
# There is also discussion that the instruments BLP constructed were flawed.
# Nevertheless, the code below generates instruments that correctly match
# the ones used in the BLP paper.
# One could instead use packages to generate more "optimal" instruments,
# such as Hausman instruments.
# A ready-made set of instruments ships with Conlon and Gortmaker's pyblp package.
def gen_inst(inx, firms, marks, markets):
totMarket = np.zeros((np.size(inx, axis = 0), np.size(inx, axis = 1)))
totFirm = np.zeros((np.size(inx, axis = 0), np.size(inx, axis = 1)))
for m in marks:
sub = inx[markets == m, :]
firminfo = firms[markets == m]
same_firm = np.equal(firminfo, np.transpose(firminfo))
z_1 = np.zeros((np.size(sub,axis = 0), np.size(sub, axis = 1)))
for i in range(np.size(sub, axis = 1)):
z_1[:,i] = (sub[:,i].reshape((-1,1)) * same_firm).T.sum(axis = 0)
totFirm[markets == m,:] = z_1
z_1 = np.zeros((np.size(sub,axis = 0), np.size(sub, axis = 1)))
for i in range(np.size(sub, axis = 1)):
z_1[:,i] = (sub[:,i].reshape((-1,1)) * (same_firm + np.logical_not(same_firm))).sum(axis = 0)
totMarket[markets == m, :] = z_1
return [totFirm, totMarket]
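Mechanically, the two outputs can be read off the code (a small usage sketch, assuming the X, firms, marks, and markets objects defined above): totMarket sums each characteristic over every product in the observation's market, while the transpose in the totFirm loop makes each entry the product's own characteristic scaled by the number of same-firm products in that market (rather than a straight same-firm sum), which, per the note above, is what matches the paper's instruments.
# z_firm[j, l]: characteristic l of product j times the number of products
#               j's firm sells in j's market (a consequence of the .T above)
# z_mkt[j, l]:  sum of characteristic l over all products in j's market
z_firm, z_mkt = gen_inst(X, firms, marks, markets)
print(z_firm.shape, z_mkt.shape)  # both match X's shape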
# If you're looking at the four steps Aviv lists in his appendix, start here
# This is the objective function that we optimize the non-linear parameters over
def objective(theta_2, s, X, V, p, y, J, T, N, marks, markets, tol,
Z, Z_s, W, weigh, firms):
obs = np.sum(J)
# Aviv's step 1 & 2:
d.delta = solve_delta(s, X, V, p, y, d.delta, theta_2, J, T, N, tol)
# obtain the actual implied quantities and shares from converged delta
q_s = compute_share(X, V, p, y, d.delta, theta_2, J, T, N)
# calculate marginal costs
mc = calc_mc(q_s, firms, p, y, theta_2[5], J, T, N, marks, markets).reshape((-1,1))
# since we are using both demand and supply side variables,
# we want to stack the estimated delta and estimated mc
y2 = np.vstack((d.delta.reshape((-1,1)), np.log(mc)))
    # create a characteristics matrix with both demand and supply sides:
    # demand characteristics in the top-left block, supply in the bottom-right
    x = scipy.linalg.block_diag(X, W)
    # create the matrix of demand and supply instruments, arranged the same way:
    # demand instruments in the top-left block, supply in the bottom-right
    z = scipy.linalg.block_diag(Z, Z_s)
# get linear parameters (this FOC is from Aviv's appendix)
b = np.linalg.inv(x.T @ z @ weigh @ z.T @ x) @ (x.T @ z @ weigh @ z.T @ y2)
    # Step 3: get the stacked error terms, xi (demand) on top of omega (supply)
    xi_w = y2 - x @ b
    # compute g_bar for the GMM objective
g = z.T @ xi_w / obs
obj = float(obs**2 * g.T @ weigh @ g)
print([theta_2, obj])
return obj
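In GMM terms, with the stacked residual vector $\hat{\xi}$ (demand errors on top of supply errors), stacked instruments $Z$, and $n = \sum_t J_t$, the function returns
$$\bar{g} = \frac{1}{n} Z'\hat{\xi}, \qquad Q(\theta_2) = n^2 \, \bar{g}' \, W \, \bar{g},$$
where the $n^2$ scaling does not affect the minimizer.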
# create table
table_1 = pd.DataFrame()
# calculate weighted means of the following columns, using quantity weights
table_1[[
"P", "Dom", "Japan", "Eu", "HP_Wt", "Size", "Air", "MPG", "MPD", "drop"
]] = df[["price", "domestic", "japan",
"european", "hpwt", "space",
"air", "mpg", "mpd", "quantity"]] \
.groupby(df.year) \
.apply(lambda x: pd.Series(np.average(x, weights=x["quantity"], axis = 0)))
# count number of models per year/market
table_1.insert(0, "num_models", df.groupby("year").cons.sum().values)
# mean quantity per year/market
table_1.insert(1, "Q", df.quantity.groupby(df.year).mean())
# delete the extraneous weighted quantity column
table_1.drop('drop', axis=1, inplace=True)
# round the table to 3 digits to emulate the paper
np.round(table_1, 3)
# table 3 column 1
t3c1 = LinearRegression().fit(df[["hpwt", "air", "mpd", "space", "price"]], df.dif_2)
np.hstack((t3c1.intercept_, t3c1.coef_))
# table 3 column 3
t3c3 = LinearRegression().fit(
df[["ln_hpwt", "air", "ln_mpg", "ln_space", "trend"]],
df.ln_price)
np.hstack((t3c3.intercept_, t3c3.coef_))
# ====================================================================
# IV Reg in table 3 column 2
# ====================================================================
# close to BLP original instruments
tempDemand = gen_inst(X, firms, marks, markets)
tempSupply = gen_inst(W, firms, marks, markets)
Z = np.hstack((X, tempDemand[0], tempDemand[1]))
Z_s = np.hstack((W, tempSupply[0], tempSupply[1], df.mpd.values.reshape((-1,1))))
# # pyblp instruments
# Z = np.hstack((product_data[["demand_instruments0", "demand_instruments1",
# "demand_instruments2", "demand_instruments3",
# "demand_instruments4", "demand_instruments5"]], X))
# Z_s = np.hstack((product_data[["supply_instruments0", "supply_instruments1",
# "supply_instruments2", "supply_instruments3",
# "supply_instruments4", "supply_instruments5"]], W))
baseData = np.hstack((X, p.reshape((-1,1))))
zxw1 = Z.T @ baseData
bx1 = np.linalg.inv(zxw1.T @ zxw1) @ zxw1.T @ Z.T @ delta_0
e = delta_0 - baseData @ bx1
g_ind = e.reshape((-1,1)) * Z
demean = g_ind - g_ind.mean(axis=0).reshape((1,-1))
vg = demean.T @ demean / demean.shape[0]
w0 = np.linalg.inv(vg)
t3c2 = np.linalg.inv(zxw1.T @ w0 @ zxw1) @ zxw1.T @ w0 @ Z.T @ delta_0
t3c2
# obtain block-diag matrix of supply and demand instruments
z = scipy.linalg.block_diag(Z,Z_s)
# Recommended initial weighting matrix from Aviv's appendix
w1 = np.linalg.inv(z.T @ z)
# run the objective function once to see that it is working
# initial parameter guess, from BLP (1995)
theta_2 = [3.612, 4.628, 1.818, 1.050, 2.056, 43.501]
t0 = time.time()
obj = objective(theta_2,
df.share.values,
X, V, p, y_it, J, T, N, marks, markets, 1e-4,
Z, Z_s, W, w1, firms)
time.time() - t0
# bounds for the minimization problem
# not strictly needed, because the utility function takes the absolute value
# of the parameters before calculating
# Step 4: search over parameters that minimize the objective function
t1 = time.time()
# set bounds for optimization (not entirely needed but oh well)
bnds = ((0,np.inf), (0,np.inf), (0,np.inf),
(0,np.inf), (0,np.inf), (5,np.inf))
res_init_wt = minimize(objective,
theta_2,
args = (df.share.values, X, V, p, y_it,
J, T, N, marks, markets, 1e-4,
Z, Z_s, W, w1, firms),
bounds = bnds,
method = "L-BFGS-B",
options = {'maxiter': 1000, 'maxfun': 1000, 'eps': 1e-3},
tol = 1e-4)
time.time() - t1
# save the output
outfile = open("res_init_wt_bfgs.pickle", "wb")
pickle.dump(res_init_wt, outfile)
outfile.close()
# TODO: read the saved output back in here and comment out the save above
obs = np.sum(J)
# approximate the optimal weighting matrix using the first-stage estimates
params_2 = res_init_wt.x
# calculate mean utility at those parameters (under the initial weighting matrix)
d.delta = solve_delta(df.share.values, X, V, p, y_it,
                      d.delta, params_2, J, T, N, 1e-5)
# calculate probabilities and shares at those parameters
q_s = compute_share(X, V, p, y_it, d.delta, params_2, J, T, N)
# calculate marginal costs
mc = calc_mc(q_s, firms, p, y_it, params_2[5], J, T, N, marks, markets).reshape((-1,1))
y2 = np.vstack((d.delta.reshape((-1,1)), np.log(mc)))
x = scipy.linalg.block_diag(X,W)
z = scipy.linalg.block_diag(Z,Z_s)
# this is the first order condition that solves for the linear parameters
b = np.linalg.inv(x.T @ z @ w1 @ z.T @ x) @ (x.T @ z @ w1 @ z.T @ y2)
# obtain the error
xi_w = y2 - x @ b
# update weighting matrix
g_ind = z * xi_w
vg = g_ind.T @ g_ind / obs
# obtain optimal weighting matrix
weight = scipy.linalg.inv(vg)
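This is the standard second-step weight for efficient two-step GMM: with the sum running over the stacked demand and supply rows and $n = \sum_t J_t$,
$$\hat{W} = \Big( \frac{1}{n} \sum_i \hat{\xi}_i^2 \, z_i z_i' \Big)^{-1},$$
which is what g_ind.T @ g_ind / obs followed by the inverse computes.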
# now search for optimal parameters with the optimal weighting matrix
t1 = time.time()
res = minimize(objective,
theta_2,
args = (df.share.values, X, V, p, y_it,
J, T, N, marks, markets, 1e-4,
Z, Z_s, W, weight, firms),
bounds = bnds,
method = "L-BFGS-B",
options = {'maxiter': 1000, 'maxfun': 1000, 'eps': 1e-3},
tol = 1e-4)
time.time() - t1
outfile = open("res_bfgs.pickle", "wb")
pickle.dump(res, outfile)
outfile.close()
# extract the estimated non-linear parameters
params_3 = res.x
d.delta = solve_delta(df.share.values, X, V, p, y_it,
d.delta, params_3, J, T, N, 1e-4)
q_s = compute_share(X, V, p, y_it, d.delta, params_3, J, T, N)
mc = calc_mc(q_s, firms, p, y_it, params_3[5], J, T, N, marks, markets).reshape((-1,1))
y2 = np.vstack((d.delta.reshape((-1,1)), np.log(mc)))
# linear parameters
# this is the FOC on page 5 of Aviv's Appendix
# compare parameters to Table iv of BLP
# first 5 are the demand side means
# last 6 are the cost side params
b = np.linalg.inv(x.T @ z @ weight @ z.T @ x) @ (x.T @ z @ weight @ z.T @ y2)
b
# non-linear parameters - these are the sigma estimates in table iv of BLP
res.x
# average markup (compare to table 12 of Conlon and Gortmaker (2019))
np.mean((p.flatten() - mc.flatten())/p.flatten())
share_deriv = []
q = q_s[0]
s = q_s[1]
for m in marks:
# obtain list of firms operating in that market/year
firm_yr = firms[markets == m]
# obtain list of prices of goods in that market/year
price = p[markets == m]
# J_t x J_t block matrix of 1's indicating goods belonging to same firm
# in that market/year
# Also known as the ownership matrix
same_firm = np.equal(firm_yr, np.transpose(firm_yr))
# obtain matrix of probabilities for all simulations for goods in that
# market/year
yr = q[markets == m,:]
# obtain number of goods in that market
nobs = np.size(yr, 0)
    # initialize the matrix of share-price derivatives for this market
    deriv_m = np.zeros((nobs, nobs))
    # accumulate the derivative matrix by iterating through all individuals;
    # unlike in calc_mc, no ownership restriction is applied here, because we
    # want every cross-price derivative
for i in range(N):
yr_i = yr[:, i].reshape((-1, 1))
deriv_m += params_3[5] / y_it[i, m - 1] * 1/N * \
(yr_i @ yr_i.T - np.diag(yr[:,i]))
share_deriv.append(deriv_m)
# obtain the mazda 323 own-price deriv in 1990
mz323_deriv = (share_deriv[19][df.modelvec[markets == 20] == "MZ323"]).flatten()[df.modelvec[markets == 20] == "MZ323"]
# obtain the bmw 735i own-price deriv in 1990
bw735i_deriv = (share_deriv[19][df.modelvec[markets == 20] == "BW735i"]).flatten()[df.modelvec[markets == 20] == "BW735i"]
# obtain the mazda 323 mkt share in 1990
mz323_s = s[(markets == 20) & (df.modelvec == "MZ323")]
# obtain the bmw 735i market share in 1990
bw735i_s = s[(markets == 20) & (df.modelvec == "BW735i")]
# compare with table VI, first row first column of BLP
mz323_deriv/ mz323_s * 100
# compare with table VI, last row last column of BLP
bw735i_deriv/bw735i_s * 100
# now we attempt to create all of table VI
# extract all of the cars
table_vi_cars = ["MZ323", "NISENT", "FDESCO", "CVCAVA", "HDACCO", "FDTAUR", "BKCENT",
"NIMAXI", "ACLEGE", "LNTOWN", "CDSEVI", "LXLS40", "BW735i"]
# obtain share derivs and shares relevant cars in 1990
deriv_1990 = share_deriv[19]
s_1990 = s[markets == 20]
# initialize the matrix dimensions of table VI
table_vi_mx = np.zeros((len(table_vi_cars), len(table_vi_cars)))
# iterate across cars to obtain table vi
row = 0
for car in table_vi_cars:
col = 0
for cars in table_vi_cars:
table_vi_mx[row, col] = (deriv_1990[df.modelvec[markets == 20] == cars].flatten())[df.modelvec[markets == 20] == car] / \
s_1990[df.modelvec[markets == 20] == car] * 100
col += 1
row += 1
# store in dataframe for easier viewing
table_vi = pd.DataFrame(np.round(table_vi_mx, 3))
# renaming rows and columns
table_vi.index = table_vi_cars
table_vi.columns = table_vi_cars
table_vi
Berry, S., Levinsohn, J., & Pakes, A. (1995). Automobile prices in market equilibrium. Econometrica, 63(4), 841-890.
Conlon, C., & Gortmaker, J. (2019). Best practices for differentiated products demand estimation with pyblp. Working paper. URL: https://chrisconlon.github.io/site/pyblp.pdf
Nevo, A. (2000). A practitioner's guide to estimation of random-coefficients logit models of demand. Journal of Economics & Management Strategy, 9(4), 513-548.