-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblp.jl
960 lines (782 loc) · 41.2 KB
/
blp.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
# BLP for opioids.
#=
Resources:
NEVO RA Guide, RAs guide to BLP appendix
Nevo, RAs guide to BLP Estimation
Gentzkow, Shapiro Replication
Train, Discrete Choice Models with Simulation
=#
#=
how do we handle correlation among product characteristics when that is unknown?
They are all assumed independent, apparently, b/c anything else is too demanding.
=#
using CSV
#using GLM
using DataFrames
using DataFramesMeta
using StatsBase
#using StaticArrays
using LinearAlgebra
using Statistics
using Random
using Distributed
#using DistributedArrays
using SharedArrays
"""
`FW(x)`
Takes the object x and maps it to FrequencyWeights for use in sampling.
x should be a vector of real numbers of some kind
## Test: ##
julia>`FW([1.0])`
1-element FrequencyWeights{Float64,Float64,Array{Float64,1}}:
1.0
julia>`sample(["a","b"], FW([0.5, 0.5]))`
"b"
"""
function FW(x)
return FrequencyWeights(x)
end
"""
`MFW(ar1)`
Takes a matrix of real numbers (percentages of population having some characteristic from CPS)
and returns an Array{AbstractWeights, 1} where each element is a set of FrequencyWeights for some
population characteristic.
## Test ##
julia> MFW([0.5 0.5;0.25 0.75])
2-element Array{AbstractWeights,1}:
[0.5, 0.5]
[0.25, 0.75]
julia> sample(["a", "B"], MFW([0.5 0.5;0.25 0.75])[1])
"B"
julia> for el in MFW([0.5 0.5;0.25 0.75])
println(sample(["a", "b"], el))
end
"a"
"b"
"""
function MFW(ar1)
outp = Array{AbstractWeights, 1}()
for i = 1:size(ar1,1)
push!(outp, FW(ar1[i,:]))
end
return outp
end
"""
`OH(N)`
Generates an Array{Array{Int64,1}, 1}, of one-hot vectors. It's an identity matrix, but represented
as a vector of vectors, so the indexing is simpler.
## Test ##
julia> OH(3)
3-element Array{Array{Int64,1},1}:
[1, 0, 0]
[0, 1, 0]
[0, 0, 1]
"""
function OH(N)
# generates an Array{Array{Int64,1}, 1} w/ 1's on the diagonal. Looks like an identity matrix but is an array of the rows of one instead.
outp = Array{Array{Int64,1},1}()
for i = 1:N
tm1 = zeros(N)
tm1[i] += 1
push!(outp, tm1)
end
return outp
end
"""
`Sampler(n, x...)`
Samples characteristics for a single individual. Returns a list of those characteristics x..., which can handle many
characteristics. Index n is for the market.
- n indexes the market
- x... is a collection of characteristics, stored as tuples of OH-vectors and FrequencyWeights.
## TEST: ##
mkt_chars = CSV.read("./state_demographics.csv", DataFrame)
mkt_chars[!,:total_est_pop] = log.(mkt_chars[!,:total_est_pop])
race_w = convert(Array{Float64,2}, mkt_chars[!, [:race_white, :race_afam, :race_nativeam, :race_asian, :race_pacisland, :race_other]])
disability_w = hcat(mkt_chars[!,:total_pop]-mkt_chars[!,:total_pop_w_disability], mkt_chars[!,:total_pop_w_disability])
race = (OH(size(race_w, 2)), MFW(race_w))
disability = (OH(size(disability_w, 2)), MFW(disability_w))
Sampler(4, race, disability)
# returns an Array{Float64,1} of eight elements, first six corresponding to race and last two to disability status.
"""
function Sampler(n, x...)
# generates ONE random individual according to the objects in the list x...
outp = zeros(Float64, 0)
for (i,el) in enumerate(x)
outp = vcat(outp, sample(el[1], el[2][n])) # n is the index of the market sampling weights
end
return outp
end
"""
`Population(S, n, x...)`
Returns a set of S individuals w/ random characteristics from the collection x... for the market n.
- S is an integer representing a number of people.
- n is the index of a market
- x... is a collection of characteristics, stored as tuples of OH-vectors and FrequencyWeights
## TEST: ##
mkt_chars = CSV.read("./state_demographics.csv", DataFrame)
mkt_chars[!,:total_est_pop] = log.(mkt_chars[!,:total_est_pop])
race_w = convert(Array{Float64,2}, Matrix(mkt_chars[!, [:race_white, :race_afam, :race_nativeam, :race_asian, :race_pacisland, :race_other]]))
disability_w = hcat(mkt_chars[!,:total_pop]-mkt_chars[!,:total_pop_w_disability], mkt_chars[!,:total_pop_w_disability])
race = (OH(size(race_w, 2)), MFW(race_w))
disability = (OH(size(disability_w, 2)), MFW(disability_w))
Population(100, 2, race, disability)
# returns an 8 × 100 array, where each column is an "individual" represented as a race (first six elements) and a disability status (last two)
"""
function Population(S, n, x...)
# for a market n, returns S random individuals given the objects in the list x
length = size( Sampler(n, x...) , 1)
outp = zeros(Float64, length, S)
for i = 1:S
outp[:,i] += Sampler(n, x...)
end
return outp
end
"""
`PopMarkets(M, S, Ch, x...)`
Returns a collection of simulated individuals numbering S across the markets M, where C is a set of random shocks and x... is a collection of characteristics.
- M indexes markets. A list of e.g., strings ("Alabama", "Arkansas", ... ) or years or market-years.
- S is the number of individuals to be generated.
- Ch is an integer. indexes the number of characteristics w/ a random coefficient as an integer (?)
- x... is a list of demographics, e.g., pop_w, race_w, ...
## TEST: ##
mkt_chars = CSV.read("./state_demographics.csv", DataFrame)
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
mkt_ids = sort([Base.Iterators.product(st1,yr1)...], by = x-> (x[1], x[2]))
pop_w = convert(Array{Float64,2}, Matrix(mkt_chars[!, [:pop_10_14, :pop_15_19, :pop_20_24, :pop_25_29, :pop_30_34, :pop_35_39, :pop_40_44, :pop_45_49, :pop_50_54, :pop_55_59, :pop_60_64, :pop_65_69, :pop_70_74, :pop_75_79, :pop_80_84, :pop_85_plus]]))
pop = (OH(size(pop_w, 2)), MFW(pop_w))
N_individuals = 100
N_characteristics = 3
sim_individuals, shocks = PopMarkets(st1, N_individuals, N_characteristics, pop)
# returns a 19 × 100 × 52 (characteristics × individuals × markets) array
"""
function PopMarkets(M, S, Ch, x...; years = 5)
# get the simulated for S individuals demographics over all of the markets M w/ Ch characteristics and x... demographics.
length = size( Sampler(1, x...) , 1)
mkts = size(M,1)
mlen = mkts*years
outp = zeros(Float64, length+Ch, S, mlen) # hold space open for the ν = μ + σ*ϵ version of the shocks.
shocks = zeros(Float64, Ch, S, mlen)
for i = 1:mkts
for k = 1:years
outp[1:length,:,(i-1)*years+k] += Population(S, i, x...)
shocks[1:Ch,:,(i-1)*years+k] += randn(Ch, S) # shocks, but think carefully about this. These go on product features.
end
end
return outp, shocks # separately holding demographics, shocks.
end
"""
`AllMarketShares(mkts::Array, params::Array, δ::Array, products::Array, mean_utils::Array)`
- `mkts` - a collection of markets
- `params` - current parameter values
- `δ` - mean utilities
- `products` - product characteristics
- `mean_utils` - output, shares across markets
This will compute the predicted market shares across all of the markets given in `mkts`.
Operates in-place on mean_utils. Ordered returned is products × markets in mean_utils.
## TEST ##
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
shares, labs = MarketShares(st1, yr1);
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package);
params_indices, common_params, markets, shocks = MKT(10,3);
# now markets is [93,10,52] - characteristics dim × individuals × markets (states)
cinc = markets[:,:,10];
mean_u = zeros(size(shares[1])[1], 52);
AllMarketShares(markets, params_indices[1], zeros(948), charcs[1], mean_u)
mean_u
## TODO ##
δ is market specific too. Needs to be products × markets
"""
function AllMarketShares(mkts::Array, params::Array, δ::Array, products::Array, mean_utils::Array)
param_dim, n_individuals, n_markets = size(mkts)
n_products, n_chars = size(products)
ind_utils = zeros(n_products)
for m = 1:n_markets
mu = @view mean_utils[:,m]
mk = @view mkts[:,:,m]
PredShare(mk, params, δ, products, mu, ind_utils)
end
return nothing
end
"""
`PredShare(mkt::Array, params::Array, δ::Array, products::Array, market_shares, ind_utils::Array)`
Should take a single market, return mean util across a bunch of products.
To cut allocations, pass a vector to hold the utilities.
This should operate in-place on the vector δ
- `mkt` current collection of simulated individuals.
- `params` current value of all parameters
- `δ` current value of mean utilities
- `products` set of items and their characteristics, by market
- `market_shares` temporary collection to put the mean new market_shares
- `ind_utils` container for utilites, called in Utils.
∑_ns exp ( δ + ∑_k σ_k x^k_jt ν_i^k + π_k1 D_i1 + … + π_kd D_id ) / 1 + ∑ exp ( δ + ∑_k σ_k x^k_jt ν_i^k + π_k1 D_i1 + … + π_kd D_id )
## TEST ##
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
shares, labs = MarketShares(st1, yr1);
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package);
params_indices, common_params, markets, shocks = MKT(10,3);
# now markets is [93,10,52] - characteristics dim × individuals × markets (states)
cinc = markets[:,:,10];
cin_shock = shocks[:,:,10];
mu = zeros(size(shares[1],1));
δ = zeros(size(shares[1],1));
ind_u = zeros(size(shares[1],1));
p = zeros(Float64,3);
PredShare(cinc, cin_shock, params_indices[1],params_indices[2] , δ, charcs, mu, ind_u, p)
@benchmark PredShare(cinc, cin_shock, params_indices[1],params_indices[2] , δ, charcs, mu, ind_u, p)
sum(ind_u) # does sum to nearly one.
TODO - mean_utils should be an array with dimension equal to the number of markets. Each market gets a row
TODO - ind_utils can be pre-allocated at a higher level.
TIMING - takes basically exactly 10x as long (and allocates 10x) as Util when individuals == 10.
"""
function PredShare(mkt, shk, params::Array, shk_params::Array, δ::Array, products::Array, market_shares, ind_utils::Array, pd::Array)
characs, individuals = size(mkt) # number of features, number of individuals.
ZeroOut(market_shares) # be careful w/ this since it will zero out the **entire** market_shares Array.
for i = 1:individuals
v1 = @view mkt[:,i] # selects demographics for one "person" w/out allocating
s1 = @view shk[:,i]
# TODO - should all of these arguments be views??
Util( v1, s1, products, δ, params, shk_params, ind_utils, pd ) # computes utility for ALL products in market for one person
market_shares .+= ind_utils
end
market_shares ./=individuals # take mean over individuals in the market - divide by N_individuals.
return nothing
end
"""
`Util(demographics, products_char::Array, δ::Array, params::Array, utils::Array)`
Compute the utility over all products for a single simulated person in the market
- `demographics` is a view of an array of demographics for an individual
- `products_char` is an array of features
- `δ` is an array of the current mean utilities
- `params` is an array of the current set of parameters
- `utils` is an array w/ dimension equal to that of the products in the market.
- NB: operates in-place on the vector utils, which is reset to zero every time. Cuts allocations somewhat
- NB: Products, δ must be in the same order
This is computing: exp ( δ + ∑_k x^k_jt (σ_k ν_i^k + π_k1 D_i1 + … + π_kd D_id ) / 1 + ∑ exp ( δ + ∑_k σ_k x^k_jt ν_i^k + π_k1 D_i1 + … + π_kd D_id )
- δ a mean util
- x^k_jt are characteristics getting a random coefficient.
- σ_k a parameter multiplying a shock
- D_i1, ..., D_id are demographic characteristics (no random coeff, but param π_id)
## TEST ##
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
shares, labs =MarketShares(st1, yr1);
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package);
params_indices, common_params, markets, shocks = MKT(10,3);
cinc = markets[:,:,10];
cin_shock = shocks[:,:,10];
utils = zeros(size(shares[1],1));
δ = zeros(size(shares[1],1));
pd = zeros(Float64,3)
shr = zeros(size(shares[1],1));
Util(cinc[:,1], cin_shock[:,1], charcs, shr, params_indices[1], params_indices[2] ,utils, pd );
@benchmark Util(cinc[:,1], cin_shock[:,1], charcs, shr, params_indices[1], params_indices[2] ,utils, pd)
minimum time: 67.992 μs (0.00% GC)
median time: 69.050 μs (0.00% GC)
mean time: 75.198 μs (2.18% GC)
maximum time: 1.942 ms (94.05% GC)
## Known answer test case ##
us = [0.0 0.0 0.0]
Util([1; 0; 0], ['x' 'y' 1 1], [0 0 0], [1; 1; 1], us)
us.≈[0.7112345942275937 0.09625513525746869 0.09625513525746869]
# TODO - approximately equals 1, but *very* approximately (w/in 0.1). Should be able to do better.
# TODO - made 40% faster, but can I do more? multithreading makes max_time worse, FYI.
# TODO - problem currently is that shares and charcs have different dimensions. Why? Product characteristics
now are too numerous.
"""
function Util(demographics,
shocks,
products_char::Array,
δ::Array,
demo_params::Array,
shock_params::Array,
utils::Array,
pd::Array)
ZeroOut(utils) # will hold utility for one guy over all of the products
num_prods, num_chars = size(products_char)
ZeroOut(pd)
demo_ix = size(demographics,1)-size(pd,1) # Starting point. This is terrible.
for k = 1:size(pd,1)
@inbounds demographics[demo_ix+k] = shock_params[1,k] + shock_params[2,k]*shocks[k] # Creates the characteristic-specific shock first.
end
for j = 1:size(pd,1) # == size(params,2)
for i = 1:size(demographics,1) # NOTE: demographics column has different dimensions from params (shorter, on purpose, see InitialParams).
@inbounds pd[j] += demo_params[i,j]*demographics[i] # this term is constant across the products
end
end
utils .+= δ
for i = 1:num_prods # NB: multithreading here makes max_time worse by 6x - 8x
tmp_sum = 0.0 # reset the running utility for each person - weirdly faster than adding directly to utils[i].
for j = 2:num_chars # TODO - this can be redone so that it doesn't require keeping track of this 3.
@inbounds tmp_sum += products_char[i,j]*pd[j] # TODO - 90% of the allocation in this function takes place here.
end
utils[i] += tmp_sum # TODO - the other 10% of the allocation takes place here.
end
mx_u = maximum(utils) # max for numerical stability - faster than doing the comparison every step in the main loop
sm = 1/exp(mx_u) # denominator: 1/exp(mx_u) + sum (exp ( util - mx_u))
for i = 1:length(utils)
@inbounds sm += exp(utils[i]-mx_u)
@inbounds utils[i] = exp(utils[i] - mx_u)
end
utils ./=sm
return nothing # make sure this doesn't return, but operates on utils in place
end
"""
`ZeroOut`
Dumb function to set a vector or any array equal to zero.
"""
function ZeroOut(x)
for i = 1:length(x)
x[i] = 0.0
end
end
"""
`Contraction(mkt::Array, params::Array, products::Array, empirical_shares, predicted_shares, δ::Array, new_δ::Array ; ϵ = 1e-6, max_it = 5_000_000)`
- mkt::Array - a set of demographics for simulated individuals.
- params::Array - the current set of parameters (e.g., β, σ)
- products::Array - a set of product features, needed to compute predicted shares
- empirical_shares - these are shares from the data.
- predicted_shares - market shares are predicted given the δ's available at the current iteration
- δ::Array - the product-specific terms used in computing utility.
- new_δ::Array - container to hold the new δ's after the update
- ϵ = 1e-6 - this will need to be set a lot lower eventually.
- max_it = 5_000_000 - stop if it doesn't finish by then.
Computes the contraction within a single market.
This should be:
δ^{t+1} = δ^{t} + log S_{.t} - log ( s_{.t}( ... ) )
- δ^t is previous iteration of values
- log S_{.t} is log market shares from data
- log (s_{.t} (...) ) are log computed market shares.
## Test ##
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
shares, labs = MarketShares(st1, yr1);
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package);
params_indices, common_params, markets, shocks = MKT(10,3);
# now markets is [93,10,52] - characteristics dim × individuals × markets (states)
cinc = markets[:,:,10];
cinc_shock = shocks[:,:,10];
market_shares = zeros(size(shares[1],1), 52);
# TODO - indexing here will allocate, @view instead.
emp_shr = @view shares[1][:,2]
Contraction(cinc, cinc_shock, params_indices[1], params_indices[2], charcs, emp_shr, 4)
This allocates a lot and takes a while, b/c it is computing PredShares until convergence.
numerically stable version frequently gives NaN, probably due to small mkt shares?
Timing note: all overhead is due to Utils, via PredShares
# TODO - needs to take a market identifier and return one. Easy.
# TODO - now returns all NaNs pretty rapidly
"""
function Contraction(mkt::Array, mkt_shock::Array, params::Array, shk_params::Array, products::Array, empirical_shares, ID; ϵ = 1e-6, max_it = 5)
conv = 1.0
curr_its = 1
us = zeros(size(products,1),1) # TODO - check that this will be right.
δ = zeros(size(empirical_shares,1),1).+=(1/size(empirical_shares,1))
new_δ = zeros(size(empirical_shares,1)) # TODO - these can be started at a better value, like log(s) - log(s_0)
predicted_shares = zeros(size(empirical_shares,1),1)
pd = zeros(Float64, size(mkt_shock,1),1) # The number of random coefficients.
while (conv > ϵ) & (curr_its < max_it)
# debugging related...
conv = norm(new_δ.-δ, Inf)
two_norm = norm(new_δ.-δ, 2)
if curr_its%1000 == 0
@info ID curr_its conv two_norm
end
PredShare(mkt, mkt_shock, params, shk_params, new_δ, products, predicted_shares, us, pd)
# now update the δ
for i = 1:length(new_δ)
δ[i] = new_δ[i] # copy/reassign so that I can preserve these values to compare.
new_δ[i] = new_δ[i] + (log(empirical_shares[i]) - log(predicted_shares[i]))
end
curr_its += 1
end
return ID, new_δ
end
"""
`FastContraction(mkt::Array, params::Array, products::Array, empirical_shares, predicted_shares, δ::Array, new_δ::Array ; ϵ = 1e-6, max_it = 5_000_000)`
A version of the Ryngaert/Varadhan/Nash (2012) faster BLP contraction.
See Section 2.3 for the equation.
See also MST RCNL.f90 lines 4737 - 4775 for an implementation
TODO
"""
function FastContraction(mkt::Array, params::Array, products::Array, empirical_shares, predicted_shares, δ::Array, new_δ::Array ; ϵ = 1e-6, max_it = 5_000_000)
return nothing
end
"""
`FormError(mkts, params::Array, products::Array, empirical_shares, predicted_shares)`
- mkts: the markets, e.g., demographics for N simulated individuals in the markets
- mkt_shocks: shocks for the random coeffs of N individuals in the market
- params::Array: Current value of the parameters, except those for the shocks
- shk_params::Array: Current value of the mean and variance of the shock parameters
- common_params::Array: current value of the common parameters.
- products::Array: product characteristics
- empirical_shares: data, e.g., the empirical shares
- IDs: a vector of IDs (here tuples of "state abbrev" and int year)
- random_coeffs = 3: number of random coefficients.
This will pmap the Contraction function across the markets to recover the error ω.
Computes ω = δ - X_1 Θ_1, where X_1 is J × T, where J is the number of products and T is the number of markets.
Per appendix Step 0, X_1 contains the components of the δ in Equation (3) in RG p.520.
So δ_jt = x_jt β - α p_jt + ξ_jt. The contraction recovers δ_jt.
There is a current set of parameters θ_1 = (β, α) which are in params.
The characteristics are in products. This is the X_1 in X_1 Θ_1
So all that is necessary is to recover ξ_jt = δ_jt - X_jt (β α)
These are the characteristics without random coefficients.
# Test
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
cop = (:copay, [:copay])
ingr = (:active_ingredient, [:codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone, :other, :tramadol])
mme = (:mme, [:mme])
pack = (:package_size, [:small_package, :medium_package, :large_package])
mkt_ID = [ Iterators.product(st1, yr1)...]
shares, labs =MarketShares(st1, yr1);
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package);
params_indices, common_params, markets, shocks = MKT(10,3);
common_params, common_locs, common_names = ProductParams(cop, ingr, mme, pack)
labs, val = FormError(markets, shocks, params_indices[1], params_indices[2], common_params, charcs, shares, labs)
TODO - need to fix the products × params issue.
TODO - fix tolerance when debugging is done!
TODO - several of these return all NaNs.
("CA", 2009)
("DE", 2009)
("HI", 2009)
("DC", 2012)
("SD", 2013)
"""
function FormError(mkts, mkt_shocks, params::Array, shk_params::Array, common_params::Array, products::Array, empirical_shares, IDs; random_coeffs = 3)
demos, individuals, num_mkts = size(mkts)
m = [ mkts[:,:,k] for k = 1:size(mkts,3)] # faster than a loop
s = [ mkt_shocks[:,:,k] for k = 1:size(mkt_shocks,3)]
p = [params for k = 1:num_mkts]
sp = [shk_params for k = 1:num_mkts]
prods = [products for k = 1:num_mkts]
# x[1] - markets, m as above
# x[2] - market shocks, s as above
# x[3] - parameters, repeated for each processor
# x[4] - shock parameters, repeated for each processor
# x[5] - products, comes w/ right dimension.
# x[6] - empirical shares, comes w/ right dimension.
# x[7] - market-level ID's
contract_δ = pmap(x->Contraction(x[1], x[2], x[3], x[4], x[5], x[6], x[7]), zip(m, s, p, sp, prods, empirical_shares, IDs))
labels, new_δ = ([x[1] for x in contract_δ], [x[2] for x in contract_δ]) # separate to regress.
# TODO - is does not make sense to do these as rows since the storage order is column major.
mkt_δ = reduce(hcat, new_δ)
err = zeros(size(mkt_δ)) # TODO - preallocate this
for i = 1:size(mkt_δ,2) # up to 255
for j = 1:size(mkt_δ, 1) # up to
pv = @view products[j, 2:end]
mv = @view common_params[:]
err[j,i] = mkt_δ[j,i] - pv'*mv# form error here.
end
end
# reshape to stack
err = reshape(err, (size(err,1)*size(err,2), 1)) # dimensions are J products × T markets
# TODO - these must be sorted so that the order respects that of the instruments.
return labels, err
end
"""
GMM()
- W/ the error in hand, form the GMM objective.
- The instruments are constant obviously.
- This function should call `FormError`
- We want:
ω(θ)'ZΦZ'ω(θ)
The matrices are now the right shape...
# TODO - Φ function
"""
function GMM(err::Array, Z::Array)
# TODO - these dimensions aren't right anyway.
Φ = I(size(transpose(err)*Z,1), size(transpose(Z)*err,2))# TODO
(transpose(err)*Z)*Φ*(transpose(Z)*err)
return nothing
end
"""
`Instruments`
- These instruments are created in differentiation_ivs.do
- They are organized by Year - State - NDC code
- All are normalized so they have mean 0 and variance 1. This is fine - gives numerically identical results in the linear case.
- There are ten instruments in this order:
1. "smme_prod_instrument",
2. "sp_prod_instrument",
3. "scov_price_ingredient_instrument",
4. "scov_mme_package_instrument",
5. "scov_price_package_instrument",
6. "scov_price_mme_instrument",
7. "ssub_instrument",
8. "spackage_instrument",
9. "smme_instrument",
10. "sp_instrument"
TODO - there is still something to do w/ the p̂ values from Gandhi/Houde.
# Checking the size of this or something else...
# Now there are a huge number of missing values here - why?
i1 = Instruments()
"""
function Instruments()
# load the instruments generated by differentiation_ivs.do.
inst = CSV.read("./differentiation_ivs.csv", DataFrame)
# There are missing vals in the DataFrame now.
return Matrix{Union{Float64,Missing}}( inst[!, ["smme_prod_instrument", "sp_prod_instrument", "scov_price_ingredient_instrument", "scov_mme_package_instrument", "scov_price_package_instrument", "scov_price_mme_instrument", "ssub_instrument", "spackage_instrument", "smme_instrument", "sp_instrument"]])
end
"""
`NormalizeVar(x)`
Normalizes the variable given by the vector x, subtracting the mean and dividing by the SD.
## Test ##
NormalizeVar([1,0]) ≈ [+ √2, - √2]
"""
function NormalizeVar(x)
# check here that this has only one dimension
if (minimum(size(x)) > 2)&(ndims(x) > 1)
throw(ArgumentError("the argument x is too big, e.g., is not a column vector"))
end
μ = Statistics.mean(x)
σ = Statistics.std(x)
return (x.-μ)./σ
end
"""
`InitialParams(Characteristics, x...; rand_init = true )`
Will take a collection of demographics stored as tuples of OH-vectors and FrequencyWeights in x...
and will return an initial vector, which can either be random or from a particular spot.
Will also return a collection of indexes corresponding to the dimension of the x... vectors
Can set rand_init = false to start from a particular spot.
- `Characteristics` - how many characteristics will get random coefficients? these have variances > 0
- `Characteristics` - must also have means.
- `x...` collection of demographic features stored as tuples of OH-vectors and FrequencyWeights
- The `x...` features are "dummy trap coded". Fix first value here to zero.
- `rand_init` = true; set to false to start from some particular set of parameters
- The returned items are a vector w/ the parameter values
- FIXME: [Not correct at the moment] A vector of tuples recording where each demographic item stops and finishes
- Then a separate matrix w/ the random coefficient pieces.
- the locations of the means of the random coefficients.
- Followed by the locations of the variances of the random coefficients.
- The last 2*#(characteristics) + [#(characteristics) × #(characteristics)] items should be handled carefully.
- For each characteristic i, the last element in the block is the variance.
- For each characteristic i, the second last in this block is the mean.
- the #(c) × #(c) prior to that, the i-th element of row i is the σ. The j ≂̸ i are the covariances among different product characteristics.
- Then for each individual we have drawn N(0,1) shocks η', so the shock for the individual is actually: η_i = μ_i + η'_i*σ_i
- This appears interacted w/ the i-th random characteristic
- But also in the j-th random characteristic via ρ_ij ( η_j ) = ρ_ij ( μ_i + η'_i*σ_i)
- Params vector returned by initial params is LONGER than demographics.
## Test ##
mkt_chars = CSV.read("./state_demographics.csv", DataFrame) ;
mkt_chars[!,:total_est_pop] = log.(mkt_chars[!,:total_est_pop]);
race_w = Matrix{Float64}( mkt_chars[!, [:race_white, :race_afam, :race_nativeam, :race_asian, :race_pacisland, :race_other]]);
disability_w = hcat(mkt_chars[!,:total_pop]-mkt_chars[!,:total_pop_w_disability], mkt_chars[!,:total_pop_w_disability]);
race = (OH(size(race_w, 2)), MFW(race_w));
disability = (OH(size(disability_w, 2)), MFW(disability_w));
a1, b1, c1 = InitialParams(3, race, disability)
- not right ATM since another argument added
b1 == [(1,6), (7,8)]
sum(a1 .≈ [ 1.065729740994666, -0.829056350999318, 0.8962148070867403, 1.0436992067470956, 0.07009106904271295, -0.5353616478134361, -0.44631360818507515, 0.11163462482008807]) == 8
TODO: indices are not right, but they aren't being used so whatever.
"""
function InitialParams(Characteristics, x...; rand_init = true )
Random.seed!(323)
ds = Array{Tuple{Int64,Int64},1}()
curr = 1
len = 0
first_ix = zeros(Int64, length(x)) # this will be length(x) long
# creates tuples corresponding to parameters, records how many parameters there are.
for (i, el) in enumerate(x)
push!(ds, (curr,curr+length(el[1])-1))
first_ix[i] = curr
curr = curr +length(el[1])
len += length(el[1])
end
if rand_init
arr = randn(len, max(Characteristics,1)) # generate random parameters
else
arr = zeros(Float64, len, max(Characteristics,1))
end
for j ∈ first_ix
arr[j,:] .= 0.0 # reset these back to zero to make multiplication easier.
end
if Characteristics > 0
# means of the characteristics.
push!(ds, (curr,curr+Characteristics-1))
curr = curr +max(Characteristics,1)
push!(ds, (curr,curr+Characteristics-1))
# duplicate this to get the variances.
curr += 1
push!(ds, (curr,curr+Characteristics-1))
curr = curr +max(Characteristics,1)
push!(ds, (curr,curr+Characteristics-1))
end # NB:If characteristics == 0, vcat below has the correct dimension.
vcv = rand(Float64, Characteristics, Characteristics)
# tODO - identity matrix of dim #(characteristics) here. Zeros(Characteristics,Characteristics).+I(Characteristics)
if Characteristics > 0 # Order below: parameters, means, variances, correlation parameters Ρ.
return vcat(arr, vcv), vcat(rand(Float64,1,Characteristics), abs.(rand(Float64,1,Characteristics))), ds, first_ix # can return len if need params less random coeffs.
else
return arr, ds, first_ix # special case when there are no random characteristics - not that relevant.
end
end
"""
`ProductParams`
This function returns the parameters which multiply the characteristics within δ_jt.
Since δ_jt = x_{jt}β- α p_{jt} + ξ_{jt} we need parameters multiplying the product
characteristics which are shared across all individuals in the market.
- Takes a collection of product characteristics and returns some parameters.
cop = (:copay, [:copay])
ingr = (:active_ingredient, [:codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone, :other, :tramadol])
mme = (:mme, [:mme])
pack = (:package_size, [:small_package, :medium_package, :large_package])
ProductParams(cop, ingr, mme, pack)
"""
function ProductParams(Characteristics...)
Random.seed!(31)
# 13 total
locs = Array{Tuple{Int64,Int64},1}()
name_loc = Array{Tuple{Symbol, Int64},1}()
ix = 1
tot_p = 0
for el in Characteristics
tot_p += length(el[2])
tup1 = (ix, ix+(length(el[2])-1))
push!(locs, tup1 )
for i = 1:length(el[2])
push!(name_loc, (el[2][i], ix+i-1))
end
ix += length(el[2])
end
return randn(ix-1), locs, name_loc
end
"""
`MKT(N, C)`
Imports the demographics, draws a sample of simulated individuals according to those characteristics.
- The variable N is the number of individuals drawn per market.
- C is the number of characteristics getting a random coefficient.
- Returns a collection of those individuals, plus a collection of parameters of the right dimension (corresponding to the characteristcs AND the product features)
Returns a collection given by the call to PopMarkets at the bottom:
(characteristics + # rand shocks) × number of individuals × number of markets
# Test:
mkk = MKT(10,3)
mkk[3] # the demographics across markets.
mkk[4] # shocks, so random coefficient draws
# TODO - how many markets? State × year I think.
"""
function MKT(N, C)
N_individuals = N # number of individuals to simulate per market.
N_characteristics = C # number of characteristics getting a random coeff
mkt_chars = CSV.read("./state_demographics.csv", DataFrame)
mkt_chars[!,:total_est_pop] = log.(mkt_chars[!,:total_est_pop])
# at least import the state-level data...
# TODO - there are too many features here relative to other applications.
# could do sum of maybe the 20 - 49 popluation
pop_w = Matrix{Float64}( mkt_chars[!, [:pop_10_14, :pop_15_19, :pop_20_24, :pop_25_29, :pop_30_34, :pop_35_39, :pop_40_44, :pop_45_49, :pop_50_54, :pop_55_59, :pop_60_64, :pop_65_69, :pop_70_74, :pop_75_79, :pop_80_84, :pop_85_plus]])
# males 20 - 49, 50++
male_w = Matrix{Float64}( mkt_chars[!, [:male_10_14, :male_15_19, :male_20_24, :male_25_29, :male_30_34, :male_35_39, :male_40_44, :male_45_49, :male_50_54, :male_55_59, :male_60_64, :male_65_69, :male_70_74, :male_75_79, :male_80_84, :male_85_plus]])
# females 20 - 49, 50++
female_w = Matrix{Float64}( mkt_chars[!, [:female_10_14, :female_15_19, :female_20_24, :female_25_29, :female_30_34, :female_35_39, :female_40_44, :female_45_49, :female_50_54, :female_55_59, :female_60_64, :female_65_69, :female_70_74, :female_75_79, :female_80_84, :female_85_plus]])
# white, black, other
race_w = Matrix{Float64}( mkt_chars[!, [:race_white, :race_afam, :race_nativeam, :race_asian, :race_pacisland, :race_other]])
# leave as is.
disability_w = hcat(mkt_chars[!,:total_pop]-mkt_chars[!,:total_pop_w_disability], mkt_chars[!,:total_pop_w_disability])
# HS or less, Some college or More
education_w = Matrix{Float64}( mkt_chars[!, [:total_less_than_hs_grad, :total_hs_grad, :total_some_college, :total_bachelors, :total_less_than_9th_grade, :total_only_9_12_grade, :total_aa_degree, :total_ba_degree, :total_grad_degree]])
# would be nice to get out of labor force
laborforce_w = Matrix{Float64}( mkt_chars[!, [:in_lab_for_16_19, :in_lab_for_20_24, :in_lab_for_25_44, :in_lab_for_45_54, :in_lab_for_55_64, :in_lab_for_65_74, :in_lab_for_75_plus]])
# unemployment 20 - 54
unemployment_w = Matrix{Float64}( mkt_chars[!, [:unemp_rate_16_plus, :unemp_rate_16_19, :unemp_rate_20_24, :unemp_rate_25_44, :unemp_rate_45_54, :unemp_rate_55_64, :unemp_rate_65_74, :unemp_rate_75_plus]])
# not sure yet.
hhinc_w = Matrix{Float64}( mkt_chars[!, [:hhinc_lt_10000, :hhinc_10_14999, :hhinc_15_24999, :hhinc_25_34999, :hhinc_35_49999, :hhinc_50_74999, :hhinc_75_99999, :hhinc_100_149999, :hhinc_150_199999, :hhinc_gt_200000]])
states = Vector{String}( mkt_chars[!,:name])
# sample systems
# a tuple of one-hot vectors and weights, one for each state market.
pop = (OH(size(pop_w, 2)), MFW(pop_w))
male = (OH(size(male_w, 2)), MFW(male_w))
female = (OH(size(female_w, 2)), MFW(female_w))
race = (OH(size(race_w, 2)), MFW(race_w))
disability = (OH(size(disability_w, 2)), MFW(disability_w))
education = (OH(size(education_w, 2)), MFW(education_w))
labor = (OH(size(laborforce_w, 2)), MFW(laborforce_w))
unemp = (OH(size(unemployment_w, 2)), MFW(unemployment_w))
hhinc = (OH(size(hhinc_w, 2)), MFW(hhinc_w))
# TODO: dumb place to put these
cop = (:copay, [:copay])
ingr = (:active_ingredient, [:codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone, :other, :tramadol])
mme = (:mme, [:mme])
pack = (:package_size, [:small_package, :medium_package, :large_package])
common_params, common_locs, common_names = ProductParams(cop, ingr, mme, pack)
params = InitialParams(3, pop, male, female, race, disability, education, labor, unemp, hhinc)
# NB size of this is: ("features" = demographics + characteristics) × number of individuals × # of markets
# to index one person: sim_individuals[:, i, j] -> some person i in market j
sim_individuals, shocks = PopMarkets(states, N_individuals, N_characteristics, pop, male, female, race, disability, education, labor, unemp, hhinc)
# products w/ their characteristics.
# shares, when available.
return params, common_params, sim_individuals, shocks
end
"""
`MarketShares(mkt_vars::Array, MKT...)`
Doesn't do anything but import and return the market share and product characteristic data.
Returns both the original data set and a subset called wanted_data w/ just the columns of interest.
- Takes a set of arguments MKT... which are column indices
- Slightly awful, but returns a vector: (state, year, [ndc_code, market_share]),
- Can iterate over state-year combinations like that.
- Comes sorted from mkt_shares.do
-
## TEST ##
st1 = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"];
yr1 = [2009, 2010, 2011, 2012, 2013];
a1, a2 = MarketShares(st1, yr1)
TODO: hold el[1], el[2] separate and return separately.
## To check the size...
a1, a2 = MarketShares(st1, yr1)
outp = []
for el = 1:size(a1,1)
for s = 1:size(a1[el],1)
push!(outp, a1[el][s])
end
end
size(outp)
"""
function MarketShares(MKT...) # take a variable identifying the market here
# TODO - rewrite to return two vectors, one w/ data the other w/ state/year.
inp_shares = CSV.read("./state_year_shares.csv", DataFrame) # |> DataFrame
market_shares = Array{Array{Union{String,Real},2}, 1 }()
labels = Array{ Tuple{ String, Int64 } , 1}()
for el in Iterators.product(MKT...)
tm1 = Matrix{Union{String,Real}}(inp_shares[ (inp_shares[!,:state].==el[1]).&(inp_shares[!,:yr].==el[2]) , [:ndc_code, :market_share]])
push!(labels, (el[1], el[2]))
push!(market_shares, tm1)
end
return market_shares, labels
end
"""
`ProductChars(mkt_vars::Array, Characteristics...)`
Takes a set `Characteristics` of column indexes in the file and returns the characteristics
NB: comes sorted out of mkt_shares.do.
TODO - make sure all future continuous variables are normalized.
TODO - need variables to split the market up
TODO - would make sense to keep the list so that InitialParams will still behave as intended.
## TEST ###
charcs = ProductChars(:ndc_code, :avg_copay, :codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone,:other, :tramadol, :mme, :small_package, :medium_package,:large_package)
Columns:
["ndc_code",
"avg_copay",
"codeine",
"hydrocodone",
"hydromorphone",
"methadone",
"morphine",
"oxycodone",
"other",
"tramadol",
"mme",
"small_package",
"medium_package",
"large_package"]
Categories:
[:ndc_code, [:copay]],
[:copay, [:copay]],
(:active_ingredient, [:codeine, :hydrocodone, :hydromorphone, :methadone, :morphine, :oxycodone, :other, :tramadol])
[:mme, [:mme]],
(:package_size, [:small_package, :medium_package, :large_package])
"""
function ProductChars(Characteristics...)
inp_charcs = CSV.read("./products_characteristics.csv",DataFrame)
charcs = inp_charcs[ !, [Characteristics...]]
# normalize cols 2 and 11, copay and mme.
charcs[:,2] =NormalizeVar(charcs[:,2])
charcs[:,11] = NormalizeVar(charcs[:,11])
return Matrix{Float64}(charcs )
end