%%%%%%%%%%%%%%%%%
%Problems include
%%%%%%%%%%%%%%%%%
%-References
%-Inconsistent notation
%-Implicit assumptions (e.g., iid)
%-Inconsistently pitched level
%-Stuff missing
%-Inconsistent inclusion of fact vs. proof/intuition
%-Some special cases explicit; others implicit
%-Organization is mix of chronological and thematic
%-Inconsistent formatting
%
%
%%%%%%%%%%%%%%%%%
%Things to (potentially) add include
%%%%%%%%%%%%%%%%%
%-End of Nir
%-Confidence intervals
%-Bootstrap
%-Levin math notes
%-More Bernheim notes
%-Manuel notes
%-Metrics: indirect Wald; NLLS consistency requires linearity in endogenous variables otherwise must instrument (see MaCurdy P.S. 2); QML only OK if linear in endogenous (and errors)
\documentclass[8pt,letterpaper, landscape]{extarticle} % using extarticle instead of article to get smaller fonts; should be used with three columns
%\documentclass[10pt,letterpaper, landscape]{article} % Full sized font version; should be used with two columns
\usepackage{intcheatsheet}
% Shortcuts for Script X, Y, and B and bold A, B, C, X, I, 0, ...
\newcommand{\B}{\ensuremath{\mathcal{B}}}
\newcommand{\X}{\ensuremath{\mathcal{X}}}
\newcommand{\Y}{\ensuremath{\mathcal{Y}}}
\newcommand{\mA}{\ensuremath{\mathbf{A}}}
\newcommand{\mB}{\ensuremath{\mathbf{B}}}
\newcommand{\mC}{\ensuremath{\mathbf{C}}}
\newcommand{\mD}{\ensuremath{\mathbf{D}}}
\newcommand{\mX}{\ensuremath{\mathbf{X}}}
\newcommand{\mY}{\ensuremath{\mathbf{Y}}}
\newcommand{\mx}{\ensuremath{\mathbf{x}}}
\newcommand{\my}{\ensuremath{\mathbf{y}}}
\newcommand{\mI}{\ensuremath{\mathbf{I}}}
\newcommand{\mi}{\ensuremath{\mathbf{\iota}}}
\newcommand{\mmu}{\ensuremath{\mathbf{\mu}}}
\newcommand{\mc}{\ensuremath{\mathbf{c}}}
\newcommand{\mSigma}{\ensuremath{\mathbf{\Sigma}}}
\newcommand{\mzero}{\ensuremath{\mathbf{0}}}
\renewcommand{\ln}{\log}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\noindent Luke Stein \\
%\footnote{Most content is copied verbatim (or only minimally rewritten) from a variety of sources; errors in the source materials are now in the good company of numerous additional errors I have presumably introduced in writing/compiling these notes.}
Stanford graduate economics core (2006--7) \\
Reference and review (last updated April 11, 2012)
\hrule
%%%%%%%%%%%%%%%%%%%%%%%%%%
\setlength\columnseprule{.4pt} % Set vertical rules between columns
\begin{multicols}{4}
\cftpagenumbersoff{subsubsection} % Turn off page numbers for subsections (headers) in the TOC
\tableofcontents
\end{multicols}
%\newpage % pagebreak to use instead of rule
\hrule % rule to use instead of page break
%%%%%%%%%%%%%%%%%%%%%%%%%%
\setlength\columnseprule{0pt} % Clear vertical rules between columns
\begin{multicols}{3}
%\begin{multicols*}{2}
\section{Econometrics}
\begin{description}
\subsection{Probability foundations}
\litem{Basic set theory}{C\&B 1.1.4} All sets $ A, B, C $ satisfy:
\begin{enumerate}
\item Commutativity\index{Commutativity}: $ A \cup B = B \cup A $ and $ A \cap B = B \cap A $;
\item Associativity\index{Associativity}: $ A \cup (B \cup C) = (A \cup B) \cup C $ and $ A \cap (B \cap C) = (A \cap B) \cap C $;
\item Distributive laws\index{Distributive laws}: $ A \cap (B \cup C) = (A \cap B) \cup (A \cap C) $ and $ A \cup (B \cap C) = (A \cup B) \cap (A \cup C) $;
\item DeMorgan's Laws\index{DeMorgan's Laws}: $ (A \cup B)^{\mathsf{C}} = A^{\mathsf{C}} \cap B^{\mathsf{C}} $ and $ (A \cap B)^{\mathsf{C}} = A^{\mathsf{C}} \cup B^{\mathsf{C}} $.
\end{enumerate}
\litem{Disjointness}{C\&B 1.1.5} Two events $ A $ and $ B $ are disjoint (a.k.a.\ mutually exclusive\index{Mutual exclusivity}) iff $ A \cap B = \emptyset $. Events $ \{ A_i \} $ are pairwise disjoint or mutually exclusive iff $ A_i \cap A_j = \emptyset $ for all $ i \neq j $. Two events with nonzero probability cannot be both mutually exclusive and independent (see ex.~1.39).
\litem{Partition}{C\&B 1.1.6} $ A_1, A_2, \dotsc $ is a partition of $ S $ iff:
\begin{enumerate}
\item $ S = \bigcup_i A_i $ (i.e., covering);
\item $ A_1, A_2, \dotsc $ are pairwise disjoint (i.e., non-overlapping).
\end{enumerate}
\litem{Sigma algebra}{C\&B 1.2.1} Collection of subsets of $ S $ (i.e., a subset of the power set of $ S $) is a sigma algebra, denoted \B{} iff:
\begin{enumerate}
\item $ \emptyset \in \B $;
\item If $ A \in \B $, then $ A^{\mathsf{C}} \in \B $ (closed under complementation---along with first axiom gives $ S \in \B $);
\item If $ \{ A_i \} \subseteq \B $, then $ \bigcup_i A_i \in \B $ (closed under countable unions).
\end{enumerate}
A cdf completely determines the probability distribution of a random variable if its probability function is defined only for events in the Borel field $ \B^1 $\index{Borel field}\index{B1 (Borel field)@$ \B^1 $ (Borel field)}, the smallest sigma algebra containing all the intervals of real numbers of the form $ (a, b)$, $[a, b)$, $(a, b]$, $[a,b]$. If probabilities are defined for a larger class of events, two random variables \textit{may} have the same cdf but not the same probability for every event (see C\&B p.~33).
\litem{Probability function}{C\&B 1.2.4, 8--9, 11} Given a sample space $ S $ and associated sigma algebra \B, a function $ P \colon \B \to \R $ is a probability function iff it satisfies the Kolmogorov Axioms or Axioms of Probability\index{Kolmogorov Axioms or Axioms of Probability}\index{Axioms of Probability}:
\begin{enumerate}
\item $ P(A) \geq 0 $ for all $ A \in \B $;
\item $ P(S) = 1 $;
\item If $ \{ A_i \} \subseteq \B $ are pairwise disjoint, then $ P(\bigcup_i A_i) = \sum_i P(A_i) $ (countable additivity for pairwise disjoint sets).
\end{enumerate}
For any probability function $ P $ and $ A, $ $ B \in \B $,
\begin{enumerate}
\item $ P(\emptyset) = 0 $;
\item $ P (A) \leq 1 $;
\item $ P(A^{\mathsf{C}}) = 1 - P(A) $;
\item $ P (B \cap A^{\mathsf{C}}) = P(B) - P (A \cap B) $ ($B$ but not $A$ is $ B $ minus both $ A $ and $ B $);
\item $ P (A \cup B) = P(A) + P(B) - P(A \cap B) $;
\item If $ A \subseteq B $, then $ P(A) \leq P(B) $.
\end{enumerate}
If $ \{ C_i \} $ partitions $ A $, then $ P(A) = \sum_i P(A \cap C_i) $.
\litem{Probability space}{Hansen 1-31, 1-28} $ (\Omega, \mathcal{F}, P) $ where:
\begin{enumerate}
\item $ \Omega $ is the universe (e.g., $ S $, the sample space);
\item $ \mathcal{F} $ is the $ \sigma $-field (e.g., $ \B^1 $);
\item $ P $ a probability measure (e.g., $ \mathcal{P} $, the probability measure that governs all random variables).
\end{enumerate}
A random variable $ X $ induces a probability measure $ P_X $ defined by $ P_X (B) \equiv P(X \in B) = P(X^{-1}(B)) $. This gives the probability space $ (\R, \B, P_X) $.
\litem{Counting}{C\&B sec.~1.2.3} The number of possible arrangements of size $ r $ from $ n $ objects is
\begin{center}
\begin{tabular}{ccc}
\hline
& No replacement & With replacement \\
Ordered & $ \frac{n!}{(n-r)!} $ & $ n^r $ \\
Unordered & $ \binom{n}{r} $ & $ \binom{n+r-1}{r} $ \\
\hline
\end{tabular}
\end{center}
where $ \binom{n}{r} \equiv \frac{n!}{r! (n-r)!} $. (Unordered with replacement, a.k.a.\ ``stars and bars.''\index{Stars and bars@``Stars and bars''})
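As a quick check of these formulas, take $ n = 3 $ objects $ \{ a, b, c \} $ and $ r = 2 $ draws: ordered without replacement gives $ 3!/1! = 6 $ arrangements ($ ab, ba, ac, ca, bc, cb $); ordered with replacement gives $ 3^2 = 9 $; unordered without replacement gives $ \binom{3}{2} = 3 $; unordered with replacement gives $ \binom{4}{2} = 6 $ ($ aa, bb, cc, ab, ac, bc $).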
\litem{Conditional probability}{C\&B 1.3.2, ex.~1.38} For events $ A $, $ B \subseteq S $ with $ P(B) > 0 $, the conditional probability of $ A $ given $ B $ is $ P(A|B) \equiv P(A \cap B) / P(B) $.
\begin{enumerate}
\item If $ A $ and $ B $ are disjoint ($ A \cap B = \emptyset $), then $ P(A|B) = P(B|A) = 0 $.
\item If $ P(B) = 1 $, then $ \forall A, P(A|B) = P(A) $.
\item If $ A \subseteq B $, then $ P(B|A) = 1 $ and $ P(A|B) = P(A)/P(B) $.
\item If $ A $ and $ B $ are mutually exclusive, $ P(A|A \cup B) = P(A) / [P(A) + P(B)] $.
\item $ P (A \cap B \cap C) = P(A|B \cap C) \cdot P(B|C) \cdot P(C) $.
\end{enumerate}
\litem{Bayes' Rule}{C\&B 1.3.5} A formula for ``turning around'' conditional probabilities: $ P (A|B) = P(B|A) \cdot P(A) / P(B) $. More generally, if $ \{ A_i \} $ partition the sample space and $ B $ is any set, then $ \forall i, $ $$ P(A_i | B) = \frac{P(B|A_i) \cdot P(A_i)}{\sum_j P(B|A_j) \cdot P(A_j)}. $$
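Illustration (hypothetical numbers): suppose a condition has prevalence $ P(A_1) = 0.01 $, and a test has true positive rate $ P(B|A_1) = 0.9 $ and false positive rate $ P(B|A_1^{\mathsf{C}}) = 0.1 $. Then
$$ P(A_1 | B) = \frac{(0.9)(0.01)}{(0.9)(0.01) + (0.1)(0.99)} = \frac{0.009}{0.108} \approx 0.083 , $$
i.e., most positives are false positives despite the apparently accurate test.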
\litem{Independence of events}{C\&B 1.3.7, 9, 4.2.10}\index{Independent} $ A $, $ B $ statistically independent iff $ P(A \cap B) = P(A) \cdot P(B) $, or equivalently (when $ P(B) > 0 $) iff $ P(A|B) = P(A) $ (this happens iff $ P(B|A) = P(B) $).
\begin{enumerate}
\item If $ A $ and $ B $ are independent, then the following pairs are also independent: $ A $ and $ B^{\mathsf{C}} $, $ A^{\mathsf{C}} $ and $ B $, $ A^{\mathsf{C}} $ and $ B^{\mathsf{C}} $.
\item Two events with nonzero probability cannot be both mutually exclusive and independent (see ex.~1.39).
\item If $ X $, $ Y $ independent r.v.s, then for any $ A $, $ B \subseteq \R $, events $ \{ X \in A \} $ and $ \{ Y \in B \} $ are independent events.
\end{enumerate}
\litem{Mutual independence of events}{C\&B 1.3.12}\index{Independent} A collection of events $ \{ A_i \} $ are mutually independent iff for any subcollection $ A_{i_1}, \dotsc, A_{i_k} $ we have $ P (\bigcap_j A_{i_j}) = \prod_j P(A_{i_j}) $. Note that pairwise independence does \textit{not} imply mutual independence.
\subsection{Random variables}
\litem{Random variable}{C\&B 1.4.1, 1.5.7--8, 10, sec.~3.2} A function $ X \colon S \to \R $ where $ S $ is a sample space.
\begin{enumerate}
\item Continuous\index{Continuous r.v.} iff its cdf is a continuous function and discrete\index{Discrete r.v.} iff its cdf is a step function (i.e., if sample space is countable).
\item Identically distributed iff $ \forall A \in \B^1, $ $ P(X \in A) = P (Y \in A) $, or equivalently iff $ \forall x, $ $ F_X(x) = F_Y(x) $.
\item Note identical distribution says nothing about (in)dependence.
\end{enumerate}
\litem{Random vector}{C\&B 4.1.1} $ n $-dimensional random vector is a function $ \mX \colon S \to \R^n $ where $ S $ is a sample space.
\litem{Measurability}{Hansen 1-28, 4-11} A r.v.\ $ X \colon (\Omega, \mathcal{F}) \to (\R, \mathcal{B}) $ is $ \mathcal{F} $-measurable iff $ \forall B \in \mathcal{B} $, $ \{ \omega \in \Omega \colon X(\omega) \in B \} \in \mathcal{F} $ (i.e., the preimage of every element of $ \mathcal{B} $ is an element of $ \mathcal{F} $).
If $ \mathcal{F} $ and $ \mathcal{G} $ are both $ \sigma $-fields with $ \mathcal{G} \subseteq \mathcal{F} $, then $ X $ is $ \mathcal{G} $-measurable $ \implies X $ is $ \mathcal{F} $-measurable (i.e., if the preimage of every $ B $ is in $ \mathcal{G} $, it is also in its superset $ \mathcal{F} $).
\litem{Smallest $ \sigma $-field}{Hansen 4-12} The smallest $ \sigma $-field that makes a r.v.\ $ Z \colon (\Omega, \mathcal{F}) \to (\R, \B) $ measurable is $ \sigma (Z) \equiv \{ G \subseteq \Omega \colon \exists B \in \B $, $ G = Z^{-1} (B) \} $ (i.e., the set of preimages of elements of $ \B $).
\litem{Independence of r.v.s}{C\&B 4.2.5, 7, p.~154, 4.3.5}\index{Independent} $ X $ and $ Y $ independent r.v.s (written $ X \independent Y $) iff any of the following equivalent conditions hold:
\begin{enumerate}
\item $ \forall A $, $ B $, $ P (X \in A $, $ Y \in B) = P (X \in A) \cdot P (Y \in B) $.
\item $ F_{XY} (x,y) \equiv P (X \leq x $, $ Y \leq y) = F_X (x) F_Y(y) $.
\item $ f(x,y) = f_{X}(x) f_Y(y) $ (i.e., joint pdf/pmf is the product of marginal pdfs/pmfs).
\item $ f(y|x) = f_Y(y) $ (i.e., conditional pdf/pmf equals marginal pdf/pmf).
\item $ \exists g(x) $, $ h(y) $, $ \forall x $, $ y $, $ f(x,y) = g(x) h(y) $ (i.e., joint pdf/pmf is separable). Note that functional forms may appear separable, but limits may still depend on the other variable; if the support set of $ (X,Y) $ is not a cross product, then $ X $ and $ Y $ are \textit{not} independent.
\end{enumerate}
For any functions $ g(t) $ and $ h(t) $, $ X \independent Y \implies g(X) \independent h(Y) $.
\litem{Independence of random vectors}{C\&B 4.6.5}\index{Independent} $ \mX_1 , \dotsc , \mX_n $ mutually independent iff for every $ ( \mx_1 , \dotsc , \mx_n ) $, the joint pdf/pmf is the product of the marginal pdfs/pmfs; i.e.,
$ f ( \mx_1 , \dotsc , \mx_n ) = \prod_i f_{\mX_i} (\mx_i)$.
\begin{enumerate}
\item Knowledge about the values of some coordinates gives us no information about the values of the other coordinates.
\item The conditional distribution of any subset of the coordinates, given the values of the rest of the coordinates, is the same as the marginal distribution of the subset.
\item Mutual independence implies pairwise independence, but pairwise independence does \textit{not} imply mutual independence.
\end{enumerate}
\litem{Mean independence}{Metrics P.S. 3-4c, Metrics section} $ X $ is mean independent of $ Y $ (written $ X \independentm Y $) iff $ \E (X|Y) = \E (X) $.
\begin{enumerate}
\item Mean independence is not symmetric (i.e., $ X \independentm Y $ does \textit{not} imply that $ Y \independentm X $).
\item Independence implies mean independence (i.e., $ X \independent Y \implies X \independentm Y \land Y \independentm X $).
\item $ X \independentm Y \implies \E [X | g(Y)] = \E [X] $, for any function $ g (\cdot) $.
\item $ X \independentm Y \implies \Cov (X, g(Y)) = 0 $ for any function $ g (\cdot) $.
\end{enumerate}
\litem{Cumulative distribution function}{C\&B 1.5.1, 3, p.~147}\index{cdf (cumulative distribution function)} $ F_X(x) \equiv P(X \leq x) $. By the Fundamental Theorem of Calculus, $ \frac{d}{dx} F_X(x) = f_X(x) $ for a continuous r.v.\ at continuity points of $ f_X $. A function $ F $ is a cdf iff:
\begin{enumerate}
\item $ \lim_{x \to -\infty} F(x) = 0 $ and $ \lim_{x \to \infty} F(x) = 1 $;
\item $ F (\cdot) $ nondecreasing;
\item $ F (\cdot) $ right-continuous; i.e., $ \forall x_0 $, $ \lim_{x \downarrow x_0} F(x) = F(x_0) $.
\end{enumerate}
A random vector $ X $ has joint cdf\index{Joint cdf} $ F_X(x_1, \dotsc, x_n) \equiv P (X_1 \leq x_1, \dotsc, X_n \leq x_n) $. By the Fundamental Theorem of Calculus, $ \frac{\partial^n}{\partial x_1 \dotsm \partial x_n} F_X(\vec{x}) = f_X(\vec{x}) $ for a continuous (in all dimensions) random vector at continuity points of $ f_X $.
\litem{Probability mass function}{C\&B 1.6.1, 5, 4.1.3}\index{pmf (probability mass function)} For a discrete r.v., $ f_X (x) \equiv P (X = x) $. A function $ f_X $ is a pmf iff:
\begin{enumerate}
\item $ \forall x, $ $ f_X(x) \geq 0 $;
\item $ \sum_x f_X(x) = 1 $.
\end{enumerate}
$ f_X $ gives the probability of any event: $ P (X \in B) = \sum_k 1_{(x_k \in B)} f_X(x_k) $.
A discrete random vector $ X $ has joint pmf\index{Joint pmf} $ f_X(\vec{v}) \equiv P (X = \vec{v}) $.
\litem{Marginal pmf}{C\&B 4.1.6, p.~178} For a discrete random vector,
%$ f_{X_i} (x) \equiv P(X_i = x) = $
%$$ \sum_{(x_1, \dotsc, x_{i-1}, x_{i+1}, \dotsc, x_n) \in \R^{n-1}} f_X (x_1, \dotsc, x_{i-1}, x, x_{i+1}, \dotsc, x_n), $$
$$ f_{X_i} (x_i) \equiv P(X_i = x_i) = \sum_{x_{-i} \in \R^{n-1}} f_X (x) ; $$
i.e., hold $ X_i = x_i $, and sum $ f_X $ over all remaining possible values of $ X $.
We can also take the marginal pmf for multiple $ i $ by holding these and summing $ f_X $ over all remaining possible values of $ X $.
\litem{Conditional pmf}{C\&B 4.2.1} For $ (X,Y) $ a discrete random vector, $ f(y|x) \equiv P(Y = y | X = x) = f(x,y)/f_X(x) $, where $ f(x,y) $ is joint pmf, and $ f_X(x) $ is marginal pmf.
\litem{Probability density function}{C\&B 1.6.3, 5, 4.1.10}\index{pdf (probability density function)} For a continuous r.v., $ f_X (x) $ defined as the function which satisfies $ F_X(x) = \int_{-\infty}^{x} f_X(t) \, dt $ for all $ x $. A function $ f_X $ is a pdf iff:
\begin{enumerate}
\item $ \forall x $, $ f_X(x) \geq 0 $;
\item $ \int_\R f_X(x) \, dx = 1 $.
\end{enumerate}
$ f_X $ gives the probability of any event: $ P (X \in B) = \int_\R 1_{(x \in B)} f_X(x) \, dx $.
A continuous (in all dimensions) random vector $ X $ has joint pdf\index{Joint pdf} $ f_X(x_1, \dotsc, x_n) $ iff $ \forall A \subseteq \R^n, $ $ P(X \in A) = \idotsint_A f_X(x_1, \dotsc, x_n) \, dx_1 \dotsm dx_n $.
\litem{Marginal pdf}{C\&B p.~145, 178} For a continuous (in all dimensions) random vector,
$$ f_{X_i} (x_i) \equiv \idotsint_{\R^{n-1}} f_X (x) \, dx_1 \dotsm dx_{i-1} \, dx_{i+1} \dotsm dx_n, $$
i.e., hold $ X_i = x_i $, and integrate $ f_X $ over $ \R $ in all $ X_j $ for $ i \neq j $.
We can also take the marginal pdf for multiple $ i $ by holding these and integrating $ f_X $ over $ \R $ in all $ X_j $ that aren't being held.
\litem{Conditional pdf}{C\&B 4.2.3, p.~178} For $ (X,Y) $ a continuous random vector, $ f(y|x) \equiv f(x,y)/f_X(x) $ as long as $ f_X(x) \neq 0 $, where $ f(x,y) $ is joint pdf, and $ f_X(x) $ is marginal pdf.
We can also condition for/on multiple coordinates: e.g., for $ (X_1, X_2, X_3, X_4) $ a continuous random vector, $ f(x_3, x_4|x_1, x_2) \equiv f(x_1, x_2, x_3, x_4) / f_{X_1 X_2}(x_1, x_2) $, where $ f $ is a joint pdf, and $ f_{X_1 X_2} $ is the marginal pdf in $ X_1 $ and $ X_2 $.
\litem{Borel Paradox}{C\&B 4.9.3} Be careful when conditioning on events of probability zero: two events of probability zero may be equivalent, yet the probabilities conditional on the two events may differ!
\litem{Stochastic ordering}{C\&B ex.~1.49, ex.~3.41-2} cdf $ F_X $ stochastically greater than cdf $ F_Y $ iff $ F_X(t) \leq F_Y(t) $ at all $ t $, with strict inequality at some $ t $. This implies $ P (X > t) \geq P (Y > t) $ at all $ t $, with strict inequality at some $ t $.
A family of cdfs $ \{ F(x|\theta) \} $ is stochastically increasing in $ \theta $ iff $ \theta_1 > \theta_2 \implies F(x | \theta_1) $ stochastically greater than $ F(x | \theta_2) $. A location family is stochastically increasing in its location parameter; if a scale family has sample space $ [0, \infty) $, it is stochastically increasing in its scale parameter.
\litem{Support set}{C\&B eq.~2.1.7}\index{Support} Support set (a.k.a.\ support) of a r.v.\ $ X $ is $ \X \equiv \{ x \colon f_X(x) > 0 \} $, where $ f_X $ is a pdf or pmf (or in general, any nonnegative function).
\subsection{Transformations of random variables}
\litem{Transformation $ \R^1 \to \R^1 $}{C\&B 2.1.3, 5, 8} A discrete r.v.\ can be transformed into a discrete r.v. A continuous r.v.\ can be transformed into either a continuous or a discrete r.v.\ (or mixed!). When $ Y = g(X) $ and $ \Y \equiv g(\X) $ (where $ \X $ is the support of $ X $),
\begin{enumerate}
\item If $ g $ monotone increasing on $ \X $, then $ F_Y(y) = F_X(g^{-1}(y)) $ for $ y \in \Y $;
\item If $ g $ monotone decreasing on $ \X $ \textit{and $ X $ a continuous r.v.}, then $ F_Y(y) = 1 - F_X(g^{-1}(y)) $ for $ y \in \Y $.
\end{enumerate}
If $ g $ monotone, $ f_X $ continuous on $ \X $, and $ g^{-1} $ has continuous derivative on $ \Y $, then:
$$ f_Y(y) = \begin{cases}
f_X \left( g^{-1}(y) \right) \left| \tfrac{d}{dy} g^{-1}(y) \right|, & y \in \Y; \\
0, & \text{otherwise.}
\end{cases} $$
If $ \{A_i \}_{i=0}^k $ partitions $ \X $, with $ P (X \in A_0) = 0 $; $ f_X $ continuous on each $ A_i $; and $ \exists \{g_i \}_{i=1}^k $ satisfying:
\begin{enumerate}
\item $ g(x) = g_i(x) $ for $ x \in A_i $,
\item $ g_i $ monotone on $ A_i $,
\item $ \exists \Y $, $ \forall i $, $ g_i(A_i) = \Y $ (i.e., all $ A_i $ have same image under their respective $ g_i $s) [Hansen note 2-15 suggests this need not hold],
\item $ \forall i $, $ g_i^{-1} $ has a continuous derivative on $ \Y $; then:
\end{enumerate}
$$ f_Y(y) = \begin{cases}
\sum_{i=1}^k f_X \left( g_i^{-1}(y) \right) \left| \tfrac{d}{dy} g_i^{-1}(y) \right|, & y \in \Y; \\
0, & \text{otherwise.}
\end{cases} $$
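For example (the standard non-monotone case), applying the partition formula to $ Y = X^2 $ with $ f_X(x) = (2 \pi)^{-1/2} e^{-x^2/2} $ (standard normal), $ A_1 = (-\infty, 0) $, $ A_2 = (0, \infty) $, and $ g_i^{-1}(y) = \mp \sqrt{y} $ gives, for $ y > 0 $,
$$ f_Y(y) = f_X(-\sqrt{y}) \tfrac{1}{2 \sqrt{y}} + f_X(\sqrt{y}) \tfrac{1}{2 \sqrt{y}} = \frac{1}{\sqrt{2 \pi y}} e^{-y/2} , $$
the $ \chi^2_1 $ pdf.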
\litem{Transformation $ \R^2 \to \R^2 $}{C\&B p.~158, 185} Let $ U = g_1 (X,Y) $ and $ V = g_2 (X,Y) $ where:
\begin{enumerate}
\item $ (X,Y) $ has pdf $ f_{XY} $ and support $ \mathcal{A} $;
\item $ g_1 $ and $ g_2 $ define a 1-to-1 transformation from $ \mathcal{A} $ to $ \mathcal{B} \equiv \{ (u,v) \colon u \in g_1(\mathcal{A}) $, $ v \in g_2(\mathcal{A}) \} $ (i.e., the support of $ (U,V) $);
\item Inverse transform is $ X = h_1 (U, V) $ and $ Y = h_2 (U, V) $; then:
\end{enumerate}
$$ f_{UV}(u, v) = \begin{cases}
f_{XY} (h_1(u, v), h_2(u, v)) \left| J \right| , & (u,v) \in \mathcal{B}; \\
0, & \text{otherwise;}
\end{cases} $$
where $ J $ is the Jacobian\index{Jacobian},
$$ J \equiv \operatorname{det} \begin{bmatrix}
\tfrac{\partial x}{\partial u} & \tfrac{\partial x}{\partial v} \\
\tfrac{\partial y}{\partial u} & \tfrac{\partial y}{\partial v}
\end{bmatrix} . $$
If the transformation is not 1-to-1, we can partition $ \mathcal{A} $ into $ \{ \mathcal{A}_i \} $ such that 1-to-1 transformations exist from each $ \mathcal{A}_i $ to $ \mathcal{B} $ which map $ (x,y) \mapsto (u,v) $ appropriately. Letting $ x = h_{1i} (u, v) $ and $ y = h_{2i} (u, v) $ be the inverses, and $ J_i $ the Jacobian, on $ \mathcal{A}_i $;
$$ f_{UV}(u, v) = \begin{cases}
\sum_i f_{XY} (h_{1i}(u, v), h_{2i}(u, v)) \left| J_i \right| , & (u,v) \in \mathcal{B}; \\
0, & \text{otherwise.}
\end{cases} $$
For generalization to $ \R^n \to \R^n $ case for $ n > 2 $, see C\&B p.~185.
\litem{Convolution formulae}{C\&B 5.2.9, ex.~5.6} $ X \independent Y $ both continuous. Then:
\begin{enumerate}
\item $ f_{X+Y}(z) = \int_\R f_X(w) f_Y(z-w) \, dw $.
\item $ f_{X-Y}(z) = \int_\R f_X(w) f_Y(w-z) \, dw $.
\item $ f_{X Y}(z) = \int_\R | \tfrac{1}{w} | f_X(w) f_Y(z/w) \, dw $.
\item $ f_{X/Y}(z) = \int_\R | w | f_X(wz) f_Y(w) \, dw $.
\end{enumerate}
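For example, if $ X \independent Y $ with $ X $, $ Y \sim \unifnoital (0,1) $, the first formula gives the triangular density
$$ f_{X+Y}(z) = \int_0^1 1_{(0 < z - w < 1)} \, dw = \begin{cases} z, & 0 \leq z \leq 1; \\ 2 - z, & 1 < z \leq 2; \\ 0, & \text{otherwise.} \end{cases} $$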
\litem{Probability integral transformation}{C\&B 2.1.10, ex.~2.10} If $ Y = F_X(X) $ (for $ X $ continuous) then $ Y \sim \unifnoital (0,1) $. Can be used to generate random samples from a particular distribution: generate a uniform random draw and apply the inverse of the cdf of the target distribution. If $ X $ is discrete, $ Y $ is stochastically greater than $ \unifnoital (0,1) $.
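For example (a standard use of this result), to sample from an exponential distribution with rate $ \lambda $: if $ U \sim \unifnoital (0,1) $, then $ X \equiv - \ln (1-U) / \lambda $ satisfies $ P(X \leq x) = P(U \leq 1 - e^{-\lambda x}) = 1 - e^{-\lambda x} $ for $ x \geq 0 $, the exponential cdf.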
\subsection{Properties of random variables}
\litem{Expected value, mean}{C\&B 2.2.1, 5--6, 4.6.6}\index{$ \mu $ (expected value)}
$$ \E g(X) \equiv \begin{cases}
\int_\R g(x) f_X (x) \, dx, & \text{if $ X $ continuous;} \\
\sum_{x \in \X} g(x) f_X (x), & \text{if $ X $ discrete;}
\end{cases} $$
provided the integral or sum exists \textit{and} that $ \E |g(X)| \neq \infty $. For constants $ a, $ $ b, $ $ c $ and functions $ g_1, $ $ g_2 $ such that $ \E(g_1(X)), $ $ \E(g_2(X)) $ exist,
\begin{enumerate}
\item $ \E [a g_1(X) + b g_2(X) + c] = a \E(g_1(X)) + b \E(g_2(X)) + c $ (i.e., expectation is a linear operator);
\item If $ \forall x, g_1(x) \geq 0 $, then $ \E g_1 (X) \geq 0 $;
\item If $ \forall x, g_1(x) \geq g_2 (x) $, then $ \E g_1 (X) \geq \E g_2 (X) $;
\item If $ \forall x, a \leq g_1(x) \leq b $, then $ a \leq\E g_1 (X) \leq b $.
\end{enumerate}
The mean is the MSE minimizing predictor for $ X $; i.e., $ \min_b \E (X - b)^2 = \E (X - \E X)^2 $. If $ X_1, \dotsc , X_n $ mutually independent, then $ \E [ g_1(X_1) \cdot \dotsb \cdot g_n(X_n) ] = \E [ g_1(X_1)] \cdot \dotsb \cdot \E [g_n(X_n) ] $.
\litem{Conditional expectation}{C\&B p.~150; Hansen 4-14--6; Hayashi 138--9} a.k.a.\ regression of $ Y $ on $ X $.\index{Regression} $ \E (Y|X) $ is a r.v.\ which is a function of $ X $. For discrete $ (X,Y) $, $ \E (g(Y)|x) \equiv \sum_y g(y) f(y|x) $. For continuous $ (X,Y) $, $ \E (g(Y)|x) \equiv \int_\R g(y) f(y|x) \, dy $. Conditional expected value has all the standard properties of expected value. Also:
\begin{enumerate}
\item $ \E [g(X) | X] = g(X) $ for any function $ g $.
\item $ \E [g(X) h(Y) | X] = g(X) \E [h(Y)|X] $ for any functions $ g $ and $ h $.
\item $ X \independent Y \implies \E (Y|X) = \E (Y) $ (i.e., knowing $ X $ gives us no additional information about $ \E Y $).
\item $ \E (Y|X) = \E (Y) \implies \Cov (X, Y) = 0 $
\item $ \E (Y|X) $ is the MSE minimizing predictor of $ Y $ based on knowledge of $ X $, (i.e., $ \min_{g(x)} \E [Y - g(X)]^{2} = \E [Y - \E (Y|X)]^{2} $).
\end{enumerate}
Let $ X $ be a r.v.\ that takes values in $ (\R, \B) $, let $ \mathcal{G} = \sigma (X) $ (i.e., the smallest sigma field measuring $ X $), and assume $ \E |Y| < \infty $. The conditional expected value of $ Y $ given $ X $ is defined implicitly (and non-uniquely) as satisfying:
\begin{enumerate}
\item $ \E | \E (Y|X) | < \infty $;
\item $ \E (Y|X) $ is $ \mathcal{G} $-measurable (i.e., $ Y|X $ cannot rely on more information than $ X $ does);
\item $ \forall G \in \mathcal{G} $, $ \int_G \E (Y|X) \, dP(\omega) = \int_G Y \, dP(\omega) $ (i.e., $ \E [ \E (Y|X) | X \in G] = \E [Y | X \in G ] $);
\end{enumerate}
where the notation $ \int_B \cdot \, dP_X(x) $ means $ \int_B \cdot \, f_X(x) \, dx $ if $ X $ is continuous, and means $ \sum_{x \in B} \cdot \, f_X(x) $ if $ X $ is discrete.
\litem{Two-way rule for expectations}{C\&B p.~58, ex.~2.21} If $ Y = g(X) $, then $ \E g(X) = \E Y $; i.e., $ \int_\R g(x) f_X(x)\, dx = \int_\R y f_Y(y) \, dy $.
\litem{Law of Iterated Expectations}{C\&B 4.4.3; Hansen 4-21}\index{Iterated expectations} $ \E X = \E [ \E (X|Y)] $, provided the expectations exist. More generally, when $ \mathcal{L} \subseteq \mathcal{M} $ (i.e., $ \mathcal{L} $ contains less information, $ \mathcal{M} $ contains more),
$$ \E [X | \mathcal{L} ] = \E [ \E (X | \mathcal{M} ) | \mathcal{L} ] = \E [ \E (X | \mathcal{L} ) | \mathcal{M} ] . $$
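Example (notation illustrative): if $ Y \sim \unifnoital (0,1) $ and $ X | Y = p \sim \operatorname{Bin} (n, p) $, then $ \E X = \E [ \E (X|Y) ] = \E [ n Y ] = n/2 $, without computing the marginal distribution of $ X $.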
\litem{Median}{C\&B ex.~2.17--18} $ m $ such that $ P (X \leq m) \geq \tfrac{1}{2} $ and $ P (X \geq m) \geq \tfrac{1}{2} $. If $ X $ continuous, the median minimizes absolute deviation; i.e., $ \min_a \E |X-a| = \E|X - m| $.
\litem{Mode}{C\&B ex.~2.27} $ f(x) $ is unimodal\index{Unimodality} with mode equal to $ a $ iff $ a \geq x \geq y \implies f(a) \geq f(x) \geq f(y) $ and $ a \leq x \leq y \implies f(a) \geq f(x) \geq f(y) $.
\begin{enumerate}
\item Modes are not necessarily unique.
\item If $ f $ is symmetric and unimodal, then the point of symmetry is a mode.
\end{enumerate}
\litem{Symmetric distribution}{C\&B ex.~2.25--26} If $ f_X $ is symmetric about $ a $ (i.e., $ \forall \epsilon, $ $ f_X(a + \epsilon) = f_X(a - \epsilon) $), then:
\begin{enumerate}
\item $ X $ and $ 2a - X $ are identically distributed;
\item If $ a = 0 $, then $ M_X $ is symmetric about 0;
\item $ a $ is the median;
\item If $ \E X $ exists, then $ \E X = a $.
\item For odd $ k $, the $ k $th central moment $ \mu_k $ is zero (if it exists); if the distribution is symmetric about $ 0 $, then all odd moments are zero (if they exist).
\end{enumerate}
\litem{Moment}{C\&B 2.3.1, 11; Hansen 2-37}\index{Central moment}\index{$ \mu_n $ (central moment)}\index{$ \mu_{n}' $ (moment)} For integer $ n \geq 1 $, the $ n $th moment of $ X $ is $ \mu_{n}' \equiv \E X^n $. Also denote $ \mu_{1}' = \E X $ as $ \mu $. The $ n $th central moment is $ \mu_n \equiv \E(X - \mu)^n $.
\begin{enumerate}
\item Two different distributions \textit{can} have all the same moments, but only if the variables have unbounded support sets.
\item A distribution is uniquely determined by its moments if all moments are defined and $ \lim_{n \to \infty} \sum_{k=1}^{n} \mu_{k}' r^k / k! $ exists for all $ r $ in a neighborhood of zero.
\end{enumerate}
\litem{Variance}{C\&B 2.3.2, 4, p.~60, 4.5.6, ex.~4.58}\index{$ \sigma^2 $ (variance)} $ \Var X \equiv \mu_2 = \E (X - \E X)^2 = \E X^2 - (\E X)^2 $. Often parametrized as $ \sigma^2 $.
\begin{enumerate}
\item For constants $ a, $ $ b $, if $ \Var X \neq \infty $, then $ \Var (aX+b) = a^2 \Var X $;
\item Assuming variances exist, $ \Var (aX + bY) = a^2 \Var X + b^2 \Var Y + 2ab \Cov (X,Y) $;
\item $ \Var [Y - \E (Y|X)] = \E [\Var(Y|X)] $.
\end{enumerate}
\litem{Multivariate variance}{?} $ \Var \mX \equiv \E [ \mX \mX' ] - \E [ \mX ] \E [ \mX ]' $. Thus:
\begin{enumerate}
\item $ \Var ( \mX + \mY ) = \Var ( \mX ) + \Cov ( \mX , \mY ) + \Cov ( \mX , \mY )' + \Var ( \mY ) $;
\item $ \Var (\mA \mX) = \mA \Var ( \mX ) \mA' $.
\end{enumerate}
\litem{Conditional variance}{C\&B p.~151, 4.4.7; Greene 81--4} a.k.a.\ scedastic function.\index{Scedastic function} $ \Var (Y|X) \equiv \E [ (Y - \E [Y|X] )^2 | X] = \E [Y^2 | X] - (\E [Y|X])^2 $.
\begin{enumerate}
\item $ X \independent Y \implies \Var (Y | X) = \Var (Y) $.
\item Conditional variance identity: provided the expectations exist,
$$ \Var (Y) = \underbrace{\E [\Var (Y|X)]}_{\text{residual variance}} + \underbrace{\Var [\E (Y|X)]}_{\text{regression variance}} . $$
Implies that on average, conditioning reduces the variance of the variable subject to conditioning ($ \Var (Y) \geq \E [\Var (Y|X)] $).
\end{enumerate}
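To illustrate the conditional variance identity: if $ X \sim \n (0,1) $ and $ Y | X \sim \n (X, 1) $, then $ \E [\Var (Y|X)] = 1 $ and $ \Var [\E (Y|X)] = \Var (X) = 1 $, so $ \Var (Y) = 2 $.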
\litem{Standard deviation}{C\&B 2.3.2}\index{$ \sigma $ (standard deviation)} $ \sigma \equiv \sqrt{\Var X} $.
\litem{Covariance}{C\&B 4.5.1, 3, ex.~4.58--9; Greene 77} $ \Cov (X, Y) \equiv \E [ (X - \E X) (Y - \E Y)] = \E [ (X - \E X) Y ] = \E [ X (Y - \E Y)] = \E (XY) - (\E X )( \E Y) $. If $ X $, $ Y $, $ Z $ all have finite variances, then:
\begin{enumerate}
\item $ \Cov (X, Y) = \Cov [X, \E(Y|X)] $;
\item $ \Cov [X, Y - \E(Y|X) ] = 0 $;
\item $ \Cov (X, Y) = \E [\Cov(X, Y|Z)] + \Cov[\E(X|Z), \E (Y|Z) ] $.
\item $ \Cov (X, Y+Z) = \Cov (X, Y) + \Cov (X,Z) $.
\item $ \Cov (aX + bY , cX + dY) = ac \Var (X) + bd \Var (Y) + (ad+bc) \Cov (X,Y) $.
\end{enumerate}
\litem{Multivariate covariance}{Hansen 5-28; Hayashi 75--6} $ \Cov (\mX, \mY) \equiv \E [(\mX - \E \mX)(\mY - \E \mY)'] = \E (\mX \mY') - ( \E \mX ) (\E \mY') $. Thus:
\begin{enumerate}
\item $ \Cov (\mA \mX, \mB \mY) = \mA \Cov (\mX, \mY) \mB' $;
\item $ \Cov (\mX, \mY) = \Cov ( \mY , \mX )' $.
\end{enumerate}
\litem{Correlation}{C\&B 4.5.2, 5, 7}\index{$ \rho $ (correlation)} $ \operatorname{Corr} (X,Y) \equiv \rho_{XY} \equiv \Cov (X,Y) / (\sigma_X \sigma_Y) $.
\begin{enumerate}
\item $ \operatorname{Corr} (a_1 X + b_1, a_2 Y + b_2) = \operatorname{Corr} (X,Y) $.
\item Correlation is in the range $ [-1,1] $, with $ \pm 1 $ indicating a perfectly linear relationship ($ +1 $ for positive slope, $ -1 $ for negative slope), by the Cauchy-Schwarz Inequality.
\item $ X \independent Y \implies \Cov (X,Y) = \rho_{XY} = 0 $ (assuming finite moments); note however that zero covariance need \textit{not} imply independence.
\end{enumerate}
\litem{Skewness}{C\&B ex.~2.28; Greene 66}\index{$ \alpha_3 $ (skewness)} $ \alpha_3 \equiv \mu_3 \cdot (\mu_2)^{-3/2} $, where $ \mu_i $ is the $ i $th central moment. Measures the lack of symmetry in the pdf. 0 for any normal, $ t $, or uniform; $ 2 $ for exponential, $ 2 \sqrt{2/r} $ for $ \chi_r^2 $, $ 2 \sqrt{a} / a $ for gamma.
\litem{Kurtosis}{C\&B ex.~2.28}\index{$ \alpha_4 $ (kurtosis)} $ \alpha_4 \equiv \mu_4 \cdot \mu_2^{-2} $, where $ \mu_i $ is the $ i $th central moment. Measures the ``peakedness'' of the pdf. $ \alpha_4 = 3 $ for any normal. (Sometimes normalized by subtracting 3.)
\litem{Moment generating function}{C\&B 2.3.6--7, 11--12, 15, 4.2.12, 4.6.7, 9}\index{mgf (moment generating function)} $ M_X(t) \equiv \E e^{tX} $ as long as the expectation exists for $ t $ in a neighborhood of 0. If $ M_X $ exists, then $ \forall n \in \Z$, $ n \geq 0 $,
$$ \mu_{n}' \equiv \E X^n = \left. \frac{d^n}{dt^n} M_X(t) \right|_{t=0}. $$
\begin{enumerate}
\item It is possible for all moments to exist, but not the mgf.
\item If r.v.s have equal mgfs in some neighborhood of 0, then the variables are identically distributed (i.e., an extant mgf characterizes a distribution).
\item If the mgfs of a sequence of r.v.s converge to $ M_X $ in some neighborhood of zero, then the cdfs of the sequence converge to $ F_X $ at all points where $ F_X $ is continuous.
\item For constants $ a, $ $ b $, if $ M_X $ exists, then $ M_{aX + b} (t) = e^{bt} M_X (at) $.
\item For $ X \independent Y $, $ M_{X + Y}(t) = M_X(t) M_Y (t) $. For $ X_1, \dotsc, X_n $ mutually independent, $ M_{\sum X_i} = \prod_i M_{X_i} $.
\item For $ X_1, \dotsc, X_n $ mutually independent, $ Z \equiv (a_1 X_1 + b_1) + \dotsb + (a_n X_n + b_n) $, then $ M_{Z}(t) = (e^{t(\sum b_i)}) \prod_i M_{X_i}(a_i t) $.
\end{enumerate}
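Example: for $ X \sim \n (\mu, \sigma^2) $, $ M_X(t) = \exp ( \mu t + \sigma^2 t^2 / 2 ) $, so
$$ \E X = M_X'(0) = \mu , \qquad \E X^2 = M_X''(0) = \mu^2 + \sigma^2 . $$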
\litem{Characteristic function}{C\&B sec.~2.6.2}\index{$ \phi (\cdot) $ (characteristic function)} $ \phi_X (t) \equiv \E e^{itX} $, where $ i = \sqrt{-1} $.
\begin{enumerate}
\item The cf always exists.
\item A cf completely determines a distribution: if the cfs of a sequence of r.v.s converge to $ \phi_X $ in some neighborhood of zero, then the cdfs of the sequence converge to $ F_X $ at all points where $ F_X $ is continuous.
\item For $ X \sim \n (0,1) $, $ \phi_X (t) = e^{-t^2 /2} $.
\item We can recover probability from a cf: for all $ a $, $ b $ such that $ P(X=a) = P(X=b) = 0 $,
$$ P (X \in [a,b]) = \lim_{T \to \infty} \frac{1}{2\pi} \int_{-T}^{T} \frac{e^{-ita} - e^{-itb}}{it} \phi_X (t) \, dt. $$
\end{enumerate}
\litem{Other generating functions}{C\&B sec.~2.6.2} Cumulant generating function\index{Cumulant generating function} $ \equiv \ln [M_X(t)] $, if the mgf exists.
Factorial moment generating function\index{Factorial moment generating function} (a.k.a.\ probability-generating function\index{Probability-generating function} when $ X $ is discrete) $ \equiv \E t^X $, if the expectation exists.
\subsection{Distributions}
\litem{Normal distribution}{C\&B p.~102--4, 2.1.9, 3.6.5, 4.2.14, 4.3.4, 6, 5.3.3; Wikipedia}\index{Gaussian distribution} Normal (a.k.a.\ Gaussian) particularly important because it is analytically tractable, has a familiar symmetric bell shape, and CLT shows that it can approximate many distributions in large samples. If $ X $ is normal with mean (and median) $ \mu $ and variance $ \sigma^2 $, then $ X \sim \n (\mu, \sigma^2) $ with pdf
$$ f_X(x) = \frac{1}{\sqrt{2 \pi \sigma^2}} e^{-(x-\mu)^2/(2 \sigma^2)} = \frac{1}{\sigma} \phi \left( \frac{x - \mu}{\sigma} \right). $$
$ f_X $ has maximum at $ \mu $ and inflection points at $ \mu \pm \sigma $. Moments are $ \E X = \mu $, $ \E X^2 = \mu^2 + \sigma^2 $, $ \E X^3 = \mu^3 + 3 \mu \sigma^2 $, $ \E X^4 = \mu^4 + 6 \mu^2 \sigma^2 + 3 \sigma^4 $.
Stein's Lemma:\index{Stein's Lemma} If $ g(\cdot) $ is differentiable with $ \E | g'(X)| < \infty $, then $ \E [g(X)(X - \mu)] = \sigma^2 \E g'(X) $.
$ Z \equiv (X - \mu) / \sigma $ is distributed $ \n (0,1) $ (i.e., ``standard normal''). $ \E [Z^k] = 0 $ if $ k $ odd, $ \E [Z^k] = 1 \cdot 3 \cdot 5 \dotsm (k-1) $ if $ k $ even. CDF denoted $ \Phi (\cdot) $\index{$ \Phi (\cdot) $ (standard normal cdf)}; pdf is
$$ \phi(z) \equiv f_Z(z) = \frac{1}{\sqrt{2 \pi}} e^{-z^2/2}.\index{$ \phi (\cdot) $ (standard normal pdf)} $$
\begin{enumerate}
\item $ P (|X - \mu | \leq \sigma) = P (|Z| \leq 1) \approx 68.26\% $;
\item $ P (|X - \mu | \leq 2\sigma) = P (|Z| \leq 2) \approx 95.44\% $;
\item $ P (|X - \mu | \leq 3\sigma) = P (|Z| \leq 3) \approx 99.74\% $.
\end{enumerate}
Independence and zero-covariance are equivalent for linear functions of normally distributed r.v.s. If normally distributed random vectors are pairwise independent, they are mutually independent.
Given iid sample $ X_i \sim \n(\mu, \sigma^2) $, log-likelihood is
$$ L(\mx) = -\tfrac{n}{2} \log (2 \pi) -\tfrac{n}{2} \log (\sigma^2) - \tfrac{1}{2 \sigma^2} \sum (x_i - \mu)^2. $$
Many distributions can be generated with manipulations/combinations of normals:
\begin{enumerate}
\item Square of standard normal is $ \chi^2_1 $.
\item If $ X \sim \n (\mu, \sigma^2) $, $ Y \sim \n (\gamma, \tau^2) $, and $ X \independent Y $, then $ X + Y \sim \n (\mu + \gamma, \sigma^2 + \tau^2) $ (i.e., independent normals are additive in mean and variance).
\item The sum and difference of two independent normal r.v.s are each normal; they are independent of each other iff the variances are equal.
\item Ratio of independent standard normals is Cauchy ($ \sigma = 1 $, $ \theta = 0 $); look for the kernel of the exponential distribution when dividing normals.
\end{enumerate}
\litem{Bivariate normal distribution}{C\&B 4.5.10, ex.~4.45} Parameters $ \mu_X $, $ \mu_Y \in \R $; $ \sigma_X $, $ \sigma_Y > 0 $; $ \rho \in [-1,1] $; and pdf (on $ \R^2 $):
\begin{multline*}
f(x,y) = \left( 2 \pi \sigma_X \sigma_Y \sqrt{1 - \rho^2} \right)^{-1} \\
\times \exp \left( \frac{-1}{2(1 - \rho^2)} \left( \left( \frac{x - \mu_X}{\sigma_X} \right)^2 \right. \right. \\
\left. \left. - 2 \rho \left( \frac{x - \mu_X}{\sigma_X} \right) \left( \frac{y - \mu_Y}{\sigma_Y} \right) + \left( \frac{y - \mu_Y}{\sigma_Y} \right)^2 \right) \right).
\end{multline*}
\begin{enumerate}
\item The marginal distributions of $ X $ and $ Y $ are $ \n (\mu_X, \sigma_X^2) $ and $ \n (\mu_Y, \sigma_Y^2) $ (note marginal normality does \textit{not} imply joint normality).
\item The correlation between $ X $ and $ Y $ is $ \rho $.
\item For any constants $ a $ and $ b $, the distribution of $ a X + bY $ is $ \n (a \mu_X + b \mu_Y, a^2 \sigma_X^2 + b^2 \sigma_Y^2 + 2ab\rho \sigma_X \sigma_Y) $.
\item All conditional distributions of $ Y $ given $ X = x $ are normal: $ Y | X=x \sim $
$$ \n (\mu_Y + \underbrace{ \rho (\sigma_Y / \sigma_X)}_{\Cov (X,Y) / \sigma^2_X} (x - \mu_X), \sigma_Y^2 (1 - \rho^2)) .$$
\end{enumerate}
\litem{Multivariate normal distribution}{Hansen 5-17--35; MaCurdy p.~6; Greene 94} $ p $-dimensional normal, $ \n_p (\mathbf{\mu}, \mathbf{\Sigma}) $ has pdf
$$ f(\mx) = (2 \pi)^{-\frac{p}{2}} | \mathbf{\Sigma} |^{-\frac{1}{2}} \exp \left[ -\tfrac{1}{2} (\mx - \mathbf{\mu})' \mathbf{\Sigma}^{-1} (\mx - \mathbf{\mu}) \right], $$
where $ \mathbf{\mu} = \E [\mX] $ and $ \mathbf{\Sigma}_{ij} = \Cov (X_i, X_j) $.
A linear transformation of a normal is normal: if $ \mX \sim \n_p (\mmu, \mSigma) $, then for any $ \mA \in \R^{q \times p} $ with full row rank (which implies $ q \leq p $), and any $ \mathbf{b} \in \R^q $, we have $ \mA \mX + \mathbf{b} \sim \n_q (\mA \mmu + \mathbf{b} , \mA \mSigma \mA' ) $. In particular, $ \mSigma^{-1/2} (\mX - \mmu) \sim \n (\mzero, \mI) $, where $ \Sigma^{-1/2} = (\Sigma^{1/2})^{-1} = \mathbf{H} \mathbf{\Lambda}^{-1/2} \mathbf{H}' $.
The following transformations of $ \mX \sim \n_p (\mmu, \mSigma) $ are independent iff $ \mA \mSigma \mB' = \Cov (\mA \mX , \mB \mX) = \mzero $:
\begin{enumerate}
\item $ \mA \mX \sim \n (\mA \mmu , \mA \mSigma \mA') $ and $ \mB \mX \sim \n (\mB \mmu , \mB \mSigma \mB') $,
\item $ \mA \mX \sim \n (\mA \mmu , \mA \mSigma \mA') $ and $ \mX' \mB \mX \sim \chi^2_{\rank (\mB \Sigma)} $ (where $ \mB \mSigma $ is an idempotent matrix),
\item $ \mX' \mA \mX \sim \chi^2_{\rank (\mA \Sigma)} $ and $ \mX' \mB \mX \sim \chi^2_{\rank (\mB \Sigma)} $ (where $ \mA \mSigma $ and $ \mB \mSigma $ are idempotent matrices).
\end{enumerate}
\litem{Chi squared distribution}{C\&B 5.3.2; Hansen 5-29--32; MaCurdy p.~6; Greene 92}\index{$ \chi^2 $ distribution} $ \chi^2_n $ (Chi squared with $ n $ degrees of freedom) has mean $ n $ and variance $ 2n $. Can be generated from normal:
\begin{enumerate}
\item If $ Z \sim \n (0,1) $, then $ Z^2 \sim \chi^2_1 $ (i.e., the square of standard normal is a chi squared with 1 degree of freedom);
\item If $ X_1, \dotsc, X_n $ are independent with $ X_i \sim \chi^2_{p_i} $, then $ \sum X_i \sim \chi^2_{\sum p_i} $ (i.e., independent chi squared variables add to a chi squared, and the degrees of freedom add).
\item If $ \mX \sim \n_n (\mmu, \mSigma) $, then $ (\mX - \mmu)' \mSigma^{-1} (\mX - \mmu) \sim \chi^2_n $.
\item If $ \mX \sim \n_n (\mzero, \mI) $ and $ \mathbf{P}_{n \times n} $ is an idempotent matrix, then $ \mX' \mathbf{P} \mX \sim \chi^2_{\rank (\mathbf{P})} = \chi^2_{\operatorname{tr} (\mathbf{P})} $.
\item If $ \mX \sim \n_n (\mzero, \mI) $ then the sum of the squared deviations from the sample mean $ \mX' \mathbf{M}_{\mi} \mX \sim \chi^2_{n-1} $.
\item If $ \mX \sim \n_n (\mzero, \mSigma) $ and $ \mB_{n \times n} \mSigma $ is an idempotent matrix, then $ \mX' \mB \mX \sim \chi^2_{\rank (\mB \mSigma)} = \chi^2_{\operatorname{tr} (\mB \mSigma)} $.
\end{enumerate}
\litem{Student's $ t $ distribution}{C\&B 5.3.4; Greene 69--70}\index{t distribution@$ t $ distribution} If $ X_1, \dotsc, X_n $ are iid $ \n (\mu, \sigma^2) $, then $ \sqrt{n} (\bar{X}-\mu) / \sigma \sim \n (0,1) $. However, we will generally not know $ \sigma $. Using the sample variance rather than the true variance gives $ \sqrt{n} (\bar{X}-\mu) / s \sim t_{n-1} $.
Generally, $ \n (0,1) / \sqrt{\chi^2_{n-1} / (n-1)} \sim t_{n-1} $ when the numerator and denominator are independent. If a $ t $ distribution has $ p $ degrees of freedom, only the first $ p-1 $ moments are defined. $ t $ has thicker tails than the normal.
$ t_1 $ is the Cauchy distribution (the ratio of two independent standard normals); $ t_\infty $ is the standard normal.
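A sketch of why $ \sqrt{n} (\bar{X}-\mu) / s \sim t_{n-1} $: since $ \bar{X} \independent s^2 $ with $ \sqrt{n} (\bar{X}-\mu)/\sigma \sim \n (0,1) $ and $ (n-1) s^2 / \sigma^2 \sim \chi^2_{n-1} $ (see ``Samples from the normal distribution''),
$$ \frac{\sqrt{n} (\bar{X}-\mu)}{s} = \frac{\sqrt{n} (\bar{X}-\mu) / \sigma}{\sqrt{ \left[ (n-1) s^2 / \sigma^2 \right] / (n-1)}} \sim t_{n-1} . $$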
\litem{Snedecor's $ F $ distribution}{C\&B 5.3.6--8}\index{F distribution@$ F $ distribution} $ (\chi^2_p / p) / (\chi^2_q / q) \sim F_{p, q} $. The $ F $ distribution is also related by transformation with several other distributions:
\begin{enumerate}
\item $ 1 / F_{p,q} \sim F_{q, p} $ (i.e., the reciprocal of an $ F $ r.v.\ is another $ F $ with the degrees of freedom switched);
\item $ (t_q)^2 \sim F_{1,q} $;
\item $ (p/q) F_{p,q} / (1 + (p/q) F_{p,q}) \sim \operatorname{beta} (p/2, q/2) $.
\end{enumerate}
\litem{Lognormal distribution}{C\&B p.~625} If $ X \sim \n (\mu, \sigma^2) $, then $ Y \equiv e^X $ is lognormally distributed. (Note: a lognormal is \textit{not} the $ \log $ of a normally distributed r.v.).
\begin{align*}
\E Y &= e^{\mu + (\sigma^2 / 2)}; \\
\Var Y &= e^{2(\mu + \sigma^2)} - e^{2 \mu + \sigma^2} .
\end{align*}
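These moments follow from the normal mgf: $ \E Y = \E e^X = M_X(1) = e^{\mu + \sigma^2/2} $ and $ \E Y^2 = \E e^{2X} = M_X(2) = e^{2\mu + 2\sigma^2} $, so $ \Var Y = \E Y^2 - (\E Y)^2 $ as above.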
\litem{Exponential families}{C\&B 3.4; Mahajan 1-5--6, 11} Any family of pdfs or pmfs that can be expressed as
$$ f(x | \mathbf{\theta}) = h(x) c(\mathbf{\theta}) \exp \left( \sum_{i=1}^{k} w_i (\mathbf{\theta}) t_i (x) \right), $$
where $ h(x) \geq 0 $, $ \{ t_i(x) \} $ are real-valued functions, $ c(\mathbf{\theta}) \geq 0 $, $ \{ w_i (\mathbf{\theta}) \} $ are real-valued functions, and the support does not depend on $ \theta $.
Includes normal, gamma, beta, $ \chi^2 $, binomial, Poisson, and negative binomial. C\&B Theorem 3.4.2 gives results that may help calculate mean and variance using differentiation, rather than summation/integration.
Can be re-parametrized as:
$$ f(x | \eta) = h(x) c^* (\eta) \exp \left( \sum_{i=1}^{k} \eta_i t_i (x) \right), $$
over ``natural parameter space''\index{Natural parameter space} $ \mathcal{H} \equiv \{ \eta = (\eta_1, \dotsc , \eta_k) \colon \int_\R h(x) \exp(\sum_{i=1}^{k} \eta_i t_i(x)) \, dx < \infty \} $, where for all $ \eta \in \mathcal{H} $, we have $ c^* (\eta) \equiv [ \int_\R h(x) \exp(\sum_{i=1}^{k} \eta_i t_i(x)) \, dx ]^{-1} $ to ensure the pdf integrates to $ 1 $.
The joint distribution of an iid sample from an exponential family will also be an exponential family (closure under random sampling).
\litem{Location and Scale families}{C\&B 3.5.1--7, p.~121} If $ f(x) $ a pdf, $ \mu, $ $ \sigma $ constants with $ \sigma > 0 $, then $ g(x) $ also a pdf: $$ g(x) \equiv \frac{1}{\sigma} f \left( \frac{x - \mu}{\sigma} \right). $$
$ X \sim g(x) $ iff $ \exists Z \sim f $, $ X = \sigma Z + \mu $. Assume $ X $ and $ Z $ exist; $ P (X \leq x) = P (Z \leq (x - \mu)/\sigma) $, and if $ \E Z $ and $ \Var Z $ exist, then $ \E X = \sigma \E Z + \mu $ and $ \Var (X) = \sigma^2 \Var Z $.
\begin{enumerate}
\item Family of pdfs $ f(x - \mu) $ indexed by $ \mu $ is the ``location family'' with standard pdf $ f(x) $ and location parameter $ \mu $.
\item Family of pdfs $ \frac{1}{\sigma} f(x / \sigma) $ indexed by $ \sigma > 0 $ is the ``scale family'' with standard pdf $ f(x) $ and scale parameter $ \sigma $. (e.g., exponential.)
\item Family of pdfs $ \frac{1}{\sigma} f((x - \mu) / \sigma) $ indexed by $ \mu $, and $ \sigma > 0 $ is the ``location-scale family'' with standard pdf $ f(x) $, location parameter $ \mu $, and scale parameter $ \sigma $. (e.g., uniform, normal, double exponential, Cauchy.)
\end{enumerate}
\litem{Stable distribution}{Hansen 5-15} Let $ X_1 $, $ X_2 $ be iid $ F $, and define $ Y = a X_1 + bX_2 + c $. Then $ F $ is a stable distribution iff $ \forall a $, $ b $, $ c $, $ \exists d $, $ e $ such that $ dY + e \sim F $.
\subsection{Random samples}
\litem{Random sample, iid}{C\&B 5.1.1} R.v.s $ X_1, \dotsc, X_n $ are a random sample of size $ n $ from population $ f(x) $ (a.k.a.\ $ n $ iid r.v.s with pdf/pmf $ f(x) $) if they are mutually independent, each with marginal pdf/pmf $ f(x) $. By independence, $ f(x_1, \dotsc, x_n) = \prod_i f(x_i) $.
\litem{Statistic}{C\&B 5.2.1; Mahajan 1-7} A r.v.\ $ Y \equiv T (X_1, \dotsc, X_n) $, where $ T $ is a real or vector-valued function $ T(x_1, \dotsc, x_n) $ whose domain includes the support of $ (X_1, \dotsc, X_n) $. The distribution of $ Y $ is called its sampling distribution.
Alternately, any measurable function \textit{of the data} (as distinct from a parameter---a function of the distribution).
\litem{Unbiased estimator}{Hansen 5-14; C\&B 7.3.2} A statistic $ \hat{\theta} $ is unbiased for $ \theta $ iff $ \E_{\theta} (\hat{\theta}) = \theta $ for all $ \theta $. That is, if $ \operatorname{Bias} [\hat{\theta}] \equiv \E_{\theta} \hat{\theta} - \theta = 0 $ for all $ \theta $.
\litem{Sample mean}{C\&B 5.2.2, 4, 6--8, 10, p.~216, 5.5.2}\index{X (sample mean)@$ \bar{X} $ (sample mean)} $ \bar{X} \equiv \tfrac{1}{n} \sum_i X_i $ (i.e., the arithmetic average of the values in a random sample). For any real numbers, the arithmetic average minimizes SSR (i.e., $ \bar{x} \in \argmin_a \sum_i (x_i - a)^2 $).
\begin{enumerate}
\item If $ \E X_i = \mu < \infty $, then $ \E \bar{X} = \mu $ (i.e., $ \bar{X} $ is an unbiased estimator of $ \mu $)
\item If $ \Var X_i = \sigma^2 < \infty $, then $ \Var \bar{X} = \sigma^2 / n $.
\item If $ X_i $ have mgf $ M_X(t) $, then $ M_{\bar{X}}(t) = [M_X(t/n)]^n $.
\item (Law of Large Numbers)\index{Law of Large Numbers for iid samples}\index{Weak Law of Large Numbers}\index{Strong Law of Large Numbers}\index{LLN (Law of Large Numbers) for iid samples} If $ \{ X_i \} $ iid, $ \E X_i = \mu < \infty $ and $ \Var X_i = \sigma^2 < \infty $, then the sequence $ \bar{X}_n \xrightarrow{\text{as}} \mu $ (this also implies convergence in probability, the ``Weak'' LLN).
\end{enumerate}
The distribution of the $ X_i $, together with $ n $, characterize the distribution of $ \bar{X} $:
\begin{enumerate}
\item If $ X_i \sim \n(\mu, \sigma^2) $, then $ \bar{X} \sim \n (\mu, \sigma^2/n) $.
\item If $ X_i \sim \gammanoital (\alpha, \beta) $, then $ \bar{X} \sim \gammanoital (n \alpha, \beta / n) $.
\item If $ X_i \sim \operatorname{Cauchy} (\theta, \sigma) $, then $ \bar{X} \sim \operatorname{Cauchy} (\theta, \sigma) $.
\item If $ X_i \sim (1/ \sigma) f((x - \mu)/ \sigma) $ are members of a location-scale family, then $ \bar{X} = \sigma \bar{Z} + \mu $, where $ \{ Z_i \}_{i=1}^{n} $ is a random sample with $ Z_i \sim f(z) $.
\end{enumerate}
\litem{Sample variance}{C\&B 5.2.3--4, 6; Greene 102--4}\index{s2 (sample variance)@$ s^2 $ (sample variance)}
$$ s^2 \equiv \frac{1}{n-1} \sum_i (X_i - \bar{X})^2 = \frac{1}{n-1} \Bigl [ \sum_{i} X_i^2 - n \bar{X}^2 \Bigr ] .$$
Sample standard deviation\index{Sample standard deviation} is $ s \equiv \sqrt{s^2} $. If $ \Var (X_i) = \sigma^2 < \infty $, then $ \E s^2 = \sigma^2 $ (i.e., $ s^2 $ is an unbiased estimator of $ \sigma^2 $). $ s_{aX}^{2} = a^2 s_X^2 $.
For any real numbers $ \{ x_i \}_{i=1}^{n} $, we have $ \sum_i (x_i - \bar{x})^2 = \sum_i x_i^2 - n\bar{x}^2 $.
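Sketch of unbiasedness (assuming iid $ X_i $ with mean $ \mu $ and variance $ \sigma^2 $): since $ \sum_i (X_i - \bar{X})^2 = \sum_i (X_i - \mu)^2 - n (\bar{X} - \mu)^2 $,
$$ \E \sum_i (X_i - \bar{X})^2 = n \sigma^2 - n \cdot \frac{\sigma^2}{n} = (n-1) \sigma^2 , $$
giving $ \E s^2 = \sigma^2 $.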
\litem{Sample covariance}{Greene 102--4}\index{sXY (sample covariance)@$ s_{XY} $ (sample covariance)}
\begin{align*}
s_{XY} &\equiv \frac{1}{n-1} \sum_i (X_i - \bar{X})(Y_i - \bar{Y}) \\
&= \frac{1}{n-1} \Bigl [ \sum_{i} X_i Y_i - n \bar{X} \bar{Y} \Bigr ] .
\end{align*}
If $ \Cov (X_i, Y_i) = \sigma_{XY} < \infty $, then $ \E s_{XY} = \sigma_{XY} $ (i.e., $ s_{XY} $ is an unbiased estimator of $ \sigma_{XY} $). $ s_{aX, bY} = ab \, s_{XY} $.
For any real numbers $ \{ x_i, y_i \}_{i=1}^{n} $, we have $ \sum_i (x_i - \bar{x})(y_i - \bar{y}) = \sum_i x_i y_i - n\bar{x}\bar{y} $.
\litem{Sample correlation}{Greene 102--4}\index{rXY (sample correlation)@$ r_{XY} $ (sample correlation)} $ r_{XY} \equiv s_{XY} / (s_{X} s_{Y}) $.
$ r_{aX, bY} = (ab / |ab|) r_{XY} $.
\litem{Order statistic}{C\&B 5.4.1--4} The order statistics of a sample $ X_1 , \dotsc , X_n $ are the ordered values from $ X_{(1)} $ (the sample minimum) to $ X_{(n)} $ (the sample maximum). Thus the sample median\index{Sample median} is
\[ M \equiv \begin{cases} X_{((n+1)/2)}, & \text{$ n $ is odd;} \\ \tfrac{1}{2} (X_{(n/2)} + X_{(n/2 + 1)}), & \text{$ n $ is even.} \end{cases} \]
If $ \{ X_i \}_{i=1}^{n} $ iid continuous r.v.s, then
\begin{align*}
F_{X_{(j)}} (x) &= \sum_{k=j}^{n} \binom{n}{k} [F_X (x)]^{k} [1 - F_X (x)]^{n-k}; \\
f_{X_{(j)}} (x) &= \frac{n!}{(j-1)! (n-j)!} f_X (x) [F_X (x)]^{j-1} [1 - F_X (x)]^{n-j}.
\end{align*}
See C\&B 5.4.3 for discrete r.v.
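Example: for $ \{ X_i \}_{i=1}^{n} $ iid $ \unifnoital (0,1) $, the sample maximum has $ F_{X_{(n)}} (x) = x^n $ and $ f_{X_{(n)}} (x) = n x^{n-1} $ on $ [0,1] $, so $ \E X_{(n)} = n / (n+1) $.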
\litem{Samples from the normal distribution}{C\&B 5.3.1, 6} $ \{ X_i \}_{i=1}^{n} $ iid $ \n (\mu, \sigma^2 ) $ gives:
\begin{enumerate}
\item $ \bar{X} \independent s^2 $ (can also be shown with Basu's Theorem);
\item $ \bar{X} \sim \n (\mu, \sigma^2 / n ) $;
\item $ \Var (s^2) = 2 \sigma^4 / (n-1) $;
\item $ (n-1) s^2 / \sigma^2 \sim \chi^2_{n-1} $.
\end{enumerate}
If $ \{ X_i \}_{i=1}^{n} $ iid $ \n (\mu_X, \sigma_X^2 ) $ and $ \{ Y_i \}_{i=1}^{m} $ iid $ \n (\mu_Y, \sigma_Y^2 ) $, $$ \frac{s_X^2 / s_Y^2}{\sigma_X^2 / \sigma_Y^2} = \frac{s_X^2 / \sigma_X^2}{s_Y^2 / \sigma_Y^2} \sim F_{n-1, m-1} . $$
\subsection{Convergence of random variables}
\begin{displaymath}
\xymatrix{
& & X_n \xrightarrow{L_s} X \ar@{=>}[d]^{s \geq r} \\
X_n \xrightarrow{\text{as}} X \ar@{=>}[dr] & & X_n \xrightarrow{L_r} X \ar@{=>}[dl]^{r \geq 0} \\
& X_n \xrightarrow{\text{p}} X \ar@{=>}[d] & \\
& X_n \xrightarrow{\text{d}} X &
}
\end{displaymath}
See more LLNs and CLTs in ``Time-series concepts.''
\litem{Convergence in probability}{C\&B 5.5.1--4, 12; Hansen 5-41; Hayashi 89; D\&M 103; MaCurdy p.~9} $ \{ X_i \}_{i=1}^{\infty} $ converges in probability to $ X $ iff, $ \forall \epsilon > 0 $, $ \lim_{n \to \infty} P( |X_n - X| \geq \epsilon ) =0 $, or equivalently $ \lim_{n \to \infty} P( |X_n - X| < \epsilon ) = 1 $. Written as $ X_n \xrightarrow{\text{p}} X $ or $ X_n - X = o_p (1) $ or $ \plim_{n \to \infty} X_n = X $.
\begin{enumerate}
\item Convergence in probability is implied by almost sure convergence or convergence in $ L_p $ (for $ p>0 $).
\item Convergence in probability implies convergence in distribution (but \textit{not} conversely).
\item (Weak Law of Large Numbers)\index{Weak Law of Large Numbers}\index{Law of Large Numbers for iid samples} If $ \{ X_i \} $ iid with $ \E X_i = \mu < \infty $ and $ \Var X_i = \sigma^2 < \infty $, then the sequence $ \bar{X}_n \xrightarrow{\text{p}} \mu $ (stronger result gives convergence almost surely).
\item (Continuous Mapping Theorem)\index{Continuous Mapping Theorem}\index{CMT (Continuous Mapping Theorem)} If $ X_n \xrightarrow{\text{p}} X $ and $ h $ is a continuous function, then $ h(X_n) \xrightarrow{\text{p}} h(X) $.
\item If $ \E X_n \to \mu $ and $ \Var X_n \to 0 $, then $ X_n \xrightarrow{\text{p}} \mu $.
\end{enumerate}
\litem{Uniform convergence in probability}{Hayashi 456--7; MaCurdy p.~14; D\&M 137} $ \{ Q_n(\theta) \}_{n=1}^{\infty} $ converges uniformly in probability to $ Q_0(\theta) $ iff $ \sup_{\theta \in \Theta} \lVert Q_n(\theta) - Q_0(\theta) \rVert \xrightarrow{\text{p}} 0 $.
That is, $ \forall \epsilon > 0 $, $ \lim_{n \to \infty} P( \sup_{\theta \in \Theta} \lVert Q_n(\theta) - Q_0(\theta) \rVert \geq \epsilon ) = 0 $, or equivalently $ \lim_{n \to \infty} P( \sup_{\theta \in \Theta} \lVert Q_n(\theta) - Q_0(\theta) \rVert < \epsilon ) = 1 $.
This is stronger than pointwise convergence. Uniform convergence in probability is the regularity condition required to pass $ \plim $s through functions or to reverse the order of differentiation and integration.
\litem{Little $ o $ error notation}{D\&M 108--13; Hansen 5-42; MathWorld}\index{o error notation@$ o $ error notation}\index{Landau symbols} Roughly speaking, a function is $ o(z) $ iff it is of lower asymptotic order than $ z $.
$ f(n) = o(g(n)) $ iff $ \lim_{n \to \infty} f(n) / g(n) = 0 $. If $ \{ f(n) \} $ is a sequence of random variables, then $ f(n) = o_p (g(n)) $ iff $ \plim_{n \to \infty} f(n) / g(n) = 0 $.
%Per MathWorld, $ f(x) = o(\phi) $ iff $ \lim_{x \to \infty} f(x) / \phi = 0 $.
We write $ X_n - X = o_p (n^{- \gamma}) $ iff $ n^{\gamma}(X_n - X) \xrightarrow{\text{p}} 0 $.
\litem{Big $ O $ error notation}{D\&M 108--13; MathWorld}\index{O error notation@$ O $ error notation}\index{Landau symbols} Roughly speaking, a function is $ O(z) $ iff it is of the same asymptotic order as $ z $.
$ f(n) = O(g(n)) $ iff $ | f(n) / g(n) | < K $ for all $ n > N $ and some positive integer $ N $ and some constant $ K > 0 $. If $ \{ f(n) \} $ is a sequence of random variables, then $ f(n) = O_p (g(n)) $ iff $ \forall \epsilon > 0 $, $ \exists K > 0 $ and $ N $ such that $ P ( | f(n) / g(n) | > K ) < \epsilon $ for all $ n > N $ (i.e., $ f(n)/g(n) $ is bounded in probability).
%Per MathWorld, $ f(x) = O(\phi) $ iff $ |f(x)| < A \phi $ for some constant $ A $ and all values of $ x $.
\litem{Order symbols}{D\&M 111-2}
\begin{align*}
O(n^p) \pm O(n^q) &= O(n^{\max \{ p,q \}}). \\
o(n^p) \pm o(n^q) &= o(n^{\max \{ p,q \}}). \\
O(n^p) \pm o(n^q) &= \begin{cases} O(n^p), & p \geq q; \\ o(n^q), & p < q. \end{cases} \\
O(n^p) \cdot O(n^q) &= O(n^{p+q}). \\
o(n^p) \cdot o(n^q) &= o(n^{p+q}). \\
O(n^p) \cdot o(n^q) &= o(n^{p+q}).
\end{align*}
These identities cover sums, \textit{as long as the number of terms summed is independent of $ n $}.
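For example (the same algebra applies to the stochastic order symbols $ O_p $ and $ o_p $): if $ \bar{X}_n - \mu = O_p (n^{-1/2}) $, as the CLT gives, then
$$ \bar{X}_n^2 - \mu^2 = 2 \mu (\bar{X}_n - \mu) + (\bar{X}_n - \mu)^2 = O_p (n^{-1/2}) + O_p (n^{-1}) = O_p (n^{-1/2}). $$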
\litem{Asymptotic equivalence}{D\&M 110--1} $ f(n) \stackrel{a}{=} g(n) $ iff $ \lim_{n \to \infty} f(n) / g(n) = 1 $.
\litem{Convergence in $ L_p $}{Hansen 5-43--5; Hayashi 90; MaCurdy p.~11}\index{Convergence in mean square}\index{Convergence in quadratic mean} $ \{ X_i \}_{i=1}^{\infty} $ converges in $ L_p $ to $ X $ iff, $ \lim_{n \to \infty} \E( |X_n - X|^p) = 0 $. Note this requires the existence of a $ p $th moment. Written $ X_n \xrightarrow{L_p} X $. Convergence in $ L_2 $ a.k.a.\ convergence in mean square/quadratic mean.
\begin{enumerate}
\item Convergence in $ L_p $ is implied by convergence in $ L_q $ for $ q \geq p $.
\item Convergence in $ L_p $ implies convergence in $ L_j $ for $ j \leq p $.
\item Convergence in $ L_p $ (for $ p>0 $) implies convergence in probability and in distribution (but \textit{not} conversely).
\item (Continuous Mapping Theorem extension)\index{Continuous Mapping Theorem} If $ X_n \xrightarrow{L_p} X $ and $ h $ is a continuous function, then $ h(X_n) \xrightarrow{L_p} h(X) $.
\end{enumerate}
Convergence in $ L_2 $ to a constant requires $ \lim_{n \to \infty} \E [(X_n - X)' (X_n - X)] = \lim_{n \to \infty} [ \operatorname{Bias}^2 + \Var ] = 0 $. Thus it is necessary and sufficient that both the bias and the variance go to zero.
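Standard example: for iid $ X_i $ with mean $ \mu $ and variance $ \sigma^2 < \infty $, the sample mean is unbiased with variance $ \sigma^2 / n $, so
$$ \E (\bar{X}_n - \mu)^2 = \frac{\sigma^2}{n} \to 0 , \quad \text{i.e.,} \quad \bar{X}_n \xrightarrow{L_2} \mu . $$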
\litem{Almost sure convergence}{C\&B 5.5.6, 9; Hayashi 89; D\&M 106, 131; MaCurdy p.~10}\index{Convergence almost surely} $ \{ X_i \}_{i=1}^{\infty} $ converges almost surely to $ X $ iff, $ \forall \epsilon > 0 $, $ P( \lim_{n \to \infty} |X_n - X| < \epsilon ) = 1 $. Written $ X_n \xrightarrow{\text{as}} X $. Also known as strong convergence, or convergence almost everywhere.
\begin{enumerate}
\item Almost sure convergence implies convergence in probability and in distribution (but \textit{not} conversely).
\item (Strong Law of Large Numbers)\index{Strong Law of Large Numbers}\index{Law of Large Numbers for iid samples} If $ \{ X_i \} $ iid with $ \E X_i = \mu < \infty $ and $ \Var X_i = \sigma^2 < \infty $, then the series $ \bar{X}_n \xrightarrow{\text{as}} \mu $.
\item (Strong Law of Large Numbers, niid)\index{Strong Law of Large Numbers}\index{Law of Large Numbers for niid samples} If $ \{ X_i \} $ niid with $ \E X_i = 0 $ and $ \sum_{i} \Var X_i / i^2 < \infty $, then the series $ \bar{X}_n \xrightarrow{\text{as}} 0 $.
\item (Continuous Mapping Theorem extension)\index{Continuous Mapping Theorem} If $ X_n \xrightarrow{\text{as}} X $ and $ h $ is a continuous function, then $ h(X_n) \xrightarrow{\text{as}} h(X) $.
\end{enumerate}
\litem{Convergence in distribution}{C\&B 5.5.10--13; Hayashi 90--1; Greene 119--20; D\&M 107} $ \{ X_i \}_{i=1}^{\infty} $ converges in distribution to $ X $ iff, $ \lim_{n \to \infty} F_{X_n} (x) = F_X (x) $ at all points where $ F_X $ is continuous. Written as $ X_n \xrightarrow{\text{d}} X $ or $ X_n - X = O_p (1) $ or as ``$ X $ is the limiting distribution of $ X_n $.''
\begin{enumerate}
\item Convergence in distribution is implied by almost sure convergence, convergence in $ L_p $ (for $ p>0 $), or convergence in probability.
\item Convergence in distribution implies convergence in probability \textit{if the series converges to a constant}.
\end{enumerate}
\litem{Central Limit Theorem for iid samples}{C\&B 5.5.14--15; Hansen 5-60--65; Hayashi 96}\index{CLT (Central Limit Theorem) for iid samples}\index{Lindeberg-Levy CLT} Lindeberg-Levy CLT: $ \sqrt{n} (\bar{X}_n - \mu ) / \sigma \xrightarrow{\text{d}} \n (0,1) $ as long as the iid $ X_i $s each have finite mean, and finite, nonzero variance. Note a weaker form requires mgfs of $ X_i $ to exist in a neighborhood of zero.
In the multivariate case, iid $ \mX_i \sim (\mmu, \mSigma) $ satisfy $ \sqrt{n} (\bar{\mX}_n - \mmu ) \xrightarrow{\text{d}} \n (\mzero, \mSigma) $. Proved using the Cram\'{e}r-Wold Device\index{Cram\'{e}r-Wold Device}.
%See Lyapounov's Theorem\index{Lyapounov's Theorem} on Hansen 5-62 or MaCurdy p.~24 for a CLT on independent r.v.s with non-constant variance (n.b., requires existance of $ (2 + \delta ) $th moment).
We also have CLT for niid samples (Lyapounov's Theorem), Ergodic stationary mds CLT, and CLT for $ \text{MA} (\infty) $ processes.
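Standard example of Lindeberg-Levy: for iid $ X_i \sim \operatorname{Bernoulli}(p) $ with $ 0 < p < 1 $, the mean $ p $ and variance $ p(1-p) $ are finite and nonzero, so
$$ \sqrt{n} (\bar{X}_n - p) \xrightarrow{\text{d}} \n (0, p(1-p)) . $$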
\litem{Central Limit Theorem for niid samples}{Hansen 5-62; D\&M 126; MaCurdy p.~21--2}\index{CLT (Central Limit Theorem) for niid samples}\index{Lyapounov's Theorem} [Lyapounov's Theorem] If $ X_i \sim \text{niid} (\mu , \sigma_i^2) $ and a $ (2 + \delta ) $th moment exists for each $ X_i $, then
$$ \sqrt{n} ( \bar{X}_n - \mu ) \xrightarrow{\text{d}} \n (0 , \bar{\sigma}^{2} ), $$
where $ \bar{\sigma}^{2} \equiv \lim_{n \to \infty} \tfrac{1}{n} \sum_{i} \sigma_{i}^{2} $, assuming this limit exists and is finite and nonzero.
Implies that if $ \epsilon_i \sim \text{niid} (0 , \sigma^2) $ (with extant $ (2 + \delta ) $th moment), and $ \{ z_i \} $ a series of (non-stochastic) constants, then $ n^{-1/2} Z' \epsilon \xrightarrow{\text{d}} \n (0 , \sigma^2 S_{zz} ) $ where $ S_{zz} \equiv \lim_{n \to \infty} \tfrac{1}{n} \sum_{i} z_i^2 = \lim_{n \to \infty} \tfrac{1}{n} Z' Z $.
\litem{Slutsky's Theorem}{C\&B 5.5.17; Hayashi 92--3} If $ X_n \xrightarrow{\text{d}} X $ and $ Y_n \xrightarrow{\text{p}} a $, where $ a $ is a constant, then:
\begin{enumerate}
\item $ Y_n X_n \xrightarrow{\text{d}} aX $;
\item $ X_n + Y_n \xrightarrow{\text{d}} X + a $.
\end{enumerate}
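Standard application ($ t $-statistic): if $ \sqrt{n} (\bar{X}_n - \mu) \xrightarrow{\text{d}} \n (0, \sigma^2) $ and $ s_n $ is any estimator with $ s_n \xrightarrow{\text{p}} \sigma > 0 $ (e.g., the sample standard deviation), then $ 1/s_n \xrightarrow{\text{p}} 1/\sigma $ by the CMT, and Slutsky gives
$$ \frac{\sqrt{n} (\bar{X}_n - \mu)}{s_n} \xrightarrow{\text{d}} \n (0, 1) . $$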
\litem{Delta Method}{C\&B 5.5.24, 26, 28; Wikipedia; Hayashi 93--4}
%Let $ T_1, \dotsc, T_k $ be random variables with means $ \theta_1 , \dotsc, \theta_k $, and define $ \mathbf{T} \equiv (T_1, \dotsc, T_k) $ and $ \mathbf{\theta} \equiv (\theta_1 , \dotsc, \theta_k) $. For any differentiable function $ g(\mathbf{T}) $, we use a Tailor series expansion about $ \mathbf{\theta} $ to get $ \E g(\mathbf{T}) \approx g(\mathbf{\theta}) $ and
%$$ \Var g(\mathbf{T}) \approx \sum_{i=1}^{k} [g_{i}' (\mathbf{\theta})]^2 \Var T_i + 2 \sum_{i > j} g_{i}' (\mathbf{\theta}) g_{j}' (\mathbf{\theta}) \Cov (T_i, T_j). $$
Let $ \{ X_i \}_{i=1}^{\infty} $ be a sequence of r.v.s satisfying $ \sqrt{n} (X_n - \theta) \xrightarrow{\text{d}} \n (0, \sigma^2) $. For a given function $ g $ and specific $ \theta $, suppose $ g'(\theta) $ exists and $ g'(\theta) \neq 0 $. Then:
$$ \sqrt{n} [g(X_n) - g(\theta )] \xrightarrow{\text{d}} \n (0, \sigma^2 [g' (\theta)]^2). $$
If $ g'(\theta) = 0 $, but $ g''(\theta) $ exists and $ g''(\theta) \neq 0 $, we can apply the second-order Delta Method and get
$ n [g(X_n) - g(\theta )] \xrightarrow{\text{d}} \tfrac{1}{2} \sigma^2 g''(\theta) \chi^2_1 $.
%For multivariate Delta Method see C\&B~5.5.28.
Alternate formulation: If $ B $ is an estimator for $ \beta $ then the variance of a function $ h(B) \in \R $ is $ \Var (h(B)) \approx \nabla h(\beta)' \Var (B) \nabla h(\beta) $. If $ h(B) $ is vector-valued, the variance is $ H \Var (B) H' $, where $ H = \frac{\partial h}{\partial \beta'} $ (i.e., $ H_{ij} \equiv \partial_{j} h_{i} (\beta) $).
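Standard worked example: take $ g(x) = x^2 $ with $ \sqrt{n} (\bar{X}_n - \mu) \xrightarrow{\text{d}} \n (0, \sigma^2) $. If $ \mu \neq 0 $, then $ g'(\mu) = 2 \mu \neq 0 $ and
$$ \sqrt{n} (\bar{X}_n^2 - \mu^2) \xrightarrow{\text{d}} \n (0, 4 \mu^2 \sigma^2) ; $$
if $ \mu = 0 $, the first-order term vanishes and the second-order Delta Method gives $ n \bar{X}_n^2 \xrightarrow{\text{d}} \sigma^2 \chi^2_1 $.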
\subsection{Parametric models}
\litem{Parametric model}{Mahajan 1-1--2} Describes an (unknown) probability distribution $ P $ that is assumed to be a member of a family of distributions $ \mathcal{P} $. We describe $ \mathcal{P} $ with a parametrization: a map from a (simpler) space $ \Theta $ to $ \mathcal{P} $ such that $ \mathcal{P} = \{ P_\theta \colon \theta \in \Theta \} $. If $ \Theta $ is a ``nice'' subset of Euclidean space, and the mapping $ P_\theta $ is ``smooth,'' then $ \mathcal{P} $ is called a parametric model. It is a regular parametric model if either all $ P_\theta $ are continuous, or all are discrete.
\litem{Parameter}{Mahajan 1-2} Mapping from the family of distributions $ \mathcal{P} $ to another space (typically a subset of Euclidean Space). Can be explicitly or implicitly defined.
A function \textit{of the distribution} (as distinct from a statistic---a function of the data).
\litem{Identification}{Mahajan 1-3--4; C\&B 11.2.2}\index{Point identification} A parameter is (point) identified if it is uniquely determined for every distribution; i.e., if the mapping is one-to-one. When the parameter is implicitly defined as the solution to an optimization problem (e.g., $ \theta(P) = \argmax_{b \in \Theta} Q_0 (b,P) $), identification corresponds to existence of a unique solution to the optimization.
Two elements $ \theta_1 $, $ \theta_2 \in \Theta $ are observationally equivalent\index{Observational equivalence} iff they imply $ P_{\theta_1} = P_{\theta_2} $. Identification of $ \theta $ means there is no other element of $ \Theta $ observationally equivalent to $ \theta $; i.e., $ P_{\theta_1} = P_{\theta_2} \implies \theta_1 = \theta_2 $.
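Standard example of non-identification: if $ X \sim \n (\theta_1 + \theta_2, 1) $ with $ \theta = (\theta_1, \theta_2) \in \R^2 $, then
$$ P_{(\theta_1, \theta_2)} = P_{(\theta_1 + c, \, \theta_2 - c)} \quad \text{for all } c \in \R , $$
so $ \theta $ is not identified (any two parameter values with the same sum are observationally equivalent), although the sum $ \theta_1 + \theta_2 $ is identified.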
\litem{Identification in exponential families}{Mahajan 1-5--6, Metrics P.S. 5-1} For iid sampling from an exponential family where $ \eta_i (\theta) = \theta_i $, if the $ k \times k $ (Fisher Information\index{Fisher Information}) matrix
$$ I(\theta^*) \equiv \E \left[ \left( \frac{d \ln p(x, \theta^*)}{d \theta} \right) \left( \frac{d \ln p(x, \theta^*)}{d \theta} \right)' \right] $$
is nonsingular for every $ \theta^* \in \Theta $, then $ \theta $ is identified.
\litem{Conditional homoscedasticity}{Mahajan 2-17} The assumption that $ \Var (Y|Z) = \sigma^2 $; i.e., variance of $ Y $ does not depend on $ Z $.
\litem{Regression model}{Mahajan 1-6--7, 3-9; Hayashi 465--6; Metrics P.S. 7-7} $ \{ Y_i , X_i \}_{i=1}^{n} $, where $ Y_i \in \R $ is the dependent variable (a.k.a.\ response) and $ X_i \in \R^d $ the covariates or explanatory variables. Possible specifications include:
\begin{enumerate}
\item $ \E (Y_i | X_i) = g(X_i) $ for some unknown function $ g $ (nonparametric regression);
\item $ \E (Y_i | X_i) = g(X_{i}' \theta_0) $ for some unknown $ \theta_0 \in \R^d $ and unknown function $ g $ (single-index model; semi-parametric);
\item $ \E (Y_i | X_i) = X_{i}' \theta_0 $ for some unknown $ \theta_0 \in \R^d $;
\item $ (Y_i | X_i) \sim \n (X_{i}' \theta_0 , \sigma^2) $ for some unknown $ \theta_0 \in \R^d $ and $ \sigma^2 \in \R_+ $ (Gaussian regression model\index{Gaussian regression model}; $ \theta_0 $ is identified and conditional MLE $ \hat{\theta} = (X' X)^{-1}X' Y $ is consistent if $ \E (X_i X_{i}') $ is nonsingular, conditional MLE $ \hat{\sigma}^2 = \tfrac{1}{n} (Y - X \hat{\theta})' (Y - X \hat{\theta}) $).
\end{enumerate}
\litem{Linear regression model with non-stochastic covariates}{Mahajan 1-10--1, 18--9, Metrics P.S. 5-3} The two-dimensional Gaussian regression model with non-stochastic $ X_i = (1, x_i)' $ known. The parameter $ (\theta_0, \sigma^2) $ is identified as long as the $ x_i $ are not all identical, with complete sufficient statistic $ (\sum Y_i, \sum Y_i^2, \sum x_i Y_i) $. MLE computed in problem set.
\litem{Seemingly Unrelated Regressions}{Mahajan 2-11, 21}\index{SUR (seemingly unrelated regressions)} $ \{ Y_i , X_i \}_{i=1}^{n} $ where $ Y_i \in \R^m $ and $ X_i = (X_{1i}', \dotsc , X_{mi}')' \in \R^{(m^{2})} $. We assume the $ Y_i $ are (multivariate) normally distributed with means $ \E [Y_{si}] = X_{si}' \beta_{s} $ where $ \beta_{s} \in \R^m $, and the parameter of interest is $ \beta = (\beta_{1}', \dotsc , \beta_{m}')' \in \R^{(m^{2})} $.
\litem{Probit model}{Mahajan 2-12--3, 3-9--10; Hayashi 466} iid $ \{ W_i \}_{i=1}^{n} = \{ Y_i , Z_i \}_{i=1}^{n} $ where $ Y_i \in \{0,1\} $ and $ Y_i $ have conditional distribution $ P (Y_i = 1 | Z_i) = \Phi (\theta' Z_i) $ where $ \Phi (\cdot) $ is the cdf of the standard normal. $ \theta $ is identified and MLE is consistent if $ \E (Z_i Z_i') $ is nonsingular.
Alternate motivation is threshold crossing model\index{Threshold crossing model}: $ Y_i^* = \theta' Z_i - \epsilon_i $ where $ \epsilon_i \independent Z_i $ and standard normal, and $ Y_i = I\{Y_i^* > 0 \} $.
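The link follows from the threshold crossing model: since $ \epsilon_i \independent Z_i $ and $ \epsilon_i $ is standard normal,
$$ P (Y_i = 1 | Z_i) = P (Y_i^* > 0 | Z_i) = P (\epsilon_i < \theta' Z_i | Z_i) = \Phi (\theta' Z_i) . $$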
\litem{Nonlinear least squares}{Mahajan 2-16--7, 3-6--7, 17--9}\index{NLS (nonlinear least squares)} iid $ \{ Y_i , Z_i \}_{i=1}^{n} $ where $ Y_i $ have conditional expectation $ \E (Y|Z) = \psi (Z, \theta) $. The parameter $ \theta $ can also be defined implicitly as $ \theta = \argmin_{b \in \Theta} \E [Y - \psi(Z,b)]^2 $. Identification condition is that for all $ b \neq \theta $, we have $ P (\psi(Z, b) \neq \psi(Z, \theta)) > 0 $.
See Mahajan 3-17--9 for asymptotic properties, including heteroscedasticity robust asymptotic variance matrix\index{Heteroscedasticity robust asymptotic variance matrix}.
\litem{Linear instrumental variables}{Mahajan 2-22--3, 3-11--2} $ Y_i = X_i \theta + \epsilon_i $ with moment conditions $ \E [(Y_i - X_i \theta) Z_i ] = 0 $ for random vector $ Z_i $. The $ Z_i $ are (exogenous) instruments for the endogenous regressors $ X_i $ (endogenous because $ \E \epsilon_i X_i \neq 0 $). Identification condition for $ \theta $ is that $ \E (Z_i X_{i}') $ have full column rank and that the dimension of $ Z_i $ be at least the dimension of $ X_i $.
\subsection{Statistics}
\litem{Sufficient statistic}{Mahajan 1-8--10; C\&B 6.2.1} $ T(\mX) $ is sufficient for $ \{P_\theta \colon \theta \in \Theta \} $ (or more compactly for $ \theta $) iff the conditional distribution of $ \mX $ given $ T(\mX) $ does not depend on $ \theta $; i.e., $ p(\mx | T(\mx)) = p(\mx | T(\mx), \theta) $. Once the value of a sufficient statistic is known, the sample does not carry any further information about $ \theta $. Useful for:
\begin{enumerate}
\item Decision theory: base decision rules on sufficient statistics (for any decision rule, we can always come up with rule based only on a sufficient statistic that has the same risk);
\item Dealing with nuisance parameters in hypothesis testing: find sufficient statistics for the nuisance parameters and condition decision rules on them;
\item Unbiased estimation: look for unbiased estimators that are functions of sufficient statistics.
\end{enumerate}
Any one-to-one function of a sufficient statistic is also sufficient. Outside exponential families, it is rare to have a sufficient statistic of smaller dimension than the data.
\litem{Factorization theorem}{Mahajan 1-9; C\&B 6.2.6} In a regular parametric model $ \{P_\theta \colon \theta \in \Theta \} $, a statistic $ T(\mX) $ (with range $ \mathcal{T} $) is sufficient for $ \theta $ iff there exists a function $ g \colon \mathcal{T} \times \Theta \to \R $ and a function $ h $ such that $ f(\mx, \theta ) = g(T(\mx), \theta) h(\mx) $ for all $ \mx $ and $ \theta $.
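Standard example: for an iid sample $ \mX = (X_1, \dotsc , X_n) $ from $ \n (\theta, 1) $,
$$ f(\mx, \theta) = \underbrace{(2 \pi)^{-n/2} e^{- \frac{1}{2} \sum_i x_i^2}}_{h(\mx)} \underbrace{e^{\theta \sum_i x_i - n \theta^2 / 2}}_{g(T(\mx), \theta)} , $$
so $ T(\mX) = \sum_i X_i $ is sufficient for $ \theta $ by the factorization theorem.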
\litem{Minimal sufficient statistic}{Mahajan 1-12, 19; C\&B 6.2.11} $ T(\mX) $ is minimal sufficient if it is sufficient, and for any other sufficient statistic $ S(\mX) $ we can find a function $ r $ such that $ T(\mX) = r(S(\mX)) $. This means that a minimal sufficient statistic induces the coarsest possible partition on the data; i.e., it has achieved the maximal amount of data reduction possible while still retaining all information about the parameter.
Any one-to-one function of a minimal sufficient statistic is minimal sufficient. If a minimal sufficient statistic exists, then any complete sufficient statistic is also a minimal sufficient statistic.
\litem{Likelihood function}{Mahajan 1-13--4} $ L(\mx, \theta) \equiv p(\mx, \theta) $. This is the same as the pdf/pmf, but considered as a function of $ \theta $ instead of $ \mx $.
The likelihood ratio $ \Lambda(\mx, \cdot) \equiv L(\mx, \cdot) / L(\mx, \theta_0) $, where $ \theta_0 \in \Theta $ is fixed and known, with the support of $ P_\theta $ a subset of the support of $ P_{\theta_0} $ for all $ \theta \in \Theta $. The likelihood ratio is minimal sufficient for $ \theta $.
\litem{Ancillary statistic}{Mahajan 1-14; C\&B 6.2.16} $ S(\mX) $ is ancillary for $ \theta $ iff the distribution of $ S(\mX) $ does not depend on $ \theta $.
It is first-order ancillary\index{First-order ancillary statistic} iff $ \E S(\mX) $ does not depend on $ \theta $.
\litem{Complete statistic}{Mahajan 1-16; C\&B 6.2.21, 28} $ T \colon \mathcal{X} \to \mathcal{T} $ is complete iff for every measurable real function $ g \colon \mathcal{T} \to \R $, $ \E_\theta [g(T)] = 0 $ for all $ \theta \in \Theta $ implies that $ g(T) = 0 $ almost everywhere. Equivalently, $ T $ is complete if no non-constant function of $ T $ is first-order ancillary.
If a minimal sufficient statistic exists, then any complete sufficient statistic is also a minimal sufficient statistic.
\litem{Basu's theorem}{Mahajan 1-19; C\&B 6.2.24, 28} If $ T(\mX) $ is a complete minimal sufficient statistic, then $ T(\mX) $ is independent of every ancillary statistic. Note that minimality isn't really needed: if a minimal sufficient statistic exists, then any complete sufficient statistic is also a minimal sufficient statistic.
\litem{Statistics in exponential families}{Mahajan 1-11, 17; C\&B 6.2.10, 25} By the factorization theorem, $ T(\mX) \equiv (T_1(\mX), \dotsc , T_k(\mX)) $ is sufficient for $ \theta $. (N.B.\ The statistic must contain \textit{all} the $ T_i $.)
If $ \mX $ is an iid sample, then $ T(\mX) \equiv (\sum_i T_1(X_i), \dotsc , \sum_i T_k(X_i)) $ is a complete statistic if the set $ \{ (\eta_1(\theta), \dotsc, \eta_k(\theta)) \colon \theta \in \Theta \} $ contains an open set in $ \R^k $. (Usually, all we'll check is dimensionality.)
\subsection{Point estimation}
\litem{Estimator}{Mahajan 2-2} Any measurable function of the data. Note this must \textit{not} be a function of any parameters of the distribution.
\litem{Extremum estimator}{Hayashi 446} An estimator $ \hat{\theta} $ such that there is a scalar (``objective'') function $ Q_n (\theta) $ such that $ \hat{\theta} $ maximizes $ Q_n(\theta) $ subject to $ \theta \in \Theta \subseteq \R^p $. The objective function depends not only on $ \theta $, but also on the data (a sample of size $ n $).
\litem{Analogy principle}{Mahajan 2-2--3} Consider finding an estimator that satisfies the same properties in the sample that the parameter satisfies in the population; i.e., seek to estimate $ \theta(P) $ with $ \theta(P_n) $ where $ P_n $ is the empirical distribution\index{Empirical distribution} which puts mass $ \tfrac{1}{n} $ at each sample point. Note this distribution converges uniformly to $ P $.
\litem{Consistent estimator}{Mahajan 3-2; Hansen 5-41--2; C\&B 10.1.1, 3} The sequence of estimators $ \{ \hat{\theta}_n \}_{n=1}^{\infty} $ is consistent for $ \theta $ iff $ \hat{\theta}_n \xrightarrow{\text{p}} \theta(P) $. The sequence is superconsistent\index{Superconsistent estimator} iff $ \hat{\theta}_n - \theta = o_p (n^{-1/2}) $. Superconsistency implies consistency.
If $ \lim_{n \to \infty} \Var_\theta \hat{\theta}_n = 0 $ (variance goes to zero) and $ \lim_{n \to \infty} \E_\theta \hat{\theta}_n = \theta $ (bias goes to zero) for every $ \theta \in \Theta $, then $ \{ \hat{\theta}_n \} $ is consistent (sufficient, not necessary, by Chebychev's Inequality).
\litem{Consistency with compact parameter space}{Mahajan 3-4--5, 11; Hayashi 457} Let $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} Q_n (\mathbf{W}, b) \equiv \argmax_{b \in \Theta} Q_n (b) $. (In the last equivalence we have suppressed dependence on the data $ \mathbf{W} $.) This covers M-estimators, MLE, and GMM estimators. Suppose that:
\begin{enumerate}
\item $ \Theta $ is a \textit{compact} subset of $ \R^d $ [generally not satisfied];
\item $ Q_n (b) $ is continuous in $ b $ for any realization of the data $ \mathbf{W} $ [``usually easily checked''];
\item $ Q_n (b) $ is a measurable function of the data for all $ b \in \Theta $ [generally assumed].
\end{enumerate}
These conditions ensure that $ \hat{\theta}_n $ is well-defined. Suppose there exists a function $ Q_0 (b) $ such that:
\begin{enumerate}
\item \textit{Identification}: $ Q_0 (\cdot) $ is uniquely (globally) maximized on $ \Theta $ at $ \theta \in \Theta $;
\item \textit{Uniform convergence}: $ Q_n (\cdot) $ converges uniformly in probability to $ Q_0 (\cdot) $ [can be verified by checking more primitive conditions; in particular, for M-estimators a Uniform Weak LLN will suffice].
\end{enumerate}
Then $ \hat{\theta}_n \xrightarrow{\text{p}} \theta $.
\litem{Consistency without compact parameter space}{Mahajan 3-5, 6; Hayashi 458} Let $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} Q_n (\mathbf{W}, b) \equiv \argmax_{b \in \Theta} Q_n (b) $ as above. Suppose that:
\begin{enumerate}
\item True parameter $ \theta \in \operatorname{interior} (\Theta) $;
\item $ \Theta $ is a convex set;
\item $ Q_n (b) $ is concave in $ b $ for any realization of the data $ \mathbf{W} $ [will be true for MLE when the log-likelihood is concave in $ b $, possibly after a re-parametrization];
\item $ Q_n (b) $ is a measurable function of the data for all $ b \in \Theta $.
\end{enumerate}
These conditions ensure that $ \hat{\theta}_n $ is well-defined. Suppose there exists a function $ Q_0 (b) $ such that:
\begin{enumerate}
\item \textit{Identification}: $ Q_0 (\cdot) $ is uniquely (globally) maximized on $ \Theta $ at $ \theta \in \Theta $;
\item \textit{Pointwise convergence}: $ Q_n (b) \xrightarrow{\text{p}} Q_0 (b) $ for all $ b \in \Theta $.
\end{enumerate}
Then $ \hat{\theta}_n $ exists with probability approaching $ 1 $ and $ \hat{\theta}_n \xrightarrow{\text{p}} \theta $.
See Mahajan 3-6 for M-estimators.
\litem{Uniform (Weak) Law of Large Numbers}{Mahajan 3-6; Hayashi 459}\index{ULLN (Uniform Law of Large Numbers)} Suppose $ \{ W_i \}_{i} $ is ergodic stationary and that:
\begin{enumerate}
\item $ \Theta $ is compact;
\item $ q(W_i, b) $ is continuous in $ b $ for all $ W_i $;
\item $ q(W_i, b) $ is measurable in $ W_i $ for all $ b $;
\item $ \E_P [\sup_{b \in \Theta} | q(W_i, b)| ] < \infty $.
\end{enumerate}
Then $ \tfrac{1}{n} \sum_{i} q(W_i, b) $ converges uniformly to $ \E [q(W_i, b)] $, and $ \E [q(W_i, b)] $ is a continuous function of $ b $.
\litem{Asymptotically normal estimator}{Mahajan 3-2, 13--4} The sequence of estimators $ \{ \hat{\theta}_n \}_i $ is ($ \sqrt{n} $) asymptotically normal iff $ \sqrt{n} (\hat{\theta}_n - \theta(P)) \xrightarrow{\text{d}} \n (0, V(P)) $ for some symmetric positive definite matrix $ V(P) $ (somewhat inaccurately referred to as the asymptotic variance of $ \hat{\theta}_n $).
Suppose that
\begin{enumerate}
\item $ \hat{\theta}_n $ is consistent for $ \theta $;
\item $ \theta \in \operatorname{interior} (\Theta) $;
\item $ Q_n(b) $ is twice continuously differentiable in a neighborhood $ \mathcal{N} $ of $ \theta $;
\item $ \sqrt{n} \frac{\partial Q_n (\theta)}{\partial \theta} \xrightarrow{\text{d}} \n (0, \mSigma) $;
\item \textit{Uniform convergence of the Hessian:} There exists a matrix $ H(b) $ that is continuous and nonsingular at $ \theta $ such that
$$ \sup_{b \in \mathcal{N}} \left \lVert \frac{\partial^2 Q_n (b)}{\partial b \, \partial b'} - H(b) \right \rVert \xrightarrow{\text{p}} 0 . $$
\end{enumerate}
Then $ \sqrt{n} (\hat{\theta}_n - \theta) \xrightarrow{\text{d}} \n (0, H(\theta)^{-1} \mSigma H(\theta)^{-1}) $.
\litem{Asymptotic variance}{C\&B 10.1.9} If $ k_n [\hat{\theta}_n - \theta] \xrightarrow{\text{d}} \n (0, \sigma^2) $ for some sequence of constants $ \{ k_n \} $, then $ \sigma^2 $ is the asymptotic variance.
\litem{Asymptotically efficient estimator}{C\&B 10.1.11--2} $ \{ \hat{\theta}_n \} $ is asymptotically efficient if $ \sqrt{n} [\hat{\theta}_n - \theta] \xrightarrow{\text{d}} \n (0, \sigma^2) $ and $ \sigma^2 $ is the CRLB.
Under regularity conditions, the MLE is consistent and asymptotically efficient.
\litem{Maximum Likelihood estimator}{Mahajan 2-4--10, 36; Hayashi 448--9, 463--5}\index{MLE (Maximum Likelihood estimator)} $ \hat{\theta} \equiv \argmax_{b \in \Theta} L (\mX, b) $. Equivalently, for iid data, $ \hat{\theta} = \argmax_{b \in \Theta} \tfrac{1}{n} \sum \ln p(X_i, b) $.\footnote{This is ``quasi-ML'' if used for non-iid data. It \textit{can} be consistent even for (non-iid) ergodic stationary processes---see Hayashi 464--5.} Estimating $ \theta = \argmax_{b \in \Theta} \E_{P_\theta} \ln p(\mX, b) $. An M-Estimator with $ q(X_i, b) \equiv - \ln p(X_i, b) $. Note:
\begin{enumerate}
\item The identification condition is that the parameter being estimated is identified.
\item MLE need not always exist, and if they do, need not be unique.
\item We may not be able to get a closed-form expression for $ \hat{\theta} $, and therefore have to characterize it as the solution to a maximization problem (or its FOCs).
\item The expected value of (log) likelihood is uniquely maximized at the true parameter as long as $ \theta $ is identified; i.e., the Kullback-Leibler Divergence\index{Kullback-Leibler Divergence}
$$ K(b, \theta) \equiv \E_{\theta} \left[ \ln \left( \frac{p(\mX, \theta)}{p(\mX, b)} \right) \right] > 0 \, \text{for all $ b \neq \theta $,} $$
or equivalently, if $ p(\mx, b) = p(\mx, \theta) $ for all $ \mx $ implies that $ b = \theta $.
\item (Invariance property) If $ \hat{\theta} $ is an MLE of $ \theta $, then $ h(\hat{\theta}) $ is an MLE of $ h(\theta) $.
\end{enumerate}
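Standard worked example: for an iid sample from $ \n (\mu, \sigma^2) $, maximizing $ \tfrac{1}{n} \sum \ln p(X_i, b) $ gives
$$ \hat{\mu} = \bar{X}_n , \qquad \hat{\sigma}^2 = \frac{1}{n} \sum_i (X_i - \bar{X}_n)^2 , $$
and by the invariance property the MLE of $ \sigma $ is $ \sqrt{\hat{\sigma}^2} $.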
\litem{Consistency for MLE}{Hayashi 463--465; Mahajan 3-8--10} Let $ \{ y_t, \mx_t \} $ be ergodic stationary, and let $ \hat{\theta} $ be the conditional MLE that maximizes the average log conditional likelihood (derived under the assumption that $ \{ y_t, \mx_t \} $ is iid).
Suppose conditions (specified on Hayashi 464) allow us to apply a general consistency theorem. Then $ \hat{\theta} \xrightarrow{\text{p}} \theta_0 $ \textit{despite} the fact that the MLE was derived under the iid assumption.
\litem{Asymptotic normality for MLE}{Mahajan 3-20--2; C\&B 10.1.12; Hayashi 474--6; D\&M 258, 260--3, 270--4} Suppose $ \{ W_i \} \equiv \{ Y_i, Z_i \} $ is an iid sample, that $ Z $ is ancillary for $ \theta $, that $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} \tfrac{1}{n} \sum \log p (Y_i | Z_i, b) \equiv \argmax_{b \in \Theta} Q_n(b) $. Define score $ s(W_i, b) \equiv \frac{\partial \log p(Y_i | Z_i, b)}{\partial b} $ and Hessian $ H(W_i, b) \equiv \frac{\partial s (W_i, b)}{\partial b} = \frac{\partial^2 \log p (Y_i | Z_i, b)}{\partial b \, \partial b'} $. Suppose:
\begin{enumerate}
\item $ \hat{\theta}_n $ is consistent for $ \theta $ [this generally fails either because the number of parameters increases with the sample size, or because the model is not asymptotically identified, even if it is identified by any finite sample];
\item $ \theta \in \operatorname{interior} (\Theta) $;
\item $ p(Y|Z, b) $ is twice continuously differentiable in $ b $ for any $ (Y, Z) $;
\item $ \E [s(W, \theta)] = 0 $ and $ -\E [H(W, \theta)] = \E [s(W, \theta) s(W, \theta)'] $ (this is stated as an assumption, but is the Information Equality and hence holds if its requirements do);
\item $ \frac{1}{\sqrt{n}} \sum s(W_i, \theta) \xrightarrow{\text{d}} \n (0, \mSigma) $ for some $ \mSigma > 0 $;
\item $ \E ( \sup_{b \in \mathcal{N}} \lVert H(W, b) \rVert ) < \infty $, which implies via ULLN that
$$ \sup_{b \in \mathcal{N}} \biggl \lVert \frac{1}{n} \sum_{i=1}^{n} H(W_i, b) - \E [H(W, b)] \biggr \rVert \xrightarrow{\text{p}} 0 ; $$
\item $ \E [H(W, \theta)] $ is nonsingular (only required at true parameter).
\end{enumerate}
Then $ \hat{\theta}_n $ is asymptotically normal with variance $ \mathcal{I}(\theta)^{-1} $ (note this is the Fisher information for \textit{one} observation, not the joint distribution). The MLE is not necessarily unbiased, but in the limit the variance attains the CRLB; hence the MLE is \textit{asymptotically efficient}. No GMM estimator can achieve lower variance.
Estimation of asymptotic variance can either be done by estimating the Hessian or the score (either gives a consistent expression using the Fisher Information Equality).
\litem{M-estimator}{Mahajan 2-15--6; Hayashi 447} $ \hat{\theta} \equiv \argmin_{b \in \Theta} \tfrac{1}{n} \sum q(X_i, b) $ (assuming iid data). Estimating $ \theta = \argmin_{b \in \Theta} \E_{P} q(\mX, b) $.
\begin{enumerate}
\item MLE is an M-Estimator with $ q(X_i, b) \equiv - \ln L(X_i, b) $.
\item Sample mean is an M-Estimator with $ q(X_i, b) \equiv (X_i - b)^2 $.
\end{enumerate}
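For instance, the sample mean solves the FOC of its M-estimation problem:
$$ \frac{d}{db} \, \frac{1}{n} \sum_i (X_i - b)^2 = - \frac{2}{n} \sum_i (X_i - b) = 0 \iff b = \bar{X}_n . $$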
\litem{Asymptotic normality for M-estimator}{Mahajan 3-14--7; Hayashi 470--4; D\&M 593} Suppose $ \{ W_i \} $ is an iid sample, that $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} \tfrac{1}{n} \sum q(W_i, b) \equiv \argmax_{b \in \Theta} Q_n(b) $. Define ``score'' $ s(W_i, b) \equiv \frac{\partial q (W_i, b)}{\partial b} $ and Hessian $ H(W_i, b) \equiv \frac{\partial s (W_i, b)}{\partial b} = \frac{\partial^2 q (W_i, b)}{\partial b \, \partial b'} $. Suppose:
\begin{enumerate}
\item $ \hat{\theta}_n $ is consistent for $ \theta $;
\item $ \theta \in \operatorname{interior} (\Theta) $;
\item $ q(W,b) $ is twice continuously differentiable in $ b $ for any $ W $;
\item $ \frac{1}{\sqrt{n}} \sum s(W_i, \theta) \xrightarrow{\text{d}} \n (0, \mSigma) $ for some $ \mSigma > 0 $;\footnote{If $ \{ W_i \} $ is non-iid ergodic stationary, then $ \mSigma $ is the long-run variance of $ \{ s(W_i, \theta) \} $; Gordin's conditions are sufficient for this convergence.}
\item $ \E ( \sup_{b \in \mathcal{N}} \lVert H(W, b) \rVert ) < \infty $, which implies via ULLN that
$$ \sup_{b \in \mathcal{N}} \biggl \lVert \frac{1}{n} \sum_{i=1}^{n} H(W_i, b) - \E [H(W, b)] \biggr \rVert \xrightarrow{\text{p}} 0 ; $$
\item $ \E [H(W, \theta)] $ is nonsingular (only required at true parameter).
\end{enumerate}
Then $ \hat{\theta}_n $ is asymptotically normal with variance $ ( \E [H(W, \theta)])^{-1} \mSigma ( \E [H(W, \theta)])^{-1} $.
``Under appropriate conditions for a ULLN to apply,'' this variance is estimated by:
\begin{multline*}
\widehat{V} = \left[ \tfrac{1}{n} \sum H(W_i, \theta_n) \right]^{-1}
\underbrace{\left[ \tfrac{1}{n} \sum s(W_i, \theta_n) s(W_i, \theta_n)' \right]}_{\widehat{\mSigma}} \\
\left[ \tfrac{1}{n} \sum H(W_i, \theta_n) \right]^{-1}.
\end{multline*}
\litem{Method of Moments estimator}{Mahajan 2-19--20} Suppose iid data from a parametric model with $ \theta \in \R^d $ identified and the first $ d $ moments of $ P_\theta $ exist: $ \{m_j(\theta) \}_{j=1}^{d} \equiv \{\E_\theta X^j \}_{j=1}^{d} $. The Method of Moments estimator sets population moments equal to sample moments:
$$ m_j(\hat{\theta}) = \widehat{m}_j \equiv \frac{1}{n} \sum_{i=1}^{n} X_i^j \, \text{for all $ j \in \{1, \dotsc , d \} $.} $$
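Standard example ($ d = 1 $): for iid exponential data with rate $ \theta $, $ m_1 (\theta) = \E_\theta X = 1 / \theta $, so matching the first moment gives
$$ 1 / \hat{\theta} = \bar{X}_n \iff \hat{\theta} = 1 / \bar{X}_n . $$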
\litem{Generalized Method of Moments estimator}{Mahajan 2-20--1; Hayashi 447, 468}\index{GMM (Generalized Method of Moments)} GMM estimates parameter $ \theta \in \R^d $ satisfying $ \E [\mathbf{m}(\mX, \theta)] = 0 $ where $ \mathbf{m} \colon \mathcal{X} \times \Theta \to \R^m $ a vector of $ m $ moment conditions. If $ \theta $ is identified, it is the unique solution to the moment conditions.
When $ m \geq d $ we typically can't find a $ \hat{\theta} $ satisfying all moment conditions, so instead we seek $ \hat{\theta} = \argmin_{b \in \Theta} Q_n (b) $ where
$$ Q_n(b) \equiv \left( \tfrac{1}{n} \sum m(X_i, b) \right)' S \left( \tfrac{1}{n} \sum m(X_i, b) \right) $$
for some specified weight matrix $ S_{m \times m} $ symmetric and positive definite. The quadratic form in $ S $ defines a norm. Correct specification requires orthogonality condition $ \E [m(X_i, \theta)] = 0 $.
Extremum estimators can typically be thought of as GMM estimators when we characterize them by FOCs. Includes MLE and M-estimators that can be characterized by FOCs.
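Standard example: linear IV fits this framework with $ m(W_i, b) = (Y_i - X_i' b) Z_i $ (writing regressors and instruments as column vectors). In the just-identified case $ m = d $ with $ \tfrac{1}{n} \sum Z_i X_i' $ invertible, the sample moment conditions can be solved exactly, for any weight matrix, giving the IV estimator
$$ \hat{\theta} = \Bigl( \tfrac{1}{n} \sum_i Z_i X_i' \Bigr)^{-1} \Bigl( \tfrac{1}{n} \sum_i Z_i Y_i \Bigr) . $$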
\litem{Consistency for GMM estimators}{Mahajan 3-10--1; Hayashi 467--8} Let $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} Q_n (b) $ (as above), where
$$ Q_n(b) = - \frac{1}{2} \biggl ( \frac{1}{n} \sum_{i=1}^{n} m(X_i, b) \biggr )' S_n \biggl ( \frac{1}{n} \sum_{i=1}^{n} m(X_i, b) \biggr ). $$
The true parameter satisfies $ \E_P [m(W, \theta)] = 0 $ and hence uniquely maximizes the limit function $ Q_0 (b) = -\tfrac{1}{2} \E [m(W,b)]' S \E [m(W,b)] $. Suppose that:
\begin{enumerate}
\item $ \Theta $ is a \textit{compact} subset of $ \R^d $;
\item $ m (b) $ is continuous in $ b $ for any realization of the data;
\item $ m (b) $ is a measurable function of the data for all $ b \in \Theta $ (this ensures that $ \hat{\theta} $ is measurable);
\item The weight matrices $ S_n $ converge in probability to some symmetric positive definite matrix $ S $.
\end{enumerate}
Suppose further that:
\begin{enumerate}
\item \textit{Identification}: $ \E [m(W, b)] \neq 0 $ for any $ b \neq \theta $;
\item \textit{Dominance to ensure ULLN applies}: $ \E [ \sup_{b \in \Theta} \lVert m(W, b) \rVert] < \infty $.
\end{enumerate}
Then $ \hat{\theta}_n \xrightarrow{\text{p}} \theta $.
Showing identification and dominance for nonlinear GMM is quite difficult and usually just assumed. If objective function is concave, we can replace compact $ \Theta $; continuous, measurable $ m $; and dominance by requirement that $ \E [m(W,b)] $ exist and be finite for all $ b \in \Theta $.
\litem{Asymptotic normality for GMM estimator}{Mahajan 3-24--6; Hayashi 478--81} Let $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} - \frac{1}{2} [m_n(b)]' W_n [m_n(b)] \equiv \argmax_{b \in \Theta} Q_n(b) $, where $ m_n(b) \equiv \tfrac{1}{n} \sum m(X_i, b_{d \times 1})_{m \times 1} $. Define the Jacobian $ M_n(b)_{d \times m} \equiv \tfrac{1}{n} \sum \frac{\partial m(X_i, b)}{\partial b} = \frac{\partial m_n(b)}{\partial b} $. Suppose:
\begin{enumerate}
\item The matrix $ M_n(\hat{\theta}_n) W_n M_n (\bar{b}_n)' $ is invertible (where $ \bar{b}_n $ is a point between $ \theta $ and $ \hat{\theta}_n $ for which a mean value expansion holds);
\item $ \sqrt{n} m_n (\theta) \xrightarrow{\text{d}} \n (0, \E [m(X, \theta) m(X, \theta)']) \equiv \n (0, S(\theta)) $ (by a CLT);
\item $ M_n (b_n) \xrightarrow{\text{p}} \E [ \frac{\partial m(X, \theta)}{\partial b}] \equiv M(\theta) $ (by a ULLN);
\item $ W_n \xrightarrow{\text{p}} W $.
\end{enumerate}
Then $ \hat{\theta}_n $ is asymptotically normal with variance
$$ [M(\theta) W M(\theta)']^{-1} M(\theta) W S(\theta) W M(\theta)' [M(\theta) W M(\theta)']^{-1}. $$
If we choose $ W = S(\theta)^{-1} $ (the efficient choice), the asymptotic variance reduces to $ [M(\theta) S(\theta)^{-1} M(\theta)']^{-1} $.
Assuming conditions for ULLN apply, we can estimate terms using consistent estimators: $ \widehat{S} \equiv \tfrac{1}{n} \sum m(X_i, \hat{\theta}_n) m(X_i, \hat{\theta}_n)' $ and $ \widehat{M} \equiv \tfrac{1}{n} \sum \frac{\partial m(X_i, \hat{\theta}_n)}{\partial b} $.
\litem{Efficient GMM}{Mahajan 3-26--7; Hayashi 212--3} Given above GMM estimator, if $ W^* = S(\theta)^{-1} $ (the inverse of the variance of the moment conditions), then the asymptotic variance reduces to $ V_e = [M(\theta) S(\theta)^{-1} M(\theta)']^{-1} $.
This is the lowest variance (in the matrix sense) that can be achieved. Therefore the optimal choice of weighting matrices is any sequence of random matrices that converge to $ S^{-1} $. A natural choice is to develop preliminary estimates $ \tilde{\theta}_n \xrightarrow{\text{p}} \theta $ (often using the identity as weighting matrix) and generating
$$ W_n = \left[ \tfrac{1}{n} \sum m(X_i, \tilde{\theta}_n) \, m(X_i, \tilde{\theta}_n)' \right]^{-1}. $$
Under conditions necessary to implement ULLN, $ W_n \xrightarrow{\text{p}} S^{-1} $.
\litem{Minimum Distance estimator}{Mahajan 2-23--4} Estimator $ \hat{\theta}_n \equiv \argmin_{b \in \Theta} g_n(b)' S_n g_n(b) $ for some square weight matrix $ S_n $. GMM is a special case where $ g_n $ are sample averages of some function.
\litem{Asymptotic normality for MD estimator}{Mahajan 3-22--4} Let $ \hat{\theta}_n \equiv \argmax_{b \in \Theta} - \frac{1}{2} [g_n(b)]' W_n [g_n(b)] \equiv \argmax_{b \in \Theta} Q_n(b) $. Define Jacobian [transpose of way we usually write derivatives?] $ G_n(b) \equiv \frac{\partial g_n(b)}{\partial b} $. Suppose:
\begin{enumerate}
\item The matrix $ G_n(\hat{\theta}_n) W_n G_n (\bar{b}_n)' $ is invertible (where $ \bar{b}_n $ is a point between $ \theta $ and $ \hat{\theta}_n $ for which a mean value expansion holds);
\item $ \sqrt{n} g_n (\theta) \xrightarrow{\text{d}} \n (0, S(\theta)) $ (by a CLT);
\item $ G_n (b_n) \xrightarrow{\text{p}} G(\theta) $ (by a ULLN);
\item $ W_n \xrightarrow{\text{p}} W $.
\end{enumerate}
Then $ \hat{\theta}_n $ is asymptotically normal with variance
$$ [G(\theta) W G(\theta)']^{-1} G(\theta) W S(\theta) W G(\theta)' [G(\theta) W G(\theta)']^{-1}. $$
If we choose $ W = S(\theta)^{-1} $ (the efficient choice), the asymptotic variance reduces to $ [G(\theta) S(\theta)^{-1} G(\theta)']^{-1} $.
\litem{Uniformly minimum variance unbiased estimator}{Mahajan 2-27--9, Metrics P.S. 6-1; C\&B 7.3.7, 17, 19--20, 23, 7.5.1}\index{UMVUE (uniformly minimum variance unbiased estimator)}\index{Best unbiased estimator}\index{BUE (best unbiased estimator)} An unbiased estimator $ \phi (X) $ of a quantity $ g(\theta) $ is a UMVUE (a.k.a.\ best unbiased estimator) iff $ \phi $ has finite variance and for every unbiased estimator $ \delta (X) $ of $ g(\theta) $, we have $ \Var \phi (X) \leq \Var \delta (X) $ for all $ \theta $. Note:
\begin{enumerate}
\item Unbiased estimators may not exist;
\item Not every unbiased estimator is a UMVUE;
\item If a UMVUE exists, it is unique;
\item (Rao-Blackwell)\index{Rao-Blackwell Theorem} If $ h(X) $ is an unbiased estimator of $ g(\theta) $, and $ T(X) $ is a sufficient statistic for $ \theta $, then $ \phi(T) \equiv \E [h(X)|T] $ is unbiased for $ g(\theta) $, and has variance (weakly) lower than $ h(X) $ for all $ \theta $---means we only need consider statistics that are functions of the data only through sufficient statistics;
\item If $ \phi(T) $ is an unbiased estimator of $ g(\theta) $ and is a function of a complete statistic $ T(X) $, then all other unbiased estimators that are functions of $ T $ are equal to $ \phi(T) $ almost everywhere;
\item (Lehmann-Scheff\'{e})\index{Lehmann-Scheff\'{e} Theorem} If $ \phi(T) $ is a function (only) of a complete sufficient statistic $ T(X) $, then $ \phi(T) $ is the unique UMVUE of $ \E \phi(T) $;
\item (Hausman Principle)\index{Hausman Principle} $ W $ is a UMVUE for $ \E W $ iff it is uncorrelated with every unbiased estimator of $ 0 $ (practically, impossible to prove, except for the case where $ W $ is a function only of a complete statistic).
\end{enumerate}
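Standard application of Lehmann-Scheff\'{e}: for an iid sample from $ \n (\mu, \sigma^2) $ (both parameters unknown), $ T(\mX) = (\sum_i X_i , \sum_i X_i^2) $ is complete and sufficient, and
$$ \phi (T) = \tfrac{1}{n} \sum_i X_i = \bar{X}_n , \qquad \E_{(\mu, \sigma^2)} \bar{X}_n = \mu , $$
so $ \bar{X}_n $ is the unique UMVUE of $ \mu $.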
\litem{Fisher Information}{Mahajan 2-30--1, Metrics P.S. 6-3}\index{Score}
$$ \mathcal{I}(\theta) \equiv \E_\theta \biggl [ \underbrace{ \Bigl ( \tfrac{\partial}{\partial \theta} \ln f(x, \theta ) \Bigr )}_{\text{Score}} \Bigl ( \tfrac{\partial}{\partial \theta} \ln f(x, \theta) \Bigr )' \biggr ]. $$
Fisher information for an iid sample is $ n $ times information for each individual observation. For (univariate) normal,
$$ \mathcal{I}(\mu, \sigma^2) = \begin{bmatrix}
\sigma^{-2} & 0 \\
0 & \frac{1}{2} \sigma^{-4}
\end{bmatrix} ; \quad
\mathcal{I}(\mu, \sigma^2)^{-1} = \begin{bmatrix}
\sigma^{2} & 0 \\
0 & 2 \sigma^{4}
\end{bmatrix} . $$
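As a check on the $ (1,1) $ entry: the score for $ \mu $ is $ \tfrac{\partial}{\partial \mu} \ln f(x, \mu, \sigma^2) = (x - \mu) / \sigma^2 $, so
$$ \E \bigl[ (X - \mu)^2 / \sigma^4 \bigr] = \sigma^2 / \sigma^4 = \sigma^{-2} . $$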
%Cramer-Rao
\litem{Cram\'{e}r-Rao Inequality}{Mahajan 2-30--1, 5; C\&B 7.3.9--11, 15}\index{CRLB (Cram\'{e}r-Rao lower bound)} Given a sample $ \mX \sim f(\mx | \theta) $ and an estimator $ \phi (\mX) $ with $ \E_\theta \phi (\mX) = g (\theta) $, suppose that
\begin{enumerate}
\item The support does not depend on $ \theta $;
\item pdf $ f(\mx | \theta ) $ is differentiable in $ \theta $ almost everywhere;
\item $ \E_\theta |\phi| < \infty $ (or per C\&B, $ \Var_\theta \phi < \infty $);
\item The operations of differentiation and integration can be interchanged in $ \frac{d}{d \theta} \int \phi (\mx) f(\mx, \theta) \, d \mx $;
\item Fisher Information $ \mathcal{I}(\theta) $ is nonsingular. Note under previous conditions,
\begin{enumerate}
\item $ \mathcal{I} (\theta) = \Var_\theta [\tfrac{\partial}{\partial \theta} \ln f(\mx, \theta) ] $;