\chapter{Statistics}
\vspace{15pt}
The notion of a statistic was introduced by Fisher (1920).\\
The importance of sufficiency is that it appears in every kind of statistical decision (point estimation, testing, confidence bounds).
\begin{defi}
Let $X=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$ for some $\theta \in \Theta$ unknown.\\
We say that $T_n=T(X)$ is \textbf{sufficient for the parameter $\theta$} if the conditional distribution of $\uX$ given $T_n$ does not depend on $\theta$, i.e. defined:
\begin{itemize}
\item $f_{\uX|T_n=t}(\ux;t,\theta)$ the conditional distribution of $\uX$ given $T_n$
\item $h_{\uX,T_n}(\ux,t,\theta)$ the joint distribution of $\uX$ and $T_n$
\item $g_{T_n}(t,\theta)$ the marginal distribution of $T_n$
\end{itemize}
then $T_n$ is \textbf{sufficient for the parameter $\theta$} if and only if $f_{\uX|T_n=t}(\ux;t,\theta)$ does not depend on $\theta$.\\
Note that
\[
f_{\uX|T_n=t}(\ux;t,\theta)=\frac{h_{\uX,T_n}(\ux,t,\theta)}{g_{T_n}(t,\theta)}
\]
\end{defi}
\begin{eg} \label{eg:ber}
$(X_1... X_n)\in \{0,1\}^n$ from a $Ber(\theta)$, $\theta \in (0,1)$.\\
Define $T_n=\sum_{i=1}^{n}X_i$, then we want to verify if $T_n$ is sufficient.\\
\begin{itemize}
\item $f_{\uX}(\ux;\theta) = \prod_{i=1}^{n} f_{{X_i}}({x_i};\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$
\item $g_{T_n}(t,\theta)={{n}\choose{t}}\theta^t(1-\theta)^{n-t}\mathbbm{1}_{\{0,1,\dots,n\}}(t)$
\item $h_{\uX,T_n}(\ux,t,\theta)=\p(\uX=\ux,T_n=t)=\theta^t(1-\theta)^{n-t}$
\end{itemize}
so
$$f_{\uX|T_n=t}=\frac{\theta^t(1-\theta)^{n-t}}{{{n}\choose{t}}\theta^t(1-\theta)^{n-t}}=\frac{1}{{{n}\choose{t}}}$$
So $T_n$ is a sufficient statistic for $\theta$.\\
This is a particularly simple case because the joint distribution of the sample already depends on $\ux$ only through $T_n$.
\end{eg}
\begin{oss}
If $T_n$ is sufficient for $\theta$ then all the statistical information about $\theta$ contained in the random sample is carried by $T_n$. In the example above, to make inference about $\theta$ we only need $\sum_{i=1}^{n}X_i$.\\
\end{oss}
\begin{oss}
The notion of sufficiency derives from the probability structure of the parametric family $X\sim f_X(x;\theta)$. We can talk about sufficiency for a parameter $\theta$ only after we have specified $X\sim f_X(x;\theta)$.
\end{oss}
The definition of sufficiency based on conditional probability is not of practical use because we need these two distributions $
\begin{cases}
g_{T_n}(\cdot)\\
h_{\uX,T_n}(\cdot,\cdot)
\end{cases}
$ that can be difficult to find.
To avoid that we could use a corollary of the \textit{Fisher Factorization Theorem}:
\begin{corol}\label{corol:Savage}
Let $\uX=(X_1... X_n)$ be a random sample from $X\sim f_X(x;\theta)$. Then a statistic $T_n$ is sufficient for $\theta$ if and only if there exist two non-negative functions $g(\cdot), h(\cdot)$ such that $\lf=g(T(\ux);\theta)h(\ux)$
\end{corol}
\begin{oss}
\begin{itemize}
\item $g$ is a function of the observed sample via $T_n$
\item $h$ is a function of the observed sample and does not depend on $\theta$
\end{itemize}
\end{oss}
\begin{eg}
Recall the example \ref{eg:ber}
$(X_1... X_n)\in \{0,1\}^n$ from a $Ber(\theta)$, $\theta \in (0,1)$.\\
Define $T_n=\sum_{i=1}^{n}X_i$, then we want to verify if $T_n$ is sufficient. \\
We have
$$f_{\uX}(\ux;\theta) = \prod_{i=1}^{n} f_{{X_i}}(x_i;\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$$
hence we can apply corollary \ref{corol:Savage} using
\begin{enumerate}
\item $h(\ux)=1$
\item $g(T_n(\ux);\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$
\end{enumerate}
\end{eg}
\begin{eg}\label{eg:gauss}
$(X_1... X_n)\in \mathbb{R}^n$ from a $N(\theta,1)$. We want to verify that $T_n=\sum_{i=1}^{n} X_i$ is a sufficient statistic:
\[
\begin{split}
\lf
&=\prod_{i=1}^{n}\frac{1}{\sqrt{2 \pi}} \exp \bigg\{ -\frac{1}{2} (x_i-\theta)^2 \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}(x_i-\theta)^2 \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2 \bigg\} \exp \bigg\{- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}
\end{split}
\]
so
\begin{itemize}
\item $h(\ux)=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2 \bigg\}$
\item $g(\sum_{i=1}^n x_i,\theta) = \exp \bigg\{- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}$
\end{itemize}
\end{eg}
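The same factorization argument applies to other standard models; here is a sketch for the Poisson case.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $X\sim Poi(\theta)$, $\theta>0$, and take $T_n=\sum_{i=1}^{n}X_i$. Then
\[
f_{\uX}(\ux;\theta)=\prod_{i=1}^{n}\frac{\theta^{x_i}e^{-\theta}}{x_i!}=\underbrace{\theta^{\sum_{i=1}^{n}x_i}e^{-n\theta}}_{g(T_n(\ux);\theta)}\ \underbrace{\frac{1}{\prod_{i=1}^{n}x_i!}}_{h(\ux)}
\]
so by corollary \ref{corol:Savage} $T_n$ is sufficient for $\theta$.
\end{eg}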
\begin{teo}\textbf{Fisher Theorem}\\
If $\lfd$ is the joint density function or the joint probability mass function of $\uX$ and $q(t;\theta)$ is the density function or the probability mass function of $T_n(\uX)$, then $T_n(\uX)$ is sufficient for $\theta$ if for every point in the sample space, the ratio
$$\frac{\lfd}{q(t;\theta)}$$
is a constant function of $\theta$.
\end{teo}
\begin{proof}
MISSING
\end{proof}
We can now see the proof of corollary \ref{corol:Savage}.
\begin{corol}\textbf{Savage}\\
Let $\lfd$ be the joint PDF or PMF of a random sample $\uX=(X_1... X_n)$. A statistic $T_n$ is sufficient for $\theta$ if and only if there exist two non negative functions $g(t,\theta), h(\ux)$ such that for all $\ux$ in the sample space and for all $\theta\in \Theta$
$$\lfd=g(T(\ux);\theta)h(\ux)$$
\end{corol}
\begin{proof}
We are going to prove the theorem only in the discrete settings.\\
\begin{itemize}
\item["$\Rightarrow$"] Suppose that $T(\uX)$ is sufficient for $\theta$.\\
Define:
\begin{itemize}
\item $g(t,\theta):=\p(T(\uX)=t)$
\item $h(\ux):=\p\bigg(\uX=\ux \bigg|T(\uX)=T(\ux)\bigg)$
\end{itemize}
Because $T(\uX)$ is sufficient for $\theta$ the conditional probability defining $h(\ux)$ does not depend on $\theta$. Hence the choice of $g(t,\theta)$ and $h(\ux)$ is legitimate and for this choice we have
\[
\begin{split}
\p(\uX=\ux)
&=\p(\uX=\ux \wedge T(\uX)=T(\ux))\\
&=\p(T(\uX)=T(\ux))\p(\uX=\ux|T(\uX)=T(\ux))\\
&=g(t,\theta)h(\ux)
\end{split}
\]
So we have the factorization and in particular we can see that
$$\p\bigg(T(\uX)=T(\ux)\bigg)=g(t,\theta)$$
$\implies g(T(\ux),\theta)$ is the PMF of $T(\uX)$.
\item["$\Leftarrow$"] We assume that the factorization holds.\\
Let $q(t,\theta)$ be the PMF of $T(\uX)$. We study the ratio
$$\frac{\lfd}{q(T(\ux);\theta)}$$
in particular define
$$A_{T(\ux)}=\{\underline y | T(\underline y)= T(\ux) \}$$
Then
\[
\begin{split}
\frac{\lfd}{q(T(\ux);\theta)}
&=\frac{g(T(\ux);\theta)h(\ux)}{q(T(\ux);\theta)}\\
&=\frac{g(T(\ux);\theta)h(\ux)}{\sum_{\underline y \in A_{T(\ux)}}{g(T(\ux);\theta)h(\underline y)}}\\
&=\frac{g(T(\ux);\theta)h(\ux)}{g(T(\ux);\theta) {\sum_{\underline y \in A_{T(\ux)}}h(\underline y)}}\\
&=\frac{h(\ux)}{\sum_{\underline y \in A_{T(\ux)}}h(\underline y)}
\end{split}
\]
This is constant \wrt $\theta$.\\
Then by the Fisher Theorem $T(\uX)$ is sufficient for $\theta$.
\end{itemize}
\end{proof}
\begin{eg}$(X_1... X_n)\in \mathbb{R}^n$ from a $N(\mu,\sigma^2)$, $\sigma^2$ known. As we did in example \ref{eg:gauss} we want to check whether $T_n=\frac{1}{n}\sum_{i=1}^{n} X_i$ is a sufficient statistic for $\mu$.
\[
\begin{split}
f_{\uX}(\ux;\mu,\sigma^2)
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}\\
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n}(x_i-\bar x_n +\bar x_n -\mu)^2 \bigg\}\\
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\bigg( \sum_{i=1}^{n} (x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 \bigg) \bigg\}\\
\end{split}
\]
we already know that the distribution of $T(\uX)=\bar X_n=\frac{1}{n}\sum_{i=1}^{n}X_i$ is $\bar X_n\sim N(\mu,\sigma^2/n)$.\\
So we can apply Fisher theorem to the ratio:
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\bigg( \sum_{i=1}^{n} (x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 \bigg) \bigg\}}{(2 \pi \sigma^2/n)^{-1/2}\exp \bigg\{- \frac{n}{2\sigma^2} (\bar x_n -\mu)^2 \bigg\}}
\]
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\} \exp \bigg\{ -\frac{n}{2\sigma^2}(\bar x_n-\mu)^2 \bigg\}}{(2 \pi \sigma^2/n)^{-1/2}\exp \bigg\{- \frac{n}{2\sigma^2} (\bar x_n -\mu)^2 \bigg\}}
\]
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\} }{(2 \pi \sigma^2/n)^{-1/2}}
\]
Hence by Fisher Theorem $T_n=\frac{1}{n}\sum_{i=1}^{n} X_i$ is sufficient for $\mu$
\end{eg}
\begin{oss}
Until now we have found only one sufficient statistic for a fixed parametric model. However, many sufficient statistics can be defined.\\
For example the identity statistic $T(\ux)=\ux$ is always sufficient; indeed we can factorize the joint distribution $f_{\uX}(\ux;\theta)$ with
\begin{itemize}
\item $h(\ux)=1$
\item $g(T(\ux);\theta)=f_{\uX}(\ux;\theta)$
\end{itemize}
\end{oss}
\begin{oss}
Given one sufficient statistic, a way to produce more sufficient statistics is through a one-to-one function.\\
Suppose $T(\ux)$ is a sufficient statistic for $\theta$, and define $T^*(\ux)=r(T(\ux))$ where $r$ is a one-to-one function with inverse $r^{-1}$.\\
By Savage's Theorem there exist $g,h$ such that
$$\lf=g(T(\ux),\theta)h(\ux)=g(r^{-1}(r(T(\ux))),\theta)h(\ux)=g(r^{-1}(T^*(\ux)),\theta)h(\ux)$$
So defining $g^*(t,\theta)=g(r^{-1}(t),\theta)$ we have that
$$\lf=g^*(T^*(\ux),\theta)h(\ux)$$
$\implies$ by Savage Theorem we have that $T^*(\ux)$ is a sufficient statistic.
\end{oss}
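For instance, in example \ref{eg:gauss} we saw that $T(\ux)=\sum_{i=1}^{n}x_i$ is sufficient for $\theta$ in the $N(\theta,1)$ model; taking the one-to-one map $r(t)=t/n$, the sample mean
$$T^*(\ux)=r(T(\ux))=\bar x_n$$
is therefore also a sufficient statistic for $\theta$.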
We saw that in principle we can define many sufficient statistics, so it is natural to define a tool that allows us to decide when one sufficient statistic is better than another.\\
Recall that the purpose of a statistic is to achieve data reduction without loss of information.\\
Therefore a statistic that achieves the most data reduction while still retaining all of the information about $\theta$ might be preferable.
\begin{oss}
We saw in example \ref{eg:gauss} that if $(X_1... X_n)\in \mathbb{R}^n$ comes from a $N(\theta,1)$, then $T(\ux)=\bar x_n$ is a sufficient statistic (being a one-to-one function of $\sum_{i=1}^{n} x_i$). Instead of $\bar x_n$ we could use $T'(\ux)=\bigg( \bar x_n, S^2_n \bigg)$, where $S^2_n$ is the sample variance. Clearly $T(\ux)$ is a greater data reduction than $T'(\ux)$, since we do not need to know the sample variance in order to make inference about $\theta$. Moreover we can write $T(\ux)$ as a function of $T'(\ux)$ by defining the function $r(a,b)=a$; then we can write
$$T(\ux)=\bar x_n=r(\bar x_n, S^2_n)=r(T'(\ux))$$
Since $T(\ux)$ and $T'(\ux)$ are both sufficient, they contain the same information about $\theta$.
In other words, the additional information given by the sample variance is null.
\end{oss}
\begin{defi}
A sufficient statistic $T(\ux)$ is called \textbf{minimal} if for any other sufficient statistic $T'(\ux)$, $T(\ux)$ is a function of $T'(\ux)$.
\end{defi}
NOTE:\\
To say that $T(\ux)$ is a function of $T'(\ux)$ simply means that if $T'(\ux)=T'(\uy)$ then $T(\ux)=T(\uy)$.\\
In other terms, if $\{ B_t\} $, with $B_t:=\{\uy : T'(\uy)=t \}$, is the partition of the sample space induced by $T'$ and $\{ A_t\} $, with $A_t:=\{\uy : T(\uy)=t \}$, is the partition induced by $T$, then every $B_t$ is contained in some $A_{t'}$.\\
$\implies$ the partition of the sample space induced by a minimal sufficient statistic is the coarsest one, i.e. the one with the smallest number of sets.
\begin{teo}\textbf{Lehmann and Scheffé}\\
Let $\lfd$ be the joint density function or joint probability mass function of a \rs \ $\uX=(X_1...X_n)$. Suppose there exists a function $T$ such that for any two sample points $\ux$, $\uy$ the ratio
\[
\frac{\lfd}{f_{\uX}(\uy;\theta)}
\]
is constant as a function of $\theta$ if and only if $T(\ux)=T(\uy)$.\\
Then $T$ is a minimal sufficient statistic for $\theta$.
\end{teo}
\begin{proof}
To simplify the proof we assume $\lfd > 0 \ \forall \ux, \forall \theta$.\\
First we show that $T(\ux)$ is sufficient.\\
Define $\Tau$ as the image of the sample space under the function $\st$:
$$\Tau:=\{ t:t=\st \text{ for some $\ux$ in the sample space} \}$$
Define $\{ A_t\}_{t\in\Tau} $ as the partition induced by $T$, where $A_t:=\{\uy : T(\uy)=t \}$.
For each $A_t$ choose and fix one element $\ux_t\in A_t$. For any point $\ux$ in the sample space, $\ux_{\st}$ is the fixed element of the set $A_{\st}$ containing $\ux$. Since $\ux$ and $\ux_{\st}$ are in the same set $A_{\st}$, we have $\st=T(\ux_{\st})$, so by the assumptions the ratio
\[
\frac{\lfd}{f_{\uX}(\ux_{\st};\theta)}
\]
does not depend on $\theta$. Thus we can define $h(\ux):=\frac{\lfd}{f_{\uX}(\ux_{\st};\theta)}$.\\
Then define the function $g(t,\theta):=f_{\uX}(\ux_{t};\theta)$, so we have:
\[
\lfd =\frac{f_{\uX}(\ux_{\st};\theta)\lfd}{f_{\uX}(\ux_{\st};\theta)}=g(\st,\theta) h(\ux)
\]
and by Savage Theorem $\st$ is sufficient for $\theta$.\\
Now we will show that $\st$ is minimal sufficient.\\
Let $T'(\ux)$ be another sufficient statistic. By Savage Theorem we know that there exist $h',g'$ such that
\[
\lfd=g'(T'(\ux),\theta)h'(\ux)
\]
Let $\ux,\uy$ be two sample points such that $T'(\ux)=T'(\uy)$ then we can study the ratio:
\[
\frac{\lfd}{f_{\uX}(\uy;\theta)}=\frac{g'(T'(\ux);\theta)h'(\ux)}{g'(T'(\uy);\theta)h'(\uy)}=\frac{h'(\ux)}{h'(\uy)}
\]
Since the ratio does not depend on $\theta$, by assumption (the other implication of the if and only if) we get $\st =T(\uy)$. So $T(\ux)$ is a function of $T'(\ux)$, and therefore $\st$ is minimal.
\end{proof}
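To see how the criterion is used in practice, here is a sketch for the Bernoulli sample of example \ref{eg:ber}.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, $\theta\in(0,1)$. For two sample points $\ux,\uy$
\[
\frac{f_{\uX}(\ux;\theta)}{f_{\uX}(\uy;\theta)}
=\frac{\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}}{\theta^{\sum_{i=1}^{n}y_i}(1-\theta)^{n-\sum_{i=1}^{n}y_i}}
=\bigg(\frac{\theta}{1-\theta}\bigg)^{\sum_{i=1}^{n}x_i-\sum_{i=1}^{n}y_i}
\]
which is constant as a function of $\theta$ if and only if $\sum_{i=1}^{n}x_i=\sum_{i=1}^{n}y_i$. Hence $T_n=\sum_{i=1}^{n}X_i$ is a minimal sufficient statistic for $\theta$.
\end{eg}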
\section{Estimators}
\begin{defi}
Suppose there is a fixed parameter $\theta$ that needs to be estimated. Then an \textbf{estimator} is a function that maps the sample space into the set of possible estimates of $\theta$. An estimator of $\theta$ is usually denoted by the symbol $\hat \theta$.
\end{defi}
Now we are going to introduce some definitions of \textit{"good"} estimators.
\begin{defi}
$T_n(\uX)$ is said to be \textbf{unbiased} for $\theta$ if $\e[T_n(\uX)]=\theta$
\end{defi}
\begin{oss}
We use the expected value to define a "good" estimator because of the linearity of the operator.
\end{oss}
\begin{defi}
\textbf{Bias}:
\[
Bias_\theta (T_n(\uX))=\e\big[T_n(\uX)\big]-\theta
\]
\end{defi}
When we ask an estimator to be unbiased we are basically requiring it to be centred around $\theta$.\\
Another quantity that gives us information about the goodness of an estimator is the variance. We can interpret the variance as a measure of the dispersion around the expected value, so before checking the variance we must be sure that the expected value coincides with $\theta$. In this scenario, the smaller the variance the better the estimator.
\begin{oss}
Variance is a good parameter to watch only if the estimator is unbiased
\end{oss}
To avoid this problem we can introduce the \textit{Mean Squared Error}
\begin{defi}\textbf{Mean Squared Error} (MSE)
\[
MSE(T_n(\uX)):=\e[(T_n(\uX)-\theta)^2]
\]
\end{defi}
The importance of this quantity comes from Chebyshev's inequality \ref{eq:Chebyshev}:
\[
\p(|T_n(\uX)-\theta|< k)\geq 1-\frac{\e[(T_n(\uX)-\theta)^2]}{k^2}
\]
Indeed we notice that the smaller the MSE, the greater is the lower bound on $\p(|T_n(\uX)-\theta|< k)$.
\begin{prop}
$$\e[(T_n(\uX)-\theta)^2]=Var(T_n(\uX))+Bias_\theta(T_n(\uX))^2$$
\end{prop}
\begin{proof}
\[
\begin{split}
\e[(T_n(\uX)-\theta)^2]
&=\e[(T_n(\uX) - \e[T_n(\uX)] + \e[T_n(\uX)]-\theta)^2 ]\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]+2\e[(T_n(\uX)-\e[T_n(\uX)])(\e[T_n(\uX)]-\theta)]\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]+2(\e[T_n(\uX)]-\theta)\underbrace{\e\big[T_n(\uX)-\e[T_n(\uX)]\big]}_{=0}\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]\\
&=Var(T_n(\uX))+Bias_\theta(T_n(\uX))^2
\end{split}
\]
\end{proof}
\begin{oss}
If $\e[T_n(\uX)]=\theta$ then $MSE(T_n(\uX))=Var(T_n(\uX))$.
\end{oss}
\begin{defi}
Let $X_1... X_n$ be a random sample from $X\sim f_X(x;\theta)$, and let $T_n'$ and $T_n''$ be estimators for $\theta$. We say $T_n'$ is \textbf{more efficient} than $T_n''$ if
$$MSE(T_n')<MSE(T_n'')$$
\end{defi}
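As a simple illustration of this comparison, consider the sample mean against a single observation.
\begin{eg}
Let $X_1... X_n$ be a random sample from $X\sim N(\mu,\sigma^2)$ and consider the two unbiased estimators $T_n'=\bar X_n$ and $T_n''=X_1$ of $\mu$. Then
\[
MSE(T_n')=Var(\bar X_n)=\frac{\sigma^2}{n}<\sigma^2=Var(X_1)=MSE(T_n'')
\]
for every $n>1$, so $\bar X_n$ is more efficient than $X_1$.
\end{eg}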
Usually we choose the estimator with the lower MSE even if it is biased.\\
\section{Properties of Estimators}
The problem with the MSE is that we cannot be sure that there exists a $T_n$ such that $MSE(\tn)$ is the lowest possible.\\
A solution to this comes from the following result.
\begin{teo}\label{teo:Cramer-Rao Bound}
\textbf{Cramer-Rao Bound}\\
Let $X=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$.\\
Then, under regularity conditions, for any estimator $\tn$ of $\theta$
\[
Var(\tn)\geq \frac{[1+b'(\theta)]^2}{\ifn}
\]
Where
\begin{itemize}
\item[$b(\theta)$] is the bias of $\tn$
\item[$\ifn$] is the Fisher Information
\end{itemize}
\end{teo}
\begin{proof}
Consider the estimator $\tn$.
\begin{itemize}
\item $\e[\tn]=\theta +b(\theta)$
\item $\der \e[\tn]=1+b'(\theta)$
\item $\e[V_n'(\theta)]=0 \leftarrow$ because we suppose our model regular
\end{itemize}
$\implies Cov(\tn, V_n'(\theta))=\e[\tn V_n'(\theta)] - \e[\tn]\e[V_n'(\theta)]=\e[\tn V_n'(\theta)]$
\[
\begin{split}
\e[\tn V_n'(\theta)]&
=\int_{\mathbb{R}^n}\tn V_n'(\theta) \lfd d\ux\\
&=\int_{\mathbb{R}^n}\tn \frac{f_{\uX}'(\ux,\theta)}{\lfd}\lfd d\ux\\
&=\int_{\mathbb{R}^n}\tn \der \lfd d\ux\\
&=\der \int_{\mathbb{R}^n}\tn \lfd d\ux\\
&=\der \e[\tn]\\
&=1+b'(\theta)
\end{split}
\]
So
\[
Cov(\tn,V_n'(\theta))=1+b'(\theta)
\]
We know that in general for $X,Y$ \rv s such that $\e[X]=\mu, \e[Y]=\nu, \e[X^2]<\infty, \e[Y^2]<\infty$, from Cauchy-Schwarz, it holds
\[
\begin{split}
(Cov(X,Y))^2&=\bigg( \e[(X-\mu)(Y-\nu)] \bigg)^2\\
&\leq \e[(X-\mu)^2] \e[(Y-\nu)^2]\\
&= Var(X)Var(Y)
\end{split}
\]
So replacing $X$ with $\tn$ and $Y$ with $V_n'(\theta)$ we obtain
\[
\begin{split}
Var(\tn)&\geq \frac{(Cov(\tn,V_n'(\theta)))^2}{Var(V_n'(\theta))}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\e[(V_n'(\theta)-\e[V_n'(\theta)])^2]}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\e[V_n'(\theta)^2]}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\ifn}\\
&=\frac{(1+b'(\theta))^2}{\ifn}
\end{split}
\]
\end{proof}
\begin{corol}
Under regularity conditions
\[MSE \geq \frac{(1+b'(\theta))^2}{\ifn} +b^2(\theta) \]
\end{corol}
\begin{proof}
Directly from Cramer-Rao Bound \ref{teo:Cramer-Rao Bound} remembering that $MSE(\tn)=Var(\tn)+b^2(\theta)$
\end{proof}
\begin{corol}
Let $\uX=(X_1... X_n)$ be a random sample from a regular model $X\sim f_X(x;\theta)$\\
If there exists an unbiased estimator $\tn$ for $\theta$ whose variance is equal to the Cramer-Rao bound, then $\tn$ is unique.
\end{corol}
\begin{proof}
Take $\tnu$, $\tnd$ unbiased estimators for $\theta$ such that
\[
Var(\tnu)=Var(\tnd)=\frac{1}{\ifn}
\]
Define $\tn := \frac{\tnu + \tnd}{2}$\\
$\e[\tn]=\frac{\e[\tnu]+\e[\tnd]}{2}=\frac{2}{2}\theta=\theta$\\
$\implies \tn$ is also unbiased for $\theta$\\
$\implies Var(\tn)\geq \frac{1}{\ifn}$
\[
\begin{split}
Var(\tn)
&=Var\bigg( \frac{\tnu + \tnd}{2} \bigg)\\
&=\frac{1}{4}\bigg[ Var(\tnu)+ Var(\tnd) +2 Cov(\tnu, \tnd) \bigg]\\
&=\frac{1}{4 }\bigg[ Var(\tnu)+ Var(\tnd) +2 Cov(\tnu, \tnd) \bigg]\frac{[Var(\tnu)Var(\tnd)]^{1/2}}{[Var(\tnu)Var(\tnd)]^{1/2}}\\
&=\frac{(1+Corr(\tnu,\tnd))}{2}\frac{1}{\ifn}
\end{split}
\]
\\MEMO:$|Corr(X,Y)|\leq 1$\\
Since by the Cramer-Rao bound $Var(\tn)\geq\frac{1}{\ifn}$, we must have $Corr(\tnu,\tnd)\geq 1 \implies Corr(\tnu,\tnd)= 1$.\\
$\implies \tnd=a+b\tnu$ for some constants $a,b$ with $b>0$.\\
Hence we must have $\theta=\e[\tnd]=\e[a+b\tnu]=a+b\e[\tnu]=a+b\theta$ for every $\theta\implies a=0,b=1$\\
$\implies \tnd=\tnu$
\end{proof}
\begin{defi}
Consider a regular model $X\sim f_X(x;\theta)$. We say that an unbiased estimator $\tn$ whose variance is
\[
Var(\tn)=\frac{1}{\ifn}
\]
is \textbf{efficient}.\\ Moreover we define the \textbf{efficiency} of an estimator as:
\[
Eff(\tn)=\frac{1}{Var(\tn)\ifn} \ \ \ \ \ \ \in [0,1]
\]
\end{defi}
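As a sketch of an efficient estimator, consider the Bernoulli model.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, $\theta\in(0,1)$, and take $\tn=\bar X_n$. Then $\e[\bar X_n]=\theta$, so $\bar X_n$ is unbiased, and since the Fisher information of this model is $\ifn=\frac{n}{\theta(1-\theta)}$ we have
\[
Var(\bar X_n)=\frac{\theta(1-\theta)}{n}=\frac{1}{\ifn}
\]
Hence $Eff(\bar X_n)=1$ and $\bar X_n$ is efficient for $\theta$.
\end{eg}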
\begin{oss}
\begin{enumerate}
\item We introduce (absolute) efficiency at the cost of assuming regularity for the parametric model.
\item The variance of an unbiased estimator $\tn$ of $\theta$ cannot be smaller than the Cramer-Rao bound. However we do not know whether there exists an estimator whose variance is equal to the Cramer-Rao bound.
\item The proper lower bound involves the MSE
$$MSE(\tn)\geq\frac{[1+b'(\theta)]^2}{\ifn}+b^2(\theta)$$
\end{enumerate}
\end{oss}
\begin{prop}
Let $\uX=(X_1...X_n)$ be a random sample from a regular parametric model $X\sim f_X(x;\theta)$. Let $\tn$ be an unbiased estimator for $\theta$. Then $\tn$ is efficient for $\theta$ if and only if
$$V_n'(\theta)=\ifn (T_n-\theta)$$
\end{prop}
\begin{proof}
\[
\begin{split}
V_n'(\theta)=\ifn (\tn-\e[\tn])&
\iff V_n'(\theta)^2=\ifn^2 (\tn-\e[\tn])^2\\
&\iff \e[V_n'(\theta)^2]=\e[\ifn^2 (\tn-\e[\tn])^2]\\
&\iff \ \ifn=\ifn^2 \e[(\tn-\e[\tn])^2]\\
&\iff \ 1=\ifn Var(\tn)\\
&\iff Var(\tn)=\frac{1}{\ifn}
\end{split}
\]
i.e. $\tn$ is efficient.
\end{proof}
\begin{teo}
\textbf{Rao-Blackwell}\\
Let $\uX=(X_1 ... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$. Let
\begin{itemize}
\item $\tnu$ a sufficient statistic for $\theta$
\item $\tnd$ an unbiased estimator for $\theta$
\item $\tn:=\e[\tnd|\tnu]$
\end{itemize}
Then
\begin{enumerate}
\item $\tn$ is a function of $\tnu$
\item $\e[\tn]=\theta$
\item $Var(\tn)\leq Var(\tnd)$
\end{enumerate}
\end{teo}
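A classical sketch of how the theorem is applied is the Bernoulli case.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$. Take $\tnu=\sum_{i=1}^{n}X_i$, which is sufficient, and $\tnd=X_1$, which is unbiased since $\e[X_1]=\theta$. Then
\[
\tn=\e[X_1|\tnu=t]=\p\bigg(X_1=1\bigg|\sum_{i=1}^{n}X_i=t\bigg)=\frac{t}{n}
\]
so $\tn=\bar X_n$: it is a function of $\tnu$, it is unbiased, and $Var(\bar X_n)=\frac{\theta(1-\theta)}{n}\leq\theta(1-\theta)=Var(X_1)$.
\end{eg}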
\begin{defi}
Let $X_1, X_2...$ be real valued \rv s with CDF $F_{X_1},F_{X_2}...$. We say that $(X_n)_n$ \textbf{converges in distribution} or \textbf{converges weakly} to a \rv \ $X$ if $$\lim_{n\to \infty}F_{X_n}(x)=F_X(x)$$ for every $x$ at which $F_X$ is continuous. We write $X_n\xrightarrow{w}X$.
\end{defi}
\begin{defi}
A sequence $\{ X_n\}_n$ of \rv s \textbf{converges in probability} towards the \rv \ $X$ if for all $\epsilon>0$
$$\lim_{n\to \infty}\p(|X_n-X|>\epsilon)=0$$
We write $X_n\xrightarrow{p}X$
\end{defi}
\begin{defi}
Given a real number $r \geq 1$, we say that the sequence $\{X_n\}$ converges in the \textbf{r-th mean} (or \textbf{in the $L^r$-norm}) towards the random variable $X$, if the r-th absolute moments $\e(|X_n|^r)$ and $\e(|X|^r)$ of $X_n$ and $X$ exist, and
$$\lim_{n\to \infty}\e[|X_n-X|^r]=0$$
We write $X_n \xrightarrow{L^r} X$.\\
For $r=2$ we say that $\{X_n\}$ converges in \textbf{mean square} to $X$.
\end{defi}
\begin{prop}
Convergence in probability $\implies$ convergence in distribution.\\
If $X$ is a degenerate \rv \ we have also\\ convergence in distribution $\implies$ convergence in probability.
\end{prop}
\begin{defi}
We say an estimator $\tn$ is \textbf{consistent in mean square} for $\theta$ if
$$\lim_{n\to \infty}MSE(\tn)=0$$
\end{defi}
NOTATION: we will use $b(\tn):=b(\theta)$
\begin{oss}
Since $MSE(\tn)=Var(\tn)+b^2(\tn)$, $\lim_{n\to \infty}MSE(\tn)=0$ is equivalent to saying
\begin{itemize}
\item $\lim_{n\to \infty}Var(\tn)=0$
\item $\lim_{n\to \infty}b^2(\tn)=0$
\end{itemize}
\end{oss}
\begin{defi}
$T_n$ is \textbf{asymptotically unbiased} for $\theta$ if
\begin{itemize}
\item $\lim_{n\to \infty}\e[\tn]=\theta$, or equivalently
\item $\lim_{n\to \infty}b(\tn)=0$
\end{itemize}
\end{defi}
\begin{prop}
A consistent estimator in mean square is also asymptotically unbiased
\end{prop}
\begin{defi}
We say $\tn$ is consistent in probability for $\theta$ if for all $\epsilon >0$
$$\lim_{n\to \infty}\p(|\tn-\theta|<\epsilon)=1$$
\end{defi}
\begin{prop}
Consistency in mean square implies consistency in probability.
\end{prop}
\begin{proof}
Using Chebyshev's inequality \ref{eq:Chebyshev}
$$\p(|\tn-\theta|<\epsilon)\geq 1-\frac{MSE(\tn)}{\epsilon^2}$$
$$\lim_{n\to \infty}\p(|\tn-\theta|<\epsilon)\geq 1-\lim_{n\to \infty}\frac{MSE(\tn)}{\epsilon^2}=1$$
since $MSE(\tn)\to 0$ by assumption and a probability cannot exceed $1$.
\end{proof}
\begin{teo}
\textbf{Central Limit Theorem}\\
Let $\uX=(X_1... X_n)$ be a random sample of size $n$ with the $X_i$ independent and identically distributed \rv s with expected value $\mu$ and finite variance $\sigma^2>0$. Define $\bar X_n:=\frac{1}{n}\sum_{i=1}^{n}X_i$. Then
$$\sqrt{n}\,\frac{\bar X_n-\mu}{\sigma}\xrightarrow{w} N(0,1)$$
\end{teo}
\begin{teo}
\textbf{Weak law of large numbers}\\
Let $\uX=(X_1... X_n)$ be a random sample of size $n$ with the $X_i$ independent and identically distributed \rv s with expected value $\mu$.
Define $\bar X_n=\frac{1}{n}\sum_{i=1}^{n}X_i$, then
$$\bar X_n \xrightarrow{p} \mu$$
That is, for any $\epsilon>0$
$$\lim_{n\to \infty}\p(|\bar X_n- \mu|>\epsilon)=0$$
\end{teo}
\section{Method of Moments}
\begin{oss}
Before we start: like the likelihood method, this method can also be applied to irregular models.
\end{oss}
Let $X=(X_1,\ldots,X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$.
Assume that $\theta\in\Theta$, where $\Theta$ has dimension $r$. In order to apply the method of moments for estimating $\theta$ we need:
\begin{itemize}
\item
A condition related to the dimension of $\Theta$: if the number of parameters that we need to estimate is $r$, then the moments of $X$ must exist up to order $r$, i.e.
$$\e[{|X|}^r]<\infty.$$
\item
Explicit expression for the first $r$ moments of $X$.
\end{itemize}
Given that, the method of moments consists in solving, in the unknown $\theta$, the system of equations
$$\frac{1}{n}\sum_{i=1}^{n}{X_i}^{j}=\e[X^{j}], \quad j=1,\ldots,r$$
where the theoretical moments $\e[X^j]$ are functions of $\theta$.
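As a sketch of the method in a familiar case, consider the Gaussian model with both parameters unknown.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $X\sim N(\mu,\sigma^2)$ with $\theta=(\mu,\sigma^2)$, so $r=2$. Since $\e[X]=\mu$ and $\e[X^2]=\sigma^2+\mu^2$, the system is
\[
\begin{cases}
\frac{1}{n}\sum_{i=1}^{n}X_i=\mu\\
\frac{1}{n}\sum_{i=1}^{n}X_i^2=\sigma^2+\mu^2
\end{cases}
\]
whose solution is $\hat\mu=\bar X_n$ and $\hat\sigma^2=\frac{1}{n}\sum_{i=1}^{n}X_i^2-\bar X_n^2=\frac{1}{n}\sum_{i=1}^{n}(X_i-\bar X_n)^2$.
\end{eg}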
\begin{oss}
We don't have any general result on the theoretical quality of the estimators produced by this method; we need to check case by case.
\end{oss}
\section{Maximum Likelihood Approach}
We saw (in the exercise session notes) that the Maximum Likelihood Approach consists in finding $$\hat\theta=\arg \max_{\theta\in \Theta} \ln(\lf)$$
\begin{itemize}
\item if the model is regular then we can simply solve the ML equation $\vnp =0$;
\item if the parameter is a positive integer then we study the ratio of the likelihood at consecutive values of the parameter;
\item if the parameter defines the support of the model we must insert the indicator function in the likelihood function;
\item if the model is regular but we cannot solve the likelihood equation explicitly, a way to proceed is the \textbf{Newton-Raphson method}, sketched after this list.
\end{itemize}
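As a reminder, denoting by $L(\theta;\ux)=\ln \lf$ the log-likelihood, the Newton-Raphson method starts from an initial guess $\theta^{(0)}$ and iterates
\[
\theta^{(k+1)}=\theta^{(k)}-\frac{L'(\theta^{(k)};\ux)}{L''(\theta^{(k)};\ux)}, \qquad k=0,1,2,\dots
\]
until $|\theta^{(k+1)}-\theta^{(k)}|$ falls below some tolerance; a fixed point of this iteration solves the likelihood equation $L'(\theta;\ux)=0$.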
\begin{teo}
\textbf{Delta Method}(Generalization of Central Limit Theorem for functions)\\
Let $(Y_n)_{n\geq 1}$ be a sequence of \rv s such that
$$\sqrt{n}(Y_n-\theta) \xrightarrow{w} N(0,\sigma^2) \ \ \ \ as \ n \to \infty$$
for a specific function $g$ and a given value $\theta$, assume $g'(\theta)$ exists and is not zero. Then
$$\sqrt{n}(g(Y_n)-g(\theta)) \xrightarrow{w} N(0,\sigma^2[g'(\theta)]^2) \ \ \ \ as \ n \to \infty$$
\end{teo}
\begin{proof}
Take the Taylor approximation of $g(Y_n)$ around $\theta$. \\
Then $$g(Y_n)=g(\theta)+ g'(\theta)[Y_n-\theta]+R$$
Where $R\to 0$ as $Y_n\to \theta$. Since $Y_n\xrightarrow{p}\theta$ then $R\xrightarrow{p} 0$. By an application of Slutsky's Theorem we have:
$$\sqrt{n}(g(Y_n)-g(\theta)) = \sqrt{n}(Y_n-\theta)g'(\theta)+\sqrt{n}R\xrightarrow{w} N(0,\sigma^2[g'(\theta)]^2) \ \ \ \ as \ n \to \infty$$
\end{proof}
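As a standard illustration of the Delta Method, take $g(y)=y^2$.
\begin{eg}
Let $\bar X_n$ be the sample mean of i.i.d. \rv s with mean $\mu\neq 0$ and finite variance $\sigma^2$, so that $\sqrt{n}(\bar X_n-\mu)\xrightarrow{w}N(0,\sigma^2)$ by the Central Limit Theorem. Taking $g(y)=y^2$, with $g'(\mu)=2\mu\neq 0$, the Delta Method gives
\[
\sqrt{n}(\bar X_n^2-\mu^2)\xrightarrow{w}N(0,4\mu^2\sigma^2)
\]
\end{eg}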
A useful property of the \mle is the \textbf{invariance property}.\\
Suppose we have a parametric family indexed by a parameter $\theta$, but we are interested in estimating $\T(\theta)$. The invariance property of the \mle says that if $\hat \theta$ is the \mle for $\theta$ then $\T(\hat{\theta})$ is the \mle for $\T(\theta)$.\\
If the map $\theta \mapsto \T(\theta)$ is one to one then the invariance property is easy to show. Indeed if we let $\mu=\T(\theta)$ then the inverse function $\T^{-1}(\mu)=\theta$ is well defined and the likelihood function written as a function of $\mu$ is:
$$\lfs=\prod_{i=1}^{n}f_{X_i}(x_i;\T^{-1}(\mu))=\mathcal{L}(\T^{-1}(\mu); \underline x)$$
and
$$\sup_{\mu}\lfs=\sup_{\mu}\mathcal{L}(\T^{-1}(\mu); \underline x)=\sup_\theta \lf$$
Then the $\max$ of $\lfs$ is attained at $\mu=\T(\hat \theta)$. This shows $\T(\hat \theta)$ is the \mle for $\T( \theta)$.\\
In general the invariance property of \mle is true for any functional of $\theta$.
\begin{teo}\label{them:invariance}
\textbf{invariance property of \mle}\\
If $\hat \theta$ is the \mle for $\theta$, then for every function $\T(\theta)$ the \mle of $\T(\theta)$ is $\T(\hat \theta)$.
\end{teo}
\begin{proof}
Define $$\lfs:=\sup_{ \{\theta \in \Theta: \T(\theta)=\mu\} }\lf$$
the likelihood induced on the values $\mu$ of $\T$, and let $\hat \mu$ be a value that maximizes $\lfs$.\\
We must show that
$$\mathcal{L}^*(\hat \mu;\underline x)=\mathcal{L}^*(\T(\hat \theta);\underline x)$$
The maximum of $\lfs$ coincides with the maximum of $\lf$, since
$$\mathcal{L}^*(\hat \mu;\underline x)=\sup_\mu \sup_{ \{\theta \in \Theta: \T(\theta)=\mu \} }\lf=\sup_\theta \lf=\mathcal{L}(\hat \theta;\underline x)$$
Moreover
\[
\begin{split}
\mathcal{L}(\hat \theta;\underline x)
&= \sup_{ \{\theta \in \Theta: \T(\theta)=\T(\hat \theta) \} }\lf\\
&=\mathcal{L}^*(\T(\hat\theta);\ux)
\end{split}
\]
By combining the two identities we have that
$$\mathcal{L}^*(\hat \mu;\ux)=\mathcal{L}^*(\T(\hat \theta);\ux)$$
So $\T(\hat \theta)$ is the \mle of $\T(\theta)$ where $\hat \theta$ is the \mle of $\theta$
\end{proof}
\begin{oss}
For example we can use the above theorem \ref{them:invariance} to compute the \mle of $\theta^2$, where $\theta$ is the mean of a Gaussian model.
\end{oss}
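Sketching that computation explicitly in the $N(\theta,1)$ model:
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $N(\theta,1)$. The \mle of $\theta$ is $\hat\theta=\bar X_n$, so by theorem \ref{them:invariance}, taking $\T(\theta)=\theta^2$, the \mle of $\theta^2$ is $\T(\hat\theta)=\bar X_n^2$. Note that $\bar X_n^2$ is biased for $\theta^2$ in a finite sample, since $\e[\bar X_n^2]=\theta^2+\frac{1}{n}$.
\end{eg}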
\begin{oss}
The above result \ref{them:invariance} holds also in the context of multidimensional parametric spaces.
\end{oss}
\begin{eg}
$\uX=(X_1... X_n)$ from $X\sim N(\mu,\sigma^2)$. Find the \mle estimator of $(\mu, \sigma^2)$.\\
\begin{itemize}
\item $\mathcal{L}(\mu, \sigma^2;\ux)=\prod_{i=1}^{n}\frac{1}{\sqrt{2 \pi \sigma^2}}\exp \bigg\{ -\frac{(x_i-\mu)^2}{2\sigma^2} \bigg\}=(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}$
\item $\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2}\ln(2 \pi \sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\mu)^2\propto -\frac{n}{2}\ln \sigma^2-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\mu)^2$
\item $\frac{d}{d\mu}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=\frac{1}{\sigma^2}\sum_{i=1}^{n}(x_i-\mu)$
\item $\frac{d}{d\sigma^2}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\mu)^2$
\end{itemize}
so solving the system
\[
\begin{cases}
\frac{d}{d\mu}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=\frac{1}{\sigma^2}\sum_{i=1}^{n}(x_i-\mu)=0
\\ \frac{d}{d\sigma^2}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\mu)^2=0
\end{cases}
\]
we obtain $\hat \mu=\frac{1}{n} \sum_{i=1}^{n}x_i; \hat \sigma^2=\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar x_n)^2$.\\
The question now is whether $(\hat \mu , \hat \sigma^2)$ is a point of maximum or not.
To do this we could check the Hessian of the log-likelihood, but the computational work would be too heavy.\\ Another way is by reducing the dimension of the problem:\\
Note that
\[
\begin{split}
\sum_{i=1}^{n}(x_i-\mu)^2
&=\sum_{i=1}^{n}(x_i- \bar x_n +\bar x_n -\mu)^2\\
&=\sum_{i=1}^{n}(x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 + 2 (\bar x_n -\mu)\underbrace{\sum_{i=1}^{n}(x_i-\bar x_n)}_{=0}\\
&=\sum_{i=1}^{n}(x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2\\
&\geq \sum_{i=1}^{n}(x_i -\bar x_n)^2
\end{split}
\]
So, for any $\mu \neq \bar x_n$ we have:
$$\sum_{i=1}^{n}(x_i-\mu)^2>\sum_{i=1}^{n}(x_i -\bar x_n)^2$$
Hence:
$$(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\}\geq(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}$$
Therefore the problem of verifying that $(\hat \mu, \hat \sigma^2)$ is a maximum is reduced to a one dimensional problem: we only need to check that $\hat \sigma^2$ actually maximizes the likelihood with $\mu$ fixed at $\bar x_n$, as sketched below.
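A quick sketch of that one-dimensional check: with $\mu$ fixed at $\bar x_n$, set
\[
\begin{split}
\ell(\sigma^2)&:=\ln(\mathcal{L}(\bar x_n, \sigma^2;\ux))=-\frac{n}{2}\ln(2\pi\sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\bar x_n)^2\\
\ell'(\sigma^2)&=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\bar x_n)^2
\end{split}
\]
so $\ell'(\sigma^2)>0$ for $\sigma^2<\hat\sigma^2$ and $\ell'(\sigma^2)<0$ for $\sigma^2>\hat\sigma^2$, hence $\hat\sigma^2=\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar x_n)^2$ is a global maximum and $(\hat\mu,\hat\sigma^2)$ is indeed the \mle.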
\end{eg}
\begin{oss}
In the end we can say that the invariance property is a good property from a computational point of view, but this property is also the reason why the \mle can be biased for a finite sample size.
\end{oss}
\begin{prop}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$. If the \mle $\tn$ for $\theta$ exists and is unique, then $\tn$ is a function of the sufficient statistic for $\theta$.
\end{prop}
\begin{proof}\textit{(informal)}\\
If a sufficient statistic exists then the likelihood function $\lf$ can be factorized as follows:
$$\lf =g(T(\ux);\theta)h(\ux)$$
$$\implies \ln \lf = \ln (g(T(\ux);\theta))+ \ln (h(\ux))\propto \ln (g(T(\ux);\theta)) $$
so the maximizer in $\theta$ depends on the sample only through $T(\ux)$.
\end{proof}
\begin{teo}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$ and let $\lf$ be the likelihood function. Let $\hat \theta_n$ be the \mle for $\theta$ and let $\T(\theta)$ be a continuous function of $\theta$. Under the assumption of regularity, for any $\epsilon>0$ and for any point $\theta \in \Theta$ we have that:
$$\lim_{n\to \infty}\p(|\T(\hat \theta_n)-\T(\theta)|\geq \epsilon)=0$$
I.e. the \mle are consistent in probability
\end{teo}
\begin{teo}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$ and let $\lf$ be the likelihood function. Let $\hat \theta_n$ be the \mle for $\theta$ and let $\T(\theta)$ be a differentiable function of $\theta$ with $\T'(\theta)\neq 0$. Under the assumption of regularity, for any point $\theta \in \Theta$ we have that:
$$\sqrt{n}[\T(\hat \theta_n)-\T(\theta)]\xrightarrow{w}N\bigg(0, \frac{[\T'(\theta)]^2}{\mathcal{I}_1(\theta)} \bigg)$$
where $\mathcal{I}_1(\theta)$ is the Fisher information of the parametric model.
\end{teo}
This means that the \mle are:
\begin{itemize}
\item consistent (weakly)
\item asymptotically unbiased
\item asymptotically Gaussian
\item asymptotically efficient
\end{itemize}
\begin{proof}
We will prove the result only for $\T(\theta)=\theta$. The proof for a generic $\T(\te)$ is an application of the delta method.\\
We have that $L(\theta;\ux)=\sum_{i=1}^{n}\ln (f_{X_i}(x_i;\theta))$.\\ We denote by $L'$ and $L''$ the first and the second derivative \wrt $\theta$.\\
Expand the first derivative (Taylor) around $\theta_0$
$$L'(\theta; \ux)= L'(\theta_0; \ux)+ (\theta- \theta_0)L''(\theta_0, \ux)+R$$
where $R$ is the remainder term, which is negligible in the limit and will be ignored in what follows.\\
Now substitute $\theta$ with the \mle of $\theta$, say $\hat \theta_n$; since $L'(\hat \theta_n;\ux)=0$ we get
$$0= L'(\theta_0; \ux)+ (\hat \theta_n- \theta_0)L''(\theta_0; \ux)+R$$
\[
\begin{split}
\implies \sqrt{n}(\hat \theta_n- \theta_0)&=-\sqrt{n}\frac{L'(\theta_0;\ux)}{L''(\theta_0; \ux)}\\&=\frac{\frac{1}{\sqrt{n}} L'(\theta_0;\ux)}{-\frac{1}{n}L''(\theta_0; \ux)}
\end{split}
\]
We can apply the Central Limit Theorem to the numerator and the Weak Law of Large Numbers to the denominator:\\
\begin{itemize}
\item[Numerator:] $$\frac{1}{\sqrt{n}}L'(\theta_0;\ux)=\sqrt{n}\bigg(\frac{1}{n}\sum_{i=1}^{n}W_i \bigg)$$
Where $W_i=\frac{\frac{d}{d\theta} f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}$ evaluated at $\theta=\theta_0$, with $\e[W_i]=0$ and $Var(W_i)=\mathcal{I}_1(\theta_0)$. By the Central Limit Theorem
$$\frac{1}{\sqrt{n}} L'(\theta_0;\ux) \xrightarrow{w}N(0,\mathcal{I}_1(\theta_0))$$
\item[Denominator:] $$-\frac{1}{n}L''(\theta_0;\ux)=\frac{1}{n}\sum_{i=1}^{n}W_i^2-\frac{1}{n}\sum_{i=1}^{n}\frac{\frac{d^2}{d \theta^2}f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}$$
Where $\e[\frac{1}{n}\sum_{i=1}^{n}W_i^2]=\mathcal{I}_1(\theta_0)$ and $\e\bigg[ \frac{1}{n} \sum_{i=1}^{n} \frac{\frac{d^2}{d \theta^2}f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}\bigg]=0$. Then by the Weak Law of Large Numbers
$$-\frac{1}{n}L''(\theta_0;\ux)\xrightarrow{p}\mathcal{I}_1(\theta_0)$$
\end{itemize}
So, by Slutsky's Theorem,
$$\sqrt{n}(\hat \theta_n - \theta_0 )\xrightarrow{w} \frac{X}{\mathcal{I}_1(\theta_0)}$$
Where $X\sim N(0,\mathcal{I}_1(\theta_0))$
$$\implies \sqrt{n}(\hat \theta_n - \theta_0 )\xrightarrow{w} N\bigg(0,\frac{1}{\mathcal{I}_1(\theta_0)}\bigg) $$
\end{proof}
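For instance, in the Bernoulli model (taking $\T(\theta)=\theta$ in the theorem above):
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, whose \mle is $\hat\theta_n=\bar X_n$. Since $\mathcal{I}_1(\theta)=\frac{1}{\theta(1-\theta)}$, the theorem gives
\[
\sqrt{n}(\hat\theta_n-\theta)\xrightarrow{w}N\big(0,\theta(1-\theta)\big)
\]
which agrees with what the Central Limit Theorem gives directly for $\bar X_n$.
\end{eg}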