\chapter{Statistics}
\vspace{15pt}
The notion of a statistic was introduced by Fisher (1920).\\
The importance of sufficiency is that it appears in every kind of statistical decision (point estimation, testing, confidence bounds).
\begin{defi}
Let $X=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$ for some $\theta \in \Theta$ unknown.\\
We say that $T_n=T(X)$ is \textbf{sufficient for the parameter $\theta$} if the conditional distribution of $\uX$ given $T_n$ does not depend on $\theta$, i.e. defined:
\begin{itemize}
\item $f_{\uX|T_n=t}(\ux;t,\theta)$ the conditional distribution of $\uX$ given $T_n$
\item $h_{\uX,T_n}(\ux,t,\theta)$ the joint distribution of $\uX$ and $T_n$
\item $g_{T_n}(t,\theta)$ the marginal distribution of $T_n$
\end{itemize}
then $T_n$ is \textbf{sufficient for the parameter $\theta$} if and only if $f_{\uX|T_n=t}(\ux;t,\theta)$ does not depend on $\theta$.\\
Note that
\[
f_{\uX|T_n=t}(\ux;t,\theta)=\frac{h_{\uX,T_n}(\ux,t,\theta)}{g_{T_n}(t,\theta)}
\]
\end{defi}
\begin{eg} \label{eg:ber}
$(X_1... X_n)\in \{0,1\}^n$ from a $Ber(\theta)$, $\theta \in (0,1)$.\\
Define $T_n=\sum_{i=1}^{n}X_i$, then we want to verify if $T_n$ is sufficient.\\
\begin{itemize}
\item $f_{\uX}(\ux;\theta) = \prod_{i=1}^{n} f_{{X_i}}({x_i};\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$
\item $g_{T_n}(t,\theta)={{n}\choose{t}}\theta^t(1-\theta)^{n-t}\mathbbm{1}_{\{0,1,\dots,n\}}(t)$
\item $h_{\uX,T_n}(\ux,t,\theta)=\p(\uX=\ux,T_n=t)=\theta^t(1-\theta)^{n-t}$
\end{itemize}
so
$$f_{\uX|T_n=t}=\frac{\theta^t(1-\theta)^{n-t}}{{{n}\choose{t}}\theta^t(1-\theta)^{n-t}}=\frac{1}{{{n}\choose{t}}}$$
So $T_n$ is a sufficient statistic for $\theta$.\\
This is a particularly simple case because the joint distribution of the sample already depends on $\ux$ only through $T_n$.
\end{eg}
\begin{oss}
If $T_n$ is sufficient for $\theta$ then all the statistical information about $\theta$ contained in the random sample is carried by $T_n$. In the example above, to make inference about $\theta$ we only need $\sum_{i=1}^{n}X_i$.\\
\end{oss}
\begin{oss}
The notion of sufficiency derives from the probability structure of the parametric family $X\sim f_X(x;\theta)$. We can talk about sufficiency for a parameter $\theta$ only after we have specified $X\sim f_X(x;\theta)$.
\end{oss}
The definition of sufficiency based on conditional probability is not of practical use because we need these two distributions $
\begin{cases}
g_{T_n}(\cdot)\\
h_{\uX,T_n}(\cdot,\cdot)
\end{cases}
$ that can be difficult to find.
To avoid that we could use a corollary of the \textit{Fisher Factorization Theorem}:
\begin{corol}\label{corol:Savage}
Let $\uX=(X_1... X_n)$ be a random sample from $X\sim f_X(x;\theta)$. Then a statistic $T_n$ is sufficient for $\theta$ if and only if there exist two non-negative functions $g(\cdot), h(\cdot)$ such that $\lf=g(T(\ux);\theta)h(\ux)$
\end{corol}
\begin{oss}
\begin{itemize}
\item $g$ is a function of the observed sample via $T_n$
\item $h$ is a function of the observed sample and does not depend on $\theta$
\end{itemize}
\end{oss}
\begin{eg}
Recall the example \ref{eg:ber}
$(X_1... X_n)\in \{0,1\}^n$ from a $Ber(\theta)$, $\theta \in (0,1)$.\\
Define $T_n=\sum_{i=1}^{n}X_i$, then we want to verify if $T_n$ is sufficient. \\
We have
$$f_{\uX}(\ux;\theta) = \prod_{i=1}^{n} f_{{X_i}}(x_i;\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$$
hence we can apply corollary \ref{corol:Savage} using
\begin{enumerate}
\item $h(\ux)=1$
\item $g(T_n(\ux);\theta)=\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}$
\end{enumerate}
\end{eg}
\begin{eg}\label{eg:gauss}
$(X_1... X_n)\in \mathbb{R}^n$ from a $N(\theta,1)$. We want to verify that $T_n=\sum_{i=1}^{n} X_i$ is a sufficient statistic:
\[
\begin{split}
\lf
&=\prod_{i=1}^{n}\frac{1}{\sqrt{2 \pi}} \exp \bigg\{ -\frac{1}{2} (x_i-\theta)^2 \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}(x_i-\theta)^2 \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}\\
&=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2 \bigg\} \exp \bigg\{- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}
\end{split}
\]
so
\begin{itemize}
\item $h(\ux)=(2\pi)^{-n/2}\exp \bigg\{ -\frac{1}{2} \sum_{i=1}^{n}x_i^2 \bigg\}$
\item $g(\sum_{i=1}^n x_i,\theta) = \exp \bigg\{- \frac{n\theta^2}{2}+\theta\sum_{i=1}^{n} x_i \bigg\}$
\end{itemize}
\end{eg}
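The same factorization argument applies to other standard models; here is a sketch for the Poisson case.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $X\sim Poi(\theta)$, $\theta>0$, and take $T_n=\sum_{i=1}^{n}X_i$. Then
\[
f_{\uX}(\ux;\theta)=\prod_{i=1}^{n}\frac{\theta^{x_i}e^{-\theta}}{x_i!}=\underbrace{\theta^{\sum_{i=1}^{n}x_i}e^{-n\theta}}_{g(T_n(\ux);\theta)}\ \underbrace{\frac{1}{\prod_{i=1}^{n}x_i!}}_{h(\ux)}
\]
so by corollary \ref{corol:Savage} $T_n$ is sufficient for $\theta$.
\end{eg}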
\begin{teo}\textbf{Fisher Theorem}\\
If $\lfd$ is the joint density function or the joint probability mass function of $\uX$ and $q(t;\theta)$ is the density function or the probability mass function of $T_n(\uX)$, then $T_n(\uX)$ is sufficient for $\theta$ if for every point in the sample space, the ratio
$$\frac{\lfd}{q(t;\theta)}$$
is a constant function of $\theta$.
\end{teo}
\begin{proof}
MISSING
\end{proof}
We can now see the proof of corollary \ref{corol:Savage}.
\begin{corol}\textbf{Savage}\\
Let $\lfd$ be the joint PDF or PMF of a random sample $\uX=(X_1... X_n)$. A statistic $T_n$ is sufficient for $\theta$ if and only if there exist two non negative functions $g(t,\theta), h(\ux)$ such that for all $\ux$ in the sample space and for all $\theta\in \Theta$
$$\lfd=g(T(\ux);\theta)h(\ux)$$
\end{corol}
\begin{proof}
We are going to prove the theorem only in the discrete settings.\\
\begin{itemize}
\item["$\Rightarrow$"] Suppose that $T(\uX)$ is sufficient for $\theta$.\\
Define:
\begin{itemize}
\item $g(t,\theta):=\p(T(\uX)=t)$
\item $h(\ux):=\p\bigg(\uX=\ux \bigg|T(\uX)=T(\ux)\bigg)$
\end{itemize}
Because $T(\uX)$ is sufficient for $\theta$ the conditional probability defining $h(\ux)$ does not depend on $\theta$. Hence the choice of $g(t,\theta)$ and $h(\ux)$ is legitimate and for this choice we have
\[
\begin{split}
\p(\uX=\ux)
&=\p(\uX=\ux \wedge T(\uX)=T(\ux))\\
&=\p(T(\uX)=T(\ux))\p(\uX=\ux|T(\uX)=T(\ux))\\
&=g(t,\theta)h(\ux)
\end{split}
\]
So we have the factorization and in particular we can see that
$$\p\bigg(T(\uX)=T(\ux)\bigg)=g(t,\theta)$$
$\implies g(T(\ux),\theta)$ is the PMF of $T(\uX)$.
\item["$\Leftarrow$"] We assume that the factorization holds.\\
Let $q(t,\theta)$ be the PMF of $T(\uX)$. We study the ratio
$$\frac{\lfd}{q(T(\ux);\theta)}$$
in particular define
$$A_{T(\ux)}=\{\underline y | T(\underline y)= T(\ux) \}$$
Then
\[
\begin{split}
\frac{\lfd}{q(T(\ux);\theta)}
&=\frac{g(T(\ux);\theta)h(\ux)}{q(T(\ux);\theta)}\\
&=\frac{g(T(\ux);\theta)h(\ux)}{\sum_{\underline y \in A_{T(\ux)}}{g(T(\ux);\theta)h(\underline y)}}\\
&=\frac{g(T(\ux);\theta)h(\ux)}{g(T(\ux);\theta) {\sum_{\underline y \in A_{T(\ux)}}h(\underline y)}}\\
&=\frac{h(\ux)}{\sum_{\underline y \in A_{T(\ux)}}h(\underline y)}
\end{split}
\]
This is constant \wrt $\theta$.\\
Then by the Fisher Theorem $T(\uX)$ is sufficient for $\theta$.
\end{itemize}
\end{proof}
\begin{eg}$(X_1... X_n)\in \mathbb{R}^n$ from a $N(\mu,\sigma^2)$, $\sigma^2$ known. As we did in example \ref{eg:gauss} we want to check whether $T_n=\frac{1}{n}\sum_{i=1}^{n} X_i$ is a sufficient statistic for $\mu$.
\[
\begin{split}
f_{\uX}(\ux;\mu,\sigma^2)
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}\\
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n}(x_i-\bar x_n +\bar x_n -\mu)^2 \bigg\}\\
&=(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\bigg( \sum_{i=1}^{n} (x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 \bigg) \bigg\}\\
\end{split}
\]
we already know that the distribution of $T(\uX)=\bar X_n=\frac{1}{n}\sum_{i=1}^{n}X_i$ is $\bar X_n\sim N(\mu,\sigma^2/n)$.\\
So we can apply Fisher theorem to the ratio:
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2}\bigg( \sum_{i=1}^{n} (x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 \bigg) \bigg\}}{(2 \pi \sigma^2/n)^{-1/2}\exp \bigg\{- \frac{n}{2\sigma^2} (\bar x_n -\mu)^2 \bigg\}}
\]
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\} \exp \bigg\{ -\frac{n}{2\sigma^2}(\bar x_n-\mu)^2 \bigg\}}{(2 \pi \sigma^2/n)^{-1/2}\exp \bigg\{- \frac{n}{2\sigma^2} (\bar x_n -\mu)^2 \bigg\}}
\]
\[
\frac{(2 \pi \sigma^2)^{-n/2} \exp \bigg\{ -\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\} }{(2 \pi \sigma^2/n)^{-1/2}}
\]
Hence by Fisher Theorem $T_n=\frac{1}{n}\sum_{i=1}^{n} X_i$ is sufficient for $\mu$
\end{eg}
\begin{oss}
Until now we have found only one sufficient statistic for a fixed parametric model. However, many sufficient statistics can be defined.\\
For example the identity statistic $T(\ux)=\ux$ is always sufficient; indeed we can factorize the joint distribution $f_{\uX}(\ux;\theta)$ with
\begin{itemize}
\item $h(\ux)=1$
\item $g(T(\ux);\theta)=f_{\uX}(\ux;\theta)$
\end{itemize}
\end{oss}
\begin{oss}
Given one sufficient statistic, a way to produce more sufficient statistics is through a one-to-one function.\\
Suppose $T(\ux)$ is a sufficient statistic for $\theta$, and define $T^*(\ux)=r(T(\ux))$ where $r$ is a one-to-one function with inverse $r^{-1}$.\\
By Savage's Theorem there exist $g,h$ such that
$$\lf=g(T(\ux),\theta)h(\ux)=g(r^{-1}(r(T(\ux))),\theta)h(\ux)=g(r^{-1}(T^*(\ux)),\theta)h(\ux)$$
So defining $g^*(t,\theta)=g(r^{-1}(t),\theta)$ we have that
$$\lf=g^*(T^*(\ux),\theta)h(\ux)$$
$\implies$ by Savage Theorem we have that $T^*(\ux)$ is a sufficient statistic.
\end{oss}
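For instance, in example \ref{eg:gauss} we saw that $T(\ux)=\sum_{i=1}^{n}x_i$ is sufficient for $\theta$ in the $N(\theta,1)$ model; taking the one-to-one map $r(t)=t/n$, the sample mean
$$T^*(\ux)=r(T(\ux))=\bar x_n$$
is therefore also a sufficient statistic for $\theta$.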
We saw that in principle we can define many sufficient statistics, so it is natural to define a tool that allows us to decide when one sufficient statistic is better than another.\\
Recall that the purpose of a statistic is to achieve data reduction without loss of information.\\
Therefore a statistic that achieves the most data reduction while still retaining all of the information about $\theta$ might be preferable.
\begin{oss}
We saw in example \ref{eg:gauss} that if $(X_1... X_n)\in \mathbb{R}^n$ comes from a $N(\theta,1)$, then $T(\ux)=\bar x_n$ is a sufficient statistic (being a one-to-one function of $\sum_{i=1}^{n} x_i$). Instead of $\bar x_n$ we could use $T'(\ux)=\bigg( \bar x_n, S^2_n \bigg)$, where $S^2_n$ is the sample variance. Clearly $T(\ux)$ is a greater data reduction than $T'(\ux)$, since we do not need to know the sample variance in order to make inference about $\theta$. Moreover we can write $T(\ux)$ as a function of $T'(\ux)$ by defining the function $r(a,b)=a$; then we can write
$$T(\ux)=\bar x_n=r(\bar x_n, S^2_n)=r(T'(\ux))$$
Since $T(\ux)$ and $T'(\ux)$ are both sufficient, they contain the same information about $\theta$.
In other words, the additional information given by the sample variance is null.
\end{oss}
\begin{defi}
A sufficient statistic $T(\ux)$ is called \textbf{minimal} if for any other sufficient statistic $T'(\ux)$, $T(\ux)$ is a function of $T'(\ux)$.
\end{defi}
NOTE:\\
To say that $T(\ux)$ is a function of $T'(\ux)$ simply means that if $T'(\ux)=T'(\uy)$ then $T(\ux)=T(\uy)$.\\
In other terms, if $\{ B_t\} $, with $B_t:=\{\uy : T'(\uy)=t \}$, is the partition of the sample space induced by $T'$ and $\{ A_t\} $, with $A_t:=\{\uy : T(\uy)=t \}$, is the partition induced by $T$, then every $B_t$ is contained in some $A_{t'}$.\\
$\implies$ the partition of the sample space induced by a minimal sufficient statistic is the coarsest one, i.e. the one with the smallest number of sets.
\begin{teo}\textbf{Lehmann and Scheffé}\\
Let $\lfd$ be the joint density function or joint probability mass function of a \rs \ $\uX=(X_1...X_n)$. Suppose there exists a function $T$ such that for any two sample points $\ux$, $\uy$ the ratio
\[
\frac{\lfd}{f_{\uX}(\uy;\theta)}
\]
is constant as a function of $\theta$ if and only if $T(\ux)=T(\uy)$.\\
Then $T$ is a minimal sufficient statistic for $\theta$.
\end{teo}
\begin{proof}
To simplify the proof we assume $\lfd > 0 \ \forall \ux, \forall \theta$.\\
First we show that $T(\ux)$ is sufficient.\\
Define $\Tau$ as the image of the sample space under the function $\st$:
$$\Tau:=\{ t:t=\st \text{ for some $\ux$ in the sample space} \}$$
Define $\{ A_t\}_{t\in\Tau} $ as the partition induced by $T$, where $A_t:=\{\uy : T(\uy)=t \}$.
For each $A_t$ choose and fix one element $\ux_t\in A_t$. For any point $\ux$ in the sample space, $\ux_{\st}$ is the fixed element of the set $A_{\st}$ containing $\ux$. Since $\ux$ and $\ux_{\st}$ are in the same set $A_{\st}$, we have $\st=T(\ux_{\st})$, so by the assumptions the ratio
\[
\frac{\lfd}{f_{\uX}(\ux_{\st};\theta)}
\]
does not depend on $\theta$. Thus we can define $h(\ux):=\frac{\lfd}{f_{\uX}(\ux_{\st};\theta)}$.\\
Then define the function $g(t,\theta):=f_{\uX}(\ux_{t};\theta)$, so we have:
\[
\lfd =\frac{f_{\uX}(\ux_{\st};\theta)\lfd}{f_{\uX}(\ux_{\st};\theta)}=g(\st,\theta) h(\ux)
\]
and by Savage Theorem $\st$ is sufficient for $\theta$.\\
Now we will show that $\st$ is minimal sufficient.\\
Let $T'(\ux)$ be another sufficient statistic. By Savage Theorem we know that there exist $h',g'$ such that
\[
\lfd=g'(T'(\ux),\theta)h'(\ux)
\]
Let $\ux,\uy$ be two sample points such that $T'(\ux)=T'(\uy)$ then we can study the ratio:
\[
\frac{\lfd}{f_{\uX}(\uy;\theta)}=\frac{g'(T'(\ux);\theta)h'(\ux)}{g'(T'(\uy);\theta)h'(\uy)}=\frac{h'(\ux)}{h'(\uy)}
\]
Since the ratio does not depend on $\theta$, by assumption (the other implication of the if and only if) we get $\st =T(\uy)$. So $T(\ux)$ is a function of $T'(\ux)$, and therefore $\st$ is minimal.
\end{proof}
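To see how the criterion is used in practice, here is a sketch for the Bernoulli sample of example \ref{eg:ber}.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, $\theta\in(0,1)$. For two sample points $\ux,\uy$
\[
\frac{f_{\uX}(\ux;\theta)}{f_{\uX}(\uy;\theta)}
=\frac{\theta^{\sum_{i=1}^{n}x_i}(1-\theta)^{n-\sum_{i=1}^{n}x_i}}{\theta^{\sum_{i=1}^{n}y_i}(1-\theta)^{n-\sum_{i=1}^{n}y_i}}
=\bigg(\frac{\theta}{1-\theta}\bigg)^{\sum_{i=1}^{n}x_i-\sum_{i=1}^{n}y_i}
\]
which is constant as a function of $\theta$ if and only if $\sum_{i=1}^{n}x_i=\sum_{i=1}^{n}y_i$. Hence $T_n=\sum_{i=1}^{n}X_i$ is a minimal sufficient statistic for $\theta$.
\end{eg}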
\section{Estimators}
\begin{defi}
Suppose there is a fixed parameter $\theta$ that needs to be estimated. Then an \textbf{estimator} is a function that maps the sample space into the set of possible estimates of $\theta$. An estimator of $\theta$ is usually denoted by the symbol $\hat \theta$.
\end{defi}
Now we are going to introduce some definitions of \textit{"good"} estimators.
\begin{defi}
$T_n(\uX)$ is said to be \textbf{unbiased} for $\theta$ if $\e[T_n(\uX)]=\theta$
\end{defi}
\begin{oss}
We use the expected value to define a "good" estimator because of the linearity of the operator.
\end{oss}
\begin{defi}
\textbf{Bias}:
\[
Bias_\theta (T_n(\uX))=\e\big[T_n(\uX)\big]-\theta
\]
\end{defi}
When we ask an estimator to be unbiased we are basically requiring it to be centred around $\theta$.\\
Another quantity that gives us information about the goodness of an estimator is the variance. We can interpret the variance as a measure of the dispersion around the expected value, so before checking the variance we must be sure that the expected value coincides with $\theta$. In this scenario, the smaller the variance the better the estimator.
\begin{oss}
Variance is a good parameter to watch only if the estimator is unbiased
\end{oss}
To avoid this problem we can introduce the \textit{Mean Squared Error}
\begin{defi}\textbf{Mean Squared Error} (MSE)
\[
MSE(T_n(\uX)):=\e[(T_n(\uX)-\theta)^2]
\]
\end{defi}
The importance of this quantity comes from Chebyshev's inequality \ref{eq:Chebyshev}:
\[
\p(|T_n(\uX)-\theta|< k)\geq 1-\frac{\e[(T_n(\uX)-\theta)^2]}{k^2}
\]
Indeed we notice that the smaller the MSE, the greater is the lower bound on $\p(|T_n(\uX)-\theta|< k)$.
\begin{prop}
$$\e[(T_n(\uX)-\theta)^2]=Var(T_n(\uX))+Bias_\theta(T_n(\uX))^2$$
\end{prop}
\begin{proof}
\[
\begin{split}
\e[(T_n(\uX)-\theta)^2]
&=\e[(T_n(\uX) - \e[T_n(\uX)] + \e[T_n(\uX)]-\theta)^2 ]\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]+2\e[(T_n(\uX)-\e[T_n(\uX)])(\e[T_n(\uX)]-\theta)]\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]+2(\e[T_n(\uX)]-\theta)\underbrace{\e\big[T_n(\uX)-\e[T_n(\uX)]\big]}_{=0}\\
&=\e[(T_n(\uX)-\e[T_n(\uX)])^2]+\e[(\e[T_n(\uX)]-\theta)^2]\\
&=Var(T_n(\uX))+Bias_\theta(T_n(\uX))^2
\end{split}
\]
\end{proof}
\begin{oss}
If $\e[T_n(\uX)]=\theta$ then $MSE(T_n(\uX))=Var(T_n(\uX))$.
\end{oss}
\begin{defi}
Let $X_1... X_n$ be a random sample from $X\sim f_X(x;\theta)$, and let $T_n'$ and $T_n''$ be estimators for $\theta$. We say $T_n'$ is \textbf{more efficient} than $T_n''$ if
$$MSE(T_n')<MSE(T_n'')$$
\end{defi}
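As a simple illustration of this comparison, consider the sample mean against a single observation.
\begin{eg}
Let $X_1... X_n$ be a random sample from $X\sim N(\mu,\sigma^2)$ and consider the two unbiased estimators $T_n'=\bar X_n$ and $T_n''=X_1$ of $\mu$. Then
\[
MSE(T_n')=Var(\bar X_n)=\frac{\sigma^2}{n}<\sigma^2=Var(X_1)=MSE(T_n'')
\]
for every $n>1$, so $\bar X_n$ is more efficient than $X_1$.
\end{eg}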
Usually we choose the estimator with the lower MSE even if it is biased.\\
\section{Properties of Estimators}
The problem with the MSE is that we cannot be sure that there exists a $T_n$ such that $MSE(\tn)$ is the lowest possible.\\
A solution to this comes from the following result.
\begin{teo}\label{teo:Cramer-Rao Bound}
\textbf{Cramer-Rao Bound}\\
Let $X=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$.\\
Then, under regularity conditions, for any estimator $\tn$ of $\theta$
\[
Var(\tn)\geq \frac{[1+b'(\theta)]^2}{\ifn}
\]
Where
\begin{itemize}
\item[$b(\theta)$] is the bias of $\tn$
\item[$\ifn$] is the Fisher Information
\end{itemize}
\end{teo}
\begin{proof}
Consider the estimator $\tn$.
\begin{itemize}
\item $\e[\tn]=\theta +b(\theta)$
\item $\der \e[\tn]=1+b'(\theta)$
\item $\e[V_n'(\theta)]=0 \leftarrow$ because we suppose our model regular
\end{itemize}
$\implies Cov(\tn, V_n'(\theta))=\e[\tn V_n'(\theta)] - \e[\tn]\e[V_n'(\theta)]=\e[\tn V_n'(\theta)]$
\[
\begin{split}
\e[\tn V_n'(\theta)]&
=\int_{\mathbb{R}^n}\tn V_n'(\theta) \lfd d\ux\\
&=\int_{\mathbb{R}^n}\tn \frac{f_{\uX}'(\ux,\theta)}{\lfd}\lfd d\ux\\
&=\int_{\mathbb{R}^n}\tn \der \lfd d\ux\\
&=\der \int_{\mathbb{R}^n}\tn \lfd d\ux\\
&=\der \e[\tn]\\
&=1+b'(\theta)
\end{split}
\]
So
\[
Cov(\tn,V_n'(\theta))=1+b'(\theta)
\]
We know that in general for $X,Y$ \rv s such that $\e[X]=\mu, \e[Y]=\nu, \e[X^2]<\infty, \e[Y^2]<\infty$, from Cauchy-Schwarz, it holds
\[
\begin{split}
(Cov(X,Y))^2&=\bigg( \e[(X-\mu)(Y-\nu)] \bigg)^2\\
&\leq \e[(X-\mu)^2] \e[(Y-\nu)^2]\\
&= Var(X)Var(Y)
\end{split}
\]
So replacing $X$ with $\tn$ and $Y$ with $V_n'(\theta)$ we obtain
\[
\begin{split}
Var(\tn)&\geq \frac{(Cov(\tn,V_n'(\theta)))^2}{Var(V_n'(\theta))}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\e[(V_n'(\theta)-\e[V_n'(\theta)])^2]}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\e[V_n'(\theta)^2]}\\
&=\frac{(Cov(\tn,V_n'(\theta)))^2}{\ifn}\\
&=\frac{(1+b'(\theta))^2}{\ifn}
\end{split}
\]
\end{proof}
\begin{corol}
Under regularity conditions
\[MSE \geq \frac{(1+b'(\theta))^2}{\ifn} +b^2(\theta) \]
\end{corol}
\begin{proof}
Directly from Cramer-Rao Bound \ref{teo:Cramer-Rao Bound} remembering that $MSE(\tn)=Var(\tn)+b^2(\theta)$
\end{proof}
\begin{corol}
Let $\uX=(X_1... X_n)$ be a random sample from a regular model $X\sim f_X(x;\theta)$\\
If there exists an unbiased estimator $\tn$ for $\theta$ whose variance is equal to the Cramer-Rao bound, then $\tn$ is unique.
\end{corol}
\begin{proof}
Take $\tnu$, $\tnd$ unbiased estimators for $\theta$ such that
\[
Var(\tnu)=Var(\tnd)=\frac{1}{\ifn}
\]
Define $\tn := \frac{\tnu + \tnd}{2}$\\
$\e[\tn]=\frac{\e[\tnu]+\e[\tnd]}{2}=\frac{2}{2}\theta=\theta$\\
$\implies \tn$ is also unbiased for $\theta$\\
$\implies Var(\tn)\geq \frac{1}{\ifn}$
\[
\begin{split}
Var(\tn)
&=Var\bigg( \frac{\tnu + \tnd}{2} \bigg)\\
&=\frac{1}{4}\bigg[ Var(\tnu)+ Var(\tnd) +2 Cov(\tnu, \tnd) \bigg]\\
&=\frac{1}{4 }\bigg[ Var(\tnu)+ Var(\tnd) +2 Cov(\tnu, \tnd) \bigg]\frac{[Var(\tnu)Var(\tnd)]^{1/2}}{[Var(\tnu)Var(\tnd)]^{1/2}}\\
&=\frac{(1+Corr(\tnu,\tnd))}{2}\frac{1}{\ifn}
\end{split}
\]
\\MEMO:$|Corr(X,Y)|\leq 1$\\
Since by the Cramer-Rao bound $Var(\tn)\geq\frac{1}{\ifn}$, we must have $Corr(\tnu,\tnd)\geq 1 \implies Corr(\tnu,\tnd)= 1$.\\
$\implies \tnd=a+b\tnu$ for some constants $a,b$ with $b>0$.\\
Hence we must have $\theta=\e[\tnd]=\e[a+b\tnu]=a+b\e[\tnu]=a+b\theta$ for every $\theta\implies a=0,b=1$\\
$\implies \tnd=\tnu$
\end{proof}
\begin{defi}
Consider a regular model $X\sim f_X(x;\theta)$. We say that an unbiased estimator $\tn$ whose variance is
\[
Var(\tn)=\frac{1}{\ifn}
\]
is \textbf{efficient}.\\ Moreover we define the \textbf{efficiency} of an estimator as:
\[
Eff(\tn)=\frac{1}{Var(\tn)\ifn} \ \ \ \ \ \ \in [0,1]
\]
\end{defi}
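As a sketch of an efficient estimator, consider the Bernoulli model.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, $\theta\in(0,1)$, and take $\tn=\bar X_n$. Then $\e[\bar X_n]=\theta$, so $\bar X_n$ is unbiased, and since the Fisher information of this model is $\ifn=\frac{n}{\theta(1-\theta)}$ we have
\[
Var(\bar X_n)=\frac{\theta(1-\theta)}{n}=\frac{1}{\ifn}
\]
Hence $Eff(\bar X_n)=1$ and $\bar X_n$ is efficient for $\theta$.
\end{eg}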
\begin{oss}
\begin{enumerate}
\item We introduce (absolute) efficiency at the cost of assuming regularity for the parametric model.
\item The variance of an unbiased estimator $\tn$ of $\theta$ cannot be smaller than the Cramer-Rao bound. However we do not know whether there exists an estimator whose variance is equal to the Cramer-Rao bound.
\item The proper lower bound involves the MSE
$$MSE(\tn)\geq\frac{[1+b'(\theta)]^2}{\ifn}+b^2(\theta)$$
\end{enumerate}
\end{oss}
\begin{prop}
Let $\uX=(X_1...X_n)$ be a random sample from a regular parametric model $X\sim f_X(x;\theta)$. Let $\tn$ be an unbiased estimator for $\theta$. Then $\tn$ is efficient for $\theta$ if and only if
$$V_n'(\theta)=\ifn (T_n-\theta)$$
\end{prop}
\begin{proof}
\[
\begin{split}
V_n'(\theta)=\ifn (\tn-\e[\tn])&
\iff V_n'(\theta)^2=\ifn^2 (\tn-\e[\tn])^2\\
&\iff \e[V_n'(\theta)^2]=\e[\ifn^2 (\tn-\e[\tn])^2]\\
&\iff \ \ifn=\ifn^2 \e[(\tn-\e[\tn])^2]\\
&\iff \ 1=\ifn Var(\tn)\\
&\iff Var(\tn)=\frac{1}{\ifn}
\end{split}
\]
i.e. $\tn$ is efficient.
\end{proof}
\begin{teo}
\textbf{Rao-Blackwell}\\
Let $\uX=(X_1 ... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$. Let
\begin{itemize}
\item $\tnu$ a sufficient statistic for $\theta$
\item $\tnd$ an unbiased estimator for $\theta$
\item $\tn:=\e[\tnd|\tnu]$
\end{itemize}
Then
\begin{enumerate}
\item $\tn$ is a function of $\tnu$
\item $\e[\tn]=\theta$
\item $Var(\tn)\leq Var(\tnd)$
\end{enumerate}
\end{teo}
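A classical sketch of how the theorem is applied is the Bernoulli case.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$. Take $\tnu=\sum_{i=1}^{n}X_i$, which is sufficient, and $\tnd=X_1$, which is unbiased since $\e[X_1]=\theta$. Then
\[
\tn=\e[X_1|\tnu=t]=\p\bigg(X_1=1\bigg|\sum_{i=1}^{n}X_i=t\bigg)=\frac{t}{n}
\]
so $\tn=\bar X_n$: it is a function of $\tnu$, it is unbiased, and $Var(\bar X_n)=\frac{\theta(1-\theta)}{n}\leq\theta(1-\theta)=Var(X_1)$.
\end{eg}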
\begin{defi}
Let $X_1, X_2...$ be real valued \rv s with CDF $F_{X_1},F_{X_2}...$. We say that $(X_n)_n$ \textbf{converges in distribution} or \textbf{converges weakly} to a \rv \ $X$ if $$\lim_{n\to \infty}F_{X_n}(x)=F_X(x)$$ for every $x$ at which $F_X$ is continuous. We write $X_n\xrightarrow{w}X$.
\end{defi}
\begin{defi}
A sequence $\{ X_n\}_n$ of \rv s \textbf{converges in probability} towards the \rv \ $X$ if for all $\epsilon>0$
$$\lim_{n\to \infty}\p(|X_n-X|>\epsilon)=0$$
We write $X_n\xrightarrow{p}X$
\end{defi}
\begin{defi}
Given a real number $r \geq 1$, we say that the sequence $\{X_n\}$ converges in the \textbf{r-th mean} (or \textbf{in the $L^r$-norm}) towards the random variable $X$, if the r-th absolute moments $\e(|X_n|^r)$ and $\e(|X|^r)$ of $X_n$ and $X$ exist, and
$$\lim_{n\to \infty}\e[|X_n-X|^r]=0$$
We write $X_n \xrightarrow{L^r} X$.\\
For $r=2$ we say that $\{X_n\}$ converges in \textbf{mean square} to $X$.
\end{defi}
\begin{prop}
Convergence in probability $\implies$ convergence in distribution.\\
If $X$ is a degenerate \rv \ we have also\\ convergence in distribution $\implies$ convergence in probability.
\end{prop}
\begin{defi}
We say an estimator $\tn$ is \textbf{consistent in mean square} for $\theta$ if
$$\lim_{n\to \infty}MSE(\tn)=0$$
\end{defi}
NOTATION: we will use $b(\tn):=b(\theta)$
\begin{oss}
Since $MSE(\tn)=Var(\tn)+b^2(\tn)$, $\lim_{n\to \infty}MSE(\tn)=0$ is equivalent to saying
\begin{itemize}
\item $\lim_{n\to \infty}Var(\tn)=0$
\item $\lim_{n\to \infty}b^2(\tn)=0$
\end{itemize}
\end{oss}
\begin{defi}
$T_n$ is \textbf{asymptotically unbiased} for $\theta$ if
\begin{itemize}
\item $\lim_{n\to \infty}\e[\tn]=\theta$, or equivalently
\item $\lim_{n\to \infty}b(\tn)=0$
\end{itemize}
\end{defi}
\begin{prop}
A consistent estimator in mean square is also asymptotically unbiased
\end{prop}
\begin{defi}
We say $\tn$ is consistent in probability for $\theta$ if for all $\epsilon >0$
$$\lim_{n\to \infty}\p(|\tn-\theta|<\epsilon)=1$$
\end{defi}
\begin{prop}
Consistency in mean square implies consistency in probability.
\end{prop}
\begin{proof}
Using Chebyshev's inequality \ref{eq:Chebyshev}
$$\p(|\tn-\theta|<\epsilon)\geq 1-\frac{MSE(\tn)}{\epsilon^2}$$
$$\lim_{n\to \infty}\p(|\tn-\theta|<\epsilon)\geq 1-\lim_{n\to \infty}\frac{MSE(\tn)}{\epsilon^2}=1$$
since $MSE(\tn)\to 0$ by assumption and a probability cannot exceed $1$.
\end{proof}
\begin{teo}
\textbf{Central Limit Theorem}\\
Let $\uX=(X_1... X_n)$ be a random sample of size $n$ with the $X_i$ independent and identically distributed \rv s with expected value $\mu$ and finite variance $\sigma^2>0$. Define $\bar X_n:=\frac{1}{n}\sum_{i=1}^{n}X_i$. Then
$$\sqrt{n}\,\frac{\bar X_n-\mu}{\sigma}\xrightarrow{w} N(0,1)$$
\end{teo}
\begin{teo}
\textbf{Weak law of large numbers}\\
Let $\uX=(X_1... X_n)$ be a random sample of size $n$ with the $X_i$ independent and identically distributed \rv s with expected value $\mu$.
Define $\bar X_n=\frac{1}{n}\sum_{i=1}^{n}X_i$, then
$$\bar X_n \xrightarrow{p} \mu$$
That is, for any $\epsilon>0$
$$\lim_{n\to \infty}\p(|\bar X_n- \mu|>\epsilon)=0$$
\end{teo}
\section{Method of Moments}
\begin{oss}
Before we start: like the likelihood method, this method can also be applied to irregular models.
\end{oss}
Let $X=(X_1,\ldots,X_n)$ be a random sample from a parametric model $X\sim f_X(x,\theta)$.
Assume that $\theta\in\Theta$, where $\Theta$ has dimension $r$. In order to apply the method of moments for estimating $\theta$ we need:
\begin{itemize}
\item
A condition related to the dimension of $\Theta$: if the number of parameters that we need to estimate is $r$, then the moments of $X$ must exist up to order $r$, i.e.
$$\e[{|X|}^r]<\infty.$$
\item
Explicit expression for the first $r$ moments of $X$.
\end{itemize}
Given that, the method of moments consists in solving, in the unknown $\theta$, the system of equations
$$\frac{1}{n}\sum_{i=1}^{n}{X_i}^{j}=\e[X^{j}], \quad j=1,\ldots,r$$
where the theoretical moments $\e[X^j]$ are functions of $\theta$.
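As a sketch of the method in a familiar case, consider the Gaussian model with both parameters unknown.
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $X\sim N(\mu,\sigma^2)$ with $\theta=(\mu,\sigma^2)$, so $r=2$. Since $\e[X]=\mu$ and $\e[X^2]=\sigma^2+\mu^2$, the system is
\[
\begin{cases}
\frac{1}{n}\sum_{i=1}^{n}X_i=\mu\\
\frac{1}{n}\sum_{i=1}^{n}X_i^2=\sigma^2+\mu^2
\end{cases}
\]
whose solution is $\hat\mu=\bar X_n$ and $\hat\sigma^2=\frac{1}{n}\sum_{i=1}^{n}X_i^2-\bar X_n^2=\frac{1}{n}\sum_{i=1}^{n}(X_i-\bar X_n)^2$.
\end{eg}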
\begin{oss}
We don't have any general result on the theoretical quality of the estimators produced by this method; we need to check case by case.
\end{oss}
\section{Maximum Likelihood Approach}
We saw (in the exercise session notes) that the Maximum Likelihood Approach consists in finding $$\hat\theta=\arg \max_{\theta\in \Theta} \ln(\lf)$$
\begin{itemize}
\item if the model is regular then we can simply solve the ML equation $\vnp =0$;
\item if the parameter is a positive integer then we study the ratio of the likelihood at consecutive values of the parameter;
\item if the parameter defines the support of the model we must insert the indicator function in the likelihood function;
\item if the model is regular but we cannot solve the likelihood equation explicitly, a way to proceed is the \textbf{Newton-Raphson method}, sketched after this list.
\end{itemize}
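As a reminder, denoting by $L(\theta;\ux)=\ln \lf$ the log-likelihood, the Newton-Raphson method starts from an initial guess $\theta^{(0)}$ and iterates
\[
\theta^{(k+1)}=\theta^{(k)}-\frac{L'(\theta^{(k)};\ux)}{L''(\theta^{(k)};\ux)}, \qquad k=0,1,2,\dots
\]
until $|\theta^{(k+1)}-\theta^{(k)}|$ falls below some tolerance; a fixed point of this iteration solves the likelihood equation $L'(\theta;\ux)=0$.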
\begin{teo}
\textbf{Delta Method}(Generalization of Central Limit Theorem for functions)\\
Let $(Y_n)_{n\geq 1}$ be a sequence of \rv s such that
$$\sqrt{n}(Y_n-\theta) \xrightarrow{w} N(0,\sigma^2) \ \ \ \ as \ n \to \infty$$
for a specific function $g$ and a given value $\theta$, assume $g'(\theta)$ exists and is not zero. Then
$$\sqrt{n}(g(Y_n)-g(\theta)) \xrightarrow{w} N(0,\sigma^2[g'(\theta)]^2) \ \ \ \ as \ n \to \infty$$
\end{teo}
\begin{proof}
Take the Taylor approximation of $g(Y_n)$ around $\theta$. \\
Then $$g(Y_n)=g(\theta)+ g'(\theta)[Y_n-\theta]+R$$
Where $R\to 0$ as $Y_n\to \theta$. Since $Y_n\xrightarrow{p}\theta$ then $R\xrightarrow{p} 0$. By an application of Slutsky's Theorem we have:
$$\sqrt{n}(g(Y_n)-g(\theta)) = \sqrt{n}(Y_n-\theta)g'(\theta)+\sqrt{n}R\xrightarrow{w} N(0,\sigma^2[g'(\theta)]^2) \ \ \ \ as \ n \to \infty$$
\end{proof}
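As a standard illustration of the Delta Method, take $g(y)=y^2$.
\begin{eg}
Let $\bar X_n$ be the sample mean of i.i.d. \rv s with mean $\mu\neq 0$ and finite variance $\sigma^2$, so that $\sqrt{n}(\bar X_n-\mu)\xrightarrow{w}N(0,\sigma^2)$ by the Central Limit Theorem. Taking $g(y)=y^2$, with $g'(\mu)=2\mu\neq 0$, the Delta Method gives
\[
\sqrt{n}(\bar X_n^2-\mu^2)\xrightarrow{w}N(0,4\mu^2\sigma^2)
\]
\end{eg}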
A useful property of the \mle is the \textbf{invariance property}.\\
Suppose we have a parametric family indexed by a parameter $\theta$, but we are interested in estimating $\T(\theta)$. The invariance property of the \mle says that if $\hat \theta$ is the \mle for $\theta$ then $\T(\hat{\theta})$ is the \mle for $\T(\theta)$.\\
If the map $\theta \mapsto \T(\theta)$ is one to one then the invariance property is easy to show. Indeed if we let $\mu=\T(\theta)$ then the inverse function $\T^{-1}(\mu)=\theta$ is well defined and the likelihood function written as a function of $\mu$ is:
$$\lfs=\prod_{i=1}^{n}f_{X_i}(x_i;\T^{-1}(\mu))=\mathcal{L}(\T^{-1}(\mu); \underline x)$$
and
$$\sup_{\mu}\lfs=\sup_{\mu}\mathcal{L}(\T^{-1}(\mu); \underline x)=\sup_\theta \lf$$
Then the $\max$ of $\lfs$ is attained at $\mu=\T(\hat \theta)$. This shows $\T(\hat \theta)$ is the \mle for $\T( \theta)$.\\
In general the invariance property of \mle is true for any functional of $\theta$.
\begin{teo}\label{them:invariance}
\textbf{invariance property of \mle}\\
If $\hat \theta$ is the \mle for $\theta$, then for every function $\T(\theta)$ the \mle of $\T(\theta)$ is $\T(\hat \theta)$.
\end{teo}
\begin{proof}
Define $$\lfs:=\sup_{ \{\theta \in \Theta: \T(\theta)=\mu\} }\lf$$
the likelihood induced on the values $\mu$ of $\T$, and let $\hat \mu$ be a value that maximizes $\lfs$.\\
We must show that
$$\mathcal{L}^*(\hat \mu;\underline x)=\mathcal{L}^*(\T(\hat \theta);\underline x)$$
The maximum of $\lfs$ coincides with the maximum of $\lf$, since
$$\mathcal{L}^*(\hat \mu;\underline x)=\sup_\mu \sup_{ \{\theta \in \Theta: \T(\theta)=\mu \} }\lf=\sup_\theta \lf=\mathcal{L}(\hat \theta;\underline x)$$
Moreover
\[
\begin{split}
\mathcal{L}(\hat \theta;\underline x)
&= \sup_{ \{\theta \in \Theta: \T(\theta)=\T(\hat \theta) \} }\lf\\
&=\mathcal{L}^*(\T(\hat\theta);\ux)
\end{split}
\]
By combining the two identities we have that
$$\mathcal{L}^*(\hat \mu;\ux)=\mathcal{L}^*(\T(\hat \theta);\ux)$$
So $\T(\hat \theta)$ is the \mle of $\T(\theta)$ where $\hat \theta$ is the \mle of $\theta$
\end{proof}
\begin{oss}
For example we can use the above theorem \ref{them:invariance} to compute the \mle of $\theta^2$, where $\theta$ is the mean of a Gaussian model.
\end{oss}
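Sketching that computation explicitly in the $N(\theta,1)$ model:
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $N(\theta,1)$. The \mle of $\theta$ is $\hat\theta=\bar X_n$, so by theorem \ref{them:invariance}, taking $\T(\theta)=\theta^2$, the \mle of $\theta^2$ is $\T(\hat\theta)=\bar X_n^2$. Note that $\bar X_n^2$ is biased for $\theta^2$ in a finite sample, since $\e[\bar X_n^2]=\theta^2+\frac{1}{n}$.
\end{eg}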
\begin{oss}
The above result \ref{them:invariance} holds also in the context of multidimensional parametric spaces.
\end{oss}
\begin{eg}
$\uX=(X_1... X_n)$ from $X\sim N(\mu,\sigma^2)$. Find the \mle estimator of $(\mu, \sigma^2)$.\\
\begin{itemize}
\item $\mathcal{L}(\mu, \sigma^2;\ux)=\prod_{i=1}^{n}\frac{1}{\sqrt{2 \pi \sigma^2}}\exp \bigg\{ -\frac{(x_i-\mu)^2}{2\sigma^2} \bigg\}=(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}$
\item $\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2}\ln(2 \pi \sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\mu)^2\propto -\frac{n}{2}\ln \sigma^2-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\mu)^2$
\item $\frac{d}{d\mu}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=\frac{1}{\sigma^2}\sum_{i=1}^{n}(x_i-\mu)$
\item $\frac{d}{d\sigma^2}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\mu)^2$
\end{itemize}
so solving the system
\[
\begin{cases}
\frac{d}{d\mu}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=\frac{1}{\sigma^2}\sum_{i=1}^{n}(x_i-\mu)=0
\\ \frac{d}{d\sigma^2}\ln(\mathcal{L}(\mu, \sigma^2;\ux))=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\mu)^2=0
\end{cases}
\]
we obtain $\hat \mu=\frac{1}{n} \sum_{i=1}^{n}x_i; \hat \sigma^2=\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar x_n)^2$.\\
The question now is whether $(\hat \mu , \hat \sigma^2)$ is a point of maximum or not.
To do this we could check the Hessian of the log-likelihood, but the computational work would be too heavy.\\ Another way is by reducing the dimension of the problem:\\
Note that
\[
\begin{split}
\sum_{i=1}^{n}(x_i-\mu)^2
&=\sum_{i=1}^{n}(x_i- \bar x_n +\bar x_n -\mu)^2\\
&=\sum_{i=1}^{n}(x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2 + 2 (\bar x_n -\mu)\underbrace{\sum_{i=1}^{n}(x_i-\bar x_n)}_{=0}\\
&=\sum_{i=1}^{n}(x_i-\bar x_n)^2 + n(\bar x_n-\mu)^2\\
&\geq \sum_{i=1}^{n}(x_i -\bar x_n)^2
\end{split}
\]
So, for any $\mu \neq \bar x_n$ we have:
$$\sum_{i=1}^{n}(x_i-\mu)^2>\sum_{i=1}^{n}(x_i -\bar x_n)^2$$
Hence:
$$(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\bar x_n)^2 \bigg\}\geq(2 \pi \sigma^2)^{-n/2}\exp \bigg\{-\frac{1}{2\sigma^2} \sum_{i=1}^{n} (x_i-\mu)^2 \bigg\}$$
Therefore the problem of verifying that $(\hat \mu, \hat \sigma^2)$ is a maximum is reduced to a one dimensional problem: we only need to check that $\hat \sigma^2$ actually maximizes the likelihood with $\mu$ fixed at $\bar x_n$, as sketched below.
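A quick sketch of that one-dimensional check: with $\mu$ fixed at $\bar x_n$, set
\[
\begin{split}
\ell(\sigma^2)&:=\ln(\mathcal{L}(\bar x_n, \sigma^2;\ux))=-\frac{n}{2}\ln(2\pi\sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(x_i-\bar x_n)^2\\
\ell'(\sigma^2)&=-\frac{n}{2\sigma^2}+\frac{1}{2(\sigma^2)^2}\sum_{i=1}^{n}(x_i-\bar x_n)^2
\end{split}
\]
so $\ell'(\sigma^2)>0$ for $\sigma^2<\hat\sigma^2$ and $\ell'(\sigma^2)<0$ for $\sigma^2>\hat\sigma^2$, hence $\hat\sigma^2=\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar x_n)^2$ is a global maximum and $(\hat\mu,\hat\sigma^2)$ is indeed the \mle.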
\end{eg}
\begin{oss}
In the end we can say that the invariance property is a good property from a computational point of view, but this property is also the reason why the \mle can be biased for a finite sample size.
\end{oss}
\begin{prop}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$. If the \mle $\tn$ for $\theta$ exists and is unique, then $\tn$ is a function of the sufficient statistic for $\theta$.
\end{prop}
\begin{proof}\textit{(informal)}\\
If a sufficient statistic exists then the likelihood function $\lf$ can be factorized as follows:
$$\lf =g(T(\ux);\theta)h(\ux)$$
$$\implies \ln \lf = \ln (g(T(\ux);\theta))+ \ln (h(\ux))\propto \ln (g(T(\ux);\theta)) $$
so the maximizer in $\theta$ depends on the sample only through $T(\ux)$.
\end{proof}
\begin{teo}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$ and let $\lf$ be the likelihood function. Let $\hat \theta_n$ be the \mle for $\theta$ and let $\T(\theta)$ be a continuous function of $\theta$. Under the assumption of regularity, for any $\epsilon>0$ and for any point $\theta \in \Theta$ we have that:
$$\lim_{n\to \infty}\p(|\T(\hat \theta_n)-\T(\theta)|\geq \epsilon)=0$$
I.e. the \mle are consistent in probability
\end{teo}
\begin{teo}
Let $\uX=(X_1... X_n)$ be a random sample from a parametric model $X\sim f_X(x;\theta)$ and let $\lf$ be the likelihood function. Let $\hat \theta_n$ be the \mle for $\theta$ and let $\T(\theta)$ be a differentiable function of $\theta$ with $\T'(\theta)\neq 0$. Under the assumption of regularity, for any point $\theta \in \Theta$ we have that:
$$\sqrt{n}[\T(\hat \theta_n)-\T(\theta)]\xrightarrow{w}N\bigg(0, \frac{[\T'(\theta)]^2}{\mathcal{I}_1(\theta)} \bigg)$$
where $\mathcal{I}_1(\theta)$ is the Fisher information of the parametric model.
\end{teo}
This means that the \mle are:
\begin{itemize}
\item consistent (weakly)
\item asymptotically unbiased
\item asymptotically Gaussian
\item asymptotically efficient
\end{itemize}
\begin{proof}
We will prove the result only for $\T(\theta)=\theta$. The proof for a generic $\T(\te)$ is an application of the delta method.\\
We have that $L(\theta;\ux)=\sum_{i=1}^{n}\ln (f_{X_i}(x_i;\theta))$.\\ We denote by $L'$ and $L''$ the first and the second derivative \wrt $\theta$.\\
Expand the first derivative (Taylor) around $\theta_0$
$$L'(\theta; \ux)= L'(\theta_0; \ux)+ (\theta- \theta_0)L''(\theta_0, \ux)+R$$
where $R$ is the remainder term, which is negligible in the limit and will be ignored in what follows.\\
Now substitute $\theta$ with the \mle of $\theta$, say $\hat \theta_n$; since $L'(\hat \theta_n;\ux)=0$ we get
$$0= L'(\theta_0; \ux)+ (\hat \theta_n- \theta_0)L''(\theta_0; \ux)+R$$
\[
\begin{split}
\implies \sqrt{n}(\hat \theta_n- \theta_0)&=-\sqrt{n}\frac{L'(\theta_0;\ux)}{L''(\theta_0; \ux)}\\&=\frac{\frac{1}{\sqrt{n}} L'(\theta_0;\ux)}{-\frac{1}{n}L''(\theta_0; \ux)}
\end{split}
\]
We can apply the Central Limit Theorem to the numerator and the Weak Law of Large Numbers to the denominator:\\
\begin{itemize}
\item[Numerator:] $$\frac{1}{\sqrt{n}}L'(\theta_0;\ux)=\sqrt{n}\bigg(\frac{1}{n}\sum_{i=1}^{n}W_i \bigg)$$
Where $W_i=\frac{\frac{d}{d\theta} f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}$ evaluated at $\theta=\theta_0$, with $\e[W_i]=0$ and $Var(W_i)=\mathcal{I}_1(\theta_0)$. By the Central Limit Theorem
$$\frac{1}{\sqrt{n}} L'(\theta_0;\ux) \xrightarrow{w}N(0,\mathcal{I}_1(\theta_0))$$
\item[Denominator:] $$-\frac{1}{n}L''(\theta_0;\ux)=\frac{1}{n}\sum_{i=1}^{n}W_i^2-\frac{1}{n}\sum_{i=1}^{n}\frac{\frac{d^2}{d \theta^2}f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}$$
Where $\e[\frac{1}{n}\sum_{i=1}^{n}W_i^2]=\mathcal{I}_1(\theta_0)$ and $\e\bigg[ \frac{1}{n} \sum_{i=1}^{n} \frac{\frac{d^2}{d \theta^2}f_{X_i}(x_i;\theta)}{f_{X_i}(x_i;\theta)}\bigg]=0$. Then by the Weak Law of Large Numbers
$$-\frac{1}{n}L''(\theta_0;\ux)\xrightarrow{p}\mathcal{I}_1(\theta_0)$$
\end{itemize}
So, by Slutsky's Theorem,
$$\sqrt{n}(\hat \theta_n - \theta_0 )\xrightarrow{w} \frac{X}{\mathcal{I}_1(\theta_0)}$$
Where $X\sim N(0,\mathcal{I}_1(\theta_0))$
$$\implies \sqrt{n}(\hat \theta_n - \theta_0 )\xrightarrow{w} N\bigg(0,\frac{1}{\mathcal{I}_1(\theta_0)}\bigg) $$
\end{proof}
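For instance, in the Bernoulli model (taking $\T(\theta)=\theta$ in the theorem above):
\begin{eg}
Let $(X_1... X_n)$ be a random sample from $Ber(\theta)$, whose \mle is $\hat\theta_n=\bar X_n$. Since $\mathcal{I}_1(\theta)=\frac{1}{\theta(1-\theta)}$, the theorem gives
\[
\sqrt{n}(\hat\theta_n-\theta)\xrightarrow{w}N\big(0,\theta(1-\theta)\big)
\]
which agrees with what the Central Limit Theorem gives directly for $\bar X_n$.
\end{eg}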