diff --git a/CodingAndCryptography/00_modelling_communication.tex b/CodingAndCryptography/00_modelling_communication.tex index 527c1e1..9c6c745 100644 --- a/CodingAndCryptography/00_modelling_communication.tex +++ b/CodingAndCryptography/00_modelling_communication.tex @@ -17,7 +17,7 @@ \section{Modelling communication} Noiseless coding is adapted to the source. Here is an example of noisy coding. -Each book has an ISBN $a_1 a_2 \dots a_9 a_{10}$ where the $a_1, \dots, a_9$ are digits in $\qty{0, \dots, 9}$, and $a_{10} \in \qty{0, \dots, 9, X}$ such that $11 \mid \sum_{j=1}^{10} j a_j$. +Each book has an ISBN $a_1 a_2 \dots a_9 a_{10}$ where the $a_1, \dots, a_9$ are digits in $\qty{0, \dots, 9}$, and $a_{10} \in \qty{0, \dots, 9, X}$ s.t. $11 \mid \sum_{j=1}^{10} j a_j$. This coding system detects the common human errors of writing an incorrect digit and transposing two adjacent digits. Noisy coding is adapted to the channel, which in this case is the human reading the number and typing it into a computer. @@ -68,7 +68,7 @@ \section{Modelling communication} \end{definition} \begin{definition}[Capacity] - A channel can \vocab{transmit reliably at rate $R$} if there is a sequence of codes $(C_n)_{n=1}^\infty$ with each $C_n$ a code of length $n$ such that $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. + A channel can \vocab{transmit reliably at rate $R$} if there is a sequence of codes $(C_n)_{n=1}^\infty$ with each $C_n$ a code of length $n$ s.t. $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. The \vocab{capacity} of a channel is the supremum of all reliable transmission rates. \end{definition} diff --git a/CodingAndCryptography/01_noiseless_coding.tex b/CodingAndCryptography/01_noiseless_coding.tex index 423f551..92cc6a8 100644 --- a/CodingAndCryptography/01_noiseless_coding.tex +++ b/CodingAndCryptography/01_noiseless_coding.tex @@ -120,8 +120,8 @@ \subsection{McMillan's inequality} \subsection{Entropy} \vocab{Entropy} is a measure of `randomness' or `uncertainty' in an input message. -Suppose that we have a random variable $X$ taking a finite number of values $x_1, \dots, x_n$ with probability $p_1, \dots, p_n$. -Then, the entropy of this random variable is the expected number of fair coin tosses required to determine $X$. +Suppose that we have a r.v. $X$ taking a finite number of values $x_1, \dots, x_n$ with probability $p_1, \dots, p_n$. +Then, the entropy of this r.v. is the expected number of fair coin tosses required to determine $X$. \begin{example} Suppose $p_1 = p_2 = p_3 = p_4 = \frac{1}{4}$. @@ -136,7 +136,7 @@ \subsection{Entropy} In a sense, the first example is `more random' than the second, as its entropy is higher. \begin{definition}[Entropy] - The \vocab{entropy} of a random variable $X$ taking a finite number of values $x_1, \dots, x_n$ with probabilities $p_1, \dots, p_n$ is defined to be + The \vocab{entropy} of a r.v. $X$ taking a finite number of values $x_1, \dots, x_n$ with probabilities $p_1, \dots, p_n$ is defined to be \begin{align*} H(X) = H(p_1, \dots, p_n) = -\sum_{i=1}^n p_i \log p_i = -\expect{\log p_i} \end{align*} @@ -195,7 +195,7 @@ \subsection{Gibbs' inequality} \subsection{Optimal codes} Let $\mathcal A = \qty{\mu_1, \dots, \mu_m}$ be an alphabet of $m \geq 2$ messages, and let $\mathcal B$ be an alphabet of length $a \geq 2$. -Let $X$ be a random variable taking values in $A$ with probabilities $p_1, \dots, p_m$. +Let $X$ be a r.v. 
taking values in $A$ with probabilities $p_1, \dots, p_m$. \begin{definition}[Optimal Code] A code $c \colon \mathcal A \to \mathcal B^\star$ is called \vocab{optimal} if it has the smallest possible expected word length $\sum p_i \ell_i = \expect{S}$ among all decipherable codes. @@ -350,7 +350,7 @@ \subsection{Huffman coding} If $m = 2$, then the codewords are 0 and 1, which is clearly optimal. Assume $m > 2$, and let $c_m$ be the Huffman code for $X_m$ which takes values $\mu_1, \dots, \mu_m$ with probabilities $p_1 \geq \dots \geq p_m$. - $c_m$ is constructed from a Huffman code $c_{m-1}$ with random variable $X_{m-1}$ taking values $\mu_1, \dots, \mu_{n-2}, \nu$ with probabilities $p_1, \dots, p_{m-2}, p_{m-1} + p_m$. + $c_m$ is constructed from a Huffman code $c_{m-1}$ with r.v. $X_{m-1}$ taking values $\mu_1, \dots, \mu_{n-2}, \nu$ with probabilities $p_1, \dots, p_{m-2}, p_{m-1} + p_m$. The code $c_{m-1}$ is optimal by the inductive hypothesis. The expected word length $\expect{S_m}$ is given by \begin{align*} @@ -384,16 +384,16 @@ \subsection{Huffman coding} \end{remark} \subsection{Joint entropy} -Let $X, Y$ be random variables with values in $\mathcal A, \mathcal B$. -Then, the pair $(X, Y)$ is also a random variable, taking values in $\mathcal A \times \mathcal B$. +Let $X, Y$ be r.v.s with values in $\mathcal A, \mathcal B$. +Then, the pair $(X, Y)$ is also a r.v., taking values in $\mathcal A \times \mathcal B$. This has entropy $H(X,Y)$, called the \vocab{joint entropy} for $X$ and $Y$. \begin{align*} H(X,Y) = - \sum_{x \in \mathcal A} \sum_{y \in \mathcal B} \prob{X = x, Y = y} \log \prob{X = x, Y = y} \end{align*} -This construction generalises to finite tuples of random variables. +This construction generalises to finite tuples of r.v.s. \begin{lemma} - Let $X, Y$ be random variables taking values in $\mathcal A, \mathcal B$. + Let $X, Y$ be r.v.s taking values in $\mathcal A, \mathcal B$. Then $H(X,Y) \leq H(X) + H(Y)$, with equality iff $X$ and $Y$ are independent. \end{lemma} diff --git a/CodingAndCryptography/02_noisy_channels.tex b/CodingAndCryptography/02_noisy_channels.tex index efd7540..6ee06fd 100644 --- a/CodingAndCryptography/02_noisy_channels.tex +++ b/CodingAndCryptography/02_noisy_channels.tex @@ -210,11 +210,11 @@ \subsection{Covering estimates} \end{proof} \begin{definition}[Perfect Code] - An $e$-error correcting code $C$ of length $n$ such that $\abs{C} = \frac{2^n}{V(n,e)}$ is called \vocab{perfect}. + An $e$-error correcting code $C$ of length $n$ s.t. $\abs{C} = \frac{2^n}{V(n,e)}$ is called \vocab{perfect}. \end{definition} \begin{remark} - Equivalently, a code is perfect if for all $x \in \mathbb F_2^n$, $\exists! \; c \in C$ such that $d(x,c) \leq e$. + Equivalently, a code is perfect if for all $x \in \mathbb F_2^n$, $\exists! \; c \in C$ s.t. $d(x,c) \leq e$. Alternatively, $\mathbb F_2^n$ is a union of disjoint balls $B(c,e)$ for all $c \in C$, or that any collection of $e + 1$ will cause the message to be decoded incorrectly. \end{remark} @@ -254,7 +254,7 @@ \subsection{Covering estimates} \begin{proof} Let $m = A(n,d+1)$, and let $C$ be an $[n,m,d+1]$-code. - Let $c_1, c_2 \in C$ be distinct codewords such that $d(c_1,c_2) = d+1$. + Let $c_1, c_2 \in C$ be distinct codewords s.t. $d(c_1,c_2) = d+1$. Let $c_1'$ differ from $c_1$ in exactly one of the places where $c_1$ and $c_2$ differ. Then $d(c_1', c_2) = d$. 
If $c \in C$ is any codeword not equal to $c_1$, then $d(c,c_1) \leq d(c,c_1') + d(c_1',c_1)$ hence $d + 1 \leq d(c,c_1') + 1$, so the code given by $C \cup \qty{c_1'} \setminus \qty{c_1}$ has minimum distance $d$, but has length $n$ and size $m$. @@ -265,7 +265,7 @@ \subsection{Covering estimates} Equivalently, $A(n,d) = \max \qty{m : \exists [n,m,d'] \text{-code, for some } d' \geq d}$. \end{corollary} -\begin{theorem} +\begin{theorem} ~\vspace*{-1.5\baselineskip} \begin{align*} \frac{2^n}{V(n,d-1)} \leq A(n,d) \leq \frac{2^n}{V\qty(n,\floor*{\frac{d-1}{2}})} \end{align*} @@ -285,6 +285,7 @@ \subsection{Covering estimates} \end{align*} as required. \end{proof} + \begin{example} Let $n = 10, d = 3$. Then $V(n,1) = 11$ and $V(n,2) = 56$, so the GSV bound is $\frac{2^{10}}{56} \leq A(10,3) \leq \frac{2^{10}}{11}$. @@ -305,19 +306,20 @@ \subsection{Asymptotics} \end{enumerate} where $H(\delta) = -\delta \log \delta - (1-\delta)\log (1-\delta)$. \end{proposition} + \begin{proof} - \vocab{(i) implies (ii).} + \emph{(i) implies (ii).} By the GSV bound, we find \begin{align*} A(n,\floor*{n\delta}) \geq \frac{2^n}{V(n,\floor*{n\delta} - 1)} \geq \frac{2^n}{V(n,\floor*{n\delta})} \end{align*} - Taking logarithms, + Taking logarithms and dividing by $n$, \begin{align*} \frac{1}{n}\log A(n,\floor*{n\delta}) \geq 1 - \frac{\log V(n,\floor*{n\delta})}{n} \geq 1 - H(\delta) \end{align*} - \vocab{Part (i).} + \emph{Part (i).} $H(\delta)$ is increasing for $\delta < \frac{1}{2}$. - Therefore, without loss of generality, we may assume $n\delta$ is an integer. + Therefore, wlog, we may assume $n\delta$ is an integer. Now, as $\frac{\delta}{1-\delta} < 1$, \begin{align*} 1 &= (\delta + (1-\delta))^n \\ @@ -333,37 +335,59 @@ \subsection{Asymptotics} \end{align*} as required. \end{proof} + The constant $H(\delta)$ in the proposition is optimal. + \begin{lemma} $\lim_{n \to \infty} \frac{\log V(n,\floor*{n\delta})}{n} = H(\delta)$. \end{lemma} + \begin{proof} - Exercise. - Follows from Stirling's approximation to factorials. + Wlog assume $0 < \delta < \frac{1}{2}$. + Let $0 \leq r \leq \frac{n}{2}$. + Recall $V(n, r) = \sum_{i=0}^{r} \binom{n}{i}$. + Then + \begin{align*} + \binom{n}{r} \leq V(n, r) \leq (r + 1) \binom{n}{r} \quad (\ast) + \end{align*} + Recall Stirling's formula: $\ln n! = n \ln n - n + O(\log n)$. + \begin{align*} + \ln \binom{n}{r} &= (n \ln n - n) - (r \ln r - r) \\ &- \qty((n - r) \log(n - r) - (n - r)) + O(\log n) \\ + \log \binom{n}{r} &= -r \log \frac{r}{n} - (n - r) \log \frac{n - r}{n} + O(\log n) \\ + &= n H\qty(\frac{r}{n}) + O(\log n). + \intertext{By $(\ast)$} + H\qty(\frac{r}{n}) + O\qty(\frac{\log n}{n}) &\leq \frac{\log V(n, r)}{n} \leq H\qty(\frac{r}{n}) + O\qty(\frac{\log n}{n}) \\ + \lim\limits_{n \to \infty} \frac{\log V(n \floor{n \delta})}{n} &= H(\delta) + \end{align*} \end{proof} \subsection{Constructing new codes from old} Let $C$ be an $[n,m,d]$-code. -\begin{example} + +\begin{example}[Parity Check Extension] The \vocab{parity check extension} is an $[n+1,m,d']$-code given by \begin{align*} - C^+ = \qty{\qty(c_1, \dots, c_n, \sum_{i=1}^n c_i) \midd (c_1, \dots, c_n) \in C} + C^+ = \qty{\qty(c_1, \dots, c_n, \sum_{i=1}^n c_i \mod 2) : (c_1, \dots, c_n) \in C} \end{align*} where $d'$ is either $d$ or $d + 1$, depending on whether $d$ is odd or even. \end{example} -\begin{example} + +\begin{example}[Punctured Code] Let $1 \leq i \leq n$. 
Then, deleting the $i$th digit from each codeword gives the \vocab{punctured code} \begin{align*} - C^- = \qty{(c_1, \dots, c_{i-1}, c_{i+1}, \dots, c_n) \midd (c_1, \dots, c_n) \in C} + C^- = \qty{(c_1, \dots, c_{i-1}, c_{i+1}, \dots, c_n) : (c_1, \dots, c_n) \in C} \end{align*} If $d \geq 2$, this is an $[n-1, m, d']$-code where $d'$ is either $d$ or $d - 1$. \end{example} -\begin{example} - Let $1 \leq i \leq n$ and let $\alpha \in \mathbb F_2$. + +\begin{example}[Shortened Code] + Fix $1 \leq i \leq n$ and $\alpha \in \mathbb F_2$. The \vocab{shortened code} is \begin{align*} - C' = \qty{(c_1, \dots, c_{i-1}, c_{i+1}, \dots, c_n) \midd (c_1, \dots, c_{i-1}, \alpha, c_{i+1}, \dots, c_n) \in C} + C' = \qty{(c_1, \dots, c_{i-1}, c_{i+1}, \dots, c_n) : (c_1, \dots, c_{i-1}, \alpha, c_{i+1}, \dots, c_n) \in C} \end{align*} This is an $[n-1,m',d']$ with $d' \geq d$ and $m' \geq \frac{m}{2}$ for a suitable choice of $\alpha$. + + Note that puncturing and shortenings are not the same thing. \end{example} diff --git a/CodingAndCryptography/03_information_theory.tex b/CodingAndCryptography/03_information_theory.tex index ebfb647..8d845af 100644 --- a/CodingAndCryptography/03_information_theory.tex +++ b/CodingAndCryptography/03_information_theory.tex @@ -1,34 +1,40 @@ \section{Information theory} \subsection{Sources and information rate} -\begin{definition} - A \vocab{source} is a sequence of random variables $X_1, X_2, \dots$ taking values in $\mathcal A$. +\begin{definition}[Source] + A \vocab{source} is a sequence of r.v.s $X_1, X_2, \dots$ taking values in the alphabet $\mathcal A$. \end{definition} -\begin{example} - The \vocab{Bernoulli} (or \vocab{memoryless}) source is a source where the $X_i$ are independent and identically distributed according to a Bernoulli distribution. + +\begin{example}[Bernoulli Source] + The \vocab{Bernoulli} (or \vocab{memoryless}) source is a source where the $X_i$ are iid Bernoulli's. \end{example} -\begin{definition} - A source $X_1, X_2, \dots$ is \vocab{reliably encodable} at rate $r$ if there exist subsets $A_n \subseteq \mathcal A^n$ such that + +\begin{definition}[Reliably Encodable] + A source $X_1, X_2, \dots$ is \vocab{reliably encodable} at rate $r$ if $\exists$ subsets $A_n \subseteq \mathcal A^n$ s.t. \begin{enumerate} - \item $\lim \frac{\log \abs{A_n}}{n} = r$; - \item $\lim \prob{(X_1, \dots, X_n) \in A_n} = 1$. + \item $\lim_{n \to \infty} \frac{\log \abs{A_n}}{n} = r$; + \item $\lim_{n \to \infty} \prob{(X_1, \dots, X_n) \in A_n} = 1$. \end{enumerate} \end{definition} -\begin{definition} + +\begin{definition}[Information Rate] The \vocab{information rate} $H$ of a source is the infimum of all reliable encoding rates. \end{definition} + \begin{example} $0 \leq H \leq \log\abs{\mathcal A}$, with both bounds attainable. The proof is left as an exercise. \end{example} + Shannon's first coding theorem computes the information rate of certain sources, including Bernoulli sources. -Recall from IA Probability that a probability space is a tuple $(\Omega, \mathcal F, \mathbb P)$, and a discrete random variable is a function $X \colon \Omega \to \mathcal A$. +Recall from IA Probability that a probability space is a tuple $(\Omega, \mathcal F, \mathbb P)$, and a discrete r.v. is a function $X \colon \Omega \to \mathcal A$. The probability mass function is the function $p_X \colon \mathcal A \to [0,1]$ given by $p_X(x) = \prob{X = x}$. 
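+For example, for a Bernoulli source with $\mathcal A = \qty{0, 1}$ and $\prob{X_i = 1} = p$, each $X_i$ has probability mass function $p_{X_i}(1) = p$ and $p_{X_i}(0) = 1 - p$.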
-We can consider the function $p(X) \colon \Omega \to [0,1]$ defined by the composition $p_X \circ X$, which assigns $p(X)(\omega) = \prob{X = X(\omega)}$; hence, $p(X)$ is also a random variable. +We can consider the function $p(X) \colon \Omega \to [0,1]$ defined by the composition $p_X \circ X$, which assigns $p(X)(\omega) = \prob{X = X(\omega)}$; hence, $p(X)$ is also a r.v.. -Similarly, given a source $X_1, X_2, \dots$ of random variables with values in $\mathcal A$, the probability mass function of any tuple $X^{(n)} = (X_1, \dots, X_n)$ is $p_{X^{(n)}}(x_1, \dots, x_n) = \prob{X_1 = x_1, \dots, X_n = x_n}$. +Similarly, given a source $X_1, X_2, \dots$ of r.v.s with values in $\mathcal A$, the probability mass function of any tuple $X^{(n)} = (X_1, \dots, X_n)$ is $p_{X^{(n)}}(x_1, \dots, x_n) = \prob{X_1 = x_1, \dots, X_n = x_n}$. As $p_{X^{(n)}} \colon \mathcal A^n \to [0,1]$, and $X^{(n)} \colon \Omega \to \mathcal A^n$, we can consider $p(X^{(n)}) = p_{X^{(n)}} \circ X^{(n)}$ defined by $\omega \mapsto p_{X^{(n)}}(X^{(n)}(\omega))$. + \begin{example} Let $\mathcal A = \qty{A, B, C}$. Suppose @@ -54,26 +60,30 @@ \subsection{Sources and information rate} \end{cases} \end{align*} \end{example} -We say that a source $X_1,X_2, \dots$ converges in probability to a random variable $L$ if for all $\varepsilon > 0$, $\lim_{n \to \infty} \prob{\abs{X_n - L} > \varepsilon} = 0$. + +We say that a source $X_1,X_2, \dots$ converges in probability to a r.v. $L$ if $\forall \; \varepsilon > 0$, $\lim_{n \to \infty} \prob{\abs{X_n - L} > \varepsilon} = 0$. We write $X_n \xrightarrow{\mathbb P} L$. -The weak law of large numbers states that if $X_1, X_2, \dots$ is a sequence of independent identically distributed real-valued random variables with finite expectation $\expect{X_1}$, then $\frac{1}{n} \sum_{i=1}^n X_i \xrightarrow{\mathbb P} \expect{X}$. +The weak law of large numbers states that if $X_1, X_2, \dots$ are iid real-valued r.v.s with finite $\expect{X_1}$, then $\frac{1}{n} \sum_{i=1}^n X_i \xrightarrow{\mathbb P} \expect{X}$. + \begin{example} Let $X_1, X_2, \dots$ be a Bernoulli source. - Then $p(X_1), p(X_2), \dots$ are independent and identically distributed random variables, and $p(X_1, \dots, X_n) = p(X_1) \dots p(X_n)$. - Note that by the weak law of large numbers, + Then $p(X_1), p(X_2), \dots$ are iid r.v.s, and $p(X_1, \dots, X_n) = p(X_1) \dots p(X_n)$. + Note that by the WLLN, \begin{align*} -\frac{1}{n} \log p(X_1, \dots, X_n) = -\frac{1}{n} \sum_{i=1}^n \log p(X_i) \xrightarrow{\mathbb P} \expect{-\log p(X_1)} = H(X_1) \end{align*} \end{example} + \begin{lemma} The information rate of a Bernoulli source $X_1, X_2, \dots$ is at most the expected word length of an optimal code $c \colon \mathcal A \to \qty{0,1}^\star$ for $X_1$. \end{lemma} + \begin{proof} Let $\ell_1, \ell_2, \dots$ be the codeword lengths when we encode $X_1, X_2, \dots$ using $c$. Let $\varepsilon > 0$. Let \begin{align*} - A_n = \qty{x \in \mathcal A^n \mid c^\star(x) \text{ has length less than } n \qty(\expect{\ell_1} + \varepsilon)} + A_n = \qty{x \in \mathcal A^n : c^\star(x) \text{ has length less than } n \qty(\expect{\ell_1} + \varepsilon)} \end{align*} Then, \begin{align*} @@ -93,7 +103,7 @@ \subsection{Sources and information rate} Combine the previous lemma with the noiseless coding theorem. \end{proof} Suppose we encode $X_1, X_2, \dots$ in blocks of size $N$. -Let $Y_1 = (X_1, \dots, X_N), Y_2 = (X_{N+1}, \dots, X_{2N})$ and so on, such that $Y_1, Y_2, \dots$ take values in $\mathcal A^N$. 
+Let $Y_1 = (X_1, \dots, X_N), Y_2 = (X_{N+1}, \dots, X_{2N})$ and so on, s.t. $Y_1, Y_2, \dots$ take values in $\mathcal A^N$. One can show that if the source $X_1, X_2, \dots$ has information rate $H$, then $Y_1, Y_2, \dots$ has information rate $NH$. \begin{proposition} The information rate $H$ of a Bernoulli source is at most $H(X_1)$. @@ -101,18 +111,19 @@ \subsection{Sources and information rate} \begin{proof} Apply the previous corollary to the $Y_i$ to obtain \begin{align*} - NH < H(Y_1) + 1 = H(X_1, \dots, X_N) + 1 = NH(X_1) + 1 \implies H < H(X_1) + \frac{1}{N} + NH < H(Y_1) + 1 = H(X_1, \dots, X_N) + 1 = NH(X_1) + 1 \implies H < H(X_1) + \frac{1}{N}. \end{align*} - as required. + But $N > 1$ is arbitrary so can take limit. \end{proof} \subsection{Asymptotic equipartition property} -\begin{definition} - A source $X_1, X_2, \dots$ satisfies the \vocab{asymptotic equipartition property} if there exists a constant $H \geq 0$ such that +\begin{definition}[Asymptotic Equipartition Property (AEP)] + A source $X_1, X_2, \dots$ satisfies the \vocab{asymptotic equipartition property} if $\exists$ a constant $H \geq 0$ s.t. \begin{align*} -\frac{1}{n} \log p(X_1, \dots, X_n) \xrightarrow{\mathbb P} H \end{align*} \end{definition} + \begin{example} Suppose we toss a biased coin with probability $p$ of obtaining a head. Let $X_1, X_2, \dots$ be the results of independent coin tosses. @@ -124,18 +135,21 @@ \subsection{Asymptotic equipartition property} Not every sequence of tosses is of this form, but there is only a small probability of `atypical sequences'. With high probability, it is a `typical sequence' which has a probability close to $2^{-NH(X)}$. \end{example} + \begin{lemma} - The asymptotic equipartition property for a source $X_1, X_2, \dots$ is equivalent to the property that for all $\varepsilon > 0$, there exists $n \in \mathbb N$ such that for all $n \geq n_0$, there exists a `typical set' $T_n \subseteq \mathcal A^n$ such that + The AEP for a source $X_1, X_2, \dots$ is equivalent to the property that $\forall \; \varepsilon > 0 \ \exists \; n_0 \in \mathbb N$ s.t. $\forall \; n \geq n_0$, $\exists \;$ `typical set' $T_n \subseteq \mathcal A^n$ s.t. \begin{enumerate} \item $\prob{(X_1, \dots, X_n) \in T_n} > 1 - \varepsilon$; - \item $2^{-n(H+\varepsilon)} \leq p(x_1, \dots, x_n) \leq 2^{-n(H-\varepsilon)}$ for all $(x_1, \dots, x_n) \in T_n$. + \item $2^{-n(H+\varepsilon)} \leq p(x_1, \dots, x_n) \leq 2^{-n(H-\varepsilon)} \quad \forall \; (x_1, \dots, x_n) \in T_n$. \end{enumerate} \end{lemma} -\begin{proof}[Proof sketch] - First, we show that the asymptotic equipartition property implies the alternative definition. + +\begin{proof}[Proof sketch (Not Lectured)] + First, we show that the AEP implies the alternative definition. We define \begin{align*} - T_n = \qty{(x_1, \dots, x_n) \midd \abs{-\frac{1}{n} \log p(x_1, \dots, x_n) - H} \leq \varepsilon} = \qty{(x_1, \dots, x_n) \mid \text{condition (ii) holds}} + T_n &= \qty{(x_1, \dots, x_n) \midd \abs{-\frac{1}{n} \log p(x_1, \dots, x_n) - H} \leq \varepsilon} \\ + &= \qty{(x_1, \dots, x_n) \mid \text{condition (ii) holds}} \end{align*} For the converse, \begin{align*} @@ -144,10 +158,11 @@ \subsection{Asymptotic equipartition property} \end{proof} \subsection{Shannon's first coding theorem} -\begin{theorem} - Let $X_1, X_2, \dots$ be a source satisfying the asymptotic equipartition property with constant $H$. 
+\begin{theorem}[Shannon's First Coding Theorem] + Let $X_1, X_2, \dots$ be a source satisfying AEP with constant $H$. Then this source has information rate $H$. \end{theorem} + \begin{proof} Let $\varepsilon > 0$, and let $T_n \subseteq \mathcal A^n$ be typical sets. Then, for all $n \geq n_0(\varepsilon)$, for all $(x_1, \dots, x_n) \in T_n$ we have $p(x_1, \dots, x_n) \geq 2^{-n(H + \varepsilon)}$. @@ -165,19 +180,22 @@ \subsection{Shannon's first coding theorem} But $\prob{T_n} \leq \prob{A_n \cap T_n} + \prob{\mathcal A^n \setminus A_n} \to 0 + 0$, contradicting typicality. So we cannot reliably encode at rate $H - \varepsilon$, so the information rate is at least $H$. \end{proof} + \begin{corollary} A Bernoulli source $X_1, X_2, \dots$ has information rate $H(X_1)$. \end{corollary} + \begin{proof} - In a previous example we showed that for a Bernoulli source, $-\frac{1}{n} \log p(X_1, \dots, X_n) \xrightarrow{\mathbb P} H(X_1)$. - So the asymptotic equipartition property holds with $H = H(X_1)$, giving the result by Shannon's first coding theorem. + In a previous example we showed that for a Bernoulli source, \\ $-\frac{1}{n} \log p(X_1, \dots, X_n) \xrightarrow{\mathbb P} H(X_1)$. + So the AEP holds with $H = H(X_1)$, giving the result by Shannon's first coding theorem. \end{proof} + \begin{remark} - The asymptotic equipartition property is useful for noiseless coding. + The AEP is useful for noiseless coding. We can encode the typical sequences using a block code, and encode the atypical sequences arbitrarily. Many sources, which are not necessarily Bernoulli, also satisfy the property. - Under suitable hypotheses, the sequence $\frac{1}{n} H(X_1, \dots, X_n)$ is decreasing, and the asymptotic equipartition property is satisfied with constant $H = \lim_{n \to \infty} \frac{1}{n} H(X_1, \dots, X_n)$. + Under suitable hypotheses, the sequence $\frac{1}{n} H(X_1, \dots, X_n)$ is decreasing, and the AEP is satisfied with constant $H = \lim_{n \to \infty} \frac{1}{n} H(X_1, \dots, X_n)$. \end{remark} \subsection{Capacity} @@ -189,7 +207,7 @@ \subsection{Capacity} \hat e(C) = \max_{c \in C} \prob{\text{error} \mid c \text{ sent}} \end{align*} The \vocab{information rate} is $\rho(C) = \frac{\log \abs{C}}{n}$. -A channel can \vocab{transmit reliably at rate $R$} if there exist codes $C_1, C_2, \dots$ where $C_n$ has length $n$ such that $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. +A channel can \vocab{transmit reliably at rate $R$} if there exist codes $C_1, C_2, \dots$ where $C_n$ has length $n$ s.t. $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. The \vocab{(operational) capacity} of a channel is the supremum of all rates at which it can transmit reliably. Suppose we are given a source with information rate $r$ bits per second that emits symbols at a rate of $s$ symbols per second. @@ -202,7 +220,7 @@ \subsection{Capacity} A binary symmetric channel with error probability $p < \frac{1}{4}$ has nonzero capacity. \end{proposition} \begin{proof} - Let $\delta$ be such that $2p < \delta < \frac{1}{2}$. + Let $\delta$ be s.t. $2p < \delta < \frac{1}{2}$. We claim that we can reliably transmit at rate $R = 1 - H(\delta) > 0$. Let $C_n$ be a code of length $n$, and suppose it has minimum distance $\floor*{n\delta}$ of maximal size. 
Then, by the GSV bound, @@ -215,7 +233,7 @@ \subsection{Capacity} \hat e(C_n) &\leq \prob{\text{in $n$ uses, the channel makes at least } \floor*{\frac{\floor*{n\delta}-1}{2}} \text{ errors}} \\ &\leq \prob{\text{in $n$ uses, the channel makes at least } \floor*{\frac{n\delta - 1}{2}} \text{ errors}} \end{align*} - Let $\varepsilon > 0$ be such that $p + \varepsilon < \frac{\delta}{2}$. + Let $\varepsilon > 0$ be s.t. $p + \varepsilon < \frac{\delta}{2}$. Then, for $n$ sufficiently large, $\frac{n\delta - 1}{2} = n\qty(\frac{\delta}{2} - \frac{1}{2n}) > n(p + \varepsilon)$. Hence, $\hat e(C_n) \leq \prob{\text{in $n$ uses, the channel makes at least } n(p+\varepsilon) \text{ errors}}$. We show that this value converges to zero as $n \to \infty$ using the next lemma. @@ -229,8 +247,8 @@ \subsection{Capacity} \end{align*} \end{lemma} \begin{proof} - Consider random variables $U_i = 1\qty[\text{the $i$th digit is mistransmitted}]$. - The $U_i$ are independent and identically distributed with $\prob{U_i = 1} = p$. + Consider r.v.s $U_i = 1\qty[\text{the $i$th digit is mistransmitted}]$. + The $U_i$ are iid with $\prob{U_i = 1} = p$. In particular, $\expect{U_i} = p$. Therefore, the probability that the channel makes at least $n(p + \varepsilon)$ errors is \begin{align*} @@ -241,7 +259,7 @@ \subsection{Capacity} \subsection{Conditional entropy} \begin{definition} - Let $X, Y$ be random variables taking values in alphabets $\mathcal A, \mathcal B$ respectively. + Let $X, Y$ be r.v.s taking values in alphabets $\mathcal A, \mathcal B$ respectively. Then, the \vocab{conditional entropy} is defined by \begin{align*} H(X \mid Y = y) = - \sum_{x \in \mathcal A} \prob{X = x \mid Y = y} \log \prob{X = x \mid Y = y} @@ -268,7 +286,7 @@ \subsection{Conditional entropy} \end{align*} \end{proof} \begin{example} - Let $X$ be a uniform random variable on $\qty{1, \dots, 6}$ modelling a dice roll, and $Y$ is defined to be zero if $X$ is even, and one if $X$ is odd. + Let $X$ be a uniform r.v. on $\qty{1, \dots, 6}$ modelling a dice roll, and $Y$ is defined to be zero if $X$ is even, and one if $X$ is odd. Then, $H(X,Y) = H(X) = \log 6$ and $H(Y) = \log 2$. Therefore, $H(X \mid Y) = \log 3$ and $H(Y \mid X) = 0$. \end{example} @@ -278,11 +296,11 @@ \subsection{Conditional entropy} \begin{proof} Combine this result with the fact that $H(X,Y) \leq H(X) + H(Y)$ where equality holds iff $H(X), H(Y)$ are independent. \end{proof} -Now, replace random variables $X$ and $Y$ with random vectors $X^{(r)} = (X_1, \dots, X_r)$ and $Y^{(s)} = (Y_1, \dots, Y_s)$. +Now, replace r.v.s $X$ and $Y$ with random vectors $X^{(r)} = (X_1, \dots, X_r)$ and $Y^{(s)} = (Y_1, \dots, Y_s)$. Similarly, we can define $H(X_1, \dots, X_r \mid Y_1, \dots, Y_s) = H(X^{(r)} \mid Y^{(s)})$. Note that $H(X,Y\mid Z)$ is the entropy of $X$ and $Y$ combined, given the value of $Z$, and is not the entropy of $X$, together with $Y$ given $Z$. \begin{lemma} - Let $X, Y, Z$ be random variables. + Let $X, Y, Z$ be r.v.s. Then, $H(X \mid Y) \leq H(X \mid Y, Z) + H(Z)$. \end{lemma} \begin{proof} @@ -296,7 +314,7 @@ \subsection{Conditional entropy} \end{align*} \end{proof} \begin{proposition}[Fano's inequality] - Let $X, Y$ be random variables taking values in $\mathcal A$. + Let $X, Y$ be r.v.s taking values in $\mathcal A$. Let $\abs{\mathcal A} = m$, and let $p = \prob{X \neq Y}$. Then $H(X \mid Y) \leq H(p) + p \log(m-1)$. \end{proposition} @@ -317,12 +335,12 @@ \subsection{Conditional entropy} \end{align*} as required. 
\end{proof} -Let $X$ be a random variable describing the input to a channel and $Y$ be a random variable describing the output of the channel. +Let $X$ be a r.v. describing the input to a channel and $Y$ be a r.v. describing the output of the channel. $H(p)$ provides the information required to decide whether an error has occurred, and $p\log(m-1)$ gives the information needed to resolve that error in the worst possible case. \subsection{Shannon's second coding theorem} \begin{definition} - Let $X, Y$ be random variables taking values in $\mathcal A$. + Let $X, Y$ be r.v.s taking values in $\mathcal A$. The \vocab{mutual information} is $I(X;Y) = H(X) - H(X \mid Y)$. \end{definition} This is nonnegative, as $I(X;Y) = H(X) + H(Y) - H(X,Y) \geq 0$. @@ -330,11 +348,11 @@ \subsection{Shannon's second coding theorem} Clearly, $I(X;Y) = I(Y;X)$. \begin{definition} Consider a discrete memoryless channel with input alphabet $\mathcal A$ of size $m$ and output alphabet $\mathcal B$. - Let $X$ be a random variable taking values in $\mathcal A$, used as the input to this channel. - Let $Y$ be the random variable output by the channel, depending on $X$ and the channel matrix. + Let $X$ be a r.v. taking values in $\mathcal A$, used as the input to this channel. + Let $Y$ be the r.v. output by the channel, depending on $X$ and the channel matrix. The \vocab{information capacity} of the channel is $\max_{X} I(X;Y)$. \end{definition} -The maximum is taken over all discrete random variables $X$ taking values in $\mathcal A$, or equivalently. +The maximum is taken over all discrete r.v.s $X$ taking values in $\mathcal A$, or equivalently. This maximum is attained since $I$ is continuous and the space \begin{align*} \qty{(p_1, \dots, p_m) \in \mathbb R^m \midd p_i \geq 0, \sum_{i=1}^m p_i = 1} @@ -395,7 +413,7 @@ \subsection{Shannon's second coding theorem} &= \sum_{i=1}^n \qty[H(Y_i) - H(Y_i \mid X_i)] \\ &= \sum_{i=1}^n I(X_i;Y_i) \leq nC \end{align*} - Equality is attained by taking $X_1, \dots, X_n$ independent and identically distributed such that $I(X_i;Y_i) = C$. + Equality is attained by taking $X_1, \dots, X_n$ iid s.t. $I(X_i;Y_i) = C$. Indeed, if $X_1, \dots, X_n$ are independent, then so are $Y_1, \dots, Y_n$, so $H(Y_1, \dots, Y_n) = \sum_{i=1}^n H(Y_i)$. Therefore, \begin{align*} @@ -407,14 +425,14 @@ \subsection{Shannon's second coding theorem} \begin{proof} Let $C$ be the information capacity. Suppose reliable transmission is possible at a rate $R > C$. - Then, there is a sequence of codes $(C_n)_{n \geq 1}$ where $C_n$ has length $n$ and size $\floor*{2^{nR}}$, such that $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. + Then, there is a sequence of codes $(C_n)_{n \geq 1}$ where $C_n$ has length $n$ and size $\floor*{2^{nR}}$, s.t. $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} \hat e(C_n) = 0$. Recall that $\hat e(C_n) = \max_{c \in C_n} \prob{\text{error} \mid c \text{ sent}}$. Define the \vocab{average error rate} $e(C)$ by $e(C) = \frac{1}{\abs{C_n}} \sum_{c \in C} \prob{\text{error} \mid c \text{ sent}}$. Note that $e(C_n) \leq \hat e(C_n)$. As $\hat e(C_n) \to 0$, we also have $e(C_n) \to 0$. - Consider an input random variable $X$ distributed uniformly over $C_n$. + Consider an input r.v. $X$ distributed uniformly over $C_n$. Let $Y$ be the output given by $X$ and the channel matrix. Then $e(C_n) = \prob{X \neq Y} = p$. Hence, $H(X) = \log \abs{C_n} = \log \floor*{2^{nR}} \geq nR - 1$ for sufficiently large $n$. 
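+The following quick sketch records the standard computation of the information capacity of the binary symmetric channel; it motivates the bound $1 - H(p)$ used in the remainder of this subsection.
+\begin{example}
+  Consider a binary symmetric channel with error probability $p$, and let the input satisfy $\prob{X = 1} = \alpha$.
+  Since $H(Y \mid X = x) = H(p)$ for each $x \in \qty{0, 1}$, we have $H(Y \mid X) = H(p)$.
+  Moreover $\prob{Y = 1} = \alpha(1-p) + (1-\alpha)p$, so using $I(X;Y) = I(Y;X) = H(Y) - H(Y \mid X)$,
+  \begin{align*}
+    I(X;Y) = H\qty(\alpha(1-p) + (1-\alpha)p) - H(p) \leq 1 - H(p)
+  \end{align*}
+  with equality when $\alpha = \frac{1}{2}$.
+  Hence the information capacity of the binary symmetric channel is $1 - H(p)$.
+\end{example}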
@@ -432,7 +450,7 @@ \subsection{Shannon's second coding theorem} To complete the proof of Shannon's second coding theorem for the binary symmetric channel with error probability $p$, we prove that the operational capacity is at least $1 - H(p)$. \begin{proposition} Consider a binary symmetric channel with error probability $p$, and let $R < 1 - H(p)$. - Then there exists a sequence of codes $(C_n)_{n \geq 1}$ with $C_n$ of length $n$ and size $\floor*{2^{nR}}$ such that $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} e(C_n) = 0$. + Then there exists a sequence of codes $(C_n)_{n \geq 1}$ with $C_n$ of length $n$ and size $\floor*{2^{nR}}$ s.t. $\lim_{n \to \infty} \rho(C_n) = R$ and $\lim_{n \to \infty} e(C_n) = 0$. \end{proposition} \begin{remark} This proposition deals with the average error rate, instead of the error rate $\hat e$. @@ -440,13 +458,13 @@ \subsection{Shannon's second coding theorem} \begin{proof} We use the method of random coding. Without loss of generality let $p < \frac{1}{2}$. - Let $\varepsilon > 0$ such that $p + \varepsilon < \frac{1}{2}$ and $R < 1 - H(p + \varepsilon)$. + Let $\varepsilon > 0$ s.t. $p + \varepsilon < \frac{1}{2}$ and $R < 1 - H(p + \varepsilon)$. We use minimum distance decoding, and in the case of a tie, we make an arbitrary choice. Let $m = \floor*{2^{nR}}$, and let $C = \qty{c_1, \dots, c_m}$ be a code chosen uniformly at random from $\mathcal C = \qty{[n,m]\text{-codes}}$, a set of size $\binom{2^n}{m}$. Choose $1 \leq i \leq m$ uniformly at random, and send $c_i$ through the channel, and obtain an output $Y$. Then, $\prob{Y \text{ not decoded as } c_i}$ is the average value of $e(C)$ for $C$ ranging over $\mathcal C$, giving $\frac{1}{\abs{\mathcal C}} \sum_{C \in \mathcal C} e(C)$. - We can choose a code $C_n \in \mathcal C$ such that $e(C_n) \leq \frac{1}{\abs{\mathcal C}}\sum_{C \in \mathcal C} e(C)$. + We can choose a code $C_n \in \mathcal C$ s.t. $e(C_n) \leq \frac{1}{\abs{\mathcal C}}\sum_{C \in \mathcal C} e(C)$. So it suffices to show $\prob{Y \text{ not decoded as } c_i} \to 0$. Let $r = \floor*{n(p + \varepsilon)}$. @@ -479,7 +497,7 @@ \subsection{Shannon's second coding theorem} We can replace $e$ with $\hat e$ in the previous result. \end{proposition} \begin{proof} - Let $R'$ be such that $R < R' < 1 - H(p)$. + Let $R'$ be s.t. $R < R' < 1 - H(p)$. Then, apply the previous result to $R'$ to construct a sequence of codes $(C_n')_{n \geq 1}$ of length $n$ and size $\floor*{2^{nR'}}$, where $e(C_n') \to 0$. Order the codewords of $C_n'$ by the probability of error given that the codeword was sent, and delete the worst half. This gives a code $C_n$ with $\hat e(C_n) \leq 2 e(C_n')$. @@ -506,8 +524,8 @@ \subsection{The Kelly criterion} X_n(1-w) & (n + 1)\text{th toss is a tail} \end{cases} \end{align*} -Define $Y_{n+1} = \frac{X_{n+1}}{X_n}$, then the $Y_i$ are independent and identically distributed. -Then $\log Y_i$ is a sequence of independent and identically distributed random variables. +Define $Y_{n+1} = \frac{X_{n+1}}{X_n}$, then the $Y_i$ are iid. +Then $\log Y_i$ is a sequence of iid r.v.s. Note that $\log X_n = \sum_{i=1}^n \log Y_i$. \begin{lemma} Let $\mu = \expect{\log Y_1}, \sigma^2 = \Var{\log Y_1}$. 
@@ -515,7 +533,7 @@ \subsection{The Kelly criterion} \begin{enumerate} \item $\prob{\abs{\frac{1}{n} \sum_{i=1}^n \log Y_i - \mu} \geq a} \leq \frac{\sigma^2}{na^2}$ by Chebyshev's inequality; \item $\prob{\abs{\frac{\log X_n}{n} - \mu} \geq a} \leq \frac{\sigma^2}{na^2}$; - \item given $\varepsilon > 0$ and $\delta > 0$, there exists $N$ such that $\prob{\abs{\frac{\log X_n}{n} - \mu} \geq \delta} \leq \varepsilon$ for all $n \geq N$. + \item given $\varepsilon > 0$ and $\delta > 0$, there exists $N$ s.t. $\prob{\abs{\frac{\log X_n}{n} - \mu} \geq \delta} \leq \varepsilon$ for all $n \geq N$. \end{enumerate} \end{lemma} Consider a single coin toss, with probability $p < 1$ of a head. diff --git a/CodingAndCryptography/04_algebraic_coding_theory.tex b/CodingAndCryptography/04_algebraic_coding_theory.tex index 83b028d..1697d0a 100644 --- a/CodingAndCryptography/04_algebraic_coding_theory.tex +++ b/CodingAndCryptography/04_algebraic_coding_theory.tex @@ -30,7 +30,7 @@ \subsection{Linear codes} Define $x \cdot y = \sum_{i=1}^n x_i y_i \in \mathbb F_2$. This is symmetric and bilinear. \end{definition} -There are nonzero $x$ such that $x \cdot x = 0$. +There are nonzero $x$ s.t. $x \cdot x = 0$. \begin{definition} Let $P \subseteq \mathbb F_2^n$. The \vocab{parity check code} defined by $P$ is @@ -181,7 +181,7 @@ \subsection{Hamming codes} Any two columns of $H$ are linearly independent, but there are three linearly dependent columns. Hence, $d(C) = 3$. Hence, $C$ is $\floor*{\frac{3-1}{2}} = 1$-error correcting. - A perfect code is one such that $\abs{C} = \frac{2^n}{V(n,e)}$. + A perfect code is one s.t. $\abs{C} = \frac{2^n}{V(n,e)}$. In this case, $n = 2^d - 1$ and $e = 1$, so $\frac{2^n}{1 + 2^d - 1} = 2^{n-d} = \abs{C}$ as required. \end{proof} @@ -319,7 +319,7 @@ \subsection{Cyclic codes} Since $\mathbb F_2[X]$ is a principal ideal domain, these ideals correspond to polynomials $g(X) \in \mathbb F_2[X]$ dividing $X^n - 1$. \begin{theorem} Let $C \trianglelefteq \faktor{\mathbb F_2[X]}{(X^n - 1)}$ be a cyclic code. - Then, there exists a unique \vocab{generator} polynomial $g(X) \in \mathbb F_2[X]$ such that + Then, there exists a unique \vocab{generator} polynomial $g(X) \in \mathbb F_2[X]$ s.t. \begin{enumerate} \item $C = (g)$; \item $g(X) \mid X^n - 1$. @@ -365,7 +365,7 @@ \subsection{Cyclic codes} \end{corollary} \begin{definition} Let $g$ be a generator for $C$. - The \vocab{parity check polynomial} is the polynomial $h$ such that $g(X) h(X) = X^n - 1$. + The \vocab{parity check polynomial} is the polynomial $h$ s.t. $g(X) h(X) = X^n - 1$. \end{definition} \begin{corollary} Writing $h(X) = b_0 + b_1 X + \dots + b_{n-k} X^{n-k}$, the parity check matrix is @@ -402,11 +402,11 @@ \subsection{BCH codes} If $q = p^\alpha$ is a prime power where $\alpha \geq 1$, there exists a unique field $\mathbb F_q$ of order $q$, up to isomorphism. Note that $\mathbb F_q \not\simeq \faktor{\mathbb Z}{q\mathbb Z}$ if $\alpha > 1$. -The multiplicative group $\mathbb F_q^\times$ is cyclic; there exists $\beta \in \mathbb F_q$ such that $\mathbb F_q^\times = \genset{\beta} = \qty{1, \beta, \dots, \beta^{q-2}}$. +The multiplicative group $\mathbb F_q^\times$ is cyclic; there exists $\beta \in \mathbb F_q$ s.t. $\mathbb F_q^\times = \genset{\beta} = \qty{1, \beta, \dots, \beta^{q-2}}$. Such a $\beta$ is called a \vocab{primitive element}. % BCH codes are a particular type of cyclic code. 
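+The following worked example illustrates, for concreteness, the correspondence between cyclic codes and divisors of $X^n - 1$; it reappears as the simplest interesting instance of the BCH construction described below.
+\begin{example}
+  Over $\mathbb F_2$ we have the factorisation into irreducible polynomials
+  \begin{align*}
+    X^7 - 1 = (X + 1)(X^3 + X + 1)(X^3 + X^2 + 1).
+  \end{align*}
+  Taking the generator $g(X) = X^3 + X + 1$ gives a cyclic code of length $7$ with $2^{7-3} = 16$ codewords.
+  One can check that its minimum distance is $3$, so it is a perfect $1$-error correcting code, equivalent to the Hamming code of length $7$.
+  The other divisors of $X^7 - 1$ give the remaining cyclic codes of length $7$.
+\end{example}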
-Let $n$ be an odd integer, and let $r \geq 1$ such that $2^r \equiv 1$ mod $n$, which always exists as $2$ is coprime to $n$. +Let $n$ be an odd integer, and let $r \geq 1$ s.t. $2^r \equiv 1$ mod $n$, which always exists as $2$ is coprime to $n$. Let $K = \mathbb F_{2^r}$, and define $\bm \mu_n(K) = \qty{x \in K \mid x^n = 1} \leq K^\times$, which is a cyclic group. Since $n \mid (2^r - 1) = \abs{K^\times}$, $\bm \mu_n(K)$ is the cyclic group of order $n$. Hence, $\bm \mu_n(K) = \qty{1, \alpha, \alpha^2, \dots, \alpha^{n-1}}$ for some primitive $n$th root of unity $\alpha \in K$. @@ -416,7 +416,7 @@ \subsection{BCH codes} C = \qty{f(X) \in \faktor{\mathbb F_2[X]}{(X^n - 1)} \midd \forall a \in A,\, f(a) = 0} \end{align*} \end{definition} -The generator polynomial $g(X)$ is the nonzero polynomial of least degree such that $g(a) = 0$ for all $a \in A$. +The generator polynomial $g(X)$ is the nonzero polynomial of least degree s.t. $g(a) = 0$ for all $a \in A$. Equivalently, $g$ is the least common multiple of the minimal polynomials of the elements of $A$. \begin{definition} The cyclic code of length $n$ with defining set $\qty{\alpha, \alpha^2, \dots, \alpha^{\delta - 1}}$ is a \vocab{BCH code} with \vocab{design distance} $\delta$. @@ -469,7 +469,7 @@ \subsection{BCH codes} Assuming that $\deg \sigma = \abs{\mathcal E}$, where $2t + 1 \leq \delta$, we must recover $\sigma$ from $r(X)$. \begin{theorem} Suppose $\deg \sigma = \abs{\mathcal E} \leq t$ where $2t + 1 \leq \delta$. - Then $\sigma(X)$ is the unique polynomial in $K[X]$ of least degree such that + Then $\sigma(X)$ is the unique polynomial in $K[X]$ of least degree s.t. \begin{enumerate} \item $\sigma(0) = 1$; \item $\sigma(X) \sum_{j=1}^{2t} r(\alpha^j) X^j = \omega(X)$ mod $X^{2t+1}$ for some $\omega \in K[X]$ of degree at most $t$. @@ -556,7 +556,7 @@ \subsection{Shift registers} A sequence $y_0, \dots$ of elements of $\mathbb F_2$ has \vocab{generating function} $\sum_{j=0}^\infty y_j X^j \in \mathbb F_2\Brackets{X}$. \end{definition} \begin{theorem} - The stream $(y_n)_{n \in \mathbb N}$ comes from a linear feedback shift register with auxiliary polynomial $P(X)$ iff its generating function is (formally) of the form $\frac{A(X)}{\check{P}(X)}$ with $A \in \mathbb F_2[X]$ such that $\deg A < \deg \check{P}$. + The stream $(y_n)_{n \in \mathbb N}$ comes from a linear feedback shift register with auxiliary polynomial $P(X)$ iff its generating function is (formally) of the form $\frac{A(X)}{\check{P}(X)}$ with $A \in \mathbb F_2[X]$ s.t. $\deg A < \deg \check{P}$. \end{theorem} Note that $\check{P}(X) = X^{\deg P}P(X^{-1})$. \begin{proof} @@ -574,7 +574,7 @@ \subsection{Shift registers} \subsection{The Berlekamp--Massey method} Let $(x_n)_{n \in \mathbb N}$ be the output of a binary linear feedback shift register. -We wish to find the unknown length $d$ and values $a_0, \dots, a_{d-1}$ such that $x_n + \sum_{i=1}^d a_{d-i} x_{n-i} = 0$ for all $n \geq d$. +We wish to find the unknown length $d$ and values $a_0, \dots, a_{d-1}$ s.t. $x_n + \sum_{i=1}^d a_{d-i} x_{n-i} = 0$ for all $n \geq d$. We have \begin{align*} \underbrace{\begin{pmatrix} diff --git a/CodingAndCryptography/05_cryptography.tex b/CodingAndCryptography/05_cryptography.tex index e289e85..8eec002 100644 --- a/CodingAndCryptography/05_cryptography.tex +++ b/CodingAndCryptography/05_cryptography.tex @@ -1,7 +1,7 @@ \section{Cryptography} \subsection{Cryptosystems} -We want to modify a message such that it becomes unintelligible to an eavesdropper Eve. 
+We want to modify a message s.t. it becomes unintelligible to an eavesdropper Eve. Certain secret information is shared between two participants Alice and Bob, called the \vocab{key}, chosen from a set of possible keys $\mathcal K$. The unencrypted message is called the \vocab{plaintext}, which lies in a set $\mathcal M$, and the encrypted message is called the \vocab{ciphertext}, and lies in a set $\mathcal C$. A \vocab{cryptosystem} consists of $(\mathcal K, \mathcal M, \mathcal C)$ together with the \vocab{encryption} function $e \colon \mathcal M \times \mathcal K \to \mathcal C$ and \vocab{decryption} function $d \colon \mathcal C \times \mathcal K \to \mathcal M$. @@ -35,8 +35,8 @@ \subsection{Breaking cryptosystems} For modern applications, Level 3 security is desirable. \end{remark} Consider a cryptosystem $(\mathcal M, \mathcal K, \mathcal C)$. -We model the keys and messages as independent random variables $K, M$ taking values in $\mathcal K, \mathcal M$. -The ciphertext random variable is $C = e(K,M) \in \mathcal C$. +We model the keys and messages as independent r.v.s $K, M$ taking values in $\mathcal K, \mathcal M$. +The ciphertext r.v. is $C = e(K,M) \in \mathcal C$. \begin{definition} A cryptosystem $(\mathcal M, \mathcal K, \mathcal C)$ has \vocab{perfect secrecy} if $H(M \mid C) = H(M)$, or equivalently, $M$ and $C$ are independent, or $I(M;C) = 0$. \end{definition} @@ -63,7 +63,7 @@ \subsection{Breaking cryptosystems} \end{proof} Let $\mathcal M = \mathcal C = \mathcal A$, and suppose we send $n$ messages modelled as $M^{(n)} = (M_1, \dots, M_n)$ encrypted as $C^{(n)} = (C_1, \dots, C_n)$ using the same key $K$. \begin{definition} - The \vocab{unicity distance} is the least $n$ such that $H\qty(K \mid C^{(n)}) = 0$; it is the smallest number of encrypted messages required to uniquely determine the key. + The \vocab{unicity distance} is the least $n$ s.t. $H\qty(K \mid C^{(n)}) = 0$; it is the smallest number of encrypted messages required to uniquely determine the key. \end{definition} Now, \begin{align*} @@ -96,7 +96,7 @@ \subsection{One-time pad} \begin{definition} A \vocab{one-time pad} is a cryptosystem where $k$ is generated randomly; the $k_i$ are independent and take values of 0 or 1 with probability $\frac{1}{2}$. \end{definition} -$z = p + k$ is now a stream of independent and identically distributed random variables taking values of 0 or 1 with probability $\frac{1}{2}$. +$z = p + k$ is now a stream of iid r.v.s taking values of 0 or 1 with probability $\frac{1}{2}$. Hence, without the key stream, deciphering is impossible, so the unicity distance is infinite. One can show that a one-time pad has perfect secrecy. @@ -107,13 +107,13 @@ \subsection{One-time pad} We then apply the following result. \begin{lemma} Let $x_0, x_1, \dots$ be a stream in $\mathbb F_2$ produced by a feedback shift register of length $d$. - Then there exist $M, N \leq 2^d$ such that $x_{N+r} = x_{r}$ for all $r \geq M$. + Then there exist $M, N \leq 2^d$ s.t. $x_{N+r} = x_{r}$ for all $r \geq M$. \end{lemma} \begin{proof} Let the register be $f \colon \mathbb F_2^d \to \mathbb F_2^d$, and let $v_i = (x_i, \dots, x_{i+d-1})$. Then for all $i$, we have $f(v_i) = v_{i+1}$. Since $\abs{\mathbb F_2^d} = 2^d$, the tuples $v_0, v_1, \dots, v_{2^d}$ cannot all be distinct. - Let $a < b \leq 2^d$ such that $v_a = v_b$. + Let $a < b \leq 2^d$ s.t. $v_a = v_b$. Let $M = a$ and $N = b - a$, so $v_M = v_{M+N}$ so by induction we have $v_r = v_{r+N}$ for all $r \geq M$. 
\end{proof} \begin{remark} @@ -176,7 +176,7 @@ \subsection{Asymmetric ciphers} Given $N$, the task is to find $p$ and $q$. \item Discrete logarithm problem. Let $p$ be a large prime and $g$ be a primitive root mod $p$ (a generator of $\mathbb F_p^\star$). - Given $x$, we wish to find $a$ such that $x \equiv g^a$ mod $p$. + Given $x$, we wish to find $a$ s.t. $x \equiv g^a$ mod $p$. \end{enumerate} \begin{definition} An algorithm runs in \vocab{polynomial time} if the number of operations needed to perform the algorithm is at most $c N^d$ where $N$ is the input size, and $c, d$ are constants. @@ -226,9 +226,9 @@ \subsection{Rabin cryptosystem} We encrypt a plaintext message $m$ as $c = m^2$ mod $N$. Usually, we restrict our messages so that $(m, N) = 1$ and $m > \sqrt{N}$. -Receiving ciphertext $c$, we can solve for $x_1, x_2$ such that $x_1^2 \equiv c$ mod $p$ and $x_2^2 \equiv c$ mod $q$ using the previous lemma. -Then, applying the Chinese remainder theorem, we can find $x$ such that $x \equiv x_1$ mod $p$ and $x \equiv x_2$ mod $q$, hence $x^2 \equiv c$ mod $N$. -Indeed, running the Euclidean algorithm on $p, q$ gives integers $r, s$ such that $rp + sq = 1$, then we can take $x = sqx_1 + rpx_2$. +Receiving ciphertext $c$, we can solve for $x_1, x_2$ s.t. $x_1^2 \equiv c$ mod $p$ and $x_2^2 \equiv c$ mod $q$ using the previous lemma. +Then, applying the Chinese remainder theorem, we can find $x$ s.t. $x \equiv x_1$ mod $p$ and $x \equiv x_2$ mod $q$, hence $x^2 \equiv c$ mod $N$. +Indeed, running the Euclidean algorithm on $p, q$ gives integers $r, s$ s.t. $rp + sq = 1$, then we can take $x = sqx_1 + rpx_2$. \begin{lemma} \begin{enumerate} \item Let $p$ be an odd prime, and let $(d, p) = 1$. @@ -254,7 +254,7 @@ \subsection{Rabin cryptosystem} \begin{proof} If we can factorise $N$ as $pq$, we have seen that we can decrypt messages. Conversely, suppose we can break the cryptosystem, so we have an algorithm to find square roots modulo $N$. - Choose $x$ mod $N$ at random, and use the algorithm to find $y$ such that $y^2 \equiv x^2$ mod $N$. + Choose $x$ mod $N$ at random, and use the algorithm to find $y$ s.t. $y^2 \equiv x^2$ mod $N$. With probability $\frac{1}{2}$, $x \neq \pm y$ mod $N$. Then, $(N, x-y)$ is a nontrivial factor of $N$. If this fails, choose another $x$, and repeat until the probability of failure $\qty(\frac{1}{2})^r$ is acceptably low. @@ -271,7 +271,7 @@ \subsection{RSA cryptosystem} \end{align*} \begin{theorem} \begin{enumerate} - \item If $x \in X$, then there exists $0 \leq t < a$ such that $(x^{2^t b} - 1, N)$ is a nontrivial factor of $N$. + \item If $x \in X$, then there exists $0 \leq t < a$ s.t. $(x^{2^t b} - 1, N)$ is a nontrivial factor of $N$. \item $\abs{X} \geq \frac{1}{2} \abs{\qty(\faktor{\mathbb Z}{N\mathbb Z})^\times} = \frac{1}{2}(p-1)(q-1)$. \end{enumerate} \end{theorem} @@ -289,8 +289,8 @@ \subsection{RSA cryptosystem} The proof of part (ii) will be seen later. In the RSA cryptosystem, the private key consists of large distinct primes $p, q$ chosen at random. -Let $N = pq$, and choose the \vocab{encrypting exponent} $e$ randomly such that $(e, \varphi(N)) = 1$, for instance taking $e$ prime larger than $p, q$. -By Euclid's algorithm, there exist $d, k$ such that $de - k\varphi(N) = 1$; $d$ is called the \vocab{decrypting exponent}. +Let $N = pq$, and choose the \vocab{encrypting exponent} $e$ randomly s.t. $(e, \varphi(N)) = 1$, for instance taking $e$ prime larger than $p, q$. +By Euclid's algorithm, there exist $d, k$ s.t. 
$de - k\varphi(N) = 1$; $d$ is called the \vocab{decrypting exponent}. The public key is $(N, e)$, and we encrypt $m \in \mathcal M$ as $c \equiv m^e$ mod $N$. The private key is $(N, d)$, and we decrypt $c \in \mathcal C$ as $x \equiv c^d$ mod $N$. @@ -355,9 +355,9 @@ \subsection{RSA cryptosystem} \begin{example}[Shamir's padlock example] Let $\mathcal A = \mathbb Z_p$. Alice chooses $a \in \mathbb Z_{p-1}^\star$ and computes $g^a$. - She finds $a'$ such that $aa' = 1$ mod $p-1$. + She finds $a'$ s.t. $aa' = 1$ mod $p-1$. Bob chooses $b \in \mathbb Z_{p-1}^\star$ and computes $g^b$. - He similarly finds $b'$ such that $bb' = 1$ mod $p-1$. + He similarly finds $b'$ s.t. $bb' = 1$ mod $p-1$. Let $m$ be a message in $\mathbb Z_p$. She encodes $m$ as $c = m^a$ mod $p$. @@ -413,7 +413,7 @@ \subsection{Secrecy and attacks} A message $m$ is \vocab{signed} as $(m, s)$ where the \vocab{signature} $s = s(m,k)$ is a function of $m$ and the private key $k$. \end{definition} The recipient can check the signature using the public key to verify authenticity of the message. -The signature function or \vocab{trapdoor} function $s \colon \mathcal M \times \mathcal K \to \mathcal S$ is designed such that without knowledge of the private key, one cannot sign messages, but anyone can check whether a signature is valid. +The signature function or \vocab{trapdoor} function $s \colon \mathcal M \times \mathcal K \to \mathcal S$ is designed s.t. without knowledge of the private key, one cannot sign messages, but anyone can check whether a signature is valid. Note that the signature is associated to each message, not to each sender. \begin{example}[signatures using RSA] Suppose Alice has a private key $(N, d)$, and broadcasts a public key $(N, e)$. @@ -487,8 +487,8 @@ \subsection{The digital signature algorithm} The public key is $(p, q, g)$ constructed as follows. % TODO: verify previous line \begin{itemize} - \item Let $p$ be a prime of exactly $N$ bits, where $N$ is a multiple of 64 such that $512 \leq N \leq 1024$, so $2^{N-1} < p < 2^N$. - \item Let $q$ be a prime of 160 bits, such that $q \mid p-1$. + \item Let $p$ be a prime of exactly $N$ bits, where $N$ is a multiple of 64 s.t. $512 \leq N \leq 1024$, so $2^{N-1} < p < 2^N$. + \item Let $q$ be a prime of 160 bits, s.t. $q \mid p-1$. \item Let $g \equiv h^{\frac{p-1}{q}}$ mod $p$, where $h$ is a primitive root mod $p$; in particular, $g$ is an element of order $q$ in $\mathbb Z_p^\times$. \item Alice chooses a private key $x$ with $1 < x < q$ and publishes $y = g^x$. \end{itemize} @@ -539,15 +539,15 @@ \subsection{Commitment schemes} Alternatively, suppose that Alice has two ways to communicate to Bob: a clear channel which transmits with no errors, and a binary symmetric channel with error probability $p$. Suppose $0 < p < \frac{1}{2}$, and the noisy channel corrupts bits independent of any action of Alice or Bob, so neither can affect its behaviour. Bob publishes a binary linear code $C$ of length $N$ and minimum distance $d$, and Alice publishes a random non-trivial linear map $\theta \colon C \to \mathbb F_2$. -To send a bit $m \in \mathbb F_2$, Alice chooses a random codeword $c \in C$ such that $\theta(c) = m$, and sends $c$ to Bob via the noisy channel. +To send a bit $m \in \mathbb F_2$, Alice chooses a random codeword $c \in C$ s.t. $\theta(c) = m$, and sends $c$ to Bob via the noisy channel. Bob receives $r = c + e \in \mathbb F_2^N$ where $e$ is the error pattern. The expected value of $d(r,c) = d(e,0)$ is $Np$. 
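+(Indeed, since the noisy channel corrupts each of the $N$ bits independently with probability $p$, the weight $d(e, 0)$ has a $\mathrm{Bin}(N, p)$ distribution, with mean $Np$ and variance $Np(1-p)$, so for large $N$ it concentrates around $Np$.)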
-$N$ is chosen such that $Np \gg d$, so Bob cannot tell what the original codeword $c$ was, and hence cannot find $\theta(c) = m$. +$N$ is chosen s.t. $Np \gg d$, so Bob cannot tell what the original codeword $c$ was, and hence cannot find $\theta(c) = m$. To reveal, Alice sends $c$ to Bob using the clear channel. Bob can check that $d(c,r) \approx Np$; if so, he accepts the message. It is possible that many more or many fewer bits of $c$ were corrupted by the noisy channel, which may make Bob reject the message even if Alice correctly committed and revealed the message. -$N, d$ should be chosen such that the probability of this occurring is negligible. +$N, d$ should be chosen s.t. the probability of this occurring is negligible. We have shown that Bob cannot read Alice's guess until she reveals it. In addition, Alice cannot cheat by changing her guess, because she knows $c$ but not how it was corrupted by the noisy channel. @@ -566,7 +566,7 @@ \subsection{Secret sharing schemes} It is required that, in the absence of the Leader, any $k$ members of the Faculty can reconstruct the secret from their shadows, but any $k-1$ cannot. \begin{definition} Let $k, n \in \mathbb N$ with $k < n$. - A \vocab{$(k, n)$-threshold scheme} is a method of sharing a message $S$ among a set of $n$ participants such that any subset of $k$ participants can reconstruct $S$, but no subset of smaller size can reconstruct $S$. + A \vocab{$(k, n)$-threshold scheme} is a method of sharing a message $S$ among a set of $n$ participants s.t. any subset of $k$ participants can reconstruct $S$, but no subset of smaller size can reconstruct $S$. \end{definition} We discuss Shamir's method for implementing such a scheme. Let $0 \leq S \leq N$ be the secret, which can be chosen at random by the Leader. diff --git a/CodingAndCryptography/cc.pdf b/CodingAndCryptography/cc.pdf index a60ccf7..4780212 100644 Binary files a/CodingAndCryptography/cc.pdf and b/CodingAndCryptography/cc.pdf differ diff --git a/LogicAndSetTheory/02_well_orderings.tex b/LogicAndSetTheory/02_well_orderings.tex index 5f7a3a6..177582e 100644 --- a/LogicAndSetTheory/02_well_orderings.tex +++ b/LogicAndSetTheory/02_well_orderings.tex @@ -647,7 +647,7 @@ \subsection{Ordinal arithmetic} It follows that $\sup \qty{\alpha + \beta : \beta < \lambda} \leq \sup \qty{\alpha + \delta : \delta \in S}$. \end{proof} -\begin{proposition} +\begin{proposition} \label{prp:16} $\forall \; \alpha, \beta, \gamma$, $(\alpha + \beta) + \gamma = \alpha + (\beta + \gamma)$. \end{proposition} diff --git a/LogicAndSetTheory/03_posets.tex b/LogicAndSetTheory/03_posets.tex index 83a47a7..43fe14a 100644 --- a/LogicAndSetTheory/03_posets.tex +++ b/LogicAndSetTheory/03_posets.tex @@ -1,5 +1,7 @@ +\section{Posets} + \subsection{Definitions} -\begin{definition} +\begin{definition}[Poset] A \vocab{partially ordered set} or \vocab{poset} is a pair $(X, \leq)$ where $X$ is a set, and $\leq$ is a relation on $X$ s.t. \begin{itemize} \item (reflexivity) for all $x \in X$, $x \leq x$; @@ -7,12 +9,15 @@ \subsection{Definitions} \item (antisymmetry) for all $x, y \in X$, $x \leq y$ and $y \leq x$ implies $x = y$. \end{itemize} \end{definition} + We write $x < y$ for $x \leq y$ and $x \neq y$. -Alternatively, a post is a pair $(X, <)$ where $X$ is a set, and $<$ is a relation on $X$ s.t. +Alternatively, a poset is a pair $(X, <)$ where $X$ is a set, and $<$ is a relation on $X$ s.t. 
+ \begin{itemize} \item (irreflexivity) for all $x \in X$, $x \not < x$; \item (transitivity) for all $x, y, z \in X$, $x < y$ and $y < z$ implies $x < z$. \end{itemize} + \begin{example} \begin{enumerate} \item Any total order is a poset. diff --git a/LogicAndSetTheory/logicandsettheory.pdf b/LogicAndSetTheory/logicandsettheory.pdf index 772a41d..58a6617 100644 Binary files a/LogicAndSetTheory/logicandsettheory.pdf and b/LogicAndSetTheory/logicandsettheory.pdf differ diff --git a/LogicAndSetTheory/logicandsettheory.synctex(busy) b/LogicAndSetTheory/logicandsettheory.synctex(busy) new file mode 100644 index 0000000..e69de29