From d9397721be745e53aaa2e3105e50b4ce96a0c363 Mon Sep 17 00:00:00 2001 From: zeramorphic Date: Sat, 12 Aug 2023 23:47:13 +0100 Subject: [PATCH] Typographical edits to probability Signed-off-by: zeramorphic --- ia/de/01_differentiation.tex | 2 +- ia/de/03_multivariate_functions.tex | 2 +- ia/de/06_isoclines_and_solution_curves.tex | 2 +- ia/de/07_phase_portraits.tex | 2 +- ia/de/10_impulse_forcing.tex | 2 +- ia/de/13_systems_of_odes.tex | 2 +- ...01_basic_definitions_and_newton_s_laws.tex | 2 +- ia/groups/01_axiomatic_definition.tex | 4 ++-- .../06_cosets_and_lagrange_s_theorem.tex | 2 +- .../07_normal_subgroups_and_quotients.tex | 8 +++---- ia/groups/08_isomorphism_theorems.tex | 2 +- ia/groups/10_conjugation.tex | 2 +- ia/ns/01_proofs.tex | 2 +- ia/ns/02_elementary_number_theory.tex | 4 ++-- ia/ns/04_the_reals.tex | 10 ++++---- ia/ns/05_sets.tex | 2 +- ia/ns/06_functions.tex | 2 +- ia/probability/01_probability_spaces.tex | 8 +++---- ia/probability/02_inclusion_exclusion.tex | 8 +++---- ..._independence_and_dependence_of_events.tex | 14 +++++------ ia/probability/04_discrete_distributions.tex | 9 ++++---- .../05_discrete_random_variables.tex | 15 +++++++----- ia/probability/06_variance_and_covariance.tex | 7 +++++- .../08_combinations_of_random_variables.tex | 12 ++++++---- .../12_continuous_random_variables.tex | 10 ++++---- .../13_multivariate_density_functions.tex | 18 +++++++++------ .../14_moment_generating_functions.tex | 8 ++++--- ia/probability/15_limit_theorems.tex | 23 ++++++++++++------- ia/probability/16_gaussian_vectors.tex | 9 +++++--- ia/probability/import.tex | 2 +- ia/vm/02_vectors_in_three_dimensions.tex | 3 +-- ia/vm/05_vectors_in_real_euclidean_space.tex | 2 +- ib/antop/01_uniform_convergence.tex | 2 +- ib/stats/01_introduction_and_review.tex | 2 +- ib/stats/02_estimation.tex | 2 +- ib/vp/03_euler_lagrange_equation.tex | 2 +- 36 files changed, 117 insertions(+), 91 deletions(-) diff --git a/ia/de/01_differentiation.tex b/ia/de/01_differentiation.tex index c747a567..1bf036a3 100644 --- a/ia/de/01_differentiation.tex +++ b/ia/de/01_differentiation.tex @@ -172,7 +172,7 @@ \subsection{Order of magnitude} then \(f(x) = O(g(x))\) as \(x \to \infty\). \end{definition} -This is basically the same as the previous definition --- but obviously we can't pick a value slightly less than infinity to test, so we just provide a lower bound on \(x\) where the condition holds true. +This is basically the same as the previous definition---but obviously we can't pick a value slightly less than infinity to test, so we just provide a lower bound on \(x\) where the condition holds true. For example, \(2x^3 + 4x + 12 = O(x^3)\) as \(x \to \infty\). This is because the function is a cubic, so can be bounded by a cubic as it shoots off to infinity. diff --git a/ia/de/03_multivariate_functions.tex b/ia/de/03_multivariate_functions.tex index c9167730..0c7ff81c 100644 --- a/ia/de/03_multivariate_functions.tex +++ b/ia/de/03_multivariate_functions.tex @@ -133,7 +133,7 @@ \subsection{Implicit differentiation} \eval{\frac{\partial f}{\partial z}}_{xy} \eval{\frac{\partial z}{\partial x}}_{y} \] The left hand side is zero because on the surface \(z(x, y)\), \(f\) is always equivalent to \(c\) so there is never any \(\delta f\). -The \(\eval{\frac{\partial f}{\partial x}}_{yz}\) term, however, is not zero in general because we are not going across the \(z(x, y)\) surface --- just parallel to the \(x\) axis, because we fixed both \(y\) and \(z\). 
+The \(\eval{\frac{\partial f}{\partial x}}_{yz}\) term, however, is not zero in general because we are not going across the \(z(x, y)\) surface---just parallel to the \(x\) axis, because we fixed both \(y\) and \(z\). Hence, \[ \eval{\frac{\partial z}{\partial x}}_y = \frac{-\eval{\frac{\partial f}{\partial x}}_{yz}}{\eval{\frac{\partial f}{\partial z}}_{xy}} diff --git a/ia/de/06_isoclines_and_solution_curves.tex b/ia/de/06_isoclines_and_solution_curves.tex index 5f96c7cc..8b4a6eb1 100644 --- a/ia/de/06_isoclines_and_solution_curves.tex +++ b/ia/de/06_isoclines_and_solution_curves.tex @@ -133,7 +133,7 @@ \subsection{Isoclines} Two such solution curves are drawn on this graph; the one intersecting zero has \(A = 1\) in the solution for \(y\), and the one above it has \(A = -1\). Note how, as they intersect the isoclines in red, they have exactly the gradient defined by the isocline. -Particularly, the lower solution curve intersects the same isocline twice, and therefore has this exact gradient at two distinct points --- we observe these points as the intersection points between the solution curve and the isocline. +Particularly, the lower solution curve intersects the same isocline twice, and therefore has this exact gradient at two distinct points---we observe these points as the intersection points between the solution curve and the isocline. Note also that the solutions \(y = 1\) and \(y = -1\) lie on these isoclines for all \(t\). This is because the isoclines specify that the function has zero gradient on such a straight line, so it makes sense that the function and isocline coincide. diff --git a/ia/de/07_phase_portraits.tex b/ia/de/07_phase_portraits.tex index 63cae4ae..3f351c41 100644 --- a/ia/de/07_phase_portraits.tex +++ b/ia/de/07_phase_portraits.tex @@ -76,7 +76,7 @@ \subsection{Phase portraits} \] This is an autonomous nonlinear first order ordinary differential equation. We can create a phase portrait by mapping out \(\frac{\dd{c}}{\dd{t}}\) as a function of \(c\), as shown in the first diagram here, which is known as a 2D phase portrait. -The second diagram, known as a 1D phase portrait, shows similar information but helps us see the behaviour of fixed points --- essentially the arrows point in the direction of motion of \(c\); if \(\dot c\) is positive then the arrows point to the right, if \(\dot c\) is negative they point to the left. +The second diagram, known as a 1D phase portrait, shows similar information but helps us see the behaviour of fixed points---essentially the arrows point in the direction of motion of \(c\); if \(\dot c\) is positive then the arrows point to the right, if \(\dot c\) is negative they point to the left. \newpage \begin{wrapfigure}{r}{0.5\textwidth} \begin{tikzpicture} diff --git a/ia/de/10_impulse_forcing.tex b/ia/de/10_impulse_forcing.tex index bbe8c6f0..eef7493a 100644 --- a/ia/de/10_impulse_forcing.tex +++ b/ia/de/10_impulse_forcing.tex @@ -61,7 +61,7 @@ \subsection{Delta function forcing} \end{equation} The key principle is that the highest order deriative `inherits' the level of discontinuity from the forcing term, since if any other derivative were to contain the discontinuous function, then the next higher derivative would only be more discontinuous. So, \(y''\) behaves somewhat like \(\delta\). -Here, we will denote this \(y'' \sim \delta\) --- this is extremely non-standard notation, however. +Here, we will denote this \(y'' \sim \delta\)---this is extremely non-standard notation, however. 
Now, since \(\delta(x) = 0\) for all nonzero \(x\), then \[ diff --git a/ia/de/13_systems_of_odes.tex b/ia/de/13_systems_of_odes.tex index 2e3319e3..a7a9f2ea 100644 --- a/ia/de/13_systems_of_odes.tex +++ b/ia/de/13_systems_of_odes.tex @@ -225,7 +225,7 @@ \subsection{Nonlinear systems of ODEs} \[ \eqref{nonlinear1} \implies \dot \xi = f(x_0 + \xi, y_0 + \eta) \] -We can expand this in a multivariate Taylor series, keeping the first three terms --- the constant term and the two linear terms. +We can expand this in a multivariate Taylor series, keeping the first three terms---the constant term and the two linear terms. \begin{align*} \dot \xi & \approx f(x_0, y_0) + \xi f_x(x_0, y_0) + \eta f_y(x_0, y_0) \\ & = \xi f_x(x_0, y_0) + \eta f_y(x_0, y_0) \\ diff --git a/ia/dr/01_basic_definitions_and_newton_s_laws.tex b/ia/dr/01_basic_definitions_and_newton_s_laws.tex index 2e031985..ce41893a 100644 --- a/ia/dr/01_basic_definitions_and_newton_s_laws.tex +++ b/ia/dr/01_basic_definitions_and_newton_s_laws.tex @@ -98,7 +98,7 @@ \subsection{Galilean transformations} \item at any constant velocity \end{itemize} Any set of equations which describe Newtonian physics must preserve this Galilean invariant. -This shows that measurement of velocity cannot be absolute, it must be relative to a specific inertial frame of reference --- but conversely, measurement of acceleration \textit{is} absolute. +This shows that measurement of velocity cannot be absolute, it must be relative to a specific inertial frame of reference---but conversely, measurement of acceleration \textit{is} absolute. \subsection{Newton's second law} For any particle subject to a force \(\vb F\), the momentum \(\vb p\) of the particle satisfies diff --git a/ia/groups/01_axiomatic_definition.tex b/ia/groups/01_axiomatic_definition.tex index 607db54f..dc75fe37 100644 --- a/ia/groups/01_axiomatic_definition.tex +++ b/ia/groups/01_axiomatic_definition.tex @@ -65,7 +65,7 @@ \subsection{Definition} Here are a few examples of groups. \begin{enumerate} - \item \(G = \{ e \}\) --- this is the `trivial group'. + \item \(G = \{ e \}\)---this is the `trivial group'. \item \(G = \{ \text{symmetries of the equilateral triangle} \} \); \(\ast\) is defined by: `\(g \ast h\) means doing \(h\) then \(g\)'. \item \(G = (\mathbb Z, +)\). This is easy to prove by verifying the axioms. @@ -151,7 +151,7 @@ \subsection{Subgroups} A subset \(H \subseteq G\) is a subgroup of \(G\) if \((H, \ast)\) is a group. We denote this \(H \leq G\). \end{definition} -We must verify each group axiom on a subset to check if it is a subgroup --- with the notable exception of the associativity axiom, the property of associativity is inherited by subgroups. +We must verify each group axiom on a subset to check if it is a subgroup---with the notable exception of the associativity axiom, the property of associativity is inherited by subgroups. Here are some examples of subgroups. \begin{enumerate} diff --git a/ia/groups/06_cosets_and_lagrange_s_theorem.tex b/ia/groups/06_cosets_and_lagrange_s_theorem.tex index dd20d738..58e43b52 100644 --- a/ia/groups/06_cosets_and_lagrange_s_theorem.tex +++ b/ia/groups/06_cosets_and_lagrange_s_theorem.tex @@ -129,7 +129,7 @@ \subsection{Lagrange's theorem} \end{proof} We can take Lagrange's theorem into the world of number theory, and specifically modular arithmetic, where we are dealing with finite groups. Clearly, \(\mathbb Z_n\) is a group under addition modulo \(n\), but what happens with multiplication modulo \(n\)? 
-Clearly this is not a group --- for a start, 0 has no inverse. +Clearly this is not a group---for a start, 0 has no inverse. By removing all elements of the group that have no inverse, we obtain \(\mathbb Z_n^*\). Note that for any \(x \in \mathbb Z_n\), \(x\) has a multiplicative inverse if and only if \(\HCF(x, n) = 1\), i.e.\ if \(x\) and \(n\) are coprime. diff --git a/ia/groups/07_normal_subgroups_and_quotients.tex b/ia/groups/07_normal_subgroups_and_quotients.tex index ca845b65..819c5e26 100644 --- a/ia/groups/07_normal_subgroups_and_quotients.tex +++ b/ia/groups/07_normal_subgroups_and_quotients.tex @@ -67,7 +67,7 @@ \subsection{Normal subgroups} As another example, let \(k \in K\) and let \(g \in G \setminus K\). Then \(kg\) does not have this property, as \(kg \notin K\). -We can encapsulate this behaviour by making a homomorphism from the whole group \(G\) to some other group --- it \textit{doesn't matter where we end up}, just as long as anything with this particular property maps to the new group's identity element. +We can encapsulate this behaviour by making a homomorphism from the whole group \(G\) to some other group---it \textit{doesn't matter where we end up}, just as long as anything with this particular property maps to the new group's identity element. Let \(\varphi: G \to H\), where \(H\) is some group that we don't really care about (apart from the identity). This means that any element of \(K\), i.e.\ any element with property \(P\), is mapped to \(e_H\). By the laws of homomorphisms, any product of \(k \in K\) with \(g \in G \setminus K\) does not give the identity element, so it does not have this property! @@ -179,7 +179,7 @@ \subsection{Quotients} Now, given some element in one of the cosets (i.e.\ in \(G\)) we can do some transformation \(g\) to take us to another element. But because we made cosets out of a normal subgroup, multiplying by \(g\) is the same as swapping some of the rows, then maybe moving around the order of the elements in each row. -It keeps the identity of each row consistent --- all elements in a given row are transformed to the same output row. +It keeps the identity of each row consistent---all elements in a given row are transformed to the same output row. Remember that the word `row' basically means `coset'. This means that we can basically forget about the individual elements in these cosets, all that we really care about is how the rows are swapped with each other under a given transformation. @@ -224,12 +224,12 @@ \subsection{Examples and properties} \begin{itemize} \item We can check that certain properties are inherited into quotient groups from the original group, such as being abelian and being finite. \item Quotients are not subgroups of the original group. - They are associated with tha original group in a very different way to subgroups --- in general, a coset may not even be isomorphic to a subgroup in the group. + They are associated with the original group in a very different way to subgroups---in general, a coset may not even be isomorphic to a subgroup in the group. The example with direct products above was an example that is not true in general. \item With normality, we need to specify in which group the subgroup is normal. For example, if \(K \leq N \leq G\), with \(K \trianglelefteq N\). This does not imply that \(K \trianglelefteq G\), this would require that \(g^{-1}Kg = K\) for all elements \(g\) in \(G\), but we only have that \(n^{-1}Kn = K\) for all elements \(n\) in \(N\), which is a weaker condition.
- Normality is not transitive --- for example, \(K \trianglelefteq N \trianglelefteq G\) does not imply \(K \trianglelefteq G\). + Normality is not transitive---for example, \(K \trianglelefteq N \trianglelefteq G\) does not imply \(K \trianglelefteq G\). \item However, if \(N \leq H \leq G\) and \(N \trianglelefteq G\), then the weaker condition \(N \trianglelefteq H\) is true. \end{itemize} diff --git a/ia/groups/08_isomorphism_theorems.tex b/ia/groups/08_isomorphism_theorems.tex index 9f370933..c616c5a2 100644 --- a/ia/groups/08_isomorphism_theorems.tex +++ b/ia/groups/08_isomorphism_theorems.tex @@ -23,7 +23,7 @@ \subsection{First isomorphism theorem} Now, multiplying together two rows, i.e.\ two elements from \(K\), we can apply the homomorphism \(\varphi\) to one of the coset representatives for each row to see how the entire row behaves under \(\varphi\). We know that all coset representatives give equal results, because each element in a given coset \(gN\) can be written as \(gn, n \in N\), so \(\varphi(gn) = \varphi(g)\). So all elements in the rows behave just like their coset representatives under the homomorphism. -Further, all the cosets give different outputs under \(\varphi\) --- if they gave the same output they'd have to be part of the same coset. +Further, all the cosets give different outputs under \(\varphi\)---if they gave the same output they'd have to be part of the same coset. So in some sense, each row represents a distinct output for \(\varphi\). So the quotient group must be isomorphic to the image of the homomorphism. diff --git a/ia/groups/10_conjugation.tex b/ia/groups/10_conjugation.tex index ab597e55..4da8f920 100644 --- a/ia/groups/10_conjugation.tex +++ b/ia/groups/10_conjugation.tex @@ -2,7 +2,7 @@ \subsection{Conjugation actions} \begin{definition} Given \(g, h \in G\), the element \(hgh^{-1}\) is the conjugate of \(g\) by \(h\). \end{definition} -We should think of conjugate elements as doing the same thing but from different `points of view' --- we change perspective by doing \(h^{-1}\), then do the action \(g\), then reset the perspective back to normal using \(h\). +We should think of conjugate elements as doing the same thing but from different `points of view'---we change perspective by doing \(h^{-1}\), then do the action \(g\), then reset the perspective back to normal using \(h\). Here is an example using \(D_{10}\), where the vertices of the regular pentagon are \(v_1 \dots v_5\) clockwise. Consider the conjugates \(s\) and \(rsr^{-1}\), where \(s\) is a reflection through \(v_1\) and the centre, and \(r\) is a rotation by \(\frac{2\pi}{5}\) clockwise. diff --git a/ia/ns/01_proofs.tex b/ia/ns/01_proofs.tex index 6131536b..de97399e 100644 --- a/ia/ns/01_proofs.tex +++ b/ia/ns/01_proofs.tex @@ -88,7 +88,7 @@ \subsection{Proofs and non-proofs} \end{proof} \begin{itemize} \item We prove things to show \textit{why} something is true. - We can see why this claim was true here --- it's really a statement about the properties of odd numbers, not the properties of even numbers. + We can see why this claim was true here---it's really a statement about the properties of odd numbers, not the properties of even numbers. \item We started by saying that we need something tangible to work with: just stating that `\(n^2\) is even' is really hard to work with because square roots just get messy and don't yield any result. So we had to choose a clever first step. \item The symbol \contradiction{} shows that we have a contradiction. 
diff --git a/ia/ns/02_elementary_number_theory.tex b/ia/ns/02_elementary_number_theory.tex index dd2657f6..ba984e19 100644 --- a/ia/ns/02_elementary_number_theory.tex +++ b/ia/ns/02_elementary_number_theory.tex @@ -51,7 +51,7 @@ \subsection{Strong induction} This provides a very useful alternative way of looking at induction. Instead of just considering a process from \(n\) to \(n+1\), we can inject an inductive viewpoint into any proof. When proving something on the natural numbers, we can always assume that the hypothesis is true for smaller \(n\) than what we are currently using. -This allows us to write very powerful proofs because in the general case we are allowed to refer back to other smaller cases --- but not just \(n-1\), any \(k\) less than \(n\). +This allows us to write very powerful proofs because in the general case we are allowed to refer back to other smaller cases---but not just \(n-1\), any \(k\) less than \(n\). We may rewrite the principle of strong induction in the following ways: \begin{enumerate} @@ -105,7 +105,7 @@ \subsection{Primes} We want to prove that prime factorisation is unique (up to the ordering). We need that \(p \mid ab \implies p \mid a \lor p \mid b\). -However, this is hard to answer --- \(p\) is defined in terms of what divides it, not what it divides. +However, this is hard to answer---\(p\) is defined in terms of what divides it, not what it divides. This is the reverse of its definition, so we need to prove it in a more round-about way. \subsection{Highest common factors} diff --git a/ia/ns/04_the_reals.tex b/ia/ns/04_the_reals.tex index 9169693f..3b138687 100644 --- a/ia/ns/04_the_reals.tex +++ b/ia/ns/04_the_reals.tex @@ -103,7 +103,7 @@ \subsection{Examples of sets and least upper bounds} \begin{remark} If \(S\) has a greatest element, then this element is the supremum of the set: \(\sup S \in S\). But if \(S\) does not have a greatest element, then \(\sup S \notin S\). - Also, we do not need any kind of `greatest lower bound' axiom --- if \(S\) is a non-empty, bounded below set of reals, then the set \(\{ -x: x \in S \}\) is non-empty and bounded above, and so has a least upper bound, so \(S\) has a greatest lower bound equivalent to its additive inverse. + Also, we do not need any kind of `greatest lower bound' axiom---if \(S\) is a non-empty, bounded below set of reals, then the set \(\{ -x: x \in S \}\) is non-empty and bounded above, and so has a least upper bound, so \(S\) has a greatest lower bound equivalent to its additive inverse. This is commonly called the `infimum', or \(\inf S\). \end{remark} \begin{theorem} @@ -117,10 +117,10 @@ \subsection{Examples of sets and least upper bounds} \begin{itemize} \item (\(c^2 < 2\)) We want to prove that \((c+t)^2 < 2\) for some small \(t\). For \(0 2\)) We want to prove that \((c-t)^2 > 2\) for some small \(t\). For \(0 0\). 
- We define the conditional probability of \(A\) given \(B\), written \(\prob{A \mid B}\) as + We define the conditional probability of \(A\) given \(B\), written \(\prob{A \mid B}\), as \[ \prob{A \mid B} = \frac{\prob{A \cap B}}{\prob{B}} \] @@ -68,14 +68,14 @@ \subsection{Conditional probability} \end{claim} \begin{proof} \begin{align*} - \prob{\bigcup A_n \mathrel{\Big|} B} & = \frac{\prob{(\bigcup A_n) \cap B}}{\prob{B}} \\ + \prob{\bigcup A_n \mathrel{\Big|} B} & = \frac{\prob{\qty(\bigcup A_n) \cap B}}{\prob{B}} \\ & = \frac{\prob{\bigcup (A_n \cap B)}}{\prob{B}} \\ - \intertext{By countable additivity, since that \((A_n \cap B)\) are disjoint,} + \intertext{By countable additivity, since the \((A_n \cap B)\) are disjoint,} & = \sum_n \frac{\prob{A_n \cap B}}{\prob{B}} \\ & = \sum_n \prob{A_n \mid B} \end{align*} \end{proof} -We can think of \(\prob{\dots \mid B}\) as a new probability measure for the same \(\Omega\). +We can think of \(\prob{\wildcard \mid B}\) as a new probability measure for the same \(\Omega\). \subsection{Law of total probability} \begin{claim} @@ -108,13 +108,13 @@ \subsection{Bayes' formula} \begin{align*} \prob{B_n \mid A} & = \frac{\prob{B_n \cap A}}{\prob{A}} \\ & = \frac{\prob{A \mid B_n} \prob{B_n}}{\prob{A}} \\ - \intertext{By the Law of Total Probability,} + \intertext{By the law of total probability,} & = \frac{\prob{A \mid B_n} \prob{B_n}}{\sum_k \prob{A \mid B_k} \prob{B_k}} \end{align*} \end{proof} Note that on the right hand side, the numerator appears somewhere in the denominator. This formula is the basis of Bayesian statistics. -It allows us to reverse the direction of a conditional probability --- knowing the probabilities of the events \((B_n)\), and given a model of \(\prob{A \mid B_n}\), we can calculuate the posterior probabilities of \(B_n\) given that \(A\) occurs. +It allows us to reverse the direction of a conditional probability---knowing the probabilities of the events \((B_n)\), and given a model of \(\prob{A \mid B_n}\), we can calculate the posterior probabilities of \(B_n\) given that \(A\) occurs. \subsection{Bayes' formula for medical tests} Consider the probability of getting a false positive on a test for a rare condition. @@ -136,7 +136,7 @@ \subsection{Bayes' formula for medical tests} \[ \prob{A \mid P} \approx \frac{1}{1 + \frac{\prob{P \mid \stcomp{A}}}{\prob{A}}} \] -So this is low because the probability that \(\prob{P \mid \stcomp{A}} \gg \prob{A}\). +So this is low because \(\prob{P \mid \stcomp{A}} \gg \prob{A}\). Suppose that there is a population of 1000 people and about 1 suffers from the disease. Among the 999 not suffering from \(A\), about 10 will test positive. So there will be about 11 people who test positive, and only 1 out of 11 (9\%) of those actually has the disease. diff --git a/ia/probability/04_discrete_distributions.tex b/ia/probability/04_discrete_distributions.tex index 0cbd3de9..fa3eac17 100644 --- a/ia/probability/04_discrete_distributions.tex +++ b/ia/probability/04_discrete_distributions.tex @@ -33,11 +33,12 @@ \subsection{Multinomial distribution} The multinomial distribution is a generalisation of the binomial distribution. \(M\) has parameters \(N \in \mathbb Z^+\) and \(p_1, p_2, \dots \in [0, 1]\) where \(\sum_{i=1}^k p_i = 1\). This models a sequence of \(N\) independent trials in which a number from 1 to \(N\) is selected, where the probability of selecting \(i\) is \(p_i\).
-\(\Omega = \{ (n_1, \dots, n_k) \in \mathbb N^k \colon \sum_{i=1}^k n_i = N \}\), in other words, ordered partitions of \(N\). +\(\Omega = \qty{ (n_1, \dots, n_k) \in \mathbb N^k \colon \sum_{i=1}^k n_i = N }\), in other words, ordered partitions of \(N\). Therefore -\[ - \prob{n_1 \text{ outcomes had value 1}, \dots, n_k \text{ outcomes had value }k} = \prob{(n_1, \dots, n_k)} = \binom{N}{n_1,\dots,n_k}p_1^{n_1}\dots p_k^{n_k} -\] +\begin{align*} + \prob{n_1 \text{ outcomes had value 1}, \dots, n_k \text{ outcomes had value }k} &= \prob{(n_1, \dots, n_k)} \\ + &= \binom{N}{n_1,\dots,n_k}p_1^{n_1}\dots p_k^{n_k} +\end{align*} \subsection{Geometric distribution} Consider a Bernoulli distribution of parameter \(p\). diff --git a/ia/probability/05_discrete_random_variables.tex b/ia/probability/05_discrete_random_variables.tex index df47cc0b..6865ea83 100644 --- a/ia/probability/05_discrete_random_variables.tex +++ b/ia/probability/05_discrete_random_variables.tex @@ -20,9 +20,9 @@ \subsection{Random variables} 0 & \text{otherwise} \end{cases} \] -Because \(A \in \mathbb F\), \(1_A\) is a random variable. +Because \(A \in \mathcal F\), \(1_A\) is a random variable. Suppose \(X\) is a random variable. -We define probability distribution function of \(X\) to be +We define the probability distribution function of \(X\) to be \[ F_X \colon \mathbb R \to [0, 1];\quad F_X(x) = \prob{X \leq x} \] @@ -105,9 +105,12 @@ \subsection{Expectation} \] So we have partitioned \(\Omega\) using \(X\). Note that -\[ - \expect{X} = \sum_\omega X(\omega) \prob{\{\omega\}} = \sum_{x \in \Omega_X} \sum_{\omega \in \{ X = x\}} X(\omega) \prob{\{\omega\}} = \sum_{x \in \Omega_X} \sum_{\omega \in \{ X = x\}} x \prob{\{\omega\}} = \sum_{x \in \Omega_X} x\prob{\{X = x \}} -\] +\begin{align*} + \expect{X} &= \sum_\omega X(\omega) \prob{\{\omega\}} \\ + &= \sum_{x \in \Omega_X} \sum_{\omega \in \{ X = x\}} X(\omega) \prob{\{\omega\}} \\ + &= \sum_{x \in \Omega_X} \sum_{\omega \in \{ X = x\}} x \prob{\{\omega\}} \\ + &= \sum_{x \in \Omega_X} x\prob{\{X = x \}} +\end{align*} which matches the more familiar definition of the expectation; the average of the values taken by \(X\), weighted by the probability of the event occcuring. So \[ @@ -135,7 +138,7 @@ \subsection{Expectation of binomial distribution} & = Np \end{align*} -\subsection{Expectation of poisson distribution} +\subsection{Expectation of Poisson distribution} Let \(X \sim \text{Poi}(\lambda)\), so \[ \prob{X = k} = e^{-\lambda} \frac{\lambda^k}{k!} diff --git a/ia/probability/06_variance_and_covariance.tex b/ia/probability/06_variance_and_covariance.tex index 1bf96caa..91651169 100644 --- a/ia/probability/06_variance_and_covariance.tex +++ b/ia/probability/06_variance_and_covariance.tex @@ -13,7 +13,12 @@ \subsection{Variance} \item \(\Var{X} \geq 0\), and if \(\Var{X} = 0\), \(\prob{X = \expect{X}} = 1\). \item If \(c \in \mathbb R\), then \(\Var{cX} = c^2\Var{X}\), and \(\Var{X + c} = \Var{X}\). \item \(\Var{X} = \expect{X^2} - \expect{X}^2\). - This follows since \(\expect{(X - \expect{X})^2} = \expect{X^2 - 2X\expect{X} + \expect{X}^2} = \expect{X^2} - 2\expect{X} \expect{X} + \expect{X}^2 = \expect{X^2} - \expect{X}^2\). + This follows since + \begin{align*} + \expect{(X - \expect{X})^2} &= \expect{X^2 - 2X\expect{X} + \expect{X}^2} \\ + &= \expect{X^2} - 2\expect{X} \expect{X} + \expect{X}^2 \\ + &= \expect{X^2} - \expect{X}^2 + \end{align*} \item \(\Var{X} = \min_{c \in \mathbb R} \expect{(X - c)^2}\), and this minimum is achieved at \(c = \expect{X}\). 
Indeed, if we let \(f(c) = \expect{(X - c)^2}\), then \(f(c) = \expect{X^2} - 2c\expect{X} + c^2\). Minimising \(f\), we get \(f(\expect{X}) = \Var{X}\) as required. diff --git a/ia/probability/08_combinations_of_random_variables.tex b/ia/probability/08_combinations_of_random_variables.tex index b8915e1b..fa978cbe 100644 --- a/ia/probability/08_combinations_of_random_variables.tex +++ b/ia/probability/08_combinations_of_random_variables.tex @@ -9,7 +9,7 @@ \subsection{Conditional expectation and law of total expectation} \expect{X \mid B} = \frac{\expect{X \cdot 1(B)}}{\prob{B}} \] The numerator is notably zero when \(1(B) = 0\), so in essence we are excluding the case where \(X\) is not \(B\). -\begin{theorem}[Law of Total Expectation] +\begin{theorem}[law of total expectation] Suppose \(X \geq 0\). Let \((\Omega_n)\) be a partition of \(\Omega\) into disjoint events, so \(\Omega = \bigcup_n \Omega_n\). Then @@ -92,9 +92,11 @@ \subsection{Convolution} \subsection{Conditional expectation} Let \(X\) and \(Y\) be discrete random variables. Then the conditional expectation of \(X\) given that \(Y = y\) is -\[ - \expect{X \mid Y = y} = \frac{\expect{X \cdot 1(Y = y)}}{\prob{Y = y}} = \frac{1}{\prob{Y = y}} \sum_x x \cdot \prob{X = x, Y = y} = \sum_x x \cdot \prob{X = x \mid Y = y} -\] +\begin{align*} + \expect{X \mid Y = y} &= \frac{\expect{X \cdot 1(Y = y)}}{\prob{Y = y}} \\ + &= \frac{1}{\prob{Y = y}} \sum_x x \cdot \prob{X = x, Y = y} \\ + &= \sum_x x \cdot \prob{X = x \mid Y = y} +\end{align*} Observe that for every \(y \in \Omega_y\), this expectation is purely a function of \(y\). Let \(g(y) = \expect{X \mid Y = y}\). Now, we define the conditional expectation of \(X\) given \(Y\) as \(\expect{X \mid Y} = g(Y)\). @@ -200,7 +202,7 @@ \subsection{Properties of conditional expectation} \begin{proof} Note that \[ - \expect{h(Y) \cdot X \mid Y = y} = \expect{h(y) \cdot X \mid Y = y} = h(y) \expect{X \mid Y = y} + \expect{h(Y) \cdot X \mid Y = y} = \expect{h(y) \cdot X \mid Y = y} = h(y) \cdot \expect{X \mid Y = y} \] Then \[ diff --git a/ia/probability/12_continuous_random_variables.tex b/ia/probability/12_continuous_random_variables.tex index 1e514270..ac1b15f2 100644 --- a/ia/probability/12_continuous_random_variables.tex +++ b/ia/probability/12_continuous_random_variables.tex @@ -32,15 +32,15 @@ \subsection{Probability distribution function} & = \prob{X \leq b} - \prob{X \leq a} \\ & = F(b) - F(a) \end{align*} - \item For right continuity, we want to prove \(\lim_{n \to \infty} F(x + \frac{1}{n}) = F(x)\). - We will define \(A_n = \{ x < X \leq x + \frac{1}{n} \}\). + \item For right continuity, we want to prove \(\lim_{n \to \infty} F\qty(x + \frac{1}{n}) = F(x)\). + We will define \(A_n = \qty{ x < X \leq x + \frac{1}{n} }\). Then the \(A_n\) are decreasing events, and the intersection of all \(A_n\) is the empty set \(\varnothing\). Hence, by continuity of the probability measure, \(\prob{A_n} \to 0\) as \(n \to \infty\). - But \(\prob{A_n} = \prob{x < X \leq x + \frac{1}{n}} = F(x + \frac{1}{n}) - F(x)\), hence \(F(x + \frac{1}{n}) \to F(x)\) as required. + But \(\prob{A_n} = \prob{x < X \leq x + \frac{1}{n}} = F\qty(x + \frac{1}{n}) - F(x)\), hence \(F(x + \frac{1}{n}) \to F(x)\) as required. Now, we want to show that left limits always exist. This is clear since \(F\) is an increasing function, and is always bounded above by 1. - \item We know \(F(x^-) = \lim_{n \to \infty}F(x - \frac{1}{n})\). - Consider \(B_n = \{ X \leq x - \frac{1}{n} \}\). 
+ \item We know \(F(x^-) = \lim_{n \to \infty}F\qty(x - \frac{1}{n})\). + Consider \(B_n = \qty{ X \leq x - \frac{1}{n} }\). Then the \(B_n\) is an increasing sequence of events, and their union is \(\{ X < x \}\). Hence \(\prob{B_n}\) converges to \(\prob{X < x}\), so \(F(x^-) = \prob{X < x}\). \item This is evident from the properties of the probability measure. diff --git a/ia/probability/13_multivariate_density_functions.tex b/ia/probability/13_multivariate_density_functions.tex index f526fce2..24ab7a98 100644 --- a/ia/probability/13_multivariate_density_functions.tex +++ b/ia/probability/13_multivariate_density_functions.tex @@ -36,8 +36,10 @@ \subsection{Multivariate density functions} \] \subsection{Independence of events} -In the continuous case, we can no longer use the definition \(\prob{X = a, Y = b} = \prob{X = a}\prob{Y = b}\), since the probability of a random variable being a specific value is always zero. -Instead, we define that \(X_1, \dots X_n\) are independent if for all \(x_1, \dots, x_n \in \mathbb R\), +In the continuous case, we can no longer use the definition +\[\prob{X = a, Y = b} = \prob{X = a}\prob{Y = b}\] +since the probability of a random variable being a specific value is always zero. +Instead, we define that \(X_1, \dots, X_n\) are independent if for all \(x_1, \dots, x_n \in \mathbb R\), \[ \prob{X_1 \leq x_1, \dots, X_n \leq x_n} = \prob{X_1 \leq x_1}\cdots\prob{X_n \leq x_n} \] @@ -107,9 +109,11 @@ \subsection{Marginal density} \subsection{Sum of random variables} Recall that in the discrete case, for independent random variables \(X\) and \(Y\) we have -\[ - \prob{X+Y = z} = \sum_y \prob{X+Y = z, Y=y} = \sum_y \prob{X = z-y} \prob{Y = y} = \sum_y p_x(z-y) p_y(y) -\] +\begin{align*} + \prob{X+Y = z} &= \sum_y \prob{X+Y = z, Y=y} \\ + &= \sum_y \prob{X = z-y} \prob{Y = y} \\ + &= \sum_y p_x(z-y) p_y(y) +\end{align*} which was called the convolution. In the continuous case, \begin{align*} @@ -202,7 +206,7 @@ \subsection{Transformations of multidimensional random variables} \] so the random variables \(R\) and \(\Theta\) are independent, where \(\Theta \sim U[0, 2\pi]\) and \(R\) has density \(re^{\frac{-r^2}{2}}\) on \((0, \infty)\). -\subsection{Ordered statistics of a random sample} +\subsection{Order statistics of a random sample} Let \(X_1, \dots, X_n\) be independent and identically distributed random variables with distribution function \(F\) and density function \(f\). We can put them in increasing order: \[ @@ -243,7 +247,7 @@ \subsection{Ordered statistics of a random sample} when \(x_1 < x_2 < \dots < x_n\), and the joint density is zero otherwise. Note that this joint density does not factorise as a product of densities, since we must always consider the indicator function that \(x_1 < x_2 < \dots < x_n\). -\subsection{Ordered statistics on exponential distribution} +\subsection{Order statistics on exponential distribution} Let \(X \sim \mathrm{Exp}(\lambda)\), \(Y \sim \mathrm{Exp}(\mu)\) be independent continuous random variables. Let \(Z = \min(X, Y)\). \[ diff --git a/ia/probability/14_moment_generating_functions.tex b/ia/probability/14_moment_generating_functions.tex index 5eb56c66..2c70ff6d 100644 --- a/ia/probability/14_moment_generating_functions.tex +++ b/ia/probability/14_moment_generating_functions.tex @@ -107,9 +107,11 @@ \subsection{Moment generating function of the normal distribution} \] Now, suppose that \(X \sim \mathrm{N}(\mu, \sigma^2)\) and \(Y \sim \mathrm{N}(\nu, \tau^2)\) are independent. 
Then -\[ - \expect{e^{\theta(X + Y)}} = \expect{e^{\theta X}} \expect{e^{\theta Y}} = \exp(\theta \mu + \frac{\theta^2 \sigma^2}{2}) \exp(\theta \nu + \frac{\theta^2 \tau^2}{2}) = \exp(\theta(\mu + \nu) + \frac{\theta^2 (\sigma^2 + \tau^2)}{2}) -\] +\begin{align*} + \expect{e^{\theta(X + Y)}} &= \expect{e^{\theta X}} \expect{e^{\theta Y}} \\ + &= \exp(\theta \mu + \frac{\theta^2 \sigma^2}{2}) \exp(\theta \nu + \frac{\theta^2 \tau^2}{2}) \\ + &= \exp(\theta(\mu + \nu) + \frac{\theta^2 (\sigma^2 + \tau^2)}{2}) +\end{align*} Hence \(X + Y \sim \mathrm{N}(\mu + \nu, \sigma^2 + \tau^2)\). \subsection{Cauchy distribution} diff --git a/ia/probability/15_limit_theorems.tex b/ia/probability/15_limit_theorems.tex index 269e598f..74193cd6 100644 --- a/ia/probability/15_limit_theorems.tex +++ b/ia/probability/15_limit_theorems.tex @@ -223,12 +223,12 @@ \subsection{Applications of central limit theorem} for \(n\) large. Note that we showed before that \[ - \mathrm{Bin}\qty(n, \frac{\lambda}{n}) \to \mathrm{Po}(\lambda) + \mathrm{Bin}\qty(n, \frac{\lambda}{n}) \to \mathrm{Poi}(\lambda) \] Note that with this approximation to the binomial, we let the parameter \(p\) depend on \(n\). Since this is the case, we can no longer apply the central limit theorem, and we get a Poisson distributed approximation. -We can, however, use the central limit theorem to find a normal approximation for a Poisson random variable \(S_n \sim \mathrm{Po}(n)\), since \(S_n\) can be written as \(\sum_{i=1}^n X_i\) where the \(X_i \sim \mathrm{Po}(1)\). +We can, however, use the central limit theorem to find a normal approximation for a Poisson random variable \(S_n \sim \mathrm{Poi}(n)\), since \(S_n\) can be written as \(\sum_{i=1}^n X_i\) where the \(X_i \sim \mathrm{Poi}(1)\). Then \[ S_n \approx \mathrm{N}(n, n) @@ -320,9 +320,12 @@ \subsection{Bertrand's paradox} Then we have formed a triangle between this intersection point, one end of the chord, and the circle's centre. By Pythagoras' theorem, the length of the chord is then twice the height of this triangle, so \(C = 2\sqrt{r^2 - X^2}\). Hence, - \[ - \prob{C \leq r} = \prob{2\sqrt{r^2 - X^2} \leq r} = \prob{4(r^2 - X^2) \leq r^2} = \prob{X \geq \frac{\sqrt{3}}{2}r} = 1 - \frac{\sqrt 3}{2} \approx 0.134 - \] + \begin{align*} + \prob{C \leq r} &= \prob{2\sqrt{r^2 - X^2} \leq r} \\ + &= \prob{4(r^2 - X^2) \leq r^2} \\ + &= \prob{X \geq \frac{\sqrt{3}}{2}r} \\ + &= 1 - \frac{\sqrt 3}{2} \approx 0.134 + \end{align*} \item Instead, let us fix one end point of the chord \(A\), and let \(\Theta \sim \mathrm{U}[0, 2\pi]\). Let the other end point \(B\) be such that the angle between the radii \(OA\) and \(OB\) is \(\Theta\). Then if \(\Theta \in [0, \pi]\), the length of the chord can be found by splitting this triangle in two by dropping a perpendicular from the centre, giving @@ -335,8 +338,12 @@ \subsection{Bertrand's paradox} \] as before. Now, - \[ - \prob{C \leq r} = \prob{2r\sin\frac{\Theta}{2} \leq r} = \prob{\sin\frac{\Theta}{2} \leq \frac{1}{2}} = \prob{\Theta \leq \frac{\pi}{3}} + \prob{\Theta \geq \frac{5\pi}{3}} = \frac{1}{6} + \frac{1}{6} = \frac{1}{3} \approx 0.333 - \] + \begin{align*} + \prob{C \leq r} &= \prob{2r\sin\frac{\Theta}{2} \leq r} \\ + &= \prob{\sin\frac{\Theta}{2} \leq \frac{1}{2}} \\ + &= \prob{\Theta \leq \frac{\pi}{3}} + \prob{\Theta \geq \frac{5\pi}{3}} \\ + &= \frac{1}{6} + \frac{1}{6} \\ + &= \frac{1}{3} \approx 0.333 + \end{align*} \end{enumerate} Clearly, the two probabilities do not match. 
diff --git a/ia/probability/16_gaussian_vectors.tex b/ia/probability/16_gaussian_vectors.tex index 9625a11e..e4203e74 100644 --- a/ia/probability/16_gaussian_vectors.tex +++ b/ia/probability/16_gaussian_vectors.tex @@ -88,9 +88,12 @@ \subsection{Constructing Gaussian vectors} u^\transpose Z = \sum_{i=1}^n u_i Z_i \] Because the \(Z_i\) are independent, it is easy to take the moment generating function to get - \[ - \expect{\exp(\lambda \sum_{i=1}^n u_i z_i)} = \expect{\prod_{i=1}^n \exp(\lambda u_i Z_i)} = \prod_{i=1}^n \expect{\exp(\lambda u_i Z_i)} = \prod_{i=1}^n \exp(\frac{(\lambda u_i)^2}{2}) = \exp(\frac{\lambda^2 \abs{u}^2}{2}) - \] + \begin{align*} + \expect{\exp(\lambda \sum_{i=1}^n u_i z_i)} &= \expect{\prod_{i=1}^n \exp(\lambda u_i Z_i)} \\ + &= \prod_{i=1}^n \expect{\exp(\lambda u_i Z_i)} \\ + &= \prod_{i=1}^n \exp(\frac{(\lambda u_i)^2}{2}) \\ + &= \exp(\frac{\lambda^2 \abs{u}^2}{2}) + \end{align*} So \(u^\transpose Z \sim \mathrm{N}(0, \abs{u}^2)\), which is normal as required. \end{proof} Now, \(\expect{Z} = 0\), and \(\Var{Z} = I\), the identity matrix. diff --git a/ia/probability/import.tex b/ia/probability/import.tex index 73016ce0..c0d26c00 100644 --- a/ia/probability/import.tex +++ b/ia/probability/import.tex @@ -7,7 +7,7 @@ Each event can be assigned a probability of occurring; in this case, one sixth. By carefully reasoning about probabilities of events using the rules of probability spaces, we can avoid many apparent paradoxes of probability, such as Simpson's paradox. -When there are many different possible outcomes (or even infinitely many), it becomes helpful to think of certain events as tied to `random variables'. +When there are many different possible outcomes (or even infinitely many), it becomes helpful to think of certain events as tied to random variables. For example, the amount of coin flips needed before getting a head is a random variable, and its value could be any integer at least 1. The statement `at least three coin flips were needed' is an example of an event linked to this random variable. The values that a random variable can be, as well as the probabilities that they occur, form the distribution of the random variable. diff --git a/ia/vm/02_vectors_in_three_dimensions.tex b/ia/vm/02_vectors_in_three_dimensions.tex index e66428d1..c21662de 100644 --- a/ia/vm/02_vectors_in_three_dimensions.tex +++ b/ia/vm/02_vectors_in_three_dimensions.tex @@ -187,7 +187,7 @@ \subsection{Other vector equations} \[ \vb a \cdot \vb r = \vb a \cdot \vb c \tag{2} \] -Note that using the dot product loses information --- this is simply a tool to make deductions; (2) does not contain the full information of (1). +Note that using the dot product loses information---this is simply a tool to make deductions; (2) does not contain the full information of (1). Combining (1) and (2), and using the formula for the vector triple product, we get \begin{align*} \vb r + (\vb a \cdot \vb r) \vb b - (\vb a \cdot \vb b) \vb r & = \vb c \tag{3} \\ @@ -206,4 +206,3 @@ \subsection{Other vector equations} (\vb a \cdot \vb r - \vb a \cdot \vb c) \vb b = \vb 0 \] This shows us that (given that \(\vb b\) is nonzero) the solutions to the equation are given by (2), which is the equation of a plane. 
- diff --git a/ia/vm/05_vectors_in_real_euclidean_space.tex b/ia/vm/05_vectors_in_real_euclidean_space.tex index d8e7b32b..72d90c70 100644 --- a/ia/vm/05_vectors_in_real_euclidean_space.tex +++ b/ia/vm/05_vectors_in_real_euclidean_space.tex @@ -224,7 +224,7 @@ \subsection{Bases and dimensions} \end{align*} So \(S_n\) are orthonormal and therefore linearly independent. So we can continue adding more vectors until it becomes a basis. -However, the set of all \(S_n\) is already infinite --- so \(V\) must have infinite dimensionality. +However, the set of all \(S_n\) is already infinite---so \(V\) must have infinite dimensionality. \subsection{Multidimensional complex space} We define \(\mathbb C^n\) by diff --git a/ib/antop/01_uniform_convergence.tex b/ib/antop/01_uniform_convergence.tex index ff47fff8..ea3c0033 100644 --- a/ib/antop/01_uniform_convergence.tex +++ b/ib/antop/01_uniform_convergence.tex @@ -277,7 +277,7 @@ \subsection{General principle of uniform convergence} Now, we must show \( \forall n \geq N, \forall x \in S, \abs{f_n(x) - f(x)} < 2 \varepsilon \), then we are done. We will fix \( x \in S, n \geq N \). Since \( f_n(x) \to f(x) \), we can choose \( m \in \mathbb N \) such that \( \abs{f_m(x) - f(x)} < \varepsilon \), and \( m \geq N \). - Note however that \( m \) depends on \( x \) in this statement, but this doesn't matter --- we have shown that + Note however that \( m \) depends on \( x \) in this statement, but this doesn't matter---we have shown that \[ \abs{f_n(x) - f(x)} \leq \abs{f_n(x) - f_m(x)} + \abs{f_m(x) - f(x)} \leq \varepsilon + \varepsilon = 2 \varepsilon \] diff --git a/ib/stats/01_introduction_and_review.tex b/ib/stats/01_introduction_and_review.tex index d98eb787..46204eb9 100644 --- a/ib/stats/01_introduction_and_review.tex +++ b/ib/stats/01_introduction_and_review.tex @@ -10,7 +10,7 @@ \subsection{Introduction} \end{itemize} This course concerns itself with \textit{parametric inference}. Let \( X_1, \dots, X_n \) be i.i.d.\ (independent and identically distributed) random variables, where we assume that the distribution of \( X_1 \) belongs to some family with parameter \( \theta \in \Theta \). -For instance, let \( X_1 \sim \mathrm{Poisson}(\mu) \), where \( \theta = \mu \) and \( \Theta = (0, \infty) \). +For instance, let \( X_1 \sim \mathrm{Poi}(\mu) \), where \( \theta = \mu \) and \( \Theta = (0, \infty) \). Another example is \( X_1 \sim N(\mu, \sigma^2) \), and \( \theta = (\mu, \sigma^2) \) and \( \Theta = \mathbb R \times (0, \infty) \). We use the observed \( X = (X_1, \dots, X_n) \) to make inferences about the parameter \( \theta \): \begin{enumerate} diff --git a/ib/stats/02_estimation.tex b/ib/stats/02_estimation.tex index 8d160599..069459ca 100644 --- a/ib/stats/02_estimation.tex +++ b/ib/stats/02_estimation.tex @@ -81,7 +81,7 @@ \subsection{Bias-variance decomposition} \end{example} It is not necessarily desirable that an estimator is unbiased. \begin{example} - Suppose \( X \sim \mathrm{Poisson}(\lambda) \) and we wish to estimate \( \theta = \prob{X = 0}^2 = e^{-2\lambda} \). + Suppose \( X \sim \mathrm{Poi}(\lambda) \) and we wish to estimate \( \theta = \prob{X = 0}^2 = e^{-2\lambda} \). 
For some estimator \( T(X) \) of \( \theta \) to be unbiased, we need that \[ \esub{\lambda}{T(X)} = \sum_{x=0}^\infty T(x) \frac{\lambda^x e^{-\lambda}}{x!} = e^{-2\lambda} diff --git a/ib/vp/03_euler_lagrange_equation.tex b/ib/vp/03_euler_lagrange_equation.tex index 6e68085d..4e39c1e2 100644 --- a/ib/vp/03_euler_lagrange_equation.tex +++ b/ib/vp/03_euler_lagrange_equation.tex @@ -9,7 +9,7 @@ \subsection{Fundamental lemma of calculus of variations} y \mapsto y + \varepsilon \eta(x);\quad \eta(\alpha) = \eta(\beta) = 0 \] In order to compute the functional for this new function, we first need an additional lemma. -\begin{lemma}[Fundamental Lemma of Calculus of Variations] +\begin{lemma}[Fundamental lemma of calculus of variations] If \( g \colon [\alpha, \beta] \to \mathbb R \) is continuous on this interval, and is such that \[ \forall \eta \text{ continuous}, \eta(\alpha) = \eta(\beta) = 0,\; \int_\alpha^\beta g(x) \eta(x) \dd{x} = 0