probability.tex

\documentclass[openany,12pt]{book}

% \\begin\{([a-z]*)\}\{(.*)\}\{.*\}
% \begin{$1}[$2]

\usepackage[left=3cm,right=3cm,top=3cm,bottom=3cm]{geometry}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{tnptimes}
\usepackage{enumitem}
\setlist[enumerate,1]{label=(\roman*), ref=(\roman*)}

\usepackage{titlesec}
\titleformat{\chapter}[display]{\normalfont\huge\bfseries\centering}{\chaptertitlename~\thechapter}{20pt}{\Huge}

\usepackage{hyperref}
\hypersetup{
  breaklinks,
  colorlinks = true,
  citecolor  = blue,
  linkcolor  = black,
  urlcolor   = magenta,
}

\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{corollary}{Corollary}[chapter]
\newtheorem{assumption}{Assumption}[chapter]
\newtheorem{remark}{Remark}[chapter]
\newtheorem{lemma}{Lemma}[chapter]
\newtheorem{definition}{Definition}[chapter]

% \makeatletter
% \def\thanks#1{
%   \protected@xdef\@thanks{
%     \@thanks\protect\footnotetext{#1}
%   }
% }
% \makeatother

\input{settings.tex}

\title{\Huge\textbf{Probability}}
\author{\href{https://www.jingxuanyang.com/}{Jingxuan Yang}}
\date{\today}

\begin{document}

\frontmatter

\maketitle

\tableofcontents

\chapter*{Preface}

Currently, most of the material in this book comes from the lecture notes of \href{https://www.ee.nthu.edu.tw/jcheng/}{Professor Jay Cheng}. The proofs of theorems are not included yet, which warrants further efforts in the future.

\mainmatter

\chapter{Axioms of Probability}

\begin{definition}[Sample Space]
  The sample space $\Omega$ of an experiment is the set of all possible outcomes of the experiment.
\end{definition}

\begin{definition}[Event]
  An event of an experiment is a subset of the sample space $\Omega$ of the experiment. We call $\Omega$ the certain event and $\varnothing$ the impossible event of the experiment. We say that an event $A$ occurs if the outcome of the experiment belongs to $A$.
\end{definition}

\begin{definition}[$\sigma$-algebra]
  A $\sigma$-algebra $\ma$ of subsets of a sample space $\Omega$ is a collection of subsets of $\Omega$ such that
  \begin{enumerate}
    \item $\Omega\in\ma$,
    \item $\ma$ is closed under complementation, i.e., if $A\in\ma$, then $\Omega\setminus A\in\ma$,
    \item $\ma$ is closed under countable union, i.e., if $A_n\in\ma$ for $n=1,2,\dots$, then $\bigcup_{n=1}^\infty A_n \in\ma$.
  \end{enumerate}
\end{definition}

\begin{theorem}[Properties of $\sigma$-algebra]
  Suppose $\ma$ is a $\sigma$-algebra of subsets of a sample space $\Omega$.
  \begin{enumerate}
    \item $\varnothing\in\ma$,
    \item $\ma$ is closed under finite union,
    \item $\ma$ is closed under countable and finite intersection.
  \end{enumerate}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Intersection of $\sigma$-algebras]
  Suppose $\Gamma$ is a nonempty collection of $\sigma$-algebras of subsets of a sample space $\Omega$. Then the intersection $\mb=\bigcap_{\ma\in\Gamma}\ma$ of the $\sigma$-algebras in $\Gamma$ is also a $\sigma$-algebra of subsets of $\Omega$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Existence of Smallest $\sigma$-algebra]
  Suppose $\mathcal{C}$ is a collection of subsets of a sample space $\Omega$. Then there exists a smallest $\sigma$-algebra of subsets of $\Omega$ including $\mathcal{C}$.
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Generated $\sigma$-algebra]
  Let $\mathcal{C}$ be a collection of subsets of a sample space $\Omega$. We define the $\sigma$-algebra of subsets of $\Omega$ generated by $\mathcal{C}$ as the smallest $\sigma$-algebra of subsets of $\Omega$ including $\mathcal{C}$ and denote it by $\sigma(\mathcal{C})$.
\end{definition}

\begin{definition}[Probability Measure]
  Let $\ma$ be a $\sigma$-algebra of subsets of a sample space $\Omega$, a probability measure $\mathbb{P}:\ma\to\mr$ on $\ma$ is a real-valued function on $\ma$ such that
  \begin{enumerate}
    \item Nonnegativity: $\mathbb{P}(A)\gs0$, $\forall A\in\ma$,
    \item Normalization: $\mathbb{P}(\Omega)=1$,
    \item Countable additivity: If $A_1,A_2,\dots$ are pairwise disjoint events in $\ma$ then $$\mathbb{P}\left(\bigcup_{n=1}^\infty A_n \right)=\sumn \mathbb{P}(A_n).$$
  \end{enumerate}
  For an event $A\in\ma$, we call $\mathbb{P}(A)$ the probability of the event $A$.
\end{definition}

\begin{definition}[Probability Space]
  A probability space is an ordered triple $(\Omega,\ma,\mathbb{P})$ consisting of a sample space $\Omega$, a $\sigma$-algebra $\ma$  of subsets of $\Omega$, and a probability measure $\mathbb{P}$ on $\ma$.
\end{definition}

\begin{theorem}[A Kind of Probability Measure]
  Suppose $\Omega=\{\omega_1,\omega_2,\dots\}$, $\ma=\mathcal{P}(\Omega)$ and $\mathbb{P}(A)=\sum_{\omega_i\in A}P_i$, for all $A\in\mathcal{P}(\Omega)$, where $P_i\gs0$, $\forall i=1,2,\dots$, and $\sumi P_i =1$, then $\mathbb{P}$ is a probability measure on $\mathcal{P}(\Omega)$. A similar result holds if $\Omega=\{\omega_1,\omega_2,\dots,\omega_N\}$, where $N\gs1$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[A Kind of Probability Measure (special)]
  Suppose $\Omega=\{\omega_1,\omega_2, \cdots,\omega_N\}$, $\ma=\mathcal{P}(\Omega)$, and $\mathbb{P}(A)=\frac{|A|}{N}$ for all $A\in\mathcal{P}(\Omega)$, then $\mathbb{P}$ is a probability measure on $\mathcal{P}(\Omega)$.
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Classical definition of probability]
  Suppose $\Omega=\{\omega_1,\omega_2, \cdots,\omega_N\}$, $\ma=\mathcal{P}(\Omega)$ and $\mathbb{P}$ is a probability measure on $\mathcal{P}(\Omega)$ such that $\mathbb{P}(\{\omega_1\})=\mathbb{P}(\{\omega_2\})=\cdots=\mathbb{P}(\{\omega_N\})$, then $\mathbb{P}(A)=\frac{|A|}{N}$ for all $A\in\mathcal{P}(\Omega)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Properties of Probability Measure]
  Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space.
  \begin{enumerate}
    \item $\mathbb{P}(\varnothing)=0$.
    \item $\mathbb{P}(A)+\mathbb{P}(A^c)=1$. Therefore, $0\ls\mathbb{P}(A)\ls1$, for all $A\in\ma$.
    \item Finite additivity: If $A_1,A_2,\dots,A_N$ are pairwise disjoint events in $\ma$, then $$\mathbb{P}\left(\bigcup_{n=1}^NA_n \right)=\sum_{n=1}^N\mathbb{P}(A_n).$$
  \end{enumerate}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Properties of Probability Measure]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A,B\in\ma$.
\\
(1) If $A_1,A_2,\cdots$ are pairwise disjoint events in $\ma$ and 
$$ \bigcup_{n=1}^\infty A_n =\Omega,$$ then 
$$\mathbb{P}(A)= \sum_{n=1}^\infty \mathbb{P}\left(A \cap A_n \right).$$
(2) $\mathbb{P}(A)=\mathbb{P}(A\cap B)+\mathbb{P}(A\cap B^c )$ for all $A,B\in\ma$. In particular, if $B\subseteq A$, then $\mathbb{P}(A)=\mathbb{P}(B)+\mathbb{P}(A\cap B^c )$.\\
(3) $\mathbb{P}(A\cap B)  \ls \min \{\mathbb{P}(A),\mathbb{P}(B)\}  \ls \max\{ \mathbb{P}(A),\mathbb{P}(B)\} \ls  \mathbb{P}(A\cup B)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Finite Additivity under Union]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, $A\in\ma$, $A_1,A_2,\cdots$ are pairwise disjoint events in $\ma$, and $$\mathbb{P}\left(\bigcup_{n=1}^\infty A_n \right)=1,$$ then 
$$\mathbb{P}(A)=\sum_{n=1}^\infty \mathbb{P}\left(A\cap A_n \right).$$ 
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Inclusion-exclusion Identity]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A_1,A_2, \cdots,A_n\in\ma $, where $n \gs2$, then $$\mathbb{P}\left(\bigcup_{i=1}^n A_i \right)=\sumkfn(-1)^{k+1}\cdot  \sum_{1\ls i_1<i_2<\cdots<i_k\ls n}\mathbb{P}\left(A_{i_1} \cap A_{i_2 } \cap\cdots\cap A_{i_k } \right).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{lemma}[Generated Pairwise Disjoint]
Suppose $\ma$ is a $\sigma$-algebra of subsets of a sample space $\Omega$, suppose $A_1,A_2,\cdots\in\ma$, $B_1=A_1$, and $$B_n=A_n\setminus\bigcup_{i=1}^{n-1}A_i$$  for all $n\gs2$, then $B_1,B_2,\cdots$ are pairwise disjoint events in $\ma$, $$\bigcup_{i=1}^nA_i =\bigcup_{i=1}^nB_i$$  for all $n\gs1$, and $$\bigcup_{n=1}^\infty A_n =\bigcup_{n=1}^\infty B_n.$$
\end{lemma}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Inclusion-exclusion Inequality]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A_1,A_2, \cdots,A_n\in\ma$, where $n\gs2$, then
$$\mathbb{P}\left(\bigcup_{i=1}^n A_i \right)
\begin{cases}
\ls\sum_{k=1}^m(-1)^{k+1}\cdot  \sum_{1\ls i_1<i_2<\cdots<i_k\ls n}\mathbb{P}\left(A_{i_1} \bigcap A_{i_2 } \bigcap\cdots\bigcap A_{i_k } \right),& \text{if $m$ is odd}\\
\gs\sum_{k=1}^m(-1)^{k+1}\cdot  \sum_{1\ls i_1<i_2<\cdots<i_k\ls n}\mathbb{P}\left(A_{i_1} \bigcap A_{i_2 } \bigcap\cdots\bigcap A_{i_k } \right),& \text{if $m$ is even}
\end{cases}
$$
where $1\ls m\ls n$.\\
In particular,
$$\mathbb{P}\left(\bigcup_{i=1}^n A_i \right)\ls\sumin \mathbb{P}(A_i),$$
$$\mathbb{P}\left(\bigcup_{i=1}^n A_i \right)\gs\sumin \mathbb{P}(A_i)-
\sum_{1\ls i<j\ls n}\mathbb{P}\left(A_i\cap A_j\right).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Boole's Inequality]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A_1,A_2,\cdots\in\ma$, then 
$$\mathbb{P}\left(\bigcup_{i=1}^\infty A_i \right)\ls\sum_{i=1}^\infty \mathbb{P}(A_i ).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Monotonicity]
  Let $(\Omega,\ma,\mathbb{P})$ be a probability space.

  A sequence $\{A_1,A_2,\cdots\}$ of events in $\ma$ is increasing if $A_1\subseteq A_2\subseteq\cdots$.

  A sequence $\{A_1,A_2,\cdots\}$ of events in $\ma$ is decreasing if $A_1\supseteq A_2\supseteq\cdots$.
\end{definition}

\begin{definition}[Limit of Events]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space.\\
(1) The limit $\limn A_n$ of an increasing sequence $\{A_1,A_2,\cdots\}$ of events in $\ma$ is the event that at least one of the events occurs, i.e., $$\limn A_n=\bigcup_{n=1}^\infty A_n.$$
(2) The limit $\limn A_n$ of a decreasing sequence $\{A_1,A_2,\cdots\}$ of events in $\ma$ is the event that all the events occur, i.e., $$\limn A_n=\bigcap_{n=1}^\infty A_n.$$
\end{definition}

\begin{theorem}[Continuity of Probability Measure]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space.\\
(1) Suppose that $\{A_1,A_2,\cdots\}$ is an increasing sequence of events in $\ma$. Then $$\mathbb{P}\left(\limn A_n\right)=\limn \mathbb{P}(A_n).$$
(2) Suppose that $\{A_1,A_2,\cdots\}$ is a decreasing sequence of events in $\ma$. Then $$\mathbb{P}\left(\limn A_n\right)=\limn \mathbb{P}(A_n).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Not Necessary]
If $\mathbb{P}(A)=0$, then it is not necessary that $A=\varnothing$, e.g., $\Omega=(0,1)$ and $A=A_\alpha, \alpha\in(0,1)$.
If $\mathbb{P}(A)=1$, then it is not necessary that $A=\Omega$, e.g., $\Omega=(0,1)$ and $A=A_\alpha^c, \alpha\in(0,1)$.
\end{remark}

\begin{definition}[Length]
The length of each of the intervals $(a,b), [a,b), (a,b], [a,b]$ is defined to be $(b-a)$.
\end{definition}

\begin{definition}[Random]
A point is said to be randomly selected from an interval $(a,b)$ if any subintervals of $(a,b)$ with the same length are equally likely to contain the randomly selected point.
\end{definition}

\begin{theorem}[Probability of Randomness]
The probability that a randomly selected point from $(a,b)$ falls in the subinterval $(\alpha,\beta)$ of $(a,b)$ is $$\mathbb{P}=\frac{\beta-\alpha}{b-a}.$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Borel Algebra]
The $\sigma$-algebra of subsets of $(a,b)$ generated by the set of all subintervals of $(a,b)$ is called Borel algebra associated with $(a,b)$ and is denoted $\mathcal{B}_{(a,b)}$.
\end{definition}

\begin{theorem}[Existence of Probability Measure]
For any interval $(a,b)$, there exists a unique probability measure $\mathbb{P}$ on $\mathcal{B}_{(a,b)}$ s.t., $$\mathbb{P}((\alpha,\beta))=\frac{\beta-\alpha}{b-a},$$
for all $(\alpha,\beta)\subseteq(a,b)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{Combinatorial Methods}

\begin{theorem}[Counting Principle]
There are $n_1\times n_2\times\cdots\times n_k$ different ways in which we can first choose an element from a set of $n_1$  elements, then an element from a set of $n_2$ elements,..., and finally an element from a set of $n_k$ elements.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Permutation]
An ordered arrangement of $r$ objects from a set $A$ containing $n$ objects is called an $r$-element permutation of $A$, where $0\ls r\ls n$. 

An $n$-element permutation of $A$ is called a permutation of $A$. The number of different $r$-element permutations of $A$ is given by $$_nP_r =n\times(n-1)\times(n-2)\times\cdots\times(n-r+1)=\frac{n!}{(n-r)!}.$$
\end{definition}

\begin{theorem}[Permutation with Types]
The number of different (w.r.t. types) permutations of $n$ objects of $k$ different types is $$\frac{n!}{n_1 !\times n_2 !\times\cdots\times n_k !},$$
where $n_1$  are alike, $n_2$  are alike,..., $n_k$  are alike, and $n=n_1+n_2+\cdots+n_k $.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Combination]
An unordered arrangement of $r$ objects from a set $A$ containing $n$ objects is called an $r$-element combination of $A$. The number of different $r$-element combinations of $A$ is given by 
$$_n C_r =\binom n r=\frac{_n P_r}{r!}=\frac{n!}{(n-r)!r!}.$$
\end{definition}

\begin{theorem}[Property of Combination]
$$\sum_{i=0}^k\binom {n+i} i=\sum_{i=0}^k \binom {n+i} n
=\binom {n+k+1} k$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Multinomial Expansion]
$$(x_1+x_2+\cdots+x_k )^n=\sum_
{\mbox{\tiny $\begin{aligned}
&n_1+n_2+\cdots+n_k=n \\
&n_1,n_2, \cdots,n_k\gs0
\end{aligned}$}}
 \frac{n!}{n_1 !\times n_2 !\times\cdots\times n_k !}\cdot x_1^{n_1 } x_2^{n_2 }\cdots x_k^{n_k},\forall n\gs0.$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Binomial Expansion]
$$(x+y)^n=\sum_{i=0}^n\binom n i  x^i y^{n-i},~\forall n\gs 0.$$
\end{corollary}

\begin{theorem}[Stirling's Formula]
$$\sqrt{2\pi n} 
\left(\frac{n}{e}\right)^n
\cdot
\exp\left(\frac{1}{12n}-\frac{1}{360 n^2 }\right) 
<n!
<\sqrt{2\pi n} 
\left(\frac{n}{e}\right)^n
\cdot\exp\left(\frac{1}{12n}\right), 
\forall n\gs1.$$
Therefore, 
$$n!\sim\sqrt{2\pi n} \left(\frac{n}{e}\right)^n, \text{i.e.}, \limn \frac{n!}{\sqrt{2\pi n} \left(\frac{n}{e}\right)^n}=1.$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{Conditional Probability and Independence}

\begin{definition}[Conditional Probability]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space, and $A,B\in\ma$. The conditional probability of $A$ given $B$, denoted $\mathbb{P}(A|B)$, is given by
$$\mathbb{P}(A|B)=
\begin{cases}
\frac{\mathbb{P}(A\cap B)}{\mathbb{P}(B)}, & \text{if } \mathbb{P}(B)>0,  \\
0, & \text{if } \mathbb{P}(B)=0.
\end{cases}
$$
\end{definition}

\begin{remark}[Property of Conditional Probability]
$$\mathbb{P}\left(A\cap B\right)=\mathbb{P}(B)\cdot \mathbb{P}(A|B),\forall A,B\in\ma.$$ 
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Conditional Probability Space]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $\mathbb{P}(B)>0$, for some $B\in\ma$. 
Then the conditional probability function $\mathbb{P}(\cdot|B): \ma\to\mr$ is a probability measure on $\ma$, and hence $(\Omega,\ma, \mathbb{P}(\cdot|B))$ is a probability space.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Reduction of Probability Space]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $\mathbb{P}(B)>0$, for some $B\in\ma$. Let $\ma_B=\{A\in\ma: A\subseteq B\}$ and $P_B (A)=\mathbb{P}(A|B)$  for all $A\in\ma_B$.  Then $\ma_B$ is a $\sigma$-algebra of subsets of $B$ and $P_B$ is a probability measure on $\ma_B$, and hence $(B,\ma_B,P_B )$ is a probability space.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Conversion of Reduced and Conditional Probability Space]
Note that $\mathbb{P}(A|B)=\mathbb{P}(A\cap B|B)=P_B (A\cap B),\forall A\in\ma.$ 
And $\mathbb{P}(A|B)=P_B (A)$, if $A\in\ma$ and $A\subseteq B$.
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Law of Multiplication]
  Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and $A_1,A_2,\dots,A_n\in\ma$. Then $$\mathbb{P}\left(A_1\cap A_2\cap\cdots\cap A_n\right)=\mathbb{P}(A_1 )\mathbb{P}(A_2|A_1 )\cdots \mathbb{P}\left(A_n|A_1 \cap A_2\cap\cdots\cap A_{n-1}\right).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Law of Total Probability (infinite)]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $B_1,B_2,\cdots\in\ma$ are pairwise disjoint and $\bigcup_{n=1}^\infty B_n=\Omega$. Then,\\
(1) $\mathbb{P}(A)=\sumn \mathbb{P}(B_n )\cdot \mathbb{P}(A|B_n ),\forall A\in\ma$.\\
(2) $\mathbb{P}(A|B)=\sumn \mathbb{P}(B_n |B)\cdot \mathbb{P}(A|B\cap B_n ),\forall A,B\in\ma.$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Law of Total Probability (finite)]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $B_1,B_2,\cdots,B_n\in\ma$ are pairwise disjoint and $\bigcup_{i=1}^n B_i=\Omega$. Then,\\
(1) $\mathbb{P}(A)=\sumin \mathbb{P}(B_i )\cdot \mathbb{P}(A|B_i ),\forall A\in\ma$.\\
(2) $\mathbb{P}(A|B)=\sumin \mathbb{P}(B_i |B)\cdot \mathbb{P}(A|B\cap B_i ),\forall A,B\in\ma.$
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Bayes' Theorem (infinite)]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $B_1,B_2,\cdots\in\ma$ are pairwise disjoint and $\bigcup_{n=1}^\infty B_n=\Omega$.  Then $$\mathbb{P}(B_k|A)=\frac{\mathbb{P}(B_k)\cdot \mathbb{P}(A|B_k)}{\sumn \mathbb{P}(B_n )\cdot \mathbb{P}(A|B_n )},\forall A\in\ma, \mathbb{P}(A)>0,k=1,2,\dots$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Bayes' Theorem (finite)]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $B_1,B_2,\cdots,B_n\in\ma$ are pairwise disjoint and $\bigcup_{i=1}^n B_i=\Omega$.  Then $$\mathbb{P}(B_k|A)=\frac{\mathbb{P}(B_k)\cdot \mathbb{P}(A|B_k)}{\sumin \mathbb{P}(B_i )\cdot \mathbb{P}(A|B_i )},\forall A\in\ma, \mathbb{P}(A)>0,k=1,2, \cdots,n$$
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Properties of Conditional Probability]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A,B\in\ma$.

(1) $\mathbb{P}(A|B)>\mathbb{P}(A)\Leftrightarrow \mathbb{P}(A\cap B)>\mathbb{P}(A)\cdot \mathbb{P}(B)\Leftrightarrow \mathbb{P}(B|A)>\mathbb{P}(B)$

 $\begin{aligned}
\text{(2) }\mathbb{P}(A|B)<\mathbb{P}(A),\mathbb{P}(B)>0&\Leftrightarrow \mathbb{P}\left(A\cap B\right)<\mathbb{P}(A)\cdot \mathbb{P}(B)\\
                              &\Leftrightarrow \mathbb{P}(B|A)<\mathbb{P}(B),\mathbb{P}(A)>0
                                     \end{aligned}$
                                                          
(3) $\mathbb{P}(A|B)=\mathbb{P}(A)\Rightarrow \mathbb{P}(A\cap B)=\mathbb{P}(A)\cdot \mathbb{P}(B)$

    $\quad \mathbb{P}(A\cap B)=\mathbb{P}(A)\cdot \mathbb{P}(B), \mathbb{P}(A)=0$ or $\mathbb{P}(B)>0\Rightarrow \mathbb{P}(A|B)=\mathbb{P}(A)$
    
If $\mathbb{P}(A)=0$ or $\mathbb{P}(B)>0$, then $\mathbb{P}(A|B)=\mathbb{P}(A)\Leftrightarrow \mathbb{P}(A\cap B)=\mathbb{P}(A)\cdot \mathbb{P}(B)  $

\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Independence]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space, and $A,B\in\ma$. 
If $\mathbb{P}(A\cap B)=\mathbb{P}(A)\cdot \mathbb{P}(B)$, then $A$ and $B$ are said to be independent, denoted $A\perp B$. If $A$ and $B$ are not independent, they are said to be dependent. 
Furthermore, if $\mathbb{P}(A|B)>\mathbb{P}(A)$, then $A$ and $B$ are said to be positively correlated, and if $\mathbb{P}(A|B)<\mathbb{P}(A)$, then $A$ and $B$ are said to be negatively correlated.
\end{definition}

\begin{theorem}[Properties of Independence]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose  $A,B\in\ma$.

(1) If $\mathbb{P}(A)=0$ or $\mathbb{P}(A)=1$, then $A\perp B, \forall B\in\ma$.

(2) If $A\subseteq B$ and $A\perp B$, then either $\mathbb{P}(A)=0$ or $\mathbb{P}(B)=1$.

(3) If $A$ and $B$ are disjoint and $\mathbb{P}(A)>0$, $\mathbb{P}(B)>0$, then they are dependent.

\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independence of Two Events]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A,B\in\ma$, and $A\perp B$.

Then $A^*\perp B^*$, i.e., $\mathbb{P}(A^*\cap B^* )=\mathbb{P}(A^* )\cdot \mathbb{P}(B^* ),\forall  A^*=A, A^c; B^*=B, B^c$. 
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Conditional Probability with Independence]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A,B\in\ma$, and $A\perp B$.

If $\mathbb{P}(B)>0$, then $\mathbb{P}(A^*|B)=\mathbb{P}(A^* ),\forall A^*=A, A^c$.\\
If $\mathbb{P}(B)<1$, then $\mathbb{P}(A^*|B^c )=\mathbb{P}(A^* ),\forall A^*=A, A^c$.

\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Conditional Probability with Independence]
If $A\perp B$ and $\mathbb{P}(B)>0$, then knowledge about the occurrence of $B$ does not change the probability of the occurrence of $A^*$.

If $A\perp B$ and $\mathbb{P}(B)<1$, then knowledge about the occurrence of $B^c$  does not change the probability of the occurrence of $A^*$.

\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Independent Set]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space, and $A_1,A_2, \cdots,A_n\in\ma$, where $n\gs2$.\\
If $\mathbb{P}\left(A_{i_1}\cap A_{i_2}\cap\cdots\cap A_{i_k}\right)=\mathbb{P}(A_{i_1} )\mathbb{P}(A_{i_2})\cdots \mathbb{P}(A_{i_k } )$ for all $2\ls k\ls n$ and all $1\ls i_1<i_2<\cdots<i_k\ls n$, which amounts to
$$\sum_{k=2}^n \binom n k=2^n-n-1$$
conditions in total, then $A_1,A_2, \cdots,A_{n }$ are said to be independent; otherwise, they are said to be dependent.
\end{definition}

\begin{remark}[Sub Independent Set]
If $A_1,A_2, \cdots,A_n\in\ma$ are independent, then $A_{i_1},A_{i_2}, \cdots,A_{i_k}$ are independent, $\forall 2\ls k\ls n, 1\ls i_1<i_2<\cdots <i_k\ls n$. 
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Equivalent Statements of Independence]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, $A_1,A_2, \cdots,A_n\in\ma$, where $n\gs2$. The following statements are equivalent:

(1) $A_1,A_2, \cdots,A_n$ are independent.

(2) $\mathbb{P}\left(A_{i_1}^*\bigcap A_{i_2}^*\bigcap\cdots\bigcap A_{i_k}^*\right)=\mathbb{P}\left(A_{i_1}^*\right) \mathbb{P}\left(A_{i_2}^* \right)\cdots \mathbb{P}\left(A_{i_k}^* \right), \forall 2\ls k\ls n,$

$\quad  1\ls i_1<i_2<\cdots <i_k\ls n, A_{i_r}^*=A_{i_r}$ or $A_{i_r}^c$.
       
(3) $\mathbb{P}\left(A_1^*\cap A_2^*\cap\cdots\cap A_n^*\right)=\mathbb{P}\left(A_1^* \right)\mathbb{P}\left(A_2^* \right)\cdots \mathbb{P}\left(A_n^* \right), \forall A_i^*=A_i, A_i^c,$

$\quad   i=1,2, \cdots,n$.

\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Independent Set]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space, and $A_i\in\ma, \forall i\in I$, where $I$ is an index set, then $\{A_i: i\in I\}$ is said to be independent if any finite subset of $\{A_i: i\in I\}$ is independent; otherwise, it is said to be dependent.
\end{definition}

\begin{corollary}[Independence under Finite Union]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and suppose $A_1,A_2, \cdots,A_n\in\ma$ are independent. Then
$$\begin{aligned}
&\mathbb{P}\left[\left(A_{i_1}^* \bigcap A_{i_2}^*\bigcap\cdots\bigcap A_{i_k}^*\right)\bigcap\left(A_{j_1}^* \bigcap A_{j_2}^*\bigcap\cdots\bigcap A_{j_l}^* \right)\right] \\
=&\mathbb{P}\left(A_{i_1}^*\bigcap A_{i_2}^*\bigcap\cdots\bigcap A_{i_k}^*\right)\cdot \mathbb{P}\left(A_{j_1}^*\bigcap A_{j_2}^*\bigcap\cdots\bigcap A_{j_l}^*\right)
\end{aligned}$$
 $\forall k,l\gs1, k+l\ls n, 1\ls i_1,i_2, \cdots,i_k,j_1,j_2, \cdots,j_l\ls n$ distinct, and $A_{i_r}^*=A_{i_r}$ or $A_{i_r}^c, r=1,2, \cdots,k, A_{j_r}^*=A_{j_r}$ or $A_{j_r}^c, r=1,2, \cdots,l.$

In particular, if $\mathbb{P}\left(A_{j_1}^* \bigcap A_{j_2}^* \bigcap\cdots\bigcap A_{j_l}^*\right)>0$, for some $1\ls l\ls n-1, 1\ls j_1, \cdots,j_l\ls n$ distinct, and $A_{j_r}^*=A_{j_r}$ or $A_{j_r}^c, r=1,2, \cdots,l$, then 
$$\begin{aligned}
&\mathbb{P}\left[\left(A_{i_1}^*\bigcap A_{i_2}^*\bigcap\cdots\bigcap A_{i_k}^*\right)\bigg|\left(A_{j_1}^*\bigcap A_{j_2}^*\bigcap\cdots\bigcap A_{j_l}^*\right)\right]\\
=&\mathbb{P}\left(A_{i_1}^*\bigcap A_{i_2}^*\bigcap\cdots\bigcap A_{i_k}^*\right)\end{aligned}$$
 for all $1\ls k\ls n-l$, $i_1,i_2, \cdots,i_k\in\{1,2, \cdots,n\}\backslash\{j_1,j_2, \cdots,j_l \}$ distinct, and $A_{i_r}^*=A_{i_r }$ or $A_{i_r}^c,  r=1,2, \cdots,k$.
\end{corollary}

\begin{proof}
  abc
\end{proof}

% chapter 4
\chapter{Distribution Functions and Discrete Random Variables}

\section{Random Variables}

\begin{definition}[Measurable Space]
A measurable space is an ordered pair $(\Omega,\ma)$ consisting of a sample space $\Omega$ and a $\sigma$-algebra $\ma$ of subsets of $\Omega$.
\end{definition}

\begin{definition}[Measurable Function]
Let $(\Omega_1,\ma_1)$, $(\Omega_2,\ma_2)$ be measurable spaces. A function $f$ from $\Omega_1$ to $\Omega_2$ is called a measurable function from $(\Omega_1,\ma_1)$ to  $(\Omega_2,\ma_2)$ if $f^{-1}(B)\in \ma_1,\forall B\in\ma_2$, where $f^{-1} (B)=\{x\in\Omega_1:f(x)\in B\}$ is the 
pre-image of $B$ under $f$.
\end{definition}

\begin{lemma}[$\sigma$-algebra under Function]
Suppose $f$ is a function from $\Omega_1$ to $\Omega_2$.\\
(1) If $\ma_2$ is a $\sigma$-algebra of subsets of  $\Omega_2$, then $\ma_1=\{f^{-1} (B):B\in\ma _2\}$ is a $\sigma$-algebra of subsets of $\Omega_1$.\\
(2) If $\ma_1$ is a $\sigma$-algebra of subsets of $\Omega_1$, then $\ma_2=\{B\subseteq\Omega_2:f^{-1} (B)\in\ma_1\}$ is a $\sigma$-algebra of subsets of $\Omega_2$.
\end{lemma}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[$\sigma$-algebra Including Subset]
Suppose $(\Omega_1,\ma_1)$ is a measurable space and $f$ is a function from $\Omega_1$ to $\Omega_2$. If $\mc\subseteq\{B\subseteq\Omega_2: f^{-1} (B)\in\ma_1\}$, then $\sigma(\mc)\subseteq\{B\subseteq\Omega_2: f^{-1} (B)\in\ma_1\}$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[A Kind of Measurable Function]
Suppose $(\Omega_1,\ma_1)$, $(\Omega_2,\ma_2)$ are measurable spaces, and $f$ is a function from $\Omega_1$ to $\Omega_2$. Suppose $\mc\subseteq\{B\subseteq\Omega_2: f^{-1} (B)\in\ma_1\}$ and $\sigma(\mc)\supseteq\ma_2$. Then $f$ is a measurable function from $(\Omega_1,\ma_1)$ to $(\Omega_2,\ma_2)$.
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Composite Measurable Function]
Suppose $(\Omega_1,\ma_1)$, $(\Omega_2,\ma_2)$, $(\Omega_3,\ma_3)$ are measurable spaces, $f$ is a measurable function from $(\Omega_1,\ma_1)$ to $(\Omega_2,\ma_2)$, and $g$ is a measurable function from $(\Omega_2,\ma_2)$ to $(\Omega_3,\ma_3)$. Then $g\circ f$ is a measurable function from  $(\Omega_1,\ma_1)$ to $(\Omega_3,\ma_3)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Open Set]
A set $A$ in $\mr^n$ is called an open set in $\mr^n$ if for every $\mx\in A$ there exists $r>0$ such that $\mb_{\mx}(r)\subseteq A$, where $\mb_{\mx}(r)=\left\{\my\in\mr^n:\lVert\my-\mx\rVert<r\right\}$.
\end{definition}

\begin{definition}[Borel $\sigma$-algebra]
The $\sigma$-algebra generated by the set of all open sets in $\mr^n$ is called the Borel $\sigma$-algebra of subsets of $\mr^n$ and is denoted by $\mb_{\mr^n}$. We call a set in $\mb_{\mr^n}$ a Borel set in $\mr^n$.
\end{definition}

\begin{theorem}[Measurable Function from Continuity]
Suppose $f$ is a continuous function from $\mr^n$ to $\mr^m$. Then $f$ is a measurable function from $(\mr^n,\mb_{\mr^n})$ to $(\mr^m,\mb_{\mr^m})$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Cell]
A cell in $\mr$ is a finite interval of the form $(a,b),[a,b),(a,b]$, or $[a,b]$ for some $a\ls b$. A cell $I$ in $\mr^n$, where $n\gs1$, is a Cartesian product of $n$ cells $I_1,I_2, \cdots,I_n$ in $\mr$, i.e., $I=I_1\times I_2\times\cdots\times I_n$.
\end{definition}

\begin{definition}[Open Cube]
Let $\mx\in\mr^n$, $l>0$, and $I_i=\left(x_i-\frac{l}{2},x_i+\frac{l}{2}\right)$,$\forall1\ls i\ls n$. The open cube $C_{\mx}(l)$ in $\mr^n$ with center $\mx$ and side length $l$ is defined as the open cell $I_1\times I_2\times\cdots\times I_n$  in $\mr^n$.
\end{definition}

\begin{theorem}[Set from Cells]
Every open set in $\mr^n$ is a countable union of open cells in $\mr^n$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Measurable Function on Open Cells]
Suppose $(\Omega,\ma)$ is a measurable space and $f$ is a function from $\Omega$ to $\mr^n$. Suppose that $f^{-1}(B)\in\ma$ for all open cells $B$ in $\mr^n$. Then $f$ is a measurable function from $(\Omega,\ma)$ to $(\mr^n,\mb_{\mr^n})$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Components of Measurable Function]
Suppose $(\Omega,\ma)$ is a measurable space,  $f=(f_1,f_2, \cdots,f_n )$  is a function from $\Omega$ to $\mr^n$. Then $f$ is a measurable function from  $(\Omega,\ma)$ to $(\mr^n,\mb_{\mr^n})\Leftrightarrow f_1,f_2, \cdots,f_n$ are measurable functions from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Elementary Operation of Measurable Function]
Suppose $f$ and $g$ are measurable functions from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$, and $c\in\mr$. Then $cf, f^n, |f|, f+g, f\cdot g$ are measurable functions from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Limit of Measurable Functions]
Suppose that $f_1,f_2,\cdots$ are measurable functions from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$ and $f_n\to f$ as $n\to\infty$, where $f$ is a function from $\Omega$ to $\mr$. Then $f$ is also a measurable function from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Equivalence of Nine Types of Set]
Suppose $(\Omega,\ma)$ is a measurable space and $f$ is a function from $\Omega$ to $\mr$. Let $\mc_1$ be the set of all open sets in $\mr$, 
$$
\begin{array}{ll}
\mc_2=\{(a,b),a,b\in\mr,a\ls b\},    &\mc_3=\{(a,b],a,b\in\mr,a\ls b\},\\
\mc_4=\{[a,b],a,b\in\mr,a\ls b\},     &\mc_5=\{[a,b),a,b\in\mr,a\ls b\},\\
\mc_6=\{[a,+\infty),a\in\mr\},        &\mc_7=\{(a,+\infty),a\in\mr\},\\
\mc_8=\{(-\infty,a],a\in\mr\},        &\mc_9=\{(-\infty,a),a\in\mr\}.\\
\end{array}
$$
Then $f$ is a measurable function from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$
if $f^{-1} (B)\in\ma, \forall B\in\mc_i$, for some $i\in\{1,2, \cdots,9\}$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Induced Probability Space under Function]
Suppose $f$ is a measurable function from $(\Omega_1,\ma_1)$ to $(\Omega_2,\ma_2)$. Suppose $P$ is a probability measure on $\ma_1$.
Then the function $P_f$ on $\ma_2$ given by $$P_f (B)=P[f^{-1} (B)],  \forall B\in\ma_2$$ is a probability measure.\\
We call $(\Omega_2,\ma_2,P_f)$ the probability space induced from $(\Omega_1,\ma_1,P)$ under $f$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Conventional Denotation]
(1) The set $f^{-1} (B)$ is conventionally denoted as $f\in B$. Therefore $P_f (B)=P[f^{-1} (B)]=\mathbb{P}(f\in B), \forall B\in\ma_2$.\\
(2) If $B\in\ma_2$, then $f^{-1} (B)=f^{-1}[B\cap f(\Omega_1)]$, and hence
$P_f (B)=\mathbb{P}(f\in B)=P[f^{-1} (B)]=P[f^{-1} (B\cap f(\Omega_1))]=P[ f\in (B\cap f(\Omega_1))]=P_f (B\cap f(\Omega_1)).$
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Random Variable]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space. A measurable function $X$ from $(\Omega,\ma)$ to $(\mr,\mb_{\mr})$ is called a random variable (r.v.) of the probability space $(\Omega,\ma,\mathbb{P})$.\\
A measurable function $\mX=(X_1,X_2, \cdots,X_n)$ from $(\Omega,\ma)$ to $(\mr^n,\mb_{\mr^n})$ is called a random vector (r.vect.) of the probability space $(\Omega,\ma,\mathbb{P})$.
\end{definition}

\begin{remark}[Conventional Denotation of Random Variable]
If $X$ is a r.v. of the probability space $(\Omega,\ma,\mathbb{P})$, then $P_X (B)=P[X^{-1} (B)]=\mathbb{P}(X\in B)=P[\{w\in\Omega:X(w)\in B\}], \forall B\in\mb_{\mr}$.
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Additivity of Countable Points]
Suppose $\mX$ is a r.vect. of a probability space $(\Omega,\ma,\mathbb{P})$, and $B$ is a ``countable'' subset of $\mr^n$, then $B\in\mb_{\mr^n}$, and
$$P_{\mX} (B)=\mathbb{P}(\mX\in B)=\sum_{\mx\in B}\mathbb{P}(\mX=\mx)=\sum_{\mx\in B}P_{\mX}(\{\mx\}).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Distribution Functions}

\begin{definition}[Cumulative Distribution Function]
Let $X$ be a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. The cumulative distribution function (c.d.f) $F_X$ of the r.v. $X$ is a function from $\mr$ to $[0,1]$, given by
$$F_X (t)=P_X ((-\infty,t])=\mathbb{P}(X\in(-\infty,t])=\mathbb{P}(X\ls t), \forall t\in\mr.$$
\end{definition}

\begin{theorem}[Properties of C.D.F]
Suppose $X$ is a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) $F_X$ is increasing.\\
(2) $F_X (+\infty):= \limtpi F_X (t)=1$.\\
(3) $F_X (-\infty):= \limtni F_X (t)=0$.\\
(4) $F_X (t+)=\mathbb{P}(X\ls t)=F_X (t)$. $F_X (t)$ is right continuous.\\
(5) $F_X (t-)=\mathbb{P}(X<t)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[More Properties of C.D.F]
Suppose $X$ is a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) $\mathbb{P}(X\ls a)=F_X (a), \mathbb{P}(X>a)=1-F_X (a)$.\\
(2) $\mathbb{P}(X<a)=F_X (a-), \mathbb{P}(X\gs a)=1-F_X (a-)$.\\
(3) $\mathbb{P}(X=a)=F_X (a)-F_X (a-)$.\\
(4) $\mathbb{P}(a<X\ls b)=F_X (b)-F_X (a),\quad \mathbb{P}(a\ls X\ls b)=F_X (b)-F_X (a-),$\\
\quad$\mathbb{P}(a<X<b)=F_X (b-)-F_X (a),\quad \mathbb{P}(a\ls X<b)=F_X (b-)-F_X (a-).$
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Existence of C.D.F]
Suppose $F: \mr\to[0,1]$ is a function s.t. $F$ is increasing and right continuous, 
$$\limtpi F (t)=1,\qquad\limtni F (t)=0.$$
Then there exists a r.v. $X$ of some probability space $(\Omega,\ma,\mathbb{P})$,
s.t. the c.d.f. $F_X$ of $X$ is equal to $F$. We call such function a c.d.f.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Discrete Random Variables}

\begin{definition}[Discrete R.V.]
A r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a discrete r.v. if $X(\Omega)=\{X(w):w\in\Omega\}$ is countable.
\end{definition}

\begin{definition}[Probability Mass Function]
Let $X$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ s.t. $X(\Omega)=\{x_1,x_2,\cdots\}$. The probability mass function (p.m.f) $p_X: \mr\to[0,1]$ of $X$ is a function from $\mr$ to $[0,1]$ given by $p_X (x)=P_X (\{x\})=\mathbb{P}(X=x), \forall x\in\mr$.
\end{definition}

\begin{theorem}[Properties of P.M.F]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. Then,\\
(1) $p_X (x)\gs0, \forall x\in X(\Omega)$.\\
(2) $p_X (x)=0, \forall x\in\mr\setminus X(\Omega)$.\\
(3) $\dis\sum_{x\in X(\Omega)}p_X (x) =1$.\\
Therefore if $X(\Omega)=\{x_1,x_2,\cdots\}$, then,\\
(1) $p_X (x_i )\gs0, \forall i=1,2,\dots$.\\
(2) $p_X (x)=0,\forall x\in\mr\setminus\{x_1,x_2,\cdots\}$.\\
(3) $\dis\sum_{i=1}^\infty p_X (x_i ) =1$.\\
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Existence of P.M.F]
Suppose $p: \mr\to[0,1]$ is a function s.t.\\
(1) $p(x_i )\gs0, \forall i=1,2,\dots$.\\
(2) $p(x)=0, \forall x\in\mr\setminus\{x_1,x_2,\cdots\}$.\\
(3) $\dis\sum_{i=1}^\infty p (x_i ) =1$,\\
for some distinct $x_1,x_2,\cdots\in\mr$.\\
Then there exists a discrete r.v. $X$ of some probability space $(\Omega,\ma,\mathbb{P})$ s.t. the p.m.f. $p_X$ of $X$ is equal to $p$. We call such a function a p.m.f.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Step Distribution Function for Discrete R.V.]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ s.t. $X(\Omega)=\{x_1,x_2,\cdots\}$, where $x_1<x_2<\cdots$. Then the distribution function of $X$ is a step function given by
$$
F_X (t)=
\begin{cases}
0, &\text{if } t<x_1,\\
\sum_{i=1}^{n} p_X (x_i ), &\text{if } x_n\ls t<x_{n+1},\ n=1,2,\dots
\end{cases}
=\sum_{i=1}^{\infty} p_X (x_i )U(t-x_i),
$$
where
$$
U(t)=\begin{cases}
1, &\text{if } t\gs0,\\
0, &\text{o.w.}
\end{cases}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Expectations of Discrete Random Variables}

\begin{definition}[Expectation]
Let $X$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. The expectation (or expected value, or mean) of $X$ is given by 
$$
\mathbb{E}[X]=\sum_{x\in X(\Omega)}x\cdot \mathbb{P}(X=x) =\sum_{x\in X(\Omega)}x\cdot p_X (x)
$$
if the sum converges absolutely. And if the sum diverges to $\pm\infty$, $\mathbb{E}[X]=\pm\infty$.
\end{definition}

\begin{remark}[Explanations of Expectation]

(1) The expectation $\dis \mathbb{E}[X]=\sum_{x\in X(\Omega)}x\cdot p_X (x)$ is the weighted average of $\{x:x\in X(\Omega)\}$ with weights $\{\mathbb{P}(X=x): x\in X(\Omega)\}$.\\
(2) The expectation $\dis \mathbb{E}[X]=\sum_{x\in X(\Omega)}x\cdot p_X (x)$ is the center of gravity of $\{\mathbb{P}(X=x): x\in X(\Omega)\}$.
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Constant]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ s.t. $X$ is a constant with probability 1, i.e., $\mathbb{P}(X=c)=1$ for some $c\in\mr$. Then $c\in X(\Omega), \mathbb{P}(X=x)=0, \forall x\in X(\Omega)\setminus\{c\}$, and $\mathbb{E}[X]=c$. In particular, if $X$ is a constant r.v. of $(\Omega,\ma,\mathbb{P})$, i.e., $X(w)=c, \forall w\in\Omega$, for some $c\in\mr$, then $\mathbb{E}[X]=c$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Composition of Function and R.V.]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and $g$ is a measurable function from $(\mr,\mb_{\mr})$ to $(\mr,\mb_{\mr})$. Then $g(X):= g\circ X$ is a discrete r.v. of $(\Omega,\ma,\mathbb{P})$ and
$$\mathbb{E}[g(X)]=\sum_{x\in X(\Omega)}g(x)\mathbb{P}(X=x).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Linearity of Expectation]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$,
$g_1,g_2, \cdots,g_n$ are measurable functions from $(\mr,\mb_{\mr})$ to $(\mr,\mb_{\mr})$, and $\alpha_1,\alpha_2, \cdots,\alpha_n\in\mr$. Then
$$
\sumin\alpha_i  g_i (X)
$$
is a discrete r.v. of $(\Omega,\ma,\mathbb{P})$ and 
$$
\mathbb{E}\left[\sumin\alpha_i  g_i (X)\right]=\sumin\alpha_i  \mathbb{E}[g_i (X)].
$$
\end{corollary}

\begin{proof}
  abc
\end{proof}

\section{Variances and Moments of Discrete Random Variables}

\begin{definition}[Variance and Standard Deviation]
Let $X$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $\mathbb{E}[X]$ exists. The variance of $X$ is given by $$\mathrm{Var}(X)=\mathbb{E}[(X-\mathbb{E}[X])^2],$$ and the standard deviation of $X$ is given by $\sigma_X=\sqrt{\mathrm{Var}(X)}$.
\end{definition}

\begin{remark}[Explanation about Variance]
The variance of a discrete r.v. measures the dispersion (or spread) of its probability masses about its expectation (center of gravity of its probability masses).
\end{remark}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Calculating Variance]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $\mathbb{E}[X]$ exists. Then $\mathrm{Var}(X)=\mathbb{E}[X^2]-(\mathbb{E}[X])^2$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Minimum Distance with Expectation]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $\mathbb{E}[X]$ exists. If $\mathbb{E}[X^2]<+\infty$, then $\mathrm{Var}(X)=\min_{a\in\mr}\mathbb{E}[(X-a)^2]$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[With Probability 1]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) $\mathbb{E}[X^2]\gs0$, ``='' holds $\Leftrightarrow X=0$ with probability 1, i.e., $\mathbb{P}(X=0)=1$.\\
(2) If $\mathbb{E}[X]$ exists, then $\mathrm{Var}(X)\gs0$, ``='' holds $\Leftrightarrow X=\mathbb{E}[X]$ with probability 1, i.e., $\mathbb{P}(X=\mathbb{E}[X])=1$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Calculating Linear Combination]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $\mathbb{E}[X]$ exists. Then $\mathrm{Var}(aX+b)=a^2 \mathrm{Var}(X)$ and $\sigma_{aX+b}=|a| \sigma_X, \forall a,b\in\mr$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Moment and Absolute Moment]
Let $X$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and $r,c\in\mr$.
$$\left\{
\begin{array}{l}
\text{The } r^{\text{th}} \text{ moment of } X \text{ is given by } \mathbb{E}[X^r],\\
\text{The } r^{\text{th}} \text{ central moment of } X \text{ is given by } \mathbb{E}[(X-\mathbb{E}[X])^r],\\
\text{The } r^{\text{th}} \text{ moment of } X \text{ about } c \text{ is given by } \mathbb{E}[(X-c)^r],\\
\text{The } r^{\text{th}} \text{ absolute moment of } X \text{ is given by } \mathbb{E}[|X|^r],\\
\text{The } r^{\text{th}} \text{ absolute central moment of } X \text{ is given by } \mathbb{E}[|X-\mathbb{E}[X] |^r],\\
\text{The } r^{\text{th}} \text{ absolute moment of } X \text{ about } c \text{ is given by } \mathbb{E}[|X-c|^r],
\end{array}
\right.$$
if the respective sum converges absolutely.
\end{definition}

\begin{theorem}[Existence of Lower Order Moment]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $0<r<s$. If $\mathbb{E}[|X|^s]$ exists, then $\mathbb{E}[|X|^r]$ exists. That is, the existence of a higher order moment of $X$ guarantees the existence of a lower order moment of $X$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Standardized Random Variables}

\begin{definition}[Standardized R.V.]
Let $X$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. If $\mathrm{Var}(X)$ exists and $\mathrm{Var}(X)\neq0$, then the standardized r.v. of $X$ is given by 
$$X^*=\frac{X-\mathbb{E}[X]}{\sigma_X}$$
i.e., $X^*$ is the number of standard deviation units by which $X$ differs from $\mathbb{E}[X]$.
\end{definition}

\begin{theorem}[Expectation and Variance of Standardized R.V.]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and $\mathrm{Var}(X)$ exists, $\mathrm{Var}(X)\neq0$. Then $\mathbb{E}[X^*]=0$ and $\mathrm{Var}(X^* )=1$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independence of Units]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and $\mathrm{Var}(X)$ exists, $\mathrm{Var}(X)\neq0$. Then the standardized r.v. of $X$ is independent of the units in which $X$ is measured.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Standardization for Comparison]
Standardization can be useful when comparing r.v.'s with different distributions.
\end{remark}

\begin{proof}
  abc
\end{proof}

\chapter{Special Discrete Distributions}
\section{Bernoulli R.V.'s and Binomial R.V.'s}

\begin{definition}[Bernoulli Trial]
A Bernoulli trial is an experiment that has only two outcomes, say success and failure, so that its sample space is given by $\Omega=\{s,f\}$.
\end{definition}

Let $X$ be the number of successes in a Bernoulli trial. Then
$$
p_X (i)=
\begin{cases}
1-p,\quad\text{if } i=0\\
p,\quad\text{if }i=1\\
0,\quad\text{o.w.}\\
\end{cases}
$$
where $p=\mathbb{P}(X=1)=\mathbb{P}(\{s\})$ is the probability of success.

\begin{definition}[Bernoulli R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a Bernoulli r.v. with parameter $p$ where $0<p<1$, denoted $X\sim$ Bernoulli$(p)$, if its p.m.f is given by
$$
p_X (i)=
\begin{cases}
1-p,\quad\text{if } i=0\\
p,\quad\text{if }i=1\\
0,\quad\text{o.w.}\\
\end{cases}
$$
Such a p.m.f is called a Bernoulli p.m.f with parameter $p$.
\end{definition}

\begin{theorem}[Expectation and Variance of Bernoulli R.V.]
Suppose $X\sim$ Bernoulli$(p)$, where $0<p<1$. Then $$\mathbb{E}[X]=p,\qquad \mathrm{Var}(X)=p(1-p).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

Consider an experiment in which $n$ independent Bernoulli trials with the same probability of success, say $p$, are performed. The sample space of the experiment is $\Omega=\{(\omega_1,\omega_2, \cdots,\omega_n ): \omega_i=s \text{ or } f,i=1,2, \cdots,n\} $ and $\mathbb{P}(\{(\omega_1,\omega_2, \cdots,\omega_n )\})=p^i (1-p)^{n-i}$, where $i=|\{1\ls j\ls n: \omega_j=s\}|.$

Let $X$ be the number of successes in the $n$ Bernoulli trials.
$$
p_X (i)=
\left\{\begin{aligned}
&\binom n i p^i(1-p)^{n-i}, \quad\text{if } i=0,1,2, \cdots,n\\
&\qquad0,\qquad \text{o.w.}    
\end{aligned}
\right.
$$

\begin{definition}[Binomial R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a binomial r.v. with parameter $n$ and $p$ where $n\gs 1$ and $0<p<1$, denoted $X\sim$ binomial$(n,p)$, if its p.m.f is given by
$$
p_X (i)=
\left\{\begin{aligned}
&\binom n i p^i(1-p)^{n-i}, \quad\text{if } i=0,1,2, \cdots,n\\
&\qquad0,\qquad \text{o.w.}    
\end{aligned}
\right.
$$
Such a p.m.f is called a binomial p.m.f with parameter $n$ and $p$.
\end{definition}

\begin{remark}[Bernoulli R.V. from Binomial R.V.]
(1) A Bernoulli r.v. with parameter $p$ is a binomial r.v. with parameter 1 and $p$.\\
(2)
$$
\sum_{i=0}^{n} p_X(i)=\sum_{i=0}^{n}\binom n i p^i(1-p)^{n-i}=[p+(1-p)]^n=1.
$$
Thus $p_X (\cdot)$ is a p.m.f.
\end{remark}

\begin{theorem}[Expectation and Variance of Binomial R.V.]
Suppose $X\sim$ binomial$(n,p)$, where $n\gs1$ and $0<p<1$. Then 
$$\mathbb{E}[X]=np,\qquad \mathrm{Var}(X)=np(1-p).$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Maximum Point of Binomial Probability]
Suppose $X\sim$ binomial$(n,p)$, where $n\gs1$ and $0<p<1$. Then
$$
\argmax_{0\ls i\ls n} p_X (i)=
\begin{cases}
(n+1)p-1 \text{  or  } (n+1)p, \text{if }  (n+1)p\in\mz\\
\qquad\lfloor(n+1)p\rfloor\qquad,  \text{if }  (n+1)p\notin\mz
\end{cases}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Poisson R.V.'s}

If $X\sim$ binomial$(n,p)$, then $p_X (i)=\binom n i p^i (1-p)^{n-i}$ is difficult to calculate if $n$ is large. A recursive relation: 
$$p_X (0)=(1-p)^n,\quad  p_X (i)=\frac{(n-i+1)p}{i(1-p)} \cdot p_X (i-1), \forall i\gs1.$$
An approximation for large $n$, small $p$, and moderate $np$, say $np=\lambda$ for some constant $\lambda$:
$$
\begin{aligned}
p_X (i)&=\binom n i p^i (1-p)^{n-i}=\frac{n(n-1)\cdots(n-i+1)}{i!}\left(\frac{\lambda}{n}\right)^i \left(1-\frac{\lambda}{n} \right)^{n-i}\\
&=\frac{n(n-1)\cdots(n-i+1)}{n^i}\cdot\frac{1}{\left(1-\frac{\lambda}{n}\right)^i }\cdot\frac{\lambda^i}{i!}\cdot\left(1-\frac{\lambda}{n}\right)^n  \xrightarrow{n\to\infty}e^{-\lambda} \frac{\lambda^i}{i!}.
\end{aligned}
$$

\begin{definition}[Poisson R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a Poisson r.v. with parameter $\lambda$ where $\lambda>0$, denoted $X\sim$ Poisson$(\lambda)$, if its p.m.f is given by
$$
p_X (i)=
\left\{\begin{aligned}
&e^{-\lambda}\cdot \frac{\lambda^i}{i!},  i=0,1,2,\dots\\
&0,\quad\text{o.w.}
\end{aligned}           
\right.
$$
Such a p.m.f is called a Poisson p.m.f with parameter $\lambda$.
\end{definition}

\begin{remark}[Poisson R.V. from Binomial R.V.]
(1) A Poisson p.m.f. with parameter $\lambda$ is an approximation of a binomial p.m.f. with parameters $n$ and $p$ such that $n$ is large and $p$ is small, and $np=\lambda$.\\
(2)
$$
\sumiz p_X (i)=\sumiz e^{-\lambda} \cdot\frac{\lambda^i}{i!}=e^{-\lambda}\cdot e^{\lambda}=1 
$$
thus $p_X (\cdot)$ is a p.m.f.
\end{remark}

\begin{theorem}[Expectation and Variance of Poisson R.V.]
Suppose $X\sim$ Poisson$(\lambda)$, where $\lambda>0$. Then $\mathbb{E}[X]=\lambda$ and $\mathrm{Var}(X)=\lambda.$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Maximum Point of Poisson Probability]
Suppose $X\sim$ Poisson$(\lambda)$, where $\lambda>0$. Then
$$
\argmax_{i\gs0}p_X(i)=
\begin{cases}
\lambda-1 \text{ or } \lambda, &\text{if } \lambda\in\mz\\
\lfloor\lambda\rfloor, &\text{if } \lambda\notin\mz
\end{cases}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Geometric R.V.'s, Negative Binomial R.V.'s and Hypergeometric R.V.'s}

Consider an experiment in which independent Bernoulli trials with the same probability of success, say $p$, are performed until the first success occurs. The sample space of the experiment is $\Omega=\{s,fs,ffs,\cdots\}$.

Let $X$ be the number of Bernoulli trials until the first success occurs,
$$
p_X (i)=
\begin{cases}
(1-p)^{i-1}\cdot p,   i=1,2,\cdots\\
0,\quad\text{o.w.}
\end{cases}
$$

\begin{definition}[Geometric R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a geometric r.v. with parameter $p$ where $0<p<1$, denoted $X\sim$ geometric$(p)$, if its p.m.f is given by
$$
p_X (i)=
\begin{cases}
(1-p)^{i-1}\cdot p,   i=1,2,\cdots\\
0,\quad\text{o.w.}
\end{cases}           
$$
Such a p.m.f is called a geometric p.m.f with parameter $p$.
\end{definition}

\begin{remark}[Justification of P.M.F.]
$$
\sumi p_X (i)=\sumi(1-p)^{i-1}\cdot p=p\cdot\frac{1}{1-(1-p)}=1 
$$
thus $p_X (\cdot)$ is a p.m.f. 
\end{remark}

\begin{theorem}[Expectation and Variance of Geometric R.V.]
Suppose $X\sim$ geometric$(p)$, where $0<p<1$. 
Then
$$
\mathbb{E}[X]=\frac{1}{p},\qquad
\mathrm{Var}(X)=\frac{1-p}{p^2}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Memoryless Property]
Suppose $X$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ with $X(\Omega)=\{1,2,\cdots\}$. Then $P[(X>m+n)|(X>m)]=\mathbb{P}(X>n), \forall m,n>0 \Leftrightarrow X$ is a geometric r.v.
\end{theorem}

\begin{proof}
  abc
\end{proof}

Consider an experiment in which independent Bernoulli trials with the same
probability of success, say $p$, are performed until the $r^{th}$ success occurs, where $r\gs1$.

Let $X$ be the number of Bernoulli trials until the  $r^{th}$ success occurs,
$$
p_X (i)=
\left\{\begin{aligned}
&\binom {i-1} {r-1} p^r(1-p)^{i-r},   i=r,r+1,\cdots\\
&0,\quad\text{o.w.}
\end{aligned}        
\right.   
$$

\begin{definition}[Negative Binomial R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a negative binomial r.v. with parameters $r$ and $p$ where $r\gs1$ and $0<p<1$, denoted $X\sim$ neg.-binomial$(r,p)$, if its p.m.f is given by
$$
p_X (i)=
\left\{\begin{aligned}
&\binom {i-1} {r-1} p^r(1-p)^{i-r},   i=r,r+1,\cdots\\
&0,\quad\text{o.w.}
\end{aligned}        
\right.   
$$
Such a p.m.f is called a negative binomial p.m.f with parameters $r$ and $p$.
\end{definition}

\begin{remark}[Geometric R.V. from Negative Binomial R.V.]
(1) A geometric r.v. with parameter $p$ is a negative binomial r.v. with parameters 1 and $p$.\\
(2)
$$
\begin{aligned}
\sum_{i=r}^{\infty}\left(i-1\right)\left(i-2\right)\cdots\left(i-r+1\right)x^{i-r}&=\frac{\dif^{r-1}}{\dif x^{r-1}}\left(\sum_{i=1}^{\infty}x^{i-1}\right)\\
&=\frac{\dif^{r-1}}{\dif x^{r-1}}\left(\frac{1}{1-x}\right)=\frac{\left(r-1\right)!}{(1-x)^r}\\
\end{aligned}
$$
$$
\begin{aligned}
&\to\sum_{i=r}^{\infty}{p_X(i)}=\sum_{i=r}^{\infty}{\binom{i-1}{r-1}p^r(1-p)^{i-r}}=\frac{p^r}{\left(r-1\right)!}\cdot\frac{\left(r-1\right)!}{(1-(1-p))^r}=1\\
   &\to p_X\left(\cdot\right) \text{ is a p.m.f.}
   \end{aligned}
$$
\end{remark}

\begin{theorem}[Expectation and Variance of Negative Binomial R.V.]
Suppose $X\sim$ neg.-binomial$(r,p)$, where $r\gs1$ and $0<p<1$. Then
$$
\mathbb{E}\left[X\right]=\frac{r}{p},\qquad
\mathrm{Var}(X)=\frac{r(1-p)}{p^2}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Maximum Point of Negative Binomial Probability]
Suppose $X\sim$ neg.-binomial$(r,p)$, where $r\gs1$ and $0<p<1$. Then
\begin{equation}
  \argmax_{i\gs r} p_X(i)=
  \begin{cases}
    1,                             &\text{if}~r=1\\
    \frac{r-1}{p}~\text{or}~\frac{r-1}{p}+1,  &\text{if}~\frac{r-1}{p}\in\mz^+\\
    \left\lfloor\frac{r-1}{p}\right\rfloor+1,               &\text{if}~\frac{r-1}{p}\notin\mz
  \end{cases}
\end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

A box contains $N_1$ red balls and $N_2$ blue balls. Suppose that $n$ balls are randomly drawn from the box, one by one and without replacement.

Let $X$ be the number of red balls drawn. Then
\begin{equation}
  p_X(i)=
  \left\{\begin{aligned}
  &\frac{\binom {N_1} i \binom {N_2} {n-i}}{\binom {N_1+N_2} n},\quad i=a,a+1, \cdots, b,\\
  &\quad0, \qquad\qquad\text{o.w.,}
  \end{aligned}\right.
\end{equation}
where $a=\max\{n-N_2,0\}$ and $b=\min\{n,N_1\}$.

\begin{definition}[Hypergeometric R.V.]
A discrete r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a hypergeometric r.v. with parameters $N_1, N_2$ and $n$ where $N_1,N_2\gs1$ and $n\gs1$, denoted $X\sim$ hypergeometric$(N_1,N_2,n)$, if its p.m.f is given by
$$
p_X(i)=
\left\{\begin{aligned}
&\frac{\binom {N_1} i \binom {N_2} {n-i}}{\binom {N_1+N_2} n},\quad i=a,a+1, \cdots, b,\\
&\quad0, \qquad\qquad\text{o.w.,}
\end{aligned}\right.
$$
where $a=\max\{n-N_2,0\}$ and $b=\min\{n,N_1\}$. Such a p.m.f is called a hypergeometric p.m.f with parameters $N_1, N_2$ and $n$.
\end{definition}

\begin{remark}[Justification of P.M.F.]
(1) If $n\ls\min\{N_1,N_2\}$, then $a=\max\{n-N_2,0\}=0$ and $b=\min\{n,N_1\}=n$.\\
(2) 
$$
\begin{aligned}
&(1+x)^{N_1+N_2}=(1+x)^{N_1}(1+x)^{N_2}\\
\to  &\text{the coefficient of } x^n \text{ is }
\binom{N_1+N_2}{n}=\sum_{i=a}^{b}\binom{N_1}{i}\binom{N_2}{n-i},\\
&\text{where }  a=\max\{n-N_2,0\},b=\min\{n,N_1\} \\
\to &\sum_{i=a}^{b}{p_X(i)}=\sum_{i=a}^{b}\frac{\binom{N_1}{i}\binom{N_2}{n-i}}{\binom{N_1+N_2}{n}}=1.\\
\to &p_X\left(\cdot\right) \text{ is a p.m.f.}
\end{aligned}
$$
\end{remark}

\begin{theorem}[Expectation and Variance of Hypergeometric R.V.]
Suppose 
\begin{equation}
  X\sim\mathrm{hypergeometric}(N_1,N_2,n),
\end{equation}
where $N_1,N_2\gs1$ and $1\ls n\ls\min\{N_1,N_2\}$. Then
$$
\mathbb{E}\left[X\right]=\frac{nN_1}{N_1+N_2},\qquad
\mathrm{Var}(X)=n\cdot\frac{N_1}{N_1+N_2}\cdot\frac{N_2}{N_1+N_2}\cdot\left(1-\frac{n-1}{N_1+N_2-1}\right).
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Binomial Approximation for Hypergeometric]
Suppose $n$ balls are drawn with replacement, i.e.,
\begin{equation}
  X\sim \mathrm{binomial}\left(n,\frac{N_1}{N_1+N_2}\right).
\end{equation}
Then
\begin{equation}
  \mathbb{E}\left[X\right]=n\cdot\frac{N_1}{N_1+N_2},\qquad  \mathrm{Var}(X)=n\cdot\frac{N_1}{N_1+N_2}\cdot\frac{N_2}{N_1+N_2}.
\end{equation}
Therefore, if $n\ll N_1+N_2$, then drawing with replacement is a good approximation of drawing without replacement.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Maximum Point of Hypergeometric Probability]
Suppose
\begin{equation}
  X\sim \mathrm{hypergeometric}(N_1,N_2,n),
\end{equation}
where $N_1,N_2\gs1$ and $1\ls n\ls\min\{N_1,N_2\}$. Then
$$
\begin{aligned}
  &\argmax_{0\ls i\ls n}{p_X(i)}\\
  &=\left\{
    \begin{aligned}
      &\frac{(n+1)(N_1+1)}{N_1+N_2+2}-1~\text{or}~\frac{(n+1)(N_1+1)}{N_1+N_2+2},      \text{if}~\frac{(n+1)(N_1+1)}{N_1+N_2+2}\in\mz   \\
      &\left\lfloor \frac{(n+1)(N_1+1)}{N_1+N_2+2}\right\rfloor,  \text{if}~\frac{(n+1)(N_1+1)}{N_1+N_2+2}\notin\mz.
  \end{aligned}\right.   
\end{aligned}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Binomial and Poisson Approximation for Hypergeometric]
$$
\begin{aligned}
p_X(i)&=\frac{\binom{N_1}{i}\binom{N_2}{n-i}}{\binom{N_1+N_2}{n}}\\
&=\frac{n!}{i!\left(n-i\right)!}\cdot\frac{N_1\left(N_1-1\right)\cdots\left(N_1-i+1\right)N_2\left(N_2-1\right)\cdots(N_2-n+i+1)}{\left(N_1+N_2\right)\left(N_1+N_2-1\right)\cdots(N_1+N_2-n+1)}
\end{aligned}
$$
(1) If $N_1\to\infty, N_2\to\infty, \frac{N_1}{N_1+N_2}\to p$, then
$$\begin{aligned}
p_X(i)&=\binom{n}{i}\cdot\frac{1}{1\cdot\left(1-\frac{1}{N_1+N_2}\right)\cdots\left(1-\frac{n-1}{N_1+N_2}\right)}\\
&\cdot\frac{N_1}{N_1+N_2}\left(\frac{N_1}{N_1+N_2}-\frac{1}{N_1+N_2}\right)\cdots\left(\frac{N_1}{N_1+N_2}-\frac{i-1}{N_1+N_2}\right)\left(\frac{N_2}{N_1+N_2}\right)\\
&\cdot\left(\frac{N_2}{N_1+N_2}-\frac{1}{N_1+N_2}\right)\cdots\left(\frac{N_2}{N_1+N_2}-\frac{n-i-1}{N_1+N_2}\right)\\
&\xrightarrow{N_1, N_2\to\infty}\binom{n}{i}p^i(1-p)^{n-i}\leftarrow\mathrm{binomial}(n,p)\\
\end{aligned}
$$
(2) If $n\to\infty, N_1\to\infty, N_2\to\infty, \frac{n}{N_1+N_2}\to0, \frac{N_1}{N_1+N_2}\to\frac{\lambda}{n}$, then
$$\begin{aligned}
p_X(i)&=\frac{1}{i!}\cdot\frac{1}{\frac{\left(N_1+N_2\right)!}{\left(N_1+N_2-n\right)!}}\cdot nN_1\cdot\left(n-1\right)\left(N_1-1\right)\cdots\left(n-i+1\right)\left(N_1-i+1\right)\\
&\cdot(N_1+N_2-N_1)(N_1+N_2-N_1-1)\cdots(N_1+N_2-N_1-n+i+1)
\\
&=\frac{1}{i!}\cdot\frac{\prod_{j=0}^{i-1}{\frac{nN_1-j\left(n+N_1\right)+j^2}{N_1+N_2}\cdot\prod_{j=0}^{n-i-1}\left(1-\frac{N_1+j}{N_1+N_2}\right)}}{\frac{1}{(N_1+N_2)^n}\cdot\frac{\sqrt{2\pi\left(N_1+N_2\right)}\left(\frac{N_1+N_2}{e}\right)^{N_1+N_2}e^{a_{N_1+N_2}}}{\sqrt{2\pi\left(N_1+N_2-n\right)}\left(\frac{N_1+N_2-n}{e}\right)^{N_1+N_2-n}e^{a_{N_1+N_2-n}}}}\\
\end{aligned}
$$
where $a_n=\ln{\frac{n!}{\sqrt{2\pi n}\left(\frac{n}{e}\right)^n}}\xrightarrow{n\to\infty}0. $
$$\begin{aligned}
p_X(i)&\xrightarrow{\dis n,N_1,N_2\to\infty,\frac{n}{N_1+N_2}\to0, \frac{N_1}{N_1+N_2}\to\frac{\lambda}{n}}\\
&\frac{1}{i!}\cdot\limn{\frac{\lambda^i\left(1-\frac{\lambda}{n}\right)^{n-i}}{\frac{1}{e^n\cdot\lim_{N_1,N_2\to\infty}{\left(1-\frac{n}{N_1+N_2}\right)^{N_1+N_2-n}}}}}\\
&=\lim_{n\to\infty}{\frac{\lambda^i}{i!}} \left(1-\frac{\lambda}{n}\right)^{n-i}=e^{-\lambda}\cdot\frac{\lambda^i}{i!} \leftarrow\mathrm{Poisson}(\lambda)\\
\end{aligned}
$$
\end{remark}

\chapter{Continuous Random Variables}
\section{Probability Density Function}

\begin{definition}[Probability Density Function]
Let $X$ be a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. $X$ is called an absolutely continuous (or a continuous) r.v. if there exists a nonnegative real-valued function $f_X: \mathbb{R}\to[0,\infty)$ s.t.
$$
\mathbb{P}(X\in B)=\int_{B}{f_X(x)\dif x}, \forall B\in \mb_\mr.
$$
The function $f_X$ is called the probability density function (p.d.f.) of $X$.
\end{definition}

\begin{remark}[Approximation of Probability]
$$
\mathbb{P}(a\ls X\ls a+\delta)=\int_{a}^{a+\delta}{f_X(x)\dif x}=f_X\left(a_\delta\right)\cdot\delta,
$$
for some $a_\delta\in[a, a+\delta]$.\\
If $f_X$ is continuous at $a$
$$
\to\lim_{\delta\to0}{\frac{\mathbb{P}(a\ls X\ls a+\delta)}{\delta}}=\lim_{\delta\to0}{f_X(a_\delta)}=f_X\left(a\right).
$$
So $\mathbb{P}(a\ls X\ls a+\delta)\approx f_X(a)\cdot\delta$, if $f_X$ is continuous at $a$ and $\delta$ is very small.
\end{remark}

\begin{theorem}[C.D.F and Probability from P.D.F.]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1)
$$
F_X(x)=\int_{-\infty}^{x}{f_X(t)\dif t}.
$$
   Therefore, $F_X(x)$ is a continuous function.\\
(2)
$$
\int_{-\infty}^{\infty}{f_X(x)\dif x}=1
$$
(3) If $f_X$ is continuous at $a$, then $F_X^\prime(a)=f_X(a)$. Therefore, if $f_X$ is a continuous function, then $F_X^\prime(x)=f_X(x), \forall x\in\mathbb{R}$.\\
(4) $\mathbb{P}(X=a)=0,\forall a\in\mathbb{R}$. Therefore,
$$\begin{aligned}
&\mathbb{P}(a\ls X\ls b)=\mathbb{P}(a\ls X<b)\\
=&\mathbb{P}(a<X\ls b)=\mathbb{P}(a<X<b)\\
=&\int_{a}^{b}{f_X(x)\dif x}.
\end{aligned}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Existence of P.D.F.]
Suppose $f: \mathbb{R}\to[0,\infty)$ is a nonnegative real-valued function s.t.
$$
\int_{-\infty}^{\infty}{f(x)\dif x}=1.
$$
Then there exists a continuous r.v. $X$ of some probability space $(\Omega,\ma,\mathbb{P})$ s.t. the p.d.f. is equal to $f$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Sufficient Conditions of P.D.F.]
A nonnegative real-valued function $f: \mathbb{R}\to[0,\infty)$ s.t.
$$
\int_{-\infty}^{\infty}{f(x)\dif x}=1
$$
is called a p.d.f. \\
The c.d.f. $F: \mathbb{R}\to[0,1]$ associated with $f$ is given by
$$
F(t)=\int_{-\infty}^{t}{f(x)\dif x},\forall t\in\mathbb{R}.
$$
\end{definition}

\begin{remark}[Neither Discrete Nor Continuous R.V.]
There are r.v.'s that are neither discrete nor continuous, e.g., $$F_X(x)=\alpha F_d(x)+(1-\alpha)F_c(x),$$
where $0<\alpha<1$.
\end{remark}

\section{The Probability Density Function of A Function of A R.V.}

\begin{theorem}[Method of Distribution Functions]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
If $Y=g(X)$, then
$$\begin{aligned}
f_Y(y)&=\frac{\dif}{\dif y}\left[F_Y(y)\right]=\frac{\dif}{\dif y}\left[\mathbb{P}(Y\ls y)\right]=\frac{\dif}{\dif y}\left[P[g(X)\ls y]\right]\\
&\to\frac{\dif}{\dif y}\left[X\sim g^{-1}(y)\right]\to\frac{\dif}{\dif y}\left[F_X\left(g^{-1}(y)\right)\right]\to\frac{\dif}{\dif y}\left[g^{-1}(y)\right]\cdot f_X\left[g^{-1}(y)\right].
\end{aligned}
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Method of Transformations]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ such that its p.d.f. is continuous. Suppose $Y=g(X)$, where $g$ is a measurable function from $(\mathbb{R},\mb_\mathbb{R})$ to $(\mathbb{R},\mb_\mathbb{R})$.\\
(1) If $g(X)$ is a discrete r.v., then
$$
p_Y(y)=\int_{x:g(x)=y}{f_X(x)\dif x}, \forall y\in g\left[X(\Omega)\right].
$$
(2) If $g(X)$ is a continuous r.v., $g^\prime(x)$ exists, and $g^\prime(x)\neq0, \forall x\in g^{-1}\left(\left\{y\right\}\right)=\{x:g(x)=y\}$, where $y\in g\left[X(\Omega)\right]$, then
$$
f_Y(y)=\sum_{x:g(x)=y}\frac{f_X(x)}{|g^\prime(x)|}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Expectations and Variances}

\begin{definition}[Expectation]
Let $X$ be a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ s.t. its p.d.f. is continuous. The expectation (or mean) of $X$ is given by
$$
\mathbb{E}[X]=\int_{-\infty}^{\infty}xf_X(x)\dif x
$$
if $xf_X(x)$ is absolutely integrable, i.e.,
$$
\int_{-\infty}^{\infty}{|xf_X(x)|\dif x}<+\infty,
$$
and is given by $\mathbb{E}[X]=\pm\infty$, if the integration diverges to $\pm\infty$.
\end{definition}

\begin{remark}[Necessary and Sufficient Condition of Absolutely Integrable]
$$\begin{aligned}
&\mathbb{E}[X]=\int_{-\infty}^{\infty}xf_X(x)\dif x=\int_{0}^{\infty}xf_X(x)\dif x-\int_{-\infty}^{0}(-x)f_X(x)\dif x\\
\to &\mathbb{E}[|X|]=\int_{0}^{\infty}xf_X(x)\dif x+\int_{-\infty}^{0}(-x)f_X(x)\dif x\\
\therefore &\mathbb{E}[|X|]<\infty\Leftrightarrow \int_{0}^{\infty}xf_X(x)\dif x<\infty  \text{ and } \int_{-\infty}^{0}(-x)f_X(x)\dif x<\infty.
\end{aligned}$$
\end{remark}

\begin{theorem}[Calculation of Expectation]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.
Then
$$\begin{aligned}
\mathbb{E}[X]&=\int_{0}^{\infty}\mathbb{P}(X>t)\dif t-\int_{0}^{\infty}\mathbb{P}(X\ls-t)\dif t\\
&=\int_{0}^{\infty}[1-F_X(t)]\dif t-\int_{0}^{\infty}[F_X(-t)]\dif t.
\end{aligned}$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Calculation of $r^{th}$ Moment]
Suppose $X$ is a nonnegative continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and $r>0.$ Then
$$
\mathbb{E}[ X^{r}] = \int_{0}^{\infty}{rt^{r - 1}}\mathbb{P}(X > t)\dif t = \int_{0}^{\infty}{rt^{r - 1}}\left[ 1 - F_{X}( t ) \right]\dif t.
$$
In particular,
$$
\mathbb{E}[ X] = \int_{0}^{\infty}{\mathbb{P}(X > t)}\dif t = \int_{0}^{\infty}\left[ 1 - F_{X}(t) \right]\dif t.
$$
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Approximation of Expectation]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. Then
$$
\sum_{n = 1}^{\infty}{\mathbb{P}(|X| \gs n)} \ls \mathbb{E}[|X|] \ls 1 + \sum_{n = 1}^{\infty}{\mathbb{P}( \left| X \right| \gs n )}.
$$
Therefore,
$$
\mathbb{E}[|X|] < \infty  \Leftrightarrow \sum_{n = 1}^{\infty}{\mathbb{P}(|X| \gs n)} < \infty.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Infinite Zero]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. Then,
$$
\mathbb{E}[|X|] < \infty \to \lim_{x \to \infty}{x \cdot \mathbb{P}(X > x)} = \lim_{x \to - \infty}{x \cdot \mathbb{P}(X \ls x)} = 0.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Measurable Function]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and suppose $g$ is a measurable function from $(\mathbb{R},\mb_\mathbb{R})$ to $(\mathbb{R},\mb_\mathbb{R})$.
Then
\[\mathbb{E}[ g(X)] = \int_{- \infty}^{\infty}{g(x)} \cdot f_{X}(x)\dif x.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Expectation of Linear Combination of Measurable Functions]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, $g_1,g_2,\cdots,g_n$ are measurable functions from $(\mathbb{R},\mb_\mathbb{R})$ to $(\mathbb{R},\mb_\mathbb{R})$, and $\alpha_1,\alpha_2,\cdots,\alpha_n\in\mathbb{R}$. Then
\[\mathbb{E}\left[ \sum_{i = 1}^{n}\alpha_{i}g_{i}(X) \right] = \sum_{i = 1}^{n}\alpha_{i}\mathbb{E}[ g_{i}(X)].\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Variance and Standard Deviation]
Let $X$ be a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $\mathbb{E}[ X]$ exists. The \textbf{variance} of $X$ is given by $\mathrm{Var}(X) = \mathbb{E}[(X - \mathbb{E}[ X])^{2}]$. And the \textbf{standard deviation} of $X$ is given by
$\sigma_{X} = \sqrt{\mathrm{Var}(X)} = \sqrt{\mathbb{E}[(X - \mathbb{E}[ X])^{2}]}$.
\end{definition}

\begin{theorem}[Minimum Distance with Expectation]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and suppose $\mathbb{E}[X]$ exists. If $\mathbb{E}\left[X^2\right]<+\infty$, then $\mathrm{Var}(X)=\min_{a\in\mr}\mathbb{E}[(X-a)^2]$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Calculation of Linear Combination]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and suppose $\mathbb{E}[X]$ exists. Then\\
(1)
$$
\mathrm{Var}(X)=\mathbb{E}\left[X^2\right]-(\mathbb{E}[X])^2
$$
(2)
$$
\mathrm{Var}(aX+b)=a^2\mathrm{Var}(X),\quad
\sigma_{aX+b}=\left|a\right|\sigma_X, \forall a,b\in\mathbb{R}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Moment and Absolute Moment]
Let $X$ be a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and $r,c\in\mr$.
$$\left\{
\begin{array}{l}
\text{The } r^{th} \text{ moment of } X \text{ is given by } \mathbb{E}[X^r]\\
\text{The } r^{th} \text{ central moment of } X \text{ is given by } \mathbb{E}[(X-\mathbb{E}[X])^r]\\
\text{The } r^{th} \text{ moment of } X \text{ about } c \text{ is given by } \mathbb{E}[(X-c)^r]\\
\text{The } r^{th} \text{ absolute moment of } X \text{ is given by } \mathbb{E}[|X|^r]\\
\text{The } r^{th} \text{ absolute central moment of } X \text{ is given by } \mathbb{E}[|X-\mathbb{E}[X] |^r]\\
\text{The } r^{th} \text{ absolute moment of } X \text{ about } c \text{ is given by } \mathbb{E}[|X-c|^r]
\end{array}
\right.$$
if the respective integral converges absolutely.
\end{definition}

\begin{theorem}[Existence of Lower Order Moment]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose $0<r<s$. If $\mathbb{E}[|X|^s]$ exists, then $\mathbb{E}[|X|^r]$ exists. That is, the existence of a higher order moment of $X$ guarantees the existence of a lower order moment of $X$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Positive Variance]
Suppose $X$ is a continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. Then
$$
\mathbb{E}\left[\left(X-a\right)^2\right]>0, \forall a\in\mathbb{R}.
$$
Therefore
$$
\mathbb{E}[X] \text{ exists }\to   \mathrm{Var}(X)>0.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{Special Continuous Distributions}
\section{Uniform R.V.'s}

\begin{definition}[Uniform R.V.]
A continuous r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a uniform r.v. over $(\alpha,\beta)$, where $\alpha,\beta\in\mathbb{R}$ and $\alpha<\beta$, denoted $X\sim U(\alpha,\beta)$, if its p.d.f. is given by
$$
f_{X}(x) = \left\{ \begin{aligned}
&\frac{1}{\beta - \alpha},\quad~\text{if}~ \alpha < x < \beta \\
&\quad0,\qquad~\text{o.w.}~\\
\end{aligned} \right.
$$
\end{definition}

\begin{remark}[P.D.F. and C.D.F.]
(1) $f_X(x)\gs0, \forall x\in\mathbb{R}$, and
$$
\int_{-\infty}^{\infty}{f_X(x)}\dif x=\int_{\alpha}^{\beta}\frac{1}{\beta-\alpha}\dif x=1
$$
$\to f_X(x)$ is a p.d.f.\\
(2)
$$
F_{X}(x) = \left\{ \begin{array}{cl}
0,&~\text{if}~ x \ls \alpha \\
\frac{x - \alpha}{\beta - \alpha},&~\text{if}~ \alpha < x < \beta \\
1,&~\text{if}~ x \gs \beta \\
\end{array} \right.
$$
\end{remark}

\begin{theorem}[Expectation and Variance of Uniform R.V.]
Suppose $X\sim U(\alpha,\beta)$, where $\alpha,\beta\in\mathbb{R}$ and $\alpha<\beta$. Then
$$
\mathbb{E}[ X^{n}] = \frac{\sum_{i = 0}^{n}{\alpha^{n - i}\beta^{i}}}{n + 1}.
$$
Therefore
$$
\mathbb{E}[ X] = \frac{\alpha + \beta}{2},\qquad
\mathrm{Var}(X)=\frac{(\beta-\alpha)^2}{12}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Expectation and Variance of Discrete ``Uniform R.V.'']
Suppose $X\sim \mathrm{Uniform}(1,2, \cdots,n)$, where $n\gs1$. Then
$$
\mathbb{E}[ X] = \frac{n + 1}{2},\quad \mathbb{E}\left[ X^{2}\right] = \frac{(n + 1)(2n + 1)}{6}
$$
and
$$
\mathrm{Var}(X)=\frac{n^2-1}{12}.
$$
\end{remark}

\begin{theorem}[Linear Generated R.V.]
Suppose $X\sim U(\alpha,\beta)$, where $\alpha,\beta\in\mathbb{R}$ and $\alpha<\beta$. Suppose $Y=aX+b$, where $a,b\in\mathbb{R}$ and $a\neq0$. Then
\[Y\sim\left\{ \begin{aligned}
&U\left( a\alpha + b,a\beta + b \right),\quad~\text{if}~ a > 0 \\
&U\left( a\beta + b,a\alpha + b \right),\quad~\text{if}~ a < 0 \\
\end{aligned} \right.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Normal (Gaussian) R.V.'s}

\begin{definition}[Normal (Gaussian) R.V.]
A continuous r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a normal (Gaussian) r.v. with parameters $\mu$ and $\sigma^2$, where $\mu,\sigma\in\mathbb{R}$, $\sigma\neq0$, denoted $X\sim \mathcal{N}(\mu,\sigma^2)$, if its p.d.f. is given by
$$
f_X(x)=\frac{1}{\sqrt{2\pi\sigma^2}}\cdot\exp\left[{-\frac{(x-\mu)^2}{2\sigma^2}}\right], -\infty<x<\infty.
$$
\end{definition}

\begin{remark}[P.D.F. and C.D.F.]
(1) $f_X(x)\gs0, \forall x\in\mathbb{R}$, and let $I=\int_{-\infty}^{\infty}e^{-ax^2}\dif x$.
$$\begin{aligned}
I^2&=\int_{-\infty}^{\infty}\int_{-\infty}^{\infty}{e^{-a\left(x^2+y^2\right)}\dif x\dif y}\\
&\xrightarrow{x=r\cos\theta,y=r\sin\theta}\int_{0}^{\infty}\int_{0}^{2\pi}{e^{-ar^2}r\dif r\dif\theta}=\frac{\pi}{a}\\
&\to I=\sqrt{\frac{\pi}{a}}\to\int_{-\infty}^{\infty}{\sqrt{\frac{a}{\pi}}}\cdot e^{-ax^2}\dif x=1
\end{aligned}
$$
$$
\therefore\int_{-\infty}^{\infty}{f_X(x)}\dif x=\int_{-\infty}^{\infty}{\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\tfrac{(x-\mu)^2}{2\sigma^2}}}\dif x=\int_{-\infty}^{\infty}{\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\tfrac{x^2}{2\sigma^2}}}\dif x=1
$$
$\to f_X(x)$ is a p.d.f.\\
(2) If $\mu=0, \sigma^2=1$, then $X$ is called a standard normal (Gaussian) r.v.\\
(3)
$$\begin{aligned}
F_X(x)&=\int_{-\infty}^{x}{\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\tfrac{(y-\mu)^2}{2\sigma^2}}}\dif y\\
&\xrightarrow{y=\sigma t+\mu}\int_{-\infty}^{\tfrac{x-\mu}{\sigma}}{\frac{1}{\sqrt{2\pi}}e^{-\tfrac{{t}^2}{2}}}\dif t\\
&={\Phi}\left(\frac{x-\mu}{\sigma}\right)\\
\end{aligned}
$$
where
$$
{\Phi}(x)=\int_{-\infty}^{x}{\frac{1}{\sqrt{2\pi}}e^{-\tfrac{{t}^2}{2}}}\dif t.
$$
\end{remark}

\begin{theorem}[Symmetric about ${\mu}$]
Suppose $X\sim \mathcal{N}(\mu,\sigma^2)$.\\
(1) $f_X(x)$ is symmetric about $x=\mu$, with maximum at $x=\mu$, and inflection points at $x=\mu\pm\sigma$.\\
(2) $\Phi\left(-y\right)=1-{\Phi}(y), \forall y\in\mathbb{R}$ and $\Phi\left(0\right)=\frac{1}{2}$. Therefore, $$F_X\left(\mu-y\right)=1-F_X\left(\mu+y\right)$$ and $$F_X\left(\mu\right)=\frac{1}{2}.$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Linear Generated R.V.]
Suppose $X\sim \mathcal{N}(\mu,\sigma^2)$, where $\mu,\sigma\in\mathbb{R}, \sigma\neq0$. Suppose $Y=aX+b$, where $a,b\in\mathbb{R}$ and $a\neq0$. Then,
$$
Y\sim \mathcal{N}\left(a\mu+b,a^2\sigma^2\right).
$$
In particular, if
$$
Y=\frac{X-\mu}{\sigma},
$$
then
$$
Y\sim \mathcal{N}(0,1).
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Gamma Function]
The function $\Gamma:(0,\infty)\to\mathbb{R}$ given by
$$
{\Gamma}(\alpha)=\int_{0}^{\infty}e^{-t}t^{\alpha-1}\dif t, \forall\alpha>0
$$
is called the gamma function.
\end{definition}

\begin{theorem}[Properties of Gamma Function]
(1)
$$
{\Gamma}(\alpha+1)=\alpha\Gamma(\alpha), \forall\alpha>0.
$$
(2)
$$
{\Gamma}(n+1)=n!, \forall n\gs0.
$$
(3)
$$
{\Gamma}\left(n+\frac{1}{2}\right)=\frac{\left(2n\right)!}{2^{2n}n!}\sqrt\pi, \forall n\gs0.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Calculation of Moment and Absolute Moment]
Suppose $X\sim \mathcal{N}(\mu,\sigma^2)$, where $\mu,\sigma\in\mathbb{R},  \sigma\neq0$.\\
(1)
\[\mathbb{E}\left[ \left| X - \mu \right|^{n} \right] = \frac{\left( 2\sigma^{2} \right)^{\tfrac{n}{2}}}{\sqrt{\pi}}\Gamma\left( \frac{n + 1}{2} \right) = \left\{ \begin{array}{cll}
\frac{2^{k + 1} \cdot k!}{\sqrt{2\pi}}\sigma^{2k + 1},&~\text{if}~~n = 2k + 1,&k \gs 0 \\
\frac{(2k)!}{2^{k} \cdot k!}\sigma^{2k},&\text{if}~n = 2k,&k \gs 0 \\
\end{array} \right.\]
(2)
\[\mathbb{E}\left[ {(X - \mu)}^{n} \right] = \left\{ \begin{array}{cll}
0,&~\text{if}~~n = 2k + 1,&k \gs 0 \\
\frac{(2k)!}{2^{k} \cdot k!}\sigma^{2k},&{\text{if}}~ n = 2k,&k \gs 0 \\
\end{array} \right.\]
(3)
\[\mathbb{E}[X^{n}] = \sum_{k = 0}^{n}\binom n k \mathbb{E}\left[ \left( X - \mu \right)^{k} \right]\cdot \mu^{n - k}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[De Moivre-Laplace Theorem]
Suppose $X\sim \mathrm{binomial}(n,p)$, where $n\gs1$ and $0<p<1$. Then
\[\lim_{n \to \infty}{\mathbb{P}\left( a < \frac{X - np}{\sqrt{np( 1 - p )}} < b \right) = \int_{a}^{b}\frac{1}{\sqrt{2\pi}}}e^{- \tfrac{x^{2}}{2}}\dif x, \forall a,b \in \mathbb{R}, a < b.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Approximation of $\Phi(x)$]
\[\frac{1}{\sqrt{2\pi}x}\left( 1 - \frac{1}{x^{2}} \right)e^{- \tfrac{{x}^{2}}{2}} < 1 - \Phi(x) < \frac{1}{\sqrt{2\pi}x} \cdot e^{- \tfrac{{x}^{2}}{2}}, \forall x > 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Exponential Function]
Suppose $X\sim \mathcal{N}(\mu,\sigma^2)$, where $\mu,\sigma\in\mathbb{R}, \sigma\neq0$, and $\alpha\in\mathbb{R}$.
Then 
$$
\mathbb{E}\left[e^{\alpha X}\right]=e^{\alpha\mu+\tfrac{1}{2}\alpha^2\sigma^2}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Gamma R.V.'s, Erlang R.V.'s and Exponential R.V.'s}

\begin{definition}[Gamma R.V., Erlang R.V. and Exponential R.V.]
A continuous r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a gamma r.v. with parameters $\alpha$ and $\lambda$, where $\alpha,\lambda>0$, denoted $X\sim \mathcal{G}(\alpha,\lambda)$, if its p.d.f. is given by
\[f_{X}(x) = \left\{ \begin{aligned}
&\frac{\lambda e^{- \lambda x}{(\lambda x)}^{\alpha - 1}}{\Gamma( \alpha )}, ~\text{if}~ x > 0 \\
&\quad0,\quad~\text{o.w.}~ \\
\end{aligned} \right. \]
If $\alpha=n, n\gs1$, then $X$ is called an Erlang r.v. with parameters $n$ and $\lambda$, denoted $X\sim\me(n,\lambda)$.\\
If $\alpha=1$, then $X$ is called an exponential r.v. with parameter $\lambda$, denoted $X\sim\me(\lambda)$.
\end{definition}

\begin{remark}[Properties of P.D.F.]
(1)
$$
\int_{0}^{\infty}\frac{\lambda e^{-\lambda x}{(\lambda x)}^{\alpha-1}}{{\Gamma}(\alpha)}\dif x\xrightarrow{t=\lambda x}\int_{0}^{\infty}\frac{e^{-t}t^{\alpha-1}}{{\Gamma}(\alpha)}\dif t=\frac{{\Gamma}(\alpha)}{{\Gamma}(\alpha)}=1
$$
$\to f_X(x)$ is a p.d.f.\\
(2) 
$$\begin{aligned}
f_X^\prime(x)&=\frac{\lambda^\alpha}{{\Gamma}(\alpha)}\cdot e^{-\lambda x}\left(-\lambda x^{\alpha-1}+\left(\alpha-1\right)x^{\alpha-2}\right)\\
&=\frac{\lambda^\alpha}{{\Gamma}(\alpha)}\cdot e^{-\lambda x}\cdot x^{\alpha-2}\left[-\lambda x+\left(\alpha-1\right)\right]
\end{aligned}$$
$$\begin{aligned}
&f_X^{\prime\prime}(x)\\
&=\frac{\lambda^\alpha}{{\Gamma}(\alpha)}\cdot e^{-\lambda x}\left[\lambda^2x^{\alpha-1}-\lambda\left(\alpha-1\right)x^{\alpha-2}-\lambda\left(\alpha-1\right)x^{\alpha-2}+(\alpha-2)\left(\alpha-1\right)x^{\alpha-3}\right]\\
&=\frac{\lambda^\alpha}{{\Gamma}(\alpha)}\cdot e^{-\lambda x}\cdot x^{\alpha-3}\left[\left(\lambda x-\left(\alpha-1\right)\right)^2-\left(\alpha-1\right)\right]
\end{aligned}$$
$$
\therefore0<\alpha\ls1 \to f_X^\prime(x)<0, f_X^{\prime\prime}(x)>0, \forall x>0.
$$
\[\alpha > 1 \to f_{X}^{'}(x)\left\{ \begin{aligned}
 &> 0  \Leftrightarrow x < \frac{\alpha - 1}{\lambda} \\
 &= 0 \Leftrightarrow x = \frac{\alpha - 1}{\lambda} \\
 &< 0 \Leftrightarrow x > \frac{\alpha - 1}{\lambda} \\
\end{aligned} \right. \]
and
\[f_{X}^{''}(x)\left\{ \begin{aligned}
& > 0  \Leftrightarrow x > \frac{\alpha - 1}{\lambda} + \frac{\sqrt{\alpha - 1}}{\lambda} \text{ or } x < \frac{\alpha - 1}{\lambda} - \frac{\sqrt{\alpha - 1}}{\lambda} \\
 &= 0  \Leftrightarrow x = \frac{\alpha - 1}{\lambda} \pm \frac{\sqrt{\alpha - 1}}{\lambda} \\
& < 0  \Leftrightarrow\frac{\alpha - 1}{\lambda} - \frac{\sqrt{\alpha - 1}}{\lambda} < x < \frac{\alpha - 1}{\lambda} + \frac{\sqrt{\alpha - 1}}{\lambda}\\
\end{aligned} \right. \]
\end{remark}

\begin{theorem}[Calculation of C.D.F.]
Suppose $X\sim\mathcal{G}(\alpha,\lambda)$, where $\alpha,\lambda>0$. Then
$$
F_X(x)=1-\frac{{\Gamma}(\alpha,\lambda x)}{{\Gamma}(\alpha)},
$$
where
$$
{\Gamma}(\alpha,x)=\int_{x}^{\infty}{e^{-t}t^{\alpha-1}\dif t}
$$
is the incomplete gamma function.\\
If $\alpha=n\gs1$, then
$$
F_X(x)=1-\sum_{i=0}^{n-1}\frac{e^{-\lambda x}\left(\lambda x\right)^i}{i!}=\mathbb{P}(N\gs n)
$$
where $N\sim\mathrm{Poisson}(\lambda x)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation and Variance of Gamma R.V.]
Suppose $X\sim\mathcal{G}(\alpha,\lambda)$, where $\alpha,\lambda>0$. Then
$$
\mathbb{E}\left[X^n\right]=\frac{{\Gamma}(\alpha+n)}{{\Gamma}(\alpha)\lambda^n}=\frac{n!\binom{n+\alpha-1}{n}}{\lambda^n}=\frac{{(\alpha)}_n}{\lambda^n}
$$
where
$$
{(\alpha)}_n=n!\binom{n+\alpha-1}{n}=\left(n+\alpha-1\right)\cdots\left(\alpha+1\right)\cdot\alpha.
$$
Therefore,
\[\mathbb{E}[ X] = \frac{\alpha}{\lambda}\quad\text{and}\quad\mathrm{Var}(X) = \frac{\alpha}{\lambda^{2}}.
\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Linear Generated Gamma R.V.]
Suppose $X\sim\mathcal{G}(\alpha,\lambda)$, where $\alpha,\lambda>0$, and $Y=aX$, where $a>0$. Then
\[Y\sim\mathcal{G}\left( \alpha,\frac{\lambda}{a} \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Gamma R.V. from Normal R.V.]
Suppose $X\sim \mathcal{N}(\mu,\sigma^2)$, where $\mu,\sigma\in\mathbb{R}$, $\sigma\neq0$ and $Y={(X-\mu)}^2$. Then
\[Y\sim\mathcal{G}\left( \frac{1}{2},\frac{1}{2\sigma^{2}} \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{lemma}[Plus to Multiply Property of Exponential Function]
Suppose $f:[0,+\infty)\to\mathbb{R}$ is right continuous on $[0,+\infty)$ and $f(x+y)=f(x)\cdot f(y), \forall x,y\gs0$. Then either $f(x)=0, \forall x\gs0$ or $\exists\lambda\in\mathbb{R}$ s.t. $f(x)=e^{-\lambda x}, \forall x\gs0$.
\end{lemma}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Memoryless Property]
Suppose $X$ is a nonnegative continuous r.v. of a probability space $(\Omega,\ma,\mathbb{P})$. Then $\mathbb{P}(X>s+t\mid X>s)=\mathbb{P}(X>t), \forall s,t>0 \Leftrightarrow X\sim\me(\lambda)$, for some $\lambda>0$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Analog of Geometric R.V.]
Exponential r.v.'s are the continuous analog of geometric r.v.'s.
\end{remark}

\begin{theorem}[Geometric R.V. from Exponential R.V.]
Suppose $X\sim\me(\lambda)$ where $\lambda>0$ and $Y=\left\lceil X\right\rceil$. Then $Y\sim\mathrm{geometric}\left(1-e^{-\lambda}\right)$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Independent Set]
A set of r.v.'s $\{X_i:i\in I\}$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called independent, if for any finite subset $\left\{X_{i_1},X_{i_2}, \cdots,X_{i_k}\right\}, k\gs2$ of $\{X_i:i\in I\}$ the events
$$
X_{i_1}\in B_1, X_{i_2}\in B_2, \cdots,X_{i_k}\in B_k
$$
are independent for all $B_1,B_2, \cdots,B_k\in\mb_\mathbb{R}$.\\
Otherwise, $\{X_i:i\in I\}$ is called dependent.
\end{definition}

\begin{definition}[Continuous R.Vect.]
A r.vect. $\mX=(X_1, X_2, \cdots,X_n)$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called an absolutely continuous r.vect. (or continuous r.vect.) if there exists a nonnegative real-valued function $f_{\mX}:\mathbb{R}^n\to[0,\infty)$ s.t.
$$
\mathbb{P}(X_1\in B_1,X_2\in B_2, \cdots,X_n\in B_n)=\int_{B_1}\int_{B_2}{\cdots\int_{B_n}{f_{\mX}(\mx)\dif x_n}}\cdots \dif x_2\dif x_1
$$
for all $B_1,B_2, \cdots,B_n\in\mb_\mathbb{R}$.\\
Then the function $f_{\mX}$ is called the p.d.f. of the r.vect. $\mX$, or the joint p.d.f. of the r.v.'s $X_1, X_2, \cdots,X_n$.
\end{definition}

\begin{theorem}[P.D.F. and C.D.F. of Continuous R.Vect.]
Suppose $\mX=(X_1, X_2, \cdots,X_n)$ is a continuous r.vect. and 
$$
F_{\mX}(\mx)=\mathbb{P}(X_1\ls x_1,X_2\ls x_2, \cdots,X_n\ls x_n).
$$
Then
$$
f_{\mX}(\mx)=\frac{\partial^n F_{\mX}(\mx)}{\partial x_1\cdots\partial x_n}.
$$
Furthermore, if $X_1, X_2, \cdots,X_n$ are independent, then
$$
f_{\mX}(\mx)=f_{X_1}(x_1)f_{X_2}(x_2)\cdots f_{X_n}(x_n).
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Convolution Theorem]
Suppose $\mX=(X_1,X_2)$ is a continuous r.vect. and $Y=X_1+X_2$. Then
$$
f_Y(y)=\int_{-\infty}^{\infty}{f_{X_1,X_2}\left(x,y-x\right)}\dif x.
$$
Furthermore, if $X_1\bot X_2$, then
$$
f_Y(y)=\int_{-\infty}^{\infty}{f_{X_1}(x)}f_{X_2}\left(y-x\right)\dif x.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Beta Function]
The function $B: \mathbb{R}^+\times\mathbb{R}^+\to\mathbb{R}$ given by
$$
B(\alpha,\beta)=\int_{0}^{1}x^{\alpha-1}{(1-x)}^{\beta-1}\dif x, \forall\alpha,\beta>0
$$
is called the beta function.
\end{definition}

\begin{lemma}[Calculation of Beta Function]
\[B(\alpha,\beta) = \frac{\Gamma( \alpha ) \cdot \Gamma( \beta )}{\Gamma( \alpha + \beta)}, \forall\alpha,\beta > 0.\]
\end{lemma}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independent Additivity of Gamma R.V.]
Suppose $X_i\sim\mathcal{G}(\alpha_i,\lambda)$ where $\alpha_i,\lambda>0$, $i=1,2, \cdots,n,$ ${X}_1, X_2, \cdots,X_n$ are independent, and $Y=X_1+X_2+\cdots+X_n$. Then
\[Y\sim\mathcal{G}\left( \sum_{i = 1}^{n}\alpha_{i},\lambda \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independent Minimum of Exponential R.V.]
Suppose $X_i\sim\me(\lambda_i)$ where $\lambda_i>0, i=1,2, \cdots,n$, and ${X}_1,X_2, \cdots,X_n$ are independent.\\
(1) If $Y=\min\{X_1,X_2, \cdots,X_n\}$, then
\[Y\sim\me\left( \sum_{i = 1}^{n}\lambda_{i} \right).\]
(2)
$$
\mathbb{P}(X_1<X_2)=\frac{\lambda_1}{\lambda_1+\lambda_2}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Stochastic Process]
A stochastic process (S.P.) $\{X(t):t\in I\}$ is a collection of r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. If $I=\{0,1,2,\dots\}$ or $\{0,\pm1,\pm2,\cdots\}$, then we call $\{X(t):t\in I\}$ a discrete-time S.P. If $I=[0,\infty)$ or $(-\infty,\infty)$, then we call $\{X(t):t\in I\}$ a continuous-time S.P.
\end{definition}

\begin{definition}[Counting Process and Poisson Process]
Let $\{T_1,T_2,\cdots\}$ be a discrete-time S.P. s.t. $T_i, i=1,2,\dots$, is the time of occurrence of the $i^{th}$ event, and $0<T_1<T_2<\cdots$.\\

Let $X_i=T_i-T_{i-1}, i=1,2,\dots$, where $T_0=0$ be the inter-occurrence time between the ${(i-1)}^{th}$ and the $i^{th}$ events, and $N(t)=|\{i:0<T_i\ls t\}|$ be the number of events occurring in $(0,t]$, so that $\{N(t):0<t<\infty\}$ is called the counting process of the S.P. $\{T_1,T_2,\cdots\}$.\\

Then we call $\{T_1,T_2,\cdots\}$ a Poisson process with rate $\lambda$, if $X_1,X_2,\cdots$ are independent and identically distributed (i.i.d.) and $N(t)\sim\mathrm{Poisson}(\lambda t)$.
\end{definition}

\begin{theorem}[Necessary and Sufficient Condition of Poisson Process]
Suppose $\{T_1,T_2,\cdots\}$ is a S.P. s.t. $0<T_1<T_2<\cdots$ and its inter-occurrence times $X_i=T_i-T_{i-1}, i=1,2,\dots$ are i.i.d., where $T_0=0$. Then $\{T_1,T_2,\cdots\}$ is a Poisson process with rate $\lambda \Leftrightarrow X_i\sim\me(\lambda), i=1,2,\dots.$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Negative Binomial $\leftrightarrow$ Geometric vs Gamma$ \leftrightarrow$ Exponential]
(1) A negative binomial r.v. $T_r=X_1+ X_2+\cdots+X_r\sim\text{neg.-binomial}(r,p)$ is the number of i.i.d. Bernoulli trials with the same probability of success $p$ until the $r^{th}$ success occurs, where $X_i\sim\mathrm{geometric}(p)$ is the number of Bernoulli trials between the ${(i-1)}^{th}$ and the $i^{th}$ successes, and $X_1,X_2,\cdots$ are independent.\\

(2) A gamma r.v. $T_n=X_1+X_2+\cdots+X_n\sim\mathcal{G}(n,\lambda)$ is the time of occurrence of the $n^{th}$ event of a Poisson process with rate $\lambda$, where $X_i\sim\me(\lambda)$ is the inter-occurrence time between the ${(i-1)}^{th}$ and the $i^{th}$ events, and $X_1,X_2,\cdots$ are independent.
\end{remark}

\begin{theorem}[Merging and Splitting of Poisson Process]
(1) Suppose that $k$ independent Poisson processes with rates $\lambda_1,\lambda_2, \cdots,\lambda_k$ are merged into a S.P. $\{T_1,T_2,\cdots\}$. Then $\{T_1,T_2,\cdots\}$ is a Poisson process with rate $\lambda=\lambda_1+\lambda_2+\cdots+\lambda_k$.\\

(2) Suppose that in a Poisson process with rate $\lambda$, an event is a type-$i$ event with probability $P_i, i=1,2, \cdots,k.$ Then the S.P. $\{T_1,T_2,\cdots\}$ of the times of the occurrences of the type-$i$ events is a
 Poisson process with rate $\lambda\cdot P_i, i=1,2, \cdots,k.$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Beta R.V.'s}

\begin{definition}[Beta R.V.]
A continuous r.v. $X$ of a probability space $(\Omega,\ma,\mathbb{P})$ is called a beta r.v. with parameters $\alpha$ and $\beta$, where $\alpha,\beta>0$, denoted $X\sim\mb(\alpha,\beta)$, if its p.d.f. is given by
\[f_{X}(x) = \left\{ \begin{aligned}
\frac{1}{B(\alpha,\beta)}&x^{\alpha - 1}{(1 - x)}^{\beta - 1}, ~\text{if}~  0 < x < 1 \\
&0, \qquad~\text{o.w.}~ \\
\end{aligned} \right.\]
where
\[B(\alpha,\beta) = \int_{0}^{1}x^{\alpha - 1}{(1 - x)}^{\beta - 1}\dif x = \frac{\Gamma( \alpha ) \cdot \Gamma( \beta )}{\Gamma( \alpha + \beta )}.\]
\end{definition}

\begin{remark}[P.D.F. and C.D.F.]
(1) $\int_{-\infty}^{\infty}{f_X(x)\dif x}=1 \to f_X(x)$ is a p.d.f.

(2) Beta r.v.'s are good approximations of r.v.'s that vary between two limits.

(3) If $X_1,X_2, \cdots,X_n$ are i.i.d. $\sim U(0,1)$ and ${X}_{(i)}$ is the $i^{th}$ smallest r.v. of $X_1,X_2, \cdots,X_n$ so that $X_{(1)}\ls{X}_{(2)}\ls\cdots\ls X_{(n)}$, then
$$
{X}_{(i)}\sim\mb\left(i,n+1-i\right).
$$

(4)
$$\begin{aligned}
f_{X}^{'}(x) &= \frac{\left( \alpha - 1 \right)x^{\alpha - 2}\left( 1 - x \right)^{\beta - 1} - (\beta - 1)x^{\alpha - 1}{(1 - x)}^{\beta - 2}}{B(\alpha,\beta)}\\
&= \frac{x^{\alpha - 2}{(1 - x)}^{\beta - 2}}{B(\alpha,\beta)}\left[ \left( \alpha - 1 \right) - \left( \alpha + \beta - 2 \right)x \right]\\
&\to f_{X}^{'}(x)\left\{ \begin{aligned}
 &> 0, \Leftrightarrow \left( \alpha + \beta - 2 \right)x < \alpha - 1 \\
 &= 0, \Leftrightarrow \left( \alpha + \beta - 2 \right)x = \alpha - 1 \\
 &< 0, \Leftrightarrow \left( \alpha + \beta - 2 \right)x > \alpha - 1 \\
\end{aligned} \right.
\end{aligned}$$
$$\begin{aligned}
&f_{X}^{''}(x)\\
&= \frac{1}{B(\alpha,\beta)}\Bigl[ \left( \alpha - 1 \right)\left( \alpha - 2 \right)x^{\alpha - 3}\left( 1 - x \right)^{\beta - 1} - 2(\alpha - 1)(\beta - 1)x^{\alpha - 2}{(1 - x)}^{\beta - 2}\\
&\qquad + (\beta - 1)(\beta - 2)x^{\alpha - 1}{(1 - x)}^{\beta - 3} \Bigr]\\
&= \frac{x^{\alpha - 3}{(1 - x)}^{\beta - 3}}{B(\alpha,\beta)}\cdot h(x,\alpha,\beta)\\
&= \left\{ \begin{aligned}
&\frac{x^{\alpha - 3}\left( 1 - x \right)^{\beta - 3}}{B(\alpha,\beta)}\left( \alpha + \beta - 2 \right)\left( \alpha + \beta - 3 \right)\cdot f(x,\alpha,\beta),\alpha + \beta \neq 2,3 \\
&\frac{x^{\alpha - 3}\left( 1 - x \right)^{\beta - 3}}{B(\alpha,\beta)} \cdot 2 \cdot \left( \alpha - 1 \right) \cdot \left( x + \frac{\alpha - 2}{2} \right),\alpha + \beta = 2\\
&\frac{x^{\alpha - 3}\left( 1 - x \right)^{\beta - 3}}{B(\alpha,\beta)} \cdot \left( \alpha - 1 \right) \cdot \left( \alpha - 2 \right), \alpha + \beta = 3\\
\end{aligned} \right.
\end{aligned}$$
where
$$
h(x,\alpha,\beta)=\left( \alpha + \beta - 2 \right)\left( \alpha + \beta - 3 \right)x^{2} - 2\left( \alpha - 1 \right)\left( \alpha + \beta - 3 \right)x + \left( \alpha - 1 \right)\left( \alpha - 2 \right),
$$
and
$$
f(x,\alpha,\beta)=\left( x - \frac{\alpha - 1}{\alpha + \beta - 2} \right)^{2} - \frac{\left( \alpha - 1 \right)\left( \beta - 1 \right)}{\left( \alpha + \beta - 2 \right)^{2}\left( \alpha + \beta - 3 \right)}.
$$
\end{remark}

\begin{theorem}[Expectation and Variance of Beta R.V.]
Suppose $X\sim\mb(\alpha,\beta)$, then
\[\mathbb{E}[ X^{n}] = \frac{( \alpha )_{n}}{\left( \alpha + \beta \right)_{n}} = \frac{ \binom{\alpha + n - 1}{n} }{ \binom{\alpha + \beta + n - 1}{n} }.\]
Therefore,
\[\mathbb{E}[ X] = \frac{\alpha}{\alpha + \beta}\]
and
\[\mathrm{Var}(X) = \frac{\alpha\beta}{(\alpha + \beta + 1){(\alpha + \beta)}^{2}}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Beta R.V. vs Binomial R.V.]
Suppose $X\sim\mb(\alpha,\beta)$, and $Y\sim\mathrm{binomial}(\alpha+\beta-1,p)$, where $\alpha,\beta\in\mathbb{Z}^+, 0<p<1$. Then
$$
\mathbb{P}(X\ls p)=\mathbb{P}(Y\gs\alpha)
$$
and
$$
\mathbb{P}(X\gs p)=\mathbb{P}(Y\ls\alpha-1).
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{Bivariate and Multivariate Distributions}

\section{Joint Distributions of Two or More R.V.'s}

\begin{definition}[Joint P.M.F. of Multiple R.v.'s]
Let $X_1, X_2, \cdots,X_n$ be discrete r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. The nonnegative function $p_{\mX}:\mathbb{R}^n\to[0,1]$ given by
\[p_{\mX}(\mx) = P_{\mX}\left( \left\{ \mx\right\} \right) = \mathbb{P}(\mX =\mx ) = \left\{ \begin{aligned}
&\mathbb{P}(\mX =\mx ), \mx\in\mX(\Omega) \\
&\qquad0,\quad\mx\in \mathbb{R}^{n}\backslash\mX(\Omega) \\
\end{aligned} \right.\]
is called the joint p.m.f. of $X_1, X_2, \cdots,X_n$.
\end{definition}

\begin{remark}[Properties of Joint P.M.F.]
(1)
\[p_{\mX}(\mx) \gs 0, \forall \mx\in \mX( \Omega ) ~\text{and}~\mathrm{ } p_{\mX}(\mx) = 0, \forall\mx \in \mathbb{R}^{n}\backslash \mX(\Omega).\]
(2)
\[\sum_{\mx\in \mX(\Omega)}{p_{\mX}(\mx)} = \sum_{\mx\in \mX(\Omega)}{\mathbb{P}(\mX=\mx)} = \mathbb{P}(\mX \in \mX(\Omega) ) = \mathbb{P}( \Omega ) = 1\]
(3)
\[\mX(\Omega) \subseteq \prod_{i = 1}^{n}{X_{i}(\Omega)}\]
(4)
\[p_{\mX}(\mx) = \left\{ \begin{aligned}
&\mathbb{P}(\mX =\mx), \mx\in\prod_{i = 1}^{n}X_{i}(\Omega) \\
&\qquad0,\qquad\mx  \in \mathbb{R}^{n}\backslash\prod_{i = 1}^{n}X_{i}( \Omega) \\
\end{aligned} \right.\]
\end{remark}

\begin{theorem}[Joint Marginal P.M.F.]
Suppose $X_1,X_2, \cdots,X_n$ are discrete r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then 
\[p_{X_{i_{1}},X_{i_{2}}, \cdots,X_{i_{k}}}\left( x_{i_{1}},x_{i_{2}}, \cdots,x_{i_{k}} \right) = \left\{ \begin{aligned}
\sum_{\mbox{\tiny$\begin{aligned}
&x_{i} \in X_{i}\left( \Omega \right) \\
i &\neq i_{1},i_{2},\cdots,i_{k} \\
\end{aligned}$}} & {p_{\mX}(\mx), \quad\text{if } x_{i} \in X_{i}\left( \Omega \right), \forall i = i_{1},i_{2}, \cdots,i_{k}} \\
&0,\qquad~\text{o.w.}~ \\
\end{aligned} \right.\]
We call $$p_{X_{i_1},X_{i_2}, \cdots,X_{i_k}}\left(x_{i_1},x_{i_2}, \cdots,x_{i_k}\right)$$ the joint p.m.f. marginalized over $X_{i_1},X_{i_2}, \cdots,X_{i_k}$.  If $k=1$, we call $p_{X_i}(x_i)$ the marginal p.m.f. of $X_i$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Measurable Function]
Suppose $X_1, X_2, \cdots,X_n$ are discrete r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and $g$ is a measurable function from $(\mathbb{R}^n,  \mb_{\mathbb{R}^n})$ to $(\mathbb{R}, \mb_\mathbb{R})$. Then
\[\mathbb{E}\left[ g(\mX) \right] = \sum_{\mbox{\tiny$\begin{aligned}
x_{i} \in X_{i}\left( \Omega \right) \\
i = 1,2, \cdots,n \\
\end{aligned}$}}^{}{g(\mx) \cdot p_{\mX}(\mx).}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Expectation of Linear Combined Measurable Function]
Suppose $X_1, X_2, \cdots,X_n$ are discrete r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and $g_1,g_2, \cdots,g_m$ are measurable functions from $(\mathbb{R}^n,\mb_{\mathbb{R}^n})$ to $(\mathbb{R},\mb_\mathbb{R})$, and $\alpha_1,\alpha_2, \cdots,\alpha_m\in\mathbb{R}$,
Then $$\sum_{k=1}^{m}\alpha_k\cdot g_k(\mX)$$ is a discrete r.v. of $(\Omega,\ma,\mathbb{P})$ and 
\[\mathbb{E}\left[ \sum_{k = 1}^{m}\alpha_{k}g_{k}(\mX) \right] = \sum_{k = 1}^{m}\alpha_{k}\mathbb{E}\left[ g_{k}(\mX) \right].\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Joint P.D.F.]
Let $X_1, X_2, \cdots,X_n$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We say that $X_1, X_2, \cdots,X_n$ are jointly continuous r.v.'s if there exists a nonnegative function $f_{\mX}:\mathbb{R}^n\to[0,\infty)$ s.t.
$$
\mathbb{P}(\mX\in B)=\int\int_{B}\cdots\int f_{\mX}(\mx)\dif \mx,  \forall B\in\mb_{\mathbb{R}^n}.
$$
The function $f_{\mX}$ is called the joint p.d.f. of $X_1, X_2, \cdots,X_n$.
\end{definition}

\begin{theorem}[Joint Marginal P.D.F.]
Suppose $X_1, X_2, \cdots,X_n$ are jointly continuous r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then $X_{i_1},X_{i_2}, \cdots,X_{i_k}$ are also jointly continuous r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ with joint p.d.f.
$$
f_{X_{i_1},X_{i_2}, \cdots,X_{i_k}}\left(x_{i_1},x_{i_2}, \cdots,x_{i_k}\right)=\underbrace{\int_{-\infty}^{\infty}\int_{-\infty}^{\infty}\cdots\int_{-\infty}^{\infty}}_{n-k}{f_{\mX}(\mx)\dif x_i}
$$
where $i\neq i_1,i_2,\cdots i_k$.\\
We call $$f_{X_{i_1},X_{i_2}, \cdots,X_{i_k}}\left(x_{i_1},x_{i_2}, \cdots,x_{i_k}\right)$$ the joint p.d.f. marginalized over $X_{i_1},X_{i_2}, \cdots,X_{i_k}$. If $k=1$, we call $f_{X_i}(x_i)$ the marginal p.d.f. of $X_i$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Measurable Function]
Suppose $X_1, X_2, \cdots,X_n$ are jointly continuous r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and $g$ is a measurable function from $(\mathbb{R}^n, \mb_{\mathbb{R}^n})$ to $(\mathbb{R},\mb_\mathbb{R})$. Then
$$
\mathbb{E}\left[g(\mX)\right]=\int_{-\infty}^{\infty}\int_{-\infty}^{\infty}{\cdots\int_{-\infty}^{\infty}{g(\mx)f_{\mX}(\mx)\dif x_n\cdots \dif x_2\dif x_1}}.
$$
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Properties of Joint P.D.F.]
(1)
\[f_{\mX}(\mx) \gs 0, \forall\mx \in \mathbb{R}^{n}.\]
(2)
\[\int_{- \infty}^{\infty}{\int_{- \infty}^{\infty}{\cdots\int_{- \infty}^{\infty}{f_{\mX}(\mx)\dif\mx}}} = \mathbb{P}(\mX \in \mathbb{R}^{n}) = 1.\]
(3)
\[\mathbb{P}( X_{1} \in B_{1},X_{2} \in B_{2}, \cdots,X_{n} \in B_{n}) = \int_{B_{1}}^{}{\int_{B_{2}}^{}{\cdots\int_{B_{n}}^{}{f_{\mX}(\mx)\dif x_{n}\cdots \dif x_{2}}}}\dif x_{1},\]
$\forall B_{i} \in \mb_{\mathbb{R}}, i = 1,2, \cdots,n$.\\
(4)
\[\mathbb{P}(\mX = \mathbf{a}) = \int_{a_{1}}^{a_{1}}{\int_{a_{2}}^{a_{2}}{\cdots\int_{a_{n}}^{a_{n}}{f_{\mX}(\mx)\dif x_{n}\cdots \dif x_{2}}}}\dif x_{1} = 0.\]
(5)
\[\begin{aligned}
&\quad  \mathbb{P}(a_{i} \ls X_{i} \ls a_{i} + \delta_{i}, i = 1,2, \cdots,n)\\
&= \int_{a_{1}}^{a_{1} + \delta_{1}}{\int_{a_{2}}^{a_{2} + \delta_{2}}{\cdots\int_{a_{n}}^{a_{n} + \delta_{n}}{f_{\mX}(\mx)\dif x_{n}\cdots \dif x_{2}}}}\dif x_{1}\\
&= f_{\mX}( {\mathbf{a}}_{\bm{\delta}} ) \cdot \delta_{1} \cdot \delta_{2} \cdots\delta_{n} \text{ for some }{\mathbf{a}}_{\bm{\delta}} \in \prod_{i = 1}^{n}{[ a_{i},a_{i} + \delta_{i}]} \text{ if } f_{\mX}(\mx) \text{ is continuous}.\\
&\to \lim_{\bm{\delta}\to\mathbf{0}}\frac{\mathbb{P}(a_{i} \ls X_{i} \ls a_{i} + \delta_{i}, i = 1,2, \cdots,n)}{\delta_{1} \cdot \delta_{2} \cdots \delta_{n}} = \lim_{\bm{\delta}\to\mathbf{0}}{f_{\mX}( {\mathbf{a}}_{\bm{\delta}} )} = f_{\mX}(\mathbf{a})\\
& \text{and } \mathbb{P}(a_{i} \ls X_{i} \ls a_{i} + \delta_{i}, i = 1,2, \cdots,n) \approx f_{\mX}(\mathbf{a}) \cdot \delta_{1} \cdot \delta_{2} \cdots\delta_{n}.\\
\end{aligned}\]
\end{remark}

\begin{corollary}[Expectation of Linear Combined Measurable Function]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{jointly continuous}
r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ and $g_{1}, g_{2}, \cdots, g_{m}$ are \textbf{measurable functions} from $(\mathbb{R}^{n}, \mb_{\mathbb{R}^{n}})$ to $(\mathbb{R},\mb_{\mathbb{R}})$, and $\alpha_{1},\alpha_{2}, \cdots,\alpha_{m} \in \mathbb{R}$, then
\[\sum_{k = 1}^{m}\alpha_{k}\cdot g_{k}(\mX)
\]
is a r.v. of $(\Omega,\ma,\mathbb{P})$ and
\[\mathbb{E}\left[ \sum_{k = 1}^{m}\alpha_{k}\cdot g_{k}(\mX) \right] = \sum_{k = 1}^{m}\alpha_{k}\cdot \mathbb{E}\left[ g_{k}(\mX) \right].\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Joint C.D.F.]
Let $X_{1}, X_{2}, \cdots,X_{n}$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. The \textbf{joint c.d.f.} of $X_{1}, X_{2}, \cdots,X_{n}$ is given by
\[F_{\mX}(\mx) = \mathbb{P}( X_{1} \ls x_{1},X_{2} \ls x_{2}, \cdots,X_{n} \ls x_{n}), \forall\mx \in \mathbb{R}^{n}.\]
\end{definition}

\begin{theorem}[Joint Marginal C.D.F.]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then
\[\begin{aligned}
&F_{X_{i_{1}},X_{i_{2}},\cdots,X_{i_{k}}}\left( x_{i_{1}},x_{i_{2}}, \cdots,x_{i_{k}} \right) \\
=& F_{\mX}\left( \infty,\, \cdots,\infty,x_{i_{1}},\infty, \cdots,\infty,x_{i_{2}},\infty, \cdots,\infty,x_{i_{k}},\infty, \cdots,\infty \right).
\end{aligned}\]
We call \[F_{X_{i_{1}},X_{i_{2}}, \cdots,X_{i_{k}}}\left( x_{i_{1}},x_{i_{2}}, \cdots,x_{i_{k}} \right)\] the \textbf{joint c.d.f.} marginalized over $X_{i_{1}},X_{i_{2}}, \cdots,X_{i_{k}}$. If $k = 1$, we call $F_{X_{i}}(x_{i})$ the \textbf{marginal c.d.f.} of $X_{i}$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Properties of Joint C.D.F.]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are r.v.'s of a probability
space $(\Omega,\ma,\mathbb{P})$.\\
(1) $F_{\mX}(\mx)$ is \textbf{increasing} and \textbf{right
continuous} in each argument $x_{i},i = 1,2, \cdots,n.$\\
(2) $F_{\mX}(\mx) = 0$ if there exists at least one $i$ such
that $x_{i} = - \infty$.\\
(3) $F_{\mX}\left( \infty,\infty, \cdots,\infty \right) = 1$.\\
(4) If $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{jointly continuous}
r.v.'s, then
\[f_{\mX}(\mx) = \frac{\partial^{n} F_{\mX}(\mx)}{\partial x_{1}\partial x_{2}\cdots\partial x_{n}}, \forall\mx \in \mathbb{R}^{n}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Independent R.V.'s}

\begin{definition}[Independent Set]
Let $\{X_{i}, i \in I\}$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We say that the r.v.'s $\{X_{i}, i \in I\}$ are \textbf{independent} if for any finite subset
$\left\{ X_{i_{1}},X_{i_{2}}, \cdots,X_{i_{k}} \right\} (k \gs 2)$ of $\{ X_{i}, i \in I\}$, the events $X_{i_{1}} \in B_{i_{1}},X_{i_{2}} \in B_{i_{2}}, \cdots,X_{i_{k}} \in B_{i_{k}}$ are independent $\forall B_{i_{1}},B_{i_{2}}, \cdots,B_{i_{k}} \in \mb_{\mathbb{R}}.$ Otherwise, the r.v.'s $\{X_{i}, i \in I\}$ are dependent.
\end{definition}

\begin{theorem}[Equivalent Statements of Independence]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. The following three statements are \textbf{equivalent}:\\
(1) $X_{1}, X_{2}, \cdots,X_{n}$ are independent.\\
(2)
\[\mathbb{P}( X_{1} \in B_{1},X_{2} \in B_{2}, \cdots,X_{n} \in B_{n}) = \prod_{i = 1}^{n}{\mathbb{P}(X_{i} \in B_{i})},\forall B_{1},B_{2}, \cdots,B_{n} \in \mb_{\mathbb{R}}\]
(3)
\[F_{\mX}(\mx) = \prod_{i = 1}^{n}{F_{X_{i}}(x_{i})}, \forall \mx\in \mathbb{R}^{n}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Necessary and Sufficient Condition of Independence]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are r.v.'s of a probability
space $(\Omega,\ma,\mathbb{P})$.\\
(1) If $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{discrete} r.v.'s, then $X_{1}, X_{2}, \cdots,X_{n}$ are independent
\[\Leftrightarrow P_{\mX}(\mx) = \prod_{i = 1}^{n}{P_{X_{i}}(x_{i})}, \forall\mx \in \mathbb{R}^{n}\]
(2) If $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{jointly continuous}
r.v.'s, then $X_{1}, X_{2}, \cdots,X_{n}$ are independent
\[\Leftrightarrow f_{\mX}(\mx) = \prod_{i = 1}^{n}{f_{X_{i}}(x_{i})}, \forall \mx\in \mathbb{R}^{n}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Indicator Function]
Let $(\Omega,\ma,\mathbb{P})$ be a probability space, and $A \in \ma$. The \textbf{indicator function} $I_{A}$ of the event $A$ is given by
\[I_{A}(w) = \left\{ \begin{aligned}
&1, ~\text{ if}~ w \in A \\
&0,\quad ~\text{o.w.}~ \\
\end{aligned} \right.\quad ~\text{i.e.}~\quad I_{A} = \left\{ \begin{aligned}
&1, \text{ if } A \text{ occurs} \\
&0,\quad \text{o.w.} \\
\end{aligned} \right.\]
\end{definition}

\begin{theorem}[Indicator Function is a Discrete Measurable Function]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space. $I_{A}$ is a \textbf{discrete r.v.} of $(\Omega,\ma,\mathbb{P})$ for all $A \in \ma$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Indicator R.V.'s Indicates Independence]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and $A_{1},A_{2}, \cdots,A_{n} \in \ma$. The events $A_{1},A_{2}, \cdots,A_{n}$ are \textbf{independent} $\Leftrightarrow$ the \textbf{indicator r.v.'s}
$I_{A_{1}},I_{A_{2}}, \cdots,I_{A_{n}}$ are \textbf{independent}.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectation of Measurable Functions of Independent R.V.]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are independent r.v.'s of a
probability space $(\Omega,\ma,\mathbb{P})$, and $g_{1},g_{2}, \cdots,g_{n}$ are measurable functions from $(\mathbb{R},\mb_{\mathbb{R}})$ to
$(\mathbb{R},\mb_{\mathbb{R}})$. Then $g_{1}(X_{1}),g_{2}(X_{2}), \cdots,g_{n}(X_{n})$ are independent and
\[\mathbb{E}\left[ \prod_{i = 1}^{n}{g_{i}(X_{i})} \right] = \prod_{i = 1}^{n}{\mathbb{E}[ g_{i}(X_{i})]}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Independent Expectations Can't Imply Independence of R.V.'s]
The converse is \textbf{not true}, i.e.,
\[\mathbb{E}\left[ \prod_{i = 1}^{n}{g_{i}(X_{i})} \right] = \prod_{i = 1}^{n}{\mathbb{E}[ g_{i}(X_{i})]} \nRightarrow g_{1}(X_{1}), g_{2}(X_{2}), \cdots, g_{n}(X_{n})~\text{ are independent.}~\]
\end{remark}

\section{Conditional Distributions}

\begin{lemma}[Properties of Conditional Probability]
Suppose $(\Omega,\ma,\mathbb{P})$ is a probability space, and $A,B,A_{1},A_{2}, \cdots,A_{n},B_{1},B_{2}, \cdots,B_{n} \in \ma$.
\[\mathbb{P}( A | B ) = \left\{ \begin{aligned}
&\frac{\mathbb{P}(A  \cap B)}{\mathbb{P}(B)},~\text{ if }~\mathbb{P}( B ) \neq 0\\
&\qquad0\qquad,~\text{ if }~\mathbb{P}( B ) = 0 \\
\end{aligned} \right.\]
(1) If $\mathbb{P}(B) \neq 0$, then $\mathbb{P}(\cdot | B)$ regarded as a function on $\ma$ is a \textbf{probability} \textbf{measure.}\\
(2) \textbf{Multiplication theorem:}
\[\mathbb{P}( A_{1}\cap{A_{2}\cap{\cdots\cap A_{n}}} ) = \mathbb{P}( A_{1} )\mathbb{P}( A_{2} | A_{1} ) \cdots \mathbb{P}(A_{n}|A_{1}\cap{A_{2}\cap{  \cdots\cap A_{n - 1}}}).\]
(3) \textbf{Total probability theorem:}\\
If ${\{ B_{n}\}}_{n = 1}^{\infty}$ is a partition of $\Omega$, then
\[\mathbb{P}(A) = \sum_{n = 1}^{\infty}{\mathbb{P}(B_{n}) \cdot}\mathbb{P}(A|B_{n}),\forall A \in \ma.\]
(4) \textbf{Bayes' theorem:}\\
If $\mathbb{P}(A) \neq 0$ and ${\{ B_{n}\}}_{n = 1}^{\infty}$ is
a partition of $\Omega$, then
\[\mathbb{P}(B_{k} | A) = \frac{\mathbb{P}(B_{k}) \cdot \mathbb{P}(A|B_{k})}{\sum_{n = 1}^{\infty}{\mathbb{P}(B_{n}) \cdot}\mathbb{P}( A | B_{n} )}, \forall A \in \ma\mathrm{, }\mathbb{P}(A) > 0, k = 1,2,\dots\]
\end{lemma}

\begin{proof}
  abc
\end{proof}

$\bigstar P_{X|Y}(x|y): X$ and $Y$ are discrete r.v.'s

\begin{definition}[P.M.F. and C.D.F. of D-D]
Let $X$ and $Y$ be discrete r.v.'s of a probability space
$(\Omega,\ma,\mathbb{P})$ and $y \in \mathbb{R}$. The conditional p.m.f. $P_{X|Y}(x|y)$ of $X$ given that $Y = y$ is given by
\[P_{X|Y}(x|y) = \left\{ \begin{aligned}
\mathbb{P}( X = x | Y = y ) &= \frac{\mathbb{P}(X = x,Y = y)}{\mathbb{P}(Y = y)} \\
&= \frac{P_{X,Y}(x,y)}{P_{Y}(y)},P_{Y}(y) \neq 0,\forall x \in \mathbb{R} \\
&0, \qquad\qquad~\text{o.w.}~\\
\end{aligned} \right. \]
The conditional c.d.f. $F_{X|Y}\left( \cdot | y \right)$ of $X$ given that $Y = y$ is given by
\[\begin{aligned}
F_{X|Y}(x|y) &= \mathbb{P}( X \ls x | Y = y )\\
& = \sum_{t \ls x, t \in X(\Omega)}^{}{\mathbb{P}( X = t | Y = y )} \\
&= \sum_{t \ls x, t \in X(\Omega)}^{}{P_{X|Y}\left( t | y \right)},\forall x \in \mathbb{R}.
\end{aligned}\]
\end{definition}

\begin{remark}[Joint P.M.F.]
(1) $P_{X,Y}(x,y) = P_{Y}(y) \cdot P_{X|Y}(x|y) = P_{X}(x) \cdot P_{Y|X}(y|x)$.\\
(2) A similar definition can be made for discrete \textbf{random} \textbf{vectors}.
\end{remark}

\begin{theorem}[Properties of D-D Conditional Probability]
Suppose $X,Y,X_{1},X_{2}, \cdots,X_{n}$ are discrete r.v.'s of a
probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) If $y \in \mathbb{R}$ and $P_{Y}(y) \neq 0$, then $P_{X|Y}\left( \cdot | y \right)$ is a p.m.f.\\
(2) $\forall \mx \in \mathbb{R}^{n},$
 \[P_{\mX}(\mx) = P_{X_{1}}(x_{1}) \cdot P_{X_{2}|X_{1}}\left( x_{2}|x_{1} \right) \cdots P_{X_{n}|X_{1},X_{2}, \cdots,X_{n - 1}}\left( x_{n}|x_{1},x_{2}, \cdots,x_{n - 1} \right).\]
(3) $\forall x \in \mathbb{R},$
\[P_{X}(x) = \sum_{y \in Y(\Omega)}^{}{P_{Y}(y)} \cdot P_{X|Y}(x|y).\]
(4) If $x \in \mathbb{R}$ and $P_{X}(x) \neq 0$, then
\[P_{Y|X}(y|x) = \frac{P_{Y}(y) \cdot P_{X|Y}(x|y)}{\sum_{y \in Y(\Omega)}^{}{P_{Y}(y)} \cdot P_{X|Y}(x|y)},\forall y \in \mathbb{R}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

$\bigstar f_{X|Y}(x|y): X$ and $Y$ are jointly continuous r.v.'s

\begin{definition}[C.D.F. and P.D.F. of C-C]
Let $X$ and $Y$ be jointly continuous r.v.'s of a probability
space $(\Omega,\ma,\mathbb{P})$ and $y \in \mathbb{R}$. The conditional c.d.f. $F_{X|Y}(x|y)$ of $X$ given that $Y = y$ is given by
\[F_{X|Y}(x|y) = \left\{ \begin{aligned}
\lim_{\delta \to 0}&  {\mathbb{P}( X \ls x | y \ls Y \ls y + \delta )}\\
&\quad= \lim_{\delta \to 0}\frac{\mathbb{P}( X \ls x,y \ls Y \ls y + \delta)}{\mathbb{P}( y \ls Y \ls y + \delta)} \\
&\quad= \lim_{\delta \to 0}\frac{[ F_{X,Y}\left( x,y + \delta \right) - F_{X,Y}(x,y)]/\delta}{[ F_{Y}\left( y + \delta \right) - F_{Y}(y)]/\delta} \\
&\quad= \frac{\frac{\partial F_{X,Y}(x,y)}{\partial y}}{f_{Y}(y)},f_{Y}(y) \neq 0,\forall x \in \mathbb{R} \\
0&, \qquad\qquad~\text{o.w.}~ \\
\end{aligned} \right. \]
The conditional p.d.f. $f_{X|Y}\left( \cdot | y \right)$ of $X$ given that $Y = y$ is given by
\[f_{X|Y}(x|y) = \left\{ \begin{aligned}
\frac{\partial F_{X|Y}(x|y)}{\partial x} &= \frac{f_{X,Y}(x,y)}{f_{Y}(y)},f_{Y}(y) \neq 0,\forall x \in \mathbb{R} \\
&0,\qquad~\text{o.w.}~ \\
\end{aligned} \right.\]
\end{definition}

\begin{remark}[Joint P.D.F.]
(1) $f_{X,Y}(x,y) = f_{Y}(y) \cdot f_{X|Y}(x|y) = f_{X}(x) \cdot f_{Y|X}(y|x), \forall x,y \in \mathbb{R}$\\
(2) A similar definition can be made for jointly continuous \textbf{random vectors}.
\end{remark}

\begin{theorem}[Properties of C-C Conditional Probability]
Suppose $X,Y,X_{1}, X_{2}, \cdots,X_{n}$ are jointly continuous r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) If $y \in \mathbb{R}$ and $f_{Y}(y) \neq 0$, then $f_{X|Y}\left( \cdot | y \right)$ is a p.d.f.\\
(2) $\forall \mx \in \mathbb{R}^{n}$,
\[f_{\mX}(\mx) = f_{X_{1}}(x_{1}) \cdot f_{X_{2}|X_{1}}\left( x_{2}|x_{1} \right) \cdots f_{X_{n}|X_{1},X_{2}, \cdots,X_{n - 1}}\left( x_{n}|x_{1},x_{2}, \cdots,x_{n - 1} \right).\]
(3)
\[f_{X}(x) = \int_{- \infty}^{\infty}{f_{Y}(y) \cdot f_{X|Y}(x|y)}\dif y,\forall x \in \mathbb{R}.\]
(4) If $x \in \mathbb{R}$ and $f_{X}(x) \neq 0$, then
\[f_{Y|X}(y|x) = \frac{f_{Y}(y) \cdot f_{X|Y}(x|y)}{\int_{- \infty}^{\infty}{f_{Y}(y) \cdot f_{X|Y}(x|y)\dif y}},\forall y \in \mathbb{R}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

$\bigstar f_{X|Y}(x|y) ~\text{and}~ P_{X|Y}(x|y): X$ is a continuous r.v. and $Y$ is a discrete r.v.

\begin{definition}[C.D.F., P.D.F. and P.M.F. of C-D and D-C]
Let $X$ be a continuous r.v. and $Y$ be a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
The conditional \textbf{c.d.f.} $F_{X|Y}\left( \cdot | y \right)$ of $X$ given that $Y = y, y \in \mathbb{R}$ is given by
\[F_{X|Y}(x|y) = \left\{ \begin{aligned}
&\mathbb{P}(X \ls x | Y = y),P_{Y}(y) \neq 0,\forall x \in \mathbb{R} \\
&\qquad\qquad0,\qquad~\text{o.w.}~ \\
\end{aligned} \right. \]
The conditional \textbf{p.d.f.} $f_{X|Y}\left( \cdot | y \right)$ of $X$ given that $Y = y, y \in \mathbb{R}$ is given by
\[f_{X|Y}(x|y) = \left\{ \begin{aligned}
\frac{\partial F_{X|Y}(x|y)}{\partial x} &= \lim_{\delta \to 0}\frac{F_{X|Y}\left( x + \delta | y \right) - F_{X|Y}(x|y)}{\delta} \\
&= \lim_{\delta \to 0}\frac{\mathbb{P}( x \ls X \ls x + \delta | Y = y )}{\delta},P_{Y}(y) \neq 0,\forall x \in \mathbb{R} \\
&0,\qquad~\text{o.w.}~ \\
\end{aligned} \right. \]
The conditional \textbf{p.m.f.} $P_{Y|X}\left( \cdot | x \right)$ of $Y$ given that $X = x,x \in \mathbb{R}$ is given by
\[P_{Y|X}(y|x) = \left\{ \begin{aligned}
\lim_{\delta \to 0}&  \mathbb{P}( Y = y | x \ls X \ls x + \delta ) \\
&\quad= \lim_{\delta \to 0}\frac{\mathbb{P}(Y = y) \cdot \mathbb{P}( x \ls X \ls x + \delta | Y = y )/\delta}{\mathbb{P}(x \ls X \ls x + \delta)/\delta} \\
&\quad= \frac{P_{Y}(y) \cdot f_{X|Y}(x|y)}{f_{X}(x)},f_{X}(x) \neq 0,\forall y \in \mathbb{R} \\
&0, \qquad~\text{o.w.}~ \\
\end{aligned} \right. \]
The conditional \textbf{c.d.f.} $F_{Y|X}\left( \cdot | x \right)$ of $Y$ given that $X = x,x \in \mathbb{R}$ is given by
\[F_{Y|X}(y|x) = \left\{ \begin{aligned}
\sum_{t \ls y, t \in Y(\Omega) }^{}{P_{Y|X}\left( t|x \right)} &= \frac{\sum_{t \ls y, t \in Y(\Omega)}^{}{P_{Y}(t) \cdot f_{X|Y}\left( x | t \right)}}{f_{X}(x)},\\
&\quad f_{X}(x) \neq 0,\forall y \in \mathbb{R} \\
&0, \qquad~\text{o.w.}~ \\
\end{aligned} \right. \]
\end{definition}

\begin{remark}[Calculation of C-D P.D.F. and D-C P.M.F.]
(1)
$P_{Y}(y) \cdot f_{X|Y}(x|y) = f_{X}(x) \cdot P_{Y|X}(y|x), \forall x,y \in \mathbb{R}.$

(2) If $y \in \mathbb{R}$ and $P_{Y}(y) \neq 0$, then

\[f_{X|Y}(x|y) = \frac{f_{X}(x) \cdot P_{Y|X}(y|x)}{P_{Y}(y)},\forall x \in \mathbb{R}.\]

If $x \in \mathbb{R}$ and $f_{X}(x) \neq 0$, then

\[P_{Y|X}(y|x) = \frac{P_{Y}(y) \cdot f_{X|Y}(x|y)}{f_{X}(x)},\forall y \in \mathbb{R}.\]

\end{remark}

\begin{theorem}[Properties of C-D and D-C Conditional Probability]
Suppose $X$ is a continuous r.v. and $Y$ is a discrete r.v. of a probability space $(\Omega,\ma,\mathbb{P})$.\\
(1) If $y \in \mathbb{R}$ and $P_{Y}(y) \neq 0$, then
$f_{X|Y}\left( \cdot | y \right)$ is a p.d.f. If $x \in \mathbb{R}$ and $f_{X}(x) \neq 0$, then $P_{Y|X}(y|x)$ is a p.m.f.\\
(2)
\[f_{X}(x) = \sum_{y \in Y(\Omega)}^{}{P_{Y}(y) \cdot f_{X|Y}(x|y)}, \forall x \in \mathbb{R}.\]

\[P_{Y}(y) = \int_{- \infty}^{\infty}{f_{X}(x) \cdot P_{Y|X}(y|x)\dif x,} \forall y \in \mathbb{R}.\]

(3) If $x \in \mathbb{R}$ and $f_{X}(x) \neq 0$, then

\[P_{Y|X}(y|x) = \frac{P_{Y}(y) \cdot f_{X|Y}(x|y)}{\sum_{y \in Y(\Omega)}^{}{P_{Y}(y) \cdot f_{X|Y}(x|y)}}, \forall y \in \mathbb{R}.\]

If $y \in \mathbb{R}$ and $P_{Y}(y) \neq 0$, then

\[f_{X|Y}(x|y) = \frac{f_{X}(x) \cdot P_{Y|X}(y|x)}{\int_{- \infty}^{\infty}{f_{X}(x) \cdot P_{Y|X}(y|x)\dif x}}, \forall x \in \mathbb{R}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Expectation of Conditional R.V.]
  Let $X$ and $Y$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ and
  $y \in \mathbb{R}$. The conditional expectation $\mathbb{E}[ X|Y = y]$ of $X$ given that $Y = y$ is given by
  \[\mathbb{E}\left[ X | Y = y \right] = \left\{ \begin{aligned}
  &\sum_{x \in X(\Omega)}^{}{x \cdot P_{X|Y}(x|y),\quad \text{if}~X~\text{is a discrete r.v.}} \\
  &\int_{- \infty}^{\infty}{x \cdot}f_{X|Y}(x|y)\dif x,\quad\text{if}~X~\text{is a continuous r.v.} \\
  \end{aligned} \right. \]
\end{definition}

\begin{theorem}[Expectation of Conditional Measurable Function]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and $g$ is a measurable function from $(\mathbb{R},\mb_{\mathbb{R}})$ to $(\mathbb{R},\mb_{\mathbb{R}})$. Then
\[\mathbb{E}\left[ g(X) | Y = y \right] = \left\{ \begin{aligned}
&\sum_{x \in X(\Omega)}^{}{g(x) \cdot P_{X|Y}(x|y),\quad \text{if }X\text{ is a discrete r.v.}} \\
&\int_{- \infty}^{\infty}g(x) \cdot f_{X|Y}(x|y)\dif x,\quad\text{if }X\text{ is a continuous r.v.} \\
\end{aligned} \right. \]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Transformations of Two R.V.'s}

\begin{theorem}[Transformations of Two R.V.'s]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, $g$ and $h$ are measurable functions from $(\mathbb{R}^{2},\mb_{\mathbb{R}^{2}})$ to $(\mathbb{R},\mb_{\mathbb{R}})$, and $U = g(X,Y)$ and $V = h(X,Y)$.\\
(1) If $X$ and $Y$ are discrete r.v.'s, then $U$ and $V$ are discrete r.v.'s and
\[P_{U,V}(u,v) = \sum_{(x,y):g(x,y) = u,h(x,y) = v}^{}{P_{X,Y}(x,y)}.\]
(2) If $X$ and $Y$ are jointly continuous r.v.'s, $U$ and $V$ are discrete r.v.'s, then
\[P_{U,V}(u,v) = \iint_{\{(x,y):g(x,y) = u,h(x,y) = v\}}^{}{f_{X,Y}(x,y)\dif x\dif y.}\]
(3) If $X$ and $Y$ are jointly continuous r.v.'s, $U$ and $V$ are jointly continuous r.v.'s, and
\[J(x,y) = \left| \begin{matrix}
\frac{\partial g(x,y)}{\partial x} & \frac{\partial g(x,y)}{\partial y} \\
\frac{\partial h(x,y)}{\partial x} & \frac{\partial h(x,y)}{\partial y} \\
\end{matrix} \right| \neq 0\]

$\forall(x,y) \in \{(x,y):g(x,y) = u, h(x,y) = v\}$, where $J(x,y)$ is the Jacobian determinant, $(u,v) \in g(X,Y)(\Omega) \times h(X,Y)(\Omega)$, then
\[f_{U,V}(u,v) = \sum_{(x,y):g(x,y) = u,h(x,y) = v}^{}\frac{f_{X,Y}(x,y)}{|J(x,y)|}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Convolution Theorem]
Suppose $X$ and $Y$ are two independent r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ and $Z = X + Y$.\\
(1) If $X$ and $Y$ are discrete r.v.'s, then
\[P_{Z}(z) = \sum_{x \in X(\Omega)}^{}{P_{X}(x) \cdot}P_{Y}(z-x)\]
(2) If $X$ and $Y$ are jointly continuous r.v.'s, then
\[f_{Z}(z) = \int_{- \infty}^{\infty}{f_{X}(x) \cdot f_{Y}(z-x)\dif x}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Order Statistics}

\begin{definition}[Order Statistic]
Let $X_{1}, X_{2}, \cdots,X_{n}$ be i.i.d. r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. The $i^{th}$ order statistic $X_{(i)},  i = 1,2, \cdots,n$
of $X_{1}, X_{2}, \cdots,X_{n}$ is defined as the $i^{th}$
\textbf{smallest} value in $\left\{ X_{1}, X_{2}, \cdots,X_{n} \right\}$ so that
$X_{(1)} \ls X_{(2)} \ls \cdots\ls X_{(n)}$, namely, $X_{\left( i \right)}(w) =$ the $i^{th}$ smallest value in $\left\{ X_{1}(w), X_{2}(w), \cdots,X_{n}(w) \right\}$ for all $w \in \Omega$. In particular,
$X_{(1)} = \min\{ X_{1}, X_{2}, \cdots,X_{n}\}$ and $X_{(n)} = \max\{ X_{1}, X_{2}, \cdots,X_{n}\}$.
\end{definition}

\begin{remark}[Without Equal $\&$ Not I.I.D.]
(1) If $X_{1}, X_{2}, \cdots,X_{n}$ are jointly continuous r.v.'s, then
\[\mathbb{P}\left( X_{\left( i \right)} = X_{\left( j \right)} \right) = 0, \forall i \neq j   \to \mathbb{P}\left( X_{\left( 1 \right)} < X_{\left( 2 \right)} < \cdots< X_{\left( n \right)} \right) = 1.\]
(2) $X_{(i)},  i = 1,2, \cdots,n$ is a function of $X_{1}, X_{2}, \cdots,X_{n}$ $\to X_{(1)},X_{(2)}, \cdots,X_{(n)}$ are \textbf{neither independent} \textbf{nor identically distributed} in general.
\end{remark}

\begin{definition}[Random Sample]
A random sample of size $n$ of a probability space $(\Omega,\ma,\mathbb{P})$ is a sequence of $n$ i.i.d. r.v.'s $X_1, X_2, \cdots,X_n$ of $(\Omega,\ma,\mathbb{P})$.
\end{definition}

\begin{definition}[Range, Midrange, Median and Mean of Random Sample]
Let $X_{1}, X_{2}, \cdots,X_{n}$ be a random sample of size $n$ of
a probability space $(\Omega,\ma,\mathbb{P})$.

The \textbf{sample range} is given by
$X_{\left( n \right)} - X_{\left( 1 \right)}$.

The \textbf{sample midrange} is given by
$\frac{X_{\left( 1 \right)} + X_{\left( n \right)}}{2}$.

The \textbf{sample median} is given by $\left\{ \begin{aligned}
&\quad X_{\left( i + 1 \right)},\qquad~\text{ if}~ n = 2i + 1 \\
&\frac{X_{\left( i \right)} + X_{\left( i + 1 \right)}}{2}, ~\text{if}~ n = 2i\\
\end{aligned} \right.$

The \textbf{sample mean} $\bar{X}$ is given by $\bar{X} = \frac{1}{n}\sum_{i = 1}^{n}X_{i}.$
\end{definition}

\begin{remark}[Forced Decline]
If there exist $i_{j} < i_{l}$ such that $x_{i_{j}} \gs x_{i_{l}}$, then
\[\begin{aligned}
&F_{X_{(i_{1})},X_{(i_{2})},\cdots,X_{(i_{k})}}\left( x_{i_{1}}, \cdots,x_{i_{j}}, \cdots,x_{i_{l}}, \cdots, x_{i_{k}} \right) \\
=& F_{X_{(i_{1})},X_{(i_{2})},\cdots,X_{(i_{k})}}\left( x_{i_{1}}, \cdots,x_{i_{l}}, \cdots,x_{i_{l}}, \cdots, x_{i_{k}} \right) \end{aligned}\]
and
$f_{X_{(i_{1})},X_{(i_{2})},\cdots,X_{(i_{k})}}\left( x_{i_{1}},x_{i_{2}}, \cdots, x_{i_{k}} \right) = 0$.
\end{remark}

\begin{theorem}[C.D.F. and P.D.F. of Jointly Order R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are i.i.d. jointly continuous r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ with common c.d.f. $F(x)$ and common p.d.f. $f(x)$. If
$1 \ls i_{1} < i_{2} < \cdots < i_{k} \ls n,  - \infty < x_{i_{1}} < x_{i_{2}} < \cdots < x_{i_{k}} < \infty$, then
\[\begin{aligned}
&F_{X_{\left(i_{1}\right)},X_{\left(i_{2}\right)}, \cdots,X_{\left(i_{k}\right)}}\left( x_{i_{1}},x_{i_{2}}, \cdots, x_{i_{k}} \right)\\
=& \sum_{j_{k} = i_{k}}^{n}{\sum_{j_{k - 1} = i_{k - 1}}^{j_{k}}\cdots}\sum_{j_{1} = i_{1}}^{j_{2}} \binom{n}{j_{k}}\binom{j_{k}}{j_{k - 1}} \cdots \binom{j_{2}}{j_{1}}\left[ F\left( x_{i_{1}} \right) \right]^{j_{1}} \left[ F\left( x_{i_{2}} \right) - F\left( x_{i_{1}} \right) \right]^{j_{2} - j_{1}}\\
&\qquad\cdots \left[ F\left( x_{i_{k}} \right) - F\left( x_{i_{k - 1}} \right) \right]^{j_{k} - j_{k - 1}} \left[ 1 - F\left( x_{i_{k}} \right) \right]^{n - j_{k}}\\
\end{aligned}\]
and
\[\begin{aligned}
&\qquad f_{X_{\left(i_{1}\right)},X_{\left(i_{2}\right)}, \cdots,X_{\left(i_{k}\right)}}\left( x_{i_{1}},x_{i_{2}}, \cdots, x_{i_{k}} \right)\\
&= \frac{n!}{\left( i_{1} - 1 \right)!\left( i_{2} - i_{1} - 1 \right)!\cdots\left( i_{k} - i_{k - 1} - 1 \right)!\left( n - i_{k} \right)!}\\
&\cdot f\left( x_{i_{1}} \right)f\left( x_{i_{2}} \right)\cdots f\left( x_{i_{k}} \right)\cdot \left[ F\left( x_{i_{1}} \right) \right]^{i_{1} - 1}\left[ F\left( x_{i_{2}} \right) - F\left( x_{i_{1}} \right) \right]^{i_{2} - i_{1} - 1}\\
&\cdots \left[ F\left( x_{i_{k}} \right) - F\left( x_{i_{k - 1}} \right) \right]^{i_{k} - i_{k - 1} - 1} \left[ 1 - F\left( x_{i_{k}} \right) \right]^{n - i_{k}}\\
\end{aligned}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Beta R.V. vs Binomial R.V.]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are i.i.d. r.v.'s $\sim U(0,1)$, then
\[X_{(i)}\sim\mb\left( i,n + 1 - i \right), i = 1,2, \cdots,n.\]
\end{corollary}

\begin{proof}
\[\begin{aligned}
f_{X_{\left( i \right)}}(x) &= \frac{n!}{\left( i - 1 \right)!\left( n - i \right)!}f(x)\left[ F(x) \right]^{i - 1}\left[ 1 - F(x) \right]^{n - i}\\
&= \frac{n!}{\left( i - 1 \right)!\left( n - i \right)!}1 \cdot x^{i - 1}{(1 - x)}^{n - i}\\
&= \frac{\Gamma(n + 1)}{\Gamma(i)\Gamma(n + 1 - i)}x^{i - 1}{(1 - x)}^{(n + 1 - i) - 1}\\
&= \frac{x^{i - 1}{(1 - x)}^{(n + 1 - i) - 1}}{B(i,n + 1 - i)}, 0 < x < 1\\
&\to X_{(i)}\sim\mb\left( i,n + 1 - i \right)
\end{aligned}\]
\end{proof}

\begin{corollary}[Cases One, Two and $n$ Order R.V.'s]

(1)
\[\begin{aligned}
 F_{X_{\left( i \right)}}(x) &= \sum_{j = i}^{n}{\binom{n}{j}\left[ F(x) \right]^{j}\left[ 1 - F(x) \right]^{n - j}}, -\infty < x < \infty,\\
f_{X_{\left( i \right)}}(x) &= \frac{n!}{\left( i - 1 \right)!\left( n - i \right)!}f(x)\left[ F(x) \right]^{i - 1}\left[ 1 - F(x) \right]^{n - i}, -\infty < x < \infty.
\end{aligned}\]
In particular,
\[\begin{aligned}
F_{X_{\left( 1 \right)}}(x) &= 1 - \left[ 1 - F(x) \right]^{n}, -\infty < x < \infty,\\
f_{X_{\left( 1 \right)}}(x) &= n\cdot f(x)\left[ 1 - F(x) \right]^{n - 1}, -\infty < x < \infty,
\end{aligned}\]
and
\[F_{X_{\left( n \right)}}(x) = \left[ F(x) \right]^{n}, f_{X_{\left( n \right)}}(x) = nf(x)\left[ F(x) \right]^{n - 1}, -\infty < x < \infty.\]
(2)
\[\begin{aligned}
&F_{X_{\left( i_{1} \right)},X_{\left( i_{2} \right)}}(x,y)\\
=& \sum_{j_{2} = i_{2}}^{n}{\sum_{j_{1} = i_{1}}^{j_{2}}{\binom{n}{j_{2}}  \binom{j_{2}}{j_{1}} \left[ F(x) \right]^{j_{1}}\left[ F(y) - F(x) \right]^{j_{2} - j_{1}}\left[ 1 - F(y) \right]^{n - j_{2}}}},\\
& - \infty < x < y < \infty
\end{aligned}\]
\[\begin{aligned}
f_{X_{\left( i_{1} \right)},X_{\left( i_{2} \right)}}(x,y) =& \frac{n!}{\left( i_{1} - 1 \right)!\left( i_{2} - i_{1} - 1 \right)!\left( n - i_{2} \right)!}f(x)f(y)\left[ F(x) \right]^{i_{1} - 1}
\\
\cdot &\left[ F(y) - F(x) \right]^{i_{2} - i_{1} - 1}\left[ 1 - F(y) \right]^{n - i_{2}}, - \infty < x < y < \infty
\end{aligned}\]
(3)
\[\begin{aligned}
&\quad F_{X_{(1)},X_{(2)}, \cdots,X_{(n)}}\left( x_{1},x_{2}, \cdots, x_{n} \right)\\
&= \sum_{j_{n - 1} = i_{n - 1}}^{n}{\sum_{j_{n - 2} = i_{n - 2}}^{j_{n - 1}}\cdots}\sum_{j_{1} = i_{1}}^{j_{2}} \binom{n}{j_{n - 1}}  \binom{j_{n - 1}}{j_{n - 2}} \cdots\binom{j_{2}}{j_{1}}\left[ F( x_{1} ) \right]^{j_{1}}\\
&\cdot\left[ F( x_{2} ) - F( x_{1} ) \right]^{j_{2} - j_{1}}\cdots \left[ F( x_{n - 1} ) - F( x_{n - 2} ) \right]^{j_{n - 1} - j_{n - 2}} \left[ F( x_{n} ) - F( x_{n - 1} ) \right]^{n - j_{n - 1}}\\
\end{aligned}\]
and
\[\begin{aligned}
&f_{X_{\left( 1 \right)},X_{\left( 2 \right)}, \cdots,X_{\left( n \right)}}\left( x_{1},x_{2}, \cdots, x_{n} \right)\\
=& n!f(x_{1})f(x_{2})\cdots f(x_{n}),    - \infty < x_{1} < x_{2} < \cdots < x_{n} < \infty
\end{aligned}\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\section{Multinomial Distributions}

Consider an experiment with $k$ possible outcomes $\omega_{1}, \omega_{2}, \cdots,\omega_{k}$. Let $A_{i} = \left\{ \omega_{i} \right\}$ be the event that the outcome is $\omega_{i}$ and let $P_{i} = \mathbb{P}(A_{i}), i = 1,2, \cdots,k$. Suppose that such an experiment is independently and successively performed $n$ times. Let $X_{i}, i = 1,2, \cdots,k$ be the number of times that event $A_{i}$ occurs. Then
\[\begin{aligned}
&P_{X_{1},X_{2}, \cdots,X_{k}}\left( x_{1},x_{2},\, \cdots,x_{k} \right) \\
=& \mathbb{P}( X_{1} = x_{1},X_{2} = x_{2}, \cdots,X_{k} = x_{k} )\\
=& \frac{n!}{x_{1}!x_{2}!\cdots x_{k}!}P_{1}^{x_{1}}P_{2}^{x_{2}}\cdots P_{k}^{x_{k}}, x_{1},x_{2}, \cdots, x_{k} \gs 0 ~\text{and}~ \sum_{i=1}^k x_i=n.
\end{aligned}\]

\begin{definition}[Multinomial Joint R.V.'s]
Let $X_{1}, X_{2}, \cdots,X_{k}$ be discrete r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We call $X_{1}, X_{2}, \cdots,X_{k}$ multinomial joint r.v.'s with parameters $n,P_{1},P_{2}, \cdots,P_{k}$, where $n \gs 1, P_{1},P_{2}, \cdots,P_{k} \gs 0, P_{1} + P_{2} + \cdots + P_{k} = 1$, if the joint p.m.f. is given by
\[P_{\mX}(\mx) = \left\{ \begin{aligned}
\frac{n!}{x_{1}!x_{2}!\cdots x_{k}!} & P_{1}^{x_{1}}P_{2}^{x_{2}}\cdots P_{k}^{x_{k}}, \mathrm{ }x_{1},x_{2}, \cdots,x_{k} \gs 0 ~\text{and}~ \sum_{i=1}^k x_i=n \\
&0, \qquad\qquad~\text{o.w.}~\\
\end{aligned} \right.\]
\end{definition}

\begin{remark}[Verification of P.M.F.]
$P_{\mX}(\mx) \gs 0, \forall\mx \in \mathbb{R}^{k}$ and
\[\sum_{\mbox{\tiny$\begin{aligned}
x_{1},x_{2},\,\cdots, x_{k} \gs 0 \\
x_{1} + x_{2} + \cdots + x_{k} = n \\
\end{aligned}$}}^{}{\frac{n!}{x_{1}!x_{2}!\cdots x_{k}!}P_{1}^{x_{1}}P_{2}^{x_{2}}\cdots P_{k}^{x_{k}}} = \left( P_{1} + P_{2} + \cdots + P_{k} \right)^{n} = 1\]
$\to P_{\mX}(\mx)$ is a p.m.f.
\end{remark}

\begin{theorem}[Splitting of Multinomial Joint R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{l}$ are multinomial joint r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, with parameters $n,P_{1},P_{2}, \cdots,P_{l}$,
where $n \gs 1, P_{1},P_{2}, \cdots,P_{l} \gs 0, P_{1} + P_{2} + \cdots + P_{l} = 1$. Then
\[X_{i_{1}},X_{i_{2}},\cdots,X_{i_{k}}, n - X_{i_{1}} - X_{i_{2}} - \cdots - X_{i_{k}}\]
are multinomial joint r.v.'s with parameters \[n,P_{i_{1}},P_{i_{2}}, \cdots,P_{i_{k}}, 1 - P_{i_{1}} - P_{i_{2}} - \cdots - P_{i_{k}}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{More Expectations and Variance}

\section{Expected Values of Sums of R.V.'s}

\begin{theorem}[Expectations of Sum of Finite R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, then
\[\mathbb{E}\left[ \sum_{i = 1}^{n}X_{i} \right] = \sum_{i = 1}^{n}{\mathbb{E}[ X_{i}]}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Expectations of Sum of Infinite R.V.'s]
Suppose $X_{1}, X_{2}, \cdots$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. If \[\sum_{i = 1}^{\infty}{\mathbb{E}\left[ \left| X_{i} \right| \right]} < \infty\] or if
$X_{i}$ is nonnegative for all $i = 1,2, \cdots,$ then
\[\mathbb{E}\left[ \sum_{i = 1}^{\infty}X_{i} \right] = \sum_{i = 1}^{\infty}{\mathbb{E}[ X_{i}]}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[General Expectations of Sum of Infinite R.V.'s]
In general,
\[\mathbb{E}\left[ \sum_{i = 1}^{\infty}X_{i} \right] \neq \sum_{i = 1}^{\infty}{\mathbb{E}[ X_{i}]}.\]
\end{remark}

\begin{corollary}[Expectation of Integer-Valued R.V.]
Suppose $X$ is an integer-valued r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, then
\[\mathbb{E}[X] = \sum_{i = 1}^{\infty}{\mathbb{P}(X \gs i)} - \sum_{i = 1}^{\infty}{\mathbb{P}( X \ls - i )}.\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\section{Covariance and Correlation Coefficients}

\begin{theorem}[Cauchy-Schwarz Inequality]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and suppose $\mathbb{E}[ X^{2}]$ and $\mathbb{E}[ Y^{2}]$
exist. Then
\[\left| \mathbb{E}[XY] \right| \ls \sqrt{\mathbb{E}\left[ X^{2} \right] \cdot \mathbb{E}\left[ Y^{2} \right]}.\]
``$=$''  $\Leftrightarrow X = 0$ with probability 1 or $Y = 0$ with probability 1 or $Y = aX$ with probability 1, where
\[a = \frac{\mathbb{E}[XY]}{\mathbb{E}\left[ X^{2} \right]}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Cauchy-Schwarz Equalities]
Suppose that $\mathbb{E}[ X^{2}] \neq 0$ and $\mathbb{E}[ Y^{2}] \neq 0$, then

\[\mathbb{E}[XY]= \sqrt{\mathbb{E}\left[ X^{2} \right] \cdot \mathbb{E}\left[ Y^{2} \right]}  \Leftrightarrow Y = aX\]
with probability 1, where

\[a = \frac{\mathbb{E}[XY]}{\mathbb{E}\left[ X^{2} \right]} = \sqrt{\frac{\mathbb{E}\left[ Y^{2} \right]}{\mathbb{E}\left[ X^{2} \right]}} > 0.\]

\[\mathbb{E}[XY]= - \sqrt{\mathbb{E}\left[ X^{2} \right] \cdot \mathbb{E}\left[ Y^{2} \right]}  \Leftrightarrow Y = aX\]
with probability 1, where

\[a = \frac{\mathbb{E}[XY]}{\mathbb{E}\left[ X^{2} \right]} = - \sqrt{\frac{\mathbb{E}\left[ Y^{2} \right]}{\mathbb{E}\left[ X^{2} \right]}} < 0.\]

\end{remark}

\begin{corollary}[Variance Larger Than or Equal to Zero]
Suppose $X$ is a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$ and suppose
$\mathbb{E}[ X^{2}]$ exists, then
\[\left| \mathbb{E}[X] \right|^{2} \ls \mathbb{E}\left[ X^{2} \right].\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{definition}[Covariance]
Let $X$ and $Y$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ with means $\mu_{X}$ and $\mu_{Y}$, resp. The covariance $\mathrm{Cov}(X,Y)$ (or $\sigma_{X,Y}$) of $X$ and $Y$ is given by
\[\mathrm{Cov}(X,Y) = \sigma_{X,Y} = \mathbb{E}\left[ \left( X - \mu_{X} \right)\left( Y - \mu_{Y} \right) \right].\]
We say that $X$ and $Y$ are positively correlated, negatively correlated and uncorrelated if
$\mathrm{Cov}(X,Y) > 0, \mathrm{Cov}(X,Y) < 0$ and
$\mathrm{Cov}(X,Y) = 0$, resp.
\end{definition}

\begin{remark}[Covariance of Linear Combination of Two R.V.'s]
(1)
$\mathrm{Var}(X) = \mathbb{E}[(X-\mu_{X})^{2}]$ is a measure of the spread or dispersion of $X$.

$\mathrm{Var}(Y) = \mathbb{E}[(Y-\mu_{Y})^{2}]$ is a measure of the spread or dispersion of $Y$.

$\mathrm{Cov}(X,Y) = \sigma_{X,Y} = \mathbb{E}[( X - \mu_{X})(Y-\mu_{Y})]$ is a measure of the joint spread or dispersion of $X$ and $Y$.

(2)
\[\begin{aligned}
\mathrm{Var}(aX + bY) &= \mathbb{E}[ [ ( aX + bY ) - ( a\mu_{X} + b\mu_{Y} ) ]^{2} ]\\
&= \mathbb{E}[[ a( X - \mu_{X} ) + b( Y - \mu_{Y} ) ]^{2}]\\
& = a^{2}\mathrm{Var}(X) + b^{2}\mathrm{Var}(Y) + 2ab\mathrm{Cov}(X,Y)
\end{aligned}\]
is a measure of the spread or dispersion along the $(ax + by)$-direction.

\end{remark}

\begin{theorem}[Calculating Covariance]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$.

(1) $\mathrm{Var}(X) = \mathrm{Cov}(X,X)$.

(2)
$\mathrm{Cov}(X,Y) = \mathrm{Cov}(Y,X) = \mathbb{E}[XY] - \mathbb{E}[X] \mathbb{E}[Y].$

(3)
$\left| \mathrm{Cov}(X,Y) \right| \ls \sigma_{X}\cdot \sigma_{Y}, \text{``=''} \Leftrightarrow X = \mu_{X}$ with probability 1 or
$Y = \mu_{Y}$ with probability 1 or $Y=aX+b$ with probability 1, where
\[a = \frac{\sigma_{X,Y}}{\sigma_{X}^{2}}, b = \mu_{Y} - \mu_{X} \cdot \frac{\sigma_{X,Y}}{\sigma_{X}^{2}}.\]

If $\sigma_{X} \neq 0$ and $\sigma_{Y} \neq 0$, then
\[\mathrm{Cov}( X,Y)= \sigma_{X}\cdot \sigma_{Y} \Leftrightarrow Y = aX + b\]
with probability 1, where
\[a = \frac{\sigma_{Y}}{\sigma_{X}} > 0, b = \mu_{Y} - \mu_{X} \cdot \frac{\sigma_{Y}}{\sigma_{X}}.\]
\[\mathrm{Cov}( X,Y)=- \sigma_{X}\cdot \sigma_{Y} \Leftrightarrow Y = aX + b\]
with probability 1, where
\[a = - \frac{\sigma_{Y}}{\sigma_{X}} < 0, b = \mu_{Y} + \mu_{X} \cdot \frac{\sigma_{Y}}{\sigma_{X}}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Covariance of Two Linear Combined R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}, Y_{1}, Y_{2}, \cdots,Y_{m}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$.

(1)
\[\mathrm{Cov}\left( \sum_{i = 1}^{n}{a_{i}X_{i}}, \sum_{j = 1}^{m}{b_{j}Y_{j}} \right) = \sum_{i = 1}^{n}{\sum_{j = 1}^{m}a_{i}b_{j}}\mathrm{Cov}\left( X_{i},Y_{j} \right).\]
(2)
\[\mathrm{Var}\left( \sum_{i = 1}^{n}{a_{i}X_{i}} \right) = \sum_{i = 1}^{n}{a_{i}^{2}\mathrm{Var}(X_{i})} + 2\sum_{1 \ls i < j \ls n}^{}a_{i}a_{j}\mathrm{Cov}\left( X_{i},X_{j} \right).\]

In particular, if $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{pairwise
uncorrelated}, then
\[\mathrm{Var}\left( \sum_{i = 1}^{n}{a_{i}X_{i}} \right) = \sum_{i = 1}^{n}{a_{i}^{2}\mathrm{Var}(X_{i})}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independence Implies Uncorrelated]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. If $X\bot Y$, then $X$ and $Y$ are uncorrelated, i.e.,
\[\mathrm{Cov}(X,Y) = \mathbb{E}[XY] - \mathbb{E}[X] \mathbb{E}[Y] = \mathbb{E}[X] \mathbb{E}[Y] - \mathbb{E}[X] \mathbb{E}[Y] = 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Uncorrelated Can't Imply Independence]
The converse is not true, i.e.,
\[\mathrm{Cov}(X,Y) = 0 \nRightarrow X\bot Y.\]
\end{remark}

\begin{definition}[Correlation Coefficient]
Let $X$ and $Y$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$ with
$0 < \sigma_{X}^{2} < \infty,0 < \sigma_{Y}^{2} < \infty$. The correlation coefficient between $X$ and $Y$ is given by
\[\rho_{X,Y} = \mathrm{Cov}\left( X^{*},Y^{*} \right) = \mathrm{Cov}\left( \frac{X - \mu_{X}}{\sigma_{X}},\frac{Y - \mu_{Y}}{\sigma_{Y}} \right) = \frac{\sigma_{X,Y}}{\sigma_{X}\sigma_{Y}}.\]
\end{definition}

\begin{remark}[Properties of Correlation Coefficient]

(1) $X^{*} = \frac{X - \mu_{X}}{\sigma_{X}}$ is independent of the units in which $X$ is measured. $\to \rho_{X,Y}$ is \textbf{independent of the units} in which $X$ and $Y$ are measured.

(2) $- 1 \ls \rho_{X,Y} \ls 1.$

$\rho_{X,Y} = 1 \Leftrightarrow Y = aX + b$ with probability 1, where
\[a = \frac{\sigma_{Y}}{\sigma_{X}} > 0, b = \mu_{Y} - \mu_{X} \cdot \frac{\sigma_{Y}}{\sigma_{X}}.\]

$\rho_{X,Y} =- 1 \Leftrightarrow Y = aX + b$ with probability 1, where
\[a = - \frac{\sigma_{Y}}{\sigma_{X}} < 0, b = \mu_{Y} + \mu_{X} \cdot \frac{\sigma_{Y}}{\sigma_{X}}.\]
\end{remark}

\section{Conditioning on R.V.'s}

\begin{definition}[Conditional Expectation on R.V.'s]
Let $X$ and $Y$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$.\\
 Let $g(y) = \mathbb{E}[X | Y = y], \forall y \in \mathbb{R}$. We denote by $\mathbb{E}[ X|Y]$ the r.v. $g(Y)$. Note that $\mathbb{E}[ X|Y]$ is a function of $Y$.
\end{definition}

\begin{theorem}[Marginal Expectation]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then
\[\mathbb{E}\left[ \mathbb{E}\left[ X | Y \right] \right] = \mathbb{E}[X].\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Marginal Expectation of Measurable Function]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then
\[\mathbb{E}\left[ X \cdot g(Y) | Y \right] = g( Y )\mathbb{E}\left[ X|Y \right].\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Wald's Equations]
Suppose $X_{1}, X_{2}, \cdots$ are i.i.d. r.v.'s $\sim X$ and $N$
is a positive integer-valued r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, and $N\bot\{ X_{1}, X_{2}, \cdots\}$.

(1) If $\mathbb{E}[X] < \infty$ and
$\mathbb{E}\left[ N \right] < \infty$, then
\[\mathbb{E}\left[ \sum_{i = 1}^{N}X_{i} \right] = \mathbb{E}\left[ N \right] \cdot \mathbb{E}[X].\]

(2) If $\mathrm{Var}(X) < \infty$ and $\mathrm{Var}(N) < \infty$, then
\[\mathrm{Var}\left( \sum_{i = 1}^{N}X_{i} \right) = \mathbb{E}[N] \cdot \mathrm{Var}(X) + \left( \mathbb{E}[X] \right)^{2} \cdot \mathrm{Var}(N).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Law of Total Probability]
Suppose $A$ is an event and $X$ is a r.v. of a probability space $(\Omega,\ma,\mathbb{P})$, then
\[\mathbb{P}(A) = \left\{ \begin{aligned}
&\sum_{x \in X(\Omega)}^{}{\mathbb{P}(A|X = x) \cdot P_{X}(x)},\qquad \text{if}~X~\text{is a discrete r.v.}\\
&\int_{- \infty}^{\infty} \mathbb{P}( A | X = x ) \cdot f_{X}( x )\dif x,\qquad\text{if}~X~\text{is a continuous r.v.} \\
\end{aligned} \right.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Conditional Variance on R.V.'s]
Suppose $X$ and $Y$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, then
\[\mathrm{Var}(X) = \mathbb{E}[\mathrm{Var}(X|Y)] + \mathrm{Var}(\mathbb{E}[X | Y]).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Bivariate Normal (Gaussian) Distribution}

\begin{definition}[Bivariate Normal (Gaussian) R.V.'s]
Let $X_{1}$ and $X_{2}$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We call $X_{1}$ and $X_{2}$ jointly normal (Gaussian) r.v.'s with parameters
\[\bm{\mu}=\binom{\mu_{1}}{\mu_{2}}\]
and
\[\bm{\Sigma} = \begin{pmatrix}
\sigma_{11} & \sigma_{12} \\
\sigma_{21} & \sigma_{22} \\
\end{pmatrix} > 0,\]
where ``$>0$'' means positive definite, denoted \[\mX\sim \mathcal{N}(\bm{\mu},\bm{\Sigma}),\] if their joint p.d.f. is given by
\[\begin{aligned}
&f_{\mX}(\mx) = \frac{1}{\sqrt{{(2\pi)}^{2}\left|\bm{\Sigma}\right|}}\exp\left[ - \frac{1}{2}\left(\mx -\bm{\mu} \right)^{\top}\bm{\Sigma}^{-1}\left( \mx-\bm{\mu} \right) \right]\\
&= \frac{1}{\sqrt{{(2\pi)}^{2}\left|\bm{\Sigma}\right|}}\exp\left[ - \frac{1}{2}\left( x_{1} - \mu_{1}, x_{2} - \mu_{2} \right)\frac{1}{\left| \bm{\Sigma} \right|}\begin{pmatrix}
\sigma_{22} & {- \sigma}_{12} \\
  - \sigma_{12} & \sigma_{11} \\
\end{pmatrix} \binom{x_{1} - \mu_{1}}{x_{2} - \mu_{2}} \right]\\
&= \frac{1}{\sqrt{{(2\pi)}^{2}\left|\bm{\Sigma}\right|}}\exp\left(\bm{\Sigma}_{}^* \right)
\end{aligned}\]
where
\[\left|\bm{\Sigma}\right| = \det\left( \bm{\Sigma} \right) = \sigma_{11} \cdot \sigma_{22} - \sigma_{12}^{2} > 0,\]
\[\bm{\Sigma}_{}^*=- \frac{1}{2\left| \bm{\Sigma} \right|}\left[ \sigma_{22}\left( x_{1} - \mu_{1} \right)^{2} - 2\sigma_{12}\left( x_{1} - \mu_{1} \right)\left( x_{2} - \mu_{2} \right) + \sigma_{11}\left( x_{2} - \mu_{2} \right)^{2} \right].\]
Such a joint p.d.f. is called a bivariate normal p.d.f. with parameters
$\bm{\mu}$ and $\bm{\Sigma}$.
\end{definition}

\begin{theorem}[Explicitly Normal (Gaussian) R.V.]
Suppose $X_{1}$ and $X_{2}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$, and suppose $\mX\sim \mathcal{N}\left(\bm{\mu},\bm{\Sigma} \right)$.

(1) $X_{1}\sim \mathcal{N}\left( \mu_{1},\sigma_{11} \right)$ and
$X_{2}\sim \mathcal{N}\left( \mu_{2},\sigma_{22} \right)$. Therefore
\[\mu_{1} = \mu_{X_{1}}, \sigma_{11} = \sigma_{X_{1}}^{2} := \sigma_{1}^{2}, \mu_{2} = \mu_{X_{2}}, \sigma_{22} = \sigma_{X_{2}}^{2} := \sigma_{2}^{2}.\]
(2)
\[X_{2}|_{X_{1} = x_{1}}\sim \mathcal{N}\left( \mu_{2} + \frac{\sigma_{12}}{\sigma_{11}}\left( x_{1} - \mu_{1} \right), \frac{\left| \bm{\Sigma} \right|}{\sigma_{11}} \right)\]
and
\[X_{1}|_{X_{2} = x_{2}}\sim \mathcal{N}\left( \mu_{1} + \frac{\sigma_{12}}{\sigma_{22}}\left( x_{2} - \mu_{2} \right), \frac{\left|\bm{\Sigma}\right|}{\sigma_{22}} \right).\]
(3)
$\sigma_{12} = \sigma_{X_{1},X_{2}} = \rho_{X_{1},X_{2}} \cdot \sigma_{X_{1}}\sigma_{X_{2}} := \rho \cdot \sigma_{1}\sigma_{2}$. Therefore
\[X_{2}|_{X_{1} = x_{1}}\sim \mathcal{N}\left( \mu_{2} + \rho\cdot\frac{\sigma_{2}}{\sigma_{1}}\left( x_{1} - \mu_{1} \right),\left(1 - \rho^{2}\right)\sigma_{2}^{2} \right)\]
and
\[X_{1}|_{X_{2} = x_{2}}\sim \mathcal{N}\left( \mu_{1} + \rho\cdot\frac{\sigma_{1}}{\sigma_{2}}\left( x_{2} - \mu_{2} \right),\left(1 - \rho^{2}\right)\sigma_{1}^{2} \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Mean Vector and Covariance Matrix]
$\dis\bm{\mu}= \binom{\mu_{1}}{\mu_{2}}$ is called the mean vector of $\mX$, and $\dis\bm{\Sigma}_{}^{} = \begin{pmatrix}
\sigma_{11} & \sigma_{12} \\
\sigma_{21} & \sigma_{22} \\
\end{pmatrix}$ is called the covariance matrix of $\mX$.
\end{remark}

\begin{lemma}[Linear Conditional Expectation and Constant Variance]
Suppose $X_{1}$ and $X_{2}$ are jointly continuous r.v.'s of a probability
space $(\Omega,\ma,\mathbb{P})$ with $\mu_{X_{1}} = \mu_{1},  \mu_{X_{2}} = \mu_{2}, \sigma_{X_{1}}^{2} = \sigma_{1}^{2}, \sigma_{X_{2}}^{2} = \sigma_{2}^{2}, \rho_{X_{1},X_{2}} = \rho$.

(1) If
$\mathbb{E}[ X_{2} | X_{1} = x_{1}] = ax_{1} + b$
is a linear function in $x_{1}$, then
\[\mathbb{E}[ X_{2} | X_{1} = x_{1}] = \mu_{2} + \rho\cdot\frac{\sigma_{2}}{\sigma_{1}}\left( x_{1} - \mu_{1} \right).\]
(2) If
$\mathbb{E}[ X_{2} | X_{1} = x_{1}]= ax_{1} + b$
is a linear function in $x_{1}$, and
$\mathrm{Var}( X_{2}| X_{1} = x_{1} ) = \sigma^{2}$
is a constant, then
\[\mathrm{Var}( X_{2} | X_{1} = x_{1} ) = \left( 1 - \rho^{2} \right)\sigma_{2}^{2}.\]
\end{lemma}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Derivation of Jointly Normal R.V.'s]
Suppose $X_{1}$ and $X_{2}$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Suppose

(1) $X_{1}$ is a normal r.v.

(2) $X_{2}|X_{1} = x_{1}$ is a normal r.v. for all
$x_{1} \in \mathbb{R}$.

(3) $\mathbb{E}[ X_{2} | X_{1} = x_{1}]$ is a
linear function in $x_{1}$, and
$\mathrm{Var}( X_{2}| X_{1} = x_{1}) = \sigma^{2}$
is a constant.

Then $X_{1}$ and $X_{2}$ are \textbf{jointly normal} r.v.'s.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Independence mutually Implies Uncorrelated]
Suppose $X_{1}$ and $X_{2}$ are jointly normal r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. Then $X_{1}$ and $X_{2}$ are independent 
$\Leftrightarrow$ $X_{1}$ and $X_{2}$ are uncorrelated.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Linearly Generated Normal R.V.]
Suppose $\mX\sim \mathcal{N}\left( {\bm{\mu}}_{\mX},\bm{\Sigma}_{\mX} \right)$ and $\mY= \bm{A}\mX + b$, where $\bm{A}$ is \textbf{nonsingular}, i.e., $|\bm{A}| \neq 0$.
Then
\[\mY\sim \mathcal{N}\left( \bm{A}{\bm{\mu}}_{\mX} + b,\bm{A}\bm{\Sigma}_{\mX}\bm{A}^{\top} \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\chapter{Sums of Independent R.V.'s and Limit Theorems}

\section{Moment Generating Functions}

\begin{definition}[Moment Generating Function]
The moment generating function (m.g.f.) $M_{X}(t)$ of a r.v. $X$ is
given by
$M_{X}(t) = \mathbb{E}[ e^{tX}]$
if $\exists\delta > 0 \to M_{X}(t)$ is defined
for all $t \in ( - \delta,\delta)$.
\end{definition}

\begin{theorem}[Moment Generation]
(1)
$\mathbb{E}[X^{n}] = M_{X}^{\left( n \right)}(0), \forall n \gs 0.$

(2) Maclaurin's series for $M_{X}(t)$:
\[M_{X}(t) = \sum_{n = 0}^{\infty}\frac{M_{X}^{\left( n \right)}(0)}{n!}t^{n} = \sum_{n = 0}^{\infty}\frac{\mathbb{E}[X^{n}]}{n!}t^{n}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Sufficient Condition for $n^{th}$ Moment to Converge]
If $\left| M_{X}(t) \right| < \infty$ for all $t \in ( - \delta,\delta)$ for some $\delta > 0$,
then $\left| \mathbb{E}[X^{n}] \right| < \infty$ for
all $n \gs 1$. But the converse is not true.
\end{remark}

\begin{theorem}[Same M.G.F. Implies Same C.D.F.]
If $M_{X}(t) = M_{Y}(t)$ for all
$t \in ( - \delta,\delta)$ for some $\delta > 0$, then the c.d.f. of $X$ and $Y$ are the same.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Sums of Independent R.V.'s}

\begin{theorem}[M.G.F. of Sums of Independent R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{independent} r.v.'s
with m.g.f.'s
\[M_{X_{1}}(t),M_{X_{2}}(t), \cdots,M_{X_{n}}(t)\] respectively. Then the m.g.f. of their \textbf{sum}
$X = X_{1} +  X_{2} + \cdots +  X_{n}$ is
\[M_{X}(t) = \prod_{i = 1}^{n}{M_{X_{i}}(t)}.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[M.G.F. of Sums of Normal R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{independent} r.v.'s
and $X_{i}\sim \mathcal{N}\left( \mu_{i}, \sigma_{i}^{2} \right), \forall i = 1,2, \cdots,n$
and suppose $a_{1},a_{2}, \cdots,a_n\in\mathbb{R}$.
If 
\[ X = \sum_{i = 1}^{n}{a_{i}X}_{i},\]
then
\[ X\sim \mathcal{N}\left( \sum_{i = 1}^{n}{a_{i}\mu_{i}},\sum_{i = 1}^{n}{a_{i}^{2}\sigma_{i}^{2}} \right).\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[M.G.F. of Sums of I.I.D. Normal R.V.'s]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{i.i.d.} 
$\sim \mathcal{N}\left( \mu,\sigma^{2} \right)$, then
\[S_{n} = \sum_{i = 1}^{n}X_{i}\sim \mathcal{N}\left( n\mu,n\sigma^{2} \right),~\text{and}~\bar{X} = \frac{S_{n}}{n}\sim \mathcal{N}\left( \mu,\frac{\sigma^{2}}{n} \right).\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\section{Concentration Inequalities}

\begin{theorem}[Markov's Inequality]
  Suppose $X$ is a nonnegative r.v., then
  \begin{equation}
    \mathbb{P}(X \gs t) \ls \frac{\mathbb{E}[X]}{t}, \forall t > 0.
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Chebyshev's Inequality]
  \[\mathbb{P}( \left| X - \mu_{X} \right| \gs t ) \ls \frac{\sigma_{X}^{2}}{t^{2}}, \forall t > 0.\]
  In particular,
  \[\mathbb{P}(\left| X - \mu_{X} \right| \gs k \cdot \sigma_{X}) \ls \frac{1}{k^{2}}, \forall k > 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Not Tight Bounds]
  The bounds obtained by Markov and Chebyshev inequalities are usually \textbf{not very tight}.
\end{remark}

\begin{theorem}[Zero Absolute Moment]
  \begin{equation}
    \mathbb{E}[\left| X \right|] = 0  \Leftrightarrow X = 0 ~\text{with probability}~ 1.
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Zero Variance]
  \[\mathrm{Var}(X) = 0  \Leftrightarrow X = \mu_{X} ~\text{with probability}~ 1.\]
\end{corollary}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Chebyshev's Inequality for I.I.D. R.V.'s]
  Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are \textbf{i.i.d.} r.v.'s with
  mean $\mu$ and variance $\sigma^{2} < \infty$. Let 
  \[\bar{X} = \frac{1}{n}\sum_{i = 1}^{n}X_{i}\]
  be the sample mean of $ X_{1}, X_{2}, \cdots,X_{n}.$
  Then
  \[\mathbb{P}\left( \left| \bar{X} - \mu \right| \gs \epsilon \right) \ls \frac{\sigma^{2}}{n\epsilon^{2}}, \forall\epsilon > 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Chebyshev's Inequality for I.I.D. Bernoulli R.V.'s]
  Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are i.i.d. $\sim$ Bernoulli$(p)$. Let 
  \[\bar{X} = \frac{1}{n}\sum_{i = 1}^{n}X_{i}\]
  be the sample mean of $ X_{1}, X_{2}, \cdots,X_{n}.$
  Then
  \[\mathbb{P}\left( \left| \bar{X} - p \right| \gs \epsilon \right) \ls \frac{p(1 - p)}{n\epsilon^{2}} \ls \frac{1}{4n\epsilon^{2}}, \forall\epsilon > 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Cantelli's Inequality]
  Let $X$ be a real random variable with variance $\sigma^2<\infty$. Then
  \begin{equation}
    \mathbb{P}(X-\mathbb{E}[X]\geqslant t)\leqslant\frac{\sigma^2}{\sigma^2+t^2},
  \end{equation}
  and
  \begin{equation}
    \mathbb{P}(|X-\mathbb{E}[X]|\geqslant t)\leqslant2\frac{\sigma^2}{\sigma^2+t^2}.
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Hoeffding's inequality]
  Let $X_1,\dots,X_n$ be independent random variables with $X_i$ taking values in $[a_i, b_i]$ for all $i=1,\dots,n$. Let $S_n=\sum_{i=1}^{n}X_i$, then for any $\epsilon>0$,
  \begin{equation}
    \mathbb{P}(S_n-\mathbb{E}[S_n]\geqslant\epsilon)\leqslant \exp\left(-\frac{2\epsilon^2}{\sum_{i=1}^{n}(b_i-a_i)^2}\right),
  \end{equation}
  and
  \begin{equation}
    \mathbb{P}(S_n-\mathbb{E}[S_n]\leqslant-\epsilon)\leqslant \exp\left(-\frac{2\epsilon^2}{\sum_{i=1}^{n}(b_i-a_i)^2}\right).
  \end{equation}
  Moreover, let $\bar{X}=\frac{1}{n}S_n$. Applying the above bounds to the random variables $X_i/n$, which take values in $[a_i/n, b_i/n]$, we have
  \begin{equation}
    \mathbb{P}(\bar{X}-\mathbb{E}[\bar{X}]\geqslant\epsilon)\leqslant \exp\left(-\frac{2n^2\epsilon^2}{\sum_{i=1}^{n}(b_i-a_i)^2}\right),
  \end{equation}
  and
  \begin{equation}
    \mathbb{P}(\bar{X}-\mathbb{E}[\bar{X}]\leqslant-\epsilon)\leqslant \exp\left(-\frac{2n^2\epsilon^2}{\sum_{i=1}^{n}(b_i-a_i)^2}\right).
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Sanov's theorem]
  Let $X_1,\dots,X_n$ be independent random variables drawn according to some distribution with mean $p$ and support included in $[0, 1]$. Let $\hat{p}=\frac{1}{n}\sum_{i=1}^{n}X_i$, then for any $q\in[0, 1]$,
  \begin{equation}
    \mathbb{P}(\hat{p}\geqslant q)\leqslant\exp(-nD(q\|p)),
  \end{equation}
  where $D(q||p)=q\log\frac{q}{p}+(1-q)\log\frac{1-q}{1-p}$ is the binary relative entropy of $p$ and $q$.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Multiplicative Chernoff Bounds]
  Let $X_1,\dots,X_n$ be independent random variables drawn according to some distribution with mean $p$ and support included in $[0, 1]$. Let $\hat{p}=\frac{1}{n}\sum_{i=1}^{n}X_i$, then for any $\gamma\in[0, \frac{1}{p}-1]$,
  \begin{equation}
    \mathbb{P}(\hat{p}\geqslant(1+\gamma)p)\leqslant\exp\left(-\frac{np\gamma^2}{3}\right),
  \end{equation}
  and
  \begin{equation}
    \mathbb{P}(\hat{p}\leqslant(1-\gamma)p)\leqslant\exp\left(-\frac{np\gamma^2}{2}\right).
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Paley-Zygmund Inequality]
  Let $X$ be a non-negative random variable with finite variance. Then
  \begin{equation}
    \mathbb{P}(X>t\mathbb{E}[X])\geqslant(1-t)^2\frac{\mathbb{E}^2[X]}{\mathbb{E}[X^2]},\quad t\in[0,1].
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

% https://zhuanlan.zhihu.com/p/538654580
\begin{theorem}[Kolmogorov's (Maximal) Inequality]
  Suppose $X_1,\dots,X_n$ are independent random variables with zero means and finite variances. Let $S_k:=\sum_{i=1}^{k}X_i$, then
  \begin{equation}
    \mathbb{P}\left(\max_{1\leqslant k\leqslant n}|S_k|\geqslant t\right)\leqslant\frac{1}{t^2}\mathrm{Var}(S_n)=\frac{1}{t^2}\sum_{i=1}^{n}\mathbb{E}[X_i^2],\quad t>0.
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

% Foundations of Machine Learning, Mehryar Mohri, Afshin Rostamizadeh, and Ameet Talwalkar, MIT Press, Second Edition, 2018.
\begin{theorem}[Maximal Inequality]
  Let $X_1,\dots,X_n$ be real-valued random variables such that for all $j=1,\dots,n$ and $t>0$, $\mathbb{E}[e^{tX_j}]\leqslant e^{\frac{t^2r^2}{2}}$ for some $r>0$. Then
  \begin{equation}
    \mathbb{E}\left[\max_{j\in\{1,\dots,n\}}X_j\right]\leqslant r\sqrt{2\log n}.
  \end{equation}
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{corollary}[Maximal inequality]
  Let $X_1,\dots,X_n$ be real-valued random variables such that for all $j=1,\dots,n$, $X_j=\sum_{i=1}^{m}Y_{ij}$ where, for each fixed $j$, $Y_{ij}$ are independent zero mean random variables taking values in $[-r_i, +r_i]$, for some $r_i>0$. Then
  \begin{equation}
    \mathbb{E}\left[\max_{j\in\{1,\dots,n\}}X_j\right]\leqslant r\sqrt{2\log n},
  \end{equation}
  where $r=\sqrt{\sum_{i=1}^{m}r_i^2}$.
\end{corollary}

\begin{proof}
  abc
\end{proof}

\section{Laws of Large Numbers (LLN's)}

\begin{definition}[Converge in Probability]
Let $X,X_{1}, X_{2},\cdots$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We say that $X_{n}$ converges to $X$ \textbf{in probability}, denoted
\[X_{n}\xrightarrow{ P }X,\]
if
\[\lim_{n \to \infty}{\mathbb{P}(\left| X_{n} - X \right| < \epsilon)} = 1, \forall\epsilon > 0,\]
or
\[\lim_{n \to \infty}{\mathbb{P}(\left| X_{n} - X \right| > \epsilon)} = 0, \forall\epsilon > 0.\]
\end{definition}

\begin{theorem}[Weak Law of Large Numbers (WLLN)]
Suppose $X_{1}, X_{2}, \cdots$ are i.i.d. r.v.'s with mean $\mu$ and variance $\sigma^{2} < \infty$. Then
\[\overline{X_{n}} = \frac{1}{n}\sum_{i = 1}^{n}X_{i}\xrightarrow{ P }\mu,\]
i.e.,
\[ \lim_{n \to \infty}{\mathbb{P}\left( \left| \overline{X_{n}} - \mu \right| > \epsilon \right)} = 0, \forall\epsilon > 0.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Relative Frequency Converges to Probability in Probability]
Let an experiment be repeated independently and let $n(A)$ be the
number of times an event $A$ occurs in the first $n$ repetitions of
the experiment. Let

\[X_{i} = \left\{ \begin{aligned}
&1,~\text{if } A \text{ occurs on the } i\text{th repetition}, \\
&0,~\text{o.w.}~ \\
\end{aligned} \right.\]
Then
\[\begin{aligned}
 &n(A) = \sum_{i = 1}^{n}X_{i}~\text{and}~\mathbb{E}[X_{i}]= 1 \cdot \mathbb{P}(A) + 0 \cdot \mathbb{P}(A^{c}) = \mathbb{P}(A).\\
\to& \lim_{n \to \infty}{\mathbb{P}\left( \left| \frac{n(A)}{n} - \mathbb{P}(A) \right| > \epsilon \right)} = \lim_{n \to \infty}{\mathbb{P}\left( \left| \frac{1}{n}\sum_{i = 1}^{n}X_{i} - \mathbb{P}(A) \right| > \epsilon \right)} = 0.
\end{aligned}\]
Therefore, the relative frequency $\frac{n(A)}{n}$ of occurrence of $A$ is very likely close to $\mathbb{P}(A)$ if $n$ is sufficiently large.
\end{remark}

\begin{definition}[Converge Almost Surely]
Let $X,X_{1}, X_{2},\cdots$ be r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. We say that $X_{n}$ converges to $X$ \textbf{almost surely} (a.s.),
denoted
\[X_{n}\xrightarrow{ \text{a.s.} }X,\]
if \[\mathbb{P}\left( \lim_{n \to \infty}X_{n} = X \right) = 1.\]
\end{definition}

\begin{theorem}[Strong Law of Large Numbers (SLLN)]
Suppose $X_{1}, X_{2}, \cdots$ are i.i.d. r.v.'s with mean $\mu$. Then

\[\overline{X_{n}} = \frac{1}{n}\sum_{i = 1}^{n}X_{i}\xrightarrow{ \text{a.s.} }\mu\]
i.e.,
\[\mathbb{P}\left( \lim_{n \to \infty}\overline{X_{n}} = \mu \right) = 1.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{remark}[Relative Frequency Converges Almost Surely]
\[\mathbb{P}\left( \lim_{n \to \infty}\frac{n( A )}{n} = \mathbb{P}(A) \right) = 1  \to  \lim_{n \to \infty}\frac{n( A )}{n} = \mathbb{P}( A ) ~\text{with probability}~ 1.\]
\end{remark}

\begin{theorem}[Converge Almost Surely Implies Convergence in Probability]
\[\text{If}~X_{n}\xrightarrow{\text{a.s.}}X,~\text{then}~X_{n}\xrightarrow{ P }X.\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

\section{Central Limit Theorem (CLT)}

\begin{theorem}[Levy Continuity Theorem]
Suppose $X,X_{1}, X_{2}, \cdots$ are r.v.'s of a probability space $(\Omega,\ma,\mathbb{P})$. 

If $\exists\delta > 0  \to  \lim_{n \to \infty}{M_{X_{n}}(t)} = M_{X}(t), \forall t \in \left( - \delta,\delta \right)$, then
\[ 
  \lim_{n \to \infty}F_{n}(x) = F(x),
\]
if $F(x)$ is continuous at $x$, where $F_{n}$ and $F$ denote the c.d.f.'s of $X_{n}$ and $X$, resp.
\end{theorem}

\begin{proof}
  abc
\end{proof}

\begin{theorem}[Central Limit Theorem (CLT)]
Suppose $X_{1}, X_{2}, \cdots,X_{n}$ are i.i.d. r.v.'s with mean $\mu$ and variance $\sigma^{2}$. Let
\[S_{n}^{*} = \frac{X_{1} + X_{2} + \cdots + X_{n} - \mathbb{E}[S_{n}]}{\sigma_{S_{n}}} = \frac{X_{1} + X_{2} + \cdots + X_{n} - n\mu}{\sigma\sqrt{n}}.\]
Then
\[\lim_{n \to \infty}F_{S_{n}^{*}}(x) = \Phi(x),\]
i.e.,
\[{\lim_{n \to \infty}\mathbb{P}\left( \frac{X_{1} + X_{2} + \cdots + X_{n} - n\mu}{\sigma\sqrt{n}} \ls x \right)}{= \Phi(x)} = \int_{- \infty}^{x}\frac{1}{\sqrt{2\pi}}e^{- \tfrac{y^{2}}{2}}\dif y.\]
Equivalently,
\[\begin{aligned}
{\lim_{n \to \infty}\mathbb{P}\left( \frac{\bar{X} - \mu}{\frac{\sigma}{\sqrt{n}}} \ls x \right)}&= \lim_{n \to \infty}\mathbb{P}\left( \frac{\bar{X} - \mu}{\sqrt{\frac{\mathrm{Var}(X)}{n}}} \ls x \right)\\
& = \lim_{n \to \infty}\mathbb{P}\left( \frac{\bar{X} - \mathbb{E}[\bar{X}]}{\sigma_{\bar{X}}} \ls x \right) \\
&= \Phi(x).\\
\end{aligned}\]
\end{theorem}

\begin{proof}
  abc
\end{proof}

%%=====模板========================================
%%================================================

%\chapter{Templates}
%
%$X_1, X_2, \cdots,X_n$
%$(\Omega,\ma,\mathbb{P})$
%
%\begin{definition}[]
%
%\end{definition}
%
%\begin{remark}[]
%
%\end{remark}
%
%\begin{theorem}[]
%
%\end{theorem}
%
%\begin{corollary}[]
%
%\end{corollary}
%
%\begin{lemma}[]
%
%\end{lemma}
%
%\begin{proof}
%
%\end{proof}

\end{document}