ballotPollError.tex

\section{Ballot-polling audits of a tolerable overstatement in votes}
\label{sec:ballotPollError}

In this section we develop a new method for ballot-polling audits that can test numerical margins,
rather than just test whether a candidate won.
This requires a different approach than that taken by \cite{lindemanEtal12}.

Existing ballot-polling methods consider only the fraction of ballots with a vote for either 
$w$ or $\ell$ that contain a vote for $w$,
making the statistical test one for a proportion.
To allow the error to be partitioned across the strata via $\lambda_s$,
the necessary inference is about the 
\emph{difference} between the number of votes for $w$ and the number of votes for $\ell$.
This introduces a nuisance parameter, the number of ballots with votes for either $w$ or $\ell$.
We deal with the nuisance parameter by maximizing the $P$-value 
over all possible values of the nuisance parameter, which ensures that the test is conservative.

\subsection{Conditional tri-hypergeometric test}

We consider a single stratum $s$, containing $N_s$ ballots.
Of the $N_s$ ballots,
$A_{w,s}$ have a vote for $w$ but not for $\ell$, $A_{\ell,s}$ have a vote for $\ell$ but not for $w$, and $A_{u,s} = N_s - N_{w,s} - N_{\ell,s}$ have votes for both $w$ and $\ell$ or neither $w$ nor $\ell$, including undervotes and invalid ballots.
We might draw a simple random sample of $n$ ballots ($n$ fixed ahead of time), or we might draw 
sequentially without replacement, so the sample size $B$ could be random.
For instance, the rule for determining $B$ could depend on the data.\footnote{%
   Sampling with replacement leads to simpler arithmetic, but is not as efficient.
}

Regardless, we assume that, conditional on the attained sample size $n$, the ballots are a simple random sample of size $n$ from the $N_s$ ballots in the population.
In the sample, $B_w$ ballots contain a vote for $w$ but not $\ell$, with $B_\ell$ and $B_u$ defined analogously.
The conditional joint distribution of
$(B_w, B_\ell, B_u)$ is tri-hypergeometric: 

\begin{equation}
    \mathbb{P}_{A_{w,s}, A_{\ell,s}} \{ B_w = i, B_\ell = j \vert B=n \} = 
     \frac{ {A_{w,s } \choose i}{A_{\ell,s} \choose j}{N_s - A_{w,s} - A_{\ell,s} \choose n-i-j}}{{N_s \choose n}}.
\end{equation}

Define the diluted sample margin, $D \equiv (B_w - B_\ell)/B$.
We want to test the compound hypothesis $A_{w,s} - A_{\ell,s} \le c$.
The value of $c$ is inferred from the definition
$\omega_{w\ell,s} \equiv V_{w\ell,s} - A_{w\ell,s} = V_{w,s} - V_{\ell,s} - (A_{w,s} -A_{\ell,s})$.
Thus,

\beq
    c = V_{w,s} - V_{\ell,s} - \omega_{w\ell,s} = V_{w\ell,s} - \lambda_s V_{w\ell}.
\eeq

The alternative is the compound hypothesis 
$A_{w,s} - A_{\ell,s} > c$.\footnote{%
    To use Wald's Sequential Probability Ratio Test, we might pick a simple alternative instead, e.g.,
   $A_{w,s} = V_{w,s}$ and $A_{\ell,s} = V_{\ell,s}$, the reported values, provided 
   $V_{w,s} - V_{\ell,s} > c$.
}
Hence, we will reject for large values of $D$.
Conditional on $B=n$, the event $D = (B_w - B_\ell)/B = d$ is the same as $B_w - B_\ell = nd$.\footnote{%
In contrast, the BRAVO ballot-polling
method~\cite{lindemanEtal12}
conditions only on $B_w+B_\ell = m$.
}


The $P$-value of the simple hypothesis that there are $A_{w,s}$ ballots with
a vote for $w$ but not for $\ell$, $A_{\ell,s}$ ballots with a vote for $\ell$ but not for $w$, 
and $N - A_{w,s} - A_{\ell,s}$ ballots with votes for both $w$ and $\ell$ or neither $w$ nor $\ell$ 
(including undervotes and
invalid ballots) is the probability that $B_w - B_\ell \geq nd$.
Therefore,

\begin{equation}
   \mathbb{P}_{A_{w,s}, A_{\ell,s}, N_s} \left \{ D \geq d \;\vert\; B = n\right \} = 
   \sum_{\substack{(i, j) :  i, j\ge 0 \\ i-j \geq nd \\ i+j \leq n}} \frac{ {A_{w,s } \choose i}{A_{\ell,s} \choose j}{N_s - A_{w,s} - A_{\ell,s} \choose n-i-j}}{{N_s \choose n}}.
\end{equation}


%\subsection{Conditional hypergeometric test}
%Another approach is to condition on both the events $B=n$ and $B_w+B_\ell=m$.
%We describe the hypothesis test here, but do not advocate for using it.
%We found that this approach was inefficient in some simulation experiments.
%
%Given $B=n$, all samples of size $n$ from the ballots are equally likely, by hypothesis.
%Hence, in particular, all samples of size $n$ for which $B_w + B_\ell = m$ are equally likely.
%There are ${A_{w,s}+A_{\ell,s} \choose m}{N_s - A_{w,s}-A_{\ell,s} \choose n-m}$ such samples.
%Among these samples, $B_w$ may take values $i=0, 1, \dots, m$.
%For a fixed $i$, there are ${A_{w,s} \choose i}{A_{\ell, s} \choose m-i}{N_s - A_{w,s} - A_{\ell,s} \choose n-m}$
%samples with $B_w=i$ and $B_\ell = m-i$.
%
%The factor ${N_s - A_{w,s} - A_{\ell,s} \choose n-m}$ counts the number of ways to sample $n-m$ of the
%remaining ballots.
%If we divide out this factor, we simply count the number of ways to sample ballots
%from the group of ballots for $w$ or for $\ell$.
%There are ${A_{w,s}+A_{\ell,s} \choose m}$ equally likely samples of size $m$ from
%the ballots with either a vote for $w$ or for $\ell$, but not both, 
%and of these samples, ${A_{w,s} \choose i}{A_{\ell, s} \choose m-i}$ contain $i$ ballots with a vote for $w$ but not $\ell$.
%Therefore, conditional on $B=n$ and $B_w+B_\ell=m$, the probability that $B_w=i$ is
%
%$$\frac{{A_{w,s} \choose i}{A_{\ell, s} \choose m-i}}{{A_{w,s}+A_{\ell,s} \choose m}}.$$
%
%The $P$-value of the simple hypothesis that there are $A_{w,s}$ ballots with
%a vote for $w$ but not for $\ell$, $A_{\ell,s}$ ballots with a vote for $\ell$ but not for $w$, 
%and $N - A_{w,s} - A_{\ell,s}$ ballots with votes for both $w$ and $\ell$ or neither $w$ nor $\ell$ 
%(including undervotes and
%invalid ballots) is the sum of these probabilities for events when $B_w - B_\ell \geq nd$.
%This event occurs for $B_w \geq \frac{m+nd}{2}$.
%Therefore,
%
%\begin{equation}
%   \mathbb{P}_{A_{w,s}, A_{\ell,s}, N_s} \left \{ D \geq d \;\vert\; B = n, B_w+B_\ell = m \right \} = 
%   \sum_{i=(m+nd)/2}^{\min\{m, A_{w,s}\}} \frac{{A_{w,s} \choose i}{A_{\ell, s} \choose m-i}}{{A_{w,s}+A_{\ell,s} \choose m}}.
%\end{equation}
%
%
%This conditional $P$-value is thus the tail probability of the hypergeometric distribution
%with parameters $A_{w,s}$ ``good'' items, $A_{\ell,s}$ ``bad'' items, and a sample of size $m$.
%This calculation is numerically stable and fast; tail probabilities of the hypergeometric distribution are available
%and well-tested in all standard statistics software.

\subsection{Maximizing the $P$-value over the nuisance parameter}

The composite null hypothesis does not specify $A_{w,s}$ or $A_{\ell,s}$ separately, only 
that $A_{w,s} - A_{\ell,s} \le c$ for
some fixed, known $c$.
Define $\mathcal{S}$ to be the set of pairs $(i, j)$ such that $i, j\ge 0, i-j \ge nd,$ and $ i+j \leq n$.
The (conditional) $P$-value of this composite hypothesis for $D=d$ is the maximum $P$-value for all
values $(A_{w,s}, A_{\ell,s})$ that are possible under the null hypothesis,
\begin{equation}
  \max_{A_{w,s}, A_{\ell,s} \in \{0, 1, \ldots, N \}: A_{w,s} - A_{\ell,s} \le c, A_{w,s} + A_{\ell,s} \le N_s}
   \sum_{\substack{(i, j)\in \mathcal{S}}} \frac{ {A_{w,s } \choose i}{A_{\ell,s} \choose j}{N_s - A_{w,s} - A_{\ell,s} \choose n-i-j}}{{N_s \choose n}},
\end{equation}
wherever the summand is defined. 
(Equivalently, define ${m \choose k} \equiv 0$ if $k > m$, $k < 0$, or $m \le 0$.)

\subsubsection{Characterizing the optimal solution}
The following result enables us to only test hypotheses along the boundary of the null set.

\begin{thm}
Assume that $n < A_{w,s}+A_{\ell,s}$.
Suppose the composite null hypothesis is $N_w - N_\ell \leq c$.
The $P$-value is maximized on the boundary of the null region, i.e. when $N_w - N_\ell = c$.
\end{thm}

\begin{proof}
Without loss of generality, let $c=0$ and assume that $A_{u,s}=N_s - A_{w,s} - A_{\ell,s}$ is fixed.
Let $N_{w\ell, s} \equiv A_{w,s}+A_{\ell,s}$ be the fixed, unknown number of ballots for $w$ or for $\ell$ in stratum $s$.
The $P$-value $p_0$ for the simple hypothesis that $c=0$ is

\begin{equation}
  p_0 = \sum_{\substack{(i, j) \in \mathcal{S}}} \frac{ {N_{w\ell, s}/2 \choose i}{N_{w\ell, s}/2 \choose j}{A_{u,s} \choose n-i-j}}{{N_s \choose n}} =  \sum_{\substack{(i, j) \in \mathcal{S}}}T_{ij},
\end{equation}

\noindent where $T_{ij}$ is defined as the $(i, j)$ term in the summand and $T_{ij} \equiv 0$ for pairs $(i, j)$ that don't appear in the summation.

Assume that $c>0$ is given.
The $P$-value $p_c$ for this simple hypothesis is
\begin{align*}
p_c &=   \sum_{\substack{(i, j) \in \mathcal{S}}}  \frac{ {(N_{w\ell, s}+c)/2 \choose i}{(N_{w\ell, s}-c)/2 \choose j}{A_{u,s} \choose n-i-j}}{{N_s \choose n}}  \\
   &=\sum_{\substack{(i, j) \in \mathcal{S}}} T_{ij} \frac{ \frac{N_{w\ell, s}+c}{2}(\frac{N_{w\ell, s}+c}{2}-1)\cdots(\frac{N_{w\ell, s}}{2}+1) (\frac{N_{w\ell, s}-c}{2} -j)\cdots(\frac{N_{w\ell, s}}{2}-1-j) }
   {(\frac{N_{w\ell, s}+c}{2} -i)\cdots(\frac{N_{w\ell, s}}{2}+1-i)(\frac{N_{w\ell, s}-c}{2})\cdots(\frac{N_{w\ell, s}}{2}-1)}.
\end{align*}

Terms in the fraction can be simplified: choose the corresponding pairs in the numerator and denominator.
Fractions of the form $\frac{\frac{N_{w\ell, s}}{2} + a}{\frac{N_{w\ell,s}}{2} + a - i}$ can be expressed as $1 + \frac{i}{\frac{N_{w\ell,s}}{2} + a-i}$.
Fractions of the form $\frac{\frac{N_{w\ell, s}}{2}  - a - j}{\frac{N_{w\ell, s}}{2}  - a}$ can be expressed as $1 - \frac{j}{\frac{N_{w\ell, s}}{2} -a}$.
Thus, the $P$-value can be written as 

\begin{align*}
p_c &= \sum_{\substack{(i, j) \in \mathcal{S}}} T_{ij} \prod_{a=1}^{c/2} \left(1 + \frac{i}{\frac{N_{w\ell,s}}{2} + a-i}\right)\left(1 - \frac{j}{\frac{N_{w\ell, s}}{2} - a}\right) \\
&> \sum_{\substack{(i, j) \in \mathcal{S}}}  T_{ij} \left[ \left(1 + \frac{i}{\frac{N_{w\ell,s}+c}{2} -i}\right)\left(1 - \frac{j}{\frac{N_{w\ell, s}}{2}+1}\right) \right]^{c/2} \\
&= \sum_{\substack{(i, j) \in \mathcal{S}}} T_{ij} \left[ 1 + \frac{\frac{N_{w\ell,s}+c}{2}j + \frac{N_{w\ell,s}}{2}i + i}{(\frac{N_{w\ell,s}+c}{2}-i)(\frac{N_{w\ell,s}}{2}+1)}\right]^{c/2} \\
&> \sum_{\substack{(i, j) \in \mathcal{S}}}  T_{ij}\\
&= p_0
\end{align*}

The last inequality follows from the fact that $i$ and $j$ are nonnegative, and 
that $i < \frac{N_{w\ell,s}+c}{2}$ (it is a possible outcome under the null hypothesis).


\end{proof}

\subsubsection{Solving the optimization problem}

We have found empirically (but have not proven) that given $N$, $c$, and the observed sample values $B_w$ and $B_\ell$, the tail probability $p_c$, as a function of $A_{w,s}$,
has a unique maximum at one of the endpoints, where $A_{w,s}$ is either as small or as large as possible.
If this empirical result is true in general, then finding the maximum is trivial;
otherwise, computing the unconditional $P$-value is a simple 1-dimensional optimization problem
on a bounded interval.

\subsection{Conditional testing}
If the conditional tests are always conducted at significance level $\alpha$ or less, so that
$\mathbb{P} \{\mbox{Type I error} | B = n\} \le \alpha$, then the
overall procedure has significance level $\alpha$ or less:
\begin{eqnarray}
    \mathbb{P} \{\mbox{Type I error}\} &=& \sum_{n=0}^N  \mathbb{P}\{\mbox{Type I error} |  B = n\} \mathbb{P} \{ B = n \} \nonumber \\
       & \le & \sum_{n=0}^N \alpha \mathbb{P} \{  B = n \}  =  \alpha.
\end{eqnarray}

In particular, this implies that our conditional hypergeometric test will have a conservative $P$-value  unconditionally.