\documentclass[10pt]{article}
\usepackage{amsfonts}
\usepackage{fullpage}
\usepackage{version}
%\usepackage{epsfig}
\excludeversion{forbes-notes}
\begin{document}
\input{preamble.tex}
\lecture{6}{Feb 23, 2009}{Madhu Sudan}{Michael Forbes}
%%%% body goes in here %%%%
The goal of this lecture is to give alternate proof of
\textsc{PARITY}$\notin{\textsc{AC}}^0$,
following the outline of Razborov and Smolensky.
\subsection{Probability Review}
Probability focuses on the probability of events and random variables. There
are several facts that will be used throughout the course. Consult the
appropriate textbook for proof. As this is a computer science course, we will
restrict ourselves to the simpler case of discrete events, and discrete random
variables. The results do generalize but we need not go there.
\begin{lemma}[Linearity of Expectation]
For random variables $X$ and $Y$, and real numbers $a$,$b\in\mathbb{R}$,
$\mathbb{E}[aX+bY]=a\mathbb{E}[X]+b\mathbb{E}[Y]$.
\end{lemma}
It is important to recall that this fact holds regardless of whether $X$ and $Y$
are independent or not.
\begin{lemma}
For independent random variables
$X$ and $Y$, $\mathbb{E}[XY]=\mathbb{E}[X]\mathbb{E}[Y]$.
\end{lemma}
\begin{lemma}[Union Bound]
For events $E_1$ and $E_2$, $\Pr[E_1\cup E_2]\le \Pr[E_1]+\Pr[E_2]$.
\end{lemma}
Recall that $E_1$ and $E_2$ are called \textit{independent} if $\Pr[E_1\cap E_2]\le
\Pr[E_1]\Pr[E_2]$. Two discrete random variables are said to be independent if
their joint probability distribution decomposes into a product of marginal
distributions. That is, the probabilities of the variables taking certain
values is what you would expect.
\begin{theorem}[Markov's Inequality]
If $X$ is a non-negative random variable then $\Pr[X\ge
k\mathbb{E}[X]]\le\frac{1}{k}$.
\end{theorem}
\begin{theorem}[Chebychev's Inequality]
For random variable $X$ with variance $\sigma^2$,
$\Pr[|X-\mathbb{E}[X]|\ge k\sigma]\le \frac{1}{k^2}$.
\end{theorem}
\begin{theorem}[Chernoff Bound]
For $X_1,\ldots,X_n$ independent, identically distributed (possibly
continuous) random variables with expectation $\mu$, such that
$X_i\in[0,1]$,\[\Pr\left[\left|\frac{\sum_{i=1}^nX_i}{n}-\mu\right|\ge
\varepsilon\right ]\le \exp(-\varepsilon^2n)\]
\end{theorem}
\subsection{Algebra}
Finite fields are finite sets equipped with the usual notions of addition and
multiplication. That is, a finite field $\mathbb{F}_q$ of size $q$ is such that
addition forms an abelian group, with an identity $0$, and
$\mathbb{F}_q\setminus\{0\}$ forms an abelian group with multiplication with
identity element $1$. These
two operations are related through the distributive axiom. Abstract algebra
gives us that there is a unique (up to isomorphism) finite field of size $q$ if
$q$ is a prime power, and no finite field for other values of $q$. For $q$
prime, these finite fields are the familiar structures of the integers modulo
$q$. For most computer science purposes, the prime fields are enough.
A basic fact about finite fields is their relation with polynomials. If we
recall Fermat's Little theorem, then we know that $x^p=x$ when working over
$\mathbb{F}_p$ (for prime $p$). This leads us to think that no polynomial over
$\mathbb{F}_p$ need have degree in a single variable above $p$.
\begin{definition}
For a multivariate polynomial $p(x_1,\ldots,x_n)$, the \textit{degree}
is the largest number of variables, including multiplicity, seen in any
term. Thus, $\deg(x^2y^2)=4$. The \textit{degree in a single variable}
is the degree of the polynomial when considered only as a function in a
single variable, with the other variables held constant. For example,
$\deg_x(x^2y^2)=2$.
\end{definition}
\begin{proposition}
For any multivariate function $f:\mathbb{F}_q^n\rightarrow\mathbb{F}_q$,
$f$ can be expressed as a multivariate polynomial $p(x_1,\ldots,x_n)$ such that
$\deg_i(p)\le q-1$.
\end{proposition}
The proof is left as an \textbf{exercise}, with the hint of ``counting'', or
perhaps even ``vector spaces''.
In the single variable case we can bound the number of roots using the degree.
The following theorem shows we can do something similar in the multivariate
case.
\begin{theorem}[Schwartz-Zippel Lemma]
For a non-zero degree $d$ polynomial over field $\mathbb{F}$ (possibly infinite),
$p:\mathbb{F}^n\rightarrow\mathbb{F}$, and some finite set $H\subseteq
\mathbb{F}$, \[\Pr_{\vec{x}\in H^n} [ f(\vec{x})=0]\le \frac{d}{|H|}\]
\end{theorem}
The proof is left as an \textbf{exercise}, with the hint of ``induction on
number of variables''. The key thing to notice about the above theorem is that
it is independent of $n$.
\subsection{A First Cut}
We examine the first idea of trying to work over $\mathbb{F}_2$, as this is the
most natural field for computer science. We would want to prove that every
circuit in $\textsc{AC}^0$ is computed by a low-degree polynomial. This means
for a circuit family $\{C_n\}_n$ that for each input size $n$, there is a degree
$d(n)$ polynomial that computes $C_n$, and the function $d(n)$ is somehow
``small'', or slow growing, as a function of $n$. We would then try to prove
that \textsc{PARITY} can only expressed in a similar way for some $d(n)$ which
is asymptotically large, that is it needs high-degree polynomials. Notice that
we would never need to go above degree $n$ as we can apply Fermat's Little
Theorem to reduce the degree in each variable to at most 1.
However, this plan doesn't work. Consider the $\textsc{AC}^0$ function
$\textsc{AND}(x_1,\ldots,x_n)=\prod_{i=0}^nx_i$. We cannot find a polynomial $p$
of degree less than $n$ for otherwise we could take their difference and notice
that it is zero on all inputs and by applying Schwartz-Zippel the polynomial
must be formally-zero (in the sense that all of its coefficients are zero).
(This isn't a strict implication of Schwartz-Zippel, but rather one must show
something similar: if you have a polynomial
$p:\mathbb{F}_q^n\rightarrow \mathbb{F}_q$ such that $\deg_i(p)< |H|$ for each
$i$, $p$ is identically zero on $H^n$, then $p$ is formally zero. The proof is
similar to that of Schwartz-Zippel, and is left as an \textbf{exercise}). This
would violate the selection of $p$, so no such $p$ can exist. This seems like a
problem, as this is the highest degree we can expect from a polynomial over
$\mathbb{F}_2$!
However, it gets even worse, as
$\textsc{PARITY}(x_1,\ldots,x_n)=x_1+\cdots+x_n$, which is about the
lowest-degree polynomial possible. Thus this plan does not work.
To fix this, we can move to another field.
\subsection{An Idea}
We begin by noticing a common trick in Boolean analysis (the analysis of
functions over boolean values). The additive group $\{0,1\}$ can be put into a
linear correspondence with the multiplicative group $\{-1,1\}$, where the
mapping is done via $x\mapsto 1-2x$, and inversely $y \mapsto \frac{1-y}{2}$.
The usual mapping $x\mapsto (-1)^x$ is not as useful here as it is not linear.
This allows us to move from analyzing $\textsc{PARITY}$ over $\{0,1\}$ to over
$\{-1,1\}$, where instead we look at the parity of signs in the input. Notice
that we can express this parity with the simple polynomial $\prod_{i=1}^ny_i$.
Suppose we fix some finite field $\mathbb{F}$ with size at least three (so
$-1\ne 1$). Then suppose that the polynomial $p(x_1,\ldots,x_n)$ outputs the
correct parity bit when its input is in from $\{0,1\}^n$ (and we don't care what
it does otherwise). Now we can construct
$q=1-2p(\frac{1-y_1}{2},\ldots,\frac{1-y_n}{2})$ which, as we are working over
the same field, will compute the correct parity in the $\pm1$ sense. Notice
that $\deg q\le \deg p$. Further, using the Schwartz-Zippel-like result
mentioned in the previous section, this means that because $q$ and $\prod y_i$
agree on all inputs restricted to $\{\pm1\}^n$ it must be that $\deg(\prod
y_i)\le \deg q$, and thus $\deg p \ge n$.
This establishes two points. First, that computing the parity over $\{0,1\}$ is
equivalent to computing it over $\pm1$ because of the linear transformations.
Second, using that the $\pm1$ domain is well-understood, we see that computing
the parity exactly over the $\{0,1\}$ domain requires degree $n$ polynomials.
This seems like progress.
\subsection{The Proof}
To make the idea into a proof, we need another idea from Razborov from a
previous work. The idea is to only approximate circuits by polynomials instead
of requiring equality. To do this, we replace each gate by some approximate
low-degree functions, and combine them to say that the entire function is
approximated by a low-degree (but higher-degree than at the gate level)
polynomial. Then, the plan should be that \textsc{PARITY} cannot be similarly
approximated. The field we will use will be the simplest one that obeys the
idea of the last section and this is $\mathbb{F}_3$.
We begin with approximating $\textsc{AC}^0$. The goal is to prove the following
lemma, which makes precise what we mean by approximation.
\begin{lemma}
Let $C:\{0,1\}^n\rightarrow \{0,1\}$ be computed by a depth $d$, size
$s$, $\textsc{AC}^0$ circuit. Then
\begin{enumerate}
\item There exists a polynimal $p\in\mathbb{F}_3$ on $n$
variables of degree at most $2^d(\log_3(s/\varepsilon)+1)^d$
\item There exists a set $S\subseteq\{0,1\}^n\subset
\mathbb{F}_3^n$, where $|S|\ge (1-\varepsilon)2^n$ and
$\forall\vec{x}\in S$, $p(\vec{x})=C(\vec{x})$.
\end{enumerate}
\label{lem1}
\end{lemma}
Notice that \textsc{AND} is approximated in this sense by the zero-polynomial as
this polynomial is only incorrect on one input, namely the all-ones vector.
Thus, an adversary to this proof could somehow ensure that we are taking the
\textsc{AND} of something that is always the all-ones vector and thus foil our
plans of using this as subroutine in our circuit. To avoid this, we will use
randomization. We will create a distribution of polynomials for each gate such
that there is a non-zero probability that some polynomial approximates the gate
well. We pass this distributions as the subroutine. By arguing that the
probability of correct approximation doesn't reach zero we then have, by the
probabilistic method, that some approximating polynomial must exist. In what
follows, we assume that we only use \textsc{NOT} and \textsc{OR} gates. We can
clearly do this without affecting the depth, and it will make the proof simpler.
We now present the randomization.
\begin{lemma}
Fix $k$. For each $t$, there exists a distribution on degree $2k$ polynomials $p$ such that
\[ \forall z_1,\ldots,z_t\in\{0,1\},
\Pr_p[p(z_1,\ldots,z_t)\ne\textsc{OR}(z_1,\ldots,z_t)]\le \frac{1}{3^k} \]
\label{lem1p}
\end{lemma}
\begin{proof}
We only concern ourselves here with inputs in $\{0,1\}$ as we do not
care what happens otherwise. Notice that
$\textsc{OR}(z_1,\ldots,z_t)=1-\prod_{i=1}^t(1-z_i)$ on boolean inputs.
Now, pick $\alpha_1,\ldots,\alpha_t\in\mathbb{F}_3$ at random (uniform,
independently). Consider
$L_\vec{\alpha}(z_1,\ldots,z_t)=(\sum_{i=1}^t\alpha_iz_i)^2$. We first
claim that
\[\Pr_\vec{\alpha}[L_\vec{\alpha}(z_1,\ldots,z_t)=\textsc{OR}(z_1,\ldots,z_t)]\ge
2/3\] Consider the cases. If $z_1,\ldots,z_t=0,\ldots,0$ then with
probability 1 the choice of $\vec{\alpha}$ will be correct as the
\textsc{OR} will always be $0$ as will $L_\vec{\alpha}$.
In the other case, we want to get $\sum_{i=1}^t\alpha_iz_i\ne 0$, for
when we then squared it shall become $1$, which is the correct answer in
this case. Consider $f(\vec{\alpha})=\sum_{i=1}^t\alpha_iz_i$. This is
a linear, non-zero polynomial (it is non-zero because $\vec{z}\ne
\vec{0}$, by hypothesis). Notice that we just pulled a switch: we are
now considering the $\alpha$'s as inputs and the $z$'s as constants.
Thus, we can apply Schwartz-Zippel to see that
$\Pr_\vec{\alpha}[f(\vec{\alpha})=0]\le\frac{1}{3}$. This gives the
claim about $L_\alpha$ by squaring $f$.
Further, we can notice that this gives a
one-sided error: if the answer is really 0, the $L_\alpha$ will also
give zero. If the answer is $1$, then with probability at least $2/3$ a
$1$ will be returned. As usual with one-sided error, we can amplify the
probability of success by taking multiple independent trials and taking
the \textsc{OR} of them. Thus, we pick vectors
$\alpha^{(1)},\ldots,\alpha^{(k)}$ randomly and take the \textsc{OR} of
$L_{\alpha^{(1)}},\ldots,L_{\alpha^{(1)}}$. It might seem that we would
want to use this approximate-\textsc{OR} in a recursive fashion, however
we can use the \textit{actual}
$\textsc{OR}(z_1,\ldots,z_t)=1-\prod_{i=1}^t(1-z_i)$ in this case. As
each $L_{\alpha^{(i)}}$ is of degree $2$, this gives a polynomial $p$ of
degree $2k$. Notice that our random choices were only over the
$\alpha^{(i)}$ and these determined the polynomial $p$. By repeating
this one-sided error process we see that there is only at most a $\frac{1}{3^k}$
error that we make a mistake, and thus for any particular
$z_1,\ldots,z_k$ the chance of $p(z_1,\ldots,z_k)$ making an error is
also bounded by $\frac{1}{3^k}$. This is exactly the claim.
\end{proof}
We have now shown that we can approximate the \textsc{OR} gate. By using this
approximate gates in the $\textsc{AC}^0$ circuit, we can then approximate the
whole thing. We can notice that \textsc{NOT} gates are very easy to implement
by using $\textsc{NOT}(z)=1-z$. This only works over the boolean domain, but
don't forget that even though we are working over $\mathbb{F}_3$, the boolean
domain is all that the polynomial needs to work over.
\begin{proofof}{Lemma~\ref{lem1}}
Choose the smallest $k$ such that $3^k\ge s/\varepsilon$. We will string
together the circuit $C$ using the distributions of
approximate-\textsc{OR} gates from Lemma~\ref{lem1p} and boolean
\textsc{NOT} gates. By then union bounding the errors the result should
follow.
For each internal node, we will construct a distribution of polynomials
that will with high probability compute the correct value for that node.
We start with the input nodes, where they have the monomial $x_i$
representing input $i$. At at NOT gate, as argued above, we simply
compute $1-z$, where $z$ is randomly chosen from the distribution on the
child of the node. This does not affect the degree of the polynomial
nor does it affect the probability of error on the distribution.
For an OR gate with children $z_1,\ldots, z_t$, we randomly choose a
polynomial $p$ given by Lemma~\ref{lem1p}, and for each $i$, randomly sample
a polynomial $p_i$ from child $i$'s distribution. We then create the
polynomial $q=p(p_1(x_1,\ldots,x_n),\ldots,p_t(x_1,\ldots,x_n))$ for this
node, with the appropriate probabilities based on its creation.
Notice that the only degree-increasing step is that of the OR gate, and
this is done via composition. The polynomial $p$ has degree $2k$ and so
the degree of $q$ is at most $2k\cdot\max_i\deg(p_i)$. By induction we
can see this leads to polynomials of degree at most $(2k)^d$, as the
circuit has depth $d$. Notice that by our choice of $k$, $k\le
\log_3(s/\varepsilon)+1$, so these polynomials have degree at most
$2^d(\log_3(s/\varepsilon)+1)^d$.
Now we must analyze the probability that the
polynomial $p$ of the output node computes the correct output. Notice
that $p$ is computed by independently chosen polynomials at each node.
The only source of error comes from the OR nodes, and there can be at
most $s$ of them. By the construction in Lemma~\ref{lem1p}, these
choices error with probability at most $\frac{1}{3^k}$ regardless of
what their input is. Therefore, the union bound indicates that
polynomial $p$ will error with probability at most $\frac{s}{3^k}$. By
our choice of $\varepsilon$, this gives that \[\forall \vec{x},
\Pr_p[p(\vec{x})\ne C(\vec{x})]\le \varepsilon\]
We now argue by the probabilistic method to assert the existence of a
specific $p$ to give the claim. By the above argument, any such $p$
from this distribution has the desired degree. The probability analysis
is left as an \textbf{exercise}.
\begin{forbes-notes}
Consider the matrix of zeroes and ones with rows corresponding
to polynomials and columns corresponding to inputs. The entries
will be one if the polynomial gets the input incorrect, and zero
otherwise. The above statement means that for any given column,
at most an $\varepsilon$ fraction of the entries are ones. This
means that in total, at most $\varepsilon$ of the entries in the
entire matrix are ones. Thus, the expected number of ones in a
row of the matrix is $\varepsilon$ times $2^n$, so by the
probabilistic method, at least one of the rows must have at most
this many ones. Pick this polynomial, and notice that it gets
right at least $(1-\varepsilon)2^n$ of the inputs, just as
claimed.
\end{forbes-notes}
\end{proofof}
We have just show that $\textsc{AC}^0$ can be approximated by low-degree
polynomials. Specifically, as $s$ is bounded by some polynomial in $n$, and $d$
is a constant, we see that the depth of the polynomials is polylogarithmic (when
$\varepsilon$ is held constant). Thus, we only need to show that
\textsc{PARITY} cannot be approximated by such a family of polynomials.
\begin{lemma}
No degree $o(\sqrt(n))$ polynomial approximates \textsc{PARITY} in the
sense that it correctly answers the question on a set of size at least
$\frac{3}{4}2^n$.
\end{lemma}
\begin{proof}
Consider any set $S\subseteq\{0,1\}^n$ (it is meant to be ``large'')
such that parity can be correctly computed on $S$ by a polynomial $p$ of
degree $D$ in the field $\mathbb{F}_3$. We now make the transformation to the $\pm1$ domain.
Define the analogous set $T$ to $S$ such that $|T|=|S|$ and $T\subseteq
\{\pm1\}^n$. Define
$q(y_1,\ldots,y_n)=1-2p(\frac{1-y_1}{2},\ldots,\frac{1-y_n}{2})$ of
degree at most $D$. Notice that as before, $q$ computes $\prod_{i=1}^n
y_i$ on $T\subseteq \{-1,1\}^n$.
Now we want to count the number of functions from $T$ to $\{-1,0,1\}$.
It is easy to see that the number of function is just $3^{|T|}$.
However, from the preliminaries we know that any function is also a
polynomial. So we can count polynomials also as a way to count
functions. The polynomials will be of the form
$p(x_1,\ldots,x_n)=\sum_\vec{d}c_\vec{d}x_1^{d_1}\cdots x_n^{d_n}$.
Notice that as we are dealing with $T\subseteq\{-1,1\}^n$, so we have
the $x_i^2=1$, as $x_i\in\{\pm1\}$. Thus, we can reduce all of the
$d_i$ modulo 2.
Now we have the following key idea. If $\sum_i d_i>\frac{n}{2}$, we
replace \[x_1^{d_1}\cdots x_n^{d_n}\] by \[x_1^{d_1+1\pmod{2}}\cdots
x_n^{d_n+1\pmod{2}} q(x_1,\ldots,x_n)\]. We first claim that these terms
are in fact equivalent for inputs over $T$. To see this, recall that
$q(x_1,\ldots,x_n)=\prod_{i=1}^nx_i$ over inputs in $T$ by construction.
Then, we use the reduction of degrees modulo $2$ to get the algebraic
equivalence. By doing this to each term, we see that the maximum term
size is bounded by $\lfloor \frac{n}{2}\rfloor+D$. Further, we see that each term is
in bijection with some subset of $\{1,\ldots,n\}$ of size at most its
degree. Thus, there can be at most \[\sum_{i=1}^{\lfloor n/2\rfloor+D}
\binom{n}{i}\le \sum_{i=1}^{\lfloor n/2\rfloor} \binom{n}{i}+D\binom{n}{\lfloor
n/2\rfloor}\]
And notice that $\sum_{i=1}^{\lfloor n/2\rfloor} \binom{n}{i}\le
2^{n-1}$ as the sets of size at most $\lfloor n/2\rfloor$ can be paired
up with their complements to get the entire set (if $n$ is even then we
would need to half count the pairs of sets each of size $n/2$).
Apparently, left as an \textbf{exercise}, we also have that
$\binom{n}{\lfloor n/2\rfloor}\le 2^n/\sqrt{n}$. So the overall bound
on the number of terms
is $2^{n-1}+\frac{D2^n}{\sqrt{n}}$. As each term can have one of the
coefficients in $\mathbb{F}_3$, there are at most
$3^{2^{n-1}+\frac{D2^n}{\sqrt{n}}}$ polynomials.
As this counts all polynomials over $T$, and thus all functions over $T$, we must have
that $3^|T|\le 3^{2^{n-1}+\frac{D2^n}{\sqrt{n}}}$, or that $|T|\le
3^{2^{n-1}+\frac{D2^n}{\sqrt{n}}}$. As we had $|T|\ge\frac{3}{4} 2^n$
this implies that $D\ge \sqrt{n}$.
\end{proof}
\begin{theorem}[Razborov-Solemnsky]
$\textsc{PARITY}\notin\textsc{AC}^0$
\end{theorem}
\begin{proof}
We saw that any $\textsc{AC}^0$ function can be approximated by a
polynomial of polylogarithmic degree, while \textsc{PARITY} requires
$\Omega(\sqrt{n})$.
\end{proof}
It should be noted that the $\frac{3}{4}$ in the approximation of
\textsc{PARITY} is important, as any constant function approximates parity on
exactly half the inputs.
\begin{observation}
As we were working over $\mathbb{F}_3$ we could have thrown in mod-3
gates into our circuits and we would have gotten the same results.
Thus, adding mod-3 gates wouldn't help compute parity either.
Conversely, it should be possible to show that computing mod-3 is hard
for $\textsc{AC}^0$ even given mod-2 gates.
\end{observation}
While these ideas seem to work for finite fields, the ideas really break down
for mod-6 gates. Currently, it is not known that $\textsc{AC}^0$ with mod-6
gates fails to compute SAT. Some results from error-correcting codes suggest
that this is in some way deep.
\begin{observation}
I think it should be possible to get a similar result to Lemma~\ref{lem1} over any finite field of size
bigger than 2.
\end{observation}
\end{document}