\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{epsfig}
\usepackage[dvips]{geometry}
%\usepackage{psfig}
% \handout{<id>}{<term>}{<left credit>}{<right credit>}{<title>}
% Typesets the framed banner at the top of the notes:
%   row 1: the course name in bold on the left, #2 on the right;
%   row 2: #5 centered in \Large;
%   row 3: #3 on the left and #4 on the right, emphasized.
% #1 is accepted but is not used by the banner itself.
\newcommand{\handout}[5]{%
  \noindent
  \begin{center}
    % The trailing % signs keep line-end spaces out of the framed box.
    \framebox{%
      \vbox{%
        \hbox to 5.78in { \textbf{6.851: Advanced Data Structures } \hfill #2 }
        \vspace{4mm}
        \hbox to 5.78in { {\Large \hfill #5 \hfill} }
        \vspace{2mm}
        \hbox to 5.78in { {\em #3 \hfill #4} }
      }%
    }%
  \end{center}
  \vspace*{4mm}%
}
% \lecture{<number and date>}{<term>}{<lecturer>}{<scribe>}
% Lecture banner: forwards to \handout with the title "Lecture #1" and
% the right-hand credit "Scribe: #4".
\newcommand{\lecture}[4]{%
  \handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}%
}
% Theorem-like environments. `theorem` owns the counter; every other
% environment below is declared with [theorem] and therefore shares its
% numbering sequence.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
% Page layout: 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
% Set \topmargin so that \topmargin + \headheight + \headsep is zero.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
% Use the same side margin on odd and even pages.
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
% No paragraph indentation; paragraphs are separated by vertical space instead.
\parindent 0in
\parskip 1.5ex
%\renewcommand{\baselinestretch}{1.25}
% \norm{<expr>}: typesets <expr> between auto-sized double vertical bars.
\newcommand{\norm}[1]{%
  \left\lVert #1 \right\rVert
}
\begin{document}
\lecture{10 --- March 14, 2007}{Spring 2007}{Alex Andoni}{Alex Schwendner}
\section{Nearest Neighbor}
In the Nearest Neighbor problem, we are given a set $P$ of points in
$\mathbb{R}^d$ or another metric space. We want to construct a data
structure such that we can efficiently answer queries in which we are
given a point $q$ and must return a point $p \in P$ minimizing the
distance $\norm{p-q}$ between $q$ and $p$.
\subsection{Interesting Metrics}
\begin{itemize}
\item \textbf{Euclidean Distance} on $\mathbb{R}^d$
\[\norm{p-q}_2 = \sqrt{\sum_{i=1}^d \left\lvert p_i - q_i\right\rvert^2}\]
\item \textbf{Manhattan} on $\mathbb{R}^d$
\[\norm{p-q}_1 = \sum_{i=1}^d \left\lvert p_i - q_i\right\rvert\]
\item \textbf{Hamming} on $\left\{0,1\right\}^d$
\[\norm{p-q}_1 = \sum_{i=1}^d \left\lvert p_i - q_i\right\rvert\]
The Hamming distance is the Manhattan distance restricted to points of $\left\{0,1\right\}^d$; it counts the coordinates in which $p$ and $q$ differ.
\end{itemize}
\section{Applications}
Similarity search problems can often be expressed in this form with a
suitable metric.
\subsection{Inexact String Matching}
Here, our metric is the number of differences between two strings $p$
and $q$ (essentially the Hamming metric). If we have a text $T$ and a
pattern $P$, then we consider all $\left|T\right| - \left|P\right| +
1$ length-$\left|P\right|$ substrings of $T$ as points. Our goal is then
to find the substring closest to $P$ under this metric.
\subsection{Image Matching}
If we have an image, we can consider each pixel as a coordinate. Then,
the image is a high-dimensional vector. We can use the Manhattan
metric to express the distance between two images. Another way we
might obtain a metric is to compute a Fourier transform of the two
images and then define the distance to be the Manhattan distance
between the Fourier coefficients.
\section{Exact Algorithms}
\subsection{$\mathbb{R}^2$: Voronoi Diagrams}
In two dimensions, the nearest neighbor problem can be solved with a
Voronoi diagram. The Voronoi diagram on a set $P$ of $n$ points in the
plane is a partition of the plane into $n$ \emph{Voronoi cells}. Each
Voronoi cell is the set of points closer to some specified point $p\in
P$ than to any other point in $P$. Each Voronoi cell is a (possibly
unbounded) region with straight sides.
The construction of a representation of a Voronoi diagram can be done
in $O(n \log n)$ time. The Voronoi diagram takes $O(n)$ space and with
it, nearest neighbor queries can be answered in $O(\log n)$ time.
\subsection{Higher Dimensions}
While constructing a Voronoi diagram nicely solves the two dimensional
problem, the algorithm does not extend well to higher dimensions.
Analogous Voronoi structures on $n$ points in $d$ dimensions use
$n^{O(d)}$ space.
The naive algorithm of simply computing the distance from a query to
each of the $n$ points takes $O(dn)$ time for each query.
The difficulty of exactly computing nearest neighbors in $d$
dimensions is the reason for considering the problem of computing
approximate nearest neighbors.
\section{Approximate Near Neighbor}
\subsection{Near Neighbor}
\emph{Near Neighbor} is the decision version of Nearest Neighbor.
Given a set $P$ of points in $\mathbb{R}^d$ and a real number $r > 0$,
we want to construct a data structure such that we can efficiently
answer queries in which we are given a point $q$ and must return some
point $p \in P$ such that $\norm{p-q} \leq r$, if such a point exists.
\subsection{$c$-Approximate Near Neighbor}
For the \emph{$c$-Approximate Near Neighbor} problem, we want to
construct a data structure such that when given a query $q$ for which
there exists a point $p \in P$ with $\norm{p-q} \leq r$ we can quickly
find some point $p' \in P$ with $\norm{p'-q} \leq cr$.
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.6]{lec10.f1.eps}
\end{center}
\end{figure}
Given a $c$-approximate algorithm for Near Neighbor, we can construct
a $c^2$-approximate algorithm for Near\emph{est} Neighbor. Suppose all
distances between points are in the range $[1,\Delta]$. Then, we can
use binary search.
Let $D_r$ be the $c$-approximate Near Neighbor algorithm with distance
cutoff $r$. Suppose we are given input point $q$. Then, consider the
sequence
\[\underbrace{D_1, D_c, D_{c^2},\ldots, D_\Delta}_{\log_c \Delta + 1}.\]
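We binary search within this sequence to find the smallest $i$ for
which $D_{c^i}(q)$ returns a valid point. We then return that point as
our answer, which will be within a factor of $c^2$ of optimal: since
$D_{c^{i-1}}(q)$ returned nothing, the nearest neighbor is at distance
more than $c^{i-1}$, while the returned point is at distance at most
$c \cdot c^i = c^{i+1}$.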
\section{Locality-Sensitive Hashing}
Here, our approach will be to devise a weak locality-sensitive hash
function and amplify it to a stronger one. The details of the
primitive hash function will depend on the metric and space used. We
shall first present how to perform the amplification, and then discuss
some particular weak hash functions.
\subsection{Amplification}
These results are due to Piotr Indyk and Rajeev Motwani~\cite{xneighbors}.
The idea is to solve $c$-approximate Near Neighbor with hashing.
Loosely speaking, we want to construct hash functions $g \colon
\mathbb{R}^d \longrightarrow U$ such that
\begin{itemize}
\item if $\norm{p-q} \leq r$, then $\operatorname{Pr}\left[g(p)=g(q)\right]$ is not too small,
\item if $\norm{p-q} > cr$, then $\operatorname{Pr}\left[g(p)=g(q)\right]$ is small.
\end{itemize}
More formally, a family $H$ of functions $h \colon \mathbb{R}^d
\longrightarrow U$ is called $(P_1,P_2,r,cr)$-sensitive if for any
$p,q$
\begin{itemize}
\item if $\norm{p-q} \leq r$, then $\operatorname{Pr}\left[h(p)=h(q)\right] \geq P_1$,
\item if $\norm{p-q} > cr$, then $\operatorname{Pr}\left[h(p)=h(q)\right] \leq P_2$.
\end{itemize}
Here the probability is over the random choice of $h$ from $H$, and we need $P_1 > P_2$.
We now amplify the gap between $P_1$ and $P_2$ by concatenating
$k=\log_{P_1/P_2} 2n$ independently chosen such hash functions to get a new hash
function $h'$. Then,
\begin{itemize}
\item if $\norm{p-q} \leq r$, then $\operatorname{Pr}\left[h'(p)=h'(q)\right] \geq P_1^{\log_{P_1/P_2} 2n} = 2n \cdot P_2^{\log_{P_1/P_2} 2n}$,
\item if $\norm{p-q} > cr$, then $\operatorname{Pr}\left[h'(p)=h'(q)\right] \leq P_2^{\log_{P_1/P_2} 2n}$.
\end{itemize}
If we now consider $n^\rho$ such amplified hash functions for
$\rho=\frac{\log P_1}{\log (P_2/P_1)}$ (so that $P_1^{k} = (2n)^{-\rho}$), then the
chance of at least one collision will be $\Omega(1)$ if $\norm{p-q} \leq r$ and small if
$\norm{p-q} > cr$.
This yields query time of $O(dn^\rho \log n)$ and space of $O(n^{\rho
+ 1} + dn)$.
\subsection{LSH for Particular Metrics}
\begin{itemize}
\item \textbf{Euclidean} \par
Here, divide space into hyper-boxes with edge length
$r(1+\epsilon)$, and hash each point to the box containing it. If
$\norm{p-q} \leq r$, then $\operatorname{Pr}\left[h(p)=h(q)\right]
\geq \frac{\epsilon}{1+\epsilon}$. If $\norm{p-q} > r(1+\epsilon)\sqrt{d}$, then
$\operatorname{Pr}\left[h(p)=h(q)\right] = 0$.
\item \textbf{Hamming} \par
Let our family $H$ of hash functions be $\left\{h_i(p) =
p_i\right\}$ where $p_i$ is the $i$-th bit of $p$. Then,
$\operatorname{Pr}\left[h(p)=h(q)\right] = 1-\frac{1}{d} D(p,q)$
where $D(p,q)$ is the Hamming distance between $p$ and $q$.
\end{itemize}
%\bibliography{lec10}
\bibliographystyle{alpha}
\begin{thebibliography}{IM98}
\bibitem[IM98]{xneighbors}
Piotr Indyk and Rajeev Motwani.
\newblock Approximate nearest neighbors: towards removing the curse of
dimensionality.
\newblock In {\em STOC '98: Proceedings of the thirtieth annual ACM symposium
on Theory of computing}, pages 604--613, New York, NY, USA, 1998. ACM Press.
\end{thebibliography}
\end{document}