% proposal-defence.tex

\documentclass{beamer}

\usetheme{Ilmenau}
\usepackage[all]{xy}
\usepackage{graphicx}

\title{Syntax Distance for Dialectometry}
\author{Nathan Sanders}
\date{\today}

\begin{document}
\frame{\titlepage}
\section{Contents}
\begin{frame}
  \frametitle{Contents}
  \begin{itemize}
 \item Overview
  \item Progress
\end{itemize}
\end{frame}

\section{Overview}
\begin{frame}
  \frametitle{Overview}
  \begin{itemize}
 \item Define syntactic distance in two parts: distance measure and
   feature set.
 \item Test the distance on a dialect corpus, Swediasyn.
\end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Terms}
  \begin{itemize}
  \item Dialectology: Study of linguistic variation
  \item Dialectometry: Quantitative analysis of linguistic variation,
    recently dominated by computational methods.
 \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Terms: Distance Measure}
  A distance measure consists of two functions:
  \begin{enumerate}
  \item $f\colon \text{corpus} \to \text{features}$
  \item $d\colon (\text{features}, \text{features}) \to \text{distance}$
  \end{enumerate}
  Then the distance measure is
 \[ m(c_1,c_2) = d(f(c_1), f(c_2)) \]
\end{frame}

\begin{frame}
  \frametitle{Terms: Distance}
  \begin{enumerate}
  \item Divergence: never negative, and equal to 0 only for two equal
    corpora
  \item Dissimilarity: a divergence that is symmetric
  \item Distance: a dissimilarity that satisfies the triangle
    inequality
  \end{enumerate}
  Dialectometry only requires a dissimilarity. For the rest of the talk,
  I use `distance' to mean `dissimilarity'.
\end{frame}

\begin{frame}
  \frametitle{Distance Measure}
  \begin{itemize}
  \item Phonology typically uses Levenshtein distance, which requires
    that the same words be elicited from all interviewees.
  \item Syntax does not have a standard distance measure.
  \item This dissertation tries several variations: $R$ / $R^2$,
   Kullback--Leibler divergence, and Jensen--Shannon divergence.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Features}
  \begin{itemize}
  \item Phonology decomposes words into segments or distinctive features.
  \item Syntax does not have a standard decomposition of sentences into features.
  \item This dissertation tries several variations on trigrams,
    leaf-ancestor paths, and dependency paths.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Questions}
  \begin{enumerate}
  \item Does $R$ reproduce the results of dialectology?
  \item (If $R$ doesn't, does any other distance measure?)
  \item What features produce the results that best match
    dialectology?
 \end{enumerate}
 NOT: Does syntactic dialectometry reproduce the results of
 phonological dialectometry?
\end{frame}

\section{Progress}

\begin{frame}
  \frametitle{Experiment}
  Find syntactic differences between dialects of Swedish
  \begin{enumerate}
  \item Corpus: Swediasyn, unparsed interviews transcribed and glossed
    to standard Swedish
  \item Training corpus: Talbanken, 300 Kwords of parsed spoken and
    written Swedish from the late 70s.
  \item Annotators: TnT, MaltParser and the Berkeley parser, all
    trained on Talbanken.
  \end{enumerate}
\end{frame}

\begin{frame}
\frametitle{Progress So Far}
\begin{itemize}
\item Distance ($R$)
\item Alternate distances ($R^2$, KL divergence, JS divergence)
\item Alternate features (POS tags taken from Berkeley parser,
  unigrams, dependencies with arc labels instead of node labels)
\item Statistical significance of each combination of distances and features.
\item 5 most heavily weighted features for each comparison.
\item Hierarchical clustering.
\item Multi-dimensional scaling.
\end{itemize}
\end{frame}

\begin{frame}
  \includegraphics [width=0.6\textwidth]{dist-10-1000-r-dep-interview-clusterward} %
\end{frame}
\begin{frame}
  \includegraphics[width=0.6\textwidth]{Sverigekarta-Landskap-mds-dep}
\end{frame}
\begin{frame}
\frametitle{To Analyze}
\begin{itemize}
\item Tweaking parser parameters and input cleaning.
\item Better ways to characterise the most important features of each
  comparison.
\item Comparison to dialectology literature.
\end{itemize}
\end{frame}


\begin{frame}
\frametitle{Progress So Far // To Write}
\begin{itemize}
\item Introduction (half // revision and overview)
\item Questions (most // revision)
\item Methods (most // updated methods)
\item Results (some figures // final analysis and more figures)
\item Discussion (none // comparison to dialectology)
\end{itemize}
\end{frame}

\end{document}