-
Notifications
You must be signed in to change notification settings - Fork 2
/
proposal-defence.tex
155 lines (138 loc) · 4.37 KB
/
proposal-defence.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
\documentclass{beamer}
\usetheme{Ilmenau}
\usepackage[all]{xy}
\usepackage{graphicx}
\title{Syntax Distance for Dialectometry}
\author{Nathan Sanders}
\date{\today}
\begin{document}
\frame{\titlepage}
\section{Contents}
\begin{frame}
\frametitle{Contents}
\begin{itemize}
\item Overview
\item Progress
\end{itemize}
\end{frame}
\section{Overview}
\begin{frame}
\frametitle{Overview}
\begin{itemize}
\item Define syntactic distance in two parts: distance measure and
feature set.
\item Test the distance on a dialect corpus, Swediasyn.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Terms}
\begin{itemize}
\item Dialectology: Study of linguistic variation
\item Dialectometry: Quantitative analysis of linguistic variation,
recently dominated by computational methods.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Terms: Distance Measure}
A distance measure consists of two functions:
\begin{enumerate}
\item $f \colon \text{corpus} \to \text{features}$
\item $d \colon (\text{features}, \text{features}) \to \text{distance}$
\end{enumerate}
Then the distance measure is
\[ m(c_1,c_2) = d(f(c_1), f(c_2)) \]
\end{frame}
\begin{frame}
\frametitle{Terms: Distance}
\begin{enumerate}
\item Divergence: non-negative, and equal to 0 only for two equal
corpora
\item Dissimilarity: a divergence that is symmetric
\item Distance: a dissimilarity that satisfies the triangle
inequality
\end{enumerate}
Dialectometry only requires a dissimilarity. For the rest of the talk,
I use `distance' to mean `dissimilarity'.
\end{frame}
\begin{frame}
\frametitle{Distance Measure}
\begin{itemize}
\item Phonology typically uses Levenshtein distance, which requires
that the same words be elicited from all interviewees.
\item Syntax does not have a standard distance measure.
\item This dissertation tries several variations: $R$ / $R^2$ and
Kullback--Leibler and Jensen--Shannon divergence.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Features}
\begin{itemize}
\item Phonology decomposes words into segments or distinctive features.
\item Syntax does not have a standard decomposition of sentences into features.
\item This dissertation tries several variations on trigrams,
leaf-ancestor paths, and dependency paths.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Questions}
\begin{enumerate}
\item Does $R$ reproduce the results of dialectology?
\item (If $R$ doesn't, does any other distance measure?)
\item What features produce the results that best match
dialectology?
\end{enumerate}
NOT: Does syntactic dialectometry reproduce the results of
phonological dialectometry?
\end{frame}
\section{Progress}
\begin{frame}
\frametitle{Experiment}
Find syntactic differences between dialects of Swedish
\begin{enumerate}
\item Corpus: Swediasyn, unparsed interviews transcribed and glossed
to standard Swedish
\item Training corpus: Talbanken, 300 Kwords of parsed spoken and
written Swedish from the late 70s.
\item Annotators: TnT, MaltParser and the Berkeley parser, all
trained on Talbanken.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Progress So Far}
\begin{itemize}
\item Distance ($R$)
\item Alternate distances ($R^2$, KL divergence, JS divergence)
\item Alternate features (POS tags taken from Berkeley parser,
unigrams, dependencies with arc labels instead of node labels)
\item Statistical significance of each combination of distances and features.
\item 5 most heavily weighted features for each comparison.
\item Hierarchical clustering.
\item Multi-dimensional scaling.
\end{itemize}
\end{frame}
\begin{frame}
\includegraphics[width=0.6\textwidth]{dist-10-1000-r-dep-interview-clusterward}
\end{frame}
\begin{frame}
\includegraphics[width=0.6\textwidth]{Sverigekarta-Landskap-mds-dep}
\end{frame}
\begin{frame}
\frametitle{To Analyze}
\begin{itemize}
\item Tweaking parser parameters and input cleaning.
\item Better ways to characterise the most important features of each
comparison.
\item Comparison to dialectology literature.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Progress So Far // To Write}
\begin{itemize}
\item Introduction (half // revision and overview)
\item Questions (most // revision)
\item Methods (most // updated methods)
\item Results (some figures // final analysis and more figures)
\item Discussion (none // comparison to dialectology)
\end{itemize}
\end{frame}
\end{document}