Skip to content

Commit

Permalink
Implement doc2vec #3
Browse files Browse the repository at this point in the history
  • Loading branch information
fnielsen committed Apr 5, 2017
1 parent a0cacca commit 3cc88da
Showing 1 changed file with 94 additions and 23 deletions.
117 changes: 94 additions & 23 deletions dasem/dannet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
Usage:
dasem.dannet build-sqlite-database [options]
dasem.dannet doc2vec-most-similar [options] <document>
dasem.dannet download [options]
dasem.dannet fasttext-vector [options] <word>
dasem.dannet get-all-sentences [options]
dasem.dannet get-all-tokenized-sentences [options]
dasem.dannet show-glossary <word> [options]
dasem.dannet fasttext-most-similar [options] <word>
dasem.dannet show [options] <dataset>
dasem.dannet train-and-save-doc2vec [options]
dasem.dannet train-and-save-fasttext [options]
Options:
Expand Down Expand Up @@ -69,6 +71,8 @@

from zipfile import ZipFile

from gensim.models.doc2vec import TaggedDocument

from db import DB

from nltk.stem.snowball import DanishStemmer
Expand Down Expand Up @@ -96,7 +100,28 @@
DANNET_CSV_ZIP_URL = 'http://www.wordnet.dk/DanNet-2.2_csv.zip'


class Dannet(Corpus):
class DataDirectoryMixin(object):
    """Mixin that supplies the DanNet data directory.

    This mixin should appear first in the inheritance list so that its
    `data_directory` method takes precedence over the one defined on the
    abstract base class.
    """

    def data_directory(self):
        """Return directory where data should be stored.

        Returns
        -------
        directory : str
            Directory.

        """
        # NOTE(review): `Dannet` (elsewhere in this file) lists this mixin
        # *after* `Corpus`, contrary to the ordering described above — confirm.
        return join(data_directory(), 'dannet')


class Dannet(Corpus, DataDirectoryMixin):
"""Dannet.
Using the module will automagically download the data from the Dannet
Expand Down Expand Up @@ -160,18 +185,6 @@ def __init__(self):

self._db = None

def data_directory(self):
"""Return diretory where data should be.
Returns
-------
directory : str
Directory.
"""
directory = join(data_directory(), 'dannet')
return directory

@property
def db(self):
"""Return a db.py instance with DanNet data."""
Expand Down Expand Up @@ -510,25 +523,63 @@ def build_sqlite_database(
df.to_sql(table, con=connection, if_exists=if_exists)


class FastText(models.FastText):
"""FastText on Dannet corpus.
class TaggedDocumentsIterable(object):
    """Restartable iterable over tagged documents from DanNet sentences.

    Each iteration yields a gensim `TaggedDocument` whose words are the
    tokens of one DanNet sentence and whose single tag is the running
    index of that sentence.

    It requires that a file called `sentences.txt` is available in the data
    directory.

    Parameters
    ----------
    lower : bool, default True
        Lower case the words.
    stem : bool, default False
        Apply word stemming. DanishStemmer from nltk is used.

    """

    def __init__(self, lower=True, stem=False):
        """Store iteration options."""
        self.lower = lower
        self.stem = stem

    def __iter__(self):
        """Restart iteration and yield tagged documents."""
        # A fresh Dannet instance per iteration makes the iterable
        # restartable (gensim training passes over the corpus repeatedly).
        corpus = Dannet()
        words_iterable = corpus.iter_sentence_words(
            lower=self.lower, stem=self.stem)
        for index, words in enumerate(words_iterable):
            yield TaggedDocument(words, [index])


class Doc2Vec(DataDirectoryMixin, models.Doc2Vec):
    """Doc2Vec model for the Dannet corpus."""

    def iterable_tagged_documents(self, lower=True, stem=False):
        """Return iterable of tagged documents.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Returns
        -------
        tagged_documents : iterable
            Iterable over gensim `TaggedDocument` objects, one per
            DanNet sentence.

        """
        return TaggedDocumentsIterable(lower=lower, stem=stem)


class FastText(DataDirectoryMixin, models.FastText):
    """FastText on Dannet corpus.

    Behavior comes entirely from `models.FastText`; the mixin only points
    the model at the DanNet data directory.

    It requires that a file called `sentences.txt` is available in the data
    directory.

    """

    pass


def main():
Expand Down Expand Up @@ -596,6 +647,15 @@ def main():
dannet = Dannet()
dannet.build_sqlite_database()

elif arguments['doc2vec-most-similar']:
document = arguments['<document>']
if not isinstance(document, text_type):
document = document.decode(input_encoding)

doc2vec = Doc2Vec()
for word, similarity in doc2vec.most_similar(document.split()):
write(output_file, word.encode('utf-8') + b('\n'))

elif arguments['download']:
dannet = Dannet()
dannet.download()
Expand Down Expand Up @@ -637,13 +697,24 @@ def main():
for gloss in glossary:
write(output_file, gloss.encode('utf-8') + b('\n'))

elif arguments['train-and-save-doc2vec']:
doc2vec = Doc2Vec()
if input_filename:
doc2vec.train(input_filename=input_filename)
else:
doc2vec.train()

elif arguments['train-and-save-fasttext']:
fast_text = FastText()
if input_filename:
fast_text.train(input_filename=input_filename)
else:
fast_text.train()

else:
# Coding error if we arrive here
assert False


if __name__ == '__main__':
main()

0 comments on commit 3cc88da

Please sign in to comment.