From 3cc88da87a328a849b0cacb117edb0f6177a5985 Mon Sep 17 00:00:00 2001 From: fnielsen Date: Wed, 5 Apr 2017 16:11:21 +0200 Subject: [PATCH] Implement doc2vec #3 --- dasem/dannet.py | 117 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 23 deletions(-) diff --git a/dasem/dannet.py b/dasem/dannet.py index a75de6a..6cef6fe 100644 --- a/dasem/dannet.py +++ b/dasem/dannet.py @@ -2,6 +2,7 @@ Usage: dasem.dannet build-sqlite-database [options] + dasem.dannet doc2vec-most-similar [options] dasem.dannet download [options] dasem.dannet fasttext-vector [options] dasem.dannet get-all-sentences [options] @@ -9,6 +10,7 @@ dasem.dannet show-glossary [options] dasem.dannet fasttext-most-similar [options] dasem.dannet show [options] + dasem.dannet train-and-save-doc2vec [options] dasem.dannet train-and-save-fasttext [options] Options: @@ -69,6 +71,8 @@ from zipfile import ZipFile +from gensim.models.doc2vec import TaggedDocument + from db import DB from nltk.stem.snowball import DanishStemmer @@ -96,7 +100,28 @@ DANNET_CSV_ZIP_URL = 'http://www.wordnet.dk/DanNet-2.2_csv.zip' -class Dannet(Corpus): +class DataDirectoryMixin(object): + """Class to specify data directory. + + This class should have first inheritance, so that its `data_directory` + method is called before the abstract class. + + """ + + def data_directory(self): + """Return directory where data should be. + + Returns + ------- + directory : str + Directory. + + """ + directory = join(data_directory(), 'dannet') + return directory + + +class Dannet(DataDirectoryMixin, Corpus): """Dannet. Using the module will automagically download the data from the Dannet @@ -160,18 +185,6 @@ def __init__(self): self._db = None - def data_directory(self): - """Return diretory where data should be. - - Returns - ------- - directory : str - Directory. 
- - """ - directory = join(data_directory(), 'dannet') - return directory - @property def db(self): """Return a db.py instance with DanNet data.""" @@ -510,25 +523,63 @@ def build_sqlite_database( df.to_sql(table, con=connection, if_exists=if_exists) -class FastText(models.FastText): - """FastText on Dannet corpus. +class TaggedDocumentsIterable(object): + """Iterable over tagged documents built from sentence words. - It requires that a file called `sentences.txt` is available in the data - directory. + Parameters + ---------- + lower : bool, default True + Lower case the words. + stem : bool, default False + Apply word stemming. DanishStemmer from nltk is used. """ - def data_directory(self): - """Return data directory. + def __init__(self, lower=True, stem=False): + """Setup options.""" + self.lower = lower + self.stem = stem + + def __iter__(self): + """Restart and return iterable.""" + dannet = Dannet() + for n, sentence_words in enumerate(dannet.iter_sentence_words( + lower=self.lower, stem=self.stem)): + tagged_document = TaggedDocument(sentence_words, [n]) + yield tagged_document + + +class Doc2Vec(DataDirectoryMixin, models.Doc2Vec): + """Doc2Vec model for the Dannet corpus.""" + + def iterable_tagged_documents(self, lower=True, stem=False): + """Return iterable over tagged documents. + + Parameters + ---------- + lower : bool, default True + Lower case the words. + stem : bool, default False + Apply word stemming. DanishStemmer from nltk is used. Returns ------- - directory : str - Directory for data. + tagged_documents : iterable + Iterable over tagged documents """ - directory = join(data_directory(), 'dannet') - return directory + tagged_documents = TaggedDocumentsIterable(lower=lower, stem=stem) + return tagged_documents + + +class FastText(DataDirectoryMixin, models.FastText): + """FastText on Dannet corpus. + + It requires that a file called `sentences.txt` is available in the data + directory. 
+ + """ + pass def main(): @@ -596,6 +647,15 @@ def main(): dannet = Dannet() dannet.build_sqlite_database() + elif arguments['doc2vec-most-similar']: + document = arguments[''] + if not isinstance(document, text_type): + document = document.decode(input_encoding) + + doc2vec = Doc2Vec() + for word, similarity in doc2vec.most_similar(document.split()): + write(output_file, word.encode('utf-8') + b('\n')) + elif arguments['download']: dannet = Dannet() dannet.download() @@ -637,6 +697,13 @@ def main(): for gloss in glossary: write(output_file, gloss.encode('utf-8') + b('\n')) + elif arguments['train-and-save-doc2vec']: + doc2vec = Doc2Vec() + if input_filename: + doc2vec.train(input_filename=input_filename) + else: + doc2vec.train() + elif arguments['train-and-save-fasttext']: fast_text = FastText() if input_filename: @@ -644,6 +711,10 @@ def main(): else: fast_text.train() + else: + # Coding error if we arrive here + assert False + if __name__ == '__main__': main()