diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 6bd96da716..273759aca6 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Blei's LDA-C format. -""" +"""Blei's LDA-C format.""" from __future__ import with_statement @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. - `fname_vocab` is the file with vocabulary; if not specified, it defaults to - `fname.vocab`. + Args: + fname (str): serialized corpus's filename + fname_vocab (str): vocabulary file; takes precedence over fname.vocab """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -85,7 +84,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the LDA-C format. @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): This function is automatically called by `BleiCorpus.serialize`; don't call it directly, call `serialize` instead. + + Args: + """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index af79a2fd5f..ca0debbfd5 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -85,14 +85,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres if progress_cnt is not None: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata) else: if labels is not None: - offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata) else: - offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata) + offsets = serializer.__save_corpus(fname, corpus, id2word, metadata=metadata) if offsets is None: raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d5265f6571..572470af2e 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -140,7 +140,7 @@ def __iter__(self): yield self.line2doc(line) @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the List-of-words format. diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..b6dc482dcc 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -67,7 +67,7 @@ def line2doc(self, line): return doc @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save a corpus in the Mallet format. diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index 2158f0a526..1eaadfb332 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -38,7 +38,7 @@ def __iter__(self): yield doc # get rid of doc id, return the sparse vector only @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """ Save a corpus in the Matrix Market format to disk. diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index 4d0fde4999..d6596aa831 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -764,7 +764,7 @@ def load(cls, fname, mmap=None): return super(ShardedCorpus, cls).load(fname, mmap) @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs): """ Implement a serialization interface. Do not call directly; use the `serialize` method instead. @@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres Ignore the parameters id2word, index_fname, progress_cnt, labels and metadata. They currently do nothing and are here only to provide a compatible method signature with superclass.""" - serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) + serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 290414836e..34527c31b7 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -79,7 +79,7 @@ def __iter__(self): self.length = lineno + 1 @staticmethod - def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): + def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """ Save a corpus in the SVMlight format. diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..995ce3e6ad 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -192,7 +192,7 @@ def create_dictionary(self): return dictionary @staticmethod - def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): + def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): """ Save a corpus in the UCI Bag-of-Words format. diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 81f85a8527..cf9a10b123 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -71,14 +71,14 @@ def __len__(self): # return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod - def save_corpus(fname, corpus, id2word=None, metadata=False): + def __save_corpus(fname, corpus, id2word=None, metadata=False): """ Save an existing `corpus` to disk. Some formats also support saving the dictionary (`feature_id->word` mapping), which can in this case be provided by the optional `id2word` parameter. - >>> MmCorpus.save_corpus('file.mm', corpus) + >>> MmCorpus.__save_corpus('file.mm', corpus) Some corpora also support an index of where each document begins, so that the documents on disk can be accessed in O(1) time (see the @@ -100,6 +100,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): fmt = str(doc) # format the document appropriately... fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk + def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, + metadata=False): + pass + class TransformedCorpus(CorpusABC): def __init__(self, obj, corpus, chunksize=None, **kwargs): diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 1f450a457a..34e9e7bc6b 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices): """ logger.info("serializing temporary corpus to %s", self.fcorpustxt()) # write out the corpus in a file format that DTM understands: - corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) + corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 3f7f4e8149..9f2c1967f0 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -109,7 +109,7 @@ def test_save(self): corpus = self.TEST_CORPUS # make sure the corpus can be saved - self.corpus_class.save_corpus(testfile(), corpus) + self.corpus_class.__save_corpus(testfile(), corpus) # and loaded back, resulting in exactly the same corpus corpus2 = list(self.corpus_class(testfile())) @@ -253,7 +253,7 @@ def setUp(self): def test_save_format_for_dtm(self): corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []] test_file = testfile() - self.corpus_class.save_corpus(test_file, corpus) + self.corpus_class.__save_corpus(test_file, corpus) with open(test_file) as f: for line in f: # unique_word_count index1:count1 index2:count2 ... indexn:counnt diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py index dd660f629f..e126d2ccb8 100644 --- a/gensim/test/test_miislita.py +++ b/gensim/test/test_miislita.py @@ -64,7 +64,7 @@ def test_textcorpus(self): # make sure serializing works ftmp = get_tmpfile('test_textcorpus.mm') - corpora.MmCorpus.save_corpus(ftmp, miislita) + corpora.MmCorpus.__save_corpus(ftmp, miislita) self.assertTrue(os.path.exists(ftmp)) # make sure deserializing gives the same result