Make save_corpus private

anotherbugmaster · Oct 2, 2017 · 36d98d1 · 36d98d1
1 parent b260d4b
commit 36d98d1
Show file tree

Hide file tree

Showing 12 changed files with 29 additions and 23 deletions.
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -5,9 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Blei's LDA-C format.
-"""
+"""Blei's LDA-C format."""
 
 from __future__ import with_statement
 
@@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None):
         """
         Initialize the corpus from a file.
 
-        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
-        `fname.vocab`.
+        Args:
+            fname (str): filename of the serialized corpus
+            fname_vocab (str, optional): vocabulary file; if not specified, defaults to `fname.vocab`
         """
         IndexedCorpus.__init__(self, fname)
         logger.info("loading corpus from %s", fname)
@@ -85,7 +84,7 @@ def line2doc(self, line):
         return doc
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the LDA-C format.
 
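@@ -94,6 +93,12 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
 
         This function is automatically called by `BleiCorpus.serialize`; don't
         call it directly, call `serialize` instead.
+
+        Args:
+            fname (str): file the corpus is saved to
+            corpus: documents to save, each a list of `(word_id, value)` pairs
+            id2word: word id mapping; if None, it is initialized from `corpus`
+            metadata (bool): part of the shared `save_corpus` signature; its effect here is unconfirmed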
         """
         if id2word is None:
             logger.info("no word id mapping provided; initializing from corpus")

diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
@@ -85,14 +85,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
 
         if progress_cnt is not None:
             if labels is not None:
-                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
+                offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
             else:
-                offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
+                offsets = serializer.__save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
         else:
             if labels is not None:
-                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
+                offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
             else:
-                offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
+                offsets = serializer.__save_corpus(fname, corpus, id2word, metadata=metadata)
 
         if offsets is None:
             raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__)

diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
@@ -140,7 +140,7 @@ def __iter__(self):
                     yield self.line2doc(line)
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the List-of-words format.
 

diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
@@ -67,7 +67,7 @@ def line2doc(self, line):
             return doc
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save a corpus in the Mallet format.
 

diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py
@@ -38,7 +38,7 @@ def __iter__(self):
             yield doc  # get rid of doc id, return the sparse vector only
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
         """
         Save a corpus in the Matrix Market format to disk.
 

diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py
@@ -764,7 +764,7 @@ def load(cls, fname, mmap=None):
         return super(ShardedCorpus, cls).load(fname, mmap)
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
         """
         Implement a serialization interface. Do not call directly;
         use the `serialize` method instead.
@@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
         Ignore the parameters id2word, index_fname, progress_cnt, labels
         and metadata. They currently do nothing and are here only to
         provide a compatible method signature with superclass."""
-        serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
+        serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py
@@ -79,7 +79,7 @@ def __iter__(self):
         self.length = lineno + 1
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
         """
         Save a corpus in the SVMlight format.
 

diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
@@ -192,7 +192,7 @@ def create_dictionary(self):
         return dictionary
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
         """
         Save a corpus in the UCI Bag-of-Words format.
 

diff --git a/gensim/interfaces.py b/gensim/interfaces.py
@@ -71,14 +71,14 @@ def __len__(self):
 #        return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus
 
     @staticmethod
-    def save_corpus(fname, corpus, id2word=None, metadata=False):
+    def __save_corpus(fname, corpus, id2word=None, metadata=False):
         """
         Save an existing `corpus` to disk.
 
         Some formats also support saving the dictionary (`feature_id->word` mapping),
         which can in this case be provided by the optional `id2word` parameter.
 
-        >>> MmCorpus.save_corpus('file.mm', corpus)
+        >>> MmCorpus.__save_corpus('file.mm', corpus)
 
         Some corpora also support an index of where each document begins, so
         that the documents on disk can be accessed in O(1) time (see the
@@ -100,6 +100,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
                 fmt = str(doc)  # format the document appropriately...
                 fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
 
+    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None,
+                  metadata=False):
+        pass
+
 
 class TransformedCorpus(CorpusABC):
     def __init__(self, obj, corpus, chunksize=None, **kwargs):

diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
@@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices):
         """
         logger.info("serializing temporary corpus to %s", self.fcorpustxt())
         # write out the corpus in a file format that DTM understands:
-        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
+        corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus)
 
         with utils.smart_open(self.ftimeslices(), 'wb') as fout:
             fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -109,7 +109,7 @@ def test_save(self):
         corpus = self.TEST_CORPUS
 
         # make sure the corpus can be saved
-        self.corpus_class.save_corpus(testfile(), corpus)
+        self.corpus_class.__save_corpus(testfile(), corpus)
 
         # and loaded back, resulting in exactly the same corpus
         corpus2 = list(self.corpus_class(testfile()))
@@ -253,7 +253,7 @@ def setUp(self):
     def test_save_format_for_dtm(self):
         corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
         test_file = testfile()
-        self.corpus_class.save_corpus(test_file, corpus)
+        self.corpus_class.__save_corpus(test_file, corpus)
         with open(test_file) as f:
             for line in f:
                 # unique_word_count index1:count1 index2:count2 ... indexn:counnt

diff --git a/gensim/test/test_miislita.py b/gensim/test/test_miislita.py
@@ -64,7 +64,7 @@ def test_textcorpus(self):
 
         # make sure serializing works
         ftmp = get_tmpfile('test_textcorpus.mm')
-        corpora.MmCorpus.save_corpus(ftmp, miislita)
+        corpora.MmCorpus.__save_corpus(ftmp, miislita)
         self.assertTrue(os.path.exists(ftmp))
 
         # make sure deserializing gives the same result