Skip to content

Commit

Permalink
Make save_corpus private
Browse files Browse the repository at this point in the history
  • Loading branch information
anotherbugmaster committed Oct 2, 2017
1 parent b260d4b commit 36d98d1
Show file tree
Hide file tree
Showing 12 changed files with 29 additions and 23 deletions.
14 changes: 8 additions & 6 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""
"""Blei's LDA-C format."""

from __future__ import with_statement

Expand Down Expand Up @@ -41,8 +39,9 @@ def __init__(self, fname, fname_vocab=None):
"""
Initialize the corpus from a file.
`fname_vocab` is the file with vocabulary; if not specified, it defaults to
`fname.vocab`.
Args:
fname (str): serialized corpus's filename
fname_vocab (str): vocabulary file; takes precedence over fname.vocab
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
Expand Down Expand Up @@ -85,7 +84,7 @@ def line2doc(self, line):
return doc

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the LDA-C format.
Expand All @@ -94,6 +93,9 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
This function is automatically called by `BleiCorpus.serialize`; don't
call it directly, call `serialize` instead.
Args:
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
Expand Down
8 changes: 4 additions & 4 deletions gensim/corpora/indexedcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,14 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres

if progress_cnt is not None:
if labels is not None:
offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
else:
offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
offsets = serializer.__save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
else:
if labels is not None:
offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
offsets = serializer.__save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
else:
offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
offsets = serializer.__save_corpus(fname, corpus, id2word, metadata=metadata)

if offsets is None:
raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__)
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def __iter__(self):
yield self.line2doc(line)

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the List-of-words format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/malletcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def line2doc(self, line):
return doc

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the Mallet format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __iter__(self):
yield doc # get rid of doc id, return the sparse vector only

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False):
"""
Save a corpus in the Matrix Market format to disk.
Expand Down
4 changes: 2 additions & 2 deletions gensim/corpora/sharded_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ def load(cls, fname, mmap=None):
return super(ShardedCorpus, cls).load(fname, mmap)

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False, **kwargs):
"""
Implement a serialization interface. Do not call directly;
use the `serialize` method instead.
Expand Down Expand Up @@ -799,4 +799,4 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
Ignore the parameters id2word, index_fname, progress_cnt, labels
and metadata. They currently do nothing and are here only to
provide a compatible method signature with superclass."""
serializer.save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
serializer.__save_corpus(fname, corpus, id2word=id2word, progress_cnt=progress_cnt, metadata=metadata, **kwargs)
2 changes: 1 addition & 1 deletion gensim/corpora/svmlightcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __iter__(self):
self.length = lineno + 1

@staticmethod
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
def __save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
"""
Save a corpus in the SVMlight format.
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/ucicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def create_dictionary(self):
return dictionary

@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
def __save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
"""
Save a corpus in the UCI Bag-of-Words format.
Expand Down
8 changes: 6 additions & 2 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@ def __len__(self):
# return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
def __save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save an existing `corpus` to disk.
Some formats also support saving the dictionary (`feature_id->word` mapping),
which can in this case be provided by the optional `id2word` parameter.
>>> MmCorpus.save_corpus('file.mm', corpus)
>>> MmCorpus.__save_corpus('file.mm', corpus)
Some corpora also support an index of where each document begins, so
that the documents on disk can be accessed in O(1) time (see the
Expand All @@ -100,6 +100,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
fmt = str(doc) # format the document appropriately...
fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk

def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None,
metadata=False):
pass


class TransformedCorpus(CorpusABC):
def __init__(self, obj, corpus, chunksize=None, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/wrappers/dtmmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def convert_input(self, corpus, time_slices):
"""
logger.info("serializing temporary corpus to %s", self.fcorpustxt())
# write out the corpus in a file format that DTM understands:
corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
corpora.BleiCorpus.__save_corpus(self.fcorpustxt(), corpus)

with utils.smart_open(self.ftimeslices(), 'wb') as fout:
fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
Expand Down
4 changes: 2 additions & 2 deletions gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_save(self):
corpus = self.TEST_CORPUS

# make sure the corpus can be saved
self.corpus_class.save_corpus(testfile(), corpus)
self.corpus_class.__save_corpus(testfile(), corpus)

# and loaded back, resulting in exactly the same corpus
corpus2 = list(self.corpus_class(testfile()))
Expand Down Expand Up @@ -253,7 +253,7 @@ def setUp(self):
def test_save_format_for_dtm(self):
corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
test_file = testfile()
self.corpus_class.save_corpus(test_file, corpus)
self.corpus_class.__save_corpus(test_file, corpus)
with open(test_file) as f:
for line in f:
# unique_word_count index1:count1 index2:count2 ... indexn:countn
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_miislita.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_textcorpus(self):

# make sure serializing works
ftmp = get_tmpfile('test_textcorpus.mm')
corpora.MmCorpus.save_corpus(ftmp, miislita)
corpora.MmCorpus.__save_corpus(ftmp, miislita)
self.assertTrue(os.path.exists(ftmp))

# make sure deserializing gives the same result
Expand Down

0 comments on commit 36d98d1

Please sign in to comment.