Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #671 #1005

Merged
merged 1 commit into from
Nov 10, 2016
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions gensim/models/wrappers/ldamallet.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True):
self.corpus2mallet(corpus, fout)

# convert the text file above into MALLET's internal format
cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s"
cmd = self.mallet_path + ' import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input %s --output %s'
if infer:
cmd += ' --use-pipe-from ' + self.fcorpusmallet()
cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
Expand All @@ -166,7 +166,7 @@ def train(self, corpus):
logger.info("training MALLET LDA with %s", cmd)
check_output(cmd, shell=True)
self.word_topics = self.load_word_topics()
# NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
# NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
# word_topics has replaced wordtopics throughout the code; wordtopics just stores the values of word_topics when train is called.
self.wordtopics = self.word_topics

Expand Down Expand Up @@ -260,20 +260,20 @@ def get_version(self, direc_path):
Check version of mallet via jar file
"""
archive = zipfile.ZipFile(direc_path, 'r')
if u'cc/mallet/regression/' not in archive.namelist():
if u'cc/mallet/regression/' not in archive.namelist():
return '2.0.7'
else:
return '2.0.8RC3'
except Exception:

xml_path = direc_path.split("bin")[0]
try:
doc = et.parse(xml_path + "pom.xml").getroot()
namespace = doc.tag[:doc.tag.index('}') + 1]
return doc.find(namespace + 'version').text.split("-")[0]
except Exception:
return "Can't parse pom.xml version file"



def read_doctopics(self, fname, eps=1e-6, renorm=True):
Expand Down Expand Up @@ -304,7 +304,7 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True):
if mallet_version == "2.0.7":
"""

1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364
1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364
2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008 2 0.009062076892971008 1 0.009062076892971008
The example above mixes the formats handled by the preceding if and elif branches: a line has neither `2*num_topics` nor `num_topics` elements.
It contains two formats, 40.009062076892971008 and 0 1.0780612802674239, which the preceding if/elif branches cannot handle.
Expand All @@ -316,14 +316,14 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True):
doc = []
if len(parts) > 0:
while count < len(parts):
"""
"""
if section is to deal with formats of type 2 0.034
so if count reaches index of 2 and since int(2) == float(2) so if block is executed
now there is one extra element after 2, so count + 1 access should not give an error

else section handles formats of type 20.034
now count is there on index of 20.034 since float(20.034) != int(20.034) so else block
is executed
is executed

"""
if float(parts[count]) == int(parts[count]):
Expand Down