-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda.py
102 lines (83 loc) · 3.06 KB
/
lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Usage:
lda.py <CORPUS>
Options:
-k --ntopics Number of topics to use [default: 20].
-a --alpha LDA Alpha Parameter [default: 0.3].
-b --beta LDA Beta Parameter [default: 0.3].
-f --format Format of the CORPUS file [default: pickle].
-n --iter Number of sample iterations to perform [default: 2000]
-b --burn Number of samples to discard at first [default: 300]
-t --thin Interval of iterations to measure [default: 1]
A reference implementation of LDA using Gibbs Sampling.
"""
from docopt import docopt
import numpy as np
import pickle
import pymc3 as pm
import scipy as sp
from scipy import sparse
def save_sparse(fname, array):
np.savez(fname, data=array.data, indices=array.indices, indptr=array.indptr,
shape=array.shape)
def load_csr(fname):
loader = np.load(fname)
return sparse.csr_matrix((loader['data'], loader['indices'],
loader['indptr']), shape=loader['shape'])
def load_csc(fname):
loader = np.load(fname)
return sparse.csc_matrix((loader['data'], loader['indices'],
loader['indptr']), shape=loader['shape'])
def load_pickle(fname):
with open(fname, 'rb') as f:
return pickle.load(f)
def load(fname, fmt):
if fmt == "pickle":
return load_pickle(fname)
if fmt == "csr":
return load_csr(fname)
if fmt == "csc":
return load_csc(fname)
raise ValueError("Unknown file format %s" % fmt)
def main(args):
K = int(args['--ntopics'])
data = load(args['<CORPUS>'], args['--format'])
D = data.shape[0] # number of documents
V = data.shape[1] # number of words
alpha = np.ones(K) * float(args['--alpha'])
beta = np.ones(V) * float(args['--alpha'])
theta = pm.Container([
pm.CompletedDirichlet(
"theta_%s" % i,
pm.Dirichlet("ptheta_%s" % i, theta=alpha)
) for i in range(D)
])
phi = pm.Container([
pm.CompletedDirichlet(
"phi_%s" % k,
pm.Dirichlet("pphi_%s" % k, theta=beta)
) for k in range(K)
])
Wd = [len(doc) for doc in data]
z = pm.Container([pm.Categorical('z_%i' % d,
p = theta[d],
size=Wd[d],
value=np.random.randint(K, size=Wd[d]))
for d in range(D)])
# cannot use p=phi[z[d][i]] here since phi is an ordinary list while z[d][i] is stochastic
w = pm.Container([pm.Categorical("w_%i_%i" % (d,i),
p = pm.Lambda('phi_z_%i_%i' % (d,i),
lambda z=z[d][i], phi=phi: phi[z]),
value=data[d][i],
observed=True)
for d in range(D) for i in range(Wd[d])])
model = pm.Model([theta, phi, z, w])
mcmc = pm.MCMC(model)
niter = int(args['--iter'])
burn = int(args['--burn'])
thin = int(args['--thin'])
mcmc.sample(niter, burn, thin)
if __name__ == "__main__":
arguments = docopt(__doc__, version="lda.py 0.1.0")
print("ARGS: %s" % arguments)
main(arguments)