-
Notifications
You must be signed in to change notification settings - Fork 1
/
scode-model.h
71 lines (57 loc) · 1.82 KB
/
scode-model.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <math.h>
#include "dlib.h"
/*** svec_t: vectors on unit sphere representing tokens.  svec_t is a
     pointer type; the record it points to (struct svec_s) holds a
     token string, the count of that token, and the token's vector.
     The vectors are unit-length, NDIM dimensional, float arrays. */
typedef struct svec_s *svec_t;
struct svec_s {
  char *key;   // token string
  size_t cnt;  // count for this token
  float *vec;  // the token's vector
};
static inline svec_t svec_new(const char *key) {
svec_t s = _d_malloc(sizeof(struct svec_s));
s->key = _d_strdup(key);
s->cnt = 0;
s->vec = NULL;
return s;
}
/* svec_key(s) yields the key field (the token string) of the svec s. */
#define svec_key(s) ((s)->key)
/* This defines a hash table for svec_t and sget(), see dlib.h for details.
   NOTE(review): the meaning of each D_HASH argument is defined in dlib.h,
   not in this file -- confirm the argument roles there before changing
   any of them. */
D_HASH(s, svec_t, char *, svec_key, d_strmatch, fnv1a, svec_new, d_isnull, d_mknull)
/* svec(h,k) dereferences the pointer returned by sget(h,k,true), i.e. it
   evaluates to the table entry for key k in h.
   NOTE(review): the `true` argument presumably asks sget to create a
   missing entry (via svec_new) -- confirm against dlib.h. */
#define svec(h,k) (*sget(h,k,true))
/* d2(x, y, ndim) returns the squared distance between two ndim
   dimensional vectors x and y, i.e. the sum over j of
   (x[j] - y[j])^2.  Neither vector is written to, so both are taken
   as pointers to const; callers passing plain float * are unaffected.
   The difference of each pair of floats is widened to double before it
   is squared, and the squares are accumulated in double.  Returns 0
   when ndim is 0. */
static inline double d2(const float *x, const float *y, size_t ndim) {
  double ans = 0;
  for (size_t j = 0; j < ndim; j++) {
    double dj = x[j] - y[j];
    ans += dj * dj;
  }
  return ans;
}
/* normalize(x, ndim) scales the ndim dimensional vector x to unit
size. */
static inline float *normalize(float *x, size_t ndim) {
double s = 0;
for (size_t j = 0; j < ndim; j++)
s += x[j] * x[j];
s = sqrt(s);
for (size_t j = 0; j < ndim; j++)
x[j] /= s;
return x;
}
/* Things defined in scode-model.c */
/* model_t is a pointer to a struct model_s, whose per-position arrays
   (v, n, z) are described field by field below. */
typedef struct model_s *model_t;
struct model_s {
  size_t ntok;  // Number of tokens per tuple
  size_t ndim;  // Number of embedding dimensions
  darr_t *v;    // ntok hash tables of string -> svec
  size_t *n;    // ntok counts (for denominator)
  double *z;    // ntok normalization constants (z[0]=0)
};
/* NOTE(review): the functions below are only declared in this header.
   The one-line descriptions are inferred from names and signatures and
   must be confirmed against scode-model.c. */
/* load_model: presumably builds a model from the file named modelfile;
   the role of z is not visible from this header -- TODO confirm. */
model_t load_model(char *modelfile, double z);
/* new_model: presumably creates a model with ntok token positions and
   ndim embedding dimensions -- TODO confirm. */
model_t new_model(size_t ntok, size_t ndim);
/* print_model: presumably writes out the contents of m -- TODO confirm
   destination and format. */
void print_model(model_t m);
/* free_model: presumably releases m and what it owns -- TODO confirm. */
void free_model(model_t m);
/* calcN: presumably computes the count for token position i (compare
   the n field of struct model_s) -- TODO confirm. */
size_t calcN(model_t m, size_t i);
/* calcZ: presumably computes the normalization constant for token
   position i (compare the z field of struct model_s) -- TODO confirm. */
double calcZ(model_t m, size_t i);