-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
simhash.h
76 lines (66 loc) · 1.35 KB
/
simhash.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*******************************************************************************
* DANIEL'S ALGORITHM IMPLEMENTAIONS
*
* /\ | _ _ ._ o _|_ |_ ._ _ _
* /--\ | (_| (_) | | |_ | | | | | _>
* _|
*
* SIMHASH FUNCTIONS
*
* http://matpalm.com/resemblance/simhash/
* http://en.wikipedia.org/wiki/Hamming_distance
*
******************************************************************************/
#ifndef ALGO_SIMHASH_H__
#define ALGO_SIMHASH_H__
#include <string.h>
#include <stdint.h>
#include "hash_string.h"
namespace alg {
class SimHash {
private:
int V[32];
public:
SimHash() {
memset(V, 0, sizeof(V));
}
/**
* hash a single word
*/
void AddWord(const char * word, uint32_t len) {
uint32_t hash = hash_fnv1a(word,len);
for(int i=0;i<32;i++) {
if (hash&(1<<i)) {
V[i]++;
} else {
V[i]--;
}
}
}
/**
* get the hash of the document
*/
uint32_t Hash() {
uint32_t hash=0;
for(int i=0;i<32;i++) {
if (V[i] > 0) {
hash|= (1<<i);
}
}
return hash;
}
/**
* calc the hamming distance of two hash.
*/
static int Distance(uint32_t hash1, uint32_t hash2) {
uint32_t diff = hash1^hash2;
int dist = 0;
while(diff) {
dist++;
diff &=diff-1;
}
return dist;
}
};
}
#endif //