work for v0.9.7
davidohana committed Sep 12, 2021
1 parent 3632639 commit 1a2c6cb
Showing 3 changed files with 105 additions and 52 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -311,6 +311,17 @@ Our project welcomes external contributions. Please refer to [CONTRIBUTING.md](C

## Change Log

##### v0.9.7

* Fixed a bug in original Drain: log clusters were created multiple times for
log messages with fewer tokens than `max_node_depth`.
* Changed the `depth` property name to the more descriptive `max_node_depth`,
as Drain always subtracts 2 from the `depth` argument value. Also added a
`log_cluster_depth` property to reflect the original value of the `depth`
argument (Breaking Change).
* Restricted the `depth` param to a minimum sensible value of 3.
* Added a log cluster count to nodes in `Drain.print_tree()`.
* Added optional log cluster details to `Drain.print_tree()`.

##### v0.9.6

* Fix issue https://github.com/IBM/Drain3/issues/38: Unnecessary update of
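
A minimal usage sketch (editor's illustration, not part of this commit) of the renamed properties and the new validation described in the changelog above, assuming the constructor shown in the drain3/drain.py diff below:

```python
from drain3.drain import Drain

drain = Drain(depth=4)
print(drain.log_cluster_depth)  # 4 - the original depth argument value
print(drain.max_node_depth)     # 2 - depth minus the root and token-count levels

# depth values below 3 are now rejected
try:
    Drain(depth=2)
except ValueError as e:
    print(e)  # depth argument must be at least 3
```
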
124 changes: 73 additions & 51 deletions drain3/drain.py
@@ -63,7 +63,12 @@ def __init__(self,
"""
Attributes
----------
depth : depth of all leaf nodes (nodes that contain log clusters)
depth : max depth levels of log clusters. Minimum is 3.
For example, for depth==4:
Root is considered depth level 1.
Token count is considered depth level 2.
First log token is considered depth level 3.
Log clusters below first token node are considered depth level 4.
sim_th : similarity threshold - if percentage of similar tokens for a log message is below this
number, a new log cluster will be created.
max_children : max number of children of an internal node
@@ -72,7 +77,11 @@ def __init__(self,
with new ones according to the LRU policy.
extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace).
"""
self.depth = depth - 2 # number of prefix tokens in each tree path (exclude root and leaf node)
if depth < 3:
raise ValueError("depth argument must be at least 3")

self.log_cluster_depth = depth
self.max_node_depth = depth - 2 # max depth of a prefix tree node, starting from zero
self.sim_th = sim_th
self.max_children = max_children
self.root_node = Node()
@@ -97,101 +106,103 @@ def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_para

# at first level, children are grouped by token (word) count
token_count = len(tokens)
parent_node = root_node.key_to_child_node.get(str(token_count))
cur_node = root_node.key_to_child_node.get(str(token_count))

# no template with same token count yet
if parent_node is None:
if cur_node is None:
return None

# handle case of empty log string - return the single cluster in that group
if token_count == 0:
return self.id_to_cluster.get(parent_node.cluster_ids[0])
return self.id_to_cluster.get(cur_node.cluster_ids[0])

# find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth)
current_depth = 1
cur_node_depth = 1
for token in tokens:
# at_max_depth
if current_depth == self.depth:
# at max depth
if cur_node_depth >= self.max_node_depth:
break

# is_last_token
if current_depth == token_count:
# this is last token
if cur_node_depth == token_count:
break

key_to_child_node = parent_node.key_to_child_node
parent_node = key_to_child_node.get(token)
if parent_node is None:
parent_node = key_to_child_node.get(self.param_str)
if parent_node is None:
key_to_child_node = cur_node.key_to_child_node
cur_node = key_to_child_node.get(token)
if cur_node is None: # no exact next token exists, try wildcard node
cur_node = key_to_child_node.get(self.param_str)
if cur_node is None: # no wildcard node exists
return None

current_depth += 1
cur_node_depth += 1

# get best match among all clusters with same prefix, or None if no match is above sim_th
cluster = self.fast_match(parent_node.cluster_ids, tokens, sim_th, include_params)
cluster = self.fast_match(cur_node.cluster_ids, tokens, sim_th, include_params)
return cluster
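
A short end-to-end sketch (editor's illustration, not from this commit) of the matching behavior `tree_search` enables: two messages sharing a prefix but differing in one token merge into a single cluster once similarity clears `sim_th`. It assumes `LogCluster.get_template()` joins the template tokens:

```python
from drain3.drain import Drain

drain = Drain(depth=4, sim_th=0.4)
drain.add_log_message("connected to 10.0.0.1")            # new cluster is created
cluster, change = drain.add_log_message("connected to 10.0.0.2")
print(change)                  # "cluster_template_changed" (2 of 3 tokens match, 0.67 >= 0.4)
print(cluster.get_template())  # "connected to <*>" - the differing token became a wildcard
```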

def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster):
token_count_str = str(len(cluster.log_template_tokens))
token_count = len(cluster.log_template_tokens)
token_count_str = str(token_count)
if token_count_str not in root_node.key_to_child_node:
first_layer_node = Node()
root_node.key_to_child_node[token_count_str] = first_layer_node
else:
first_layer_node = root_node.key_to_child_node[token_count_str]

parent_node = first_layer_node
cur_node = first_layer_node

# handle case of empty log string
if len(cluster.log_template_tokens) == 0:
parent_node.cluster_ids = [cluster.cluster_id]
if token_count == 0:
cur_node.cluster_ids = [cluster.cluster_id]
return

current_depth = 1
for token in cluster.log_template_tokens:

# if at max depth or this is last token in template - add current log cluster to the leaf node
if current_depth == self.depth or str(current_depth) == token_count_str:
if current_depth >= self.max_node_depth or current_depth >= token_count:
# clean up stale clusters before adding a new one.
new_cluster_ids = [cluster.cluster_id]
for cluster_id in parent_node.cluster_ids:
new_cluster_ids = []
for cluster_id in cur_node.cluster_ids:
if cluster_id in self.id_to_cluster:
new_cluster_ids.append(cluster_id)
parent_node.cluster_ids = new_cluster_ids
new_cluster_ids.append(cluster.cluster_id)
cur_node.cluster_ids = new_cluster_ids
break

# if token is not matched in this layer of the existing tree.
if token not in parent_node.key_to_child_node:
if token not in cur_node.key_to_child_node:
if not self.has_numbers(token):
if self.param_str in parent_node.key_to_child_node:
if len(parent_node.key_to_child_node) < self.max_children:
if self.param_str in cur_node.key_to_child_node:
if len(cur_node.key_to_child_node) < self.max_children:
new_node = Node()
parent_node.key_to_child_node[token] = new_node
parent_node = new_node
cur_node.key_to_child_node[token] = new_node
cur_node = new_node
else:
parent_node = parent_node.key_to_child_node[self.param_str]
cur_node = cur_node.key_to_child_node[self.param_str]
else:
if len(parent_node.key_to_child_node) + 1 < self.max_children:
if len(cur_node.key_to_child_node) + 1 < self.max_children:
new_node = Node()
parent_node.key_to_child_node[token] = new_node
parent_node = new_node
elif len(parent_node.key_to_child_node) + 1 == self.max_children:
cur_node.key_to_child_node[token] = new_node
cur_node = new_node
elif len(cur_node.key_to_child_node) + 1 == self.max_children:
new_node = Node()
parent_node.key_to_child_node[self.param_str] = new_node
parent_node = new_node
cur_node.key_to_child_node[self.param_str] = new_node
cur_node = new_node
else:
parent_node = parent_node.key_to_child_node[self.param_str]
cur_node = cur_node.key_to_child_node[self.param_str]

else:
if self.param_str not in parent_node.key_to_child_node:
if self.param_str not in cur_node.key_to_child_node:
new_node = Node()
parent_node.key_to_child_node[self.param_str] = new_node
parent_node = new_node
cur_node.key_to_child_node[self.param_str] = new_node
cur_node = new_node
else:
parent_node = parent_node.key_to_child_node[self.param_str]
cur_node = cur_node.key_to_child_node[self.param_str]

# if the token is matched
else:
parent_node = parent_node.key_to_child_node[token]
cur_node = cur_node.key_to_child_node[token]

current_depth += 1
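
The branching policy above is worth a concrete illustration (editor's sketch, with hypothetical messages): tokens containing digits go straight to the `<*>` wildcard child, and once a node already holds `max_children - 1` concrete children, further distinct tokens are routed to the wildcard as well.

```python
from drain3.drain import Drain

drain = Drain(depth=4, max_children=3)
drain.add_log_message("alpha job done")  # creates child node "alpha"
drain.add_log_message("beta job done")   # creates child node "beta"
drain.add_log_message("gamma job done")  # cap reached: routed under a new "<*>" child
drain.print_tree()
```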

@@ -221,7 +232,7 @@ def fast_match(self, cluster_ids: list, tokens: list, sim_th: float, include_par
:param cluster_ids: List of clusters to match against (represented by their IDs)
:param tokens: the log message, separated to tokens.
:param sim_th: minimum required similarity threshold (None will be returned if no clusters reached it)
:param include_params: consider tokens matched to wildcard parameters in similarity treshold.
:param include_params: consider tokens matched to wildcard parameters in similarity threshold.
:return: Best match cluster or None
"""
match_cluster = None
@@ -257,21 +268,31 @@ def create_template(self, seq1, seq2):

return ret_val

def print_tree(self, file=None):
self.print_node("root", self.root_node, 0, file)
def print_tree(self, file=None, max_clusters=5):
self.print_node("root", self.root_node, 0, file, max_clusters)

def print_node(self, token, node, depth, file):
def print_node(self, token, node, depth, file, max_clusters):
out_str = '\t' * depth

if depth < 2:
out_str += '<' + str(token) + '>'
if depth == 0:
out_str += f'<{token}>'
elif depth == 1:
out_str += f'<L={token}>'
else:
out_str += token
out_str += f'"{token}"'

if len(node.cluster_ids) > 0:
out_str += f" (cluster_count={len(node.cluster_ids)})"

print(out_str, file=file)

for token, child in node.key_to_child_node.items():
self.print_node(token, child, depth + 1, file)
self.print_node(token, child, depth + 1, file, max_clusters)

for cid in node.cluster_ids[:max_clusters]:
cluster = self.id_to_cluster[cid]
out_str = '\t' * (depth + 1) + str(cluster)
print(out_str, file=file)

def get_content_as_tokens(self, content):
content = content.strip()
@@ -312,6 +333,7 @@ def add_log_message(self, content: str):
update_type = "cluster_template_changed"
match_cluster.size += 1
# Touch cluster to update its state in the cache.
# noinspection PyStatementEffect
self.id_to_cluster[match_cluster.cluster_id]

if self.profiler:
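
The reworked `print_tree` above labels the root and token-count levels, quotes token nodes, appends a cluster count to nodes holding clusters, and lists up to `max_clusters` cluster details per node. A minimal sketch of calling it (editor's example):

```python
import sys
from drain3.drain import Drain

drain = Drain(depth=4)
drain.add_log_message("user admin logged in")
drain.add_log_message("user guest logged in")

# Expected shape: <root>, <L=4>, "user" (cluster_count=1), then one cluster detail line.
drain.print_tree(file=sys.stdout, max_clusters=5)
```
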
22 changes: 21 additions & 1 deletion tests/test_drain.py
@@ -4,6 +4,26 @@


class DrainTest(unittest.TestCase):

def test_add_shorter_than_depth_message(self):
model = Drain(depth=4)
res = model.add_log_message("word")
print(res[1])
print(res[0])
self.assertEqual(res[1], "cluster_created")

res = model.add_log_message("word")
print(res[1])
print(res[0])
self.assertEqual(res[1], "none")

res = model.add_log_message("otherword")
print(res[1])
print(res[0])
self.assertEqual(res[1], "cluster_created")

self.assertEqual(2, len(model.id_to_cluster))

def test_add_log_message(self):
model = Drain()
entries = str.splitlines(
@@ -209,7 +229,6 @@ def test_max_clusters_lru_single_leaf_node(self):
self.assertListEqual(list(map(str.strip, expected)), actual)
# self.assertEqual(5, model.get_total_cluster_size())


def test_match_only(self):
model = Drain()
res = model.add_log_message("aa aa aa")
@@ -235,3 +254,4 @@ def test_match_only(self):

c: LogCluster = model.match("nothing")
self.assertIsNone(c)
