Merge pull request #43 from StanislawSwierc/issue_38
fixed: clusters from the same leaf node get marked as used during testing
davidohana authored Jul 22, 2021
2 parents 15470e3 + 1d25b35 commit 8dbb3e6
Showing 2 changed files with 104 additions and 31 deletions.
23 changes: 20 additions & 3 deletions drain3/drain.py
@@ -26,6 +26,23 @@ def __str__(self):
return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"


+class LogClusterCache(LRUCache):
+    """
+    Least Recently Used (LRU) cache that allows callers to conditionally skip
+    the cache-eviction bookkeeping when accessing elements.
+    """
+
+    def __missing__(self, key):
+        return None
+
+    def get(self, key):
+        """
+        Returns the value of the item with the specified key without updating
+        the LRU eviction order.
+        """
+        return Cache.__getitem__(self, key)
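
Why the base-class call matters: cachetools routes plain item access through the subclass, so LRUCache.get refreshes a key's recency while a direct Cache.__getitem__ does not. A minimal standalone sketch of the two behaviors (illustrative, not part of this commit):

from cachetools import Cache, LRUCache

lru = LRUCache(maxsize=2)
lru["a"] = 1
lru["b"] = 2
lru.get("a")                 # Cache.get evaluates self["a"], marking "a" as recently used
lru["c"] = 3                 # evicts "b", even though "a" is the older entry
assert "a" in lru and "b" not in lru

lru = LRUCache(maxsize=2)
lru["a"] = 1
lru["b"] = 2
Cache.__getitem__(lru, "a")  # bypasses the LRU bookkeeping, as LogClusterCache.get does
lru["c"] = 3                 # evicts "a", the true least-recently-used entry
assert "b" in lru and "a" not in lru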


class Node:
    __slots__ = ["key_to_child_node", "cluster_ids"]

@@ -65,7 +82,7 @@ def __init__(self,
        self.param_str = param_str

        # key: int, value: LogCluster
-        self.id_to_cluster = {} if max_clusters is None else LRUCache(maxsize=max_clusters)
+        self.id_to_cluster = {} if max_clusters is None else LogClusterCache(maxsize=max_clusters)
        self.clusters_counter = 0
    @property
@@ -216,7 +233,7 @@ def fast_match(self, cluster_ids: list, tokens: list, sim_th: float, include_params: bool):
        for cluster_id in cluster_ids:
            # Try to retrieve the cluster from the cache, bypassing the eviction
            # algorithm, as we are only testing candidates for a match.
-            cluster = Cache.get(self.id_to_cluster, cluster_id)
+            cluster = self.id_to_cluster.get(cluster_id)
            if cluster is None:
                continue
            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
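
The removed line was meant to bypass the LRU but did not: Cache.get is roughly the following (a simplified sketch of the cachetools implementation, not this commit's code), and its self[key] dispatches to LRUCache.__getitem__, which refreshes recency, so merely testing a candidate marked its cluster as used:

def get(self, key, default=None):
    # Simplified sketch of cachetools.Cache.get
    try:
        return self[key]  # dispatches to the subclass __getitem__; on an
                          # LRUCache this updates the eviction order
    except KeyError:
        return default

LogClusterCache.get sidesteps that dispatch by calling Cache.__getitem__ directly.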
@@ -295,7 +312,7 @@ def add_log_message(self, content: str):
                update_type = "cluster_template_changed"
            match_cluster.size += 1
            # Touch cluster to update its state in the cache.
-            self.id_to_cluster.get(match_cluster.cluster_id)
+            self.id_to_cluster[match_cluster.cluster_id]

        if self.profiler:
            self.profiler.end_section()
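
After this change the two access styles on id_to_cluster carry different intent. A brief illustration (illustrative only; some_cluster is a placeholder for any LogCluster instance):

cache = LogClusterCache(maxsize=2)
cache[1] = some_cluster  # insert; may evict the least-recently-used cluster
cache.get(1)             # read-only peek: the eviction order is untouched
cache[1]                 # "touch": LRUCache.__getitem__ marks id 1 as most recently used
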
112 changes: 84 additions & 28 deletions tests/test_drain.py
@@ -113,37 +113,44 @@ def test_max_clusters(self):
        self.assertListEqual(list(map(str.strip, expected)), actual)
        self.assertEqual(1, model.get_total_cluster_size())

-    def test_max_clusters_lru(self):
-        """When the max number of clusters is reached, then clusters are removed
-        according to the lru policy.
+    def test_max_clusters_lru_multiple_leaf_nodes(self):
+        """When all templates end up in different leaf nodes and the max number
+        of clusters is reached, then clusters are removed according to the lru
+        policy.
        """
-        model = Drain(max_clusters=3, depth=3)
+        model = Drain(max_clusters=2, depth=4, param_str="*")
        entries = [
-            "A A foramt 1",
-            "A A foramt 2",
-            "A B format 1",
-            "A B format 2",
-            "B format 1",
-            "B format 2",
-            "A A foramt 3",
-            "C foramt 1",
-            "A B format 3",
+            "A A A",
+            "A A B",
+            "B A A",
+            "B A B",
+            "C A A",
+            "C A B",
+            "B A A",
+            "A A A",
        ]
        expected = [
-            "A A foramt 1",  # LRU = ["A"]
-            "A A foramt <*>",  # LRU = ["A"]
-            # Use "A A" prefix to make sure both "A" and "A A" clusters end up
-            # in the same leaf node. This is a setup for an interesting edge
-            # case.
-            "A B format 1",  # LRU = ["AA", "A"]
-            "A B format <*>",  # LRU = ["AA", "A"]
-            "B format 1",  # LRU = ["B", "A A", "A"]
-            "B format <*>",  # LRU = ["B", "A A", "A"]
-            "A A foramt <*>",  # LRU = ["A", "B", "A A"]
-            "C foramt 1",  # LRU = ["C", "A", "B"]
-            # Cluster "A A" should have been removed in the previous step, thus,
-            # it should be recognized as a new cluster with no slots.
-            "A B format 3",  # LRU = ["A A", "C", "A"]
+            # lru: []
+            "A A A",
+            # lru: ["A A A"]
+            "A A *",
+            # lru: ["A A *"]
+            "B A A",
+            # lru: ["B A A", "A A *"]
+            "B A *",
+            # lru: ["B A *", "A A *"]
+            "C A A",
+            # lru: ["C A A", "B A *"]
+            "C A *",
+            # lru: ["C A *", "B A *"]
+            "B A *",
+            # Message "B A A" was normalized because the template "B A *" is
+            # still present in the cache.
+            # lru: ["B A *", "C A *"]
+            "A A A",
+            # Message "A A A" was not normalized because the template "C A A"
+            # pushed out the template "A A *" from the cache.
+            # lru: ["A A A", "B A *"]
        ]
        actual = []

@@ -152,7 +159,56 @@ def test_max_clusters_lru(self):
            actual.append(cluster.get_template())

        self.assertListEqual(list(map(str.strip, expected)), actual)
-        self.assertEqual(5, model.get_total_cluster_size())
+        self.assertEqual(4, model.get_total_cluster_size())
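
With depth=4, drain3 keeps depth - 2 internal tree levels, so a message is routed by its token count and then its first token; the leading "A"/"B"/"C" spreads these clusters over three leaf nodes, which is why each new prefix starts a fresh cluster. A sketch of the resulting tree (illustrative, not part of the test):

# root -> "3" (token count level)
#          |- "A" -> leaf holding the "A A ..." cluster
#          |- "B" -> leaf holding the "B A *" cluster
#          |- "C" -> leaf holding the "C A *" cluster (evicted by the final "A A A")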

+    def test_max_clusters_lru_single_leaf_node(self):
+        """When all templates end up in the same leaf node and the max number of
+        clusters is reached, then clusters are removed according to the lru
+        policy.
+        """
+        model = Drain(max_clusters=2, depth=4, param_str="*")
+        entries = [
+            "A A A",
+            "A A B",
+            "A B A",
+            "A B B",
+            "A C A",
+            "A C B",
+            "A B A",
+            "A A A",
+        ]
+        expected = [
+            # lru: []
+            "A A A",
+            # lru: ["A A A"]
+            "A A *",
+            # lru: ["A A *"]
+            "A B A",
+            # lru: ["A B A", "A A *"]
+            "A B *",
+            # lru: ["A B *", "A A *"]
+            "A C A",
+            # lru: ["A C A", "A B *"]
+            "A C *",
+            # lru: ["A C *", "A B *"]
+            "A B *",
+            # Message "A B A" was normalized because the template "A B *" is
+            # still present in the cache.
+            # lru: ["A B *", "A C *"]
+            "A A A",
+            # Message "A A A" was not normalized because the template "A C A"
+            # pushed out the template "A A *" from the cache.
+            # lru: ["A A A", "A B *"]
+        ]
+        actual = []
+
+        for entry in entries:
+            cluster, _ = model.add_log_message(entry)
+            actual.append(cluster.get_template())
+
+        self.assertListEqual(list(map(str.strip, expected)), actual)
+        self.assertEqual(4, model.get_total_cluster_size())
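
All eight messages share one leaf node here, so the ids of evicted clusters linger in the leaf's candidate list and the fixed fast_match skips them when the cache lookup returns None. The final assertion follows from the surviving clusters; a worked sketch, assuming get_total_cluster_size sums the sizes of the clusters still cached:

# Surviving clusters after the eight messages:
#   "A A A" -> size 1 (recreated by the last message after its earlier eviction)
#   "A B *" -> size 3 ("A B A", "A B B", "A B A")
# get_total_cluster_size() == 1 + 3 == 4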


    def test_match_only(self):
        model = Drain()
