Merge pull request #43 from StanislawSwierc/issue_38
fixed: clusters from the same leaf node get marked as used during testing
davidohana authored Jul 22, 2021
2 parents 15470e3 + 1d25b35 commit 8dbb3e6
Showing 2 changed files with 104 additions and 31 deletions.
23 changes: 20 additions & 3 deletions drain3/drain.py
@@ -26,6 +26,23 @@ def __str__(self):
return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"


+class LogClusterCache(LRUCache):
+    """
+    Least Recently Used (LRU) cache that allows callers to conditionally skip
+    the cache-eviction bookkeeping when accessing elements.
+    """
+
+    def __missing__(self, key):
+        return None
+
+    def get(self, key):
+        """
+        Returns the value of the item with the specified key without updating
+        the LRU eviction order.
+        """
+        return Cache.__getitem__(self, key)
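
Why the base-class call matters: cachetools routes plain item access through the subclass, so LRUCache.get refreshes a key's recency while a direct Cache.__getitem__ does not. A minimal standalone sketch of the two behaviors (illustrative, not part of this commit):

from cachetools import Cache, LRUCache

lru = LRUCache(maxsize=2)
lru["a"] = 1
lru["b"] = 2
lru.get("a")                 # Cache.get evaluates self["a"], marking "a" as recently used
lru["c"] = 3                 # evicts "b", even though "a" is the older entry
assert "a" in lru and "b" not in lru

lru = LRUCache(maxsize=2)
lru["a"] = 1
lru["b"] = 2
Cache.__getitem__(lru, "a")  # bypasses the LRU bookkeeping, as LogClusterCache.get does
lru["c"] = 3                 # evicts "a", the true least-recently-used entry
assert "b" in lru and "a" not in lru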


class Node:
    __slots__ = ["key_to_child_node", "cluster_ids"]

@@ -65,7 +82,7 @@ def __init__(self,
        self.param_str = param_str

        # key: int, value: LogCluster
-        self.id_to_cluster = {} if max_clusters is None else LRUCache(maxsize=max_clusters)
+        self.id_to_cluster = {} if max_clusters is None else LogClusterCache(maxsize=max_clusters)
        self.clusters_counter = 0
    @property
@@ -216,7 +233,7 @@ def fast_match(self, cluster_ids: list, tokens: list, sim_th: float, include_params: bool):
        for cluster_id in cluster_ids:
            # Try to retrieve the cluster from the cache, bypassing the eviction
            # algorithm, as we are only testing candidates for a match.
-            cluster = Cache.get(self.id_to_cluster, cluster_id)
+            cluster = self.id_to_cluster.get(cluster_id)
            if cluster is None:
                continue
            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
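
The removed line was meant to bypass the LRU but did not: Cache.get is roughly the following (a simplified sketch of the cachetools implementation, not this commit's code), and its self[key] dispatches to LRUCache.__getitem__, which refreshes recency, so merely testing a candidate marked its cluster as used:

def get(self, key, default=None):
    # Simplified sketch of cachetools.Cache.get
    try:
        return self[key]  # dispatches to the subclass __getitem__; on an
                          # LRUCache this updates the eviction order
    except KeyError:
        return default

LogClusterCache.get sidesteps that dispatch by calling Cache.__getitem__ directly.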
@@ -295,7 +312,7 @@ def add_log_message(self, content: str):
                update_type = "cluster_template_changed"
            match_cluster.size += 1
            # Touch cluster to update its state in the cache.
-            self.id_to_cluster.get(match_cluster.cluster_id)
+            self.id_to_cluster[match_cluster.cluster_id]

        if self.profiler:
            self.profiler.end_section()
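
After this change the two access styles on id_to_cluster carry different intent. A brief illustration (illustrative only; some_cluster is a placeholder for any LogCluster instance):

cache = LogClusterCache(maxsize=2)
cache[1] = some_cluster  # insert; may evict the least-recently-used cluster
cache.get(1)             # read-only peek: the eviction order is untouched
cache[1]                 # "touch": LRUCache.__getitem__ marks id 1 as most recently used
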
112 changes: 84 additions & 28 deletions tests/test_drain.py
@@ -113,37 +113,44 @@ def test_max_clusters(self):
        self.assertListEqual(list(map(str.strip, expected)), actual)
        self.assertEqual(1, model.get_total_cluster_size())

-    def test_max_clusters_lru(self):
-        """When the max number of clusters is reached, then clusters are removed
-        according to the lru policy.
+    def test_max_clusters_lru_multiple_leaf_nodes(self):
+        """When all templates end up in different leaf nodes and the max number
+        of clusters is reached, then clusters are removed according to the lru
+        policy.
        """
-        model = Drain(max_clusters=3, depth=3)
+        model = Drain(max_clusters=2, depth=4, param_str="*")
        entries = [
-            "A A foramt 1",
-            "A A foramt 2",
-            "A B format 1",
-            "A B format 2",
-            "B format 1",
-            "B format 2",
-            "A A foramt 3",
-            "C foramt 1",
-            "A B format 3",
+            "A A A",
+            "A A B",
+            "B A A",
+            "B A B",
+            "C A A",
+            "C A B",
+            "B A A",
+            "A A A",
        ]
        expected = [
-            "A A foramt 1",  # LRU = ["A"]
-            "A A foramt <*>",  # LRU = ["A"]
-            # Use "A A" prefix to make sure both "A" and "A A" clusters end up
-            # in the same leaf node. This is a setup for an interesting edge
-            # case.
-            "A B format 1",  # LRU = ["AA", "A"]
-            "A B format <*>",  # LRU = ["AA", "A"]
-            "B format 1",  # LRU = ["B", "A A", "A"]
-            "B format <*>",  # LRU = ["B", "A A", "A"]
-            "A A foramt <*>",  # LRU = ["A", "B", "A A"]
-            "C foramt 1",  # LRU = ["C", "A", "B"]
-            # Cluster "A A" should have been removed in the previous step, thus,
-            # it should be recognized as a new cluster with no slots.
-            "A B format 3",  # LRU = ["A A", "C", "A"]
+            # lru: []
+            "A A A",
+            # lru: ["A A A"]
+            "A A *",
+            # lru: ["A A *"]
+            "B A A",
+            # lru: ["B A A", "A A *"]
+            "B A *",
+            # lru: ["B A *", "A A *"]
+            "C A A",
+            # lru: ["C A A", "B A *"]
+            "C A *",
+            # lru: ["C A *", "B A *"]
+            "B A *",
+            # Message "B A A" was normalized because the template "B A *" is
+            # still present in the cache.
+            # lru: ["B A *", "C A *"]
+            "A A A",
+            # Message "A A A" was not normalized because the template "C A A"
+            # pushed out the template "A A *" from the cache.
+            # lru: ["A A A", "B A *"]
        ]
        actual = []

@@ -152,7 +159,56 @@ def test_max_clusters_lru(self):
            actual.append(cluster.get_template())

        self.assertListEqual(list(map(str.strip, expected)), actual)
-        self.assertEqual(5, model.get_total_cluster_size())
+        self.assertEqual(4, model.get_total_cluster_size())
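
With depth=4, drain3 keeps depth - 2 internal tree levels, so a message is routed by its token count and then its first token; the leading "A"/"B"/"C" spreads these clusters over three leaf nodes, which is why each new prefix starts a fresh cluster. A sketch of the resulting tree (illustrative, not part of the test):

# root -> "3" (token count level)
#          |- "A" -> leaf holding the "A A ..." cluster
#          |- "B" -> leaf holding the "B A *" cluster
#          |- "C" -> leaf holding the "C A *" cluster (evicted by the final "A A A")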

+    def test_max_clusters_lru_single_leaf_node(self):
+        """When all templates end up in the same leaf node and the max number of
+        clusters is reached, then clusters are removed according to the lru
+        policy.
+        """
+        model = Drain(max_clusters=2, depth=4, param_str="*")
+        entries = [
+            "A A A",
+            "A A B",
+            "A B A",
+            "A B B",
+            "A C A",
+            "A C B",
+            "A B A",
+            "A A A",
+        ]
+        expected = [
+            # lru: []
+            "A A A",
+            # lru: ["A A A"]
+            "A A *",
+            # lru: ["A A *"]
+            "A B A",
+            # lru: ["A B A", "A A *"]
+            "A B *",
+            # lru: ["A B *", "A A *"]
+            "A C A",
+            # lru: ["A C A", "A B *"]
+            "A C *",
+            # lru: ["A C *", "A B *"]
+            "A B *",
+            # Message "A B A" was normalized because the template "A B *" is
+            # still present in the cache.
+            # lru: ["A B *", "A C *"]
+            "A A A",
+            # Message "A A A" was not normalized because the template "A C A"
+            # pushed out the template "A A *" from the cache.
+            # lru: ["A A A", "A B *"]
+        ]
+        actual = []
+
+        for entry in entries:
+            cluster, _ = model.add_log_message(entry)
+            actual.append(cluster.get_template())
+
+        self.assertListEqual(list(map(str.strip, expected)), actual)
+        self.assertEqual(4, model.get_total_cluster_size())
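
All eight messages share one leaf node here, so the ids of evicted clusters linger in the leaf's candidate list and the fixed fast_match skips them when the cache lookup returns None. The final assertion follows from the surviving clusters; a worked sketch, assuming get_total_cluster_size sums the sizes of the clusters still cached:

# Surviving clusters after the eight messages:
#   "A A A" -> size 1 (recreated by the last message after its earlier eviction)
#   "A B *" -> size 3 ("A B A", "A B B", "A B A")
# get_total_cluster_size() == 1 + 3 == 4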


    def test_match_only(self):
        model = Drain()
