From 17c4776af95094ad31737e7ae96b3994c0c6b245 Mon Sep 17 00:00:00 2001
From: Max <maxtropets@microsoft.com>
Date: Fri, 12 Jul 2024 11:35:05 +0100
Subject: [PATCH] Configure hist. cache size on startup (#6346)

---
 .daily_canary                             |   4 +
 CHANGELOG.md                              |   1 +
 CMakeLists.txt                            |   5 +
 doc/host_config_schema/cchost_config.json |   5 +
 include/ccf/node/startup_config.h         |   2 +
 samples/apps/logging/logging.cpp          |  16 ---
 src/common/configuration.h                |   3 +-
 src/enclave/enclave.h                     |   3 +
 src/node/historical_queries.h             |   2 +-
 tests/config.jinja                        |   3 +-
 tests/historical_query_cache.py           | 131 ++++++++++++++++++++++
 tests/infra/network.py                    |   1 +
 tests/infra/remote.py                     |   2 +
 13 files changed, 159 insertions(+), 19 deletions(-)
 create mode 100644 tests/historical_query_cache.py

diff --git a/.daily_canary b/.daily_canary
index b0c8ecafff33..71a95928ef59 100644
--- a/.daily_canary
+++ b/.daily_canary
@@ -4,3 +4,7 @@
 /--x-m- /--n-n---xXx--/--yY------>>>----<<<>>]]{{}}---||-/\---..
 2024__
 !..!
+
+ ,--.  
+ ( o>
+//\\
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 870628f8a568..583c188b597b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - The `cchost` configuration file now includes an `idle_connection_timeout` option. This controls how long the node will keep idle connections (for user TLS sessions) before automatically closing them. This may be set to `null` to restore the previous behaviour, where idle connections are never closed. By default connections will be closed after 60s of idle time.
 - New endpoints `GET /gov/service/javascript-modules` and `GET /gov/service/javascript-modules/{moduleName}` to retrieve the raw JS code of the currently installed app. Note that the `{moduleName}` path parameter will need to be URL-encoded to escape any `/` characters (eg - `/foo/bar.js` should become `%2Ffoo%2Fbar.js`).
 - New gov API version `2024-07-01`. This is near-identical to `2023-06-01-preview`, but additionally offers the new `javascript-modules` endpoints.
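+- The historical query cache soft limit is now a node-specific startup parameter, configured via the new `historical_cache_soft_limit` option in the `cchost` configuration file (defaults to `512MB`).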
 
 ### Changed
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87c989d5bb19..d551016762ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1576,6 +1576,11 @@ if(BUILD_TESTS)
       PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/historical_query_perf.py
       LABEL perf PERF_LABEL "Historical Queries"
     )
+
+    add_e2e_test(
+      NAME historical_query_cache_test
+      PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/historical_query_cache.py
+    )
   endif()
 endif()
 
diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json
index ca7dd0f9c230..313e6144af2b 100644
--- a/doc/host_config_schema/cchost_config.json
+++ b/doc/host_config_schema/cchost_config.json
@@ -704,6 +704,11 @@
       "type": "integer",
       "description": "Cap at which node-to-node message channels will be closed, and a new channel will be created. Can be used to limit use of single cryptographic key",
       "minimum": 0
+    },
+    "historical_cache_soft_limit": {
+      "type": "string",
+      "default": "512MB",
+      "description": "Historical queries cache soft limit (as size string)"
     }
   },
   "required": ["enclave", "network", "command"],
diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h
index 5d4fc0f893a2..c2091b8e54ec 100644
--- a/include/ccf/node/startup_config.h
+++ b/include/ccf/node/startup_config.h
@@ -22,6 +22,8 @@ struct CCFConfig
   // 2**24.5 as per RFC8446 Section 5.5
   size_t node_to_node_message_limit = 23'726'566;
 
+  ccf::ds::SizeString historical_cache_soft_limit = {"512MB"};
+
   ccf::consensus::Configuration consensus = {};
   ccf::NodeInfoNetwork network = {};
 
diff --git a/samples/apps/logging/logging.cpp b/samples/apps/logging/logging.cpp
index e3dd8e30ed29..7b531c911f17 100644
--- a/samples/apps/logging/logging.cpp
+++ b/samples/apps/logging/logging.cpp
@@ -460,12 +460,6 @@ namespace loggingapp
         PUBLIC_RECORDS, context, 10000, 20);
       context.get_indexing_strategies().install_strategy(index_per_public_key);
 
-      // According to manual obvervation it's enough to start evicting old
-      // requests on historical perf test, but not too small to get stuck
-      // because of a single request being larget than the cache.
-      constexpr size_t cache_limit = 1024 * 1024 * 10; // MB
-      context.get_historical_state().set_soft_cache_limit(cache_limit);
-
       const ccf::AuthnPolicies auth_policies = {
         ccf::jwt_auth_policy,
         ccf::user_cert_auth_policy,
@@ -1663,11 +1657,6 @@ namespace loggingapp
           ccf::http::headers::CONTENT_TYPE,
           ccf::http::headervalues::contenttype::JSON);
         ctx.rpc_ctx->set_response_body(j_response.dump());
-
-        // ALSO: Assume this response makes it all the way to the client, and
-        // they're finished with it, so we can drop the retrieved state. In a
-        // real app this may be driven by a separate client request or an LRU
-        historical_cache.drop_cached_states(handle);
       };
       make_endpoint(
         get_historical_range_path,
@@ -1834,11 +1823,6 @@ namespace loggingapp
           ccf::http::headers::CONTENT_TYPE,
           ccf::http::headervalues::contenttype::JSON);
         ctx.rpc_ctx->set_response_body(j_response.dump());
-
-        // ALSO: Assume this response makes it all the way to the client, and
-        // they're finished with it, so we can drop the retrieved state. In a
-        // real app this may be driven by a separate client request or an LRU
-        historical_cache.drop_cached_states(handle);
       };
       make_endpoint(
         get_historical_sparse_path,
diff --git a/src/common/configuration.h b/src/common/configuration.h
index 181c32fc70b5..c12771265e2c 100644
--- a/src/common/configuration.h
+++ b/src/common/configuration.h
@@ -92,7 +92,8 @@ DECLARE_JSON_OPTIONAL_FIELDS(
   ledger_signatures,
   jwt,
   attestation,
-  node_to_node_message_limit);
+  node_to_node_message_limit,
+  historical_cache_soft_limit);
 
 DECLARE_JSON_TYPE(StartupConfig::Start);
 DECLARE_JSON_REQUIRED_FIELDS(
diff --git a/src/enclave/enclave.h b/src/enclave/enclave.h
index a51d0fbf0f68..02ddd02f3819 100644
--- a/src/enclave/enclave.h
+++ b/src/enclave/enclave.h
@@ -238,6 +238,9 @@ namespace ccf
 
       node->set_n2n_message_limit(ccf_config_.node_to_node_message_limit);
 
+      historical_state_cache->set_soft_cache_limit(
+        ccf_config_.historical_cache_soft_limit);
+
       // If we haven't heard from a node for multiple elections, then cleanup
       // their node-to-node channel
       const auto idle_timeout =
diff --git a/src/node/historical_queries.h b/src/node/historical_queries.h
index 46599b5123fe..2cc96a976ec6 100644
--- a/src/node/historical_queries.h
+++ b/src/node/historical_queries.h
@@ -509,7 +509,7 @@ namespace ccf::historical
     std::unordered_map<SeqNo, std::set<CompoundHandle>> store_to_requests;
     std::unordered_map<ccf::SeqNo, size_t> raw_store_sizes{};
 
-    CacheSize soft_store_cache_limit{1ll * 1024 * 1024 * 512 /*512 MB*/};
+    CacheSize soft_store_cache_limit{std::numeric_limits<size_t>::max()};
     CacheSize soft_store_cache_limit_raw =
       soft_store_cache_limit / soft_to_raw_ratio;
     CacheSize estimated_store_cache_size{0};
diff --git a/tests/config.jinja b/tests/config.jinja
index 86925c3b072e..d94ca8ded5a7 100644
--- a/tests/config.jinja
+++ b/tests/config.jinja
@@ -102,5 +102,6 @@
     "max_fragment_size": "256KB"
   },
   "ignore_first_sigterm": {{ ignore_first_sigterm|tojson }}{% if node_to_node_message_limit %},
-  "node_to_node_message_limit": {{ node_to_node_message_limit|tojson }}{% endif %}
+  "node_to_node_message_limit": {{ node_to_node_message_limit|tojson }}{% endif %}{% if historical_cache_soft_limit %},
+  "historical_cache_soft_limit": {{ historical_cache_soft_limit|tojson }}{% endif %}
 }
\ No newline at end of file
diff --git a/tests/historical_query_cache.py b/tests/historical_query_cache.py
new file mode 100644
index 000000000000..3c12f0f1adb2
--- /dev/null
+++ b/tests/historical_query_cache.py
@@ -0,0 +1,131 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the Apache 2.0 License.
+import infra.e2e_args
+import infra.network
+import infra.proc
+import infra.commit
+import http
+from infra.snp import IS_SNP
+import infra.jwt_issuer
+import time
+import infra.bencher
+
+from loguru import logger as LOG
+
+DEFAULT_TIMEOUT_S = 10 if IS_SNP else 5
+
+
+def format_message(idx):
+    return """
+    Nodes whisper secrets,
+    Across vast digital realms,
+    Harmony in bits.
+    """ + str(
+        idx
+    )
+
+
+def submit_log_entry(primary, idx):
+    with primary.client("user0") as c:
+        msg = format_message(idx)
+        r = c.post(
+            "/app/log/public",
+            {
+                "id": idx,
+                "msg": msg,
+            },
+            log_capture=None,
+        )
+        assert r.status_code == http.HTTPStatus.OK
+        return (r.view, r.seqno)
+
+
+def get_and_verify_entry(client, idx):
+    start_time = time.time()
+    end_time = start_time + 10
+    entries = []
+    path = f"/app/log/public/historical/range?id={idx}"
+    while time.time() < end_time:
+        r = client.get(path, headers={})
+        if r.status_code == http.HTTPStatus.OK:
+            j_body = r.body.json()
+            entries += j_body["entries"]
+            if "@nextLink" in j_body:
+                path = j_body["@nextLink"]
+                continue
+            else:
+                # No @nextLink means we've reached end of range
+                assert entries[0]["msg"] == format_message(idx)
+                return
+        elif r.status_code == http.HTTPStatus.ACCEPTED:
+            # Ignore retry-after header, retry soon
+            time.sleep(0.1)
+            continue
+        else:
+            raise ValueError(
+                f"""
+                Unexpected status code from historical range query: {r.status_code}
+
+                {r.body}
+                """
+            )
+
+    raise TimeoutError("Historical range not available")
+
+
+def test_historical_query_stress_cache(network, args):
+    """This test loads the historical cache heavily enough that it is forced to
+    lru_shrink. We go over the range twice to make sure we're still able to
+    load entries after they have been evicted from the cache."""
+
+    jwt_issuer = infra.jwt_issuer.JwtIssuer()
+    jwt_issuer.register(network)
+    jwt = jwt_issuer.issue_jwt()
+
+    primary, _ = network.find_primary()
+
+    start = 1
+    end = 100
+    last_seqno = None
+    last_view = None
+    for i in range(start, end + 1):
+        last_view, last_seqno = submit_log_entry(primary, i)
+
+    with primary.client("user0") as c:
+        infra.commit.wait_for_commit(c, seqno=last_seqno, view=last_view, timeout=10)
+
+    network.wait_for_all_nodes_to_commit(primary=primary)
+    node = network.find_node_by_role(role=infra.network.NodeRole.BACKUP, log_capture=[])
+
+    with node.client(common_headers={"authorization": f"Bearer {jwt}"}) as c:
+        for cycle in range(0, 2):
+            LOG.info(f"Polling [{start}:{end + 1}] range. Attempt=[{cycle}]")
+            for idx in range(start, end + 1):
+                get_and_verify_entry(c, idx)
+
+    return network
+
+
+def run(args):
+    with infra.network.network(
+        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
+    ) as network:
+        network.start_and_open(args)
+
+        network = test_historical_query_stress_cache(network, args)
+
+
+if __name__ == "__main__":
+
+    def add(parser):
+        pass
+
+    args = infra.e2e_args.cli_args(add=add)
+    args.package = "samples/apps/logging/liblogging"
+    args.nodes = infra.e2e_args.max_nodes(args, f=0)
+    args.initial_member_count = 1
+    args.sig_ms_interval = 1000  # Set to cchost default value
+
+    args.historical_cache_soft_limit = "10KB"
+
+    run(args)
diff --git a/tests/infra/network.py b/tests/infra/network.py
index 5fb715d163ca..bf4c32eb36cc 100644
--- a/tests/infra/network.py
+++ b/tests/infra/network.py
@@ -194,6 +194,7 @@ class Network:
         "acme",
         "snp_endorsements_servers",
         "node_to_node_message_limit",
+        "historical_cache_soft_limit",
         "tick_ms",
         "max_msg_size_bytes",
         "snp_security_policy_file",
diff --git a/tests/infra/remote.py b/tests/infra/remote.py
index 4a8f9c698136..09fa98befbdb 100644
--- a/tests/infra/remote.py
+++ b/tests/infra/remote.py
@@ -630,6 +630,7 @@ def __init__(
         snp_security_policy_file=None,
         snp_uvm_endorsements_file=None,
         service_subject_name="CN=CCF Test Service",
+        historical_cache_soft_limit=None,
         **kwargs,
     ):
         """
@@ -823,6 +824,7 @@ def __init__(
                 snp_security_policy_file=snp_security_policy_file,
                 snp_uvm_endorsements_file=snp_uvm_endorsements_file,
                 service_subject_name=service_subject_name,
+                historical_cache_soft_limit=historical_cache_soft_limit,
                 **kwargs,
             )