From 17c4776af95094ad31737e7ae96b3994c0c6b245 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 12 Jul 2024 11:35:05 +0100 Subject: [PATCH] Configure hist. cache size on startup (#6346) --- .daily_canary | 4 + CHANGELOG.md | 1 + CMakeLists.txt | 5 + doc/host_config_schema/cchost_config.json | 5 + include/ccf/node/startup_config.h | 2 + samples/apps/logging/logging.cpp | 16 --- src/common/configuration.h | 3 +- src/enclave/enclave.h | 3 + src/node/historical_queries.h | 2 +- tests/config.jinja | 3 +- tests/historical_query_cache.py | 131 ++++++++++++++++++++++ tests/infra/network.py | 1 + tests/infra/remote.py | 2 + 13 files changed, 159 insertions(+), 19 deletions(-) create mode 100644 tests/historical_query_cache.py diff --git a/.daily_canary b/.daily_canary index b0c8ecafff33..71a95928ef59 100644 --- a/.daily_canary +++ b/.daily_canary @@ -4,3 +4,7 @@ /--x-m- /--n-n---xXx--/--yY------>>>----<<<>>]]{{}}---||-/\---.. 2024__ !..! + + ,--. + ( o> +//\\ diff --git a/CHANGELOG.md b/CHANGELOG.md index 870628f8a568..583c188b597b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - The `cchost` configuration file now includes an `idle_connection_timeout` option. This controls how long the node will keep idle connections (for user TLS sessions) before automatically closing them. This may be set to `null` to restore the previous behaviour, where idle connections are never closed. By default connections will be closed after 60s of idle time. - New endpoints `GET /gov/service/javascript-modules` and `GET /gov/service/javascript-modules/{moduleName}` to retrieve the raw JS code of the currently installed app. Note that the `{moduleName}` path parameter will need to be URL-encoded to escape any `/` characters (eg - `/foo/bar.js` should become `%2Ffoo%2Fbar.js`). - New gov API version `2024-07-01`. 
This is near-identical to `2023-06-01-preview`, but additionally offers the new `javascript-modules` endpoints. +- Historical cache soft limit is now a node-specific startup parameter. ### Changed diff --git a/CMakeLists.txt b/CMakeLists.txt index 87c989d5bb19..d551016762ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1576,6 +1576,11 @@ if(BUILD_TESTS) PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/historical_query_perf.py LABEL perf PERF_LABEL "Historical Queries" ) + + add_e2e_test( + NAME historical_query_cache_test + PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/historical_query_cache.py + ) endif() endif() diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index ca7dd0f9c230..313e6144af2b 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -704,6 +704,11 @@ "type": "integer", "description": "Cap at which node-to-node message channels will be closed, and a new channel will be created. Can be used to limit use of single cryptographic key", "minimum": 0 + }, + "historical_cache_soft_limit": { + "type": "string", + "default": "512MB", + "description": "Historical queries cache soft limit (as size string)" } }, "required": ["enclave", "network", "command"], diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 5d4fc0f893a2..c2091b8e54ec 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -22,6 +22,8 @@ struct CCFConfig // 2**24.5 as per RFC8446 Section 5.5 size_t node_to_node_message_limit = 23'726'566; + ccf::ds::SizeString historical_cache_soft_limit = {"512MB"}; + ccf::consensus::Configuration consensus = {}; ccf::NodeInfoNetwork network = {}; diff --git a/samples/apps/logging/logging.cpp b/samples/apps/logging/logging.cpp index e3dd8e30ed29..7b531c911f17 100644 --- a/samples/apps/logging/logging.cpp +++ b/samples/apps/logging/logging.cpp @@ -460,12 +460,6 @@ namespace loggingapp PUBLIC_RECORDS, 
context, 10000, 20); context.get_indexing_strategies().install_strategy(index_per_public_key); - // According to manual obvervation it's enough to start evicting old - // requests on historical perf test, but not too small to get stuck - // because of a single request being larget than the cache. - constexpr size_t cache_limit = 1024 * 1024 * 10; // MB - context.get_historical_state().set_soft_cache_limit(cache_limit); - const ccf::AuthnPolicies auth_policies = { ccf::jwt_auth_policy, ccf::user_cert_auth_policy, @@ -1663,11 +1657,6 @@ namespace loggingapp ccf::http::headers::CONTENT_TYPE, ccf::http::headervalues::contenttype::JSON); ctx.rpc_ctx->set_response_body(j_response.dump()); - - // ALSO: Assume this response makes it all the way to the client, and - // they're finished with it, so we can drop the retrieved state. In a - // real app this may be driven by a separate client request or an LRU - historical_cache.drop_cached_states(handle); }; make_endpoint( get_historical_range_path, @@ -1834,11 +1823,6 @@ namespace loggingapp ccf::http::headers::CONTENT_TYPE, ccf::http::headervalues::contenttype::JSON); ctx.rpc_ctx->set_response_body(j_response.dump()); - - // ALSO: Assume this response makes it all the way to the client, and - // they're finished with it, so we can drop the retrieved state. 
In a - // real app this may be driven by a separate client request or an LRU - historical_cache.drop_cached_states(handle); }; make_endpoint( get_historical_sparse_path, diff --git a/src/common/configuration.h b/src/common/configuration.h index 181c32fc70b5..c12771265e2c 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -92,7 +92,8 @@ DECLARE_JSON_OPTIONAL_FIELDS( ledger_signatures, jwt, attestation, - node_to_node_message_limit); + node_to_node_message_limit, + historical_cache_soft_limit); DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/enclave/enclave.h b/src/enclave/enclave.h index a51d0fbf0f68..02ddd02f3819 100644 --- a/src/enclave/enclave.h +++ b/src/enclave/enclave.h @@ -238,6 +238,9 @@ namespace ccf node->set_n2n_message_limit(ccf_config_.node_to_node_message_limit); + historical_state_cache->set_soft_cache_limit( + ccf_config_.historical_cache_soft_limit); + // If we haven't heard from a node for multiple elections, then cleanup // their node-to-node channel const auto idle_timeout = diff --git a/src/node/historical_queries.h b/src/node/historical_queries.h index 46599b5123fe..2cc96a976ec6 100644 --- a/src/node/historical_queries.h +++ b/src/node/historical_queries.h @@ -509,7 +509,7 @@ namespace ccf::historical std::unordered_map> store_to_requests; std::unordered_map raw_store_sizes{}; - CacheSize soft_store_cache_limit{1ll * 1024 * 1024 * 512 /*512 MB*/}; + CacheSize soft_store_cache_limit{std::numeric_limits::max()}; CacheSize soft_store_cache_limit_raw = soft_store_cache_limit / soft_to_raw_ratio; CacheSize estimated_store_cache_size{0}; diff --git a/tests/config.jinja b/tests/config.jinja index 86925c3b072e..d94ca8ded5a7 100644 --- a/tests/config.jinja +++ b/tests/config.jinja @@ -102,5 +102,6 @@ "max_fragment_size": "256KB" }, "ignore_first_sigterm": {{ ignore_first_sigterm|tojson }}{% if node_to_node_message_limit %}, - "node_to_node_message_limit": {{ 
node_to_node_message_limit|tojson }}{% endif %} + "node_to_node_message_limit": {{ node_to_node_message_limit|tojson }}{% endif %}{% if historical_cache_soft_limit %}, + "historical_cache_soft_limit": {{ historical_cache_soft_limit|tojson }}{% endif %} } \ No newline at end of file diff --git a/tests/historical_query_cache.py b/tests/historical_query_cache.py new file mode 100644 index 000000000000..3c12f0f1adb2 --- /dev/null +++ b/tests/historical_query_cache.py @@ -0,0 +1,131 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the Apache 2.0 License. +import infra.e2e_args +import infra.network +import infra.proc +import infra.commit +import http +from infra.snp import IS_SNP +import infra.jwt_issuer +import time +import infra.bencher + +from loguru import logger as LOG + +DEFAULT_TIMEOUT_S = 10 if IS_SNP else 5 + + +def format_message(idx): + return """ + Nodes whisper secrets, + Across vast digital realms, + Harmony in bits. + """ + str( + idx + ) + + +def submit_log_entry(primary, idx): + with primary.client("user0") as c: + msg = format_message(idx) + r = c.post( + "/app/log/public", + { + "id": idx, + "msg": msg, + }, + log_capture=None, + ) + assert r.status_code == http.HTTPStatus.OK + return (r.view, r.seqno) + + +def get_and_verify_entry(client, idx): + start_time = time.time() + end_time = start_time + 10 + entries = [] + path = f"/app/log/public/historical/range?id={idx}" + while time.time() < end_time: + r = client.get(path, headers={}) + if r.status_code == http.HTTPStatus.OK: + j_body = r.body.json() + entries += j_body["entries"] + if "@nextLink" in j_body: + path = j_body["@nextLink"] + continue + else: + # No @nextLink means we've reached end of range + assert entries[0]["msg"] == format_message(idx) + return + elif r.status_code == http.HTTPStatus.ACCEPTED: + # Ignore retry-after header, retry soon + time.sleep(0.1) + continue + else: + raise ValueError( + f""" + Unexpected status code from historical range query: 
{r.status_code} + + {r.body} + """ + ) + + raise TimeoutError("Historical range not available") + + +def test_historical_query_stress_cache(network, args): + """This test loads the historical cache heavily enough that it is forced to + lru_shrink. We go over the range twice and make sure we're able to load new + entries after they get evicted from the cache.""" + + jwt_issuer = infra.jwt_issuer.JwtIssuer() + jwt_issuer.register(network) + jwt = jwt_issuer.issue_jwt() + + primary, _ = network.find_primary() + + start = 1 + end = 100 + last_seqno = None + last_view = None + for i in range(start, end + 1): + last_view, last_seqno = submit_log_entry(primary, i) + + with primary.client("user0") as c: + infra.commit.wait_for_commit(c, seqno=last_seqno, view=last_view, timeout=10) + + network.wait_for_all_nodes_to_commit(primary=primary) + node = network.find_node_by_role(role=infra.network.NodeRole.BACKUP, log_capture=[]) + + with node.client(common_headers={"authorization": f"Bearer {jwt}"}) as c: + for cycle in range(0, 2): + LOG.info(f"Polling [{start}:{end + 1}] range. 
Attempt=[{cycle}]") + for idx in range(start, end + 1): + get_and_verify_entry(c, idx) + + return network + + +def run(args): + with infra.network.network( + args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb + ) as network: + network.start_and_open(args) + + network = test_historical_query_stress_cache(network, args) + + +if __name__ == "__main__": + + def add(parser): + pass + + args = infra.e2e_args.cli_args(add=add) + args.package = "samples/apps/logging/liblogging" + args.nodes = infra.e2e_args.max_nodes(args, f=0) + args.initial_member_count = 1 + args.sig_ms_interval = 1000 # Set to cchost default value + + args.historical_cache_soft_limit = "10KB" + + run(args) diff --git a/tests/infra/network.py b/tests/infra/network.py index 5fb715d163ca..bf4c32eb36cc 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -194,6 +194,7 @@ class Network: "acme", "snp_endorsements_servers", "node_to_node_message_limit", + "historical_cache_soft_limit", "tick_ms", "max_msg_size_bytes", "snp_security_policy_file", diff --git a/tests/infra/remote.py b/tests/infra/remote.py index 4a8f9c698136..09fa98befbdb 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -630,6 +630,7 @@ def __init__( snp_security_policy_file=None, snp_uvm_endorsements_file=None, service_subject_name="CN=CCF Test Service", + historical_cache_soft_limit=None, **kwargs, ): """ @@ -823,6 +824,7 @@ def __init__( snp_security_policy_file=snp_security_policy_file, snp_uvm_endorsements_file=snp_uvm_endorsements_file, service_subject_name=service_subject_name, + historical_cache_soft_limit=historical_cache_soft_limit, **kwargs, )