From 6bb60eae3a2f992e488eefb406e6a23994307b72 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Fri, 4 Aug 2023 15:04:50 +0000 Subject: [PATCH 01/17] Add reconfiguration to basicperf --- tests/infra/basicperf.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 87864edf2728..e8c553681611 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -176,6 +176,17 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: return space +def create_and_add_node(network, host, primary): + LOG.info(f"Adding new node: {host}") + node = network.create_node(host) + network.join_node(node, args.package, args, timeout=10) + network.trust_node(node, args) + LOG.info(f"Done adding new node: {host}") + LOG.info("Retiring primary") + network.retire_node(primary, primary) + LOG.info("Primary is retired") + + def run(args): hosts = args.nodes or ["local://localhost"] @@ -303,6 +314,10 @@ def run(args): and time.time() > start_time + args.stop_primary_after_s and not primary.is_stopped() ): + if args.add_new_node_before_primary_stops: + create_and_add_node( + network, args.add_new_node_before_primary_stops, primary + ) LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) @@ -504,6 +519,7 @@ def cli_args(): parser.add_argument( "--stop-primary-after-s", help="Stop primary after this many seconds", type=int ) + parser.add_argument("--add-new-node-before-primary-stops", type=str) return infra.e2e_args.cli_args( parser=parser, accept_unknown=False, ledger_chunk_bytes_override="5MB" ) From 3a413a4c7a1124d60dfac5cd2ce55824aeb54bc2 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 12:20:26 +0000 Subject: [PATCH 02/17] Shut down primary before reconfiguration --- tests/infra/basicperf.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 958a2eba8c5c..431e74596ea4 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -177,15 +177,16 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: return space -def create_and_add_node(network, host, primary): +def create_and_add_node(network, host, old_primary, new_primary): + LOG.info("Retiring old primary") + network.retire_node(new_primary, old_primary) + LOG.info("Old primary is retired") + LOG.info(f"Adding new node: {host}") node = network.create_node(host) - network.join_node(node, args.package, args, timeout=10) + network.join_node(node, args.package, args, timeout=10, copy_ledger=False) network.trust_node(node, args) LOG.info(f"Done adding new node: {host}") - LOG.info("Retiring primary") - network.retire_node(primary, primary) - LOG.info("Primary is retired") def run(args): @@ -315,14 +316,19 @@ def run(args): and time.time() > start_time + args.stop_primary_after_s and not primary.is_stopped() ): - if args.add_new_node_before_primary_stops: - create_and_add_node( - network, args.add_new_node_before_primary_stops, primary - ) LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) primary.stop() + old_primary = primary + primary, _ = network.wait_for_new_primary(primary) + if args.add_new_node_before_primary_stops: + create_and_add_node( + network, + args.add_new_node_before_primary_stops, + old_primary, + primary, + ) time.sleep(1) From b5080e017c1970a9da718124d9295f6016bd0c85 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 12:41:23 +0000 Subject: [PATCH 03/17] Round robin allocation starts from second entry --- tests/infra/basicperf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 431e74596ea4..919501dbb2cc 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -224,7 +224,11 @@ def run(args): requests_file_paths = [] for client_def in args.client_def: count, gen, iterations, target = client_def.split(",") - rr_idx = 0 + # The round robin index deliberately starts at 1, so that backups/any are + # loaded uniformly but the first instance slightly less so where possible. This + # is useful when running a failover test, to avoid the new primary being targeted + # by reads. + rr_idx = 1 for _ in range(int(count)): LOG.info(f"Generating {iterations} requests for client_{client_idx}") msgs = generator.Messages() From 2a16dc72a5bb674807cc9d6e3a72fa81a42da8f3 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 12:56:59 +0000 Subject: [PATCH 04/17] oops --- tests/infra/basicperf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 919501dbb2cc..6f1d206850b7 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -278,7 +278,7 @@ def run(args): # All clients talking to the primary are configured to fail over to the first backup, # which is the only node whose election timeout has not been raised, to guarantee its # election as the old primary becomes unavailable. - if args.stop_primary_after_s: + if args.stop_primary_after_s and target == "primary": cmd.append( f"--failover-server-address={backups[0].get_public_rpc_host()}:{backups[0].get_public_rpc_port()}" ) From 439f6de72f7145742d489a615cdc3a04f7b1deba Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 14:19:25 +0000 Subject: [PATCH 05/17] Use snapshots from the initial primary to reduce failover gap --- tests/infra/basicperf.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 6f1d206850b7..a1abf40eee59 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -177,14 +177,21 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: return space -def create_and_add_node(network, host, old_primary, new_primary): +def create_and_add_node(network, host, old_primary, new_primary, snapshots_dir): LOG.info("Retiring old primary") network.retire_node(new_primary, old_primary) LOG.info("Old primary is retired") LOG.info(f"Adding new node: {host}") node = network.create_node(host) - network.join_node(node, args.package, args, timeout=10, copy_ledger=False) + network.join_node( + node, + args.package, + args, + timeout=10, + copy_ledger=False, + snapshots_dir=snapshots_dir, + ) network.trust_node(node, args) LOG.info(f"Done adding new node: {host}") @@ -320,18 +327,22 @@ def run(args): and time.time() > start_time + args.stop_primary_after_s and not primary.is_stopped() ): + committed_snapshots_dir = network.get_committed_snapshots( + primary, force_txs=False + ) LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) primary.stop() old_primary = primary primary, _ = network.wait_for_new_primary(primary) - if args.add_new_node_before_primary_stops: + if args.add_new_node_after_primary_stops: create_and_add_node( network, - args.add_new_node_before_primary_stops, + args.add_new_node_after_primary_stops, old_primary, primary, + snapshots_dir=committed_snapshots_dir, ) time.sleep(1) @@ -551,7 +562,7 @@ def cli_args(): parser.add_argument( "--stop-primary-after-s", help="Stop primary after this many seconds", type=int ) - parser.add_argument("--add-new-node-before-primary-stops", type=str) + parser.add_argument("--add-new-node-after-primary-stops", type=str) return infra.e2e_args.cli_args( parser=parser, accept_unknown=False, ledger_chunk_bytes_override="5MB" ) From 77f6733b84bb8845a52ad9aa0e6817ac1027607c Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 14:41:31 +0000 Subject: [PATCH 06/17] oops --- tests/infra/basicperf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index a1abf40eee59..a51ae3a46cb9 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -307,6 +307,7 @@ def run(args): try: start_time = time.time() + primary_has_stopped = False while True: stop_waiting = True for i, remote_client in enumerate(clients): @@ -325,7 +326,7 @@ def run(args): if ( args.stop_primary_after_s and time.time() > start_time + args.stop_primary_after_s - and not primary.is_stopped() + and not primary_has_stopped ): committed_snapshots_dir = network.get_committed_snapshots( primary, force_txs=False @@ -334,6 +335,7 @@ def run(args): f"Stopping primary after {args.stop_primary_after_s} seconds" ) primary.stop() + primary_has_stopped = True old_primary = primary primary, _ = network.wait_for_new_primary(primary) if args.add_new_node_after_primary_stops: From 1eabc38017103fadb603b1b20966d814a6e47c9d Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 15:15:26 +0000 Subject: [PATCH 07/17] Only copy the last snapshot --- tests/infra/basicperf.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index a51ae3a46cb9..245a9d4d8fff 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -16,6 +16,7 @@ import random import string import json +import shutil def configure_remote_client(args, client_id, client_host, common_dir): @@ -178,7 +179,7 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: def create_and_add_node(network, host, old_primary, new_primary, snapshots_dir): - LOG.info("Retiring old primary") + LOG.info(f"Retiring old primary {old_primary.local_node_id}") network.retire_node(new_primary, old_primary) LOG.info("Old primary is retired") @@ -331,6 +332,20 @@ def run(args): committed_snapshots_dir = network.get_committed_snapshots( primary, force_txs=False ) + snapshots = os.listdir(committed_snapshots_dir) + sorted_snapshots = sorted( + snapshots, key=lambda x: int(x.split("_")[1]) + ) + latest_snapshot = sorted_snapshots[-1] + latest_snapshot_dir = os.path.join( + network.common_dir, "snapshots_to_copy" + ) + os.mkdir(latest_snapshot_dir) + shutil.copy( + os.path.join(committed_snapshots_dir, latest_snapshot), + latest_snapshot_dir, + ) + # TODO: delete all but the last snapshot to minimise copy time to new remote LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) @@ -344,7 +359,7 @@ def run(args): args.add_new_node_after_primary_stops, old_primary, primary, - snapshots_dir=committed_snapshots_dir, + snapshots_dir=latest_snapshot_dir, ) time.sleep(1) From f204bdfea0d112c2e609c1c74b36bb81fb9b9f09 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 15:16:40 +0000 Subject: [PATCH 08/17] -todo --- tests/infra/basicperf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 245a9d4d8fff..5cd942ff7491 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -345,7 +345,6 @@ def run(args): os.path.join(committed_snapshots_dir, latest_snapshot), latest_snapshot_dir, ) - # TODO: delete all but the last snapshot to minimise copy time to new remote LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) From 976010f54faaabc15bc3afaa26b36186b5f87027 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 7 Aug 2023 17:06:56 +0000 Subject: [PATCH 09/17] stats --- tests/infra/basicperf.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 5cd942ff7491..e3cba70952c5 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -17,6 +17,7 @@ import string import json import shutil +import datetime def configure_remote_client(args, client_id, client_host, common_dir): @@ -178,13 +179,22 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: return space -def create_and_add_node(network, host, old_primary, new_primary, snapshots_dir): +def create_and_add_node( + network, host, old_primary, new_primary, snapshots_dir, statistics +): LOG.info(f"Retiring old primary {old_primary.local_node_id}") + statistics[ + "initial_primary_retirement_start_time" + ] = datetime.datetime.now().isoformat() network.retire_node(new_primary, old_primary) + statistics[ + "initial_primary_retirement_complete_time" + ] = datetime.datetime.now().isoformat() LOG.info("Old primary is retired") LOG.info(f"Adding new node: {host}") node = network.create_node(host) + statistics["new_node_join_start_time"] = datetime.datetime.now().isoformat() network.join_node( node, args.package, @@ -194,6 +204,7 @@ def create_and_add_node(network, host, old_primary, new_primary, snapshots_dir): snapshots_dir=snapshots_dir, ) network.trust_node(node, args) + statistics["new_node_join_complete_time"] = datetime.datetime.now().isoformat() LOG.info(f"Done adding new node: {host}") @@ -307,6 +318,7 @@ def run(args): format_width = len(str(args.client_timeout_s)) + 3 try: + statistics = {} start_time = time.time() primary_has_stopped = False while True: @@ -348,17 +360,24 @@ def run(args): LOG.info( f"Stopping primary after {args.stop_primary_after_s} seconds" ) + statistics[ + "initial_primary_shutdown_time" + ] = datetime.datetime.now().isoformat() primary.stop() primary_has_stopped = True old_primary = primary primary, _ = network.wait_for_new_primary(primary) + statistics[ + "new_primary_election_time" + ] = datetime.datetime.now().isoformat() if args.add_new_node_after_primary_stops: create_and_add_node( network, args.add_new_node_after_primary_stops, old_primary, primary, - snapshots_dir=latest_snapshot_dir, + latest_snapshot_dir, + statistics, ) time.sleep(1) @@ -387,7 +406,6 @@ def run(args): network.stop_all_nodes() - statistics = {} agg = [] for client_id, remote_client in enumerate(clients): From c2c889dda1deb6ac8ccea582b9028b147f6a9869 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 11:12:03 +0000 Subject: [PATCH 10/17] Draft atomic replacement --- tests/infra/basicperf.py | 17 ++++++++++ tests/infra/consortium.py | 19 +++++++++++ tests/infra/network.py | 67 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index e3cba70952c5..2aff88446094 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -207,6 +207,23 @@ def create_and_add_node( statistics["new_node_join_complete_time"] = datetime.datetime.now().isoformat() LOG.info(f"Done adding new node: {host}") +def create_and_add_node( + network, host, old_primary, new_primary, snapshots_dir, statistics +): + LOG.info(f"Add new node: {host}") + node = network.create_node(host) + statistics["new_node_join_start_time"] = datetime.datetime.now().isoformat() + network.join_node( + node, + args.package, + args, + timeout=10, + copy_ledger=False, + snapshots_dir=snapshots_dir, + ) + LOG.info(f"Replace node {old_primary.local_node_id} with {node.local_node_id}") + network.replace_node(old_primary, node, args) + LOG.info(f"Done replacing node: {host}") def run(args): hosts = args.nodes or ["local://localhost"] diff --git a/tests/infra/consortium.py b/tests/infra/consortium.py index 250c2374ea88..c89cfa36c800 100644 --- a/tests/infra/consortium.py +++ b/tests/infra/consortium.py @@ -361,6 +361,25 @@ def trust_nodes( **kwargs, ) + def replace_node(self, remote_node, node_to_retire, node_to_add, valid_from, validity_period_days=None, **kwargs,): + proposal_body = {"actions": []} + trust_args = {"node_id": node_to_add.node_id, "valid_from": str(valid_from)} + if validity_period_days is not None: + trust_args["validity_period_days"] = validity_period_days + proposal_body["actions"].append( + {"name": "transition_node_to_trusted", "args": trust_args} + ) + proposal_body["actions"].append( + {"name": "remove_node", "args": {"node_id": node_to_retire.node_id}} + ) + proposal = self.get_any_active_member().propose(remote_node, proposal_body) + self.vote_using_majority( + remote_node, + proposal, + {"ballot": "export function vote (proposal, proposer_id) { return true }"}, + **kwargs, + ) + def trust_node(self, remote_node, node_id, *args, **kwargs): return self.trust_nodes(remote_node, [node_id], *args, **kwargs) diff --git a/tests/infra/network.py b/tests/infra/network.py index e0d5b637273b..65cdb89214bc 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -955,6 +955,73 @@ def retire_node(self, remote_node, node_to_retire, timeout=10): self.nodes.remove(node_to_retire) + def replace_node( + self, + node_to_retire, + node_to_add, + args, + valid_from=None, + validity_period_days=None, + no_wait=False, + timeout=None, + ): + primary, _ = self.find_primary() + try: + if self.status is ServiceStatus.OPEN: + valid_from = valid_from or datetime.utcnow() + # Note: Timeout is function of the ledger size here since + # the commit of the trust_node proposal may rely on the new node + # catching up (e.g. adding 1 node to a 1-node network). + self.consortium.replace_node( + primary, + node_to_retire, + node_to_add, + valid_from=valid_from, + validity_period_days=validity_period_days, + timeout=args.ledger_recovery_timeout, + ) + if not no_wait: + # The main endorsed RPC interface is only open once the node + # has caught up and observed commit on the service open transaction. + node_to_add.wait_for_node_to_join( + timeout=timeout or args.ledger_recovery_timeout + ) + except (ValueError, TimeoutError): + LOG.error(f"New trusted node {node_to_add.node_id} failed to join the network") + node_to_add.stop() + raise + + node_to_add.network_state = infra.node.NodeNetworkState.joined + node_to_add.set_certificate_validity_period( + valid_from, + validity_period_days or args.maximum_node_certificate_validity_days, + ) + if not no_wait: + self.wait_for_all_nodes_to_commit(primary=primary) + end_time = time.time() + 5 + r = None + while time.time() < end_time: + try: + with primary.client() as c: + r = c.get("/node/network/removable_nodes").body.json() + if node_to_retire.node_id in {n["node_id"] for n in r["nodes"]}: + check_commit = infra.checker.Checker(c) + r = c.delete( + f"/node/network/nodes/{node_to_retire.node_id}" + ) + check_commit(r) + break + else: + r = c.get( + f"/node/network/nodes/{node_to_retire.node_id}" + ).body.json() + except ConnectionRefusedError: + pass + time.sleep(0.1) + else: + raise TimeoutError(f"Timed out waiting for node to become removed: {r}") + self.nodes.remove(node_to_retire) + def create_user(self, local_user_id, curve, record=True): infra.proc.ccall( self.key_generator, From 2995675c7cc54c3175bf5b8fc8597915563e5568 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 12:18:29 +0000 Subject: [PATCH 11/17] Tidy up --- tests/infra/basicperf.py | 36 +++--------------------------------- tests/infra/consortium.py | 10 +++++++++- tests/infra/network.py | 26 +++++++------------------- 3 files changed, 19 insertions(+), 53 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 2aff88446094..246f19c6d40c 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -179,37 +179,7 @@ def create_and_fill_key_space(size: int, primary: infra.node.Node) -> List[str]: return space -def create_and_add_node( - network, host, old_primary, new_primary, snapshots_dir, statistics -): - LOG.info(f"Retiring old primary {old_primary.local_node_id}") - statistics[ - "initial_primary_retirement_start_time" - ] = datetime.datetime.now().isoformat() - network.retire_node(new_primary, old_primary) - statistics[ - "initial_primary_retirement_complete_time" - ] = datetime.datetime.now().isoformat() - LOG.info("Old primary is retired") - - LOG.info(f"Adding new node: {host}") - node = network.create_node(host) - statistics["new_node_join_start_time"] = datetime.datetime.now().isoformat() - network.join_node( - node, - args.package, - args, - timeout=10, - copy_ledger=False, - snapshots_dir=snapshots_dir, - ) - network.trust_node(node, args) - statistics["new_node_join_complete_time"] = datetime.datetime.now().isoformat() - LOG.info(f"Done adding new node: {host}") - -def create_and_add_node( - network, host, old_primary, new_primary, snapshots_dir, statistics -): +def create_and_add_node(network, host, old_primary, snapshots_dir, statistics): LOG.info(f"Add new node: {host}") node = network.create_node(host) statistics["new_node_join_start_time"] = datetime.datetime.now().isoformat() @@ -222,9 +192,10 @@ def create_and_add_node( snapshots_dir=snapshots_dir, ) LOG.info(f"Replace node {old_primary.local_node_id} with {node.local_node_id}") - network.replace_node(old_primary, node, args) + network.replace_stopped_node(old_primary, node, args) LOG.info(f"Done replacing node: {host}") + def run(args): hosts = args.nodes or ["local://localhost"] @@ -392,7 +363,6 @@ def run(args): network, args.add_new_node_after_primary_stops, old_primary, - primary, latest_snapshot_dir, statistics, ) diff --git a/tests/infra/consortium.py b/tests/infra/consortium.py index c89cfa36c800..c9ad02b8b370 100644 --- a/tests/infra/consortium.py +++ b/tests/infra/consortium.py @@ -361,7 +361,15 @@ def trust_nodes( **kwargs, ) - def replace_node(self, remote_node, node_to_retire, node_to_add, valid_from, validity_period_days=None, **kwargs,): + def replace_node( + self, + remote_node, + node_to_retire, + node_to_add, + valid_from, + validity_period_days=None, + **kwargs, + ): proposal_body = {"actions": []} trust_args = {"node_id": node_to_add.node_id, "valid_from": str(valid_from)} if validity_period_days is not None: diff --git a/tests/infra/network.py b/tests/infra/network.py index 65cdb89214bc..1492adbbd2e3 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -955,7 +955,7 @@ def retire_node(self, remote_node, node_to_retire, timeout=10): self.nodes.remove(node_to_retire) - def replace_node( + def replace_stopped_node( self, node_to_retire, node_to_add, @@ -963,7 +963,7 @@ def replace_node( valid_from=None, validity_period_days=None, no_wait=False, - timeout=None, + timeout=5, ): primary, _ = self.find_primary() try: @@ -980,25 +980,15 @@ def replace_node( validity_period_days=validity_period_days, timeout=args.ledger_recovery_timeout, ) - if not no_wait: - # The main endorsed RPC interface is only open once the node - # has caught up and observed commit on the service open transaction. - node_to_add.wait_for_node_to_join( - timeout=timeout or args.ledger_recovery_timeout - ) except (ValueError, TimeoutError): - LOG.error(f"New trusted node {node_to_add.node_id} failed to join the network") + LOG.error( + f"NFailed to replace {node_to_retire.node_id} with {node_to_add.node_id}" + ) node_to_add.stop() raise node_to_add.network_state = infra.node.NodeNetworkState.joined - node_to_add.set_certificate_validity_period( - valid_from, - validity_period_days or args.maximum_node_certificate_validity_days, - ) - if not no_wait: - self.wait_for_all_nodes_to_commit(primary=primary) - end_time = time.time() + 5 + end_time = time.time() + timeout r = None while time.time() < end_time: try: @@ -1006,9 +996,7 @@ def replace_node( r = c.get("/node/network/removable_nodes").body.json() if node_to_retire.node_id in {n["node_id"] for n in r["nodes"]}: check_commit = infra.checker.Checker(c) - r = c.delete( - f"/node/network/nodes/{node_to_retire.node_id}" - ) + r = c.delete(f"/node/network/nodes/{node_to_retire.node_id}") check_commit(r) break else: From 74e6f08aa841fbd3cbac9e9062c5b989070416e5 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 12:49:55 +0000 Subject: [PATCH 12/17] bring the stats back --- tests/infra/basicperf.py | 4 ++-- tests/infra/network.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 246f19c6d40c..ca11a0fd8521 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -192,7 +192,7 @@ def create_and_add_node(network, host, old_primary, snapshots_dir, statistics): snapshots_dir=snapshots_dir, ) LOG.info(f"Replace node {old_primary.local_node_id} with {node.local_node_id}") - network.replace_stopped_node(old_primary, node, args) + network.replace_stopped_node(old_primary, node, args, statistics=statistics) LOG.info(f"Done replacing node: {host}") @@ -356,7 +356,7 @@ def run(args): old_primary = primary primary, _ = network.wait_for_new_primary(primary) statistics[ - "new_primary_election_time" + "new_primary_detected_time" ] = datetime.datetime.now().isoformat() if args.add_new_node_after_primary_stops: create_and_add_node( diff --git a/tests/infra/network.py b/tests/infra/network.py index 1492adbbd2e3..b7536a796af5 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -962,8 +962,8 @@ def replace_stopped_node( args, valid_from=None, validity_period_days=None, - no_wait=False, timeout=5, + statistics=None, ): primary, _ = self.find_primary() try: @@ -972,6 +972,10 @@ def replace_stopped_node( # Note: Timeout is function of the ledger size here since # the commit of the trust_node proposal may rely on the new node # catching up (e.g. adding 1 node to a 1-node network). + if statistics is not None: + statistics[ + "node_replacement_governance_start" + ] = datetime.now().isoformat() self.consortium.replace_node( primary, node_to_retire, @@ -980,6 +984,10 @@ def replace_stopped_node( validity_period_days=validity_period_days, timeout=args.ledger_recovery_timeout, ) + if statistics is not None: + statistics[ + "node_replacement_governance_committed" + ] = datetime.now().isoformat() except (ValueError, TimeoutError): LOG.error( f"NFailed to replace {node_to_retire.node_id} with {node_to_add.node_id}" @@ -1008,6 +1016,8 @@ def replace_stopped_node( time.sleep(0.1) else: raise TimeoutError(f"Timed out waiting for node to become removed: {r}") + if statistics is not None: + statistics["old_node_removal_committed"] = datetime.now().isoformat() self.nodes.remove(node_to_retire) def create_user(self, local_user_id, curve, record=True): From 0a07457cf1fd6044a311a6c81d6dbc2454bf5aa3 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 13:42:45 +0000 Subject: [PATCH 13/17] Lower unnecessarily verbose log level --- src/host/node_connections.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/node_connections.h b/src/host/node_connections.h index 958cf40f73c7..0a6f3c29f661 100644 --- a/src/host/node_connections.h +++ b/src/host/node_connections.h @@ -297,7 +297,7 @@ namespace asynchost const auto address_it = node_addresses.find(to); if (address_it == node_addresses.end()) { - LOG_INFO_FMT("Ignoring node_outbound to unknown node {}", to); + LOG_TRACE_FMT("Ignoring node_outbound to unknown node {}", to); return; } From 4edbafc6333b8a35411231baaef2742f212d0f8e Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 14:36:15 +0000 Subject: [PATCH 14/17] Removing more excessive logging --- samples/apps/basic/basic.cpp | 2 ++ src/consensus/aft/raft.h | 4 ++-- tests/infra/basicperf.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/apps/basic/basic.cpp b/samples/apps/basic/basic.cpp index dba3f4c08aac..5c5250aa4d22 100644 --- a/samples/apps/basic/basic.cpp +++ b/samples/apps/basic/basic.cpp @@ -51,6 +51,7 @@ namespace basicapp }; make_endpoint( "/records/{key}", HTTP_PUT, put, {ccf::user_cert_auth_policy}) + .set_forwarding_required(ccf::endpoints::ForwardingRequired::Never) .install(); auto get = [this](ccf::endpoints::ReadOnlyEndpointContext& ctx) { @@ -85,6 +86,7 @@ namespace basicapp }; make_read_only_endpoint( "/records/{key}", HTTP_GET, get, {ccf::user_cert_auth_policy}) + .set_forwarding_required(ccf::endpoints::ForwardingRequired::Never) .install(); auto post = [this](ccf::endpoints::EndpointContext& ctx) { diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index a217c9ec0b7b..89d1cf131a29 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -520,7 +520,7 @@ namespace aft if (state->leadership_state != kv::LeadershipState::Leader) { - RAFT_FAIL_FMT( + RAFT_DEBUG_FMT( "Failed to replicate {} items: not leader", entries.size()); rollback(state->last_idx); return false; @@ -528,7 +528,7 @@ namespace aft if (term != state->current_view) { - RAFT_FAIL_FMT( + RAFT_DEBUG_FMT( "Failed to replicate {} items at term {}, current term is {}", entries.size(), term, diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index ca11a0fd8521..14a200d51c75 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -420,8 +420,8 @@ def table(): # to maintain consistency, and 504 when we try to write to the future primary # before their election. Since these requests effectively do nothing, they # should not count towards latency statistics. - if args.stop_primary_after_s: - overall = overall.filter(pl.col("responseStatus") < 500) + #if args.stop_primary_after_s: + # overall = overall.filter(pl.col("responseStatus") < 500) overall = overall.with_columns( pl.col("receiveTime").alias("latency") - pl.col("sendTime") From fbdbde687b62265854cfc036ded6ae5a272bda88 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 14:59:58 +0000 Subject: [PATCH 15/17] same fwding on js app --- samples/apps/basic/js/app.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/apps/basic/js/app.json b/samples/apps/basic/js/app.json index 35bef8fa9554..1be9df145b6f 100644 --- a/samples/apps/basic/js/app.json +++ b/samples/apps/basic/js/app.json @@ -4,7 +4,7 @@ "get": { "js_module": "basic.js", "js_function": "get_record", - "forwarding_required": "sometimes", + "forwarding_required": "never", "authn_policies": ["user_cert"], "mode": "readonly", "openapi": {} @@ -12,7 +12,7 @@ "put": { "js_module": "basic.js", "js_function": "put_record", - "forwarding_required": "always", + "forwarding_required": "never", "authn_policies": ["user_cert"], "mode": "readwrite", "openapi": {} From 331385ba460454529535e11576f4f19d234af5e0 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 15:45:28 +0000 Subject: [PATCH 16/17] fmt --- tests/infra/basicperf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/basicperf.py b/tests/infra/basicperf.py index 14a200d51c75..fd371fbe8991 100644 --- a/tests/infra/basicperf.py +++ b/tests/infra/basicperf.py @@ -420,7 +420,7 @@ def table(): # to maintain consistency, and 504 when we try to write to the future primary # before their election. Since these requests effectively do nothing, they # should not count towards latency statistics. - #if args.stop_primary_after_s: + # if args.stop_primary_after_s: # overall = overall.filter(pl.col("responseStatus") < 500) overall = overall.with_columns( From dc0f0d183a23c9a762e2c41bea9df95533436864 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 8 Aug 2023 16:44:54 +0000 Subject: [PATCH 17/17] Fix mt test --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cfc31600093b..f3bc838ccaec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1460,7 +1460,7 @@ if(BUILD_TESTS) PYTHON_SCRIPT ${CMAKE_CURRENT_LIST_DIR}/tests/infra/basicperf.py CLIENT_BIN ./submit ADDITIONAL_ARGS --package "samples/apps/basic/libbasic" --client-def - "${WORKER_THREADS},write,100000,any" + "${WORKER_THREADS},write,100000,primary" ) endif()