From 8fc6024449828d2fb6bb73899e41a09a39773946 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 5 Feb 2024 13:06:40 +0000 Subject: [PATCH 01/19] Move mr tests --- ydb/tests/fq/kikimr/conftest.py | 71 ++++++++ ydb/tests/fq/kikimr/test_base.py | 26 +++ .../kikimr/test_recovery_match_recognize.py | 160 ++++++++++++++++++ ydb/tests/fq/kikimr/ya.make | 37 ++++ ydb/tests/fq/ya.make | 1 + 5 files changed, 295 insertions(+) create mode 100644 ydb/tests/fq/kikimr/conftest.py create mode 100644 ydb/tests/fq/kikimr/test_base.py create mode 100644 ydb/tests/fq/kikimr/test_recovery_match_recognize.py create mode 100644 ydb/tests/fq/kikimr/ya.make diff --git a/ydb/tests/fq/kikimr/conftest.py b/ydb/tests/fq/kikimr/conftest.py new file mode 100644 index 000000000000..687e11205a7c --- /dev/null +++ b/ydb/tests/fq/kikimr/conftest.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pytest + +from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient +from ydb.tests.tools.fq_runner.custom_hooks import * # noqa: F401,F403 Adding custom hooks for YQv2 support +from ydb.tests.tools.fq_runner.kikimr_utils import ExtensionPoint +from ydb.tests.tools.fq_runner.kikimr_utils import YQv2Extension +from ydb.tests.tools.fq_runner.kikimr_utils import ComputeExtension +from ydb.tests.tools.fq_runner.kikimr_utils import DefaultConfigExtension +from ydb.tests.tools.fq_runner.kikimr_utils import StatsModeExtension +from ydb.tests.tools.fq_runner.kikimr_utils import start_kikimr + + +@pytest.fixture +def stats_mode(): + return '' + + +@pytest.fixture +def kikimr(request: pytest.FixtureRequest, yq_version: str, stats_mode: str): + kikimr_extensions = [DefaultConfigExtension(""), + YQv2Extension(yq_version), + ComputeExtension(), + StatsModeExtension(stats_mode)] + with start_kikimr(request, kikimr_extensions) as kikimr: + yield kikimr + + +class ManyRetriesConfigExtension(ExtensionPoint): + def __init__(self): + super().__init__() + + def is_applicable(self, request): + return True + + def apply_to_kikimr(self, request, kikimr): + kikimr.compute_plane.fq_config['control_plane_storage']['retry_policy_mapping'] = [ + { + 'status_code': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + 'policy': { + 'retry_count': 10000 + } + } + ] + + +@pytest.fixture +def kikimr_many_retries(request: pytest.FixtureRequest, yq_version: str): + kikimr_extensions = [DefaultConfigExtension(""), + ManyRetriesConfigExtension(), + YQv2Extension(yq_version), + ComputeExtension()] + with start_kikimr(request, kikimr_extensions) as kikimr: + yield kikimr + + +def create_client(kikimr, request): + return FederatedQueryClient(request.param["folder_id"] if request is not None else "my_folder", + streaming_over_kikimr=kikimr) + + +@pytest.fixture +def client(kikimr, request=None): + return create_client(kikimr, request) + + +@pytest.fixture +def client_many_retries(kikimr_many_retries, request=None): + return create_client(kikimr_many_retries, request) diff --git a/ydb/tests/fq/kikimr/test_base.py b/ydb/tests/fq/kikimr/test_base.py new file mode 100644 index 000000000000..6b09f7a702e9 --- /dev/null +++ b/ydb/tests/fq/kikimr/test_base.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase + + +class TestBaseWithAbortingConfigParams(TestYdsBase): + + @classmethod + def setup_class(cls): + kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True) + cls.streaming_over_kikimr = StreamingOverKikimr(kikimr_conf) + cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_ttl'] = "2s" + cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy'] = {} + cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_count'] = 1 + cls.streaming_over_kikimr.compute_plane.fq_config['pinger']['ping_period'] = "1s" + cls.streaming_over_kikimr.start_mvp_mock_server() + cls.streaming_over_kikimr.start() + + @classmethod + def teardown_class(cls): + if hasattr(cls, "streaming_over_kikimr"): + cls.streaming_over_kikimr.stop_mvp_mock_server() + cls.streaming_over_kikimr.stop() diff --git a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py new file mode 100644 index 000000000000..dcca66c09125 --- /dev/null +++ b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import pytest +import logging +import os +import time + +import ydb.tests.library.common.yatest_common as yatest_common +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +import library.python.retry as retry +from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 +from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase +import ydb.public.api.protos.draft.fq_pb2 as fq + + +@pytest.fixture +def kikimr(request): + kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count=2) + kikimr = StreamingOverKikimr(kikimr_conf) + kikimr.start_mvp_mock_server() + kikimr.start() + yield kikimr + kikimr.stop_mvp_mock_server() + kikimr.stop() + + +class TestRecoveryMatchRecognize(TestYdsBase): + + @classmethod + def setup_class(cls): + # for retry + cls.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) + + @retry.retry_intrusive + def get_graph_master_node_id(self, kikimr, query_id): + for node_index in kikimr.control_plane.kikimr_cluster.nodes: + if kikimr.control_plane.get_task_count(node_index, query_id) > 0: + return node_index + assert False, "No active graphs found" + + def get_ca_count(self, kikimr, node_index): + result = kikimr.control_plane.get_sensors(node_index, "utils").find_sensor( + {"activity": "DQ_COMPUTE_ACTOR", "sensor": "ActorsAliveByActivity", "execpool": "User"} + ) + return result if result is not None else 0 + + def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.plain_or_under_sanitizer(30, 150)): + deadline = time.time() + wait_time + while True: + wcs = 0 + ccs = 0 + list = [] + for node_index in kikimr.control_plane.kikimr_cluster.nodes: + wc = kikimr.control_plane.get_worker_count(node_index) + cc = self.get_ca_count(kikimr, node_index) + wcs += wc + ccs += cc + list.append([node_index, wc, cc]) + if wcs == worker_count and ccs == ca_count: + for [s, w, c] in list: + if w * 2 != c: + continue + for [s, w, c] in list: + logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) + return + if time.time() > deadline: + for [s, w, c] in list: + logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) + assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) + + @yq_v1 + @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) + def test_program_state_recovery(self, kikimr, client, yq_version): + + self.init_topics(f"pq_kikimr_streaming_{yq_version}") + + sql = R''' + PRAGMA dq.MaxTasksPerStage="2"; + + pragma FeatureR010="prototype"; + pragma config.flags("TimeOrderRecoverDelay", "-1000000"); + pragma config.flags("TimeOrderRecoverAhead", "1000000"); + + INSERT INTO myyds.`{output_topic}` + SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) + FROM (SELECT * FROM myyds.`{input_topic}` + WITH ( + format=json_each_row, + SCHEMA + ( + dt UINT64 + ))) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(ALL_TRUE.dt) as dt + ONE ROW PER MATCH + PATTERN ( ALL_TRUE ) + DEFINE + ALL_TRUE as True)''' \ + .format( + input_topic=self.input_topic, + output_topic=self.output_topic, + ) + + client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + kikimr.compute_plane.wait_zero_checkpoint(query_id) + + master_node_index = self.get_graph_master_node_id(kikimr, query_id) + logging.debug("Master node {}".format(master_node_index)) + + messages1 = ['{"dt": 1696849942400002}', '{"dt": 1696849942000001}'] + self.write_stream(messages1) + + logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + + # restart node with CA + node_to_restart = None + for node_index in kikimr.control_plane.kikimr_cluster.nodes: + wc = kikimr.control_plane.get_worker_count(node_index) + if wc is not None: + if wc > 0 and node_index != master_node_index and node_to_restart is None: + node_to_restart = node_index + assert node_to_restart is not None, "Can't find any task on non master node" + + logging.debug("Restart non-master node {}".format(node_to_restart)) + + kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].stop() + kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].start() + kikimr.control_plane.wait_bootstrap(node_to_restart) + + messages2 = [ + '{"dt": 1696849942800000}', + '{"dt": 1696849943200003}', + '{"dt": 1696849943300003}', + '{"dt": 1696849943600003}', + '{"dt": 1696849943900003}' + ] + self.write_stream(messages2) + + assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING + + expected = ['{"dt":1696849942000001}', '{"dt":1696849942400002}', '{"dt":1696849942800000}'] + + read_data = self.read_stream(len(expected)) + logging.info("Data was read: {}".format(read_data)) + + assert read_data == expected + + client.abort_query(query_id) + client.wait_query(query_id) + + self.dump_workers(kikimr, 0, 0) diff --git a/ydb/tests/fq/kikimr/ya.make b/ydb/tests/fq/kikimr/ya.make new file mode 100644 index 000000000000..873d71ffa9a3 --- /dev/null +++ b/ydb/tests/fq/kikimr/ya.make @@ -0,0 +1,37 @@ +PY3TEST() + +FORK_SUBTESTS() +SPLIT_FACTOR(50) + +INCLUDE(${ARCADIA_ROOT}/ydb/tests/tools/fq_runner/ydb_runner_with_datastreams.inc) + +PEERDIR( + ydb/public/api/protos + ydb/public/api/grpc + ydb/tests/tools/datastreams_helpers + ydb/tests/tools/fq_runner +) + +DEPENDS(ydb/tests/tools/pq_read) + +PY_SRCS( + conftest.py + test_base.py +) + +TEST_SRCS( + test_recovery_match_recognize.py +) + +IF (SANITIZER_TYPE == "thread") + TIMEOUT(2400) + SIZE(LARGE) + TAG(ya:fat) +ELSE() + TIMEOUT(600) + SIZE(MEDIUM) +ENDIF() + +REQUIREMENTS(ram:16) + +END() diff --git a/ydb/tests/fq/ya.make b/ydb/tests/fq/ya.make index 734e38f7f709..8fe31a66c9af 100644 --- a/ydb/tests/fq/ya.make +++ b/ydb/tests/fq/ya.make @@ -2,6 +2,7 @@ RECURSE_FOR_TESTS( common generic http_api + kikimr mem_alloc multi_plane plans From 8d336e8ece1945150cffe325eae57d0f865ce272 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 6 Feb 2024 07:11:03 +0000 Subject: [PATCH 02/19] Add second test --- ydb/tests/fq/kikimr/test_recovery_mz.py | 196 ++++++++++++++++++++++++ ydb/tests/fq/kikimr/ya.make | 1 + 2 files changed, 197 insertions(+) create mode 100644 ydb/tests/fq/kikimr/test_recovery_mz.py diff --git a/ydb/tests/fq/kikimr/test_recovery_mz.py b/ydb/tests/fq/kikimr/test_recovery_mz.py new file mode 100644 index 000000000000..631f784a6d91 --- /dev/null +++ b/ydb/tests/fq/kikimr/test_recovery_mz.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +import time +import pytest +import random +import os +import yatest + +import ydb.tests.library.common.yatest_common as yatest_common +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +from ydb.tests.tools.fq_runner.kikimr_runner import TenantConfig +from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient +from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 +from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase + +import library.python.retry as retry +import ydb.public.api.protos.draft.fq_pb2 as fq + + +@pytest.fixture +def kikimr(): + kikimr_conf = StreamingOverKikimrConfig( + cloud_mode=True, + node_count={"/cp": TenantConfig(1), + "/compute": TenantConfig(8)}) + kikimr = StreamingOverKikimr(kikimr_conf) + # control + kikimr.control_plane.fq_config['control_plane_storage']['mapping'] = {"common_tenant_name": ["/compute"]} + kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy'] = {} + kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_count'] = 5 + kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_period'] = "30s" + kikimr.control_plane.fq_config['control_plane_storage']['task_lease_ttl'] = "3s" + # compute + kikimr.compute_plane.fq_config['pinger']['ping_period'] = "1s" + kikimr.start_mvp_mock_server() + kikimr.start() + yield kikimr + kikimr.stop() + kikimr.stop_mvp_mock_server() + + +def run_with_sleep(args): + program_args, time_min, time_max, duration = args + deadline = time.time() + duration + while time.time() < deadline: + yatest.common.execute(program_args) + time.sleep(random.uniform(time_min, time_max)) + + +class TestRecovery(TestYdsBase): + + @retry.retry_intrusive + def get_graph_master_node_id(self, query_id): + for node_index in self.kikimr.compute_plane.kikimr_cluster.nodes: + if self.kikimr.compute_plane.get_task_count(node_index, query_id) > 0: + return node_index + assert False, "No active graphs found" + + def get_ca_count(self, node_index): + result = self.kikimr.compute_plane.get_sensors(node_index, "utils").find_sensor({"activity": "DQ_COMPUTE_ACTOR", "sensor": "ActorsAliveByActivity", "execpool": "User"}) + return result if result is not None else 0 + + def dump_workers(self, worker_count, ca_count, wait_time=yatest_common.plain_or_under_sanitizer(30, 150)): + deadline = time.time() + wait_time + while True: + wcs = 0 + ccs = 0 + list = [] + for node_index in self.kikimr.compute_plane.kikimr_cluster.nodes: + wc = self.kikimr.compute_plane.get_worker_count(node_index) + cc = self.get_ca_count(node_index) + wcs += wc + ccs += cc + list.append([node_index, wc, cc]) + if wcs == worker_count and ccs == ca_count: + for [s, w, c] in list: + if w * 2 != c: + continue + for [s, w, c] in list: + logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) + return + if time.time() > deadline: + for [s, w, c] in list: + logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) + assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) + @yq_v1 + def test_recovery(self, kikimr, client, yq_version): + self.init_topics(f"pq_kikimr_streaming_{yq_version}", partitions_count=2) + + self.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) + self.kikimr = kikimr + kikimr.compute_plane.wait_bootstrap() + kikimr.compute_plane.wait_discovery() + + # Consumer and topics to create are written in ya.make file. + sql = R''' + PRAGMA dq.MaxTasksPerStage="2"; + + INSERT INTO myyds.`{output_topic}` + SELECT STREAM + * + FROM myyds.`{input_topic}`;'''\ + .format( + input_topic=self.input_topic, + output_topic=self.output_topic, + ) + client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) + # client = FederatedQueryClient("my_folder", streaming_over_kikimr=kikimr) + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + self.kikimr.compute_plane.wait_zero_checkpoint(query_id) + + logging.debug("Uuid = {}".format(kikimr.uuid)) + master_node_index = self.get_graph_master_node_id(query_id) + logging.debug("Master node {}".format(master_node_index)) + + self.write_stream([str(i) for i in range(1, 11)]) + + read_data = self.read_stream(10) + + for message in read_data: + logging.info("Received message: {}".format(message)) + + assert len(read_data) == 10 + + d = {} + for m in read_data: + n = int(m) + assert n >= 1 and n <= 10 + assert n not in d + d[n] = 1 + + self.dump_workers(2, 4) + + node_to_restart = None + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + wc = kikimr.compute_plane.get_worker_count(node_index) + if wc is not None: + if wc > 0 and node_index != master_node_index and node_to_restart is None: + node_to_restart = node_index + assert node_to_restart is not None, "Can't find any task on non master node" + + logging.debug("Restart non-master node {}".format(node_to_restart)) + + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].stop() + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() + kikimr.compute_plane.wait_bootstrap(node_to_restart) + + self.dump_workers(2, 4) + + self.write_stream([str(i) for i in range(11, 21)]) + + read_data = self.read_stream(10) + assert len(read_data) == 10 + + for m in read_data: + n = int(m) + assert n >= 1 and n <= 20 + if n in d: + d[n] = d[n] + 1 + else: + d[n] = 1 + + logging.debug("Restart Master node {}".format(master_node_index)) + + kikimr.compute_plane.kikimr_cluster.nodes[master_node_index].stop() + kikimr.compute_plane.kikimr_cluster.nodes[master_node_index].start() + kikimr.compute_plane.wait_bootstrap(master_node_index) + master_node_index = self.get_graph_master_node_id(query_id) + + logging.debug("New master node {}".format(master_node_index)) + + self.dump_workers(2, 4) + + self.write_stream([str(i) for i in range(21, 31)]) + + read_data = self.read_stream(10) + assert len(read_data) == 10 + + for m in read_data: + n = int(m) + assert n >= 1 and n <= 30 + if n in d: + d[n] = d[n] + 1 + else: + d[n] = 1 + + zero_checkpoints_metric = kikimr.compute_plane.get_checkpoint_coordinator_metric(query_id, "StartedFromEmptyCheckpoint") + restored_metric = kikimr.compute_plane.get_checkpoint_coordinator_metric(query_id, "RestoredFromSavedCheckpoint") + assert restored_metric >= 1, "RestoredFromSavedCheckpoint: {}, StartedFromEmptyCheckpoint: {}".format(restored_metric, zero_checkpoints_metric) + + client.abort_query(query_id) + client.wait_query(query_id) diff --git a/ydb/tests/fq/kikimr/ya.make b/ydb/tests/fq/kikimr/ya.make index 873d71ffa9a3..3bc2c637e1ac 100644 --- a/ydb/tests/fq/kikimr/ya.make +++ b/ydb/tests/fq/kikimr/ya.make @@ -21,6 +21,7 @@ PY_SRCS( TEST_SRCS( test_recovery_match_recognize.py + test_recovery_mz.py ) IF (SANITIZER_TYPE == "thread") From 3d0b0ae6bf5c9469da57d44ea07796cabeaf813c Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 6 Feb 2024 13:05:46 +0000 Subject: [PATCH 03/19] Add mr integration test --- .../comp_nodes/mkql_match_recognize.cpp | 36 +++++- .../kikimr/test_recovery_match_recognize.py | 117 +++++++++++++++--- 2 files changed, 129 insertions(+), 24 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 160dfbf8bc15..1098f31411fa 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -221,6 +221,18 @@ class TStateForNonInterleavedPartitions , Cache(cache) , Terminating(false) {} + + NUdf::TUnboxedValue Save() const override { + std::cerr << "TStateForNonInterleavedPartitions::Save()" << std::endl; + TString out; + auto strRef = NUdf::TStringRef(out.data(), out.size()); + return MakeString(strRef); + } + + void Load(const NUdf::TStringRef& state) override { + std::cerr << "TStateForNonInterleavedPartitions::Load()" << std::endl; + } + bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { MKQL_ENSURE(not DelayedRow, "Internal logic error"); //we're finalizing previous partition InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row)); @@ -311,9 +323,21 @@ class TStateForInterleavedPartitions , Parameters(parameters) , NfaTransitionGraph(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) -{ -} - bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { + { + } + + NUdf::TUnboxedValue Save() const override { + std::cerr << "TStateForInterleavedPartitions::Save()" << std::endl; + TString out; + auto strRef = NUdf::TStringRef(out.data(), out.size()); + return MakeString(strRef); + } + + void Load(const NUdf::TStringRef& state) override { + std::cerr << "TStateForInterleavedPartitions::Load()" << std::endl; + } + + bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { auto partition = GetPartitionHandler(row, ctx); if (partition->second->ProcessInputRow(std::move(row), ctx)) { HasReadyOutput.push(partition); @@ -378,8 +402,8 @@ class TStateForInterleavedPartitions }; template -class TMatchRecognizeWrapper : public TStatefulFlowComputationNode> { - using TBaseComputation = TStatefulFlowComputationNode>; +class TMatchRecognizeWrapper : public TStatefulFlowComputationNode, true> { + using TBaseComputation = TStatefulFlowComputationNode, true>; public: TMatchRecognizeWrapper(TComputationMutables &mutables, EValueRepresentation kind, IComputationNode *inputFlow, IComputationExternalNode *inputRowArg, @@ -409,6 +433,7 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode(stateValue.AsBoxed().Get()); while (true) { if (auto output = state->GetOutputIfReady(ctx); output) { + std::cerr << "DoCalculate: return output" << std::endl; return output; } auto item = InputFlow->GetValue(ctx); @@ -418,6 +443,7 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNodeProcessInputRow(std::move(item), ctx); } } diff --git a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py index dcca66c09125..4ee530901c9f 100644 --- a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py +++ b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py @@ -70,11 +70,31 @@ def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.p logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) + def restart_node(self, kikimr, query_id): + # restart node with CA + + master_node_index = self.get_graph_master_node_id(kikimr, query_id) + logging.debug("Master node {}".format(master_node_index)) + + node_to_restart = None + for node_index in kikimr.control_plane.kikimr_cluster.nodes: + wc = kikimr.control_plane.get_worker_count(node_index) + if wc is not None: + if wc > 0 and node_index != master_node_index and node_to_restart is None: + node_to_restart = node_index + assert node_to_restart is not None, "Can't find any task on non master node" + + logging.debug("Restart non-master node {}".format(node_to_restart)) + + kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].stop() + kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].start() + kikimr.control_plane.wait_bootstrap(node_to_restart) + @yq_v1 @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) - def test_program_state_recovery(self, kikimr, client, yq_version): + def test_time_order_recorever(self, kikimr, client, yq_version): - self.init_topics(f"pq_kikimr_streaming_{yq_version}") + self.init_topics("test_time_order_recorever_save_load_state") sql = R''' PRAGMA dq.MaxTasksPerStage="2"; @@ -110,9 +130,6 @@ def test_program_state_recovery(self, kikimr, client, yq_version): client.wait_query_status(query_id, fq.QueryMeta.RUNNING) kikimr.compute_plane.wait_zero_checkpoint(query_id) - master_node_index = self.get_graph_master_node_id(kikimr, query_id) - logging.debug("Master node {}".format(master_node_index)) - messages1 = ['{"dt": 1696849942400002}', '{"dt": 1696849942000001}'] self.write_stream(messages1) @@ -121,20 +138,7 @@ def test_program_state_recovery(self, kikimr, client, yq_version): query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 ) - # restart node with CA - node_to_restart = None - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - wc = kikimr.control_plane.get_worker_count(node_index) - if wc is not None: - if wc > 0 and node_index != master_node_index and node_to_restart is None: - node_to_restart = node_index - assert node_to_restart is not None, "Can't find any task on non master node" - - logging.debug("Restart non-master node {}".format(node_to_restart)) - - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].stop() - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].start() - kikimr.control_plane.wait_bootstrap(node_to_restart) + self.restart_node(kikimr, query_id) messages2 = [ '{"dt": 1696849942800000}', @@ -158,3 +162,78 @@ def test_program_state_recovery(self, kikimr, client, yq_version): client.wait_query(query_id) self.dump_workers(kikimr, 0, 0) + + @yq_v1 + @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) + def test_match_recognize(self, kikimr, client, yq_version): + + self.init_topics("test_match_recognize_save_load_state") + + sql = R''' + PRAGMA dq.MaxTasksPerStage="2"; + + pragma FeatureR010="prototype"; + pragma config.flags("TimeOrderRecoverDelay", "-1000000"); + pragma config.flags("TimeOrderRecoverAhead", "1000000"); + + INSERT INTO myyds.`{output_topic}` + SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) + FROM (SELECT * FROM myyds.`{input_topic}` + WITH ( + format=json_each_row, + SCHEMA + ( + dt UINT64, + str STRING + ))) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(A.dt) as dt_begin, + LAST(C.dt) as dt_end, + LAST(A.str) as a_str, + LAST(B.str) as b_str + ONE ROW PER MATCH + PATTERN ( A B C ) + DEFINE + A as A.str='A', + B as B.str='B', + C as C.str='C')''' \ + .format( + input_topic=self.input_topic, + output_topic=self.output_topic, + ) + + client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + kikimr.compute_plane.wait_zero_checkpoint(query_id) + + messages1 = [ + '{"dt": 1696849942000001, "str": "A" }', + '{"dt": 1696849942500001, "str": "B" }', + '{"dt": 1696849943000001, "str": "C" }'] + self.write_stream(messages1) + + logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + + self.restart_node(kikimr, query_id) + + self.write_stream(['{"dt": 1696849943500001, "str": "D" }', '{"dt": 1696849944100001, "str": "F" }']) + + assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING + + expected = ['{"a_str":"A","b_str":"B","dt_begin":1696849942000001,"dt_end":1696849943000001}'] + + read_data = self.read_stream(1) + logging.info("Data was read: {}".format(read_data)) + + assert read_data == expected + + client.abort_query(query_id) + client.wait_query(query_id) + + self.dump_workers(kikimr, 0, 0) From 2ac5f9e764db406745f03423b41e4b71896dd8fd Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 6 Feb 2024 17:28:13 +0000 Subject: [PATCH 04/19] fix test to not pass --- .../comp_nodes/mkql_match_recognize.cpp | 22 ++++++-- .../kikimr/test_recovery_match_recognize.py | 55 +++++++++++-------- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 1098f31411fa..08dfaddc5cb1 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -423,12 +423,22 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode( - InputRowArg, - PartitionKey, - PartitionKeyType, - Parameters, - Cache + InputRowArg, + PartitionKey, + PartitionKeyType, + Parameters, + Cache ); + } else if (stateValue.HasValue() && !stateValue.IsBoxed()) { + // Load from saved state. + NUdf::TUnboxedValue state = ctx.HolderFactory.Create( + InputRowArg, + PartitionKey, + PartitionKeyType, + Parameters, + Cache); + state.Load(stateValue.AsStringRef()); + stateValue = state; } auto state = static_cast(stateValue.AsBoxed().Get()); while (true) { @@ -578,6 +588,8 @@ std::pair> ConvertListOfStrings(c IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputationNodeFactoryContext& ctx) { + std::cerr << "WrapMatchRecognizeCore" << std::endl; + using namespace NMatchRecognize; size_t inputIndex = 0; const auto& inputFlow = callable.GetInput(inputIndex++); diff --git a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py index 4ee530901c9f..60c473548d62 100644 --- a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py +++ b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py @@ -7,8 +7,7 @@ import time import ydb.tests.library.common.yatest_common as yatest_common -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr, StreamingOverKikimrConfig, TenantConfig import library.python.retry as retry from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase @@ -17,7 +16,7 @@ @pytest.fixture def kikimr(request): - kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count=2) + kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count={"/cp": TenantConfig(1), "/compute": TenantConfig(1)}) kikimr = StreamingOverKikimr(kikimr_conf) kikimr.start_mvp_mock_server() kikimr.start() @@ -33,12 +32,12 @@ def setup_class(cls): # for retry cls.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) - @retry.retry_intrusive - def get_graph_master_node_id(self, kikimr, query_id): - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - if kikimr.control_plane.get_task_count(node_index, query_id) > 0: - return node_index - assert False, "No active graphs found" + # @retry.retry_intrusive + # def get_graph_master_node_id(self, kikimr, query_id): + # for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + # if kikimr.compute_plane.get_task_count(node_index, query_id) > 0: + # return node_index + # assert False, "No active graphs found" def get_ca_count(self, kikimr, node_index): result = kikimr.control_plane.get_sensors(node_index, "utils").find_sensor( @@ -73,22 +72,27 @@ def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.p def restart_node(self, kikimr, query_id): # restart node with CA - master_node_index = self.get_graph_master_node_id(kikimr, query_id) - logging.debug("Master node {}".format(master_node_index)) + + # master_node_index = self.get_graph_master_node_id(kikimr, query_id) + # logging.debug("Master node {}".format(master_node_index)) node_to_restart = None - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - wc = kikimr.control_plane.get_worker_count(node_index) + + # for node_index in kikimr.control_plane.kikimr_cluster.nodes: + # logging.debug("Master node {}".format(master_node_index)) + + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + wc = kikimr.compute_plane.get_worker_count(node_index) if wc is not None: - if wc > 0 and node_index != master_node_index and node_to_restart is None: + if wc > 0 and node_to_restart is None: node_to_restart = node_index - assert node_to_restart is not None, "Can't find any task on non master node" + assert node_to_restart is not None, "Can't find any task on node" - logging.debug("Restart non-master node {}".format(node_to_restart)) + logging.debug("Restart compute node {}".format(node_to_restart)) - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].stop() - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].start() - kikimr.control_plane.wait_bootstrap(node_to_restart) + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].stop() + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() + kikimr.compute_plane.wait_bootstrap(node_to_restart) @yq_v1 @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) @@ -192,7 +196,8 @@ def test_match_recognize(self, kikimr, client, yq_version): LAST(A.dt) as dt_begin, LAST(C.dt) as dt_end, LAST(A.str) as a_str, - LAST(B.str) as b_str + LAST(B.str) as b_str, + LAST(C.str) as c_str ONE ROW PER MATCH PATTERN ( A B C ) DEFINE @@ -212,9 +217,13 @@ def test_match_recognize(self, kikimr, client, yq_version): messages1 = [ '{"dt": 1696849942000001, "str": "A" }', '{"dt": 1696849942500001, "str": "B" }', - '{"dt": 1696849943000001, "str": "C" }'] + '{"dt": 1696849943000001, "str": "C" }', + '{"dt": 1696849943600001, "str": "D" }'] # push A+B from TimeOrderRecoverer to MatchRecognize self.write_stream(messages1) + # A + B : in MatchRecognize + # C + D : in TimeOrderRecoverer + logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) kikimr.compute_plane.wait_completed_checkpoints( query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 @@ -222,11 +231,11 @@ def test_match_recognize(self, kikimr, client, yq_version): self.restart_node(kikimr, query_id) - self.write_stream(['{"dt": 1696849943500001, "str": "D" }', '{"dt": 1696849944100001, "str": "F" }']) + self.write_stream(['{"dt": 1696849944100001, "str": "E" }']) assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING - expected = ['{"a_str":"A","b_str":"B","dt_begin":1696849942000001,"dt_end":1696849943000001}'] + expected = ['{"a_str":"A","b_str":"B","c_str":"C",,"dt_begin":1696849942000001,"dt_end":1696849943000001}'] read_data = self.read_stream(1) logging.info("Data was read: {}".format(read_data)) From 4b310be23bcea5e0c820c2b77d5a9ba185f12901 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 7 Feb 2024 14:10:16 +0000 Subject: [PATCH 05/19] add unit test / in progress --- .../comp_nodes/mkql_match_recognize.cpp | 85 +++++++ .../comp_nodes/mkql_match_recognize_list.h | 47 +++- .../comp_nodes/mkql_match_recognize_nfa.h | 7 + .../comp_nodes/ut/CMakeLists.darwin-arm64.txt | 1 + .../ut/CMakeLists.darwin-x86_64.txt | 1 + .../ut/CMakeLists.linux-aarch64.txt | 1 + .../comp_nodes/ut/CMakeLists.linux-x86_64.txt | 1 + .../ut/CMakeLists.windows-x86_64.txt | 1 + .../comp_nodes/ut/mkql_match_recognize_ut.cpp | 238 ++++++++++++++++++ .../yql/minikql/comp_nodes/ut/ya.make.inc | 1 + 10 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 08dfaddc5cb1..bb1e941eeeb3 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -3,6 +3,7 @@ #include "mkql_match_recognize_measure_arg.h" #include "mkql_match_recognize_nfa.h" #include +#include #include #include #include @@ -17,6 +18,8 @@ namespace NKikimr::NMiniKQL { namespace NMatchRecognize { +constexpr ui32 StateVersion = 1; + enum class EOutputColumnSource {PartitionKey, Measure}; using TOutputColumnOrder = std::vector, TMKQLAllocator>>; @@ -34,6 +37,10 @@ struct TMatchRecognizeProcessorParameters { TMeasureInputColumnOrder MeasureInputColumnOrder; TComputationNodePtrVector Measures; TOutputColumnOrder OutputColumnOrder; + // + TType* const StateType; + TMutableObjectOverBoxedValue Packer; + }; class TBackTrackingMatchRecognize { @@ -71,6 +78,7 @@ class TBackTrackingMatchRecognize { } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { + std::cerr << "TBackTrackingMatchRecognize::ProcessInputRow()" << std::endl; Y_UNUSED(ctx); Rows.Append(std::move(row)); return false; @@ -111,6 +119,7 @@ class TBackTrackingMatchRecognize { for (size_t v = 0; v != Parameters.Defines.size(); ++v) { const auto &d = Parameters.Defines[v]->GetValue(ctx); if (d && d.GetOptionalValue().Get()) { + std::cerr << "Defines return true "<< std::endl; Extend(CurMatchedVars[v], TRange{i}); } } @@ -147,6 +156,7 @@ class TStreamingMatchRecognize { const TContainerCacheOnContext& cache ) : PartitionKey(std::move(partitionKey)) + , Rows(parameters) , Parameters(parameters) , Nfa(nfaTransitions, parameters.MatchedVarsArg, parameters.Defines) , Cache(cache) @@ -154,11 +164,17 @@ class TStreamingMatchRecognize { } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { + std::cerr << "class TStreamingMatchRecognize::ProcessInputRow()" << std::endl; Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create>(Rows)); Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(Rows.Size())); Nfa.ProcessRow(Rows.Append(std::move(row)), ctx); + return HasMatched(); + } + + bool HasMatched() { return Nfa.HasMatched(); } + NUdf::TUnboxedValue GetOutputIfReady(TComputationContext& ctx) { auto match = Nfa.GetMatched(); if (!match.has_value()) @@ -189,6 +205,21 @@ class TStreamingMatchRecognize { Y_UNUSED(ctx); return false; } + + void Save(TString& out) { + std::cerr << "TStreamingMatchRecognize::Save()" << std::endl; + Rows.Save(out); + Nfa.Save(out); + WriteUi64(out, MatchNumber); + } + + void Load(TStringBuf& in) { + std::cerr << "TStreamingMatchRecognize::Load()" << std::endl; + Rows.Load(in); + Nfa.Load(in); + MatchNumber = ReadUi64(in); + } + private: const NUdf::TUnboxedValue PartitionKey; const TMatchRecognizeProcessorParameters& Parameters; @@ -225,6 +256,8 @@ class TStateForNonInterleavedPartitions NUdf::TUnboxedValue Save() const override { std::cerr << "TStateForNonInterleavedPartitions::Save()" << std::endl; TString out; + + auto strRef = NUdf::TStringRef(out.data(), out.size()); return MakeString(strRef); } @@ -234,6 +267,7 @@ class TStateForNonInterleavedPartitions } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { + std::cerr << "TStateForNonInterleavedPartitions::ProcessInputRow()" << std::endl; MKQL_ENSURE(not DelayedRow, "Internal logic error"); //we're finalizing previous partition InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row)); auto partitionKey = PartitionKey->GetValue(ctx); @@ -329,15 +363,57 @@ class TStateForInterleavedPartitions NUdf::TUnboxedValue Save() const override { std::cerr << "TStateForInterleavedPartitions::Save()" << std::endl; TString out; + WriteUi32(out, StateVersion); + WriteUi32(out, Partitions.size()); + for (const auto& [key, state] : Partitions) { + WriteString(out, key); + state->Save(out); + std::cerr << "partitions.HasMatched() " << state->HasMatched() << std::endl; + } + + // WriteUi32(out, HasReadyOutput.size()); + // for (const auto it : HasReadyOutput) { + // auto& key = it->first; + // WriteString(out, key); + // } + + std::cerr << "HasReadyOutput size " << HasReadyOutput.size() << std::endl; + auto strRef = NUdf::TStringRef(out.data(), out.size()); return MakeString(strRef); } void Load(const NUdf::TStringRef& state) override { std::cerr << "TStateForInterleavedPartitions::Load()" << std::endl; + + TStringBuf in(state.Data(), state.Size()); + + const auto stateVersion = ReadUi32(in); + if (stateVersion == 1) { + Partitions.clear(); + auto partitionsSize = ReadUi32(in); + for (size_t i = 0; i < partitionsSize; ++i) { + auto key = ReadString(in); + auto pair = Partitions.emplace(key, std::make_unique( + NYql::NUdf::TUnboxedValuePod(NYql::NUdf::TStringValue(key)), + Parameters, + NfaTransitionGraph, + Cache)); + (pair.first)->second->Load(in); + } + + std::cerr << "partitionsSize " << partitionsSize << std::endl; + for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { + std::cerr << "it->second->HasMatched() " << it->second->HasMatched() << std::endl; + if (it->second->HasMatched()) { + HasReadyOutput.push(it); + } + } + } } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { + std::cerr << "TStateForInterleavedPartitions::ProcessInputRow()" << std::endl; auto partition = GetPartitionHandler(row, ctx); if (partition->second->ProcessInputRow(std::move(row), ctx)) { HasReadyOutput.push(partition); @@ -375,6 +451,7 @@ class TStateForInterleavedPartitions InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row)); auto partitionKey = PartitionKey->GetValue(ctx); const auto packedKey = PartitionKeyPacker.Pack(partitionKey); + std::cerr << "partitionKey " << TString(packedKey)<< std::endl; if (const auto it = Partitions.find(TString(packedKey)); it != Partitions.end()) { return it; } else { @@ -418,6 +495,7 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNodeGetValue(ctx); if (item.IsFinish()) { + std::cerr << "call ProcessEndOfData()" << std::endl; state->ProcessEndOfData(ctx); continue; } else if (item.IsSpecial()) { + std::cerr << "IsSpecial" << std::endl; return item; } std::cerr << "ProcessInputRow2" << std::endl; @@ -484,6 +564,8 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNodeGetItemType()); const auto parameters = TMatchRecognizeProcessorParameters { static_cast(LocateNode(ctx.NodeLocator, *inputDataArg.GetNode())) @@ -633,6 +716,8 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation ) , ConvertVectorOfCallables(measures, ctx) , GetOutputColumnOrder(partitionColumnIndexes, measureColumnIndexes) + , rowType + , ctx.Mutables }; if (AS_VALUE(TDataLiteral, streamingMode)->AsValue().Get()) { return new TMatchRecognizeWrapper(ctx.Mutables diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index 626e3c102a2d..b9814f2672b4 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -95,6 +96,10 @@ class TSparseList { public: using TPtr = TIntrusivePtr; + TContainer(const TMatchRecognizeProcessorParameters& parameters) + : Parameters(parameters) { + } + void Add(size_t index, NUdf::TUnboxedValue&& value) { const auto& [iter, newOne] = Storage.emplace(index, TItem{std::move(value), 1}); MKQL_ENSURE(newOne, "Internal logic error"); @@ -131,6 +136,27 @@ class TSparseList { } } + void Save(TString& out) { + std::cerr << "TContainer::Save()" << std::endl; + WriteUi64(out, Storage.size()); + for (const auto& [key, item]: Storage) { + WriteUi64(out, key); + // WriteUi64(out, item.Value); TODO + WriteUnboxedValue(out, Parameters.Packer.RefMutableObject(Ctx, false, Parameters.StateType), item.Value); + WriteUi64(out, item.LockCount); + } + } + + void Load(TStringBuf& in) { + std::cerr << "TContainer::Load()" << std::endl; + auto size = ReadUi64(in); + for (size_t i =0; i < size; ++i) { + auto key = ReadUi64(in); + auto lockCount = ReadUi64(in); + Storage.emplace(key, TItem{NUdf::TUnboxedValue{}, lockCount}); + } + } + private: //TODO consider to replace hash table with contiguous chunks using TAllocator = TMKQLAllocator, EMemorySubPool::Temporary>; @@ -140,6 +166,7 @@ class TSparseList { std::hash, std::equal_to, TAllocator> Storage; + const TMatchRecognizeProcessorParameters& Parameters; }; using TContainerPtr = TContainer::TPtr; @@ -272,6 +299,11 @@ class TSparseList { size_t ToIndex; }; + TSparseList(const TMatchRecognizeProcessorParameters& parameters) + : Parameters(parameters) + , Container(MakeIntrusive(parameters)) { + } + public: TRange Append(NUdf::TUnboxedValue&& value) { const auto index = ListSize++; @@ -297,9 +329,22 @@ class TSparseList { return Size() == 0; } + void Save(TString& out) { + std::cerr << "TSparseList::Save()" << std::endl; + Container->Save(out); + WriteUi64(out, ListSize); + } + + void Load(TStringBuf& in) { + std::cerr << "TSparseList::Load()" << std::endl; + Container->Load(in); + ListSize = ReadUi64(in); + } + private: - TContainerPtr Container = MakeIntrusive(); + TContainerPtr Container; size_t ListSize = 0; //impl: max index ever stored + 1 + const TMatchRecognizeProcessorParameters& Parameters; }; template diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 1ad86607a297..1db6d6f37b88 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -3,6 +3,7 @@ #include "mkql_match_recognize_matched_vars.h" #include "../computation/mkql_computation_node_holders.h" #include "../computation/mkql_computation_node_impl.h" +#include #include #include #include @@ -329,6 +330,12 @@ class TNfa { return ActiveStates.size(); } + void Save(TString& out) { + } + + void Load(TStringBuf& in) { + } + private: //TODO (zverevgeny): Consider to change to std::vector for the sake of perf using TStateSet = std::set, TMKQLAllocator>; diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-arm64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-arm64.txt index 14946a9072df..e8c252dd0b4c 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-arm64.txt +++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-arm64.txt @@ -64,6 +64,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt index b4ada0b07714..e3c9590f8be8 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt +++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.darwin-x86_64.txt @@ -65,6 +65,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt index 1dd6565f1543..508aaed2ad18 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt +++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-aarch64.txt @@ -68,6 +68,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt index 848ec70bcd83..9e33793c87ac 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt +++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.linux-x86_64.txt @@ -69,6 +69,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt index 4522578cbde3..837e381afdad 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt +++ b/ydb/library/yql/minikql/comp_nodes/ut/CMakeLists.windows-x86_64.txt @@ -58,6 +58,7 @@ target_sources(ydb-library-yql-minikql-comp_nodes-ut PRIVATE ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp + ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_safe_circular_buffer_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_sort_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/library/yql/minikql/comp_nodes/ut/mkql_switch_ut.cpp diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp new file mode 100644 index 000000000000..96295e0b8d0d --- /dev/null +++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp @@ -0,0 +1,238 @@ +#include "../mkql_time_order_recover.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace NKikimr { + namespace NMiniKQL { + + namespace { + TIntrusivePtr CreateRandomProvider() { + return CreateDeterministicRandomProvider(1); + } + + TIntrusivePtr CreateTimeProvider() { + return CreateDeterministicTimeProvider(10000000); + } + + struct TSetup { + TSetup(TScopedAlloc& alloc) + : Alloc(alloc) + { + FunctionRegistry = CreateFunctionRegistry(CreateBuiltinRegistry()); + RandomProvider = CreateRandomProvider(); + TimeProvider = CreateTimeProvider(); + + Env.Reset(new TTypeEnvironment(Alloc)); + PgmBuilder.Reset(new TProgramBuilder(*Env, *FunctionRegistry)); + } + + THolder BuildGraph(TRuntimeNode pgm, const std::vector& entryPoints = std::vector()) { + Explorer.Walk(pgm.GetNode(), *Env); + TComputationPatternOpts opts( + Alloc.Ref(), + *Env, GetBuiltinFactory(), + FunctionRegistry.Get(), + NUdf::EValidateMode::None, + NUdf::EValidatePolicy::Fail, "OFF", EGraphPerProcess::Multi); + Pattern = MakeComputationPattern(Explorer, pgm, entryPoints, opts); + TComputationOptsFull compOpts = opts.ToComputationOptions(*RandomProvider, *TimeProvider); + return Pattern->Clone(compOpts); + } + + TIntrusivePtr FunctionRegistry; + TIntrusivePtr RandomProvider; + TIntrusivePtr TimeProvider; + + TScopedAlloc& Alloc; + THolder Env; + THolder PgmBuilder; + + TExploringNodeVisitor Explorer; + IComputationPattern::TPtr Pattern; + }; + + using TTestInputData = std::vector>; + using TTestData = std::vector>; + + THolder BuildGraph(TSetup& setup, const TTestInputData& input) { + TProgramBuilder& pgmBuilder = *setup.PgmBuilder; + + auto structType = pgmBuilder.NewStructType({ + {"time", pgmBuilder.NewDataType(NUdf::TDataType::Id)}, + {"key", pgmBuilder.NewDataType(NUdf::TDataType::Id)}, + {"sum", pgmBuilder.NewDataType(NUdf::TDataType::Id)}, + {"part", pgmBuilder.NewDataType(NUdf::TDataType::Id)}}); + + TVector items; + // constexpr ui64 g_Yield = std::numeric_limits::max(); + // items.push_back(pgmBuilder.NewDataLiteral(g_Yield)); + for (size_t i = 0; i < input.size(); ++i) + { + auto time = pgmBuilder.NewDataLiteral(std::get<0>(input[i])); + auto key = pgmBuilder.NewDataLiteral(NUdf::TStringRef(std::get<1>(input[i]))); + auto sum = pgmBuilder.NewDataLiteral(std::get<2>(input[i])); + auto part = pgmBuilder.NewDataLiteral(NUdf::TStringRef(std::get<3>(input[i]))); + + auto item = pgmBuilder.NewStruct(structType, + {{"time", time}, {"key", key}, {"sum", sum}, {"part", part}}); + items.push_back(std::move(item)); + } + + + const auto list = pgmBuilder.NewList(structType, std::move(items)); + + auto inputFlow = pgmBuilder.ToFlow(list); + + i64 delay = -10; + i64 ahead = 30; + ui32 rowLimit = 20; + + +/* + TRuntimeNode inputStream, + const TUnaryLambda& getPartitionKeySelectorNode, + const TArrayRef& partitionColumns, + const TArrayRef>& getMeasures, + const NYql::NMatchRecognize::TRowPattern& pattern, + const TArrayRef>& getDefines, + bool streamingMode +*/ + + // MEASURES +// LAST(A.dt) as dt_begin + // ONE ROW PER MATCH + // PATTERN ( A{3, 3} ) + // DEFINE A as True) + + + TVector partitionColumns;// = {TStringBuf("a")}; + TVector> getMeasures = {{ + std::make_pair( + TStringBuf("key"), + [&](TRuntimeNode measureInputDataArg, TRuntimeNode matchedVarsArg) { + //return pgmBuilder.Length(measureInputDataArg); + auto run = pgmBuilder.Take(measureInputDataArg, pgmBuilder.NewDataLiteral(0)); + + auto oldType = run.GetStaticType(); + // oldType->GetKindAsStr(); + + std::cerr << "GetKindAsStr " << oldType->GetKindAsStr() << std::endl; + + + // MKQL_ENSURE(oldType->IsStruct(), "Expected struct"); + + // const auto& oldTypeDetailed = static_cast(*oldType); + // auto count = oldTypeDetailed.GetMembersCount(); + + // return pgmBuilder.NewDataLiteral(count); + //return measureInputDataArg; + //return pgmBuilder.NewDataLiteral(42); + // return pgmBuilder.Size(measureInputDataArg); + + // bool isOptional; + // const auto type = AS_TYPE(TStructType, UnpackOptional(matchedVarsArg.GetStaticType(), isOptional)); + // auto count = type->GetMembersCount(); + + + return pgmBuilder.NewDataLiteral(56); + } + )}}; + TVector> getDefines = {{ + std::make_pair( + TStringBuf("A"), + [&](TRuntimeNode inputDataArg, TRuntimeNode matchedVarsArg, TRuntimeNode currentRowIndexArg) { + return pgmBuilder.NewDataLiteral(true); + } + )}}; + + auto pgmReturn = pgmBuilder.MatchRecognizeCore( + inputFlow, + [&](TRuntimeNode item) { + return pgmBuilder.Member(item, "part"); + }, + partitionColumns, // partitionColumns + getMeasures, + { + {NYql::NMatchRecognize::TRowPatternFactor{"A", 3, 3, false, false, false}} + }, + getDefines, + true); + + auto graph = setup.BuildGraph(pgmReturn); + return graph; + } + } + + Y_UNIT_TEST_SUITE(TMiniKQLMatchRecognizeSaveLoadTest) { + void TestWithSaveLoadImpl( + const TTestInputData& input, + const TTestData& expected) + { + TScopedAlloc alloc(__LOCATION__); + std::vector> result; + TSetup setup1(alloc); + auto graph1 = BuildGraph(setup1, input); + + auto value = graph1->GetValue(); + + std::cerr << "IsFinish " << value.IsFinish() << std::endl; + UNIT_ASSERT(!value.IsFinish() && value); + auto v = value.GetElement(0).Get(); + std::cerr << "GetElement " << v << std::endl; + + + // { + // value = graph1->GetValue(); + // UNIT_ASSERT(!value.IsFinish() && value); + // v = value.GetElement(0).Get(); + // std::cerr << "GetElement " << v << std::endl; + // } + TString graphState = graph1->SaveGraphState(); + + std::cerr << "----------------------" << std::endl; + std::cerr << "State size " << graphState.size() << std::endl; + TSetup setup2(alloc); + + auto graph2 = BuildGraph(setup2, TTestInputData{{1003, "D", 103, "P"}}); + graph2->LoadGraphState(graphState); + + value = graph2->GetValue(); + UNIT_ASSERT(!value.IsFinish() && value); + v = value.GetElement(0).Get(); + std::cerr << "GetElement " << v << std::endl; + } + + const TTestInputData input = { + // Time; Key; Value; PartitionKey + {1000, "A", 101, "P"}, + {1001, "B", 102, "P"}, + {1002, "C", 103, "P"}, // <- match end + {1003, "D", 103, "P"}}; // <- not processed + + const std::vector> expected = { + // Group; Time; Value + {1000, 800, 101}, + {1000, 800, 102}, + {1000, 800, 103}, + {1000, 800, 104}, + {1000, 800, 105}, + {3000, 801, 200}, + {2000, 802, 300}}; + + Y_UNIT_TEST(Test1) { + TestWithSaveLoadImpl(input, expected); + } + + } + + } // namespace NMiniKQL +} // namespace NKikimr diff --git a/ydb/library/yql/minikql/comp_nodes/ut/ya.make.inc b/ydb/library/yql/minikql/comp_nodes/ut/ya.make.inc index 12c027fddc94..484483baf6e6 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/ya.make.inc +++ b/ydb/library/yql/minikql/comp_nodes/ut/ya.make.inc @@ -52,6 +52,7 @@ SET(ORIG_SOURCES mkql_match_recognize_matched_vars_ut.cpp mkql_match_recognize_list_ut.cpp mkql_match_recognize_nfa_ut.cpp + mkql_match_recognize_ut.cpp mkql_safe_circular_buffer_ut.cpp mkql_sort_ut.cpp mkql_switch_ut.cpp From 343e438e2efc5fbf6b8e4479a580cdd2e0495725 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 8 Feb 2024 06:55:30 +0000 Subject: [PATCH 06/19] compilation fix --- .../comp_nodes/mkql_match_recognize.cpp | 77 +++++++++---------- .../comp_nodes/mkql_match_recognize_list.h | 23 ++---- .../mkql_match_recognize_parameters.h | 35 +++++++++ 3 files changed, 79 insertions(+), 56 deletions(-) create mode 100644 ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index bb1e941eeeb3..4feb6f9f0724 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -2,6 +2,8 @@ #include "mkql_match_recognize_matched_vars.h" #include "mkql_match_recognize_measure_arg.h" #include "mkql_match_recognize_nfa.h" +#include "mkql_match_recognize_parameters.h" + #include #include #include @@ -20,29 +22,8 @@ namespace NMatchRecognize { constexpr ui32 StateVersion = 1; -enum class EOutputColumnSource {PartitionKey, Measure}; -using TOutputColumnOrder = std::vector, TMKQLAllocator>>; - using namespace NYql::NMatchRecognize; -struct TMatchRecognizeProcessorParameters { - IComputationExternalNode* InputDataArg; - TRowPattern Pattern; - TUnboxedValueVector VarNames; - THashMap VarNamesLookup; - IComputationExternalNode* MatchedVarsArg; - IComputationExternalNode* CurrentRowIndexArg; - TComputationNodePtrVector Defines; - IComputationExternalNode* MeasureInputDataArg; - TMeasureInputColumnOrder MeasureInputColumnOrder; - TComputationNodePtrVector Measures; - TOutputColumnOrder OutputColumnOrder; - // - TType* const StateType; - TMutableObjectOverBoxedValue Packer; - -}; - class TBackTrackingMatchRecognize { using TPartitionList = TSimpleList; using TRange = TPartitionList::TRange; @@ -156,7 +137,6 @@ class TStreamingMatchRecognize { const TContainerCacheOnContext& cache ) : PartitionKey(std::move(partitionKey)) - , Rows(parameters) , Parameters(parameters) , Nfa(nfaTransitions, parameters.MatchedVarsArg, parameters.Defines) , Cache(cache) @@ -206,9 +186,9 @@ class TStreamingMatchRecognize { return false; } - void Save(TString& out) { + void Save(TString& out, const TSaveLoadContext& ctx) { std::cerr << "TStreamingMatchRecognize::Save()" << std::endl; - Rows.Save(out); + Rows.Save(out, ctx); Nfa.Save(out); WriteUi64(out, MatchNumber); } @@ -236,12 +216,15 @@ class TStateForNonInterleavedPartitions using TRowPatternConfigurationBuilder = typename Algo::TPatternConfigurationBuilder; public: TStateForNonInterleavedPartitions( - TMemoryUsageInfo* memInfo, - IComputationExternalNode* inputRowArg, - IComputationNode* partitionKey, - TType* partitionKeyType, - const TMatchRecognizeProcessorParameters& parameters, - const TContainerCacheOnContext& cache + TMemoryUsageInfo* memInfo, + IComputationExternalNode* inputRowArg, + IComputationNode* partitionKey, + TType* partitionKeyType, + const TMatchRecognizeProcessorParameters& parameters, + const TContainerCacheOnContext& cache, + TComputationContext &ctx, + TType* stateType, + const TMutableObjectOverBoxedValue& packer ) : TComputationValue(memInfo) , InputRowArg(inputRowArg) @@ -348,7 +331,10 @@ class TStateForInterleavedPartitions IComputationNode* partitionKey, TType* partitionKeyType, const TMatchRecognizeProcessorParameters& parameters, - const TContainerCacheOnContext& cache + const TContainerCacheOnContext& cache, + TComputationContext &ctx, + TType* stateType, + const TMutableObjectOverBoxedValue& packer ) : TComputationValue(memInfo) , InputRowArg(inputRowArg) @@ -357,6 +343,7 @@ class TStateForInterleavedPartitions , Parameters(parameters) , NfaTransitionGraph(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) + , SaveLoadContex(ctx, stateType, packer) { } @@ -365,9 +352,10 @@ class TStateForInterleavedPartitions TString out; WriteUi32(out, StateVersion); WriteUi32(out, Partitions.size()); + for (const auto& [key, state] : Partitions) { WriteString(out, key); - state->Save(out); + state->Save(out, SaveLoadContex); std::cerr << "partitions.HasMatched() " << state->HasMatched() << std::endl; } @@ -476,6 +464,7 @@ class TStateForInterleavedPartitions const TMatchRecognizeProcessorParameters& Parameters; const TNfaTransitionGraph::TPtr NfaTransitionGraph; const TContainerCacheOnContext& Cache; + TSaveLoadContext SaveLoadContex; }; template @@ -486,7 +475,8 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode Packer; }; TOutputColumnOrder GetOutputColumnOrder(TRuntimeNode partitionKyeColumnsIndexes, TRuntimeNode measureColumnsIndexes) { @@ -716,8 +714,6 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation ) , ConvertVectorOfCallables(measures, ctx) , GetOutputColumnOrder(partitionColumnIndexes, measureColumnIndexes) - , rowType - , ctx.Mutables }; if (AS_VALUE(TDataLiteral, streamingMode)->AsValue().Get()) { return new TMatchRecognizeWrapper(ctx.Mutables @@ -727,6 +723,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) , partitionKeySelector.GetStaticType() , std::move(parameters) + , rowType ); } else { const bool useNfaForTables = true; //TODO(YQL-16486) get this flag from an optimizer @@ -738,6 +735,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) , partitionKeySelector.GetStaticType() , std::move(parameters) + , rowType ); } else { return new TMatchRecognizeWrapper>(ctx.Mutables @@ -747,6 +745,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) , partitionKeySelector.GetStaticType() , std::move(parameters) + , rowType ); } } diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index b9814f2672b4..a85c994329b9 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -2,10 +2,10 @@ #include #include #include +#include #include #include #include - namespace NKikimr::NMiniKQL::NMatchRecognize { class TSimpleList { @@ -96,10 +96,6 @@ class TSparseList { public: using TPtr = TIntrusivePtr; - TContainer(const TMatchRecognizeProcessorParameters& parameters) - : Parameters(parameters) { - } - void Add(size_t index, NUdf::TUnboxedValue&& value) { const auto& [iter, newOne] = Storage.emplace(index, TItem{std::move(value), 1}); MKQL_ENSURE(newOne, "Internal logic error"); @@ -136,13 +132,13 @@ class TSparseList { } } - void Save(TString& out) { + void Save(TString& out, const TSaveLoadContext& ctx) { std::cerr << "TContainer::Save()" << std::endl; WriteUi64(out, Storage.size()); for (const auto& [key, item]: Storage) { WriteUi64(out, key); // WriteUi64(out, item.Value); TODO - WriteUnboxedValue(out, Parameters.Packer.RefMutableObject(Ctx, false, Parameters.StateType), item.Value); + WriteUnboxedValue(out, ctx.Packer.RefMutableObject(ctx.Ctx, false, ctx.StateType), item.Value); WriteUi64(out, item.LockCount); } } @@ -166,7 +162,6 @@ class TSparseList { std::hash, std::equal_to, TAllocator> Storage; - const TMatchRecognizeProcessorParameters& Parameters; }; using TContainerPtr = TContainer::TPtr; @@ -299,11 +294,6 @@ class TSparseList { size_t ToIndex; }; - TSparseList(const TMatchRecognizeProcessorParameters& parameters) - : Parameters(parameters) - , Container(MakeIntrusive(parameters)) { - } - public: TRange Append(NUdf::TUnboxedValue&& value) { const auto index = ListSize++; @@ -329,9 +319,9 @@ class TSparseList { return Size() == 0; } - void Save(TString& out) { + void Save(TString& out, const TSaveLoadContext& ctx) { std::cerr << "TSparseList::Save()" << std::endl; - Container->Save(out); + Container->Save(out, ctx); WriteUi64(out, ListSize); } @@ -342,9 +332,8 @@ class TSparseList { } private: - TContainerPtr Container; + TContainerPtr Container = MakeIntrusive(); size_t ListSize = 0; //impl: max index ever stored + 1 - const TMatchRecognizeProcessorParameters& Parameters; }; template diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h new file mode 100644 index 000000000000..85a8d8e66295 --- /dev/null +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h @@ -0,0 +1,35 @@ +#pragma once + +#include "mkql_match_recognize_measure_arg.h" +#include +#include +#include +#include +#include + +namespace NKikimr::NMiniKQL::NMatchRecognize { + +enum class EOutputColumnSource {PartitionKey, Measure}; +using TOutputColumnOrder = std::vector, TMKQLAllocator>>; + +struct TMatchRecognizeProcessorParameters { + IComputationExternalNode* InputDataArg; + NYql::NMatchRecognize::TRowPattern Pattern; + TUnboxedValueVector VarNames; + THashMap VarNamesLookup; + IComputationExternalNode* MatchedVarsArg; + IComputationExternalNode* CurrentRowIndexArg; + TComputationNodePtrVector Defines; + IComputationExternalNode* MeasureInputDataArg; + TMeasureInputColumnOrder MeasureInputColumnOrder; + TComputationNodePtrVector Measures; + TOutputColumnOrder OutputColumnOrder; +}; + +struct TSaveLoadContext { + TComputationContext& Ctx; + TType* StateType; + const TMutableObjectOverBoxedValue& Packer; +}; + +} //namespace NKikimr::NMiniKQL::NMatchRecognize From 6ae4b8ce8f01a5f3f90993f7739de52e736782d8 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 8 Feb 2024 14:04:43 +0000 Subject: [PATCH 07/19] add serealize for IntrusivePtr --- .../comp_nodes/mkql_match_recognize.cpp | 31 +++-- .../comp_nodes/mkql_match_recognize_list.h | 68 +++++++++-- .../mkql_match_recognize_matched_vars.h | 1 + .../comp_nodes/mkql_match_recognize_nfa.h | 111 +++++++++++++++++- .../mkql_match_recognize_parameters.h | 63 ++++++++++ .../comp_nodes/mkql_time_order_recover.cpp | 2 +- .../comp_nodes/ut/mkql_match_recognize_ut.cpp | 85 ++++---------- 7 files changed, 262 insertions(+), 99 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 4feb6f9f0724..f13e881c86be 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -186,17 +186,17 @@ class TStreamingMatchRecognize { return false; } - void Save(TString& out, const TSaveLoadContext& ctx) { - std::cerr << "TStreamingMatchRecognize::Save()" << std::endl; + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << "TStreamingMatchRecognize::Save() " << out.size() << std::endl; Rows.Save(out, ctx); - Nfa.Save(out); + Nfa.Save(out, ctx); WriteUi64(out, MatchNumber); } - void Load(TStringBuf& in) { + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { std::cerr << "TStreamingMatchRecognize::Load()" << std::endl; - Rows.Load(in); - Nfa.Load(in); + Rows.Load(in, ctx); + Nfa.Load(in, ctx); MatchNumber = ReadUi64(in); } @@ -237,7 +237,7 @@ class TStateForNonInterleavedPartitions {} NUdf::TUnboxedValue Save() const override { - std::cerr << "TStateForNonInterleavedPartitions::Save()" << std::endl; + std::cerr << "TStateForNonInterleavedPartitions::Save() " << std::endl; TString out; @@ -348,7 +348,7 @@ class TStateForInterleavedPartitions } NUdf::TUnboxedValue Save() const override { - std::cerr << "TStateForInterleavedPartitions::Save()" << std::endl; + std::cerr << "TStateForInterleavedPartitions::Save() " << std::endl; TString out; WriteUi32(out, StateVersion); WriteUi32(out, Partitions.size()); @@ -359,22 +359,17 @@ class TStateForInterleavedPartitions std::cerr << "partitions.HasMatched() " << state->HasMatched() << std::endl; } - // WriteUi32(out, HasReadyOutput.size()); - // for (const auto it : HasReadyOutput) { - // auto& key = it->first; - // WriteString(out, key); - // } - std::cerr << "HasReadyOutput size " << HasReadyOutput.size() << std::endl; - + auto strRef = NUdf::TStringRef(out.data(), out.size()); + std::cerr << "TStateForInterleavedPartitions::Save() end " << out.size() << std::endl; return MakeString(strRef); } void Load(const NUdf::TStringRef& state) override { - std::cerr << "TStateForInterleavedPartitions::Load()" << std::endl; TStringBuf in(state.Data(), state.Size()); + std::cerr << "TStateForInterleavedPartitions::Load() " << in.size() << std::endl; const auto stateVersion = ReadUi32(in); if (stateVersion == 1) { @@ -387,7 +382,7 @@ class TStateForInterleavedPartitions Parameters, NfaTransitionGraph, Cache)); - (pair.first)->second->Load(in); + (pair.first)->second->Load(in, SaveLoadContex); } std::cerr << "partitionsSize " << partitionsSize << std::endl; @@ -398,6 +393,8 @@ class TStateForInterleavedPartitions } } } + std::cerr << "TStateForInterleavedPartitions::Load() " << in.size() << std::endl; + MKQL_ENSURE(!in.size(), "State is corrupted"); } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index a85c994329b9..07b5c7a718ff 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -57,6 +57,20 @@ class TSimpleList { ++ToIndex; } + // void Save(TString& out) const { + // std::cerr << "TRange::Save() " << out.size() << std::endl; + // WriteUi64(out, FromIndex); + // WriteUi64(out, ToIndex); + // std::cerr << "TRange::Save() end " << out.size() << std::endl; + // } + + // void Load(TStringBuf& in) { + // std::cerr << "TRange::Load() " << in.size() << std::endl; + // FromIndex = ReadUi64(in); + // ToIndex = ReadUi64(in); + // std::cerr << "TRange::Load() end " << in.size() << std::endl; + // } + private: ui64 FromIndex; ui64 ToIndex; @@ -124,6 +138,8 @@ class TSparseList { void UnlockRange(size_t from, size_t to) { for (auto i = from; i <= to; ++i) { const auto it = Storage.find(i); + std::cerr << " TContainer::UnlockRange() i " << i << std::endl; + MKQL_ENSURE(it != Storage.cend(), "Internal logic error"); auto lockCount = --it->second.LockCount; if (0 == lockCount) { @@ -132,25 +148,32 @@ class TSparseList { } } - void Save(TString& out, const TSaveLoadContext& ctx) { - std::cerr << "TContainer::Save()" << std::endl; + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << " TContainer::Save() " << out.size() << std::endl; WriteUi64(out, Storage.size()); + std::cerr << " TContainer::Save() size" << Storage.size() << std::endl; for (const auto& [key, item]: Storage) { WriteUi64(out, key); - // WriteUi64(out, item.Value); TODO + + std::cerr << " TContainer::Save() key " << key << std::endl; WriteUnboxedValue(out, ctx.Packer.RefMutableObject(ctx.Ctx, false, ctx.StateType), item.Value); WriteUi64(out, item.LockCount); } + std::cerr << " TContainer::Save() end " << out.size() << std::endl; } - void Load(TStringBuf& in) { - std::cerr << "TContainer::Load()" << std::endl; + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + std::cerr << " TContainer::Load() " << in.size() << std::endl; auto size = ReadUi64(in); - for (size_t i =0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { auto key = ReadUi64(in); + NUdf::TUnboxedValue row = ReadUnboxedValue(in, ctx.Packer.RefMutableObject(ctx.Ctx, false, ctx.StateType), ctx.Ctx); auto lockCount = ReadUi64(in); - Storage.emplace(key, TItem{NUdf::TUnboxedValue{}, lockCount}); + std::cerr << " TContainer::Load() key " << key << std::endl; + Storage.emplace(key, TItem{row, lockCount}); } + std::cerr << " TContainer::Load() size" << Storage.size() << std::endl; + std::cerr << " TContainer::Load() end " << in.size() << std::endl; } private: @@ -264,6 +287,23 @@ class TSparseList { ToIndex = -1; } + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << " TRange::Save() " << out.size() << std::endl; + ctx.SavePtr(out, Container); + std::cerr << " TRange::Save() IsValid " << IsValid() << std::endl; + WriteUi64(out, FromIndex); + WriteUi64(out, ToIndex); + std::cerr << " TRange::Save() end " << out.size() << std::endl; + } + + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + std::cerr << " TRange::Load() " << in.size() << std::endl; + ctx.Load(in, Container); + FromIndex = ReadUi64(in); + ToIndex = ReadUi64(in); + std::cerr << " TRange::Load() end " << in.size() << std::endl; + } + private: TRange(TContainerPtr container, size_t index) : Container(container) @@ -319,16 +359,18 @@ class TSparseList { return Size() == 0; } - void Save(TString& out, const TSaveLoadContext& ctx) { - std::cerr << "TSparseList::Save()" << std::endl; - Container->Save(out, ctx); + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << "TSparseList::Save() " << out.size() << std::endl; + ctx.SavePtr(out, Container); WriteUi64(out, ListSize); + std::cerr << "TSparseList::Save() end " << out.size() << std::endl; } - void Load(TStringBuf& in) { - std::cerr << "TSparseList::Load()" << std::endl; - Container->Load(in); + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + std::cerr << "TSparseList::Load() " << in.size() << std::endl; + ctx.Load(in, Container); ListSize = ReadUi64(in); + std::cerr << "TSparseList::Load() end " << in.size() << std::endl; } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h index 3a7dd959499c..61fd4f95f965 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h @@ -8,6 +8,7 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { template using TMatchedVar = std::vector>; + template void Extend(TMatchedVar& var, const R& r) { if (var.empty()) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 1db6d6f37b88..0e0accfe5c9a 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -67,6 +67,26 @@ struct TNfaTransitionGraph { size_t Output; using TPtr = std::shared_ptr; + + void Save(TString& out) const { + std::cerr << "TNfaTransitionGraph::Save() " << out.size() << std::endl; + // WriteUi64(out, Index); + // WriteUi64(out, Quantifiers.size()); + // TODO + std::cerr << "TNfaTransitionGraph::Save() end " << out.size() << std::endl; + } + + void Load(TStringBuf& in) { + std::cerr << "TNfaTransitionGraph::Load() " << in.size() << std::endl; + // Index = ReadUi64(in); + // while (!Quantifiers.empty()) { + // Quantifiers.pop(); + // } + // auto quantifiersSize = ReadUi64(in); + // // TODO + std::cerr << "TNfaTransitionGraph::Load() end " << in.size() << std::endl; + } + }; class TNfaTransitionGraphOptimizer { @@ -251,14 +271,77 @@ class TNfaTransitionGraphBuilder { class TNfa { using TRange = TSparseList::TRange; using TMatchedVars = TMatchedVars; + struct TState { + + TState() {} + TState(size_t index, const TMatchedVars& vars, std::stack>>&& quantifiers) : Index(index) , Vars(vars) , Quantifiers(quantifiers) {} - const size_t Index; + size_t Index; TMatchedVars Vars; - std::stack>> Quantifiers; //get rid of this + + using TQuantifiersStdStack = std::stack< + ui64, + std::deque>>; //get rid of this + + struct TQuantifiersStack: public TQuantifiersStdStack { + template + TQuantifiersStack(TArgs... args) : TQuantifiersStdStack(args...) {} + + auto begin() const { return c.begin(); } + auto end() const { return c.end(); } + auto clear() { return c.clear(); } + }; + + TQuantifiersStack Quantifiers; + + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << "TState::Save() " << out.size() << std::endl; + WriteUi64(out, Index); + + WriteUi64(out, Vars.size()); + for (const auto& vector : Vars) { + WriteUi64(out, vector.size()); + for (const auto& range : vector) { + range.Save(out, ctx); + } + } + WriteUi64(out, Quantifiers.size()); + for (ui64 qnt : Quantifiers) { + WriteUi64(out, qnt); + } + std::cerr << "TState::Save() end " << out.size() << std::endl; + } + + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + std::cerr << "TState::Load() " << in.size() << std::endl; + Index = ReadUi64(in); + + auto varsSize = ReadUi64(in); + Vars.clear(); + Vars.resize(varsSize); + for (size_t i = 0; i < varsSize; ++i) { + auto& subvec = Vars[i]; + ui64 vectorSize = ReadUi64(in); + subvec.resize(vectorSize); + for (size_t j = 0; j < vectorSize; ++j) { + subvec[i].Load(in, ctx); + } + } + + while (!Quantifiers.empty()) { + Quantifiers.pop(); + } + auto quantifiersSize = ReadUi64(in); + for (size_t i = 0; i < quantifiersSize; ++i) { + ui64 qnt = ReadUi64(in); + Quantifiers.push(qnt); + } + std::cerr << "TState::Load() end " << in.size() << std::endl; + } friend inline bool operator<(const TState& lhs, const TState& rhs) { return std::tie(lhs.Index, lhs.Quantifiers, lhs.Vars) < std::tie(rhs.Index, rhs.Quantifiers, rhs.Vars); @@ -330,10 +413,30 @@ class TNfa { return ActiveStates.size(); } - void Save(TString& out) { + void Save(TString& out, const TSaveLoadContext& ctx) const { + std::cerr << "TNfa::Save() " << out.size() << std::endl; + TransitionGraph->Save(out); + WriteUi64(out, ActiveStates.size()); + std::cerr << "TNfa::Save() ActiveStates size " << ActiveStates.size() << std::endl; + for (const auto& state : ActiveStates) { + state.Save(out, ctx); + } + WriteUi64(out, EpsilonTransitionsLastRow); + std::cerr << "TNfa::Save() end " << out.size() << std::endl; } - void Load(TStringBuf& in) { + void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + std::cerr << "TNfa::Load() " << in.size() << std::endl; + TransitionGraph->Load(in); + auto stateSize = ReadUi64(in); + std::cerr << "TNfa::Load() ActiveStates size " << stateSize << std::endl; + for (size_t i = 0; i < stateSize; ++i) { + TState state; + state.Load(in, ctx); + ActiveStates.emplace(state); + } + EpsilonTransitionsLastRow = ReadUi64(in); + std::cerr << "TNfa::Load() end " << in.size() << std::endl; } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h index 85a8d8e66295..ee9032e5de80 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h @@ -1,6 +1,7 @@ #pragma once #include "mkql_match_recognize_measure_arg.h" +#include "mkql_saveload.h" #include #include #include @@ -27,9 +28,71 @@ struct TMatchRecognizeProcessorParameters { }; struct TSaveLoadContext { + enum class TPtrStateMode { + Saved = 0, + FromCache = 1 + }; TComputationContext& Ctx; TType* StateType; const TMutableObjectOverBoxedValue& Packer; + + mutable std::map WriteCache; + mutable std::map ReadCache; + + template + void SavePtr(TString& out, const TIntrusivePtr& ptr) const { + std::cerr << " SavePtr) " << out.size() << std::endl; + auto refCount = ptr.RefCount(); + + bool isValid = static_cast(ptr); + WriteBool(out, isValid); + + if (!isValid) { + return; + } + std::cerr << " SavePtr) isValid " << isValid << std::endl; + + auto addr = reinterpret_cast(ptr.Get()); + WriteUi64(out, addr); + + auto it = WriteCache.find(addr); + if (it == WriteCache.end()) { + std::cerr << " SavePtr) new " << std::endl; + WriteByte(out, static_cast(TPtrStateMode::Saved)); + ptr->Save(out, *this); + WriteCache[addr] = addr; + } else { + WriteByte(out, static_cast(TPtrStateMode::FromCache)); + std::cerr << " SavePtr) from cache " << std::endl; + } + std::cerr << " SavePtr) end " << out.size() << std::endl; + } + + template + void Load(TStringBuf& in, TIntrusivePtr& ptr) const { + std::cerr << " Load) " << in.size() << std::endl; + //assert(false); + bool isValid = ReadBool(in); + if (!isValid) { + ptr.Reset(); + return; + } + ui64 addr = ReadUi64(in); + TPtrStateMode mode = static_cast(ReadByte(in)); + if (mode == TPtrStateMode::Saved) { + auto newPtr = MakeIntrusive(); + newPtr->Load(in, *this); + ptr = newPtr; + ReadCache[addr] = newPtr.Get(); + } else { + auto it = ReadCache.find(addr); + MKQL_ENSURE(it != ReadCache.end(), "Internal error"); + auto* cachePtr = static_cast(it->second); + ptr = TIntrusivePtr(cachePtr); + } + std::cerr << " Load) end " << in.size() << std::endl; + } + }; } //namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_time_order_recover.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_time_order_recover.cpp index f32d828a0f1c..9ce84dc15c35 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_time_order_recover.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_time_order_recover.cpp @@ -60,7 +60,7 @@ class TTimeOrderRecover : public TStatefulFlowComputationNode& partitionColumns, - const TArrayRef>& getMeasures, - const NYql::NMatchRecognize::TRowPattern& pattern, - const TArrayRef>& getDefines, - bool streamingMode -*/ - - // MEASURES -// LAST(A.dt) as dt_begin - // ONE ROW PER MATCH - // PATTERN ( A{3, 3} ) - // DEFINE A as True) - - - TVector partitionColumns;// = {TStringBuf("a")}; - TVector> getMeasures = {{ - std::make_pair( - TStringBuf("key"), - [&](TRuntimeNode measureInputDataArg, TRuntimeNode matchedVarsArg) { - //return pgmBuilder.Length(measureInputDataArg); - auto run = pgmBuilder.Take(measureInputDataArg, pgmBuilder.NewDataLiteral(0)); - - auto oldType = run.GetStaticType(); - // oldType->GetKindAsStr(); - - std::cerr << "GetKindAsStr " << oldType->GetKindAsStr() << std::endl; - - - // MKQL_ENSURE(oldType->IsStruct(), "Expected struct"); - - // const auto& oldTypeDetailed = static_cast(*oldType); - // auto count = oldTypeDetailed.GetMembersCount(); - - // return pgmBuilder.NewDataLiteral(count); - //return measureInputDataArg; - //return pgmBuilder.NewDataLiteral(42); - // return pgmBuilder.Size(measureInputDataArg); - - // bool isOptional; - // const auto type = AS_TYPE(TStructType, UnpackOptional(matchedVarsArg.GetStaticType(), isOptional)); - // auto count = type->GetMembersCount(); - - - return pgmBuilder.NewDataLiteral(56); - } + // MEASURES + // LAST(A.dt) as dt_begin + // ONE ROW PER MATCH + // PATTERN ( A{3, 3} ) + // DEFINE A as True) + + + TVector partitionColumns;// = {TStringBuf("a")}; + TVector> getMeasures = {{ + std::make_pair( + TStringBuf("key"), + [&](TRuntimeNode measureInputDataArg, TRuntimeNode matchedVarsArg) { + // auto run = pgmBuilder.Take(measureInputDataArg, pgmBuilder.NewDataLiteral(0)); + return pgmBuilder.NewDataLiteral(56); + } )}}; - TVector> getDefines = {{ - std::make_pair( - TStringBuf("A"), - [&](TRuntimeNode inputDataArg, TRuntimeNode matchedVarsArg, TRuntimeNode currentRowIndexArg) { - return pgmBuilder.NewDataLiteral(true); - } + TVector> getDefines = {{ + std::make_pair( + TStringBuf("A"), + [&](TRuntimeNode inputDataArg, TRuntimeNode matchedVarsArg, TRuntimeNode currentRowIndexArg) { + return pgmBuilder.NewDataLiteral(true); + } )}}; auto pgmReturn = pgmBuilder.MatchRecognizeCore( @@ -189,13 +153,6 @@ namespace NKikimr { auto v = value.GetElement(0).Get(); std::cerr << "GetElement " << v << std::endl; - - // { - // value = graph1->GetValue(); - // UNIT_ASSERT(!value.IsFinish() && value); - // v = value.GetElement(0).Get(); - // std::cerr << "GetElement " << v << std::endl; - // } TString graphState = graph1->SaveGraphState(); std::cerr << "----------------------" << std::endl; From 3e165dd4eba9dd12f57c2f49573f88ad96a6c06c Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 9 Feb 2024 10:37:48 +0000 Subject: [PATCH 08/19] add serealization helpers --- .../comp_nodes/mkql_match_recognize.cpp | 56 +++-- .../comp_nodes/mkql_match_recognize_list.h | 102 ++++----- .../comp_nodes/mkql_match_recognize_nfa.h | 197 +++++++++++++----- .../mkql_match_recognize_parameters.h | 145 +++++++++++-- 4 files changed, 340 insertions(+), 160 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index f13e881c86be..b413e44fb119 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -151,7 +151,7 @@ class TStreamingMatchRecognize { return HasMatched(); } - bool HasMatched() { + bool HasMatched() const { return Nfa.HasMatched(); } @@ -186,18 +186,18 @@ class TStreamingMatchRecognize { return false; } - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << "TStreamingMatchRecognize::Save() " << out.size() << std::endl; - Rows.Save(out, ctx); - Nfa.Save(out, ctx); - WriteUi64(out, MatchNumber); + void Save(TOutputSerializer& serealizer) const { + std::cerr << "TStreamingMatchRecognize::Save() " << serealizer.Size() << std::endl; + Rows.Save(serealizer); + Nfa.Save(serealizer); + serealizer.Write(MatchNumber); } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { + void Load(TInputSerializer& serializer) { std::cerr << "TStreamingMatchRecognize::Load()" << std::endl; - Rows.Load(in, ctx); - Nfa.Load(in, ctx); - MatchNumber = ReadUi64(in); + Rows.Load(serializer); + Nfa.Load(serializer); + MatchNumber = serializer.Read(); } private: @@ -349,40 +349,38 @@ class TStateForInterleavedPartitions NUdf::TUnboxedValue Save() const override { std::cerr << "TStateForInterleavedPartitions::Save() " << std::endl; - TString out; - WriteUi32(out, StateVersion); - WriteUi32(out, Partitions.size()); + + TOutputSerializer serealizer(SaveLoadContex); + + serealizer.Write(StateVersion); + serealizer.Write(Partitions.size()); for (const auto& [key, state] : Partitions) { - WriteString(out, key); - state->Save(out, SaveLoadContex); - std::cerr << "partitions.HasMatched() " << state->HasMatched() << std::endl; + serealizer.Write(key); + state->Save(serealizer); } - std::cerr << "HasReadyOutput size " << HasReadyOutput.size() << std::endl; - - auto strRef = NUdf::TStringRef(out.data(), out.size()); - std::cerr << "TStateForInterleavedPartitions::Save() end " << out.size() << std::endl; - return MakeString(strRef); + return serealizer.MakeString(); } void Load(const NUdf::TStringRef& state) override { - TStringBuf in(state.Data(), state.Size()); - std::cerr << "TStateForInterleavedPartitions::Load() " << in.size() << std::endl; + TInputSerializer serializer(SaveLoadContex, state); + + std::cerr << "TStateForInterleavedPartitions::Load() " << serializer.Size() << std::endl; - const auto stateVersion = ReadUi32(in); + const auto stateVersion = serializer.Read(); if (stateVersion == 1) { Partitions.clear(); - auto partitionsSize = ReadUi32(in); + auto partitionsSize = serializer.Read(); for (size_t i = 0; i < partitionsSize; ++i) { - auto key = ReadString(in); + auto key = serializer.Read(); auto pair = Partitions.emplace(key, std::make_unique( NYql::NUdf::TUnboxedValuePod(NYql::NUdf::TStringValue(key)), Parameters, NfaTransitionGraph, Cache)); - (pair.first)->second->Load(in, SaveLoadContex); + (pair.first)->second->Load(serializer); } std::cerr << "partitionsSize " << partitionsSize << std::endl; @@ -393,8 +391,8 @@ class TStateForInterleavedPartitions } } } - std::cerr << "TStateForInterleavedPartitions::Load() " << in.size() << std::endl; - MKQL_ENSURE(!in.size(), "State is corrupted"); + std::cerr << "TStateForInterleavedPartitions::Load() " << serializer.Size() << std::endl; + MKQL_ENSURE(!serializer.Size(), "State is corrupted"); } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index 07b5c7a718ff..36737a491ffd 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -57,20 +57,6 @@ class TSimpleList { ++ToIndex; } - // void Save(TString& out) const { - // std::cerr << "TRange::Save() " << out.size() << std::endl; - // WriteUi64(out, FromIndex); - // WriteUi64(out, ToIndex); - // std::cerr << "TRange::Save() end " << out.size() << std::endl; - // } - - // void Load(TStringBuf& in) { - // std::cerr << "TRange::Load() " << in.size() << std::endl; - // FromIndex = ReadUi64(in); - // ToIndex = ReadUi64(in); - // std::cerr << "TRange::Load() end " << in.size() << std::endl; - // } - private: ui64 FromIndex; ui64 ToIndex; @@ -148,43 +134,43 @@ class TSparseList { } } - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << " TContainer::Save() " << out.size() << std::endl; - WriteUi64(out, Storage.size()); - std::cerr << " TContainer::Save() size" << Storage.size() << std::endl; + void Save(TOutputSerializer& serealizer) const { + std::cerr << " TContainer::Save() " << serealizer.Size() << std::endl; + serealizer.Write(Storage.size()); for (const auto& [key, item]: Storage) { - WriteUi64(out, key); - - std::cerr << " TContainer::Save() key " << key << std::endl; - WriteUnboxedValue(out, ctx.Packer.RefMutableObject(ctx.Ctx, false, ctx.StateType), item.Value); - WriteUi64(out, item.LockCount); + serealizer.Write(key); + serealizer.Write(item.Value); + serealizer.Write(item.LockCount); } - std::cerr << " TContainer::Save() end " << out.size() << std::endl; + std::cerr << " TContainer::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { - std::cerr << " TContainer::Load() " << in.size() << std::endl; - auto size = ReadUi64(in); + void Load(TInputSerializer& serializer) { + std::cerr << " TContainer::Load() " << serializer.Size() << std::endl; + //auto size = serializer.Read(); + auto size = serializer.Read(); for (size_t i = 0; i < size; ++i) { - auto key = ReadUi64(in); - NUdf::TUnboxedValue row = ReadUnboxedValue(in, ctx.Packer.RefMutableObject(ctx.Ctx, false, ctx.StateType), ctx.Ctx); - auto lockCount = ReadUi64(in); - std::cerr << " TContainer::Load() key " << key << std::endl; + auto key = serializer.Read(); + NUdf::TUnboxedValue row = serializer.Read(); + auto lockCount = serializer.Read(); Storage.emplace(key, TItem{row, lockCount}); } std::cerr << " TContainer::Load() size" << Storage.size() << std::endl; - std::cerr << " TContainer::Load() end " << in.size() << std::endl; + std::cerr << " TContainer::Load() end " << serializer.Size() << std::endl; } private: //TODO consider to replace hash table with contiguous chunks using TAllocator = TMKQLAllocator, EMemorySubPool::Temporary>; - std::unordered_map< + + using TStorage = std::unordered_map< size_t, TItem, std::hash, std::equal_to, - TAllocator> Storage; + TAllocator>; + + TStorage Storage; }; using TContainerPtr = TContainer::TPtr; @@ -287,21 +273,20 @@ class TSparseList { ToIndex = -1; } - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << " TRange::Save() " << out.size() << std::endl; - ctx.SavePtr(out, Container); - std::cerr << " TRange::Save() IsValid " << IsValid() << std::endl; - WriteUi64(out, FromIndex); - WriteUi64(out, ToIndex); - std::cerr << " TRange::Save() end " << out.size() << std::endl; + void Save(TOutputSerializer& serealizer) const { + std::cerr << " TRange::Save() " << serealizer.Size() << std::endl; + serealizer.Write(Container); + serealizer.Write(FromIndex); + serealizer.Write(ToIndex); + std::cerr << " TRange::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { - std::cerr << " TRange::Load() " << in.size() << std::endl; - ctx.Load(in, Container); - FromIndex = ReadUi64(in); - ToIndex = ReadUi64(in); - std::cerr << " TRange::Load() end " << in.size() << std::endl; + void Load(TInputSerializer& serializer) { + std::cerr << " TRange::Load() " << serializer.Size() << std::endl; + serializer.Read(Container); + FromIndex = serializer.Read(); + ToIndex = serializer.Read(); + std::cerr << " TRange::Load() end " << serializer.Size() << std::endl; } private: @@ -359,18 +344,21 @@ class TSparseList { return Size() == 0; } - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << "TSparseList::Save() " << out.size() << std::endl; - ctx.SavePtr(out, Container); - WriteUi64(out, ListSize); - std::cerr << "TSparseList::Save() end " << out.size() << std::endl; + void Save(TOutputSerializer& serealizer) const { + std::cerr << "TSparseList::Save() " << serealizer.Size() << std::endl; + serealizer.Write(Container); + serealizer.Write(ListSize); + + std::cerr << "TSparseList::Save() ListSize " << ListSize << std::endl; + std::cerr << "TSparseList::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { - std::cerr << "TSparseList::Load() " << in.size() << std::endl; - ctx.Load(in, Container); - ListSize = ReadUi64(in); - std::cerr << "TSparseList::Load() end " << in.size() << std::endl; + void Load(TInputSerializer& serializer) { + std::cerr << "TSparseList::Load() " << serializer.Size() << std::endl; + serializer.Read(Container); + ListSize = serializer.Read(); + std::cerr << "TSparseList::Load() ListSize " << ListSize << std::endl; + std::cerr << "TSparseList::Load() end " << serializer.Size() << std::endl; } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 0e0accfe5c9a..8f5c243c8ef9 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -19,7 +19,20 @@ using TEpsilonTransitions = std::vector, size_t>; //{{varIndex, saveState}, to} using TQuantityEnterTransition = size_t; //to using TQuantityExitTransition = std::pair, std::pair>; //{{min, max}, {foFindMore, toMatched}} -using TNfaTransition = std::variant< + +template +struct TVariantHelper { + using TVariant = std::variant; + + static std::variant getVariantByIndex(size_t i) + { + MKQL_ENSURE(i < sizeof...(Ts), "Wrong variant index"); + static std::variant table[] = { Ts{ }... }; + return table[i]; + } +}; + +using TNfaTransitionHelper = TVariantHelper< TVoidTransition, TMatchedVarTransition, TEpsilonTransitions, @@ -27,6 +40,8 @@ using TNfaTransition = std::variant< TQuantityExitTransition >; +using TNfaTransition = TNfaTransitionHelper::TVariant; + struct TNfaTransitionDestinationVisitor { std::function callback; @@ -61,32 +76,112 @@ struct TNfaTransitionDestinationVisitor { } }; +template +inline constexpr bool always_false_v = false; + struct TNfaTransitionGraph { - std::vector> Transitions; + using TTransitions = std::vector>; + + TTransitions Transitions; size_t Input; size_t Output; using TPtr = std::shared_ptr; - void Save(TString& out) const { - std::cerr << "TNfaTransitionGraph::Save() " << out.size() << std::endl; - // WriteUi64(out, Index); - // WriteUi64(out, Quantifiers.size()); - // TODO - std::cerr << "TNfaTransitionGraph::Save() end " << out.size() << std::endl; + void Save(TOutputSerializer& serealizer) const { + std::cerr << "TNfaTransitionGraph::Save() " << serealizer.Size() << std::endl; + serealizer.Write(Transitions.size()); + + std::cerr << "TNfaTransitionGraph::Save() size " << Transitions.size() << std::endl; + + for (ui64 i = 0; i < Transitions.size(); ++i) { + serealizer.Write(Transitions[i].index()); + + std::cerr << "TNfaTransitionGraph::Save() index " << Transitions[i].index() << std::endl; + + std::visit([&](auto&& arg) + { + using T = std::decay_t; + if constexpr (std::is_same_v) { + // Nothing + } + else if constexpr (std::is_same_v) { + serealizer.Write(arg.first.first); + serealizer.Write(arg.first.second); + serealizer.Write(arg.second); + } + else if constexpr (std::is_same_v) { + serealizer.Write(arg.size()); + for (size_t i = 0; i < arg.size(); ++i) { + serealizer.Write(arg[i]); + } + } + else if constexpr (std::is_same_v) { + serealizer.Write(arg); + } + else if constexpr (std::is_same_v) { + serealizer.Write(arg.first.first); + serealizer.Write(arg.first.second); + serealizer.Write(arg.second.first); + serealizer.Write(arg.second.second); + } + else + static_assert(always_false_v, "non-exhaustive visitor!"); + }, Transitions[i]); + } + serealizer.Write(Input); + serealizer.Write(Output); + std::cerr << "TNfaTransitionGraph::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in) { - std::cerr << "TNfaTransitionGraph::Load() " << in.size() << std::endl; - // Index = ReadUi64(in); - // while (!Quantifiers.empty()) { - // Quantifiers.pop(); - // } - // auto quantifiersSize = ReadUi64(in); - // // TODO - std::cerr << "TNfaTransitionGraph::Load() end " << in.size() << std::endl; + void Load(TInputSerializer& serializer) { + std::cerr << "TNfaTransitionGraph::Load() " << serializer.Size() << std::endl; + ui64 transitionSize = serializer.Read(); + std::cerr << "TNfaTransitionGraph::Load() transitionSize " << transitionSize << std::endl; + + Transitions.resize(transitionSize); + for (ui64 i = 0; i < transitionSize; ++i) { + size_t index = serializer.Read(); + + std::cerr << "TNfaTransitionGraph::Load() index " << index << std::endl; + + Transitions[i] = TNfaTransitionHelper::getVariantByIndex(index); + std::visit([&](auto&& arg) + { + using T = std::decay_t; + if constexpr (std::is_same_v) { + // Nothing + } + else if constexpr (std::is_same_v) { + arg.first.first = serializer.Read(); + arg.first.second = serializer.Read(); + arg.second = serializer.Read(); + } + else if constexpr (std::is_same_v) { + ui64 size = serializer.Read(); + arg.resize(size); + for (size_t i = 0; i < size; ++i) { + arg[i] = serializer.Read(); + } + } + else if constexpr (std::is_same_v) { + arg = serializer.Read(); + } + else if constexpr (std::is_same_v) { + arg.first.first = serializer.Read(); + arg.first.second = serializer.Read(); + arg.second.first = serializer.Read(); + arg.second.second = serializer.Read(); + } + else + static_assert(always_false_v, "non-exhaustive visitor!"); + }, Transitions[i]); + + } + Input = serializer.Read(); + Output = serializer.Read(); + std::cerr << "TNfaTransitionGraph::Load() end " << serializer.Size() << std::endl; } - }; class TNfaTransitionGraphOptimizer { @@ -298,49 +393,49 @@ class TNfa { TQuantifiersStack Quantifiers; - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << "TState::Save() " << out.size() << std::endl; - WriteUi64(out, Index); + void Save(TOutputSerializer& serealizer) const { + std::cerr << "TState::Save() " << serealizer.Size() << std::endl; + serealizer.Write(Index); - WriteUi64(out, Vars.size()); + serealizer.Write(Vars.size()); for (const auto& vector : Vars) { - WriteUi64(out, vector.size()); + serealizer.Write(vector.size()); for (const auto& range : vector) { - range.Save(out, ctx); + range.Save(serealizer); } } - WriteUi64(out, Quantifiers.size()); + serealizer.Write(Quantifiers.size()); for (ui64 qnt : Quantifiers) { - WriteUi64(out, qnt); + serealizer.Write(qnt); } - std::cerr << "TState::Save() end " << out.size() << std::endl; + std::cerr << "TState::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { - std::cerr << "TState::Load() " << in.size() << std::endl; - Index = ReadUi64(in); + void Load(TInputSerializer& serializer) { + std::cerr << "TState::Load() " << serializer.Size() << std::endl; + Index = serializer.Read(); - auto varsSize = ReadUi64(in); + auto varsSize = serializer.Read(); Vars.clear(); Vars.resize(varsSize); for (size_t i = 0; i < varsSize; ++i) { auto& subvec = Vars[i]; - ui64 vectorSize = ReadUi64(in); + ui64 vectorSize = serializer.Read(); subvec.resize(vectorSize); for (size_t j = 0; j < vectorSize; ++j) { - subvec[i].Load(in, ctx); + subvec[i].Load(serializer); } } while (!Quantifiers.empty()) { Quantifiers.pop(); } - auto quantifiersSize = ReadUi64(in); + auto quantifiersSize = serializer.Read(); for (size_t i = 0; i < quantifiersSize; ++i) { - ui64 qnt = ReadUi64(in); + ui64 qnt = serializer.Read(); Quantifiers.push(qnt); } - std::cerr << "TState::Load() end " << in.size() << std::endl; + std::cerr << "TState::Load() end " << serializer.Size() << std::endl; } friend inline bool operator<(const TState& lhs, const TState& rhs) { @@ -413,30 +508,29 @@ class TNfa { return ActiveStates.size(); } - void Save(TString& out, const TSaveLoadContext& ctx) const { - std::cerr << "TNfa::Save() " << out.size() << std::endl; - TransitionGraph->Save(out); - WriteUi64(out, ActiveStates.size()); - std::cerr << "TNfa::Save() ActiveStates size " << ActiveStates.size() << std::endl; + void Save(TOutputSerializer& serealizer) const { + std::cerr << "TNfa::Save() " << serealizer.Size() << std::endl; + TransitionGraph->Save(serealizer); + serealizer.Write(ActiveStates.size()); for (const auto& state : ActiveStates) { - state.Save(out, ctx); + state.Save(serealizer); } - WriteUi64(out, EpsilonTransitionsLastRow); - std::cerr << "TNfa::Save() end " << out.size() << std::endl; + serealizer.Write(EpsilonTransitionsLastRow); + std::cerr << "TNfa::Save() end " << serealizer.Size() << std::endl; } - void Load(TStringBuf& in, const TSaveLoadContext& ctx) { - std::cerr << "TNfa::Load() " << in.size() << std::endl; - TransitionGraph->Load(in); - auto stateSize = ReadUi64(in); + void Load(TInputSerializer& serializer) { + std::cerr << "TNfa::Load() " << serializer.Size() << std::endl; + TransitionGraph->Load(serializer); + auto stateSize = serializer.Read(); std::cerr << "TNfa::Load() ActiveStates size " << stateSize << std::endl; for (size_t i = 0; i < stateSize; ++i) { TState state; - state.Load(in, ctx); + state.Load(serializer); ActiveStates.emplace(state); } - EpsilonTransitionsLastRow = ReadUi64(in); - std::cerr << "TNfa::Load() end " << in.size() << std::endl; + EpsilonTransitionsLastRow = serializer.Read(); + std::cerr << "TNfa::Load() end " << serializer.Size() << std::endl; } private: @@ -486,6 +580,7 @@ class TNfa { TStateSet& NewStates; TStateSet& DeletedStates; }; + bool MakeEpsilonTransitionsImpl() { TStateSet newStates; TStateSet deletedStates; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h index ee9032e5de80..1e59e7bd2c4b 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h @@ -28,60 +28,142 @@ struct TMatchRecognizeProcessorParameters { }; struct TSaveLoadContext { + + TComputationContext& Ctx; + TType* StateType; + const TMutableObjectOverBoxedValue& Packer; +}; + +template +inline constexpr bool always_false_v2 = false; + +struct TOutputSerializer { +private: enum class TPtrStateMode { Saved = 0, FromCache = 1 }; - TComputationContext& Ctx; - TType* StateType; - const TMutableObjectOverBoxedValue& Packer; - mutable std::map WriteCache; - mutable std::map ReadCache; +public: + TOutputSerializer(const TSaveLoadContext& context) + : Context(context) + {} + + void Write(ui32 value) { + WriteUi32(Buf, value); + } + + void Write(ui64 value) { + WriteUi64(Buf, value); + } + + void Write(std::string_view value) { + WriteString(Buf, value); + } + + void Write(const NUdf::TUnboxedValue& value) { + WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); + } + + void Write(bool value) { + WriteBool(Buf, value); + } template - void SavePtr(TString& out, const TIntrusivePtr& ptr) const { - std::cerr << " SavePtr) " << out.size() << std::endl; + void Write(const TIntrusivePtr& ptr) { + std::cerr << " SavePtr) " << Buf.size() << std::endl; auto refCount = ptr.RefCount(); - + std::cerr << " SavePtr) refCount " <(ptr); - WriteBool(out, isValid); + WriteBool(Buf, isValid); if (!isValid) { return; } - std::cerr << " SavePtr) isValid " << isValid << std::endl; - auto addr = reinterpret_cast(ptr.Get()); - WriteUi64(out, addr); + WriteUi64(Buf, addr); auto it = WriteCache.find(addr); if (it == WriteCache.end()) { std::cerr << " SavePtr) new " << std::endl; - WriteByte(out, static_cast(TPtrStateMode::Saved)); - ptr->Save(out, *this); + WriteByte(Buf, static_cast(TPtrStateMode::Saved)); + ptr->Save(*this); WriteCache[addr] = addr; } else { - WriteByte(out, static_cast(TPtrStateMode::FromCache)); + WriteByte(Buf, static_cast(TPtrStateMode::FromCache)); std::cerr << " SavePtr) from cache " << std::endl; } - std::cerr << " SavePtr) end " << out.size() << std::endl; + std::cerr << " SavePtr) end " << Buf.size() << std::endl; + } + + NUdf::TUnboxedValuePod MakeString() { + auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); + return NKikimr::NMiniKQL::MakeString(strRef); + } + + size_t Size() // TODO : delete + { + return Buf.size(); + } +private: + TString Buf; + const TSaveLoadContext& Context; + mutable std::map WriteCache; + mutable std::map ReadCache; +}; + +// template +// inline constexpr bool always_false_v = false; +#include + +struct TInputSerializer { +private: + enum class TPtrStateMode { + Saved = 0, + FromCache = 1 + }; + +public: + TInputSerializer(TSaveLoadContext& context, const NUdf::TStringRef& state) + : Context(context) + , Buf(state.Data(), state.Size()) + {} + + template + ReturnType Read() { + if constexpr (std::is_same_v, TString>) { + return ReadString(Buf); + } else if constexpr (std::is_same_v, ui64>) { + return ReadUi64(Buf); + } else if constexpr (std::is_same_v, bool>) { + return ReadBool(Buf); + } else if constexpr (std::is_same_v, ui8>) { + return ReadByte(Buf); + } else if constexpr (std::is_same_v, ui32>) { + return ReadUi32(Buf); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { + return ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); + } + else + static_assert(always_false_v2, "non-exhaustive visitor!"); + std::cerr << "Not implemented " << typeid(Type).name() << std::endl; + MKQL_ENSURE(false, "Not implemented"); } template - void Load(TStringBuf& in, TIntrusivePtr& ptr) const { - std::cerr << " Load) " << in.size() << std::endl; + void Read(TIntrusivePtr& ptr) { + std::cerr << " Load) " << Buf.size() << std::endl; //assert(false); - bool isValid = ReadBool(in); + bool isValid = Read(); if (!isValid) { ptr.Reset(); return; } - ui64 addr = ReadUi64(in); - TPtrStateMode mode = static_cast(ReadByte(in)); + ui64 addr = Read(); + TPtrStateMode mode = static_cast(Read()); if (mode == TPtrStateMode::Saved) { auto newPtr = MakeIntrusive(); - newPtr->Load(in, *this); + newPtr->Load(*this); ptr = newPtr; ReadCache[addr] = newPtr.Get(); } else { @@ -89,10 +171,27 @@ struct TSaveLoadContext { MKQL_ENSURE(it != ReadCache.end(), "Internal error"); auto* cachePtr = static_cast(it->second); ptr = TIntrusivePtr(cachePtr); + + auto refCount = ptr.RefCount(); + std::cerr << " Load) refCount " << refCount << std::endl; } - std::cerr << " Load) end " << in.size() << std::endl; + std::cerr << " Load) end " << Buf.size() << std::endl; } + NUdf::TUnboxedValuePod MakeString() { + auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); + return NKikimr::NMiniKQL::MakeString(strRef); + } + + size_t Size() // TODO : delete + { + return Buf.size(); + } +private: + TStringBuf Buf; + TSaveLoadContext& Context; + mutable std::map WriteCache; + mutable std::map ReadCache; }; } //namespace NKikimr::NMiniKQL::NMatchRecognize From c453cd5fa6f1954ccd710ac6d12fcc5bee57fedd Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 12 Feb 2024 13:51:19 +0000 Subject: [PATCH 09/19] Fix / test passed --- .../comp_nodes/mkql_match_recognize.cpp | 47 ++++++------------- .../comp_nodes/mkql_match_recognize_list.h | 28 +++-------- .../mkql_match_recognize_matched_vars.h | 6 +-- .../mkql_match_recognize_measure_arg.h | 1 + .../comp_nodes/mkql_match_recognize_nfa.h | 31 +++--------- .../mkql_match_recognize_parameters.h | 27 ++++------- .../comp_nodes/ut/mkql_match_recognize_ut.cpp | 8 ++-- .../kikimr/test_recovery_match_recognize.py | 2 +- 8 files changed, 42 insertions(+), 108 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index b413e44fb119..2023ab440a0d 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -59,7 +59,6 @@ class TBackTrackingMatchRecognize { } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - std::cerr << "TBackTrackingMatchRecognize::ProcessInputRow()" << std::endl; Y_UNUSED(ctx); Rows.Append(std::move(row)); return false; @@ -100,7 +99,6 @@ class TBackTrackingMatchRecognize { for (size_t v = 0; v != Parameters.Defines.size(); ++v) { const auto &d = Parameters.Defines[v]->GetValue(ctx); if (d && d.GetOptionalValue().Get()) { - std::cerr << "Defines return true "<< std::endl; Extend(CurMatchedVars[v], TRange{i}); } } @@ -127,7 +125,7 @@ class TBackTrackingMatchRecognize { class TStreamingMatchRecognize { using TPartitionList = TSparseList; using TRange = TPartitionList::TRange; - using TMatchedVars = TMatchedVars; + // using TMatchedVars = TMatchedVars; public: using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder; TStreamingMatchRecognize( @@ -144,7 +142,6 @@ class TStreamingMatchRecognize { } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - std::cerr << "class TStreamingMatchRecognize::ProcessInputRow()" << std::endl; Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create>(Rows)); Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(Rows.Size())); Nfa.ProcessRow(Rows.Append(std::move(row)), ctx); @@ -152,13 +149,16 @@ class TStreamingMatchRecognize { } bool HasMatched() const { + return Nfa.HasMatched(); } NUdf::TUnboxedValue GetOutputIfReady(TComputationContext& ctx) { auto match = Nfa.GetMatched(); if (!match.has_value()) + { return NUdf::TUnboxedValue{}; + } Parameters.MatchedVarsArg->SetValue(ctx, ctx.HolderFactory.Create>(ctx.HolderFactory, match.value())); Parameters.MeasureInputDataArg->SetValue(ctx, ctx.HolderFactory.Create( ctx.HolderFactory.Create>(Rows), @@ -187,14 +187,13 @@ class TStreamingMatchRecognize { } void Save(TOutputSerializer& serealizer) const { - std::cerr << "TStreamingMatchRecognize::Save() " << serealizer.Size() << std::endl; + // TODO : PartitionKey Rows.Save(serealizer); Nfa.Save(serealizer); serealizer.Write(MatchNumber); } void Load(TInputSerializer& serializer) { - std::cerr << "TStreamingMatchRecognize::Load()" << std::endl; Rows.Load(serializer); Nfa.Load(serializer); MatchNumber = serializer.Read(); @@ -237,20 +236,16 @@ class TStateForNonInterleavedPartitions {} NUdf::TUnboxedValue Save() const override { - std::cerr << "TStateForNonInterleavedPartitions::Save() " << std::endl; TString out; - auto strRef = NUdf::TStringRef(out.data(), out.size()); return MakeString(strRef); } void Load(const NUdf::TStringRef& state) override { - std::cerr << "TStateForNonInterleavedPartitions::Load()" << std::endl; } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - std::cerr << "TStateForNonInterleavedPartitions::ProcessInputRow()" << std::endl; MKQL_ENSURE(not DelayedRow, "Internal logic error"); //we're finalizing previous partition InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row)); auto partitionKey = PartitionKey->GetValue(ctx); @@ -265,6 +260,7 @@ class TStateForNonInterleavedPartitions if (PartitionHandler) { return PartitionHandler->ProcessEndOfData(ctx); } + //be aware that the very first partition is created in the same manner as subsequent return false; } @@ -344,14 +340,10 @@ class TStateForInterleavedPartitions , NfaTransitionGraph(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) , SaveLoadContex(ctx, stateType, packer) - { - } + {} NUdf::TUnboxedValue Save() const override { - std::cerr << "TStateForInterleavedPartitions::Save() " << std::endl; - TOutputSerializer serealizer(SaveLoadContex); - serealizer.Write(StateVersion); serealizer.Write(Partitions.size()); @@ -366,8 +358,6 @@ class TStateForInterleavedPartitions void Load(const NUdf::TStringRef& state) override { TInputSerializer serializer(SaveLoadContex, state); - - std::cerr << "TStateForInterleavedPartitions::Load() " << serializer.Size() << std::endl; const auto stateVersion = serializer.Read(); if (stateVersion == 1) { @@ -383,20 +373,18 @@ class TStateForInterleavedPartitions (pair.first)->second->Load(serializer); } - std::cerr << "partitionsSize " << partitionsSize << std::endl; - for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { - std::cerr << "it->second->HasMatched() " << it->second->HasMatched() << std::endl; - if (it->second->HasMatched()) { - HasReadyOutput.push(it); - } - } + // std::cerr << "partitionsSize " << partitionsSize << std::endl; + // for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { + // std::cerr << "it->second->HasMatched() " << it->second->HasMatched() << std::endl; + // if (it->second->HasMatched()) { + // HasReadyOutput.push(it); + // } + // } } - std::cerr << "TStateForInterleavedPartitions::Load() " << serializer.Size() << std::endl; MKQL_ENSURE(!serializer.Size(), "State is corrupted"); } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - std::cerr << "TStateForInterleavedPartitions::ProcessInputRow()" << std::endl; auto partition = GetPartitionHandler(row, ctx); if (partition->second->ProcessInputRow(std::move(row), ctx)) { HasReadyOutput.push(partition); @@ -434,7 +422,6 @@ class TStateForInterleavedPartitions InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(row)); auto partitionKey = PartitionKey->GetValue(ctx); const auto packedKey = PartitionKeyPacker.Pack(partitionKey); - std::cerr << "partitionKey " << TString(packedKey)<< std::endl; if (const auto it = Partitions.find(TString(packedKey)); it != Partitions.end()) { return it; } else { @@ -514,19 +501,15 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode(stateValue.AsBoxed().Get()); while (true) { if (auto output = state->GetOutputIfReady(ctx); output) { - std::cerr << "DoCalculate: return output" << std::endl; return output; } auto item = InputFlow->GetValue(ctx); if (item.IsFinish()) { - std::cerr << "call ProcessEndOfData()" << std::endl; state->ProcessEndOfData(ctx); continue; } else if (item.IsSpecial()) { - std::cerr << "IsSpecial" << std::endl; return item; } - std::cerr << "ProcessInputRow2" << std::endl; state->ProcessInputRow(std::move(item), ctx); } } @@ -663,8 +646,6 @@ std::pair> ConvertListOfStrings(c IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputationNodeFactoryContext& ctx) { - std::cerr << "WrapMatchRecognizeCore" << std::endl; - using namespace NMatchRecognize; size_t inputIndex = 0; const auto& inputFlow = callable.GetInput(inputIndex++); diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index 36737a491ffd..b8159cfb6192 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -92,8 +92,11 @@ class TSparseList { size_t LockCount = 0; }; + + class TContainer: public TSimpleRefCount { public: + using TPtr = TIntrusivePtr; void Add(size_t index, NUdf::TUnboxedValue&& value) { @@ -124,8 +127,6 @@ class TSparseList { void UnlockRange(size_t from, size_t to) { for (auto i = from; i <= to; ++i) { const auto it = Storage.find(i); - std::cerr << " TContainer::UnlockRange() i " << i << std::endl; - MKQL_ENSURE(it != Storage.cend(), "Internal logic error"); auto lockCount = --it->second.LockCount; if (0 == lockCount) { @@ -135,18 +136,15 @@ class TSparseList { } void Save(TOutputSerializer& serealizer) const { - std::cerr << " TContainer::Save() " << serealizer.Size() << std::endl; serealizer.Write(Storage.size()); for (const auto& [key, item]: Storage) { serealizer.Write(key); serealizer.Write(item.Value); serealizer.Write(item.LockCount); } - std::cerr << " TContainer::Save() end " << serealizer.Size() << std::endl; } void Load(TInputSerializer& serializer) { - std::cerr << " TContainer::Load() " << serializer.Size() << std::endl; //auto size = serializer.Read(); auto size = serializer.Read(); for (size_t i = 0; i < size; ++i) { @@ -155,8 +153,6 @@ class TSparseList { auto lockCount = serializer.Read(); Storage.emplace(key, TItem{row, lockCount}); } - std::cerr << " TContainer::Load() size" << Storage.size() << std::endl; - std::cerr << " TContainer::Load() end " << serializer.Size() << std::endl; } private: @@ -185,8 +181,7 @@ class TSparseList { : Container() , FromIndex(-1) , ToIndex(-1) - { - } + {} TRange(const TRange& other) : Container(other.Container) @@ -274,20 +269,16 @@ class TSparseList { } void Save(TOutputSerializer& serealizer) const { - std::cerr << " TRange::Save() " << serealizer.Size() << std::endl; serealizer.Write(Container); serealizer.Write(FromIndex); serealizer.Write(ToIndex); - std::cerr << " TRange::Save() end " << serealizer.Size() << std::endl; - } + } void Load(TInputSerializer& serializer) { - std::cerr << " TRange::Load() " << serializer.Size() << std::endl; serializer.Read(Container); FromIndex = serializer.Read(); ToIndex = serializer.Read(); - std::cerr << " TRange::Load() end " << serializer.Size() << std::endl; - } + } private: TRange(TContainerPtr container, size_t index) @@ -345,20 +336,13 @@ class TSparseList { } void Save(TOutputSerializer& serealizer) const { - std::cerr << "TSparseList::Save() " << serealizer.Size() << std::endl; serealizer.Write(Container); serealizer.Write(ListSize); - - std::cerr << "TSparseList::Save() ListSize " << ListSize << std::endl; - std::cerr << "TSparseList::Save() end " << serealizer.Size() << std::endl; } void Load(TInputSerializer& serializer) { - std::cerr << "TSparseList::Load() " << serializer.Size() << std::endl; serializer.Read(Container); ListSize = serializer.Read(); - std::cerr << "TSparseList::Load() ListSize " << ListSize << std::endl; - std::cerr << "TSparseList::Load() end " << serializer.Size() << std::endl; } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h index 61fd4f95f965..60d5a9117e3b 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h @@ -84,8 +84,7 @@ class TMatchedVarsValue : public TComputationValue> { : TComputationValue(memInfo) , HolderFactory(holderFactory) , Var(v) - { - } + {} bool HasFastListLength() const override { return true; @@ -111,8 +110,7 @@ class TMatchedVarsValue : public TComputationValue> { : TComputationValue(memInfo) , HolderFactory(holderFactory) , Vars(vars) - { - } + {} NUdf::TUnboxedValue GetElement(ui32 index) const override { return HolderFactory.Create(HolderFactory, Vars[index]); diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_measure_arg.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_measure_arg.h index 38ed16e21ee3..db408287def7 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_measure_arg.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_measure_arg.h @@ -32,6 +32,7 @@ class TRowForMeasureValue: public TComputationValue , VarNames(varNames) , MatchNumber(matchNumber) {} + NUdf::TUnboxedValue GetElement(ui32 index) const override { switch(ColumnOrder[index].first) { case EMeasureInputDataSpecialColumns::Classifier: { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 8f5c243c8ef9..d50c07f64fc7 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -24,8 +24,7 @@ template struct TVariantHelper { using TVariant = std::variant; - static std::variant getVariantByIndex(size_t i) - { + static std::variant getVariantByIndex(size_t i) { MKQL_ENSURE(i < sizeof...(Ts), "Wrong variant index"); static std::variant table[] = { Ts{ }... }; return table[i]; @@ -89,16 +88,11 @@ struct TNfaTransitionGraph { using TPtr = std::shared_ptr; void Save(TOutputSerializer& serealizer) const { - std::cerr << "TNfaTransitionGraph::Save() " << serealizer.Size() << std::endl; serealizer.Write(Transitions.size()); - std::cerr << "TNfaTransitionGraph::Save() size " << Transitions.size() << std::endl; - for (ui64 i = 0; i < Transitions.size(); ++i) { serealizer.Write(Transitions[i].index()); - std::cerr << "TNfaTransitionGraph::Save() index " << Transitions[i].index() << std::endl; - std::visit([&](auto&& arg) { using T = std::decay_t; @@ -131,20 +125,14 @@ struct TNfaTransitionGraph { } serealizer.Write(Input); serealizer.Write(Output); - std::cerr << "TNfaTransitionGraph::Save() end " << serealizer.Size() << std::endl; } void Load(TInputSerializer& serializer) { - std::cerr << "TNfaTransitionGraph::Load() " << serializer.Size() << std::endl; ui64 transitionSize = serializer.Read(); - std::cerr << "TNfaTransitionGraph::Load() transitionSize " << transitionSize << std::endl; Transitions.resize(transitionSize); for (ui64 i = 0; i < transitionSize; ++i) { size_t index = serializer.Read(); - - std::cerr << "TNfaTransitionGraph::Load() index " << index << std::endl; - Transitions[i] = TNfaTransitionHelper::getVariantByIndex(index); std::visit([&](auto&& arg) { @@ -180,7 +168,6 @@ struct TNfaTransitionGraph { } Input = serializer.Read(); Output = serializer.Read(); - std::cerr << "TNfaTransitionGraph::Load() end " << serializer.Size() << std::endl; } }; @@ -194,6 +181,7 @@ class TNfaTransitionGraphOptimizer { EliminateSingleEpsilons(); CollectGarbage(); } + private: void EliminateEpsilonChains() { for (size_t node = 0; node != Graph->Transitions.size(); node++) { @@ -367,6 +355,7 @@ class TNfa { using TRange = TSparseList::TRange; using TMatchedVars = TMatchedVars; + struct TState { TState() {} @@ -394,7 +383,6 @@ class TNfa { TQuantifiersStack Quantifiers; void Save(TOutputSerializer& serealizer) const { - std::cerr << "TState::Save() " << serealizer.Size() << std::endl; serealizer.Write(Index); serealizer.Write(Vars.size()); @@ -408,11 +396,9 @@ class TNfa { for (ui64 qnt : Quantifiers) { serealizer.Write(qnt); } - std::cerr << "TState::Save() end " << serealizer.Size() << std::endl; } void Load(TInputSerializer& serializer) { - std::cerr << "TState::Load() " << serializer.Size() << std::endl; Index = serializer.Read(); auto varsSize = serializer.Read(); @@ -423,7 +409,7 @@ class TNfa { ui64 vectorSize = serializer.Read(); subvec.resize(vectorSize); for (size_t j = 0; j < vectorSize; ++j) { - subvec[i].Load(serializer); + subvec[j].Load(serializer); } } @@ -435,7 +421,6 @@ class TNfa { ui64 qnt = serializer.Read(); Quantifiers.push(qnt); } - std::cerr << "TState::Load() end " << serializer.Size() << std::endl; } friend inline bool operator<(const TState& lhs, const TState& rhs) { @@ -446,13 +431,14 @@ class TNfa { } }; public: + TNfa(TNfaTransitionGraph::TPtr transitionGraph, IComputationExternalNode* matchedRangesArg, const TComputationNodePtrVector& defines) : TransitionGraph(transitionGraph) , MatchedRangesArg(matchedRangesArg) , Defines(defines) { } - void ProcessRow(TSparseList::TRange&& currentRowLock, TComputationContext& ctx) { + void ProcessRow(TSparseList::TRange&& currentRowLock, TComputationContext& ctx) { ActiveStates.emplace(TransitionGraph->Input, TMatchedVars(Defines.size()), std::stack>>{}); MakeEpsilonTransitions(); std::set, TMKQLAllocator> newStates; @@ -509,28 +495,23 @@ class TNfa { } void Save(TOutputSerializer& serealizer) const { - std::cerr << "TNfa::Save() " << serealizer.Size() << std::endl; TransitionGraph->Save(serealizer); serealizer.Write(ActiveStates.size()); for (const auto& state : ActiveStates) { state.Save(serealizer); } serealizer.Write(EpsilonTransitionsLastRow); - std::cerr << "TNfa::Save() end " << serealizer.Size() << std::endl; } void Load(TInputSerializer& serializer) { - std::cerr << "TNfa::Load() " << serializer.Size() << std::endl; TransitionGraph->Load(serializer); auto stateSize = serializer.Read(); - std::cerr << "TNfa::Load() ActiveStates size " << stateSize << std::endl; for (size_t i = 0; i < stateSize; ++i) { TState state; state.Load(serializer); ActiveStates.emplace(state); } EpsilonTransitionsLastRow = serializer.Read(); - std::cerr << "TNfa::Load() end " << serializer.Size() << std::endl; } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h index 1e59e7bd2c4b..e52e7a505a7f 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h @@ -71,9 +71,12 @@ struct TOutputSerializer { template void Write(const TIntrusivePtr& ptr) { - std::cerr << " SavePtr) " << Buf.size() << std::endl; + // Format + // bool isValid + // Ui64 addr + // ui8 mode + // ...data auto refCount = ptr.RefCount(); - std::cerr << " SavePtr) refCount " <(ptr); WriteBool(Buf, isValid); @@ -85,15 +88,12 @@ struct TOutputSerializer { auto it = WriteCache.find(addr); if (it == WriteCache.end()) { - std::cerr << " SavePtr) new " << std::endl; WriteByte(Buf, static_cast(TPtrStateMode::Saved)); ptr->Save(*this); WriteCache[addr] = addr; } else { WriteByte(Buf, static_cast(TPtrStateMode::FromCache)); - std::cerr << " SavePtr) from cache " << std::endl; } - std::cerr << " SavePtr) end " << Buf.size() << std::endl; } NUdf::TUnboxedValuePod MakeString() { @@ -109,7 +109,6 @@ struct TOutputSerializer { TString Buf; const TSaveLoadContext& Context; mutable std::map WriteCache; - mutable std::map ReadCache; }; // template @@ -127,7 +126,7 @@ struct TInputSerializer { TInputSerializer(TSaveLoadContext& context, const NUdf::TStringRef& state) : Context(context) , Buf(state.Data(), state.Size()) - {} + {} template ReturnType Read() { @@ -146,14 +145,11 @@ struct TInputSerializer { } else static_assert(always_false_v2, "non-exhaustive visitor!"); - std::cerr << "Not implemented " << typeid(Type).name() << std::endl; MKQL_ENSURE(false, "Not implemented"); } template void Read(TIntrusivePtr& ptr) { - std::cerr << " Load) " << Buf.size() << std::endl; - //assert(false); bool isValid = Read(); if (!isValid) { ptr.Reset(); @@ -162,20 +158,16 @@ struct TInputSerializer { ui64 addr = Read(); TPtrStateMode mode = static_cast(Read()); if (mode == TPtrStateMode::Saved) { - auto newPtr = MakeIntrusive(); - newPtr->Load(*this); - ptr = newPtr; - ReadCache[addr] = newPtr.Get(); + ptr = MakeIntrusive(); + ptr->Load(*this); + ReadCache[addr] = ptr.Get(); } else { auto it = ReadCache.find(addr); MKQL_ENSURE(it != ReadCache.end(), "Internal error"); auto* cachePtr = static_cast(it->second); ptr = TIntrusivePtr(cachePtr); - auto refCount = ptr.RefCount(); - std::cerr << " Load) refCount " << refCount << std::endl; } - std::cerr << " Load) end " << Buf.size() << std::endl; } NUdf::TUnboxedValuePod MakeString() { @@ -190,7 +182,6 @@ struct TInputSerializer { private: TStringBuf Buf; TSaveLoadContext& Context; - mutable std::map WriteCache; mutable std::map ReadCache; }; diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp index 4c14ba0efc74..7b0d29352bb1 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp +++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp @@ -148,15 +148,13 @@ namespace NKikimr { auto value = graph1->GetValue(); - std::cerr << "IsFinish " << value.IsFinish() << std::endl; UNIT_ASSERT(!value.IsFinish() && value); auto v = value.GetElement(0).Get(); - std::cerr << "GetElement " << v << std::endl; TString graphState = graph1->SaveGraphState(); - std::cerr << "----------------------" << std::endl; - std::cerr << "State size " << graphState.size() << std::endl; + // graph1.Reset(); + TSetup setup2(alloc); auto graph2 = BuildGraph(setup2, TTestInputData{{1003, "D", 103, "P"}}); @@ -165,7 +163,7 @@ namespace NKikimr { value = graph2->GetValue(); UNIT_ASSERT(!value.IsFinish() && value); v = value.GetElement(0).Get(); - std::cerr << "GetElement " << v << std::endl; + UNIT_ASSERT_VALUES_EQUAL(56, v); } const TTestInputData input = { diff --git a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py index 60c473548d62..db2878f7ad7d 100644 --- a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py +++ b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py @@ -235,7 +235,7 @@ def test_match_recognize(self, kikimr, client, yq_version): assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING - expected = ['{"a_str":"A","b_str":"B","c_str":"C",,"dt_begin":1696849942000001,"dt_end":1696849943000001}'] + expected = ['{"a_str":"A","b_str":"B","c_str":"C","dt_begin":1696849942000001,"dt_end":1696849943000001}'] read_data = self.read_stream(1) logging.info("Data was read: {}".format(read_data)) From b669c52318e60e6ee9c2b26a89471a6fbd664943 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 12 Feb 2024 15:52:02 +0000 Subject: [PATCH 10/19] refactoring --- .../comp_nodes/mkql_match_recognize.cpp | 44 ++-- .../comp_nodes/mkql_match_recognize_list.h | 37 ++-- .../comp_nodes/mkql_match_recognize_nfa.h | 81 +++---- .../mkql_match_recognize_save_load.h | 207 ++++++++++++++++++ 4 files changed, 287 insertions(+), 82 deletions(-) create mode 100644 ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 2023ab440a0d..e1aae0db5581 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -2,10 +2,9 @@ #include "mkql_match_recognize_matched_vars.h" #include "mkql_match_recognize_measure_arg.h" #include "mkql_match_recognize_nfa.h" -#include "mkql_match_recognize_parameters.h" +#include "mkql_match_recognize_save_load.h" #include -#include #include #include #include @@ -20,10 +19,27 @@ namespace NKikimr::NMiniKQL { namespace NMatchRecognize { +enum class EOutputColumnSource {PartitionKey, Measure}; +using TOutputColumnOrder = std::vector, TMKQLAllocator>>; + constexpr ui32 StateVersion = 1; using namespace NYql::NMatchRecognize; +struct TMatchRecognizeProcessorParameters { + IComputationExternalNode* InputDataArg; + NYql::NMatchRecognize::TRowPattern Pattern; + TUnboxedValueVector VarNames; + THashMap VarNamesLookup; + IComputationExternalNode* MatchedVarsArg; + IComputationExternalNode* CurrentRowIndexArg; + TComputationNodePtrVector Defines; + IComputationExternalNode* MeasureInputDataArg; + TMeasureInputColumnOrder MeasureInputColumnOrder; + TComputationNodePtrVector Measures; + TOutputColumnOrder OutputColumnOrder; +}; + class TBackTrackingMatchRecognize { using TPartitionList = TSimpleList; using TRange = TPartitionList::TRange; @@ -125,7 +141,6 @@ class TBackTrackingMatchRecognize { class TStreamingMatchRecognize { using TPartitionList = TSparseList; using TRange = TPartitionList::TRange; - // using TMatchedVars = TMatchedVars; public: using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder; TStreamingMatchRecognize( @@ -186,11 +201,11 @@ class TStreamingMatchRecognize { return false; } - void Save(TOutputSerializer& serealizer) const { + void Save(TOutputSerializer& serializer) const { // TODO : PartitionKey - Rows.Save(serealizer); - Nfa.Save(serealizer); - serealizer.Write(MatchNumber); + Rows.Save(serializer); + Nfa.Save(serializer); + serializer.Write(MatchNumber); } void Load(TInputSerializer& serializer) { @@ -343,16 +358,17 @@ class TStateForInterleavedPartitions {} NUdf::TUnboxedValue Save() const override { - TOutputSerializer serealizer(SaveLoadContex); - serealizer.Write(StateVersion); - serealizer.Write(Partitions.size()); + TOutputSerializer serializer(SaveLoadContex); + + serializer.Write(StateVersion); + serializer.Write(Partitions.size()); for (const auto& [key, state] : Partitions) { - serealizer.Write(key); - state->Save(serealizer); + serializer.Write(key); + state->Save(serializer); } - return serealizer.MakeString(); + return serializer.MakeString(); } void Load(const NUdf::TStringRef& state) override { @@ -381,7 +397,7 @@ class TStateForInterleavedPartitions // } // } } - MKQL_ENSURE(!serializer.Size(), "State is corrupted"); + MKQL_ENSURE(serializer.Empty(), "State is corrupted"); } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index b8159cfb6192..5b729f3abfe0 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include @@ -135,18 +135,17 @@ class TSparseList { } } - void Save(TOutputSerializer& serealizer) const { - serealizer.Write(Storage.size()); + void Save(TOutputSerializer& serializer) const { + serializer.Write(Storage.size()); for (const auto& [key, item]: Storage) { - serealizer.Write(key); - serealizer.Write(item.Value); - serealizer.Write(item.LockCount); + serializer.Write(key); + serializer.Write(item.Value); + serializer.Write(item.LockCount); } } void Load(TInputSerializer& serializer) { - //auto size = serializer.Read(); - auto size = serializer.Read(); + auto size = serializer.Read(); for (size_t i = 0; i < size; ++i) { auto key = serializer.Read(); NUdf::TUnboxedValue row = serializer.Read(); @@ -268,17 +267,17 @@ class TSparseList { ToIndex = -1; } - void Save(TOutputSerializer& serealizer) const { - serealizer.Write(Container); - serealizer.Write(FromIndex); - serealizer.Write(ToIndex); + void Save(TOutputSerializer& serializer) const { + serializer.Write(Container); + serializer.Write(FromIndex); + serializer.Write(ToIndex); } void Load(TInputSerializer& serializer) { serializer.Read(Container); - FromIndex = serializer.Read(); - ToIndex = serializer.Read(); - } + serializer.Read(FromIndex); + serializer.Read(ToIndex); + } private: TRange(TContainerPtr container, size_t index) @@ -335,14 +334,14 @@ class TSparseList { return Size() == 0; } - void Save(TOutputSerializer& serealizer) const { - serealizer.Write(Container); - serealizer.Write(ListSize); + void Save(TOutputSerializer& serializer) const { + serializer.Write(Container); + serializer.Write(ListSize); } void Load(TInputSerializer& serializer) { serializer.Read(Container); - ListSize = serializer.Read(); + serializer.Read(ListSize); } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index d50c07f64fc7..80b5eceefa89 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -87,11 +87,11 @@ struct TNfaTransitionGraph { using TPtr = std::shared_ptr; - void Save(TOutputSerializer& serealizer) const { - serealizer.Write(Transitions.size()); + void Save(TOutputSerializer& serializer) const { + serializer.Write(Transitions.size()); for (ui64 i = 0; i < Transitions.size(); ++i) { - serealizer.Write(Transitions[i].index()); + serializer.Write(Transitions[i].index()); std::visit([&](auto&& arg) { @@ -100,31 +100,23 @@ struct TNfaTransitionGraph { // Nothing } else if constexpr (std::is_same_v) { - serealizer.Write(arg.first.first); - serealizer.Write(arg.first.second); - serealizer.Write(arg.second); + serializer.Write(arg); } else if constexpr (std::is_same_v) { - serealizer.Write(arg.size()); - for (size_t i = 0; i < arg.size(); ++i) { - serealizer.Write(arg[i]); - } + serializer.Write(arg); } else if constexpr (std::is_same_v) { - serealizer.Write(arg); + serializer.Write(arg); } else if constexpr (std::is_same_v) { - serealizer.Write(arg.first.first); - serealizer.Write(arg.first.second); - serealizer.Write(arg.second.first); - serealizer.Write(arg.second.second); + serializer.Write(arg); } else static_assert(always_false_v, "non-exhaustive visitor!"); }, Transitions[i]); } - serealizer.Write(Input); - serealizer.Write(Output); + serializer.Write(Input); + serializer.Write(Output); } void Load(TInputSerializer& serializer) { @@ -141,33 +133,24 @@ struct TNfaTransitionGraph { // Nothing } else if constexpr (std::is_same_v) { - arg.first.first = serializer.Read(); - arg.first.second = serializer.Read(); - arg.second = serializer.Read(); + serializer.Read(arg); } else if constexpr (std::is_same_v) { - ui64 size = serializer.Read(); - arg.resize(size); - for (size_t i = 0; i < size; ++i) { - arg[i] = serializer.Read(); - } + serializer.Read(arg); } else if constexpr (std::is_same_v) { - arg = serializer.Read(); + serializer.Read(arg); } else if constexpr (std::is_same_v) { - arg.first.first = serializer.Read(); - arg.first.second = serializer.Read(); - arg.second.first = serializer.Read(); - arg.second.second = serializer.Read(); + serializer.Read(arg); } - else + else static_assert(always_false_v, "non-exhaustive visitor!"); }, Transitions[i]); } - Input = serializer.Read(); - Output = serializer.Read(); + serializer.Read(Input); + serializer.Read(Output); } }; @@ -382,26 +365,26 @@ class TNfa { TQuantifiersStack Quantifiers; - void Save(TOutputSerializer& serealizer) const { - serealizer.Write(Index); + void Save(TOutputSerializer& serializer) const { + serializer.Write(Index); - serealizer.Write(Vars.size()); + serializer.Write(Vars.size()); for (const auto& vector : Vars) { - serealizer.Write(vector.size()); + serializer.Write(vector.size()); for (const auto& range : vector) { - range.Save(serealizer); + range.Save(serializer); } } - serealizer.Write(Quantifiers.size()); + serializer.Write(Quantifiers.size()); for (ui64 qnt : Quantifiers) { - serealizer.Write(qnt); + serializer.Write(qnt); } } void Load(TInputSerializer& serializer) { - Index = serializer.Read(); - - auto varsSize = serializer.Read(); + serializer.Read(Index); + + auto varsSize = serializer.Read(); Vars.clear(); Vars.resize(varsSize); for (size_t i = 0; i < varsSize; ++i) { @@ -494,13 +477,13 @@ class TNfa { return ActiveStates.size(); } - void Save(TOutputSerializer& serealizer) const { - TransitionGraph->Save(serealizer); - serealizer.Write(ActiveStates.size()); + void Save(TOutputSerializer& serializer) const { + TransitionGraph->Save(serializer); + serializer.Write(ActiveStates.size()); for (const auto& state : ActiveStates) { - state.Save(serealizer); + state.Save(serializer); } - serealizer.Write(EpsilonTransitionsLastRow); + serializer.Write(EpsilonTransitionsLastRow); } void Load(TInputSerializer& serializer) { @@ -511,7 +494,7 @@ class TNfa { state.Load(serializer); ActiveStates.emplace(state); } - EpsilonTransitionsLastRow = serializer.Read(); + serializer.Read(EpsilonTransitionsLastRow); } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h new file mode 100644 index 000000000000..3ef1dd1f9061 --- /dev/null +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h @@ -0,0 +1,207 @@ +#pragma once + +#include +#include +#include +#include + +namespace NKikimr::NMiniKQL::NMatchRecognize { + +struct TSaveLoadContext { + + TComputationContext& Ctx; + TType* StateType; + const TMutableObjectOverBoxedValue& Packer; +}; + +template +inline constexpr bool always_false_v2 = false; + +struct TOutputSerializer { +private: + enum class TPtrStateMode { + Saved = 0, + FromCache = 1 + }; + +public: + TOutputSerializer(const TSaveLoadContext& context) + : Context(context) + {} + + template + void Write(const Type& value ) { + if constexpr (std::is_same_v, TString>) { + WriteString(Buf, value); + } else if constexpr (std::is_same_v, ui64>) { + WriteUi64(Buf, value); + } else if constexpr (std::is_same_v, bool>) { + WriteBool(Buf, value); + } else if constexpr (std::is_same_v, ui8>) { + WriteByte(Buf, value); + } else if constexpr (std::is_same_v, ui32>) { + WriteUi32(Buf, value); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { + WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { + WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); + } + else { + static_assert(always_false_v2, "Not supported type / not implemented"); + } + } + + template + void Write(const TIntrusivePtr& ptr) { + bool isValid = static_cast(ptr); + WriteBool(Buf, isValid); + if (!isValid) { + return; + } + auto addr = reinterpret_cast(ptr.Get()); + WriteUi64(Buf, addr); + + auto it = Cache.find(addr); + if (it != Cache.end()) { + WriteByte(Buf, static_cast(TPtrStateMode::FromCache)); + return; + } + WriteByte(Buf, static_cast(TPtrStateMode::Saved)); + ptr->Save(*this); + Cache[addr] = addr; + } + + template + void Write(const std::pair& value) { + Write(value.first); + Write(value.second); + } + + template + void Write(const std::vector& value) { + Write(value.size()); + for (size_t i = 0; i < value.size(); ++i) { + Write(value[i]); + } + } + + NUdf::TUnboxedValuePod MakeString() { + auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); + return NKikimr::NMiniKQL::MakeString(strRef); + } + +private: + TString Buf; + const TSaveLoadContext& Context; + mutable std::map Cache; +}; + +struct TInputSerializer { +private: + enum class TPtrStateMode { + Saved = 0, + FromCache = 1 + }; + +public: + TInputSerializer(TSaveLoadContext& context, const NUdf::TStringRef& state) + : Context(context) + , Buf(state.Data(), state.Size()) + {} + + template + ReturnType Read() { + if constexpr (std::is_same_v, TString>) { + return ReadString(Buf); + } else if constexpr (std::is_same_v, ui64>) { + return ReadUi64(Buf); + } else if constexpr (std::is_same_v, bool>) { + return ReadBool(Buf); + } else if constexpr (std::is_same_v, ui8>) { + return ReadByte(Buf); + } else if constexpr (std::is_same_v, ui32>) { + return ReadUi32(Buf); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { + return ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); + } + else { + static_assert(always_false_v2, "Not supported type / not implemented"); + } + } + + template + void Read(Type& value) { + if constexpr (std::is_same_v, TString>) { + value = ReadString(Buf); + } else if constexpr (std::is_same_v, ui64>) { + value = ReadUi64(Buf); + } else if constexpr (std::is_same_v, bool>) { + value = ReadBool(Buf); + } else if constexpr (std::is_same_v, ui8>) { + value = ReadByte(Buf); + } else if constexpr (std::is_same_v, ui32>) { + value = ReadUi32(Buf); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { + value = ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); + } + else { + static_assert(always_false_v2, "Not supported type / not implemented"); + } + } + + template + void Read(TIntrusivePtr& ptr) { + bool isValid = Read(); + if (!isValid) { + ptr.Reset(); + return; + } + ui64 addr = Read(); + TPtrStateMode mode = static_cast(Read()); + if (mode == TPtrStateMode::Saved) { + ptr = MakeIntrusive(); + ptr->Load(*this); + Cache[addr] = ptr.Get(); + return; + } + auto it = Cache.find(addr); + MKQL_ENSURE(it != Cache.end(), "Internal error"); + auto* cachePtr = static_cast(it->second); + ptr = TIntrusivePtr(cachePtr); + auto refCount = ptr.RefCount(); + } + + template + void Read(std::pair& value) { + Read(value.first); + Read(value.second); + } + + template + void Read(std::vector& value) { + using TVector = std::vector; + //auto size = Read(); + auto size = Read(); + value.clear(); + value.resize(size); + for (size_t i = 0; i < size; ++i) { + Read(value[i]); + } + } + + NUdf::TUnboxedValuePod MakeString() { + auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); + return NKikimr::NMiniKQL::MakeString(strRef); + } + + bool Empty() + { + return Buf.size() == 0; + } +private: + TStringBuf Buf; + TSaveLoadContext& Context; + mutable std::map Cache; +}; + +} //namespace NKikimr::NMiniKQL::NMatchRecognize From 2663d1e1674be62010a9a07fbaffcd83d7f5ca26 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Feb 2024 07:58:30 +0000 Subject: [PATCH 11/19] Fix build / add second test --- .../comp_nodes/mkql_match_recognize.cpp | 62 ++++- .../comp_nodes/mkql_match_recognize_list.h | 2 - .../mkql_match_recognize_parameters.h | 188 ------------- .../mkql_match_recognize_save_load.h | 16 +- .../comp_nodes/ut/mkql_match_recognize_ut.cpp | 72 ++--- .../kikimr/test_recovery_match_recognize.py | 248 ------------------ ydb/tests/fq/kikimr/test_recovery_mz.py | 3 +- .../fq/yds/test_recovery_match_recognize.py | 118 ++++++++- 8 files changed, 194 insertions(+), 515 deletions(-) delete mode 100644 ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h delete mode 100644 ydb/tests/fq/kikimr/test_recovery_match_recognize.py diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index e1aae0db5581..8a5cf6d1bd18 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -128,6 +128,20 @@ class TBackTrackingMatchRecognize { } return not Matches.empty(); } + + void Save(TOutputSerializer& /*serializer*/) const { + // // TODO : PartitionKey + // Rows.Save(serializer); + // Nfa.Save(serializer); + // serializer.Write(MatchNumber); + } + + void Load(TInputSerializer& /*serializer*/) { + // Rows.Load(serializer); + // Nfa.Load(serializer); + // MatchNumber = serializer.Read(); + } + private: const NUdf::TUnboxedValue PartitionKey; const TMatchRecognizeProcessorParameters& Parameters; @@ -248,16 +262,48 @@ class TStateForNonInterleavedPartitions , RowPatternConfiguration(TRowPatternConfigurationBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) , Terminating(false) + , SaveLoadContex(ctx, stateType, packer) {} NUdf::TUnboxedValue Save() const override { - TString out; - - auto strRef = NUdf::TStringRef(out.data(), out.size()); - return MakeString(strRef); + TOutputSerializer serializer(SaveLoadContex); + serializer.Write(StateVersion); + serializer.Write(CurPartitionPackedKey); + bool isValid = static_cast(PartitionHandler); + serializer.Write(isValid); + if (isValid) { + PartitionHandler->Save(serializer); + } + isValid = static_cast(DelayedRow); + serializer.Write(isValid); + if (isValid) { + serializer.Write(DelayedRow); + } + return serializer.MakeString(); } void Load(const NUdf::TStringRef& state) override { + + TInputSerializer serializer(SaveLoadContex, state); + const auto stateVersion = serializer.Read(); + if (stateVersion == 1) { + serializer.Read(CurPartitionPackedKey); + bool validPartitionHandler = serializer.Read(); + if (validPartitionHandler) { + PartitionHandler.reset(new Algo( + NYql::NUdf::TUnboxedValuePod(NYql::NUdf::TStringValue("asd")),// TODO + Parameters, + RowPatternConfiguration, + Cache + )); + PartitionHandler->Load(serializer); + } + bool validDelayedRow = serializer.Read(); + if (validDelayedRow) { + DelayedRow = serializer.Read(); + } + } + MKQL_ENSURE(serializer.Empty(), "State is corrupted"); } bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { @@ -328,6 +374,7 @@ class TStateForNonInterleavedPartitions const TContainerCacheOnContext& Cache; NUdf::TUnboxedValue DelayedRow; bool Terminating; + TSerializerContext SaveLoadContex; }; class TStateForInterleavedPartitions @@ -359,7 +406,6 @@ class TStateForInterleavedPartitions NUdf::TUnboxedValue Save() const override { TOutputSerializer serializer(SaveLoadContex); - serializer.Write(StateVersion); serializer.Write(Partitions.size()); @@ -367,7 +413,7 @@ class TStateForInterleavedPartitions serializer.Write(key); state->Save(serializer); } - + serializer.Write(Terminating); return serializer.MakeString(); } @@ -396,6 +442,7 @@ class TStateForInterleavedPartitions // HasReadyOutput.push(it); // } // } + serializer.Read(Terminating); } MKQL_ENSURE(serializer.Empty(), "State is corrupted"); } @@ -462,7 +509,7 @@ class TStateForInterleavedPartitions const TMatchRecognizeProcessorParameters& Parameters; const TNfaTransitionGraph::TPtr NfaTransitionGraph; const TContainerCacheOnContext& Cache; - TSaveLoadContext SaveLoadContex; + TSerializerContext SaveLoadContex; }; template @@ -660,7 +707,6 @@ std::pair> ConvertListOfStrings(c } //namespace NMatchRecognize - IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputationNodeFactoryContext& ctx) { using namespace NMatchRecognize; size_t inputIndex = 0; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index 5b729f3abfe0..3e32ba75264e 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -92,8 +92,6 @@ class TSparseList { size_t LockCount = 0; }; - - class TContainer: public TSimpleRefCount { public: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h deleted file mode 100644 index e52e7a505a7f..000000000000 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_parameters.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -#include "mkql_match_recognize_measure_arg.h" -#include "mkql_saveload.h" -#include -#include -#include -#include -#include - -namespace NKikimr::NMiniKQL::NMatchRecognize { - -enum class EOutputColumnSource {PartitionKey, Measure}; -using TOutputColumnOrder = std::vector, TMKQLAllocator>>; - -struct TMatchRecognizeProcessorParameters { - IComputationExternalNode* InputDataArg; - NYql::NMatchRecognize::TRowPattern Pattern; - TUnboxedValueVector VarNames; - THashMap VarNamesLookup; - IComputationExternalNode* MatchedVarsArg; - IComputationExternalNode* CurrentRowIndexArg; - TComputationNodePtrVector Defines; - IComputationExternalNode* MeasureInputDataArg; - TMeasureInputColumnOrder MeasureInputColumnOrder; - TComputationNodePtrVector Measures; - TOutputColumnOrder OutputColumnOrder; -}; - -struct TSaveLoadContext { - - TComputationContext& Ctx; - TType* StateType; - const TMutableObjectOverBoxedValue& Packer; -}; - -template -inline constexpr bool always_false_v2 = false; - -struct TOutputSerializer { -private: - enum class TPtrStateMode { - Saved = 0, - FromCache = 1 - }; - -public: - TOutputSerializer(const TSaveLoadContext& context) - : Context(context) - {} - - void Write(ui32 value) { - WriteUi32(Buf, value); - } - - void Write(ui64 value) { - WriteUi64(Buf, value); - } - - void Write(std::string_view value) { - WriteString(Buf, value); - } - - void Write(const NUdf::TUnboxedValue& value) { - WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); - } - - void Write(bool value) { - WriteBool(Buf, value); - } - - template - void Write(const TIntrusivePtr& ptr) { - // Format - // bool isValid - // Ui64 addr - // ui8 mode - // ...data - auto refCount = ptr.RefCount(); - bool isValid = static_cast(ptr); - WriteBool(Buf, isValid); - - if (!isValid) { - return; - } - auto addr = reinterpret_cast(ptr.Get()); - WriteUi64(Buf, addr); - - auto it = WriteCache.find(addr); - if (it == WriteCache.end()) { - WriteByte(Buf, static_cast(TPtrStateMode::Saved)); - ptr->Save(*this); - WriteCache[addr] = addr; - } else { - WriteByte(Buf, static_cast(TPtrStateMode::FromCache)); - } - } - - NUdf::TUnboxedValuePod MakeString() { - auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); - return NKikimr::NMiniKQL::MakeString(strRef); - } - - size_t Size() // TODO : delete - { - return Buf.size(); - } -private: - TString Buf; - const TSaveLoadContext& Context; - mutable std::map WriteCache; -}; - -// template -// inline constexpr bool always_false_v = false; -#include - -struct TInputSerializer { -private: - enum class TPtrStateMode { - Saved = 0, - FromCache = 1 - }; - -public: - TInputSerializer(TSaveLoadContext& context, const NUdf::TStringRef& state) - : Context(context) - , Buf(state.Data(), state.Size()) - {} - - template - ReturnType Read() { - if constexpr (std::is_same_v, TString>) { - return ReadString(Buf); - } else if constexpr (std::is_same_v, ui64>) { - return ReadUi64(Buf); - } else if constexpr (std::is_same_v, bool>) { - return ReadBool(Buf); - } else if constexpr (std::is_same_v, ui8>) { - return ReadByte(Buf); - } else if constexpr (std::is_same_v, ui32>) { - return ReadUi32(Buf); - } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { - return ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); - } - else - static_assert(always_false_v2, "non-exhaustive visitor!"); - MKQL_ENSURE(false, "Not implemented"); - } - - template - void Read(TIntrusivePtr& ptr) { - bool isValid = Read(); - if (!isValid) { - ptr.Reset(); - return; - } - ui64 addr = Read(); - TPtrStateMode mode = static_cast(Read()); - if (mode == TPtrStateMode::Saved) { - ptr = MakeIntrusive(); - ptr->Load(*this); - ReadCache[addr] = ptr.Get(); - } else { - auto it = ReadCache.find(addr); - MKQL_ENSURE(it != ReadCache.end(), "Internal error"); - auto* cachePtr = static_cast(it->second); - ptr = TIntrusivePtr(cachePtr); - auto refCount = ptr.RefCount(); - } - } - - NUdf::TUnboxedValuePod MakeString() { - auto strRef = NUdf::TStringRef(Buf.data(), Buf.size()); - return NKikimr::NMiniKQL::MakeString(strRef); - } - - size_t Size() // TODO : delete - { - return Buf.size(); - } -private: - TStringBuf Buf; - TSaveLoadContext& Context; - mutable std::map ReadCache; -}; - -} //namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h index 3ef1dd1f9061..efa2f45bdf77 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h @@ -7,7 +7,7 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { -struct TSaveLoadContext { +struct TSerializerContext { TComputationContext& Ctx; TType* StateType; @@ -25,7 +25,7 @@ struct TOutputSerializer { }; public: - TOutputSerializer(const TSaveLoadContext& context) + TOutputSerializer(const TSerializerContext& context) : Context(context) {} @@ -91,8 +91,8 @@ struct TOutputSerializer { } private: + const TSerializerContext& Context; TString Buf; - const TSaveLoadContext& Context; mutable std::map Cache; }; @@ -104,7 +104,7 @@ struct TInputSerializer { }; public: - TInputSerializer(TSaveLoadContext& context, const NUdf::TStringRef& state) + TInputSerializer(TSerializerContext& context, const NUdf::TStringRef& state) : Context(context) , Buf(state.Data(), state.Size()) {} @@ -168,7 +168,6 @@ struct TInputSerializer { MKQL_ENSURE(it != Cache.end(), "Internal error"); auto* cachePtr = static_cast(it->second); ptr = TIntrusivePtr(cachePtr); - auto refCount = ptr.RefCount(); } template @@ -180,8 +179,8 @@ struct TInputSerializer { template void Read(std::vector& value) { using TVector = std::vector; + auto size = Read(); //auto size = Read(); - auto size = Read(); value.clear(); value.resize(size); for (size_t i = 0; i < size; ++i) { @@ -196,11 +195,12 @@ struct TInputSerializer { bool Empty() { - return Buf.size() == 0; + return Buf.empty(); } + private: + TSerializerContext& Context; TStringBuf Buf; - TSaveLoadContext& Context; mutable std::map Cache; }; diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp index 7b0d29352bb1..62a54ae065ba 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp +++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp @@ -61,9 +61,11 @@ namespace NKikimr { }; using TTestInputData = std::vector>; - using TTestData = std::vector>; - THolder BuildGraph(TSetup& setup, const TTestInputData& input) { + THolder BuildGraph( + TSetup& setup, + bool streamingMode, + const TTestInputData& input) { TProgramBuilder& pgmBuilder = *setup.PgmBuilder; auto structType = pgmBuilder.NewStructType({ @@ -73,8 +75,6 @@ namespace NKikimr { {"part", pgmBuilder.NewDataType(NUdf::TDataType::Id)}}); TVector items; - // constexpr ui64 g_Yield = std::numeric_limits::max(); - // items.push_back(pgmBuilder.NewDataLiteral(g_Yield)); for (size_t i = 0; i < input.size(); ++i) { auto time = pgmBuilder.NewDataLiteral(std::get<0>(input[i])); @@ -90,30 +90,18 @@ namespace NKikimr { const auto list = pgmBuilder.NewList(structType, std::move(items)); auto inputFlow = pgmBuilder.ToFlow(list); - i64 delay = -10; - i64 ahead = 30; - ui32 rowLimit = 20; - - // MEASURES - // LAST(A.dt) as dt_begin - // ONE ROW PER MATCH - // PATTERN ( A{3, 3} ) - // DEFINE A as True) - - - TVector partitionColumns;// = {TStringBuf("a")}; + TVector partitionColumns; TVector> getMeasures = {{ std::make_pair( TStringBuf("key"), - [&](TRuntimeNode measureInputDataArg, TRuntimeNode matchedVarsArg) { - // auto run = pgmBuilder.Take(measureInputDataArg, pgmBuilder.NewDataLiteral(0)); + [&](TRuntimeNode /*measureInputDataArg*/, TRuntimeNode /*matchedVarsArg*/) { return pgmBuilder.NewDataLiteral(56); } )}}; TVector> getDefines = {{ std::make_pair( TStringBuf("A"), - [&](TRuntimeNode inputDataArg, TRuntimeNode matchedVarsArg, TRuntimeNode currentRowIndexArg) { + [&](TRuntimeNode /*inputDataArg*/, TRuntimeNode /*matchedVarsArg*/, TRuntimeNode /*currentRowIndexArg*/) { return pgmBuilder.NewDataLiteral(true); } )}}; @@ -123,13 +111,13 @@ namespace NKikimr { [&](TRuntimeNode item) { return pgmBuilder.Member(item, "part"); }, - partitionColumns, // partitionColumns + partitionColumns, getMeasures, { {NYql::NMatchRecognize::TRowPatternFactor{"A", 3, 3, false, false, false}} }, getDefines, - true); + streamingMode); auto graph = setup.BuildGraph(pgmReturn); return graph; @@ -138,13 +126,20 @@ namespace NKikimr { Y_UNIT_TEST_SUITE(TMiniKQLMatchRecognizeSaveLoadTest) { void TestWithSaveLoadImpl( - const TTestInputData& input, - const TTestData& expected) + bool streamingMode) { TScopedAlloc alloc(__LOCATION__); std::vector> result; TSetup setup1(alloc); - auto graph1 = BuildGraph(setup1, input); + + const TTestInputData input = { + // Time; Key; Value; PartitionKey + {1000, "A", 101, "P"}, + {1001, "B", 102, "P"}, + {1002, "C", 103, "P"}, // <- match end + {1003, "D", 103, "P"}}; // <- not processed + + auto graph1 = BuildGraph(setup1,streamingMode, input); auto value = graph1->GetValue(); @@ -153,11 +148,11 @@ namespace NKikimr { TString graphState = graph1->SaveGraphState(); - // graph1.Reset(); + graph1.Reset(); TSetup setup2(alloc); - auto graph2 = BuildGraph(setup2, TTestInputData{{1003, "D", 103, "P"}}); + auto graph2 = BuildGraph(setup2, streamingMode, TTestInputData{{1003, "D", 103, "P"}}); graph2->LoadGraphState(graphState); value = graph2->GetValue(); @@ -166,27 +161,14 @@ namespace NKikimr { UNIT_ASSERT_VALUES_EQUAL(56, v); } - const TTestInputData input = { - // Time; Key; Value; PartitionKey - {1000, "A", 101, "P"}, - {1001, "B", 102, "P"}, - {1002, "C", 103, "P"}, // <- match end - {1003, "D", 103, "P"}}; // <- not processed - - const std::vector> expected = { - // Group; Time; Value - {1000, 800, 101}, - {1000, 800, 102}, - {1000, 800, 103}, - {1000, 800, 104}, - {1000, 800, 105}, - {3000, 801, 200}, - {2000, 802, 300}}; - - Y_UNIT_TEST(Test1) { - TestWithSaveLoadImpl(input, expected); + + Y_UNIT_TEST(StreamingMode) { + TestWithSaveLoadImpl(true); } + Y_UNIT_TEST(NotStreamingMode) { + TestWithSaveLoadImpl(false); + } } } // namespace NMiniKQL diff --git a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py b/ydb/tests/fq/kikimr/test_recovery_match_recognize.py deleted file mode 100644 index db2878f7ad7d..000000000000 --- a/ydb/tests/fq/kikimr/test_recovery_match_recognize.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import pytest -import logging -import os -import time - -import ydb.tests.library.common.yatest_common as yatest_common -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr, StreamingOverKikimrConfig, TenantConfig -import library.python.retry as retry -from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 -from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase -import ydb.public.api.protos.draft.fq_pb2 as fq - - -@pytest.fixture -def kikimr(request): - kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count={"/cp": TenantConfig(1), "/compute": TenantConfig(1)}) - kikimr = StreamingOverKikimr(kikimr_conf) - kikimr.start_mvp_mock_server() - kikimr.start() - yield kikimr - kikimr.stop_mvp_mock_server() - kikimr.stop() - - -class TestRecoveryMatchRecognize(TestYdsBase): - - @classmethod - def setup_class(cls): - # for retry - cls.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) - - # @retry.retry_intrusive - # def get_graph_master_node_id(self, kikimr, query_id): - # for node_index in kikimr.compute_plane.kikimr_cluster.nodes: - # if kikimr.compute_plane.get_task_count(node_index, query_id) > 0: - # return node_index - # assert False, "No active graphs found" - - def get_ca_count(self, kikimr, node_index): - result = kikimr.control_plane.get_sensors(node_index, "utils").find_sensor( - {"activity": "DQ_COMPUTE_ACTOR", "sensor": "ActorsAliveByActivity", "execpool": "User"} - ) - return result if result is not None else 0 - - def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.plain_or_under_sanitizer(30, 150)): - deadline = time.time() + wait_time - while True: - wcs = 0 - ccs = 0 - list = [] - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - wc = kikimr.control_plane.get_worker_count(node_index) - cc = self.get_ca_count(kikimr, node_index) - wcs += wc - ccs += cc - list.append([node_index, wc, cc]) - if wcs == worker_count and ccs == ca_count: - for [s, w, c] in list: - if w * 2 != c: - continue - for [s, w, c] in list: - logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) - return - if time.time() > deadline: - for [s, w, c] in list: - logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) - assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) - - def restart_node(self, kikimr, query_id): - # restart node with CA - - - # master_node_index = self.get_graph_master_node_id(kikimr, query_id) - # logging.debug("Master node {}".format(master_node_index)) - - node_to_restart = None - - # for node_index in kikimr.control_plane.kikimr_cluster.nodes: - # logging.debug("Master node {}".format(master_node_index)) - - for node_index in kikimr.compute_plane.kikimr_cluster.nodes: - wc = kikimr.compute_plane.get_worker_count(node_index) - if wc is not None: - if wc > 0 and node_to_restart is None: - node_to_restart = node_index - assert node_to_restart is not None, "Can't find any task on node" - - logging.debug("Restart compute node {}".format(node_to_restart)) - - kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].stop() - kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() - kikimr.compute_plane.wait_bootstrap(node_to_restart) - - @yq_v1 - @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) - def test_time_order_recorever(self, kikimr, client, yq_version): - - self.init_topics("test_time_order_recorever_save_load_state") - - sql = R''' - PRAGMA dq.MaxTasksPerStage="2"; - - pragma FeatureR010="prototype"; - pragma config.flags("TimeOrderRecoverDelay", "-1000000"); - pragma config.flags("TimeOrderRecoverAhead", "1000000"); - - INSERT INTO myyds.`{output_topic}` - SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) - FROM (SELECT * FROM myyds.`{input_topic}` - WITH ( - format=json_each_row, - SCHEMA - ( - dt UINT64 - ))) - MATCH_RECOGNIZE( - ORDER BY CAST(dt as Timestamp) - MEASURES - LAST(ALL_TRUE.dt) as dt - ONE ROW PER MATCH - PATTERN ( ALL_TRUE ) - DEFINE - ALL_TRUE as True)''' \ - .format( - input_topic=self.input_topic, - output_topic=self.output_topic, - ) - - client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) - query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id - client.wait_query_status(query_id, fq.QueryMeta.RUNNING) - kikimr.compute_plane.wait_zero_checkpoint(query_id) - - messages1 = ['{"dt": 1696849942400002}', '{"dt": 1696849942000001}'] - self.write_stream(messages1) - - logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) - kikimr.compute_plane.wait_completed_checkpoints( - query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 - ) - - self.restart_node(kikimr, query_id) - - messages2 = [ - '{"dt": 1696849942800000}', - '{"dt": 1696849943200003}', - '{"dt": 1696849943300003}', - '{"dt": 1696849943600003}', - '{"dt": 1696849943900003}' - ] - self.write_stream(messages2) - - assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING - - expected = ['{"dt":1696849942000001}', '{"dt":1696849942400002}', '{"dt":1696849942800000}'] - - read_data = self.read_stream(len(expected)) - logging.info("Data was read: {}".format(read_data)) - - assert read_data == expected - - client.abort_query(query_id) - client.wait_query(query_id) - - self.dump_workers(kikimr, 0, 0) - - @yq_v1 - @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) - def test_match_recognize(self, kikimr, client, yq_version): - - self.init_topics("test_match_recognize_save_load_state") - - sql = R''' - PRAGMA dq.MaxTasksPerStage="2"; - - pragma FeatureR010="prototype"; - pragma config.flags("TimeOrderRecoverDelay", "-1000000"); - pragma config.flags("TimeOrderRecoverAhead", "1000000"); - - INSERT INTO myyds.`{output_topic}` - SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) - FROM (SELECT * FROM myyds.`{input_topic}` - WITH ( - format=json_each_row, - SCHEMA - ( - dt UINT64, - str STRING - ))) - MATCH_RECOGNIZE( - ORDER BY CAST(dt as Timestamp) - MEASURES - LAST(A.dt) as dt_begin, - LAST(C.dt) as dt_end, - LAST(A.str) as a_str, - LAST(B.str) as b_str, - LAST(C.str) as c_str - ONE ROW PER MATCH - PATTERN ( A B C ) - DEFINE - A as A.str='A', - B as B.str='B', - C as C.str='C')''' \ - .format( - input_topic=self.input_topic, - output_topic=self.output_topic, - ) - - client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) - query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id - client.wait_query_status(query_id, fq.QueryMeta.RUNNING) - kikimr.compute_plane.wait_zero_checkpoint(query_id) - - messages1 = [ - '{"dt": 1696849942000001, "str": "A" }', - '{"dt": 1696849942500001, "str": "B" }', - '{"dt": 1696849943000001, "str": "C" }', - '{"dt": 1696849943600001, "str": "D" }'] # push A+B from TimeOrderRecoverer to MatchRecognize - self.write_stream(messages1) - - # A + B : in MatchRecognize - # C + D : in TimeOrderRecoverer - - logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) - kikimr.compute_plane.wait_completed_checkpoints( - query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 - ) - - self.restart_node(kikimr, query_id) - - self.write_stream(['{"dt": 1696849944100001, "str": "E" }']) - - assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING - - expected = ['{"a_str":"A","b_str":"B","c_str":"C","dt_begin":1696849942000001,"dt_end":1696849943000001}'] - - read_data = self.read_stream(1) - logging.info("Data was read: {}".format(read_data)) - - assert read_data == expected - - client.abort_query(query_id) - client.wait_query(query_id) - - self.dump_workers(kikimr, 0, 0) diff --git a/ydb/tests/fq/kikimr/test_recovery_mz.py b/ydb/tests/fq/kikimr/test_recovery_mz.py index 631f784a6d91..a2f76db97f90 100644 --- a/ydb/tests/fq/kikimr/test_recovery_mz.py +++ b/ydb/tests/fq/kikimr/test_recovery_mz.py @@ -12,7 +12,6 @@ from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig from ydb.tests.tools.fq_runner.kikimr_runner import TenantConfig -from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase @@ -86,6 +85,7 @@ def dump_workers(self, worker_count, ca_count, wait_time=yatest_common.plain_or_ for [s, w, c] in list: logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) + @yq_v1 def test_recovery(self, kikimr, client, yq_version): self.init_topics(f"pq_kikimr_streaming_{yq_version}", partitions_count=2) @@ -108,7 +108,6 @@ def test_recovery(self, kikimr, client, yq_version): output_topic=self.output_topic, ) client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) - # client = FederatedQueryClient("my_folder", streaming_over_kikimr=kikimr) query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id client.wait_query_status(query_id, fq.QueryMeta.RUNNING) self.kikimr.compute_plane.wait_zero_checkpoint(query_id) diff --git a/ydb/tests/fq/yds/test_recovery_match_recognize.py b/ydb/tests/fq/yds/test_recovery_match_recognize.py index dcca66c09125..9da918823f5b 100644 --- a/ydb/tests/fq/yds/test_recovery_match_recognize.py +++ b/ydb/tests/fq/yds/test_recovery_match_recognize.py @@ -70,6 +70,28 @@ def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.p logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) + def restart_node(self, kikimr, query_id): + # restart node with CA + + + # master_node_index = self.get_graph_master_node_id(kikimr, query_id) + # logging.debug("Master node {}".format(master_node_index)) + + node_to_restart = None + + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + wc = kikimr.compute_plane.get_worker_count(node_index) + if wc is not None: + if wc > 0 and node_to_restart is None: + node_to_restart = node_index + assert node_to_restart is not None, "Can't find any task on node" + + logging.debug("Restart compute node {}".format(node_to_restart)) + + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].stop() + kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() + kikimr.compute_plane.wait_bootstrap(node_to_restart) + @yq_v1 @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) def test_program_state_recovery(self, kikimr, client, yq_version): @@ -121,20 +143,7 @@ def test_program_state_recovery(self, kikimr, client, yq_version): query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 ) - # restart node with CA - node_to_restart = None - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - wc = kikimr.control_plane.get_worker_count(node_index) - if wc is not None: - if wc > 0 and node_index != master_node_index and node_to_restart is None: - node_to_restart = node_index - assert node_to_restart is not None, "Can't find any task on non master node" - - logging.debug("Restart non-master node {}".format(node_to_restart)) - - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].stop() - kikimr.control_plane.kikimr_cluster.nodes[node_to_restart].start() - kikimr.control_plane.wait_bootstrap(node_to_restart) + self.restart_node(kikimr, query_id) messages2 = [ '{"dt": 1696849942800000}', @@ -158,3 +167,84 @@ def test_program_state_recovery(self, kikimr, client, yq_version): client.wait_query(query_id) self.dump_workers(kikimr, 0, 0) + + + @yq_v1 + @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) + def test_match_recognize(self, kikimr, client, yq_version): + + self.init_topics("test_match_recognize_save_load_state") + + sql = R''' + PRAGMA dq.MaxTasksPerStage="2"; + + pragma FeatureR010="prototype"; + pragma config.flags("TimeOrderRecoverDelay", "-1000000"); + pragma config.flags("TimeOrderRecoverAhead", "1000000"); + + INSERT INTO myyds.`{output_topic}` + SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) + FROM (SELECT * FROM myyds.`{input_topic}` + WITH ( + format=json_each_row, + SCHEMA + ( + dt UINT64, + str STRING + ))) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(A.dt) as dt_begin, + LAST(C.dt) as dt_end, + LAST(A.str) as a_str, + LAST(B.str) as b_str, + LAST(C.str) as c_str + ONE ROW PER MATCH + PATTERN ( A B C ) + DEFINE + A as A.str='A', + B as B.str='B', + C as C.str='C')''' \ + .format( + input_topic=self.input_topic, + output_topic=self.output_topic, + ) + + client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + kikimr.compute_plane.wait_zero_checkpoint(query_id) + + messages1 = [ + '{"dt": 1696849942000001, "str": "A" }', + '{"dt": 1696849942500001, "str": "B" }', + '{"dt": 1696849943000001, "str": "C" }', + '{"dt": 1696849943600001, "str": "D" }'] # push A+B from TimeOrderRecoverer to MatchRecognize + self.write_stream(messages1) + + # A + B : in MatchRecognize + # C + D : in TimeOrderRecoverer + + logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + + self.restart_node(kikimr, query_id) + + self.write_stream(['{"dt": 1696849944100001, "str": "E" }']) + + assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING + + expected = ['{"a_str":"A","b_str":"B","c_str":"C","dt_begin":1696849942000001,"dt_end":1696849943000001}'] + + read_data = self.read_stream(1) + logging.info("Data was read: {}".format(read_data)) + + assert read_data == expected + + client.abort_query(query_id) + client.wait_query(query_id) + + self.dump_workers(kikimr, 0, 0) From cd2cc7eba6709620b4d318a24cc91a519261e396 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Feb 2024 09:28:56 +0000 Subject: [PATCH 12/19] fix key --- .../comp_nodes/mkql_match_recognize.cpp | 88 ++++---- .../comp_nodes/mkql_match_recognize_list.h | 4 +- .../mkql_match_recognize_matched_vars.h | 3 +- .../comp_nodes/mkql_match_recognize_nfa.h | 16 +- .../mkql_match_recognize_save_load.h | 22 +- ydb/tests/fq/kikimr/conftest.py | 71 ------- ydb/tests/fq/kikimr/test_base.py | 26 --- ydb/tests/fq/kikimr/test_recovery_mz.py | 195 ------------------ ydb/tests/fq/kikimr/ya.make | 38 ---- .../fq/yds/test_recovery_match_recognize.py | 1 + 10 files changed, 64 insertions(+), 400 deletions(-) delete mode 100644 ydb/tests/fq/kikimr/conftest.py delete mode 100644 ydb/tests/fq/kikimr/test_base.py delete mode 100644 ydb/tests/fq/kikimr/test_recovery_mz.py delete mode 100644 ydb/tests/fq/kikimr/ya.make diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 8a5cf6d1bd18..ae7397702b18 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -28,7 +28,7 @@ using namespace NYql::NMatchRecognize; struct TMatchRecognizeProcessorParameters { IComputationExternalNode* InputDataArg; - NYql::NMatchRecognize::TRowPattern Pattern; + TRowPattern Pattern; TUnboxedValueVector VarNames; THashMap VarNamesLookup; IComputationExternalNode* MatchedVarsArg; @@ -130,16 +130,11 @@ class TBackTrackingMatchRecognize { } void Save(TOutputSerializer& /*serializer*/) const { - // // TODO : PartitionKey - // Rows.Save(serializer); - // Nfa.Save(serializer); - // serializer.Write(MatchNumber); + // Not used in not streaming mode. } void Load(TInputSerializer& /*serializer*/) { - // Rows.Load(serializer); - // Nfa.Load(serializer); - // MatchNumber = serializer.Read(); + // Not used in not streaming mode. } private: @@ -178,7 +173,6 @@ class TStreamingMatchRecognize { } bool HasMatched() const { - return Nfa.HasMatched(); } @@ -216,13 +210,14 @@ class TStreamingMatchRecognize { } void Save(TOutputSerializer& serializer) const { - // TODO : PartitionKey + // PartitionKey saved in TStateForInterleavedPartitions as key. Rows.Save(serializer); Nfa.Save(serializer); serializer.Write(MatchNumber); } void Load(TInputSerializer& serializer) { + // PartitionKey passed in contructor. Rows.Load(serializer); Nfa.Load(serializer); MatchNumber = serializer.Read(); @@ -251,8 +246,8 @@ class TStateForNonInterleavedPartitions const TMatchRecognizeProcessorParameters& parameters, const TContainerCacheOnContext& cache, TComputationContext &ctx, - TType* stateType, - const TMutableObjectOverBoxedValue& packer + TType* rowType, + const TMutableObjectOverBoxedValue& rowPacker ) : TComputationValue(memInfo) , InputRowArg(inputRowArg) @@ -262,11 +257,11 @@ class TStateForNonInterleavedPartitions , RowPatternConfiguration(TRowPatternConfigurationBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) , Terminating(false) - , SaveLoadContex(ctx, stateType, packer) + , SerializerContext(ctx, rowType, rowPacker) {} NUdf::TUnboxedValue Save() const override { - TOutputSerializer serializer(SaveLoadContex); + TOutputSerializer serializer(SerializerContext); serializer.Write(StateVersion); serializer.Write(CurPartitionPackedKey); bool isValid = static_cast(PartitionHandler); @@ -284,14 +279,15 @@ class TStateForNonInterleavedPartitions void Load(const NUdf::TStringRef& state) override { - TInputSerializer serializer(SaveLoadContex, state); + TInputSerializer serializer(SerializerContext, state); const auto stateVersion = serializer.Read(); if (stateVersion == 1) { serializer.Read(CurPartitionPackedKey); bool validPartitionHandler = serializer.Read(); if (validPartitionHandler) { + NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(CurPartitionPackedKey, SerializerContext.Ctx.HolderFactory); PartitionHandler.reset(new Algo( - NYql::NUdf::TUnboxedValuePod(NYql::NUdf::TStringValue("asd")),// TODO + std::move(key), Parameters, RowPatternConfiguration, Cache @@ -321,7 +317,6 @@ class TStateForNonInterleavedPartitions if (PartitionHandler) { return PartitionHandler->ProcessEndOfData(ctx); } - //be aware that the very first partition is created in the same manner as subsequent return false; } @@ -374,7 +369,7 @@ class TStateForNonInterleavedPartitions const TContainerCacheOnContext& Cache; NUdf::TUnboxedValue DelayedRow; bool Terminating; - TSerializerContext SaveLoadContex; + TSerializerContext SerializerContext; }; class TStateForInterleavedPartitions @@ -391,8 +386,8 @@ class TStateForInterleavedPartitions const TMatchRecognizeProcessorParameters& parameters, const TContainerCacheOnContext& cache, TComputationContext &ctx, - TType* stateType, - const TMutableObjectOverBoxedValue& packer + TType* rowType, + const TMutableObjectOverBoxedValue& rowPacker ) : TComputationValue(memInfo) , InputRowArg(inputRowArg) @@ -401,11 +396,11 @@ class TStateForInterleavedPartitions , Parameters(parameters) , NfaTransitionGraph(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) - , SaveLoadContex(ctx, stateType, packer) + , SerializerContext(ctx, rowType, rowPacker) {} NUdf::TUnboxedValue Save() const override { - TOutputSerializer serializer(SaveLoadContex); + TOutputSerializer serializer(SerializerContext); serializer.Write(StateVersion); serializer.Write(Partitions.size()); @@ -419,29 +414,30 @@ class TStateForInterleavedPartitions void Load(const NUdf::TStringRef& state) override { - TInputSerializer serializer(SaveLoadContex, state); + TInputSerializer serializer(SerializerContext, state); const auto stateVersion = serializer.Read(); if (stateVersion == 1) { Partitions.clear(); auto partitionsSize = serializer.Read(); for (size_t i = 0; i < partitionsSize; ++i) { - auto key = serializer.Read(); - auto pair = Partitions.emplace(key, std::make_unique( - NYql::NUdf::TUnboxedValuePod(NYql::NUdf::TStringValue(key)), - Parameters, - NfaTransitionGraph, - Cache)); + auto packedKey = serializer.Read(); + NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(packedKey, SerializerContext.Ctx.HolderFactory); + auto pair = Partitions.emplace( + packedKey, + std::make_unique( + std::move(key), + Parameters, + NfaTransitionGraph, + Cache)); (pair.first)->second->Load(serializer); } - // std::cerr << "partitionsSize " << partitionsSize << std::endl; - // for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { - // std::cerr << "it->second->HasMatched() " << it->second->HasMatched() << std::endl; - // if (it->second->HasMatched()) { - // HasReadyOutput.push(it); - // } - // } + for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { + if (it->second->HasMatched()) { + HasReadyOutput.push(it); + } + } serializer.Read(Terminating); } MKQL_ENSURE(serializer.Empty(), "State is corrupted"); @@ -509,7 +505,7 @@ class TStateForInterleavedPartitions const TMatchRecognizeProcessorParameters& Parameters; const TNfaTransitionGraph::TPtr NfaTransitionGraph; const TContainerCacheOnContext& Cache; - TSerializerContext SaveLoadContex; + TSerializerContext SerializerContext; }; template @@ -521,7 +517,7 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode Packer; + TType* const RowType; + TMutableObjectOverBoxedValue RowPacker; }; TOutputColumnOrder GetOutputColumnOrder(TRuntimeNode partitionKyeColumnsIndexes, TRuntimeNode measureColumnsIndexes) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index 3e32ba75264e..d0a20ea33e7b 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -94,7 +94,6 @@ class TSparseList { class TContainer: public TSimpleRefCount { public: - using TPtr = TIntrusivePtr; void Add(size_t index, NUdf::TUnboxedValue&& value) { @@ -178,7 +177,8 @@ class TSparseList { : Container() , FromIndex(-1) , ToIndex(-1) - {} + { + } TRange(const TRange& other) : Container(other.Container) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h index 60d5a9117e3b..92fd32efe71e 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_matched_vars.h @@ -84,7 +84,8 @@ class TMatchedVarsValue : public TComputationValue> { : TComputationValue(memInfo) , HolderFactory(holderFactory) , Var(v) - {} + { + } bool HasFastListLength() const override { return true; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 80b5eceefa89..a283e9ad8a78 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -75,9 +75,6 @@ struct TNfaTransitionDestinationVisitor { } }; -template -inline constexpr bool always_false_v = false; - struct TNfaTransitionGraph { using TTransitions = std::vector>; @@ -87,6 +84,9 @@ struct TNfaTransitionGraph { using TPtr = std::shared_ptr; + template + inline constexpr static bool always_false_v = false; + void Save(TOutputSerializer& serializer) const { serializer.Write(Transitions.size()); @@ -387,16 +387,14 @@ class TNfa { auto varsSize = serializer.Read(); Vars.clear(); Vars.resize(varsSize); - for (size_t i = 0; i < varsSize; ++i) { - auto& subvec = Vars[i]; + for (auto& subvec: Vars) { ui64 vectorSize = serializer.Read(); subvec.resize(vectorSize); - for (size_t j = 0; j < vectorSize; ++j) { - subvec[j].Load(serializer); + for (auto& item : subvec) { + item.Load(serializer); } } - - while (!Quantifiers.empty()) { + while (!Quantifiers.empty()) { // Clearing. Quantifiers.pop(); } auto quantifiersSize = serializer.Read(); diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h index efa2f45bdf77..540bac830c56 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h @@ -10,12 +10,12 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { struct TSerializerContext { TComputationContext& Ctx; - TType* StateType; - const TMutableObjectOverBoxedValue& Packer; + TType* RowType; + const TMutableObjectOverBoxedValue& RowPacker; }; template -inline constexpr bool always_false_v2 = false; +inline constexpr bool always_false_v = false; struct TOutputSerializer { private: @@ -41,13 +41,11 @@ struct TOutputSerializer { WriteByte(Buf, value); } else if constexpr (std::is_same_v, ui32>) { WriteUi32(Buf, value); - } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { - WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); - } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { - WriteUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), value); + } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { // Only Row type (StateType) supported ! + WriteUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), value); } else { - static_assert(always_false_v2, "Not supported type / not implemented"); + static_assert(always_false_v, "Not supported type / not implemented"); } } @@ -122,10 +120,10 @@ struct TInputSerializer { } else if constexpr (std::is_same_v, ui32>) { return ReadUi32(Buf); } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { - return ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); + return ReadUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), Context.Ctx); } else { - static_assert(always_false_v2, "Not supported type / not implemented"); + static_assert(always_false_v, "Not supported type / not implemented"); } } @@ -142,10 +140,10 @@ struct TInputSerializer { } else if constexpr (std::is_same_v, ui32>) { value = ReadUi32(Buf); } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { - value = ReadUnboxedValue(Buf, Context.Packer.RefMutableObject(Context.Ctx, false, Context.StateType), Context.Ctx); + value = ReadUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), Context.Ctx); } else { - static_assert(always_false_v2, "Not supported type / not implemented"); + static_assert(always_false_v, "Not supported type / not implemented"); } } diff --git a/ydb/tests/fq/kikimr/conftest.py b/ydb/tests/fq/kikimr/conftest.py deleted file mode 100644 index 687e11205a7c..000000000000 --- a/ydb/tests/fq/kikimr/conftest.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import pytest - -from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient -from ydb.tests.tools.fq_runner.custom_hooks import * # noqa: F401,F403 Adding custom hooks for YQv2 support -from ydb.tests.tools.fq_runner.kikimr_utils import ExtensionPoint -from ydb.tests.tools.fq_runner.kikimr_utils import YQv2Extension -from ydb.tests.tools.fq_runner.kikimr_utils import ComputeExtension -from ydb.tests.tools.fq_runner.kikimr_utils import DefaultConfigExtension -from ydb.tests.tools.fq_runner.kikimr_utils import StatsModeExtension -from ydb.tests.tools.fq_runner.kikimr_utils import start_kikimr - - -@pytest.fixture -def stats_mode(): - return '' - - -@pytest.fixture -def kikimr(request: pytest.FixtureRequest, yq_version: str, stats_mode: str): - kikimr_extensions = [DefaultConfigExtension(""), - YQv2Extension(yq_version), - ComputeExtension(), - StatsModeExtension(stats_mode)] - with start_kikimr(request, kikimr_extensions) as kikimr: - yield kikimr - - -class ManyRetriesConfigExtension(ExtensionPoint): - def __init__(self): - super().__init__() - - def is_applicable(self, request): - return True - - def apply_to_kikimr(self, request, kikimr): - kikimr.compute_plane.fq_config['control_plane_storage']['retry_policy_mapping'] = [ - { - 'status_code': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], - 'policy': { - 'retry_count': 10000 - } - } - ] - - -@pytest.fixture -def kikimr_many_retries(request: pytest.FixtureRequest, yq_version: str): - kikimr_extensions = [DefaultConfigExtension(""), - ManyRetriesConfigExtension(), - YQv2Extension(yq_version), - ComputeExtension()] - with start_kikimr(request, kikimr_extensions) as kikimr: - yield kikimr - - -def create_client(kikimr, request): - return FederatedQueryClient(request.param["folder_id"] if request is not None else "my_folder", - streaming_over_kikimr=kikimr) - - -@pytest.fixture -def client(kikimr, request=None): - return create_client(kikimr, request) - - -@pytest.fixture -def client_many_retries(kikimr_many_retries, request=None): - return create_client(kikimr_many_retries, request) diff --git a/ydb/tests/fq/kikimr/test_base.py b/ydb/tests/fq/kikimr/test_base.py deleted file mode 100644 index 6b09f7a702e9..000000000000 --- a/ydb/tests/fq/kikimr/test_base.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig -from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase - - -class TestBaseWithAbortingConfigParams(TestYdsBase): - - @classmethod - def setup_class(cls): - kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True) - cls.streaming_over_kikimr = StreamingOverKikimr(kikimr_conf) - cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_ttl'] = "2s" - cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy'] = {} - cls.streaming_over_kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_count'] = 1 - cls.streaming_over_kikimr.compute_plane.fq_config['pinger']['ping_period'] = "1s" - cls.streaming_over_kikimr.start_mvp_mock_server() - cls.streaming_over_kikimr.start() - - @classmethod - def teardown_class(cls): - if hasattr(cls, "streaming_over_kikimr"): - cls.streaming_over_kikimr.stop_mvp_mock_server() - cls.streaming_over_kikimr.stop() diff --git a/ydb/tests/fq/kikimr/test_recovery_mz.py b/ydb/tests/fq/kikimr/test_recovery_mz.py deleted file mode 100644 index a2f76db97f90..000000000000 --- a/ydb/tests/fq/kikimr/test_recovery_mz.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import logging -import time -import pytest -import random -import os -import yatest - -import ydb.tests.library.common.yatest_common as yatest_common -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr -from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig -from ydb.tests.tools.fq_runner.kikimr_runner import TenantConfig -from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 -from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase - -import library.python.retry as retry -import ydb.public.api.protos.draft.fq_pb2 as fq - - -@pytest.fixture -def kikimr(): - kikimr_conf = StreamingOverKikimrConfig( - cloud_mode=True, - node_count={"/cp": TenantConfig(1), - "/compute": TenantConfig(8)}) - kikimr = StreamingOverKikimr(kikimr_conf) - # control - kikimr.control_plane.fq_config['control_plane_storage']['mapping'] = {"common_tenant_name": ["/compute"]} - kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy'] = {} - kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_count'] = 5 - kikimr.control_plane.fq_config['control_plane_storage']['task_lease_retry_policy']['retry_period'] = "30s" - kikimr.control_plane.fq_config['control_plane_storage']['task_lease_ttl'] = "3s" - # compute - kikimr.compute_plane.fq_config['pinger']['ping_period'] = "1s" - kikimr.start_mvp_mock_server() - kikimr.start() - yield kikimr - kikimr.stop() - kikimr.stop_mvp_mock_server() - - -def run_with_sleep(args): - program_args, time_min, time_max, duration = args - deadline = time.time() + duration - while time.time() < deadline: - yatest.common.execute(program_args) - time.sleep(random.uniform(time_min, time_max)) - - -class TestRecovery(TestYdsBase): - - @retry.retry_intrusive - def get_graph_master_node_id(self, query_id): - for node_index in self.kikimr.compute_plane.kikimr_cluster.nodes: - if self.kikimr.compute_plane.get_task_count(node_index, query_id) > 0: - return node_index - assert False, "No active graphs found" - - def get_ca_count(self, node_index): - result = self.kikimr.compute_plane.get_sensors(node_index, "utils").find_sensor({"activity": "DQ_COMPUTE_ACTOR", "sensor": "ActorsAliveByActivity", "execpool": "User"}) - return result if result is not None else 0 - - def dump_workers(self, worker_count, ca_count, wait_time=yatest_common.plain_or_under_sanitizer(30, 150)): - deadline = time.time() + wait_time - while True: - wcs = 0 - ccs = 0 - list = [] - for node_index in self.kikimr.compute_plane.kikimr_cluster.nodes: - wc = self.kikimr.compute_plane.get_worker_count(node_index) - cc = self.get_ca_count(node_index) - wcs += wc - ccs += cc - list.append([node_index, wc, cc]) - if wcs == worker_count and ccs == ca_count: - for [s, w, c] in list: - if w * 2 != c: - continue - for [s, w, c] in list: - logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) - return - if time.time() > deadline: - for [s, w, c] in list: - logging.debug("Node {}, workers {}, ca {}".format(s, w, c)) - assert False, "Workers={} and CAs={}, but {} and {} expected".format(wcs, ccs, worker_count, ca_count) - - @yq_v1 - def test_recovery(self, kikimr, client, yq_version): - self.init_topics(f"pq_kikimr_streaming_{yq_version}", partitions_count=2) - - self.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) - self.kikimr = kikimr - kikimr.compute_plane.wait_bootstrap() - kikimr.compute_plane.wait_discovery() - - # Consumer and topics to create are written in ya.make file. - sql = R''' - PRAGMA dq.MaxTasksPerStage="2"; - - INSERT INTO myyds.`{output_topic}` - SELECT STREAM - * - FROM myyds.`{input_topic}`;'''\ - .format( - input_topic=self.input_topic, - output_topic=self.output_topic, - ) - client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) - query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id - client.wait_query_status(query_id, fq.QueryMeta.RUNNING) - self.kikimr.compute_plane.wait_zero_checkpoint(query_id) - - logging.debug("Uuid = {}".format(kikimr.uuid)) - master_node_index = self.get_graph_master_node_id(query_id) - logging.debug("Master node {}".format(master_node_index)) - - self.write_stream([str(i) for i in range(1, 11)]) - - read_data = self.read_stream(10) - - for message in read_data: - logging.info("Received message: {}".format(message)) - - assert len(read_data) == 10 - - d = {} - for m in read_data: - n = int(m) - assert n >= 1 and n <= 10 - assert n not in d - d[n] = 1 - - self.dump_workers(2, 4) - - node_to_restart = None - for node_index in kikimr.compute_plane.kikimr_cluster.nodes: - wc = kikimr.compute_plane.get_worker_count(node_index) - if wc is not None: - if wc > 0 and node_index != master_node_index and node_to_restart is None: - node_to_restart = node_index - assert node_to_restart is not None, "Can't find any task on non master node" - - logging.debug("Restart non-master node {}".format(node_to_restart)) - - kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].stop() - kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() - kikimr.compute_plane.wait_bootstrap(node_to_restart) - - self.dump_workers(2, 4) - - self.write_stream([str(i) for i in range(11, 21)]) - - read_data = self.read_stream(10) - assert len(read_data) == 10 - - for m in read_data: - n = int(m) - assert n >= 1 and n <= 20 - if n in d: - d[n] = d[n] + 1 - else: - d[n] = 1 - - logging.debug("Restart Master node {}".format(master_node_index)) - - kikimr.compute_plane.kikimr_cluster.nodes[master_node_index].stop() - kikimr.compute_plane.kikimr_cluster.nodes[master_node_index].start() - kikimr.compute_plane.wait_bootstrap(master_node_index) - master_node_index = self.get_graph_master_node_id(query_id) - - logging.debug("New master node {}".format(master_node_index)) - - self.dump_workers(2, 4) - - self.write_stream([str(i) for i in range(21, 31)]) - - read_data = self.read_stream(10) - assert len(read_data) == 10 - - for m in read_data: - n = int(m) - assert n >= 1 and n <= 30 - if n in d: - d[n] = d[n] + 1 - else: - d[n] = 1 - - zero_checkpoints_metric = kikimr.compute_plane.get_checkpoint_coordinator_metric(query_id, "StartedFromEmptyCheckpoint") - restored_metric = kikimr.compute_plane.get_checkpoint_coordinator_metric(query_id, "RestoredFromSavedCheckpoint") - assert restored_metric >= 1, "RestoredFromSavedCheckpoint: {}, StartedFromEmptyCheckpoint: {}".format(restored_metric, zero_checkpoints_metric) - - client.abort_query(query_id) - client.wait_query(query_id) diff --git a/ydb/tests/fq/kikimr/ya.make b/ydb/tests/fq/kikimr/ya.make deleted file mode 100644 index 3bc2c637e1ac..000000000000 --- a/ydb/tests/fq/kikimr/ya.make +++ /dev/null @@ -1,38 +0,0 @@ -PY3TEST() - -FORK_SUBTESTS() -SPLIT_FACTOR(50) - -INCLUDE(${ARCADIA_ROOT}/ydb/tests/tools/fq_runner/ydb_runner_with_datastreams.inc) - -PEERDIR( - ydb/public/api/protos - ydb/public/api/grpc - ydb/tests/tools/datastreams_helpers - ydb/tests/tools/fq_runner -) - -DEPENDS(ydb/tests/tools/pq_read) - -PY_SRCS( - conftest.py - test_base.py -) - -TEST_SRCS( - test_recovery_match_recognize.py - test_recovery_mz.py -) - -IF (SANITIZER_TYPE == "thread") - TIMEOUT(2400) - SIZE(LARGE) - TAG(ya:fat) -ELSE() - TIMEOUT(600) - SIZE(MEDIUM) -ENDIF() - -REQUIREMENTS(ram:16) - -END() diff --git a/ydb/tests/fq/yds/test_recovery_match_recognize.py b/ydb/tests/fq/yds/test_recovery_match_recognize.py index 9da918823f5b..4670013bb4bf 100644 --- a/ydb/tests/fq/yds/test_recovery_match_recognize.py +++ b/ydb/tests/fq/yds/test_recovery_match_recognize.py @@ -181,6 +181,7 @@ def test_match_recognize(self, kikimr, client, yq_version): pragma FeatureR010="prototype"; pragma config.flags("TimeOrderRecoverDelay", "-1000000"); pragma config.flags("TimeOrderRecoverAhead", "1000000"); + pragma config.flags("MatchRecognizeStream", "auto"); INSERT INTO myyds.`{output_topic}` SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) From b57d57eeb41256523847c5f2977d439955610038 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Feb 2024 09:56:48 +0000 Subject: [PATCH 13/19] edit tests --- ydb/tests/fq/ya.make | 1 - .../fq/yds/test_recovery_match_recognize.py | 151 +++++++----------- 2 files changed, 62 insertions(+), 90 deletions(-) diff --git a/ydb/tests/fq/ya.make b/ydb/tests/fq/ya.make index 8fe31a66c9af..734e38f7f709 100644 --- a/ydb/tests/fq/ya.make +++ b/ydb/tests/fq/ya.make @@ -2,7 +2,6 @@ RECURSE_FOR_TESTS( common generic http_api - kikimr mem_alloc multi_plane plans diff --git a/ydb/tests/fq/yds/test_recovery_match_recognize.py b/ydb/tests/fq/yds/test_recovery_match_recognize.py index 4670013bb4bf..04d93a8631b5 100644 --- a/ydb/tests/fq/yds/test_recovery_match_recognize.py +++ b/ydb/tests/fq/yds/test_recovery_match_recognize.py @@ -73,10 +73,6 @@ def dump_workers(self, kikimr, worker_count, ca_count, wait_time=yatest_common.p def restart_node(self, kikimr, query_id): # restart node with CA - - # master_node_index = self.get_graph_master_node_id(kikimr, query_id) - # logging.debug("Master node {}".format(master_node_index)) - node_to_restart = None for node_index in kikimr.compute_plane.kikimr_cluster.nodes: @@ -92,40 +88,12 @@ def restart_node(self, kikimr, query_id): kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() kikimr.compute_plane.wait_bootstrap(node_to_restart) - @yq_v1 - @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) - def test_program_state_recovery(self, kikimr, client, yq_version): - self.init_topics(f"pq_kikimr_streaming_{yq_version}") + def recovery_impl(self, kikimr, client, yq_version, sql_template, test_name, messages_before_restart, messages_after_restart, expected): - sql = R''' - PRAGMA dq.MaxTasksPerStage="2"; + self.init_topics(f"{test_name}_{yq_version}") - pragma FeatureR010="prototype"; - pragma config.flags("TimeOrderRecoverDelay", "-1000000"); - pragma config.flags("TimeOrderRecoverAhead", "1000000"); - - INSERT INTO myyds.`{output_topic}` - SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) - FROM (SELECT * FROM myyds.`{input_topic}` - WITH ( - format=json_each_row, - SCHEMA - ( - dt UINT64 - ))) - MATCH_RECOGNIZE( - ORDER BY CAST(dt as Timestamp) - MEASURES - LAST(ALL_TRUE.dt) as dt - ONE ROW PER MATCH - PATTERN ( ALL_TRUE ) - DEFINE - ALL_TRUE as True)''' \ - .format( - input_topic=self.input_topic, - output_topic=self.output_topic, - ) + sql = sql_template.format(self.input_topic, self.output_topic); client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id @@ -135,8 +103,7 @@ def test_program_state_recovery(self, kikimr, client, yq_version): master_node_index = self.get_graph_master_node_id(kikimr, query_id) logging.debug("Master node {}".format(master_node_index)) - messages1 = ['{"dt": 1696849942400002}', '{"dt": 1696849942000001}'] - self.write_stream(messages1) + self.write_stream(messages_before_restart) logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) kikimr.compute_plane.wait_completed_checkpoints( @@ -144,20 +111,10 @@ def test_program_state_recovery(self, kikimr, client, yq_version): ) self.restart_node(kikimr, query_id) - - messages2 = [ - '{"dt": 1696849942800000}', - '{"dt": 1696849943200003}', - '{"dt": 1696849943300003}', - '{"dt": 1696849943600003}', - '{"dt": 1696849943900003}' - ] - self.write_stream(messages2) + self.write_stream(messages_after_restart) assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING - expected = ['{"dt":1696849942000001}', '{"dt":1696849942400002}', '{"dt":1696849942800000}'] - read_data = self.read_stream(len(expected)) logging.info("Data was read: {}".format(read_data)) @@ -165,15 +122,56 @@ def test_program_state_recovery(self, kikimr, client, yq_version): client.abort_query(query_id) client.wait_query(query_id) - self.dump_workers(kikimr, 0, 0) - @yq_v1 @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) - def test_match_recognize(self, kikimr, client, yq_version): + def test_time_order_recoverer(self, kikimr, client, yq_version, request): - self.init_topics("test_match_recognize_save_load_state") + sql = R''' + PRAGMA dq.MaxTasksPerStage="2"; + + pragma FeatureR010="prototype"; + pragma config.flags("TimeOrderRecoverDelay", "-1000000"); + pragma config.flags("TimeOrderRecoverAhead", "1000000"); + + INSERT INTO myyds.`{1}` + SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) + FROM (SELECT * FROM myyds.`{0}` + WITH ( + format=json_each_row, + SCHEMA + ( + dt UINT64 + ))) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(ALL_TRUE.dt) as dt + ONE ROW PER MATCH + PATTERN ( ALL_TRUE ) + DEFINE + ALL_TRUE as True)''' + + messages_before_restart = [ + '{"dt":1696849942400002}', + '{"dt":1696849942000001}'] + messages_after_restart = [ + '{"dt":1696849942800000}', + '{"dt":1696849943200003}', + '{"dt":1696849943300003}', + '{"dt":1696849943600003}', + '{"dt":1696849943900003}'] + expected = [ + '{"dt":1696849942000001}', + '{"dt":1696849942400002}', + '{"dt":1696849942800000}'] + + self.recovery_impl(kikimr, client, yq_version, sql, request.node.name, messages_before_restart, messages_after_restart, expected) + + @yq_v1 + @pytest.mark.parametrize("kikimr", [(None, None, None)], indirect=["kikimr"]) + def test_match_recognize(self, kikimr, client, yq_version, request): sql = R''' PRAGMA dq.MaxTasksPerStage="2"; @@ -183,9 +181,9 @@ def test_match_recognize(self, kikimr, client, yq_version): pragma config.flags("TimeOrderRecoverAhead", "1000000"); pragma config.flags("MatchRecognizeStream", "auto"); - INSERT INTO myyds.`{output_topic}` + INSERT INTO myyds.`{1}` SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) - FROM (SELECT * FROM myyds.`{input_topic}` + FROM (SELECT * FROM myyds.`{0}` WITH ( format=json_each_row, SCHEMA @@ -206,46 +204,21 @@ def test_match_recognize(self, kikimr, client, yq_version): DEFINE A as A.str='A', B as B.str='B', - C as C.str='C')''' \ - .format( - input_topic=self.input_topic, - output_topic=self.output_topic, - ) - - client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) - query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id - client.wait_query_status(query_id, fq.QueryMeta.RUNNING) - kikimr.compute_plane.wait_zero_checkpoint(query_id) + C as C.str='C')''' - messages1 = [ + messages_before_restart = [ '{"dt": 1696849942000001, "str": "A" }', '{"dt": 1696849942500001, "str": "B" }', '{"dt": 1696849943000001, "str": "C" }', '{"dt": 1696849943600001, "str": "D" }'] # push A+B from TimeOrderRecoverer to MatchRecognize - self.write_stream(messages1) - - # A + B : in MatchRecognize - # C + D : in TimeOrderRecoverer - - logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) - kikimr.compute_plane.wait_completed_checkpoints( - query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 - ) - self.restart_node(kikimr, query_id) + # Before restart: + # A + B : in MatchRecognize + # C + D : in TimeOrderRecoverer - self.write_stream(['{"dt": 1696849944100001, "str": "E" }']) - - assert client.get_query_status(query_id) == fq.QueryMeta.RUNNING + messages_after_restart = [ + '{"dt": 1696849944100001, "str": "E" }'] + expected = [ + '{"a_str":"A","b_str":"B","c_str":"C","dt_begin":1696849942000001,"dt_end":1696849943000001}'] + self.recovery_impl(kikimr, client, yq_version, sql, request.node.name, messages_before_restart, messages_after_restart, expected) - expected = ['{"a_str":"A","b_str":"B","c_str":"C","dt_begin":1696849942000001,"dt_end":1696849943000001}'] - - read_data = self.read_stream(1) - logging.info("Data was read: {}".format(read_data)) - - assert read_data == expected - - client.abort_query(query_id) - client.wait_query(query_id) - - self.dump_workers(kikimr, 0, 0) From f2329a303e2b06bf8c9238ca2213961dbcab05cc Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Feb 2024 10:30:51 +0000 Subject: [PATCH 14/19] style fix --- .../yql/minikql/comp_nodes/mkql_match_recognize_list.h | 5 ++++- .../yql/minikql/comp_nodes/mkql_match_recognize_nfa.h | 6 ++---- .../yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp | 4 ---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index d0a20ea33e7b..edd92c721da2 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -1,11 +1,14 @@ #pragma once + +#include "mkql_match_recognize_save_load.h" + #include #include #include -#include #include #include #include + namespace NKikimr::NMiniKQL::NMatchRecognize { class TSimpleList { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index a283e9ad8a78..09742de16d86 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -1,9 +1,9 @@ #pragma once #include "mkql_match_recognize_matched_vars.h" +#include "mkql_match_recognize_save_load.h" #include "../computation/mkql_computation_node_holders.h" #include "../computation/mkql_computation_node_impl.h" -#include #include #include #include @@ -394,9 +394,7 @@ class TNfa { item.Load(serializer); } } - while (!Quantifiers.empty()) { // Clearing. - Quantifiers.pop(); - } + Quantifiers.clear(); auto quantifiersSize = serializer.Read(); for (size_t i = 0; i < quantifiersSize; ++i) { ui64 qnt = serializer.Read(); diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp index 62a54ae065ba..08a675535d08 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp +++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp @@ -51,11 +51,9 @@ namespace NKikimr { TIntrusivePtr FunctionRegistry; TIntrusivePtr RandomProvider; TIntrusivePtr TimeProvider; - TScopedAlloc& Alloc; THolder Env; THolder PgmBuilder; - TExploringNodeVisitor Explorer; IComputationPattern::TPtr Pattern; }; @@ -133,7 +131,6 @@ namespace NKikimr { TSetup setup1(alloc); const TTestInputData input = { - // Time; Key; Value; PartitionKey {1000, "A", 101, "P"}, {1001, "B", 102, "P"}, {1002, "C", 103, "P"}, // <- match end @@ -161,7 +158,6 @@ namespace NKikimr { UNIT_ASSERT_VALUES_EQUAL(56, v); } - Y_UNIT_TEST(StreamingMode) { TestWithSaveLoadImpl(true); } From 2a4464240c0146420b93b0a92afa014d23fd309b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 13 Feb 2024 13:54:43 +0000 Subject: [PATCH 15/19] change config --- ydb/tests/fq/yds/test_recovery_match_recognize.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/ydb/tests/fq/yds/test_recovery_match_recognize.py b/ydb/tests/fq/yds/test_recovery_match_recognize.py index 04d93a8631b5..c38bc37f2423 100644 --- a/ydb/tests/fq/yds/test_recovery_match_recognize.py +++ b/ydb/tests/fq/yds/test_recovery_match_recognize.py @@ -9,6 +9,7 @@ import ydb.tests.library.common.yatest_common as yatest_common from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +from ydb.tests.tools.fq_runner.kikimr_runner import TenantConfig import library.python.retry as retry from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase @@ -17,7 +18,7 @@ @pytest.fixture def kikimr(request): - kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count=2) + kikimr_conf = StreamingOverKikimrConfig(cloud_mode=True, node_count={"/cp": TenantConfig(1), "/compute": TenantConfig(1)}) kikimr = StreamingOverKikimr(kikimr_conf) kikimr.start_mvp_mock_server() kikimr.start() @@ -33,13 +34,6 @@ def setup_class(cls): # for retry cls.retry_conf = retry.RetryConf().upto(seconds=30).waiting(0.1) - @retry.retry_intrusive - def get_graph_master_node_id(self, kikimr, query_id): - for node_index in kikimr.control_plane.kikimr_cluster.nodes: - if kikimr.control_plane.get_task_count(node_index, query_id) > 0: - return node_index - assert False, "No active graphs found" - def get_ca_count(self, kikimr, node_index): result = kikimr.control_plane.get_sensors(node_index, "utils").find_sensor( {"activity": "DQ_COMPUTE_ACTOR", "sensor": "ActorsAliveByActivity", "execpool": "User"} @@ -88,7 +82,6 @@ def restart_node(self, kikimr, query_id): kikimr.compute_plane.kikimr_cluster.nodes[node_to_restart].start() kikimr.compute_plane.wait_bootstrap(node_to_restart) - def recovery_impl(self, kikimr, client, yq_version, sql_template, test_name, messages_before_restart, messages_after_restart, expected): self.init_topics(f"{test_name}_{yq_version}") @@ -100,9 +93,6 @@ def recovery_impl(self, kikimr, client, yq_version, sql_template, test_name, mes client.wait_query_status(query_id, fq.QueryMeta.RUNNING) kikimr.compute_plane.wait_zero_checkpoint(query_id) - master_node_index = self.get_graph_master_node_id(kikimr, query_id) - logging.debug("Master node {}".format(master_node_index)) - self.write_stream(messages_before_restart) logging.debug("get_completed_checkpoints {}".format(kikimr.compute_plane.get_completed_checkpoints(query_id))) From b14af60d2126c6d695bad25d4ecbdb27a49217db Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 16 Feb 2024 08:56:29 +0000 Subject: [PATCH 16/19] style fix --- .../comp_nodes/mkql_match_recognize.cpp | 14 ++--- .../comp_nodes/mkql_match_recognize_list.h | 1 + .../comp_nodes/mkql_match_recognize_nfa.h | 55 ++----------------- .../mkql_match_recognize_save_load.h | 28 +++++++--- 4 files changed, 32 insertions(+), 66 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index ae7397702b18..5b3a25ebfbcd 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -178,8 +178,7 @@ class TStreamingMatchRecognize { NUdf::TUnboxedValue GetOutputIfReady(TComputationContext& ctx) { auto match = Nfa.GetMatched(); - if (!match.has_value()) - { + if (!match.has_value()) { return NUdf::TUnboxedValue{}; } Parameters.MatchedVarsArg->SetValue(ctx, ctx.HolderFactory.Create>(ctx.HolderFactory, match.value())); @@ -278,7 +277,6 @@ class TStateForNonInterleavedPartitions } void Load(const NUdf::TStringRef& state) override { - TInputSerializer serializer(SerializerContext, state); const auto stateVersion = serializer.Read(); if (stateVersion == 1) { @@ -408,19 +406,19 @@ class TStateForInterleavedPartitions serializer.Write(key); state->Save(serializer); } + // HasReadyOutput is not packed because when loading we can recalculate HasReadyOutput from Partitions. serializer.Write(Terminating); return serializer.MakeString(); } void Load(const NUdf::TStringRef& state) override { - TInputSerializer serializer(SerializerContext, state); - const auto stateVersion = serializer.Read(); if (stateVersion == 1) { Partitions.clear(); - auto partitionsSize = serializer.Read(); - for (size_t i = 0; i < partitionsSize; ++i) { + auto partitionsCount = serializer.Read(); + Partitions.reserve(partitionsCount); + for (size_t i = 0; i < partitionsCount; ++i) { auto packedKey = serializer.Read(); NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(packedKey, SerializerContext.Ctx.HolderFactory); auto pair = Partitions.emplace( @@ -430,7 +428,7 @@ class TStateForInterleavedPartitions Parameters, NfaTransitionGraph, Cache)); - (pair.first)->second->Load(serializer); + pair.first->second->Load(serializer); } for (auto it = Partitions.begin(); it != Partitions.end(); ++it) { diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index edd92c721da2..d8d42ddf3399 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -146,6 +146,7 @@ class TSparseList { void Load(TInputSerializer& serializer) { auto size = serializer.Read(); + Storage.reserve(size); for (size_t i = 0; i < size; ++i) { auto key = serializer.Read(); NUdf::TUnboxedValue row = serializer.Read(); diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 09742de16d86..cae4275d8911 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -23,6 +23,7 @@ using TQuantityExitTransition = std::pair, std::pair struct TVariantHelper { using TVariant = std::variant; + using TTuple = std::tuple; static std::variant getVariantByIndex(size_t i) { MKQL_ENSURE(i < sizeof...(Ts), "Wrong variant index"); @@ -89,68 +90,22 @@ struct TNfaTransitionGraph { void Save(TOutputSerializer& serializer) const { serializer.Write(Transitions.size()); - for (ui64 i = 0; i < Transitions.size(); ++i) { serializer.Write(Transitions[i].index()); - - std::visit([&](auto&& arg) - { - using T = std::decay_t; - if constexpr (std::is_same_v) { - // Nothing - } - else if constexpr (std::is_same_v) { - serializer.Write(arg); - } - else if constexpr (std::is_same_v) { - serializer.Write(arg); - } - else if constexpr (std::is_same_v) { - serializer.Write(arg); - } - else if constexpr (std::is_same_v) { - serializer.Write(arg); - } - else - static_assert(always_false_v, "non-exhaustive visitor!"); - }, Transitions[i]); + std::visit(serializer, Transitions[i]); } - serializer.Write(Input); - serializer.Write(Output); + serializer(Input, Output); } void Load(TInputSerializer& serializer) { ui64 transitionSize = serializer.Read(); - Transitions.resize(transitionSize); for (ui64 i = 0; i < transitionSize; ++i) { size_t index = serializer.Read(); Transitions[i] = TNfaTransitionHelper::getVariantByIndex(index); - std::visit([&](auto&& arg) - { - using T = std::decay_t; - if constexpr (std::is_same_v) { - // Nothing - } - else if constexpr (std::is_same_v) { - serializer.Read(arg); - } - else if constexpr (std::is_same_v) { - serializer.Read(arg); - } - else if constexpr (std::is_same_v) { - serializer.Read(arg); - } - else if constexpr (std::is_same_v) { - serializer.Read(arg); - } - else - static_assert(always_false_v, "non-exhaustive visitor!"); - }, Transitions[i]); - + std::visit(serializer, Transitions[i]); } - serializer.Read(Input); - serializer.Read(Output); + serializer(Input, Output); } }; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h index 540bac830c56..893a5db8c527 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h @@ -29,6 +29,11 @@ struct TOutputSerializer { : Context(context) {} + template + void operator()(Ts... args) { + (Write(args), ...); + } + template void Write(const Type& value ) { if constexpr (std::is_same_v, TString>) { @@ -43,8 +48,9 @@ struct TOutputSerializer { WriteUi32(Buf, value); } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { // Only Row type (StateType) supported ! WriteUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), value); - } - else { + } else if constexpr (std::is_empty_v){ + // Empty struct is not saved/loaded. + } else { static_assert(always_false_v, "Not supported type / not implemented"); } } @@ -107,6 +113,11 @@ struct TInputSerializer { , Buf(state.Data(), state.Size()) {} + template + void operator()(Ts&... args) { + (Read(args), ...); + } + template ReturnType Read() { if constexpr (std::is_same_v, TString>) { @@ -121,8 +132,9 @@ struct TInputSerializer { return ReadUi32(Buf); } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { return ReadUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), Context.Ctx); - } - else { + } else if constexpr (std::is_empty_v){ + // Empty struct is not saved/loaded. + } else { static_assert(always_false_v, "Not supported type / not implemented"); } } @@ -141,8 +153,9 @@ struct TInputSerializer { value = ReadUi32(Buf); } else if constexpr (std::is_same_v, NUdf::TUnboxedValue>) { value = ReadUnboxedValue(Buf, Context.RowPacker.RefMutableObject(Context.Ctx, false, Context.RowType), Context.Ctx); - } - else { + } else if constexpr (std::is_empty_v){ + // Empty struct is not saved/loaded. + } else { static_assert(always_false_v, "Not supported type / not implemented"); } } @@ -191,8 +204,7 @@ struct TInputSerializer { return NKikimr::NMiniKQL::MakeString(strRef); } - bool Empty() - { + bool Empty() const { return Buf.empty(); } From 22579d3eece34fa4b574918be20facd0645ee7e9 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 16 Feb 2024 09:19:40 +0000 Subject: [PATCH 17/19] style fix --- .../comp_nodes/mkql_match_recognize_list.h | 27 +++++++------------ .../comp_nodes/mkql_match_recognize_nfa.h | 3 +-- .../mkql_match_recognize_save_load.h | 4 +-- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h index d8d42ddf3399..313fb7476487 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_list.h @@ -136,11 +136,9 @@ class TSparseList { } void Save(TOutputSerializer& serializer) const { - serializer.Write(Storage.size()); + serializer(Storage.size()); for (const auto& [key, item]: Storage) { - serializer.Write(key); - serializer.Write(item.Value); - serializer.Write(item.LockCount); + serializer(key, item.Value, item.LockCount); } } @@ -148,9 +146,10 @@ class TSparseList { auto size = serializer.Read(); Storage.reserve(size); for (size_t i = 0; i < size; ++i) { - auto key = serializer.Read(); - NUdf::TUnboxedValue row = serializer.Read(); - auto lockCount = serializer.Read(); + TStorage::key_type key; + NUdf::TUnboxedValue row; + decltype(TItem::LockCount) lockCount; + serializer(key, row, lockCount); Storage.emplace(key, TItem{row, lockCount}); } } @@ -270,15 +269,11 @@ class TSparseList { } void Save(TOutputSerializer& serializer) const { - serializer.Write(Container); - serializer.Write(FromIndex); - serializer.Write(ToIndex); + serializer(Container, FromIndex, ToIndex); } void Load(TInputSerializer& serializer) { - serializer.Read(Container); - serializer.Read(FromIndex); - serializer.Read(ToIndex); + serializer(Container, FromIndex, ToIndex); } private: @@ -337,13 +332,11 @@ class TSparseList { } void Save(TOutputSerializer& serializer) const { - serializer.Write(Container); - serializer.Write(ListSize); + serializer(Container, ListSize); } void Load(TInputSerializer& serializer) { - serializer.Read(Container); - serializer.Read(ListSize); + serializer(Container, ListSize); } private: diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index cae4275d8911..78a944095374 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -89,7 +89,7 @@ struct TNfaTransitionGraph { inline constexpr static bool always_false_v = false; void Save(TOutputSerializer& serializer) const { - serializer.Write(Transitions.size()); + serializer(Transitions.size()); for (ui64 i = 0; i < Transitions.size(); ++i) { serializer.Write(Transitions[i].index()); std::visit(serializer, Transitions[i]); @@ -322,7 +322,6 @@ class TNfa { void Save(TOutputSerializer& serializer) const { serializer.Write(Index); - serializer.Write(Vars.size()); for (const auto& vector : Vars) { serializer.Write(vector.size()); diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h index 893a5db8c527..e7dcf12d53f7 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_save_load.h @@ -30,8 +30,8 @@ struct TOutputSerializer { {} template - void operator()(Ts... args) { - (Write(args), ...); + void operator()(Ts&&... args) { + (Write(std::forward(args)), ...); } template From 1c8afed553a3808bd3f9e934e7d67e4fdf8d8c9e Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 21 Feb 2024 13:45:02 +0000 Subject: [PATCH 18/19] check restore graph --- .../yql/minikql/comp_nodes/mkql_match_recognize.cpp | 6 +++++- .../minikql/comp_nodes/mkql_match_recognize_nfa.h | 12 ++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 5b3a25ebfbcd..9a9e02dd8cab 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -408,6 +408,7 @@ class TStateForInterleavedPartitions } // HasReadyOutput is not packed because when loading we can recalculate HasReadyOutput from Partitions. serializer.Write(Terminating); + NfaTransitionGraph->Save(serializer); return serializer.MakeString(); } @@ -437,6 +438,10 @@ class TStateForInterleavedPartitions } } serializer.Read(Terminating); + auto restoredTransitionGraph = std::make_shared(); + restoredTransitionGraph->Load(serializer); + MKQL_ENSURE(NfaTransitionGraph, "Empty NfaTransitionGraph"); + MKQL_ENSURE(*restoredTransitionGraph == *NfaTransitionGraph, "Restored and current NfaTransitionGraph is different"); } MKQL_ENSURE(serializer.Empty(), "State is corrupted"); } @@ -595,7 +600,6 @@ class TMatchRecognizeWrapper : public TStatefulFlowComputationNode RowPacker; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 78a944095374..0df4745f9368 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -13,6 +13,9 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { using namespace NYql::NMatchRecognize; struct TVoidTransition { + friend bool operator==(const TVoidTransition&, const TVoidTransition&) { + return true; + } }; using TEpsilonTransition = size_t; //to using TEpsilonTransitions = std::vector>; @@ -107,6 +110,12 @@ struct TNfaTransitionGraph { } serializer(Input, Output); } + + bool operator==(const TNfaTransitionGraph& other) { + return Transitions == other.Transitions + && Input == other.Input + && Output == other.Output; + } }; class TNfaTransitionGraphOptimizer { @@ -428,7 +437,7 @@ class TNfa { } void Save(TOutputSerializer& serializer) const { - TransitionGraph->Save(serializer); + // TransitionGraph is not saved/loaded, passed in constructor. serializer.Write(ActiveStates.size()); for (const auto& state : ActiveStates) { state.Save(serializer); @@ -437,7 +446,6 @@ class TNfa { } void Load(TInputSerializer& serializer) { - TransitionGraph->Load(serializer); auto stateSize = serializer.Read(); for (size_t i = 0; i < stateSize; ++i) { TState state; From 9e499fed5f26e2728c5e3ae5abb4d365d4e8c21d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 21 Feb 2024 14:12:29 +0000 Subject: [PATCH 19/19] Back tracking --- .../minikql/comp_nodes/mkql_match_recognize.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 9a9e02dd8cab..efa04a16b264 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -47,7 +47,17 @@ class TBackTrackingMatchRecognize { public: //TODO(YQL-16486): create a tree for backtracking(replace var names with indexes) - struct TPatternConfiguration {}; + struct TPatternConfiguration { + void Save(TOutputSerializer& /*serializer*/) const { + } + + void Load(TInputSerializer& /*serializer*/) { + } + + friend bool operator==(const TPatternConfiguration&, const TPatternConfiguration&) { + return true; + } + }; struct TPatternConfigurationBuilder { using TPatternConfigurationPtr = std::shared_ptr; @@ -151,6 +161,7 @@ class TStreamingMatchRecognize { using TPartitionList = TSparseList; using TRange = TPartitionList::TRange; public: + using TPatternConfiguration = TNfaTransitionGraph; using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder; TStreamingMatchRecognize( NUdf::TUnboxedValue&& partitionKey, @@ -273,6 +284,7 @@ class TStateForNonInterleavedPartitions if (isValid) { serializer.Write(DelayedRow); } + RowPatternConfiguration->Save(serializer); return serializer.MakeString(); } @@ -296,6 +308,9 @@ class TStateForNonInterleavedPartitions if (validDelayedRow) { DelayedRow = serializer.Read(); } + auto restoredRowPatternConfiguration = std::make_shared(); + restoredRowPatternConfiguration->Load(serializer); + MKQL_ENSURE(*restoredRowPatternConfiguration == *RowPatternConfiguration, "Restored and current RowPatternConfiguration is different"); } MKQL_ENSURE(serializer.Empty(), "State is corrupted"); }