From 22dca6e36b0613d2e638bde7339418b24c644c75 Mon Sep 17 00:00:00 2001 From: Nathaniel May Date: Fri, 9 Jul 2021 16:28:28 -0400 Subject: [PATCH 1/2] add experimental-parser tracking --- CHANGELOG.md | 1 + core/dbt/parser/models.py | 147 ++++++++++--- core/dbt/tracking.py | 16 +- .../033_event_tracking_test/test_events.py | 208 +++++++++++++++--- 4 files changed, 307 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fe515e8ea7..1fc680b9378 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ Contributors: ### Under the hood +- Add tracking for experimental parser accuracy ([3553](https://github.com/dbt-labs/dbt/pull/3553)) - Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497) - Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461)) - Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460)) diff --git a/core/dbt/parser/models.py b/core/dbt/parser/models.py index dc9a7b5de51..37f00d16396 100644 --- a/core/dbt/parser/models.py +++ b/core/dbt/parser/models.py @@ -4,7 +4,12 @@ from dbt.node_types import NodeType from dbt.parser.base import SimpleSQLParser from dbt.parser.search import FileBlock +import dbt.tracking as tracking +from dbt import utils from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore +import itertools +import random +from typing import Any, Dict, List, Tuple class ModelParser(SimpleSQLParser[ParsedModelNode]): @@ -26,46 +31,122 @@ def render_update( ) -> None: self.manifest._parsing_info.static_analysis_path_count += 1 + # `True` roughly 1/100 times this function is called + sample: bool = random.randint(1, 101) == 100 + + # run the 
experimental parser if the flag is on or if we're sampling + if flags.USE_EXPERIMENTAL_PARSER or sample: + try: + experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql) + + # second config format + config_calls: List[Dict[str, str]] = [] + for c in experimentally_parsed['configs']: + config_calls.append({c[0]: c[1]}) + + # format sources TODO change extractor to match this type + source_calls: List[List[str]] = [] + for s in experimentally_parsed['sources']: + source_calls.append([s[0], s[1]]) + experimentally_parsed['sources'] = source_calls + + except ExtractionError as e: + experimentally_parsed = e + # normal dbt run if not flags.USE_EXPERIMENTAL_PARSER: + # normal rendering super().render_update(node, config) + # if we're sampling, compare for correctness + if sample: + result: List[str] = [] + # experimental parser couldn't parse + if isinstance(experimentally_parsed, Exception): + result += ["01_experimental_parser_cannot_parse"] + else: + # rearrange existing configs to match: + real_configs: List[Tuple[str, Any]] = list( + itertools.chain.from_iterable( + map(lambda x: x.items(), config._config_calls) + ) + ) - # if the --use-experimental-parser flag was set - else: - try: - # run dbt jinja extractor (powered by tree-sitter + rust) - # throws an exception if it can't parse the source - res = py_extract_from_source(node.raw_sql) - - # since it doesn't need python jinja, fit the refs, sources, and configs - # into the node. Down the line the rest of the node will be updated with - # this information. (e.g. depends_on etc.) 
- config_calls = [] - for c in res['configs']: - config_calls.append({c[0]: c[1]}) + # look for false positive configs + for c in experimentally_parsed['configs']: + if c not in real_configs: + result += ["02_false_positive_config_value"] + break - config._config_calls = config_calls + # look for missed configs + for c in real_configs: + if c not in experimentally_parsed['configs']: + result += ["03_missed_config_value"] + break - # this uses the updated config to set all the right things in the node - # if there are hooks present, it WILL render jinja. Will need to change - # when we support hooks - self.update_parsed_node(node, config) + # look for false positive sources + for s in experimentally_parsed['sources']: + if s not in node.sources: + result += ["04_false_positive_source_value"] + break - # udpate the unrendered config with values from the file - # values from yaml files are in there already - node.unrendered_config.update(dict(res['configs'])) + # look for missed sources + for s in node.sources: + if s not in experimentally_parsed['sources']: + result += ["05_missed_source_value"] + break - # set refs, sources, and configs on the node object - node.refs = node.refs + res['refs'] - for sourcev in res['sources']: - # TODO change extractor to match type here - node.sources.append([sourcev[0], sourcev[1]]) - for configv in res['configs']: - node.config[configv[0]] = configv[1] + # look for false positive refs + for r in experimentally_parsed['refs']: + if r not in node.refs: + result += ["06_false_positive_ref_value"] + break - self.manifest._parsing_info.static_analysis_parsed_path_count += 1 + # look for missed refs + for r in node.refs: + if r not in experimentally_parsed['refs']: + result += ["07_missed_ref_value"] + break - # exception was thrown by dbt jinja extractor meaning it can't - # handle this source. fall back to python jinja rendering. 
- except ExtractionError: - super().render_update(node, config) + # if there are no errors, return a success value + if not result: + result = ["00_exact_match"] + + # fire a tracking event. this fires one event for every sample + # so that we have data on a per file basis. Not only can we expect + # no false positives or misses, we can expect the number model + # files parseable by the experimental parser to match our internal + # testing. + tracking.track_experimental_parser_sample({ + "project_id": self.root_project.hashed_name(), + "file_id": utils.get_hash(node), + "status": result + }) + + # if the --use-experimental-parser flag was set, and the experimental parser succeeded + elif not isinstance(experimentally_parsed, Exception): + # since it doesn't need python jinja, fit the refs, sources, and configs + # into the node. Down the line the rest of the node will be updated with + # this information. (e.g. depends_on etc.) + config._config_calls = config_calls + + # this uses the updated config to set all the right things in the node. + # if there are hooks present, it WILL render jinja. Will need to change + # when the experimental parser supports hooks + self.update_parsed_node(node, config) + + # update the unrendered config with values from the file. + # values from yaml files are in there already + node.unrendered_config.update(dict(experimentally_parsed['configs'])) + + # set refs, sources, and configs on the node object + node.refs += experimentally_parsed['refs'] + node.sources += experimentally_parsed['sources'] + for configv in experimentally_parsed['configs']: + node.config[configv[0]] = configv[1] + + self.manifest._parsing_info.static_analysis_parsed_path_count += 1 + + # the experimental parser tried and failed on this model. + # fall back to python jinja rendering. 
+ else: + super().render_update(node, config) diff --git a/core/dbt/tracking.py b/core/dbt/tracking.py index a1c340bbc80..61d687985c5 100644 --- a/core/dbt/tracking.py +++ b/core/dbt/tracking.py @@ -30,7 +30,7 @@ DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0' LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3' RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0' - +EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0' DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV' @@ -423,6 +423,20 @@ def track_invalid_invocation( ) +def track_experimental_parser_sample(options): + context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)] + assert active_user is not None, \ + 'Cannot track project loading time when active user is None' + + track( + active_user, + category='dbt', + action='experimental_parser', + label=active_user.invocation_id, + context=context + ) + + def flush(): logger.debug("Flushing usage events") tracker.flush() diff --git a/test/integration/033_event_tracking_test/test_events.py b/test/integration/033_event_tracking_test/test_events.py index 12d86892d40..561615212c7 100644 --- a/test/integration/033_event_tracking_test/test_events.py +++ b/test/integration/033_event_tracking_test/test_events.py @@ -10,6 +10,17 @@ import dbt.utils +# immutably creates a new array with the value inserted at the index +def inserted(value, index, arr): + x = [] + for i in range(0, len(arr)): + if i == index: + x.append(value) + x.append(arr[i]) + else: + x.append(arr[i]) + return x + class TestEventTracking(DBTIntegrationTest): maxDiff = None @@ -240,7 +251,7 @@ def project_config(self): @use_profile("postgres") def test__postgres_event_tracking_compile(self): - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -267,6 +278,17 @@ def test__postgres_event_tracking_compile(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + 
action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('compile', 'start'),
             self.load_context(),
@@ -274,13 +296,19 @@
             self.build_context('compile', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["compile", "--vars", "sensitive_thing: abc"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["compile", "--vars", "sensitive_thing: abc"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_deps(self):
@@ -364,7 +392,7 @@ def seed_context(project_id, user_id, invocation_id, version):
             },
         }]
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -397,6 +425,17 @@ def seed_context(project_id, user_id, invocation_id, version):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('seed', 'start'),
             self.load_context(),
@@ -405,12 +444,14 @@ def seed_context(project_id, user_id, invocation_id, version):
             self.build_context('seed', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(["seed"], expected_calls, expected_contexts)
-        self.assertTrue(test_result)
+        test_result_A = self.run_event_test(["seed"], expected_calls_A, expected_contexts)
+        test_result_B = self.run_event_test(["seed"], expected_calls_B, expected_contexts)
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_models(self):
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -449,6 +490,17 @@ def test__postgres_event_tracking_models(self):
             ),
         ]
 
+        expected_calls_B = 
inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         hashed = '20ff78afb16c8b3b8f83861b1d3b99bd'
         # this hashed contents field changes on azure postgres tests, I believe
         # due to newlines again
@@ -478,20 +530,26 @@ def test__postgres_event_tracking_models(self):
             self.build_context('run', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--model", "example", "example_2"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "example", "example_2"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_model_error(self):
         # cmd = ["run", "--model", "model_error"]
         # self.run_event_test(cmd, event_run_model_error, expect_pass=False)
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -524,6 +582,17 @@ def test__postgres_event_tracking_model_error(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('run', 'start'),
             self.load_context(),
@@ -539,14 +608,21 @@ def test__postgres_event_tracking_model_error(self):
             self.build_context('run', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--model", "model_error"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "model_error"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_tests(self):
@@ -554,7 +630,7 @@ 
def test__postgres_event_tracking_tests(self):
         self.run_dbt(["deps"])
         self.run_dbt(["run", "--model", "example", "example_2"])
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -581,6 +657,17 @@ def test__postgres_event_tracking_tests(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('test', 'start'),
             self.load_context(),
@@ -588,14 +675,21 @@ def test__postgres_event_tracking_tests(self):
             self.build_context('test', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["test"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["test"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
 
 class TestEventTrackingCompilationError(TestEventTracking):
@@ -676,7 +770,7 @@ def profile_config(self):
 
     @use_profile("postgres")
     def test__postgres_event_tracking_unable_to_connect(self):
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -703,6 +797,17 @@ def test__postgres_event_tracking_unable_to_connect(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('run', 'start'),
             self.load_context(),
@@ -710,14 +815,21 @@ def test__postgres_event_tracking_unable_to_connect(self):
             self.build_context('run', 'end', result_type='error')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--target", "noaccess", "--models", "example"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = 
self.run_event_test( + ["run", "--target", "noaccess", "--models", "example"], + expected_calls_B, + expected_contexts, + expect_pass=False + ) + + self.assertTrue(test_result_A or test_result_B) class TestEventTrackingSnapshot(TestEventTracking): @@ -732,7 +844,7 @@ def project_config(self): def test__postgres_event_tracking_snapshot(self): self.run_dbt(["run", "--models", "snapshottable"]) - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -765,6 +877,17 @@ def test__postgres_event_tracking_snapshot(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + action='experimental_parser', + label=ANY, + context=ANY + ), + 3, + expected_calls_A + ) + # the model here has a raw_sql that contains the schema, which changes expected_contexts = [ self.build_context('snapshot', 'start'), @@ -781,13 +904,19 @@ def test__postgres_event_tracking_snapshot(self): self.build_context('snapshot', 'end', result_type='ok') ] - test_result = self.run_event_test( + test_result_A = self.run_event_test( ["snapshot"], - expected_calls, + expected_calls_A, expected_contexts ) - self.assertTrue(test_result) + test_result_B = self.run_event_test( + ["snapshot"], + expected_calls_B, + expected_contexts + ) + + self.assertTrue(test_result_A or test_result_B) class TestEventTrackingCatalogGenerate(TestEventTracking): @@ -796,7 +925,7 @@ def test__postgres_event_tracking_catalog_generate(self): # create a model for the catalog self.run_dbt(["run", "--models", "example"]) - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -823,6 +952,17 @@ def test__postgres_event_tracking_catalog_generate(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + action='experimental_parser', + label=ANY, + context=ANY + ), + 3, + expected_calls_A + ) + expected_contexts = [ self.build_context('generate', 'start'), self.load_context(), @@ -830,10 +970,16 @@ def 
test__postgres_event_tracking_catalog_generate(self): self.build_context('generate', 'end', result_type='ok') ] - test_result = self.run_event_test( + test_result_A = self.run_event_test( ["docs", "generate"], - expected_calls, + expected_calls_A, expected_contexts ) - self.assertTrue(test_result) + test_result_B = self.run_event_test( + ["docs", "generate"], + expected_calls_B, + expected_contexts + ) + + self.assertTrue(test_result_A or test_result_B) From 2efe31fecc5ccc4216bae06486f08b9f75dfc762 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Fri, 9 Jul 2021 17:10:08 -0400 Subject: [PATCH 2/2] Update changelog [skip ci] --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fc680b9378..ee9e54b4007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ Contributors: - Update dbt logo and links ([docs#197](https://github.com/fishtown-analytics/dbt-docs/issues/197)) +### Under the hood + +- Add tracking for experimental parser accuracy ([3503](https://github.com/dbt-labs/dbt/pull/3503), [3553](https://github.com/dbt-labs/dbt/pull/3553)) + ## dbt 0.20.0rc2 (June 30, 2021) ### Fixes @@ -43,7 +47,6 @@ Contributors: ### Under the hood -- Add tracking for experimental parser accuracy ([3553](https://github.com/dbt-labs/dbt/pull/3553)) - Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497) - Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461)) - Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))