From 22dca6e36b0613d2e638bde7339418b24c644c75 Mon Sep 17 00:00:00 2001 From: Nathaniel May Date: Fri, 9 Jul 2021 16:28:28 -0400 Subject: [PATCH 1/2] add experimental-parser tracking --- CHANGELOG.md | 1 + core/dbt/parser/models.py | 147 ++++++++++--- core/dbt/tracking.py | 16 +- .../033_event_tracking_test/test_events.py | 208 +++++++++++++++--- 4 files changed, 307 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fe515e8ea7..1fc680b9378 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ Contributors: ### Under the hood +- Add tracking for experimental parser accuracy ([3553](https://github.com/dbt-labs/dbt/pull/3553)) - Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497) - Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461)) - Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460)) diff --git a/core/dbt/parser/models.py b/core/dbt/parser/models.py index dc9a7b5de51..37f00d16396 100644 --- a/core/dbt/parser/models.py +++ b/core/dbt/parser/models.py @@ -4,7 +4,12 @@ from dbt.node_types import NodeType from dbt.parser.base import SimpleSQLParser from dbt.parser.search import FileBlock +import dbt.tracking as tracking +from dbt import utils from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore +import itertools +import random +from typing import Any, Dict, List, Tuple class ModelParser(SimpleSQLParser[ParsedModelNode]): @@ -26,46 +31,122 @@ def render_update( ) -> None: self.manifest._parsing_info.static_analysis_path_count += 1 + # `True` roughly 1/100 times this function is called + sample: bool = random.randint(1, 101) == 100 + + # run the 
experimental parser if the flag is on or if we're sampling + if flags.USE_EXPERIMENTAL_PARSER or sample: + try: + experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql) + + # second config format + config_calls: List[Dict[str, str]] = [] + for c in experimentally_parsed['configs']: + config_calls.append({c[0]: c[1]}) + + # format sources TODO change extractor to match this type + source_calls: List[List[str]] = [] + for s in experimentally_parsed['sources']: + source_calls.append([s[0], s[1]]) + experimentally_parsed['sources'] = source_calls + + except ExtractionError as e: + experimentally_parsed = e + # normal dbt run if not flags.USE_EXPERIMENTAL_PARSER: + # normal rendering super().render_update(node, config) + # if we're sampling, compare for correctness + if sample: + result: List[str] = [] + # experimental parser couldn't parse + if isinstance(experimentally_parsed, Exception): + result += ["01_experimental_parser_cannot_parse"] + else: + # rearrange existing configs to match: + real_configs: List[Tuple[str, Any]] = list( + itertools.chain.from_iterable( + map(lambda x: x.items(), config._config_calls) + ) + ) - # if the --use-experimental-parser flag was set - else: - try: - # run dbt jinja extractor (powered by tree-sitter + rust) - # throws an exception if it can't parse the source - res = py_extract_from_source(node.raw_sql) - - # since it doesn't need python jinja, fit the refs, sources, and configs - # into the node. Down the line the rest of the node will be updated with - # this information. (e.g. depends_on etc.) 
- config_calls = [] - for c in res['configs']: - config_calls.append({c[0]: c[1]}) + # look for false positive configs + for c in experimentally_parsed['configs']: + if c not in real_configs: + result += ["02_false_positive_config_value"] + break - config._config_calls = config_calls + # look for missed configs + for c in real_configs: + if c not in experimentally_parsed['configs']: + result += ["03_missed_config_value"] + break - # this uses the updated config to set all the right things in the node - # if there are hooks present, it WILL render jinja. Will need to change - # when we support hooks - self.update_parsed_node(node, config) + # look for false positive sources + for s in experimentally_parsed['sources']: + if s not in node.sources: + result += ["04_false_positive_source_value"] + break - # udpate the unrendered config with values from the file - # values from yaml files are in there already - node.unrendered_config.update(dict(res['configs'])) + # look for missed sources + for s in node.sources: + if s not in experimentally_parsed['sources']: + result += ["05_missed_source_value"] + break - # set refs, sources, and configs on the node object - node.refs = node.refs + res['refs'] - for sourcev in res['sources']: - # TODO change extractor to match type here - node.sources.append([sourcev[0], sourcev[1]]) - for configv in res['configs']: - node.config[configv[0]] = configv[1] + # look for false positive refs + for r in experimentally_parsed['refs']: + if r not in node.refs: + result += ["06_false_positive_ref_value"] + break - self.manifest._parsing_info.static_analysis_parsed_path_count += 1 + # look for missed refs + for r in node.refs: + if r not in experimentally_parsed['refs']: + result += ["07_missed_ref_value"] + break - # exception was thrown by dbt jinja extractor meaning it can't - # handle this source. fall back to python jinja rendering. 
- except ExtractionError: - super().render_update(node, config) + # if there are no errors, return a success value + if not result: + result = ["00_exact_match"] + + # fire a tracking event. this fires one event for every sample + # so that we have data on a per file basis. Not only can we expect + # no false positives or misses, we can expect the number model + # files parseable by the experimental parser to match our internal + # testing. + tracking.track_experimental_parser_sample({ + "project_id": self.root_project.hashed_name(), + "file_id": utils.get_hash(node), + "status": result + }) + + # if the --use-experimental-parser flag was set, and the experimental parser succeeded + elif not isinstance(experimentally_parsed, Exception): + # since it doesn't need python jinja, fit the refs, sources, and configs + # into the node. Down the line the rest of the node will be updated with + # this information. (e.g. depends_on etc.) + config._config_calls = config_calls + + # this uses the updated config to set all the right things in the node. + # if there are hooks present, it WILL render jinja. Will need to change + # when the experimental parser supports hooks + self.update_parsed_node(node, config) + + # update the unrendered config with values from the file. + # values from yaml files are in there already + node.unrendered_config.update(dict(experimentally_parsed['configs'])) + + # set refs, sources, and configs on the node object + node.refs += experimentally_parsed['refs'] + node.sources += experimentally_parsed['sources'] + for configv in experimentally_parsed['configs']: + node.config[configv[0]] = configv[1] + + self.manifest._parsing_info.static_analysis_parsed_path_count += 1 + + # the experimental parser tried and failed on this model. + # fall back to python jinja rendering. 
+ else: + super().render_update(node, config) diff --git a/core/dbt/tracking.py b/core/dbt/tracking.py index a1c340bbc80..61d687985c5 100644 --- a/core/dbt/tracking.py +++ b/core/dbt/tracking.py @@ -30,7 +30,7 @@ DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0' LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3' RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0' - +EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0' DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV' @@ -423,6 +423,20 @@ def track_invalid_invocation( ) +def track_experimental_parser_sample(options): + context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)] + assert active_user is not None, \ + 'Cannot track project loading time when active user is None' + + track( + active_user, + category='dbt', + action='experimental_parser', + label=active_user.invocation_id, + context=context + ) + + def flush(): logger.debug("Flushing usage events") tracker.flush() diff --git a/test/integration/033_event_tracking_test/test_events.py b/test/integration/033_event_tracking_test/test_events.py index 12d86892d40..561615212c7 100644 --- a/test/integration/033_event_tracking_test/test_events.py +++ b/test/integration/033_event_tracking_test/test_events.py @@ -10,6 +10,17 @@ import dbt.utils +# immutably creates a new array with the value inserted at the index +def inserted(value, index, arr): + x = [] + for i in range(0, len(arr)): + if i == index: + x.append(value) + x.append(arr[i]) + else: + x.append(arr[i]) + return x + class TestEventTracking(DBTIntegrationTest): maxDiff = None @@ -240,7 +251,7 @@ def project_config(self): @use_profile("postgres") def test__postgres_event_tracking_compile(self): - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -267,6 +278,17 @@ def test__postgres_event_tracking_compile(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + 
action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('compile', 'start'),
             self.load_context(),
@@ -274,13 +296,19 @@
             self.build_context('compile', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["compile", "--vars", "sensitive_thing: abc"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["compile", "--vars", "sensitive_thing: abc"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_deps(self):
@@ -364,7 +392,7 @@ def seed_context(project_id, user_id, invocation_id, version):
             },
         }]
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -397,6 +425,17 @@ def seed_context(project_id, user_id, invocation_id, version):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('seed', 'start'),
             self.load_context(),
@@ -405,12 +444,14 @@ def seed_context(project_id, user_id, invocation_id, version):
             self.build_context('seed', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(["seed"], expected_calls, expected_contexts)
-        self.assertTrue(test_result)
+        test_result_A = self.run_event_test(["seed"], expected_calls_A, expected_contexts)
+        test_result_B = self.run_event_test(["seed"], expected_calls_B, expected_contexts)
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_models(self):
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -449,6 +490,17 @@ def test__postgres_event_tracking_models(self):
             ),
         ]
 
+        expected_calls_B = 
inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         hashed = '20ff78afb16c8b3b8f83861b1d3b99bd'
         # this hashed contents field changes on azure postgres tests, I believe
         # due to newlines again
@@ -478,20 +530,26 @@ def test__postgres_event_tracking_models(self):
             self.build_context('run', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--model", "example", "example_2"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "example", "example_2"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_model_error(self):
         # cmd = ["run", "--model", "model_error"]
         # self.run_event_test(cmd, event_run_model_error, expect_pass=False)
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -524,6 +582,17 @@ def test__postgres_event_tracking_model_error(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('run', 'start'),
             self.load_context(),
@@ -539,14 +608,21 @@ def test__postgres_event_tracking_model_error(self):
             self.build_context('run', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--model", "model_error"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "model_error"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
     @use_profile("postgres")
     def test__postgres_event_tracking_tests(self):
@@ -554,7 +630,7 @@ 
def test__postgres_event_tracking_tests(self):
         self.run_dbt(["deps"])
         self.run_dbt(["run", "--model", "example", "example_2"])
 
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -581,6 +657,17 @@ def test__postgres_event_tracking_tests(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('test', 'start'),
             self.load_context(),
@@ -588,14 +675,21 @@ def test__postgres_event_tracking_tests(self):
             self.build_context('test', 'end', result_type='ok')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["test"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["test"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
 
 
 class TestEventTrackingCompilationError(TestEventTracking):
@@ -676,7 +770,7 @@ def profile_config(self):
 
     @use_profile("postgres")
     def test__postgres_event_tracking_unable_to_connect(self):
-        expected_calls = [
+        expected_calls_A = [
             call(
                 category='dbt',
                 action='invocation',
@@ -703,6 +797,17 @@ def test__postgres_event_tracking_unable_to_connect(self):
             ),
         ]
 
+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
         expected_contexts = [
             self.build_context('run', 'start'),
             self.load_context(),
@@ -710,14 +815,21 @@ def test__postgres_event_tracking_unable_to_connect(self):
             self.build_context('run', 'end', result_type='error')
         ]
 
-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
             ["run", "--target", "noaccess", "--models", "example"],
-            expected_calls,
+            expected_calls_A,
             expected_contexts,
             expect_pass=False
         )
 
-        self.assertTrue(test_result)
+        test_result_B = 
self.run_event_test( + ["run", "--target", "noaccess", "--models", "example"], + expected_calls_B, + expected_contexts, + expect_pass=False + ) + + self.assertTrue(test_result_A or test_result_B) class TestEventTrackingSnapshot(TestEventTracking): @@ -732,7 +844,7 @@ def project_config(self): def test__postgres_event_tracking_snapshot(self): self.run_dbt(["run", "--models", "snapshottable"]) - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -765,6 +877,17 @@ def test__postgres_event_tracking_snapshot(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + action='experimental_parser', + label=ANY, + context=ANY + ), + 3, + expected_calls_A + ) + # the model here has a raw_sql that contains the schema, which changes expected_contexts = [ self.build_context('snapshot', 'start'), @@ -781,13 +904,19 @@ def test__postgres_event_tracking_snapshot(self): self.build_context('snapshot', 'end', result_type='ok') ] - test_result = self.run_event_test( + test_result_A = self.run_event_test( ["snapshot"], - expected_calls, + expected_calls_A, expected_contexts ) - self.assertTrue(test_result) + test_result_B = self.run_event_test( + ["snapshot"], + expected_calls_B, + expected_contexts + ) + + self.assertTrue(test_result_A or test_result_B) class TestEventTrackingCatalogGenerate(TestEventTracking): @@ -796,7 +925,7 @@ def test__postgres_event_tracking_catalog_generate(self): # create a model for the catalog self.run_dbt(["run", "--models", "example"]) - expected_calls = [ + expected_calls_A = [ call( category='dbt', action='invocation', @@ -823,6 +952,17 @@ def test__postgres_event_tracking_catalog_generate(self): ), ] + expected_calls_B = inserted( + call( + category='dbt', + action='experimental_parser', + label=ANY, + context=ANY + ), + 3, + expected_calls_A + ) + expected_contexts = [ self.build_context('generate', 'start'), self.load_context(), @@ -830,10 +970,16 @@ def 
test__postgres_event_tracking_catalog_generate(self): self.build_context('generate', 'end', result_type='ok') ] - test_result = self.run_event_test( + test_result_A = self.run_event_test( ["docs", "generate"], - expected_calls, + expected_calls_A, expected_contexts ) - self.assertTrue(test_result) + test_result_B = self.run_event_test( + ["docs", "generate"], + expected_calls_B, + expected_contexts + ) + + self.assertTrue(test_result_A or test_result_B) From 2efe31fecc5ccc4216bae06486f08b9f75dfc762 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Fri, 9 Jul 2021 17:10:08 -0400 Subject: [PATCH 2/2] Update changelog [skip ci] --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fc680b9378..ee9e54b4007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ Contributors: - Update dbt logo and links ([docs#197](https://github.com/fishtown-analytics/dbt-docs/issues/197)) +### Under the hood + +- Add tracking for experimental parser accuracy ([3503](https://github.com/dbt-labs/dbt/pull/3503), [3553](https://github.com/dbt-labs/dbt/pull/3553)) + ## dbt 0.20.0rc2 (June 30, 2021) ### Fixes @@ -43,7 +47,6 @@ Contributors: ### Under the hood -- Add tracking for experimental parser accuracy ([3553](https://github.com/dbt-labs/dbt/pull/3553)) - Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497) - Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461)) - Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))