From 21a7b716574453137f6b74e7c71fbd68d99e053b Mon Sep 17 00:00:00 2001
From: Nathaniel May <nathaniel.may@fishtownanalytics.com>
Date: Wed, 20 Oct 2021 12:38:16 -0400
Subject: [PATCH] Fix tracking bug for jinja sampling (#4048)

fix jinja sampling for static parser
---
 core/dbt/parser/models.py | 258 +++++++++++++++++++++-----------------
 1 file changed, 142 insertions(+), 116 deletions(-)

diff --git a/core/dbt/parser/models.py b/core/dbt/parser/models.py
index 4e9de494d7e..59d17f851ab 100644
--- a/core/dbt/parser/models.py
+++ b/core/dbt/parser/models.py
@@ -12,7 +12,7 @@
 from functools import reduce
 from itertools import chain
 import random
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 
 class ModelParser(SimpleSQLParser[ParsedModelNode]):
@@ -63,14 +63,34 @@ def render_update(
         # top-level declaration of variables
         statically_parsed: Optional[Union[str, Dict[str, List[Any]]]] = None
         experimental_sample: Optional[Union[str, Dict[str, List[Any]]]] = None
-        jinja_sample_node = None
-        jinja_sample_config = None
-        result = []
-
-        # sample the experimental parser during a normal run
-        if exp_sample:
+        exp_sample_node: Optional[ParsedModelNode] = None
+        exp_sample_config: Optional[ContextConfig] = None
+        jinja_sample_node: Optional[ParsedModelNode] = None
+        jinja_sample_config: Optional[ContextConfig] = None
+        result: List[str] = []
+
+        # sample the experimental parser only during a normal run
+        if exp_sample and not flags.USE_EXPERIMENTAL_PARSER:
             logger.debug(f"1610: conducting experimental parser sample on {node.path}")
             experimental_sample = self.run_experimental_parser(node)
+            # if the experimental parser succeeded, make a full copy of model parser
+            # and populate _everything_ into it so it can be compared apples-to-apples
+            # with a fully jinja-rendered project. This is necessary because the experimental
+            # parser will likely add features that the existing static parser will fail on
+            # so comparing those directly would give us bad results. The comparison will be
+            # conducted after this model has been fully rendered either by the static parser
+            # or by full jinja rendering
+            if isinstance(experimental_sample, dict):
+                model_parser_copy = self.partial_deepcopy()
+                exp_sample_node = deepcopy(node)
+                exp_sample_config = deepcopy(config)
+                model_parser_copy.populate(
+                    exp_sample_node,
+                    exp_sample_config,
+                    experimental_sample['refs'],
+                    experimental_sample['sources'],
+                    dict(experimental_sample['configs'])
+                )
         # use the experimental parser exclusively if the flag is on
         if flags.USE_EXPERIMENTAL_PARSER:
             statically_parsed = self.run_experimental_parser(node)
@@ -80,58 +100,66 @@ def render_update(
 
         # if the static parser succeeded, extract some data in easy-to-compare formats
         if isinstance(statically_parsed, dict):
-            config_call_dict = _get_config_call_dict(statically_parsed)
-
-            # since it doesn't need python jinja, fit the refs, sources, and configs
-            # into the node. Down the line the rest of the node will be updated with
-            # this information. (e.g. depends_on etc.)
-            config._config_call_dict = config_call_dict
-
-            # this uses the updated config to set all the right things in the node.
-            # if there are hooks present, it WILL render jinja. Will need to change
-            # when the experimental parser supports hooks
-            self.update_parsed_node_config(node, config)
-
-            # update the unrendered config with values from the file.
-            # values from yaml files are in there already
-            node.unrendered_config.update(dict(statically_parsed['configs']))
-
-            # set refs and sources on the node object
-            node.refs += statically_parsed['refs']
-            node.sources += statically_parsed['sources']
-
-            # configs don't need to be merged into the node
-            # setting them in config._config_call_dict is sufficient
-
-            self.manifest._parsing_info.static_analysis_parsed_path_count += 1
-
             # only sample jinja for the purpose of comparing with the stable static parser
             # if we know we don't need to fall back to jinja (i.e. - nothing to compare
             # with jinja v jinja).
             # This means we skip sampling for 40% of the 1/5000 samples. We could run the
             # sampling rng here, but the effect would be the same since we would only roll
             # it 40% of the time. So I've opted to keep all the rng code colocated above.
-            if stable_sample \
-                    and jinja_sample_node is not None \
-                    and jinja_sample_config is not None:
+            if stable_sample and not flags.USE_EXPERIMENTAL_PARSER:
                 logger.debug(f"1611: conducting full jinja rendering sample on {node.path}")
-                # TODO are these deep copies this too expensive?
-                # TODO does this even mutate anything in `self`???
-                model_parser_copy = deepcopy(self)
+                # if this will _never_ mutate anything `self` we could avoid these deep copies,
+                # but we can't really guarantee that going forward.
+                model_parser_copy = self.partial_deepcopy()
                 jinja_sample_node = deepcopy(node)
                 jinja_sample_config = deepcopy(config)
                 # rendering mutates the node and the config
                 super(ModelParser, model_parser_copy) \
                     .render_update(jinja_sample_node, jinja_sample_config)
-                # type-level branching to avoid Optional parameters in the
-                # `_get_stable_sample_result` type signature
-                if jinja_sample_node is not None and jinja_sample_config is not None:
-                    result.extend(_get_stable_sample_result(
-                        jinja_sample_node,
-                        jinja_sample_config,
-                        node,
-                        config
-                    ))
+
+            # manually fit configs in
+            config._config_call_dict = _get_config_call_dict(statically_parsed)
+
+            # update the unrendered config with values from the static parser.
+            # values from yaml files are in there already
+            self.populate(
+                node,
+                config,
+                statically_parsed['refs'],
+                statically_parsed['sources'],
+                dict(statically_parsed['configs'])
+            )
+
+            # if we took a jinja sample, compare now that the base node has been populated
+            if jinja_sample_node is not None and jinja_sample_config is not None:
+                result = _get_stable_sample_result(
+                    jinja_sample_node,
+                    jinja_sample_config,
+                    node,
+                    config
+                )
+
+            # if we took an experimental sample, compare now that the base node has been populated
+            if exp_sample_node is not None and exp_sample_config is not None:
+                result = _get_exp_sample_result(
+                    exp_sample_node,
+                    exp_sample_config,
+                    node,
+                    config,
+                )
+
+            self.manifest._parsing_info.static_analysis_parsed_path_count += 1
+        # if the static parser failed, add the correct messages for tracking
+        elif isinstance(statically_parsed, str):
+            if statically_parsed == "cannot_parse":
+                result += ["01_stable_parser_cannot_parse"]
+            elif statically_parsed == "has_banned_macro":
+                result += ["08_has_banned_macro"]
+
+            super().render_update(node, config)
+            logger.debug(
+                f"1602: parser fallback to jinja rendering on {node.path}"
+            )
         # if the static parser didn't succeed, fall back to jinja
         else:
             # jinja rendering
@@ -140,14 +168,6 @@ def render_update(
                 f"1602: parser fallback to jinja rendering on {node.path}"
             )
 
-        # if we're sampling the experimental parser, compare for correctness
-        if exp_sample:
-            result.extend(_get_exp_sample_result(
-                experimental_sample,
-                config_call_dict,
-                node,
-                config
-            ))
         # only send the tracking event if there is at least one result code
         if result:
             # fire a tracking event. this fires one event for every sample
@@ -248,6 +268,40 @@ def _has_banned_macro(
             False
         )
 
+    # this method updates the model note rendered and unrendered config as well
+    # as the node object. Used to populate these values when circumventing jinja
+    # rendering like the static parser.
+    def populate(
+        self,
+        node: ParsedModelNode,
+        config: ContextConfig,
+        refs: List[List[str]],
+        sources: List[List[str]],
+        configs: Dict[str, Any]
+    ):
+        # if there are hooks present this, it WILL render jinja. Will need to change
+        # when the experimental parser supports hooks
+        self.update_parsed_node_config(node, config)
+
+        # update the unrendered config with values from the file.
+        # values from yaml files are in there already
+        node.unrendered_config.update(configs)
+
+        # set refs and sources on the node object
+        node.refs += refs
+        node.sources += sources
+
+        # configs don't need to be merged into the node because they
+        # are read from config._config_call_dict
+
+    # the manifest is often huge so this method avoids deepcopying it
+    def partial_deepcopy(self):
+        return ModelParser(
+            deepcopy(self.project),
+            self.manifest,
+            deepcopy(self.root_project)
+        )
+
 
 # pure function. safe to use elsewhere, but unlikely to be useful outside this file.
 def _get_config_call_dict(
@@ -277,63 +331,18 @@ def _shift_sources(
 
 # returns a list of string codes to be sent as a tracking event
 def _get_exp_sample_result(
-    sample_output: Optional[Union[str, Dict[str, Any]]],
-    config_call_dict: Dict[str, Any],
+    sample_node: ParsedModelNode,
+    sample_config: ContextConfig,
     node: ParsedModelNode,
     config: ContextConfig
 ) -> List[str]:
-    result: List[str] = []
-    # experimental parser didn't run
-    if sample_output is None:
-        result += ["09_experimental_parser_skipped"]
-    # experimental parser couldn't parse
-    elif isinstance(sample_output, str):
-        if sample_output == "cannot_parse":
-            result += ["01_experimental_parser_cannot_parse"]
-        elif sample_output == "has_banned_macro":
-            result += ["08_has_banned_macro"]
-    else:
-        # look for false positive configs
-        for k in config_call_dict.keys():
-            if k not in config._config_call_dict:
-                result += ["02_false_positive_config_value"]
-                break
-
-        # look for missed configs
-        for k in config._config_call_dict.keys():
-            if k not in config_call_dict:
-                result += ["03_missed_config_value"]
-                break
-
-        # look for false positive sources
-        for s in sample_output['sources']:
-            if s not in node.sources:
-                result += ["04_false_positive_source_value"]
-                break
-
-        # look for missed sources
-        for s in node.sources:
-            if s not in sample_output['sources']:
-                result += ["05_missed_source_value"]
-                break
-
-        # look for false positive refs
-        for r in sample_output['refs']:
-            if r not in node.refs:
-                result += ["06_false_positive_ref_value"]
-                break
-
-        # look for missed refs
-        for r in node.refs:
-            if r not in sample_output['refs']:
-                result += ["07_missed_ref_value"]
-                break
-
-        # if there are no errors, return a success value
-        if not result:
-            result = ["00_exact_match"]
+    result: List[Tuple[int, str]] = _get_sample_result(sample_node, sample_config, node, config)
 
-    return result
+    def process(codemsg):
+        code, msg = codemsg
+        return f"0{code}_experimental_{msg}"
+
+    return list(map(process, result))
 
 
 # returns a list of string codes to be sent as a tracking event
@@ -343,45 +352,62 @@ def _get_stable_sample_result(
     node: ParsedModelNode,
     config: ContextConfig
 ) -> List[str]:
-    result: List[str] = []
+    result: List[Tuple[int, str]] = _get_sample_result(sample_node, sample_config, node, config)
+
+    def process(codemsg):
+        code, msg = codemsg
+        return f"8{code}_stable_{msg}"
+
+    return list(map(process, result))
+
+
+# returns a list of string codes that need a single digit prefix to be prepended
+# before being sent as a tracking event
+def _get_sample_result(
+    sample_node: ParsedModelNode,
+    sample_config: ContextConfig,
+    node: ParsedModelNode,
+    config: ContextConfig
+) -> List[Tuple[int, str]]:
+    result: List[Tuple[int, str]] = []
     # look for false positive configs
     for k in config._config_call_dict:
         if k not in config._config_call_dict:
-            result += ["82_stable_false_positive_config_value"]
+            result += [(2, "false_positive_config_value")]
             break
 
     # look for missed configs
     for k in config._config_call_dict.keys():
         if k not in sample_config._config_call_dict.keys():
-            result += ["83_stable_missed_config_value"]
+            result += [(3, "missed_config_value")]
             break
 
     # look for false positive sources
     for s in sample_node.sources:
         if s not in node.sources:
-            result += ["84_sample_false_positive_source_value"]
+            result += [(4, "false_positive_source_value")]
             break
 
     # look for missed sources
     for s in node.sources:
         if s not in sample_node.sources:
-            result += ["85_sample_missed_source_value"]
+            result += [(5, "missed_source_value")]
             break
 
     # look for false positive refs
     for r in sample_node.refs:
         if r not in node.refs:
-            result += ["86_sample_false_positive_ref_value"]
+            result += [(6, "false_positive_ref_value")]
             break
 
     # look for missed refs
     for r in node.refs:
         if r not in sample_node.refs:
-            result += ["87_stable_missed_ref_value"]
+            result += [(7, "missed_ref_value")]
             break
 
     # if there are no errors, return a success value
     if not result:
-        result = ["80_stable_exact_match"]
+        result = [(0, "exact_match")]
 
     return result