Skip to content

Commit

Permalink
add experimental-parser tracking
Browse files Browse the repository at this point in the history
  • Loading branch information
Nathaniel May committed Jul 9, 2021
1 parent fb91bad commit fc38f9e
Show file tree
Hide file tree
Showing 3 changed files with 306 additions and 65 deletions.
147 changes: 114 additions & 33 deletions core/dbt/parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
import dbt.tracking as tracking
from dbt import utils
from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore
import itertools
import random
from typing import Any, Dict, List, Tuple


class ModelParser(SimpleSQLParser[ParsedModelNode]):
Expand All @@ -26,46 +31,122 @@ def render_update(
) -> None:
self.manifest._parsing_info.static_analysis_path_count += 1

# `True` roughly 1/100 times this function is called
sample: bool = random.randint(1, 101) == 100

# run the experimental parser if the flag is on or if we're sampling
if flags.USE_EXPERIMENTAL_PARSER or sample:
try:
experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql)

# second config format
config_calls: List[Dict[str, str]] = []
for c in experimentally_parsed['configs']:
config_calls.append({c[0]: c[1]})

# format sources TODO change extractor to match this type
source_calls: List[List[str]] = []
for s in experimentally_parsed['sources']:
source_calls.append([s[0], s[1]])
experimentally_parsed['sources'] = source_calls

except ExtractionError as e:
experimentally_parsed = e

# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
# normal rendering
super().render_update(node, config)
# if we're sampling, compare for correctness
if sample:
result: List[str] = []
# experimental parser couldn't parse
if isinstance(experimentally_parsed, Exception):
result += ["01_experimental_parser_cannot_parse"]
else:
# rearrange existing configs to match:
real_configs: List[Tuple[str, Any]] = list(
itertools.chain.from_iterable(
map(lambda x: x.items(), config._config_calls)
)
)

# if the --use-experimental-parser flag was set
else:
try:
# run dbt jinja extractor (powered by tree-sitter + rust)
# throws an exception if it can't parse the source
res = py_extract_from_source(node.raw_sql)

# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config_calls = []
for c in res['configs']:
config_calls.append({c[0]: c[1]})
# look for false positive configs
for c in experimentally_parsed['configs']:
if c not in real_configs:
result += ["02_false_positive_config_value"]
break

config._config_calls = config_calls
# look for missed configs
for c in real_configs:
if c not in experimentally_parsed['configs']:
result += ["03_missed_config_value"]
break

# this uses the updated config to set all the right things in the node
# if there are hooks present, it WILL render jinja. Will need to change
# when we support hooks
self.update_parsed_node(node, config)
# look for false positive sources
for s in experimentally_parsed['sources']:
if s not in node.sources:
result += ["04_false_positive_source_value"]
break

# udpate the unrendered config with values from the file
# values from yaml files are in there already
node.unrendered_config.update(dict(res['configs']))
# look for missed sources
for s in node.sources:
if s not in experimentally_parsed['sources']:
result += ["05_missed_source_value"]
break

# set refs, sources, and configs on the node object
node.refs = node.refs + res['refs']
for sourcev in res['sources']:
# TODO change extractor to match type here
node.sources.append([sourcev[0], sourcev[1]])
for configv in res['configs']:
node.config[configv[0]] = configv[1]
# look for false positive refs
for r in experimentally_parsed['refs']:
if r not in node.refs:
result += ["06_false_positive_ref_value"]
break

self.manifest._parsing_info.static_analysis_parsed_path_count += 1
# look for missed refs
for r in node.refs:
if r not in experimentally_parsed['refs']:
result += ["07_missed_ref_value"]
break

# exception was thrown by dbt jinja extractor meaning it can't
# handle this source. fall back to python jinja rendering.
except ExtractionError:
super().render_update(node, config)
# if there are no errors, return a success value
if not result:
result = ["00_exact_match"]

# fire a tracking event. this fires one event for every sample
# so that we have data on a per file basis. Not only can we expect
# no false positives or misses, we can expect the number model
# files parseable by the experimental parser to match our internal
# testing.
tracking.track_experimental_parser_sample({
"project_id": self.root_project.hashed_name(),
"file_id": utils.get_hash(node),
"status": result
})

# if the --use-experimental-parser flag was set, and the experimental parser succeeded
elif not isinstance(experimentally_parsed, Exception):
# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config._config_calls = config_calls

# this uses the updated config to set all the right things in the node.
# if there are hooks present, it WILL render jinja. Will need to change
# when the experimental parser supports hooks
self.update_parsed_node(node, config)

# update the unrendered config with values from the file.
# values from yaml files are in there already
node.unrendered_config.update(dict(experimentally_parsed['configs']))

# set refs, sources, and configs on the node object
node.refs += experimentally_parsed['refs']
node.sources += experimentally_parsed['sources']
for configv in experimentally_parsed['configs']:
node.config[configv[0]] = configv[1]

self.manifest._parsing_info.static_analysis_parsed_path_count += 1

# the experimental parser tried and failed on this model.
# fall back to python jinja rendering.
else:
super().render_update(node, config)
16 changes: 15 additions & 1 deletion core/dbt/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'

EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0'
DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'


Expand Down Expand Up @@ -423,6 +423,20 @@ def track_invalid_invocation(
)


def track_experimental_parser_sample(options):
context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)]
assert active_user is not None, \
'Cannot track project loading time when active user is None'

track(
active_user,
category='dbt',
action='experimental_parser',
label=active_user.invocation_id,
context=context
)


def flush():
logger.debug("Flushing usage events")
tracker.flush()
Expand Down
Loading

0 comments on commit fc38f9e

Please sign in to comment.