Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add experimental parser tracking #3553

Merged
merged 2 commits into from
Jul 9, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Contributors:

### Under the hood

- Add tracking for experimental parser accuracy ([#3553](https://github.com/dbt-labs/dbt/pull/3553))
- Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497)
- Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461))
- Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))
Expand Down
147 changes: 114 additions & 33 deletions core/dbt/parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
import dbt.tracking as tracking
from dbt import utils
from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore
import itertools
import random
from typing import Any, Dict, List, Tuple


class ModelParser(SimpleSQLParser[ParsedModelNode]):
Expand All @@ -26,46 +31,122 @@ def render_update(
) -> None:
self.manifest._parsing_info.static_analysis_path_count += 1

# `True` roughly 1/101 times this function is called
# NOTE(review): `random.randint(1, 101)` draws from 101 values inclusive;
# for an exact 1-in-100 sample this should be `randint(1, 100) == 100` — confirm intent
sample: bool = random.randint(1, 101) == 100

# run the experimental parser if the flag is on or if we're sampling
if flags.USE_EXPERIMENTAL_PARSER or sample:
try:
experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql)

# second config format
config_calls: List[Dict[str, str]] = []
for c in experimentally_parsed['configs']:
config_calls.append({c[0]: c[1]})

# format sources TODO change extractor to match this type
source_calls: List[List[str]] = []
for s in experimentally_parsed['sources']:
source_calls.append([s[0], s[1]])
experimentally_parsed['sources'] = source_calls

except ExtractionError as e:
experimentally_parsed = e

# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
# normal rendering
super().render_update(node, config)
# if we're sampling, compare for correctness
if sample:
result: List[str] = []
# experimental parser couldn't parse
if isinstance(experimentally_parsed, Exception):
result += ["01_experimental_parser_cannot_parse"]
else:
# rearrange existing configs to match:
real_configs: List[Tuple[str, Any]] = list(
itertools.chain.from_iterable(
map(lambda x: x.items(), config._config_calls)
)
)

# if the --use-experimental-parser flag was set
else:
try:
# run dbt jinja extractor (powered by tree-sitter + rust)
# throws an exception if it can't parse the source
res = py_extract_from_source(node.raw_sql)

# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config_calls = []
for c in res['configs']:
config_calls.append({c[0]: c[1]})
# look for false positive configs
for c in experimentally_parsed['configs']:
if c not in real_configs:
result += ["02_false_positive_config_value"]
break

config._config_calls = config_calls
# look for missed configs
for c in real_configs:
if c not in experimentally_parsed['configs']:
result += ["03_missed_config_value"]
break

# this uses the updated config to set all the right things in the node
# if there are hooks present, it WILL render jinja. Will need to change
# when we support hooks
self.update_parsed_node(node, config)
# look for false positive sources
for s in experimentally_parsed['sources']:
if s not in node.sources:
result += ["04_false_positive_source_value"]
break

# update the unrendered config with values from the file
# values from yaml files are in there already
node.unrendered_config.update(dict(res['configs']))
# look for missed sources
for s in node.sources:
if s not in experimentally_parsed['sources']:
result += ["05_missed_source_value"]
break

# set refs, sources, and configs on the node object
node.refs = node.refs + res['refs']
for sourcev in res['sources']:
# TODO change extractor to match type here
node.sources.append([sourcev[0], sourcev[1]])
for configv in res['configs']:
node.config[configv[0]] = configv[1]
# look for false positive refs
for r in experimentally_parsed['refs']:
if r not in node.refs:
result += ["06_false_positive_ref_value"]
break

self.manifest._parsing_info.static_analysis_parsed_path_count += 1
# look for missed refs
for r in node.refs:
if r not in experimentally_parsed['refs']:
result += ["07_missed_ref_value"]
break

# exception was thrown by dbt jinja extractor meaning it can't
# handle this source. fall back to python jinja rendering.
except ExtractionError:
super().render_update(node, config)
# if there are no errors, return a success value
if not result:
result = ["00_exact_match"]

# fire a tracking event. this fires one event for every sample
# so that we have data on a per file basis. Not only can we expect
# no false positives or misses, we can expect the number model
# files parseable by the experimental parser to match our internal
# testing.
tracking.track_experimental_parser_sample({
"project_id": self.root_project.hashed_name(),
"file_id": utils.get_hash(node),
"status": result
})
Comment on lines +114 to +123
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


# if the --use-experimental-parser flag was set, and the experimental parser succeeded
elif not isinstance(experimentally_parsed, Exception):
# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config._config_calls = config_calls

# this uses the updated config to set all the right things in the node.
# if there are hooks present, it WILL render jinja. Will need to change
# when the experimental parser supports hooks
self.update_parsed_node(node, config)

# update the unrendered config with values from the file.
# values from yaml files are in there already
node.unrendered_config.update(dict(experimentally_parsed['configs']))

# set refs, sources, and configs on the node object
node.refs += experimentally_parsed['refs']
node.sources += experimentally_parsed['sources']
for configv in experimentally_parsed['configs']:
node.config[configv[0]] = configv[1]

self.manifest._parsing_info.static_analysis_parsed_path_count += 1

# the experimental parser tried and failed on this model.
# fall back to python jinja rendering.
else:
super().render_update(node, config)
16 changes: 15 additions & 1 deletion core/dbt/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'

EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0'
DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'


Expand Down Expand Up @@ -423,6 +423,20 @@ def track_invalid_invocation(
)


def track_experimental_parser_sample(options):
    """Fire one tracking event recording an experimental-parser accuracy sample.

    Called once per sampled model file so accuracy can be analyzed on a
    per-file basis.

    :param options: dict payload matching the EXPERIMENTAL_PARSER schema
        (expected keys per the caller: ``project_id``, ``file_id``, ``status``).
    """
    context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)]
    # Fixed copy-paste error: the original message referred to "project
    # loading time", which belongs to a different tracking helper.
    assert active_user is not None, \
        'Cannot track experimental parser info when active user is None'

    track(
        active_user,
        category='dbt',
        action='experimental_parser',
        label=active_user.invocation_id,
        context=context
    )


def flush():
logger.debug("Flushing usage events")
tracker.flush()
Expand Down
Loading