Skip to content

Commit

Permalink
Update project load tracking to include experimental parser info (#3495)
Browse files Browse the repository at this point in the history
* Fix docs generation for cross-db sources in REDSHIFT RA3 node (#3408)

* Fix docs generation for cross-db sources

* Code reorganization

* Code adjustments according to flake8

* Error message adjusted to be more precise

* CHANGELOG update

* add static analysis info to parsing data

* update changelog

* Don't use `meta`! We need better separation between dbt-internal objects and external-facing data. For the time being, this hacks an internal field onto the manifest to save off the parsing info.

* fix partial parsing case

Co-authored-by: kostek-pl <[email protected]>
  • Loading branch information
Kyle Wigley and kostek-pl authored Jun 28, 2021
1 parent 41610b8 commit 4d24656
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 27 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## dbt 0.21.0
## dbt 0.21.0 (Release TBD)

### Features

Expand All @@ -10,6 +10,7 @@
- Add optional `sslcert`, `sslkey`, and `sslrootcert` profile arguments to the Postgres connector. ([#3472](https://github.com/fishtown-analytics/dbt/pull/3472), [#3473](https://github.com/fishtown-analytics/dbt/pull/3473))
- Move the example project used by `dbt init` into `dbt` repository, to avoid cloning an external repo ([#3005](https://github.com/fishtown-analytics/dbt/pull/3005), [#3474](https://github.com/fishtown-analytics/dbt/pull/3474))
- Better interaction between `dbt init` and adapters. Avoid raising errors while initializing a project ([#2814](https://github.com/fishtown-analytics/dbt/pull/2814), [#3483](https://github.com/fishtown-analytics/dbt/pull/3483))
- Update project loading event data to include experimental parser information. ([#3438](https://github.com/fishtown-analytics/dbt/issues/3438), [#3495](https://github.com/fishtown-analytics/dbt/pull/3495))

Contributors:
- [@kostek-pl](https://github.com/kostek-pl) ([#3408](https://github.com/fishtown-analytics/dbt/pull/3408))
Expand Down
10 changes: 10 additions & 0 deletions core/dbt/contracts/graph/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,12 @@ def _find_macros_by_name(
return candidates


# Counters describing experimental-parser (static analysis) activity for one
# parse. The manifest stores an instance in its `_parsing_info` field, and the
# manifest loader copies both values into its performance/tracking info.
@dataclass
class ParsingInfo:
# Incremented at the end of the experimental-parser branch of the model
# parser's `render_update`, i.e. once per model handled by static analysis.
static_analysis_parsed_path_count: int = 0
# Incremented at the top of the model parser's `render_update`, before the
# experimental-parser flag is checked, i.e. once per model node rendered.
static_analysis_path_count: int = 0


@dataclass
class ManifestStateCheck(dbtClassMixin):
vars_hash: FileHash = field(default_factory=FileHash.empty)
Expand Down Expand Up @@ -566,6 +572,10 @@ class Manifest(MacroMethods, DataClassMessagePackMixin, dbtClassMixin):
_analysis_lookup: Optional[AnalysisLookup] = field(
default=None, metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
)
_parsing_info: ParsingInfo = field(
default_factory=ParsingInfo,
metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
)
_lock: Lock = field(
default_factory=flags.MP_CONTEXT.Lock,
metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
Expand Down
58 changes: 39 additions & 19 deletions core/dbt/parser/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from dbt.parser.partial import PartialParsing
from dbt.contracts.graph.compiled import ManifestNode
from dbt.contracts.graph.manifest import (
Manifest, Disabled, MacroManifest, ManifestStateCheck
Manifest, Disabled, MacroManifest, ManifestStateCheck, ParsingInfo
)
from dbt.contracts.graph.parsed import (
ParsedSourceDefinition, ParsedNode, ParsedMacro, ColumnInfo, ParsedExposure
Expand Down Expand Up @@ -71,7 +71,7 @@
class ParserInfo(dbtClassMixin):
parser: str
elapsed: float
path_count: int = 0
parsed_path_count: int = 0


# Part of saved performance info
Expand All @@ -80,14 +80,18 @@ class ProjectLoaderInfo(dbtClassMixin):
project_name: str
elapsed: float
parsers: List[ParserInfo] = field(default_factory=list)
path_count: int = 0
parsed_path_count: int = 0


# Part of saved performance info
@dataclass
class ManifestLoaderInfo(dbtClassMixin, Writable):
path_count: int = 0
parsed_path_count: int = 0
static_analysis_path_count: int = 0
static_analysis_parsed_path_count: int = 0
is_partial_parse_enabled: Optional[bool] = None
is_static_analysis_enabled: Optional[bool] = None
read_files_elapsed: Optional[float] = None
load_macros_elapsed: Optional[float] = None
parse_project_elapsed: Optional[float] = None
Expand Down Expand Up @@ -135,8 +139,6 @@ def __init__(
# have been enabled, but not happening because of some issue.
self.partially_parsing = False

self._perf_info = self.build_perf_info()

# This is a saved manifest from a previous run that's used for partial parsing
self.saved_manifest: Optional[Manifest] = self.read_manifest_for_partial_parse()

Expand Down Expand Up @@ -184,7 +186,6 @@ def get_full_manifest(

# This is where the main action happens
def load(self):

# Read files creates a dictionary of projects to a dictionary
# of parsers to lists of file strings. The file strings are
# used to get the SourceFiles from the manifest files.
Expand All @@ -196,6 +197,7 @@ def load(self):
project_parser_files = {}
for project in self.all_projects.values():
read_files(project, self.manifest.files, project_parser_files)
self._perf_info.path_count = len(self.manifest.files)
self._perf_info.read_files_elapsed = (time.perf_counter() - start_read_files)

skip_parsing = False
Expand All @@ -208,13 +210,15 @@ def load(self):
# files are different, we need to create a new set of
# project_parser_files.
project_parser_files = partial_parsing.get_parsing_files()
self.manifest = self.saved_manifest
self.partially_parsing = True

if skip_parsing:
logger.info("Partial parsing enabled, no changes found, skipping parsing")
self.manifest = self.saved_manifest

if self.manifest._parsing_info is None:
self.manifest._parsing_info = ParsingInfo()

if skip_parsing:
logger.info("Partial parsing enabled, no changes found, skipping parsing")
else:
# Load Macros
# We need to parse the macros first, so they're resolvable when
Expand All @@ -230,6 +234,8 @@ def load(self):
for file_id in parser_files['MacroParser']:
block = FileBlock(self.manifest.files[file_id])
parser.parse_file(block)
# increment parsed path count for performance tracking
self._perf_info.parsed_path_count = self._perf_info.parsed_path_count + 1
# Look at changed macros and update the macro.depends_on.macros
self.macro_depends_on()
self._perf_info.load_macros_elapsed = (time.perf_counter() - start_load_macros)
Expand Down Expand Up @@ -301,9 +307,17 @@ def load(self):
self.process_sources(self.root_project.project_name)
self.process_refs(self.root_project.project_name)
self.process_docs(self.root_project)

# update tracking data
self._perf_info.process_manifest_elapsed = (
time.perf_counter() - start_process
)
self._perf_info.static_analysis_parsed_path_count = (
self.manifest._parsing_info.static_analysis_parsed_path_count
)
self._perf_info.static_analysis_path_count = (
self.manifest._parsing_info.static_analysis_path_count
)

# write out the fully parsed manifest
self.write_manifest_for_partial_parse()
Expand All @@ -321,7 +335,7 @@ def parse_project(

project_loader_info = self._perf_info._project_index[project.project_name]
start_timer = time.perf_counter()
total_path_count = 0
total_parsed_path_count = 0

# Loop through parsers with loaded files.
for parser_cls in parser_types:
Expand All @@ -331,7 +345,7 @@ def parse_project(
continue

# Initialize timing info
parser_path_count = 0
project_parsed_path_count = 0
parser_start_timer = time.perf_counter()

# Parse the project files for this parser
Expand All @@ -347,15 +361,15 @@ def parse_project(
parser.parse_file(block, dct=dct)
else:
parser.parse_file(block)
parser_path_count = parser_path_count + 1
project_parsed_path_count = project_parsed_path_count + 1

# Save timing info
project_loader_info.parsers.append(ParserInfo(
parser=parser.resource_type,
path_count=parser_path_count,
parsed_path_count=project_parsed_path_count,
elapsed=time.perf_counter() - parser_start_timer
))
total_path_count = total_path_count + parser_path_count
total_parsed_path_count = total_parsed_path_count + project_parsed_path_count

# HookParser doesn't run from loaded files, just dbt_project.yml,
# so do separately
Expand All @@ -372,10 +386,12 @@ def parse_project(

# Store the performance info
elapsed = time.perf_counter() - start_timer
project_loader_info.path_count = project_loader_info.path_count + total_path_count
project_loader_info.parsed_path_count = (
project_loader_info.parsed_path_count + total_parsed_path_count
)
project_loader_info.elapsed = project_loader_info.elapsed + elapsed
self._perf_info.path_count = (
self._perf_info.path_count + total_path_count
self._perf_info.parsed_path_count = (
self._perf_info.parsed_path_count + total_parsed_path_count
)

# Loop through macros in the manifest and statically parse
Expand Down Expand Up @@ -501,12 +517,12 @@ def read_manifest_for_partial_parse(self) -> Optional[Manifest]:

def build_perf_info(self):
mli = ManifestLoaderInfo(
is_partial_parse_enabled=self._partial_parse_enabled()
is_partial_parse_enabled=self._partial_parse_enabled(),
is_static_analysis_enabled=flags.USE_EXPERIMENTAL_PARSER
)
for project in self.all_projects.values():
project_info = ProjectLoaderInfo(
project_name=project.project_name,
path_count=0,
elapsed=0,
)
mli.projects.append(project_info)
Expand Down Expand Up @@ -603,6 +619,7 @@ def track_project_load(self):
"invocation_id": invocation_id,
"project_id": self.root_project.hashed_name(),
"path_count": self._perf_info.path_count,
"parsed_path_count": self._perf_info.parsed_path_count,
"read_files_elapsed": self._perf_info.read_files_elapsed,
"load_macros_elapsed": self._perf_info.load_macros_elapsed,
"parse_project_elapsed": self._perf_info.parse_project_elapsed,
Expand All @@ -614,6 +631,9 @@ def track_project_load(self):
"is_partial_parse_enabled": (
self._perf_info.is_partial_parse_enabled
),
"is_static_analysis_enabled": self._perf_info.is_static_analysis_enabled,
"static_analysis_path_count": self._perf_info.static_analysis_path_count,
"static_analysis_parsed_path_count": self._perf_info.static_analysis_parsed_path_count,
})

# Takes references in 'refs' array of nodes and exposures, finds the target
Expand Down
7 changes: 5 additions & 2 deletions core/dbt/parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dbt.contracts.graph.parsed import ParsedModelNode
import dbt.flags as flags
from dbt.node_types import NodeType
from dbt.parser.base import IntermediateNode, SimpleSQLParser
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
from dbt.tree_sitter_jinja.extractor import extract_from_source

Expand All @@ -22,8 +22,9 @@ def get_compiled_path(cls, block: FileBlock):
return block.path.relative_path

def render_update(
self, node: IntermediateNode, config: ContextConfig
self, node: ParsedModelNode, config: ContextConfig
) -> None:
self.manifest._parsing_info.static_analysis_path_count += 1

# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
Expand Down Expand Up @@ -63,5 +64,7 @@ def render_update(
for configv in res['configs']:
node.config[configv[0]] = configv[1]

self.manifest._parsing_info.static_analysis_parsed_path_count += 1

else:
super().render_update(node, config)
5 changes: 2 additions & 3 deletions core/dbt/parser/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def error_context(

def yaml_from_file(
source_file: SchemaSourceFile
) -> Optional[Dict[str, Any]]:
) -> Dict[str, Any]:
"""If loading the yaml fails, raise an exception.
"""
path: str = source_file.path.relative_path
path = source_file.path.relative_path
try:
return load_yaml_text(source_file.contents)
except ValidationException as e:
Expand All @@ -110,7 +110,6 @@ def yaml_from_file(
'Error reading {}: {} - {}'
.format(source_file.project_name, path, reason)
)
return None


class ParserRef:
Expand Down
2 changes: 1 addition & 1 deletion core/dbt/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
PACKAGE_INSTALL_SPEC = 'iglu:com.dbt/package_install/jsonschema/1-0-0'
RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'

DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'
Expand Down
6 changes: 5 additions & 1 deletion test/integration/033_event_tracking_test/test_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,16 @@ def load_context(self):

def populate(project_id, user_id, invocation_id, version):
return [{
'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2',
'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3',
'data': {
'invocation_id': invocation_id,
'project_id': project_id,
'parsed_path_count': ANY,
'path_count': ANY,
'is_partial_parse_enabled': ANY,
'is_static_analysis_enabled': ANY,
'static_analysis_path_count': ANY,
'static_analysis_parsed_path_count': ANY,
'load_all_elapsed': ANY,
'read_files_elapsed': ANY,
'load_macros_elapsed': ANY,
Expand Down

0 comments on commit 4d24656

Please sign in to comment.