Update project load tracking to include experimental parser info (#3495)

* Fix docs generation for cross-db sources in REDSHIFT RA3 node (#3408)
* Fix docs generating for cross-db sources
* Code reorganization
* Code adjustments according to flake8
* Error message adjusted to be more precise
* CHANGELOG update
* add static analysis info to parsing data
* update changelog
* don't use `meta`! need better separation between dbt internal objects and external facing data. hacked an internal field on the manifest to save off this parsing info for the time being
* fix partial parsing case

Co-authored-by: kostek-pl <[email protected]>
dbt-labs · Jun 28, 2021 · 4d24656
1 parent 41610b8
commit 4d24656
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## dbt 0.21.0
+## dbt 0.21.0 (Release TBD)
 
 ### Features
 
@@ -10,6 +10,7 @@
 - Add optional `sslcert`, `sslkey`, and `sslrootcert` profile arguments to the Postgres connector. ([#3472](https://github.com/fishtown-analytics/dbt/pull/3472), [#3473](https://github.com/fishtown-analytics/dbt/pull/3473))
 - Move the example project used by `dbt init` into `dbt` repository, to avoid cloning an external repo ([#3005](https://github.com/fishtown-analytics/dbt/pull/3005), [#3474](https://github.com/fishtown-analytics/dbt/pull/3474))
 - Better interaction between `dbt init` and adapters. Avoid raising errors while initializing a project ([#2814](https://github.com/fishtown-analytics/dbt/pull/2814), [#3483](https://github.com/fishtown-analytics/dbt/pull/3483))
+- Update project loading event data to include experimental parser information. ([#3438](https://github.com/fishtown-analytics/dbt/issues/3438), [#3495](https://github.com/fishtown-analytics/dbt/pull/3495))
 
 Contributors:
 - [@kostek-pl](https://github.com/kostek-pl) ([#3236](https://github.com/fishtown-analytics/dbt/pull/3408))

diff --git a/core/dbt/contracts/graph/manifest.py b/core/dbt/contracts/graph/manifest.py
@@ -525,6 +525,12 @@ def _find_macros_by_name(
         return candidates
 
 
+@dataclass
+class ParsingInfo:
+    static_analysis_parsed_path_count: int = 0
+    static_analysis_path_count: int = 0
+
+
 @dataclass
 class ManifestStateCheck(dbtClassMixin):
     vars_hash: FileHash = field(default_factory=FileHash.empty)
@@ -566,6 +572,10 @@ class Manifest(MacroMethods, DataClassMessagePackMixin, dbtClassMixin):
     _analysis_lookup: Optional[AnalysisLookup] = field(
         default=None, metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
     )
+    _parsing_info: ParsingInfo = field(
+        default_factory=ParsingInfo,
+        metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
+    )
     _lock: Lock = field(
         default_factory=flags.MP_CONTEXT.Lock,
         metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}

diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py
@@ -31,7 +31,7 @@
 from dbt.parser.partial import PartialParsing
 from dbt.contracts.graph.compiled import ManifestNode
 from dbt.contracts.graph.manifest import (
-    Manifest, Disabled, MacroManifest, ManifestStateCheck
+    Manifest, Disabled, MacroManifest, ManifestStateCheck, ParsingInfo
 )
 from dbt.contracts.graph.parsed import (
     ParsedSourceDefinition, ParsedNode, ParsedMacro, ColumnInfo, ParsedExposure
@@ -71,7 +71,7 @@
 class ParserInfo(dbtClassMixin):
     parser: str
     elapsed: float
-    path_count: int = 0
+    parsed_path_count: int = 0
 
 
 # Part of saved performance info
@@ -80,14 +80,18 @@ class ProjectLoaderInfo(dbtClassMixin):
     project_name: str
     elapsed: float
     parsers: List[ParserInfo] = field(default_factory=list)
-    path_count: int = 0
+    parsed_path_count: int = 0
 
 
 # Part of saved performance info
 @dataclass
 class ManifestLoaderInfo(dbtClassMixin, Writable):
     path_count: int = 0
+    parsed_path_count: int = 0
+    static_analysis_path_count: int = 0
+    static_analysis_parsed_path_count: int = 0
     is_partial_parse_enabled: Optional[bool] = None
+    is_static_analysis_enabled: Optional[bool] = None
     read_files_elapsed: Optional[float] = None
     load_macros_elapsed: Optional[float] = None
     parse_project_elapsed: Optional[float] = None
@@ -135,8 +139,6 @@ def __init__(
         # have been enabled, but not happening because of some issue.
         self.partially_parsing = False
 
-        self._perf_info = self.build_perf_info()
-
         # This is a saved manifest from a previous run that's used for partial parsing
         self.saved_manifest: Optional[Manifest] = self.read_manifest_for_partial_parse()
 
@@ -184,7 +186,6 @@ def get_full_manifest(
 
     # This is where the main action happens
     def load(self):
-
         # Read files creates a dictionary of projects to a dictionary
         # of parsers to lists of file strings. The file strings are
         # used to get the SourceFiles from the manifest files.
@@ -196,6 +197,7 @@ def load(self):
         project_parser_files = {}
         for project in self.all_projects.values():
             read_files(project, self.manifest.files, project_parser_files)
+        self._perf_info.path_count = len(self.manifest.files)
         self._perf_info.read_files_elapsed = (time.perf_counter() - start_read_files)
 
         skip_parsing = False
@@ -208,13 +210,15 @@ def load(self):
                 # files are different, we need to create a new set of
                 # project_parser_files.
                 project_parser_files = partial_parsing.get_parsing_files()
-                self.manifest = self.saved_manifest
                 self.partially_parsing = True
 
-        if skip_parsing:
-            logger.info("Partial parsing enabled, no changes found, skipping parsing")
             self.manifest = self.saved_manifest
 
+        if self.manifest._parsing_info is None:
+            self.manifest._parsing_info = ParsingInfo()
+
+        if skip_parsing:
+            logger.info("Partial parsing enabled, no changes found, skipping parsing")
         else:
             # Load Macros
             # We need to parse the macros first, so they're resolvable when
@@ -230,6 +234,8 @@ def load(self):
                 for file_id in parser_files['MacroParser']:
                     block = FileBlock(self.manifest.files[file_id])
                     parser.parse_file(block)
+                    # increment parsed path count for performance tracking
+                    self._perf_info.parsed_path_count = self._perf_info.parsed_path_count + 1
             # Look at changed macros and update the macro.depends_on.macros
             self.macro_depends_on()
             self._perf_info.load_macros_elapsed = (time.perf_counter() - start_load_macros)
@@ -301,9 +307,17 @@ def load(self):
             self.process_sources(self.root_project.project_name)
             self.process_refs(self.root_project.project_name)
             self.process_docs(self.root_project)
+
+            # update tracking data
             self._perf_info.process_manifest_elapsed = (
                 time.perf_counter() - start_process
             )
+            self._perf_info.static_analysis_parsed_path_count = (
+                self.manifest._parsing_info.static_analysis_parsed_path_count
+            )
+            self._perf_info.static_analysis_path_count = (
+                self.manifest._parsing_info.static_analysis_path_count
+            )
 
             # write out the fully parsed manifest
             self.write_manifest_for_partial_parse()
@@ -321,7 +335,7 @@ def parse_project(
 
         project_loader_info = self._perf_info._project_index[project.project_name]
         start_timer = time.perf_counter()
-        total_path_count = 0
+        total_parsed_path_count = 0
 
         # Loop through parsers with loaded files.
         for parser_cls in parser_types:
@@ -331,7 +345,7 @@ def parse_project(
                 continue
 
             # Initialize timing info
-            parser_path_count = 0
+            project_parsed_path_count = 0
             parser_start_timer = time.perf_counter()
 
             # Parse the project files for this parser
@@ -347,15 +361,15 @@ def parse_project(
                     parser.parse_file(block, dct=dct)
                 else:
                     parser.parse_file(block)
-                parser_path_count = parser_path_count + 1
+                project_parsed_path_count = project_parsed_path_count + 1
 
             # Save timing info
             project_loader_info.parsers.append(ParserInfo(
                 parser=parser.resource_type,
-                path_count=parser_path_count,
+                parsed_path_count=project_parsed_path_count,
                 elapsed=time.perf_counter() - parser_start_timer
             ))
-            total_path_count = total_path_count + parser_path_count
+            total_parsed_path_count = total_parsed_path_count + project_parsed_path_count
 
         # HookParser doesn't run from loaded files, just dbt_project.yml,
         # so do separately
@@ -372,10 +386,12 @@ def parse_project(
 
         # Store the performance info
         elapsed = time.perf_counter() - start_timer
-        project_loader_info.path_count = project_loader_info.path_count + total_path_count
+        project_loader_info.parsed_path_count = (
+            project_loader_info.parsed_path_count + total_parsed_path_count
+        )
         project_loader_info.elapsed = project_loader_info.elapsed + elapsed
-        self._perf_info.path_count = (
-            self._perf_info.path_count + total_path_count
+        self._perf_info.parsed_path_count = (
+            self._perf_info.parsed_path_count + total_parsed_path_count
         )
 
     # Loop through macros in the manifest and statically parse
@@ -501,12 +517,12 @@ def read_manifest_for_partial_parse(self) -> Optional[Manifest]:
 
     def build_perf_info(self):
         mli = ManifestLoaderInfo(
-            is_partial_parse_enabled=self._partial_parse_enabled()
+            is_partial_parse_enabled=self._partial_parse_enabled(),
+            is_static_analysis_enabled=flags.USE_EXPERIMENTAL_PARSER
         )
         for project in self.all_projects.values():
             project_info = ProjectLoaderInfo(
                 project_name=project.project_name,
-                path_count=0,
                 elapsed=0,
             )
             mli.projects.append(project_info)
@@ -603,6 +619,7 @@ def track_project_load(self):
             "invocation_id": invocation_id,
             "project_id": self.root_project.hashed_name(),
             "path_count": self._perf_info.path_count,
+            "parsed_path_count": self._perf_info.parsed_path_count,
             "read_files_elapsed": self._perf_info.read_files_elapsed,
             "load_macros_elapsed": self._perf_info.load_macros_elapsed,
             "parse_project_elapsed": self._perf_info.parse_project_elapsed,
@@ -614,6 +631,9 @@ def track_project_load(self):
             "is_partial_parse_enabled": (
                 self._perf_info.is_partial_parse_enabled
             ),
+            "is_static_analysis_enabled": self._perf_info.is_static_analysis_enabled,
+            "static_analysis_path_count": self._perf_info.static_analysis_path_count,
+            "static_analysis_parsed_path_count": self._perf_info.static_analysis_parsed_path_count,
         })
 
     # Takes references in 'refs' array of nodes and exposures, finds the target

diff --git a/core/dbt/parser/models.py b/core/dbt/parser/models.py
@@ -2,7 +2,7 @@
 from dbt.contracts.graph.parsed import ParsedModelNode
 import dbt.flags as flags
 from dbt.node_types import NodeType
-from dbt.parser.base import IntermediateNode, SimpleSQLParser
+from dbt.parser.base import SimpleSQLParser
 from dbt.parser.search import FileBlock
 from dbt.tree_sitter_jinja.extractor import extract_from_source
 
@@ -22,8 +22,9 @@ def get_compiled_path(cls, block: FileBlock):
         return block.path.relative_path
 
     def render_update(
-        self, node: IntermediateNode, config: ContextConfig
+        self, node: ParsedModelNode, config: ContextConfig
     ) -> None:
+        self.manifest._parsing_info.static_analysis_path_count += 1
 
         # normal dbt run
         if not flags.USE_EXPERIMENTAL_PARSER:
@@ -63,5 +64,7 @@ def render_update(
                 for configv in res['configs']:
                     node.config[configv[0]] = configv[1]
 
+                self.manifest._parsing_info.static_analysis_parsed_path_count += 1
+
             else:
                 super().render_update(node, config)
diff --git a/core/dbt/parser/schemas.py b/core/dbt/parser/schemas.py
@@ -98,10 +98,10 @@ def error_context(
 
 def yaml_from_file(
     source_file: SchemaSourceFile
-) -> Optional[Dict[str, Any]]:
+) -> Dict[str, Any]:
     """If loading the yaml fails, raise an exception.
     """
-    path: str = source_file.path.relative_path
+    path = source_file.path.relative_path
     try:
         return load_yaml_text(source_file.contents)
     except ValidationException as e:
@@ -110,7 +110,6 @@ def yaml_from_file(
             'Error reading {}: {} - {}'
             .format(source_file.project_name, path, reason)
         )
-    return None
 
 
 class ParserRef:

diff --git a/core/dbt/tracking.py b/core/dbt/tracking.py
@@ -28,7 +28,7 @@
 PACKAGE_INSTALL_SPEC = 'iglu:com.dbt/package_install/jsonschema/1-0-0'
 RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
 DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
-LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2'
+LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
 RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'
 
 DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'

diff --git a/test/integration/033_event_tracking_test/test_events.py b/test/integration/033_event_tracking_test/test_events.py
@@ -92,12 +92,16 @@ def load_context(self):
 
         def populate(project_id, user_id, invocation_id, version):
             return [{
-                'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2',
+                'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3',
                 'data': {
                     'invocation_id': invocation_id,
                     'project_id': project_id,
+                    'parsed_path_count': ANY,
                     'path_count': ANY,
                     'is_partial_parse_enabled': ANY,
+                    'is_static_analysis_enabled': ANY,
+                    'static_analysis_path_count': ANY,
+                    'static_analysis_parsed_path_count': ANY,
                     'load_all_elapsed': ANY,
                     'read_files_elapsed': ANY,
                     'load_macros_elapsed': ANY,