dbt test --store-failures (#3316)

dbt-labs · May 27, 2021 · c0d757a · c0d757a
1 parent 0f018ea
commit c0d757a
Show file tree

Hide file tree

Showing 33 changed files with 365 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ### Features
 - Support optional `updated_at` config parameter with `check` strategy snapshots. If not supplied, will use current timestamp (default). ([#1844](https://github.com/fishtown-analytics/dbt/issues/1844), [#3376](https://github.com/fishtown-analytics/dbt/pull/3376))
 - Add the opt-in `--use-experimental-parser` flag ([#3307](https://github.com/fishtown-analytics/dbt/issues/3307))
+- Store test failures in the database ([#517](https://github.com/fishtown-analytics/dbt/issues/517), [#903](https://github.com/fishtown-analytics/dbt/issues/903), [#2593](https://github.com/fishtown-analytics/dbt/issues/2593), [#3316](https://github.com/fishtown-analytics/dbt/pull/3316))
 
 ### Fixes
 - Fix compiled sql for ephemeral models ([#3317](https://github.com/fishtown-analytics/dbt/issues/3317), [#3318](https://github.com/fishtown-analytics/dbt/pull/3318))

diff --git a/core/dbt/compilation.py b/core/dbt/compilation.py
@@ -182,8 +182,7 @@ def add_ephemeral_prefix(self, name: str):
 
     def _get_relation_name(self, node: ParsedNode):
         relation_name = None
-        if (node.resource_type in NodeType.refable() and
-                not node.is_ephemeral_model):
+        if node.is_relational and not node.is_ephemeral_model:
             adapter = get_adapter(self.config)
             relation_cls = adapter.Relation
             relation_name = str(relation_cls.create_from(self.config, node))

diff --git a/core/dbt/contracts/graph/compiled.py b/core/dbt/contracts/graph/compiled.py
@@ -120,10 +120,14 @@ class CompiledSchemaTestNode(CompiledNode, HasTestMetadata):
     config: TestConfig = field(default_factory=TestConfig)
 
     def same_config(self, other) -> bool:
-        return (
-            self.unrendered_config.get('severity') ==
-            other.unrendered_config.get('severity')
-        )
+        # Only these test modifiers take part in the comparison. A modifier
+        # unset on both sides yields None == None, so it counts as unchanged.
+        tracked = ('severity', 'store_failures')
+        mine, theirs = self.unrendered_config, other.unrendered_config
+        return all(
+            mine.get(modifier) == theirs.get(modifier)
+            for modifier in tracked
+        )
 
     def same_column_name(self, other) -> bool:
         return self.column_name == other.column_name

diff --git a/core/dbt/contracts/graph/model_config.py b/core/dbt/contracts/graph/model_config.py
@@ -436,8 +436,13 @@ class SeedConfig(NodeConfig):
 
 @dataclass
 class TestConfig(NodeConfig):
+    schema: Optional[str] = field(
+        default='dbt_test__audit',
+        metadata=CompareBehavior.Exclude.meta(),
+    )
     materialized: str = 'test'
     severity: Severity = Severity('ERROR')
+    store_failures: Optional[bool] = None  # None: defer to flags.STORE_FAILURES
 
 
 @dataclass

diff --git a/core/dbt/contracts/graph/parsed.py b/core/dbt/contracts/graph/parsed.py
@@ -115,6 +115,21 @@ class ParsedNodeMixins(dbtClassMixin):
     def is_refable(self):
         return self.resource_type in NodeType.refable()
 
+    @property
+    def should_store_failures(self):
+        if self.resource_type != NodeType.Test:
+            return False  # only tests can store failures
+        override = self.config.store_failures  # None means "not configured"
+        return flags.STORE_FAILURES if override is None else override
+
+    # will this node map to an object in the database?
+    @property
+    def is_relational(self):
+        # Refable nodes always qualify; tests only when storing failures.
+        if self.resource_type in NodeType.refable():
+            return True
+        return self.should_store_failures
+
     @property
     def is_ephemeral(self):
         return self.config.materialized == 'ephemeral'
@@ -370,10 +385,14 @@ class ParsedSchemaTestNode(ParsedNode, HasTestMetadata):
     config: TestConfig = field(default_factory=TestConfig)
 
     def same_config(self, other) -> bool:
-        return (
-            self.unrendered_config.get('severity') ==
-            other.unrendered_config.get('severity')
-        )
+        # Only these test modifiers take part in the comparison. A modifier
+        # unset on both sides yields None == None, so it counts as unchanged.
+        tracked = ('severity', 'store_failures')
+        mine, theirs = self.unrendered_config, other.unrendered_config
+        return all(
+            mine.get(modifier) == theirs.get(modifier)
+            for modifier in tracked
+        )
 
     def same_column_name(self, other) -> bool:
         return self.column_name == other.column_name

diff --git a/core/dbt/flags.py b/core/dbt/flags.py
@@ -17,6 +17,7 @@
 WRITE_JSON = None
 PARTIAL_PARSE = None
 USE_COLORS = None
+STORE_FAILURES = None
 
 
 def env_set_truthy(key: str) -> Optional[str]:
@@ -54,7 +55,8 @@ def _get_context():
 
 def reset():
     global STRICT_MODE, FULL_REFRESH, USE_CACHE, WARN_ERROR, TEST_NEW_PARSER, \
-        USE_EXPERIMENTAL_PARSER, WRITE_JSON, PARTIAL_PARSE, MP_CONTEXT, USE_COLORS
+        USE_EXPERIMENTAL_PARSER, WRITE_JSON, PARTIAL_PARSE, MP_CONTEXT, USE_COLORS, \
+        STORE_FAILURES
 
     STRICT_MODE = False
     FULL_REFRESH = False
@@ -66,11 +68,13 @@ def reset():
     PARTIAL_PARSE = False
     MP_CONTEXT = _get_context()
     USE_COLORS = True
+    STORE_FAILURES = False
 
 
 def set_from_args(args):
     global STRICT_MODE, FULL_REFRESH, USE_CACHE, WARN_ERROR, TEST_NEW_PARSER, \
-        USE_EXPERIMENTAL_PARSER, WRITE_JSON, PARTIAL_PARSE, MP_CONTEXT, USE_COLORS
+        USE_EXPERIMENTAL_PARSER, WRITE_JSON, PARTIAL_PARSE, MP_CONTEXT, USE_COLORS, \
+        STORE_FAILURES
 
     USE_CACHE = getattr(args, 'use_cache', USE_CACHE)
 
@@ -94,6 +98,8 @@ def set_from_args(args):
     if use_colors_override is not None:
         USE_COLORS = use_colors_override
 
+    STORE_FAILURES = getattr(args, 'store_failures', STORE_FAILURES)
+
 
 # initialize everything to the defaults on module load
 reset()
diff --git a/core/dbt/include/global_project/macros/materializations/helpers.sql b/core/dbt/include/global_project/macros/materializations/helpers.sql
@@ -72,3 +72,12 @@
   {% endif %}
   {% do return(config_full_refresh) %}
 {% endmacro %}
+
+
+{% macro should_store_failures() %}
+  {% set config_store_failures = config.get('store_failures') %}  {# none when the config is unset #}
+  {% if config_store_failures is none %}
+    {% set config_store_failures = flags.STORE_FAILURES %}  {# fall back to the global flag #}
+  {% endif %}
+  {% do return(config_store_failures) %}
+{% endmacro %}
diff --git a/core/dbt/include/global_project/macros/materializations/test.sql b/core/dbt/include/global_project/macros/materializations/test.sql
@@ -1,10 +1,46 @@
 {%- materialization test, default -%}
 
+  {% set relations = [] %}
+
+  {% if should_store_failures() %}
+    {# Persist the failing rows as a table, then count the rows in that table. #}
+    {% set identifier = model['alias'] %}
+    {% set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) %}
+    {% set target_relation = api.Relation.create(
+        identifier=identifier, schema=schema, database=database, type='table') -%}
+
+    {% if old_relation %}
+        {% do adapter.drop_relation(old_relation) %}
+    {% endif %}
+
+    {% call statement(auto_begin=True) %}
+        {{ create_table_as(False, target_relation, sql) }}
+    {% endcall %}
+
+    {% do relations.append(target_relation) %}
+
+    {% set main_sql %}
+        select count(*) as validation_errors
+        from {{ target_relation }}
+    {% endset %}
+
+    {{ adapter.commit() }}
+
+  {% else %}
+      {# No relation is created: count the failing rows with an inline subquery. #}
+      {% set main_sql %}
+          select count(*) as validation_errors
+          from (
+            {{ sql }}
+          ) _dbt_internal_test
+      {% endset %}
+
+  {% endif %}
+
   {% call statement('main', fetch_result=True) -%}
-    select count(*) as validation_errors
-    from (
-      {{ sql }}
-    ) _dbt_internal_test
+    {{ main_sql }}
   {%- endcall %}
+
+  {{ return({'relations': relations}) }}
 
 {%- endmaterialization -%}
diff --git a/core/dbt/include/global_project/macros/schema_tests/accepted_values.sql b/core/dbt/include/global_project/macros/schema_tests/accepted_values.sql
@@ -2,16 +2,16 @@
 
 with all_values as (
 
-    select distinct
-        {{ column_name }} as value_field
+    select
+        {{ column_name }} as value_field,
+        count(*) as n_records
 
     from {{ model }}
+    group by 1
 
 )
 
-select
-    value_field
-
+select *
 from all_values
 where value_field not in (
     {% for value in values -%}

diff --git a/core/dbt/main.py b/core/dbt/main.py
@@ -719,6 +719,13 @@ def _build_test_subparser(subparsers, base_subparser):
         Stop execution upon a first test failure.
         '''
     )
+    sub.add_argument(
+        '--store-failures',
+        action='store_true',
+        help='''
+        Store test results (failing rows) in the database
+        '''
+    )
 
     sub.set_defaults(cls=test_task.TestTask, which='test', rpc_method='test')
     return sub

diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py
@@ -655,7 +655,7 @@ def _check_resource_uniqueness(
     alias_resources: Dict[str, ManifestNode] = {}
 
     for resource, node in manifest.nodes.items():
-        if node.resource_type not in NodeType.refable():
+        if not node.is_relational:
             continue
         # appease mypy - sources aren't refable!
         assert not isinstance(node, ParsedSourceDefinition)

diff --git a/core/dbt/parser/schema_test_builders.py b/core/dbt/parser/schema_test_builders.py
@@ -41,16 +41,21 @@ def get_nice_schema_test_name(
     clean_flat_args = [re.sub('[^0-9a-zA-Z_]+', '_', arg) for arg in flat_args]
     unique = "__".join(clean_flat_args)
 
-    cutoff = 32
-    if len(unique) <= cutoff:
-        label = unique
-    else:
-        label = hashlib.md5(unique.encode('utf-8')).hexdigest()
+    # for the file path + alias, the name must be <64 characters
+    # if the full name is too long, include the first 30 identifying chars plus
+    # a 32-character hash of the full contents
+
+    test_identifier = '{}_{}'.format(test_type, test_name)
+    full_name = '{}_{}'.format(test_identifier, unique)
 
-    filename = '{}_{}_{}'.format(test_type, test_name, label)
-    name = '{}_{}_{}'.format(test_type, test_name, unique)
+    if len(full_name) >= 64:
+        test_trunc_identifier = test_identifier[:30]
+        label = hashlib.md5(full_name.encode('utf-8')).hexdigest()
+        short_name = '{}_{}'.format(test_trunc_identifier, label)
+    else:
+        short_name = full_name
 
-    return filename, name
+    return short_name, full_name
 
 
 @dataclass
@@ -185,7 +190,7 @@ class TestBuilder(Generic[Testable]):
         r'(?P<test_name>([a-zA-Z_][0-9a-zA-Z_]*))'
     )
     # kwargs representing test configs
-    MODIFIER_ARGS = ('severity', 'tags', 'enabled')
+    MODIFIER_ARGS = ('severity', 'tags', 'enabled', 'store_failures')
 
     def __init__(
         self,
@@ -231,6 +236,10 @@ def __init__(
         self.compiled_name: str = compiled_name
         self.fqn_name: str = fqn_name
 
+        # use hashed name as alias if too long
+        if compiled_name != fqn_name:
+            self.modifiers['alias'] = compiled_name
+
     def _bad_type(self) -> TypeError:
         return TypeError('invalid target type "{}"'.format(type(self.target)))
 
@@ -271,13 +280,19 @@ def extract_test_args(test, name=None) -> Tuple[str, Dict[str, Any]]:
     def enabled(self) -> Optional[bool]:
         return self.modifiers.get('enabled')
 
+    def alias(self) -> Optional[str]:
+        return self.modifiers.get('alias')  # None when no 'alias' modifier was recorded
+
     def severity(self) -> Optional[str]:
         sev = self.modifiers.get('severity')
         if sev:
             return sev.upper()
         else:
             return None
 
+    def store_failures(self) -> Optional[bool]:
+        return self.modifiers.get('store_failures')  # None when the test does not set it
+
     def tags(self) -> List[str]:
         tags = self.modifiers.get('tags', [])
         if isinstance(tags, str):

diff --git a/core/dbt/parser/schemas.py b/core/dbt/parser/schemas.py
@@ -493,6 +493,11 @@ def render_test_update(self, node, config, builder):
                 node.config['severity'] = builder.severity()
             if builder.enabled() is not None:
                 node.config['enabled'] = builder.enabled()
+            # note: this does not respect generate_alias_name() macro
+            if builder.alias() is not None:
+                node.unrendered_config['alias'] = builder.alias()
+                node.config['alias'] = builder.alias()
+                node.alias = builder.alias()
             # source node tests are processed at patch_source time
             if isinstance(builder.target, UnpatchedSourceDefinition):
                 sources = [builder.target.fqn[-2], builder.target.fqn[-1]]

diff --git a/core/dbt/task/printer.py b/core/dbt/task/printer.py
@@ -306,6 +306,13 @@ def print_run_result_error(
             logger.info("  compiled SQL at {}".format(
                 result.node.compiled_path))
 
+        if result.node.should_store_failures:
+            with TextOnly():
+                logger.info("")
+            msg = f"select * from {result.node.relation_name}"
+            border = '-' * len(msg)
+            logger.info(f"  See test failures:\n  {border}\n  {msg}\n  {border}")
+
     elif result.message is not None:
         first = True
         for line in result.message.split("\n"):

diff --git a/core/dbt/task/runnable.py b/core/dbt/task/runnable.py
@@ -455,7 +455,7 @@ def get_model_schemas(
         for node in self.manifest.nodes.values():
             if node.unique_id not in selected_uids:
                 continue
-            if node.is_refable and not node.is_ephemeral:
+            if node.is_relational and not node.is_ephemeral:
                 relation = adapter.Relation.create_from(self.config, node)
                 result.add(relation.without_identifier())
 
@@ -525,7 +525,6 @@ def create_schema(relation: BaseRelation) -> None:
                 db_schema = (db_lower, schema.lower())
                 if db_schema not in existing_schemas_lowered:
                     existing_schemas_lowered.add(db_schema)
-
                     fut = tpe.submit_connected(
                         adapter, f'create_{info.database or ""}_{info.schema}',
                         create_schema, info