Feature/schema tests are more unique #3335

Merged · 4 commits · May 13, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@ Contributors:
- Ensure that schema test macros are properly processed ([#3229](https://github.com/fishtown-analytics/dbt/issues/3229), [#3272](https://github.com/fishtown-analytics/dbt/pull/3272))
- Use absolute path for profiles directory instead of a path relative to the project directory. Note: If a user supplies a relative path to the profiles directory, the value of `args.profiles_dir` will still be absolute. ([#3133](https://github.com/fishtown-analytics/dbt/issues/3133))
- Fix FQN selector unable to find models whose name contains dots ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246))
- Fix unique_id generation for generic tests so tests with the same FQN but different configuration will run. ([#3254](https://github.com/fishtown-analytics/dbt/issues/3254))

### Features
- Support commit hashes in dbt deps package revision ([#3268](https://github.com/fishtown-analytics/dbt/issues/3268), [#3270](https://github.com/fishtown-analytics/dbt/pull/3270))
4 changes: 4 additions & 0 deletions core/dbt/exceptions.py
@@ -772,6 +772,10 @@ def raise_duplicate_resource_name(node_1, node_2):
elif node_1.resource_type == NodeType.Documentation:
get_func = 'doc("{}")'.format(duped_name)
elif node_1.resource_type == NodeType.Test and 'schema' in node_1.tags:
# TODO: This causes duplicate schema test errors to be ignored!!
# This PR should ensure this never gets called when the tests are actually different,
# but removing this breaks 50+ tests that currently have the exact same schema
# test defined more than once. :/
return
else:
get_func = '"{}"'.format(duped_name)
26 changes: 20 additions & 6 deletions core/dbt/parser/base.py
@@ -2,7 +2,7 @@
import itertools
import os
from typing import (
List, Dict, Any, Generic, TypeVar
List, Dict, Any, Generic, Optional, TypeVar
)

from dbt.dataclass_schema import ValidationError
@@ -55,11 +55,25 @@ def parse_file(self, block: FileBlock) -> None:
def resource_type(self) -> NodeType:
pass

def generate_unique_id(self, resource_name: str) -> str:
"""Returns a unique identifier for a resource"""
return "{}.{}.{}".format(self.resource_type,
self.project.project_name,
resource_name)
def generate_unique_id(
self,
resource_name: str,
hash: Optional[str] = None
) -> str:
"""Returns a unique identifier for a resource
An optional hash may be passed in to ensure uniqueness for edge cases"""

return '.'.join(
filter(
None,
[
self.resource_type,
self.project.project_name,
resource_name,
hash
]
)
)


class Parser(BaseParser[FinalValue], Generic[FinalValue]):
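
For reference, a minimal standalone sketch of what the revised generate_unique_id produces (the project and test names below are illustrative assumptions, not taken from this PR). Because filter(None, ...) drops the optional hash, unique_ids for resources that never pass one keep their existing shape:

def generate_unique_id(resource_type, project_name, resource_name, hash=None):
    # Join the non-None parts with dots; the hash suffix only appears
    # when one is supplied, so pre-existing unique_ids are unchanged.
    return '.'.join(filter(None, [resource_type, project_name, resource_name, hash]))

print(generate_unique_id('test', 'jaffle_shop', 'unique_orders_id'))
# test.jaffle_shop.unique_orders_id
print(generate_unique_id('test', 'jaffle_shop', 'unique_orders_id', 'a1b2c3d4e5'))
# test.jaffle_shop.unique_orders_id.a1b2c3d4e5
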
4 changes: 2 additions & 2 deletions core/dbt/parser/docs.py
@@ -1,4 +1,4 @@
from typing import Iterable
from typing import Iterable, Optional

import re

@@ -23,7 +23,7 @@ def resource_type(self) -> NodeType:
def get_compiled_path(cls, block: FileBlock):
return block.path.relative_path

def generate_unique_id(self, resource_name: str) -> str:
def generate_unique_id(self, resource_name: str, _: Optional[str] = None) -> str:
# because docs are in their own graph namespace, node type doesn't
# need to be part of the unique ID.
return '{}.{}'.format(self.project.project_name, resource_name)
29 changes: 27 additions & 2 deletions core/dbt/parser/schemas.py
@@ -2,8 +2,9 @@
import os

from abc import ABCMeta, abstractmethod
from hashlib import md5
from typing import (
Iterable, Dict, Any, Union, List, Optional, Generic, TypeVar, Type
Iterable, Dict, Any, Union, List, Optional, Generic, TypeVar, Type, Iterator
)

from dbt.dataclass_schema import ValidationError, dbtClassMixin
@@ -351,6 +352,30 @@ def create_test_node(
column_name: Optional[str],
) -> ParsedSchemaTestNode:

# TODO: (PR input needed) Should this be applied to all tests?
# In theory we already check non-schema tests for "unique" name uniqueness in
# core/dbt/contracts/graph/manifest.py @ _check_duplicates()

HASH_LENGTH = 10

if 'schema' in tags:
# TODO: (PR input needed) is this a complete set of unique-ifying data?
hash_data: Iterator = filter(
None,
[
name,
test_metadata.get('kwargs').get('model'), # type: ignore
column_name
])
hash_string = ''.join(hash_data).encode('utf-8')
test_hash = md5(hash_string).hexdigest()[-HASH_LENGTH:]
unique_id = self.generate_unique_id(
name,
test_hash
)
else:
unique_id = self.generate_unique_id(name)

dct = {
'alias': name,
'schema': self.default_schema,
@@ -364,7 +389,7 @@
'original_file_path': target.original_file_path,
'package_name': self.project.project_name,
'raw_sql': raw_sql,
'unique_id': self.generate_unique_id(name),
'unique_id': unique_id,
'config': self.config_dict(config),
'test_metadata': test_metadata,
'column_name': column_name,
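
To make the hashing step above concrete, here is a minimal standalone sketch (the test and model names are illustrative assumptions, not taken from this PR) showing how two schema tests that share a name but target different columns now receive distinct suffixes:

from hashlib import md5

HASH_LENGTH = 10

def schema_test_hash(name, model, column_name=None):
    # Mirror the parser: concatenate the identifying fields, skipping any
    # that are None, and keep the last 10 hex characters of the md5 digest.
    hash_data = filter(None, [name, model, column_name])
    return md5(''.join(hash_data).encode('utf-8')).hexdigest()[-HASH_LENGTH:]

# The same generic test applied to two columns of one model no longer collides:
print(schema_test_hash('unique', "ref('orders')", 'id'))
print(schema_test_hash('unique', "ref('orders')", 'email'))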