From 0cc105aa11724c06c828dbfd1ecff641dc27e4d8 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 1 Jun 2023 02:52:52 -0700 Subject: [PATCH 1/2] feat(ingest/snowflake): Okta OAuth support --- .../docs/sources/snowflake/snowflake_pre.md | 42 ++++++++- .../src/datahub/configuration/common.py | 28 ------ .../src/datahub/configuration/oauth.py | 34 +++++++ .../ingestion/source/sql/oauth_generator.py | 54 ++++++++--- .../ingestion/source_config/sql/snowflake.py | 90 +++++++++---------- .../tests/unit/test_snowflake_source.py | 68 +++++++++++--- 6 files changed, 219 insertions(+), 97 deletions(-) create mode 100644 metadata-ingestion/src/datahub/configuration/oauth.py diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index 4564f78d1f97c..d7458c6948e40 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -42,7 +42,9 @@ grant imported privileges on database snowflake to role datahub_role; The details of each granted privilege can be viewed in [snowflake docs](https://docs.snowflake.com/en/user-guide/security-access-control-privileges.html). A summarization of each privilege, and why it is required for this connector: -- `operate` is required on warehouse to execute queries +- `operate` is required only to start the warehouse. + If the warehouse is already running during ingestion or has auto-resume enabled, + this permission is not required. - `usage` is required for us to run queries using the warehouse - `usage` on `database` and `schema` are required because without it tables and views inside them are not accessible. If an admin does the required grants on `table` but misses the grants on `schema` or the `database` in which the table/view exists then we will not be able to get metadata for the table/view. 
- If metadata is required only on some schemas then you can grant the usage privilieges only on a particular schema like @@ -59,6 +61,44 @@ If you plan to enable extraction of table lineage, via the `include_table_lineag grant imported privileges on database snowflake to role datahub_role; ``` +### Authentication +Authentication is most simply done via a Snowflake user and password. + +Alternatively, other authentication methods are supported via the `authentication_type` config option. + +#### Okta OAuth +To set up Okta OAuth authentication, roughly follow the four steps in [this guide](https://docs.snowflake.com/en/user-guide/oauth-okta). + +Pass in the following values, as described in the article, for your recipe's `oauth_config`: +- `provider`: okta +- `client_id`: `<OAUTH_CLIENT_ID>` +- `client_secret`: `<OAUTH_CLIENT_SECRET>` +- `authority_url`: `<OKTA_OAUTH_TOKEN_ENDPOINT>` +- `scopes`: The list of your *Okta* scopes, i.e. with the `session:role:` prefix + +DataHub only supports two OAuth grant types: `client_credentials` and `password`. +The steps slightly differ based on which you decide to use. 
+ +##### Client Credentials Grant Type (Simpler) +- When creating an Okta App Integration, choose type `API Services` + + Ensure client authentication method is `Client secret` + + Note your `Client ID` +- Create a Snowflake user to correspond to your newly created Okta client credentials + + *Ensure the user's `Login Name` matches your Okta application's `Client ID`* + + Ensure the user has been granted your datahub role + +##### Password Grant Type +- When creating an Okta App Integration, choose type `OIDC` -> `Native Application` + + Add Grant Type `Resource Owner Password` + + Ensure client authentication method is `Client secret` +- Create an Okta user to sign into, noting the `Username` and `Password` +- Create a Snowflake user to correspond to your newly created Okta client credentials + + *Ensure the user's `Login Name` matches your Okta user's `Username` (likely a password)* + + Ensure the user has been granted your datahub role +- When running ingestion, provide the required `oauth_config` fields, + including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password` + * Note: the `username` and `password` config options are not nested under `oauth_config` + ### Caveats - Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies. 
diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index a3897669a9d42..39fdc9abb67f6 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -187,34 +187,6 @@ def load_config(self, config_fp: IO) -> dict: pass -class OauthConfiguration(ConfigModel): - provider: Optional[str] = Field( - description="Identity provider for oauth, e.g- microsoft" - ) - client_id: Optional[str] = Field( - description="client id of your registered application" - ) - scopes: Optional[List[str]] = Field( - description="scopes required to connect to snowflake" - ) - use_certificate: bool = Field( - description="Do you want to use certificate and private key to authenticate using oauth", - default=False, - ) - client_secret: Optional[str] = Field( - description="client secret of the application if use_certificate = false" - ) - authority_url: Optional[str] = Field( - description="Authority url of your identity provider" - ) - encoded_oauth_public_key: Optional[str] = Field( - description="base64 encoded certificate content if use_certificate = true" - ) - encoded_oauth_private_key: Optional[str] = Field( - description="base64 encoded private key content if use_certificate = true" - ) - - class AllowDenyPattern(ConfigModel): """A class to store allow deny regexes""" diff --git a/metadata-ingestion/src/datahub/configuration/oauth.py b/metadata-ingestion/src/datahub/configuration/oauth.py new file mode 100644 index 0000000000000..9a1ddbf437913 --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/oauth.py @@ -0,0 +1,34 @@ +from enum import Enum +from typing import List, Optional + +from pydantic import Field, SecretStr + +from datahub.configuration import ConfigModel + + +class OAuthIdentityProvider(Enum): + MICROSOFT = "microsoft" + OKTA = "okta" + + +class OAuthConfiguration(ConfigModel): + provider: OAuthIdentityProvider = Field( + 
description="Identity provider for oauth." + "Supported providers are microsoft and okta." + ) + authority_url: str = Field(description="Authority url of your identity provider") + client_id: str = Field(description="client id of your registered application") + scopes: List[str] = Field(description="scopes required to connect to snowflake") + use_certificate: bool = Field( + description="Do you want to use certificate and private key to authenticate using oauth", + default=False, + ) + client_secret: Optional[SecretStr] = Field( + description="client secret of the application if use_certificate = false" + ) + encoded_oauth_public_key: Optional[str] = Field( + description="base64 encoded certificate content if use_certificate = true" + ) + encoded_oauth_private_key: Optional[str] = Field( + description="base64 encoded private key content if use_certificate = true" + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py index be85064a58c2a..7231c6ef6b1df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py @@ -1,31 +1,45 @@ import base64 import logging +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union import msal +import requests from OpenSSL.crypto import FILETYPE_PEM, load_certificate +from pydantic.types import SecretStr + +from datahub.configuration.oauth import OAuthIdentityProvider logger = logging.getLogger(__name__) -class OauthTokenGenerator: - def __init__(self, client_id, authority_url, provider): - self.client_id = client_id - self.authority_url = authority_url - self.provider = provider +OKTA_SCOPE_DELIMITER = "," + + +@dataclass +class OAuthTokenGenerator: + client_id: str + authority_url: str + provider: OAuthIdentityProvider + username: Optional[str] = None + password: Optional[SecretStr] = None def 
_get_token( self, credentials: Union[str, Dict[str, Any]], scopes: Optional[List[str]], check_cache: bool, - ) -> str: - token = getattr(self, "_get_{}_token".format(self.provider))( - scopes, check_cache, credentials - ) - return token + ) -> dict: + if self.provider == OAuthIdentityProvider.MICROSOFT: + return self._get_microsoft_token(credentials, scopes, check_cache) + elif self.provider == OAuthIdentityProvider.OKTA: + assert isinstance(credentials, str) + assert scopes is not None + return self._get_okta_token(credentials, scopes) + else: + raise Exception(f"Unknown oauth provider: {self.provider}") - def _get_microsoft_token(self, scopes, check_cache, credentials): + def _get_microsoft_token(self, credentials, scopes, check_cache): app = msal.ConfidentialClientApplication( self.client_id, authority=self.authority_url, client_credential=credentials ) @@ -38,6 +52,24 @@ def _get_microsoft_token(self, scopes, check_cache, credentials): return _token + def _get_okta_token(self, credentials: str, scopes: List[str]) -> dict: + data = { + "grant_type": "client_credentials", + "scope": OKTA_SCOPE_DELIMITER.join(scopes), + } + if self.username and self.password: + data["grant_type"] = "password" + data["username"] = self.username + data["password"] = self.password.get_secret_value() + + resp = requests.post( + self.authority_url, + headers={"Accept": "application/json"}, + auth=(self.client_id, credentials), + data=data, + ) + return resp.json() + def get_public_certificate_thumbprint(self, public_cert_str: str) -> str: cert_str = public_cert_str certificate = load_certificate(FILETYPE_PEM, cert_str.encode("utf-8")) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 36163b1cd2306..b21ebe30c24a1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py 
@@ -12,14 +12,15 @@ OAUTH_AUTHENTICATOR, ) -from datahub.configuration.common import AllowDenyPattern, OauthConfiguration +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, CLIENT_SESSION_KEEP_ALIVE, ) -from datahub.ingestion.source.sql.oauth_generator import OauthTokenGenerator +from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator from datahub.ingestion.source.sql.sql_config import ( SQLAlchemyConfig, make_sqlalchemy_uri, @@ -69,7 +70,7 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig): description="Password for your private key. Required if using key pair authentication with encrypted private key.", ) - oauth_config: Optional[OauthConfiguration] = pydantic.Field( + oauth_config: Optional[OAuthConfiguration] = pydantic.Field( default=None, description="oauth configuration - https://docs.snowflake.com/en/user-guide/python-connector-example.html#connecting-with-oauth", ) @@ -137,48 +138,36 @@ def authenticator_type_is_valid(cls, v, values, field): f"At least one should be set when using {v} authentication" ) elif v == "OAUTH_AUTHENTICATOR": - if values.get("oauth_config") is None: - raise ValueError( - f"'oauth_config' is none but should be set when using {v} authentication" - ) - if values.get("oauth_config").provider is None: - raise ValueError( - f"'oauth_config.provider' is none " - f"but should be set when using {v} authentication" - ) - if values.get("oauth_config").client_id is None: - raise ValueError( - f"'oauth_config.client_id' is none " - f"but should be set when using {v} authentication" - ) - if values.get("oauth_config").scopes is None: + cls._check_oauth_config(values.get("oauth_config")) + 
logger.info(f"using authenticator type '{v}'") + return v + + @staticmethod + def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: + if oauth_config is None: + raise ValueError( + "'oauth_config' is none but should be set when using OAUTH_AUTHENTICATOR authentication" + ) + if oauth_config.use_certificate is True: + if oauth_config.provider == OAuthIdentityProvider.OKTA.value: raise ValueError( - f"'oauth_config.scopes' was none " - f"but should be set when using {v} authentication" + "Certificate authentication is not supported for Okta." ) - if values.get("oauth_config").authority_url is None: + if oauth_config.encoded_oauth_private_key is None: raise ValueError( - f"'oauth_config.authority_url' was none " - f"but should be set when using {v} authentication" + "'base64_encoded_oauth_private_key' was none " + "but should be set when using certificate for oauth_config" ) - if values.get("oauth_config").use_certificate is True: - if values.get("oauth_config").encoded_oauth_private_key is None: - raise ValueError( - "'base64_encoded_oauth_private_key' was none " - "but should be set when using certificate for oauth_config" - ) - if values.get("oauth").encoded_oauth_public_key is None: - raise ValueError( - "'base64_encoded_oauth_public_key' was none" - "but should be set when using use_certificate true for oauth_config" - ) - elif values.get("oauth_config").client_secret is None: + if oauth_config.encoded_oauth_public_key is None: raise ValueError( - "'oauth_config.client_secret' was none " - "but should be set when using use_certificate false for oauth_config" + "'base64_encoded_oauth_public_key' was none" + "but should be set when using use_certificate true for oauth_config" ) - logger.info(f"using authenticator type '{v}'") - return v + elif oauth_config.client_secret is None: + raise ValueError( + "'oauth_config.client_secret' was none " + "but should be set when using use_certificate false for oauth_config" + ) 
@pydantic.validator("include_view_lineage") def validate_include_view_lineage(cls, v, values): @@ -297,14 +286,16 @@ def get_options(self) -> dict: self.options["connect_args"] = options_connect_args return self.options - def get_oauth_connection(self): + def get_oauth_connection(self) -> snowflake.connector.SnowflakeConnection: assert ( self.oauth_config ), "oauth_config should be provided if using oauth based authentication" - generator = OauthTokenGenerator( - self.oauth_config.client_id, - self.oauth_config.authority_url, - self.oauth_config.provider, + generator = OAuthTokenGenerator( + client_id=self.oauth_config.client_id, + authority_url=self.oauth_config.authority_url, + provider=self.oauth_config.provider, + username=self.username, + password=self.password, ) if self.oauth_config.use_certificate: response = generator.get_token_with_certificate( @@ -313,11 +304,18 @@ def get_oauth_connection(self): scopes=self.oauth_config.scopes, ) else: + assert self.oauth_config.client_secret response = generator.get_token_with_secret( - secret=str(self.oauth_config.client_secret), + secret=str(self.oauth_config.client_secret.get_secret_value()), scopes=self.oauth_config.scopes, ) - token = response["access_token"] + try: + token = response["access_token"] + except KeyError: + raise ValueError( + f"access_token not found in response {response}. " + "Please check your OAuth configuration." 
+ ) connect_args = self.get_options()["connect_args"] return snowflake.connector.connect( user=self.username, diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index e6a8eee3e219c..b620b785fa46a 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -3,7 +3,7 @@ import pytest from pydantic import ValidationError -from datahub.configuration.common import OauthConfiguration +from datahub.configuration.oauth import OAuthConfiguration from datahub.configuration.pattern_utils import UUID_REGEX from datahub.ingestion.api.source import SourceCapability from datahub.ingestion.source.snowflake.constants import ( @@ -34,15 +34,27 @@ def test_snowflake_source_throws_error_on_account_id_missing(): ) -def test_snowflake_throws_error_on_client_id_missing_if_using_oauth(): +def test_no_client_id_invalid_oauth_config(): oauth_dict = { "provider": "microsoft", "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], "client_secret": "6Hb9apkbc6HD7", "authority_url": "https://login.microsoftonline.com/yourorganisation.com", } - # assert that this is a valid oauth config on its own - OauthConfiguration.parse_obj(oauth_dict) + with pytest.raises(ValueError): + OAuthConfiguration.parse_obj(oauth_dict) + + +def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_false(): + oauth_dict = { + "client_id": "882e9831-7ea51cb2b954", + "provider": "microsoft", + "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], + "use_certificate": False, + "authority_url": "https://login.microsoftonline.com/yourorganisation.com", + } + OAuthConfiguration.parse_obj(oauth_dict) + with pytest.raises(ValueError): SnowflakeV2Config.parse_obj( { @@ -53,16 +65,16 @@ def test_snowflake_throws_error_on_client_id_missing_if_using_oauth(): ) -def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_false(): +def 
test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_certificate_is_true(): oauth_dict = { "client_id": "882e9831-7ea51cb2b954", "provider": "microsoft", "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], - "use_certificate": False, + "use_certificate": True, "authority_url": "https://login.microsoftonline.com/yourorganisation.com", + "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==", } - OauthConfiguration.parse_obj(oauth_dict) - + OAuthConfiguration.parse_obj(oauth_dict) with pytest.raises(ValueError): SnowflakeV2Config.parse_obj( { @@ -73,16 +85,16 @@ def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_f ) -def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_certificate_is_true(): +def test_snowflake_oauth_okta_does_not_support_certificate(): oauth_dict = { "client_id": "882e9831-7ea51cb2b954", - "provider": "microsoft", + "provider": "okta", "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], "use_certificate": True, "authority_url": "https://login.microsoftonline.com/yourorganisation.com", "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==", } - OauthConfiguration.parse_obj(oauth_dict) + OAuthConfiguration.parse_obj(oauth_dict) with pytest.raises(ValueError): SnowflakeV2Config.parse_obj( { @@ -93,6 +105,40 @@ def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_cert ) +def test_snowflake_oauth_happy_paths(): + okta_dict = { + "client_id": "client_id", + "client_secret": "secret", + "provider": "okta", + "scopes": ["datahub_role"], + "authority_url": "https://dev-abc.okta.com/oauth2/def/v1/token", + } + assert SnowflakeV2Config.parse_obj( + { + "account_id": "test", + "authentication_type": "OAUTH_AUTHENTICATOR", + "oauth_config": okta_dict, + } + ) + + microsoft_dict = { + "client_id": "client_id", + "provider": "microsoft", + "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"], + "use_certificate": True, + 
"authority_url": "https://login.microsoftonline.com/yourorganisation.com", + "encoded_oauth_public_key": "publickey", + "encoded_oauth_private_key": "privatekey", + } + assert SnowflakeV2Config.parse_obj( + { + "account_id": "test", + "authentication_type": "OAUTH_AUTHENTICATOR", + "oauth_config": microsoft_dict, + } + ) + + def test_account_id_is_added_when_host_port_is_present(): config = SnowflakeV2Config.parse_obj( { From 0670e4bac0384a22c71160dca2525a5dc526359f Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 1 Jun 2023 06:18:20 -0700 Subject: [PATCH 2/2] update doc --- metadata-ingestion/docs/sources/snowflake/snowflake_pre.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index d7458c6948e40..9a381fb351aec 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -93,7 +93,7 @@ The steps slightly differ based on which you decide to use. + Ensure client authentication method is `Client secret` - Create an Okta user to sign into, noting the `Username` and `Password` - Create a Snowflake user to correspond to your newly created Okta client credentials - + *Ensure the user's `Login Name` matches your Okta user's `Username` (likely a password)* + + *Ensure the user's `Login Name` matches your Okta user's `Username` (likely an email)* + Ensure the user has been granted your datahub role - When running ingestion, provide the required `oauth_config` fields, including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password`