From e712d092d1116f14885996a46cc866688b654f14 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 6 Nov 2024 14:18:36 +0100 Subject: [PATCH] Glue: Allow for assuming role for Glue Fixes #1104 --- mkdocs/docs/configuration.md | 22 ++++++++------- pyiceberg/catalog/glue.py | 53 +++++++++++++++++++++++++++++++++--- pyiceberg/io/__init__.py | 4 +-- pyiceberg/io/pyarrow.py | 6 ++-- 4 files changed, 66 insertions(+), 19 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index ba77867ba..37e1ea7f1 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -107,7 +107,7 @@ For the FileIO there are several configuration options available: | s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | | s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | | s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | -| s3.session-name | session | An optional identifier for the assumed role session. | +| s3.role-session-name | session | An optional identifier for the assumed role session. | | s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | | s3.signer | bearer | Configure the signature version of the FileIO. | | s3.signer.uri | | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. | @@ -331,16 +331,18 @@ catalog: -| Key | Example | Description | -| ---------------------- | ------------------------------------ | ------------------------------------------------------------------------------- | -| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | -| glue.skip-archive | true | Configure whether to skip the archival of older table versions. 
Default to true | +| Key | Example | Description | +|------------------------|----------------------------------------| ------------------------------------------------------------------------------- | +| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | +| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Defaults to true | | glue.endpoint | | Configure an alternative endpoint of the Glue service for GlueCatalog to access | -| glue.profile-name | default | Configure the static profile used to access the Glue Catalog | -| glue.region | us-east-1 | Set the region of the Glue Catalog | -| glue.access-key-id | admin | Configure the static access key id used to access the Glue Catalog | -| glue.secret-access-key | password | Configure the static secret access key used to access the Glue Catalog | -| glue.session-token | AQoDYXdzEJr... | Configure the static session token used to access the Glue Catalog | +| glue.profile-name | default | Configure the static profile used to access the Glue Catalog | +| glue.role-session-name | session | An optional identifier for the assumed role session. | +| glue.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | +| glue.region | us-east-1 | Set the region of the Glue Catalog | +| glue.access-key-id | admin | Configure the static access key id used to access the Glue Catalog | +| glue.secret-access-key | password | Configure the static secret access key used to access the Glue Catalog | +| glue.session-token | AQoDYXdzEJr... 
| Configure the static session token used to access the Glue Catalog | diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 385873c45..aa819db3f 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -29,6 +29,7 @@ ) import boto3 +from botocore.credentials import AssumeRoleCredentialFetcher, Credentials, DeferredRefreshableCredentials from mypy_boto3_glue.client import GlueClient from mypy_boto3_glue.type_defs import ( ColumnTypeDef, @@ -59,7 +60,14 @@ NoSuchTableError, TableAlreadyExistsError, ) -from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN +from pyiceberg.io import ( + AWS_ACCESS_KEY_ID, + AWS_REGION, + AWS_ROLE_ARN, + AWS_ROLE_SESSION_NAME, + AWS_SECRET_ACCESS_KEY, + AWS_SESSION_TOKEN, +) from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema, SchemaVisitor, visit from pyiceberg.serializers import FromInputFile @@ -127,6 +135,8 @@ GLUE_ACCESS_KEY_ID = "glue.access-key-id" GLUE_SECRET_ACCESS_KEY = "glue.secret-access-key" GLUE_SESSION_TOKEN = "glue.session-token" +GLUE_ROLE_ARN = "glue.role-arn" +GLUE_ROLE_SESSION_NAME = "glue.role-session-name" def _construct_parameters( @@ -296,13 +306,48 @@ class GlueCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: Any): super().__init__(name, **properties) + credentials = Credentials( + access_key=get_first_property_value(properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + secret_key=get_first_property_value(properties, GLUE_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), + token=get_first_property_value(properties, GLUE_SESSION_TOKEN, AWS_SESSION_TOKEN), + ) + session = boto3.Session( profile_name=properties.get(GLUE_PROFILE_NAME), region_name=get_first_property_value(properties, GLUE_REGION, AWS_REGION), - aws_access_key_id=get_first_property_value(properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), - 
aws_secret_access_key=get_first_property_value(properties, GLUE_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), - aws_session_token=get_first_property_value(properties, GLUE_SESSION_TOKEN, AWS_SESSION_TOKEN), + aws_access_key_id=credentials.access_key, + aws_secret_access_key=credentials.secret_key, + aws_session_token=credentials.token, ) + + if role_arn := get_first_property_value(properties, GLUE_ROLE_ARN, AWS_ROLE_ARN): + extra_args = {} + if role_session_name := get_first_property_value(properties, GLUE_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): + extra_args["RoleSessionName"] = role_session_name + + fetcher = AssumeRoleCredentialFetcher( + client_creator=session.client, + source_credentials=credentials, + role_arn=role_arn, + extra_args=extra_args, + ) + refreshable_credentials = DeferredRefreshableCredentials( + method="assume-role", + refresh_using=fetcher.fetch_credentials, + ) + from botocore.session import Session as BotoSession + + botocore_session = BotoSession() + botocore_session._credentials = refreshable_credentials # noqa: SLF001 + session = boto3.Session( + profile_name=properties.get(GLUE_PROFILE_NAME), + region_name=get_first_property_value(properties, GLUE_REGION, AWS_REGION), + aws_access_key_id=credentials.access_key, + aws_secret_access_key=credentials.secret_key, + aws_session_token=credentials.token, + botocore_session=botocore_session, + ) + self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT)) if glue_catalog_id := properties.get(GLUE_ID): diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 23a2cf359..0e07411a2 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -61,7 +61,7 @@ AWS_SECRET_ACCESS_KEY = "client.secret-access-key" AWS_SESSION_TOKEN = "client.session-token" AWS_ROLE_ARN = "aws.role-arn" -AWS_SESSION_NAME = "aws.session-name" +AWS_ROLE_SESSION_NAME = "aws.role-session-name" S3_ENDPOINT = "s3.endpoint" S3_ACCESS_KEY_ID = "s3.access-key-id" 
S3_SECRET_ACCESS_KEY = "s3.secret-access-key" @@ -73,7 +73,7 @@ S3_SIGNER_ENDPOINT = "s3.signer.endpoint" S3_SIGNER_ENDPOINT_DEFAULT = "v1/aws/s3/sign" S3_ROLE_ARN = "s3.role-arn" -S3_SESSION_NAME = "s3.session-name" +S3_ROLE_SESSION_NAME = "s3.role-session-name" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index a053b83ac..bff3a0a0c 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -86,8 +86,8 @@ AWS_ACCESS_KEY_ID, AWS_REGION, AWS_ROLE_ARN, + AWS_ROLE_SESSION_NAME, AWS_SECRET_ACCESS_KEY, - AWS_SESSION_NAME, AWS_SESSION_TOKEN, GCS_DEFAULT_LOCATION, GCS_ENDPOINT, @@ -104,8 +104,8 @@ S3_PROXY_URI, S3_REGION, S3_ROLE_ARN, + S3_ROLE_SESSION_NAME, S3_SECRET_ACCESS_KEY, - S3_SESSION_NAME, S3_SESSION_TOKEN, FileIO, InputFile, @@ -369,7 +369,7 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if role_arn := get_first_property_value(self.properties, S3_ROLE_ARN, AWS_ROLE_ARN): client_kwargs["role_arn"] = role_arn - if session_name := get_first_property_value(self.properties, S3_SESSION_NAME, AWS_SESSION_NAME): + if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): client_kwargs["session_name"] = session_name return S3FileSystem(**client_kwargs)