From 1d46e0057444af9dd0f2b7e766afb43734f5e33c Mon Sep 17 00:00:00 2001
From: spenes
Date: Fri, 18 Nov 2022 17:33:57 +0300
Subject: [PATCH] RDB Loader 5.2.0

---
 .../index.md | 32 +++++++++++--------
 .../stream-transformer-common/_index.mdx | 4 +++
 .../reusable/transformer-pubsub/_index.mdx | 2 +-
 src/componentVersions.js | 2 +-
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/loading-transformed-data/rdb-loader-configuration-reference/index.md b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/loading-transformed-data/rdb-loader-configuration-reference/index.md
index ee9b138516..ba068d8804 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/loading-transformed-data/rdb-loader-configuration-reference/index.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/loading-transformed-data/rdb-loader-configuration-reference/index.md
@@ -27,11 +27,14 @@ This is a complete list of the options that can be configured:
 | `host` | Required. Host name of Redshift cluster. |
 | `port` | Required. Port of Redshift cluster. |
 | `database` | Required. Redshift database which the data will be loaded to. |
-| `roleArn` | Required. AWS Role ARN allowing Redshift to load data from S3. |
+| `roleArn` | Required if `NoCreds` is chosen as load auth method. AWS Role ARN allowing Redshift to load data from S3. |
 | `schema` | Required. Redshift schema name, eg “atomic”. |
 | `username` | Required. DB user with permissions to load data. |
 | `password` | Required. Password of DB user. |
 | `maxError` | Optional. Configures the [Redshift MAXERROR load option](https://docs.aws.amazon.com/redshift/latest/dg/copy-parameters-data-load.html#copy-maxerror). The default is 10. |
+| `loadAuthMethod.*` (since 5.2.0) | Optional, default method is `NoCreds`. Specifies the auth method to use with the `COPY` statement. |
+| `loadAuthMethod.type` | Required if `loadAuthMethod` section is included. Specifies the type of the authentication method. The possible values are `NoCreds` and `TempCreds`.

With `NoCreds`, no credentials will be passed to the `COPY` statement. Instead, the Redshift cluster needs to be configured with an AWS Role ARN that allows it to load data from S3. This Role ARN needs to be provided in the `roleArn` setting above. You can find more information [here](https://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-access-permissions.html).

With `TempCreds`, temporary credentials will be created for every load operation and these temporary credentials will be passed to the `COPY` statement. |
+| `loadAuthMethod.roleArn` | Required if `loadAuthMethod.type` is `TempCreds`. IAM role that is used while creating temporary credentials. This role should allow access to the S3 bucket the transformer will write data to, with the following permissions: `s3:GetObject*`, `s3:ListBucket`, and `s3:GetBucketLocation`. |
 | `jdbc.*` | Optional. Custom JDBC configuration. The default value is `{"ssl": true}`. |
 | `jdbc.BlockingRowsMode` | Optional. Refer to the [Redshift JDBC driver reference](https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.54.1082/Amazon+Redshift+JDBC+Connector+Install+Guide.pdf). |
 | `jdbc.DisableIsValidQuery` | Optional. Refer to the [Redshift JDBC driver reference](https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.54.1082/Amazon+Redshift+JDBC+Connector+Install+Guide.pdf). |
@@ -60,18 +63,18 @@ This is a complete list of the options that can be configured:
 | `warehouse` | Required. Snowflake warehouse which the SQL statements submitted by Snowflake Loader will run on. |
 | `database` | Required. Snowflake database which the data will be loaded to. |
 | `schema` | Required. Target schema |
-| `transformedStage.*` | Required if 'NoCreds' is chosen as load auth method. Snowflake stage for transformed events. |
+| `transformedStage.*` | Required if `NoCreds` is chosen as load auth method. Snowflake stage for transformed events. |
 | `transformedStage.name` | Required if `transformedStage` is included. The name of the stage. |
-| `transformedStage.location` | Required if `transformedStage` is included. The S3 path used as stage location. |
-| `folderMonitoringStage.*` | Required if `monitoring.folders` section is configured and 'NoCreds' is chosen as load auth method. Snowflake stage to load folder monitoring entries into temporary Snowflake table. |
+| `transformedStage.location` | Required if `transformedStage` is included. The S3 path used as stage location. (Not needed since 5.2.0 because it is auto-configured) |
+| `folderMonitoringStage.*` | Required if `monitoring.folders` section is configured and `NoCreds` is chosen as load auth method. Snowflake stage to load folder monitoring entries into temporary Snowflake table. |
 | `folderMonitoringStage.name` | Required if `folderMonitoringStage` is included. The name of the stage. |
-| `folderMonitoringStage.location` | Required if `folderMonitoringStage` is included. The S3 path used as stage location. |
+| `folderMonitoringStage.location` | Required if `folderMonitoringStage` is included. The S3 path used as stage location. (Not needed since 5.2.0 because it is auto-configured) |
 | `appName` | Optional. Name passed as 'application' property while creating Snowflake connection. The default is `Snowplow_OSS`. |
 | `maxError` | Optional. A table copy statement will skip an input file when the number of errors in it exceeds the specified number. This setting is used during initial loading and thus can filter out only invalid JSONs (which is impossible situation if used with Transformer). |
 | `jdbcHost` | Optional. Host for the JDBC driver that has priority over automatically derived hosts. If it is not given, host will be created automatically according to given `snowflakeRegion`. |
-| `loadAuthMethod.*` | Optional, default method is `NoCreds`. Specifies the auth method to use with 'COPY INTO' statement. 
Note that `TempCreds` auth method doesn't work when data is loaded from GCS. | -| `loadAuthMethod.type` | Required if `loadAuthMethod` section is included. Specifies the type of the auth method. The possible values are `NoCreds` and `TempCreds`.

With 'NoCreds', no credentials will be passed to 'COPY INTO' statement. Instead, 'transformedStage' and 'folderMonitoringStage' specified above will be used. More information can be found [here](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration.html).

With 'TempCreds', temporary credentials will be created for every load operation and these temporary credentials will be passed to 'COPY INTO' statement. | -| `loadAuthMethod.roleArn` | Required if `loadAuthMethod.type` is `TempCreds`.IAM role that is used while creating temporary credentials. Created credentials will allow to access resources specified in the given role. List of necessary permissions needs to be given to role specified in [here](https://docs.snowflake.com/en/user-guide/data-load-s3-config-aws-iam-user.html). | +| `loadAuthMethod.*` | Optional, default method is `NoCreds`. Specifies the auth method to use with `COPY INTO` statement. Note that `TempCreds` auth method doesn't work when data is loaded from GCS. | +| `loadAuthMethod.type` | Required if `loadAuthMethod` section is included. Specifies the type of the auth method. The possible values are `NoCreds` and `TempCreds`.

With `NoCreds`, no credentials will be passed to `COPY INTO` statement. Instead, `transformedStage` and `folderMonitoringStage` specified above will be used. More information can be found [here](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration.html).

With `TempCreds`, temporary credentials will be created for every load operation and these temporary credentials will be passed to `COPY INTO` statement. |
+| `loadAuthMethod.roleArn` | Required if `loadAuthMethod.type` is `TempCreds`. IAM role that is used while creating temporary credentials. This role should allow access to the S3 bucket the transformer will write data to. You can find the list of permissions that need to be granted to this role [here](https://docs.snowflake.com/en/user-guide/data-load-s3-config-aws-iam-user.html). |
 
 ## Databricks Loader `storage` section
 
@@ -80,15 +83,16 @@ This is a complete list of the options that can be configured:
 | `type` | Optional. The only valid value is the default: `databricks`. |
 | `host` | Required. Hostname of Databricks cluster. |
 | `password` | Required. [Databricks access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html). Can be plain text, read from the EC2 parameter store or GCP secret manager (see below). |
+| `eventsOptimizePeriod` | Optional. The default value is `2 days`. Optimize period per table, which will be used as a predicate for the `OPTIMIZE` command. |
 | `password.secretManager.parameterName` | Alternative way for passing in the access token. |
 | `schema` | Required. Target schema. |
 | `port` | Required. Port of Databricks cluster. |
 | `httpPath` | Required. Http Path of Databricks cluster. Get it from the JDBC connection details after the cluster has been created. |
 | `catalog` | Optional. The default value is `hive_metastore`. [Databricks unity catalog name](https://docs.databricks.com/data-governance/unity-catalog/index.html). |
 | `userAgent` | Optional. The default value is `snowplow-rdbloader-oss`. User agent name for Databricks connection. |
-| `loadAuthMethod.*` | Optional, default method is `NoCreds`. Specifies the auth method to use with 'COPY INTO' statement |
-| `loadAuthMethod.type` | Required if `loadAuthMethod` section is included. Specifies the type of the auth method. The possible values are `NoCreds` and `TempCreds`.

With 'NoCreds', no credentials will be passed to 'COPY INTO' statement. Databricks cluster needs to have permission to access transformer output S3 bucket. More information can be found [here](https://docs.databricks.com/administration-guide/cloud-configurations/aws/instance-profiles.html).

With 'TempCreds', temporary credentials will be created for every load operation and these temporary credentials will be passed to 'COPY INTO' statement. With this way, Databricks cluster doesn't need permission to access to transformer output S3 bucket. This access will be provided by temporary credentials. |
-| `loadAuthMethod.roleArn` | Required if `loadAuthMethod.type` is `TempCreds`. IAM role that is used while creating temporary credentials. Created credentials will allow to access resources specified in the given role. In our case, “s3:GetObject\*”, “s3:ListBucket”, and “s3:GetBucketLocation” permissions for transformer output S3 bucket should be specified in the role. |
+| `loadAuthMethod.*` | Optional, default method is `NoCreds`. Specifies the auth method to use with `COPY INTO` statement. |
+| `loadAuthMethod.type` | Required if `loadAuthMethod` section is included. Specifies the type of the auth method. The possible values are `NoCreds` and `TempCreds`.

With `NoCreds`, no credentials will be passed to `COPY INTO` statement. Databricks cluster needs to have permission to access transformer output S3 bucket. More information can be found [here](https://docs.databricks.com/administration-guide/cloud-configurations/aws/instance-profiles.html).

With `TempCreds`, temporary credentials will be created for every load operation and these temporary credentials will be passed to `COPY INTO` statement. This way, the Databricks cluster doesn't need permission to access the transformer output S3 bucket; that access is provided by the temporary credentials. |
+| `loadAuthMethod.roleArn` | Required if `loadAuthMethod.type` is `TempCreds`. IAM role that is used while creating temporary credentials. This role should allow access to the S3 bucket the transformer will write data to, with the following permissions: `s3:GetObject*`, `s3:ListBucket`, and `s3:GetBucketLocation`. |
 
 ## AWS specific settings
 
@@ -113,11 +117,13 @@ Only Snowflake Loader can be run on GCP at the moment.
 
 | Parameter | Description |
 |--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `schedules.*` | Optional. Periodic schedules to stop loading, eg for Redshift maintenance window. |
-| `schedules.noOperation.[*]` | Required if `schedules` section is configured. Array of objects which specifies no-operation windows. |
+| `schedules.*` | Optional. Tasks scheduled to run periodically. |
+| `schedules.noOperation.[*]` | Optional. Array of objects that specify no-operation windows during which the periodically scheduled tasks (configured in this section) will not run. |
 | `schedules.noOperation.[*].name` | Human-readable name of the no-op window. |
 | `schedules.noOperation.[*].when` | Cron expression with second granularity. |
 | `schedules.noOperation.[*].duration` | For how long the loader should be paused. |
+| `schedules.optimizeEvents` | Optional. The default value is `"0 0 0 ? * *"` (i.e. every day at 00:00, JVM timezone). Cron expression with second granularity that specifies when to periodically run an `OPTIMIZE` statement on the events table. (Only for Databricks Loader) |
+| `schedules.optimizeManifest` | Optional. The default value is `"0 0 5 ? * *"` (i.e. every day at 05:00 AM, JVM timezone). Cron expression with second granularity that specifies when to periodically run an `OPTIMIZE` statement on the manifest table. (Only for Databricks Loader) |
 | `retryQueue.*` | Optional. Additional backlog of recently failed folders that could be automatically retried. Retry queue saves a failed folder and then re-reads the info from `shredding_complete` S3 file. (Despite the legacy name of the message, which is required for backward compatibility, this also works with wide row format data.) |
 | `retryQueue.period` | Required if `retryQueue` section is configured. How often batch of failed folders should be pulled into a discovery queue. |
 | `retryQueue.size` | Required if `retryQueue` section is configured. How many failures should be kept in memory. After the limit is reached new failures are dropped. 
| diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/stream-transformer-common/_index.mdx b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/stream-transformer-common/_index.mdx index f415b59364..01a8b0bb88 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/stream-transformer-common/_index.mdx +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/stream-transformer-common/_index.mdx @@ -42,3 +42,7 @@ import Link from '@docusaurus/Link'; telemetry.userProvidedId Optional. See here for more information. + + monitoring.sentry.dsn + Optional. For tracking runtime exceptions. + diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/transformer-pubsub/_index.mdx b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/transformer-pubsub/_index.mdx index 8d3b00bccc..d18407b634 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/transformer-pubsub/_index.mdx +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/snowplow-rdb-loader/transforming-enriched-data/reusable/transformer-pubsub/_index.mdx @@ -32,7 +32,7 @@ formats.fileFormat - Optional. The default and only option at the moment is JSON. + Optional. The default option at the moment is JSON. Either JSON or PARQUET. windowing diff --git a/src/componentVersions.js b/src/componentVersions.js index 0e900805df..129e0955f3 100644 --- a/src/componentVersions.js +++ b/src/componentVersions.js @@ -2,7 +2,7 @@ export const versions = { snowplowMicro: '1.3.3', scalaTracker: '2.0.0', flutterTracker: '0.3.0', - rdbLoader: '5.1.2', + rdbLoader: '5.2.0', collector: '2.8.2', enrich: '3.5.1', bqLoader: '1.5.2',
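For reference, the `loadAuthMethod` options documented in the tables above fit together as in the following minimal HOCON sketch of a loader `storage` section. Databricks is shown, but the same block applies to the Redshift and Snowflake loaders; the host, password, and role ARN values are placeholders, not values from this patch.

```hocon
"storage": {
  "type": "databricks",
  "host": "abc.cloud.databricks.com"      # placeholder
  "password": "example-access-token"      # placeholder

  "loadAuthMethod": {
    # "TempCreds": temporary credentials are created for every load operation
    # and passed to the COPY INTO statement. The default is "NoCreds".
    "type": "TempCreds",
    # Placeholder ARN. The role must allow s3:GetObject*, s3:ListBucket and
    # s3:GetBucketLocation on the transformer's output bucket.
    "roleArn": "arn:aws:iam::123456789012:role/example-loader-role"
  }
}
```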
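Likewise, a sketch of the new Databricks `OPTIMIZE` settings combined with a `noOperation` window. The cron values are the documented defaults; the window itself is an illustrative example, not part of the patch.

```hocon
"storage": {
  # Only rows within this period are considered by the scheduled OPTIMIZE runs.
  "eventsOptimizePeriod": "2 days"
}

"schedules": {
  # Cron expressions with second granularity (defaults shown).
  "optimizeEvents": "0 0 0 ? * *"      # events table, daily at 00:00 (JVM timezone)
  "optimizeManifest": "0 0 5 ? * *"    # manifest table, daily at 05:00 (JVM timezone)

  # Example no-operation window: pause the loader for one hour every day at noon.
  "noOperation": [
    {
      "name": "Maintenance window",
      "when": "0 0 12 * * ?",
      "duration": "1 hour"
    }
  ]
}
```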
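Finally, the two transformer options touched by this patch, sketched as a HOCON fragment; the Sentry DSN is a placeholder.

```hocon
# Stream transformer (Pub/Sub) configuration fragment.
"monitoring": {
  "sentry": {
    # Placeholder DSN; when set, runtime exceptions are reported to Sentry.
    "dsn": "https://public@sentry.example.com/1"
  }
}

"formats": {
  # JSON remains the default; PARQUET is now also supported.
  "fileFormat": "PARQUET"
}
```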