From bebe1cf4696d67a860c90f4ae699fcbbc5b88f17 Mon Sep 17 00:00:00 2001 From: Travis Vachon Date: Wed, 18 Oct 2023 14:38:31 -0700 Subject: [PATCH 1/2] feat: add two more capability tables for Athena Add two more capability tables to Glue so Athena can query them. I'd like to start doing this in an automated way at some point, but that doesn't feel high priority enough for the moment so I've filed an issue here: https://github.com/web3-storage/w3infra/issues/243 --- stacks/firehose-stack.js | 99 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/stacks/firehose-stack.js b/stacks/firehose-stack.js index 845256cf..36db534f 100644 --- a/stacks/firehose-stack.js +++ b/stacks/firehose-stack.js @@ -9,6 +9,7 @@ import { aws_athena as athena, aws_sam as sam } from 'aws-cdk-lib' +import { Access, Provider, Space, Store, Upload, Consumer, Customer, RateLimit, Subscription, Filecoin, Admin, UCAN } from '@web3-storage/capabilities' import { UcanInvocationStack } from './ucan-invocation-stack.js' @@ -326,6 +327,104 @@ export function UcanFirehoseStack ({ stack, app }) { }) uploadAddTable.addDependsOn(glueDatabase) + // creates a table that can be seen in the AWS Glue table browser at + // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables + // and in the data browser in the Athena Query editor at + // https://console.aws.amazon.com/athena/home#/query-editor + const storeRemoveTableName = getCdkNames('store-remove-table', app.stage) + const storeRemoveTable = new glue.CfnTable(stack, storeRemoveTableName, { + catalogId: Aws.ACCOUNT_ID, + databaseName, + tableInput: { + name: storeRemoveTableName, + partitionKeys: [ + { name: 'day', type: 'date' } + ], + parameters: { + classification: "json", + typeOfData: "file", + // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html for more information on projection + // configuration - this should match the "day" parameter and S3 prefix configured in the delivery stream + "projection.enabled": "true", + "projection.day.type": "date", + "projection.day.format": "yyyy-MM-dd", + "projection.day.range": "2023-01-01,NOW", + "projection.day.interval": "1", + "projection.day.interval.unit": "DAYS", + "storage.location.template": `s3://${streamLogBucket.bucketName}/logs/receipt/store_remove/\${day}/` + }, + storageDescriptor: { + location: `s3://${streamLogBucket.bucketName}/logs/receipt/store_remove/`, + columns: [ + { name: 'carcid', type: 'string' }, + // STRUCT here refers to the Apache Hive STRUCT datatype - see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/ + { name: 'value', type: 'STRUCT>>>,iss:STRING,aud:STRING>' }, + { name: "out", type: "STRUCT,ok:STRUCT>" }, + { name: "ts", type: "timestamp" } + ], + inputFormat: 'org.apache.hadoop.mapred.TextInputFormat', + outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat', + serdeInfo: { + serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe', + parameters: { + // see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/ + 'mapping._cid_slash': '/' + } + } + } + } + }) + storeRemoveTable.addDependsOn(glueDatabase) + + // creates a table that can be seen in the AWS Glue table browser at + // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables + // and in the data browser in the Athena Query editor at + // https://console.aws.amazon.com/athena/home#/query-editor + const uploadRemoveTableName = getCdkNames('upload-remove-table', app.stage) + const uploadRemoveTable = new glue.CfnTable(stack, uploadRemoveTableName, { + catalogId: Aws.ACCOUNT_ID, + databaseName, + tableInput: { + name: uploadRemoveTableName, + partitionKeys: [ + { name: 'day', type: 'date' } + ], + parameters: { + classification: "json", + typeOfData: "file", + // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html for more information on projection + // configuration - this should match the "day" parameter and S3 prefix configured in the delivery stream + "projection.enabled": "true", + "projection.day.type": "date", + "projection.day.format": "yyyy-MM-dd", + "projection.day.range": "2023-01-01,NOW", + "projection.day.interval": "1", + "projection.day.interval.unit": "DAYS", + "storage.location.template": `s3://${streamLogBucket.bucketName}/logs/receipt/upload_remove/\${day}/` + }, + storageDescriptor: { + location: `s3://${streamLogBucket.bucketName}/logs/receipt/upload_remove/`, + columns: [ + { name: 'carcid', type: 'string' }, + // STRUCT here refers to the Apache Hive STRUCT datatype - see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/ + { name: 'value', type: 'STRUCT,shards:ARRAY>>>>,iss:STRING,aud:STRING>' }, + { name: "out", type: "STRUCT,ok:STRUCT,shards:ARRAY>>>" }, + { name: "ts", type: "timestamp" } + ], + inputFormat: 'org.apache.hadoop.mapred.TextInputFormat', + outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat', + serdeInfo: { + serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe', + parameters: { + // see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/ + 'mapping._cid_slash': '/' + } + } + } + } + }) + uploadRemoveTable.addDependsOn(glueDatabase) + // creates a table that can be seen in the AWS Glue table browser at // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables // and in the data browser in the Athena Query editor at From b18120fe819cf2db157c9e47d078ce59bdba1405 Mon Sep 17 00:00:00 2001 From: Travis Vachon Date: Thu, 19 Oct 2023 08:52:52 -0700 Subject: [PATCH 2/2] Update stacks/firehose-stack.js Co-authored-by: Vasco Santos --- stacks/firehose-stack.js | 1 - 1 file changed, 1 deletion(-) diff --git a/stacks/firehose-stack.js b/stacks/firehose-stack.js index 36db534f..09ca7cbe 100644 --- a/stacks/firehose-stack.js +++ b/stacks/firehose-stack.js @@ -9,7 +9,6 @@ import { aws_athena as athena, aws_sam as sam } from 'aws-cdk-lib' -import { Access, Provider, Space, Store, Upload, Consumer, Customer, RateLimit, Subscription, Filecoin, Admin, UCAN } from '@web3-storage/capabilities' import { UcanInvocationStack } from './ucan-invocation-stack.js'