diff --git a/stacks/firehose-stack.js b/stacks/firehose-stack.js
index 845256cf..09ca7cbe 100644
--- a/stacks/firehose-stack.js
+++ b/stacks/firehose-stack.js
@@ -326,6 +326,106 @@ export function UcanFirehoseStack ({ stack, app }) {
   })
   uploadAddTable.addDependsOn(glueDatabase)
 
+  // creates a table that can be seen in the AWS Glue table browser at
+  // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
+  // and in the data browser in the Athena Query editor at
+  // https://console.aws.amazon.com/athena/home#/query-editor
+  const storeRemoveTableName = getCdkNames('store-remove-table', app.stage)
+  const storeRemoveTable = new glue.CfnTable(stack, storeRemoveTableName, {
+    catalogId: Aws.ACCOUNT_ID,
+    databaseName,
+    tableInput: {
+      name: storeRemoveTableName,
+      partitionKeys: [
+        { name: 'day', type: 'date' }
+      ],
+      parameters: {
+        classification: "json",
+        typeOfData: "file",
+        // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html for more information on projection
+        // configuration - this should match the "day" parameter and S3 prefix configured in the delivery stream
+        "projection.enabled": "true",
+        "projection.day.type": "date",
+        "projection.day.format": "yyyy-MM-dd",
+        "projection.day.range": "2023-01-01,NOW",
+        "projection.day.interval": "1",
+        "projection.day.interval.unit": "DAYS",
+        "storage.location.template": `s3://${streamLogBucket.bucketName}/logs/receipt/store_remove/\${day}/`
+      },
+      storageDescriptor: {
+        location: `s3://${streamLogBucket.bucketName}/logs/receipt/store_remove/`,
+        columns: [
+          { name: 'carcid', type: 'string' },
+          // STRUCT here refers to the Apache Hive STRUCT datatype - see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
+          // NOTE(review): confirm the nested field names in 'value' and 'out' against a real store/remove receipt before relying on them
+          { name: 'value', type: 'STRUCT<att:ARRAY<struct<can:STRING,with:STRING,nb:STRUCT<link:STRUCT<_cid_slash:STRING>>>>,iss:STRING,aud:STRING>' },
+          { name: "out", type: "STRUCT<error:STRUCT<name:STRING>,ok:STRUCT<size:BIGINT>>" },
+          { name: "ts", type: "timestamp" }
+        ],
+        inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
+        outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
+        serdeInfo: {
+          serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
+          parameters: {
+            // see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
+            'mapping._cid_slash': '/'
+          }
+        }
+      }
+    }
+  })
+  storeRemoveTable.addDependsOn(glueDatabase)
+
+  // creates a table that can be seen in the AWS Glue table browser at
+  // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
+  // and in the data browser in the Athena Query editor at
+  // https://console.aws.amazon.com/athena/home#/query-editor
+  const uploadRemoveTableName = getCdkNames('upload-remove-table', app.stage)
+  const uploadRemoveTable = new glue.CfnTable(stack, uploadRemoveTableName, {
+    catalogId: Aws.ACCOUNT_ID,
+    databaseName,
+    tableInput: {
+      name: uploadRemoveTableName,
+      partitionKeys: [
+        { name: 'day', type: 'date' }
+      ],
+      parameters: {
+        classification: "json",
+        typeOfData: "file",
+        // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html for more information on projection
+        // configuration - this should match the "day" parameter and S3 prefix configured in the delivery stream
+        "projection.enabled": "true",
+        "projection.day.type": "date",
+        "projection.day.format": "yyyy-MM-dd",
+        "projection.day.range": "2023-01-01,NOW",
+        "projection.day.interval": "1",
+        "projection.day.interval.unit": "DAYS",
+        "storage.location.template": `s3://${streamLogBucket.bucketName}/logs/receipt/upload_remove/\${day}/`
+      },
+      storageDescriptor: {
+        location: `s3://${streamLogBucket.bucketName}/logs/receipt/upload_remove/`,
+        columns: [
+          { name: 'carcid', type: 'string' },
+          // STRUCT here refers to the Apache Hive STRUCT datatype - see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
+          // NOTE(review): confirm the nested field names in 'value' and 'out' against a real upload/remove receipt before relying on them
+          { name: 'value', type: 'STRUCT<att:ARRAY<struct<can:STRING,with:STRING,nb:STRUCT<root:STRUCT<_cid_slash:STRING>,shards:ARRAY<STRUCT<_cid_slash:STRING>>>>>,iss:STRING,aud:STRING>' },
+          { name: "out", type: "STRUCT<error:STRUCT<name:STRING>,ok:STRUCT<root:STRUCT<_cid_slash:STRING>,shards:ARRAY<STRUCT<_cid_slash:STRING>>>>" },
+          { name: "ts", type: "timestamp" }
+        ],
+        inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
+        outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
+        serdeInfo: {
+          serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
+          parameters: {
+            // see https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
+            'mapping._cid_slash': '/'
+          }
+        }
+      }
+    }
+  })
+  uploadRemoveTable.addDependsOn(glueDatabase)
+
   // creates a table that can be seen in the AWS Glue table browser at
   // https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
   // and in the data browser in the Athena Query editor at