Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add two more capability tables for Athena #244

Merged
merged 2 commits into from
Oct 19, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions stacks/firehose-stack.js
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,104 @@ export function UcanFirehoseStack ({ stack, app }) {
})
uploadAddTable.addDependsOn(glueDatabase)

// Glue catalog table over the JSON logs written under `logs/receipt/store_remove/`
// in the stream log bucket. Once created, the table can be seen in the AWS Glue
// table browser at
// https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
// and in the data browser of the Athena Query editor at
// https://console.aws.amazon.com/athena/home#/query-editor
const storeRemoveTableName = getCdkNames('store-remove-table', app.stage)

// Root S3 prefix for this table; the per-day partition template below is derived from it.
const storeRemoveLogsLocation = `s3://${streamLogBucket.bucketName}/logs/receipt/store_remove/`

// Column schema. STRUCT / ARRAY are the Apache Hive datatypes - see
// https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
const storeRemoveColumns = [
  { name: 'carcid', type: 'string' },
  {
    name: 'value',
    type: 'STRUCT<att:ARRAY<struct<can:STRING,with:STRING,nb:STRUCT<link:STRUCT<_cid_slash:STRING>>>>,iss:STRING,aud:STRING>'
  },
  {
    name: 'out',
    type: 'STRUCT<error:STRUCT<name:STRING>,ok:STRUCT<size:BIGINT>>'
  },
  { name: 'ts', type: 'timestamp' }
]

const storeRemoveTable = new glue.CfnTable(stack, storeRemoveTableName, {
  catalogId: Aws.ACCOUNT_ID,
  databaseName,
  tableInput: {
    name: storeRemoveTableName,
    // single partition column, populated through partition projection (see parameters)
    partitionKeys: [{ name: 'day', type: 'date' }],
    parameters: {
      classification: 'json',
      typeOfData: 'file',
      // Partition projection settings - these should match the "day" parameter and
      // S3 prefix configured in the delivery stream.
      // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html
      'projection.enabled': 'true',
      'projection.day.type': 'date',
      'projection.day.format': 'yyyy-MM-dd',
      'projection.day.range': '2023-01-01,NOW',
      'projection.day.interval': '1',
      'projection.day.interval.unit': 'DAYS',
      // the escaped \${day} is emitted literally as a placeholder; it is not a JS interpolation
      'storage.location.template': `${storeRemoveLogsLocation}\${day}/`
    },
    storageDescriptor: {
      location: storeRemoveLogsLocation,
      columns: storeRemoveColumns,
      inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
      outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
      serdeInfo: {
        serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
        parameters: {
          // maps the `_cid_slash` field used in the column types to the JSON key "/" - see
          // https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
          'mapping._cid_slash': '/'
        }
      }
    }
  }
})
// declare an explicit dependency on the Glue database resource
storeRemoveTable.addDependsOn(glueDatabase)

// Glue catalog table over the JSON logs written under `logs/receipt/upload_remove/`
// in the stream log bucket. Once created, the table can be seen in the AWS Glue
// table browser at
// https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
// and in the data browser of the Athena Query editor at
// https://console.aws.amazon.com/athena/home#/query-editor
const uploadRemoveTableName = getCdkNames('upload-remove-table', app.stage)

// Root S3 prefix for this table; the per-day partition template below is derived from it.
const uploadRemoveLogsLocation = `s3://${streamLogBucket.bucketName}/logs/receipt/upload_remove/`

// Column schema. STRUCT / ARRAY are the Apache Hive datatypes - see
// https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
const uploadRemoveColumns = [
  { name: 'carcid', type: 'string' },
  {
    name: 'value',
    type: 'STRUCT<att:ARRAY<STRUCT<can:STRING,with:STRING,nb:STRUCT<root:STRUCT<_cid_slash:STRING>,shards:ARRAY<STRUCT<_cid_slash:STRING>>>>>,iss:STRING,aud:STRING>'
  },
  {
    name: 'out',
    type: 'STRUCT<error:STRUCT<name:STRING>,ok:STRUCT<root:STRUCT<_cid_slash:STRING>,shards:ARRAY<STRUCT<_cid_slash:STRING>>>>'
  },
  { name: 'ts', type: 'timestamp' }
]

const uploadRemoveTable = new glue.CfnTable(stack, uploadRemoveTableName, {
  catalogId: Aws.ACCOUNT_ID,
  databaseName,
  tableInput: {
    name: uploadRemoveTableName,
    // single partition column, populated through partition projection (see parameters)
    partitionKeys: [{ name: 'day', type: 'date' }],
    parameters: {
      classification: 'json',
      typeOfData: 'file',
      // Partition projection settings - these should match the "day" parameter and
      // S3 prefix configured in the delivery stream.
      // @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-kinesis-firehose-example.html
      'projection.enabled': 'true',
      'projection.day.type': 'date',
      'projection.day.format': 'yyyy-MM-dd',
      'projection.day.range': '2023-01-01,NOW',
      'projection.day.interval': '1',
      'projection.day.interval.unit': 'DAYS',
      // the escaped \${day} is emitted literally as a placeholder; it is not a JS interpolation
      'storage.location.template': `${uploadRemoveLogsLocation}\${day}/`
    },
    storageDescriptor: {
      location: uploadRemoveLogsLocation,
      columns: uploadRemoveColumns,
      inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
      outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
      serdeInfo: {
        serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
        parameters: {
          // maps the `_cid_slash` field used in the column types to the JSON key "/" - see
          // https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/
          'mapping._cid_slash': '/'
        }
      }
    }
  }
})
// declare an explicit dependency on the Glue database resource
uploadRemoveTable.addDependsOn(glueDatabase)

// creates a table that can be seen in the AWS Glue table browser at
// https://console.aws.amazon.com/glue/home#/v2/data-catalog/tables
// and in the data browser in the Athena Query editor at
Expand Down