From 8adff6070e8ad3cd93667c853bed22540e891431 Mon Sep 17 00:00:00 2001
From: Oguzhan Unlu
Date: Mon, 21 May 2018 19:04:57 +0300
Subject: [PATCH] Bump Stream Enrich to 0.16.1 (closes #161)

---
 .../configs/enrichments/ip_lookups.json        |  6 +-
 .../enrichments/pii_enrichment_config.json     | 34 +++++++
 .../configs/snowplow-stream-enrich.hocon       | 88 +++----------------
 .../resources/init/snowplow_stream_enrich      |  2 +-
 .../resources/ui/js/components/Overview.tsx    |  2 +-
 .../roles/sp_mini_4_setup_apps/tasks/main.yml  |  2 +-
 6 files changed, 54 insertions(+), 80 deletions(-)
 create mode 100644 provisioning/resources/configs/enrichments/pii_enrichment_config.json

diff --git a/provisioning/resources/configs/enrichments/ip_lookups.json b/provisioning/resources/configs/enrichments/ip_lookups.json
index 6bd598e6..cb99b38a 100644
--- a/provisioning/resources/configs/enrichments/ip_lookups.json
+++ b/provisioning/resources/configs/enrichments/ip_lookups.json
@@ -1,13 +1,13 @@
 {
-  "schema": "iglu:com.snowplowanalytics.snowplow/ip_lookups/jsonschema/1-0-0",
+  "schema": "iglu:com.snowplowanalytics.snowplow/ip_lookups/jsonschema/2-0-0",
   "data": {
     "name": "ip_lookups",
     "vendor": "com.snowplowanalytics.snowplow",
     "enabled": true,
     "parameters": {
       "geo": {
-        "database": "GeoLiteCity.dat",
-        "uri": "http://s3-eu-west-1.amazonaws.com/snowplow-hosted-assets/third-party/maxmind"
+        "database": "GeoLite2-City.mmdb",
+        "uri": "http://snowplow-hosted-assets.s3.amazonaws.com/third-party/maxmind"
       }
     }
   }
diff --git a/provisioning/resources/configs/enrichments/pii_enrichment_config.json b/provisioning/resources/configs/enrichments/pii_enrichment_config.json
new file mode 100644
index 00000000..c1c28aee
--- /dev/null
+++ b/provisioning/resources/configs/enrichments/pii_enrichment_config.json
@@ -0,0 +1,34 @@
+{
+  "schema": "iglu:com.snowplowanalytics.snowplow.enrichments/pii_enrichment_config/jsonschema/1-0-0",
+  "data": {
+    "vendor": "com.snowplowanalytics.snowplow.enrichments",
+    "name": "pii_enrichment_config",
+    "enabled": false,
+    "parameters": {
+      "pii": [
+        {
+          "pojo": {
+            "field": "user_id"
+          }
+        },
+        {
+          "pojo": {
+            "field": "user_fingerprint"
+          }
+        },
+        {
+          "json": {
+            "field": "unstruct_event",
+            "schemaCriterion": "iglu:com.mailchimp/subscribe/jsonschema/1-0-*",
+            "jsonPath": "$.data.['email', 'ip_opt']"
+          }
+        }
+      ],
+      "strategy": {
+        "pseudonymize": {
+          "hashFunction": "SHA-256"
+        }
+      }
+    }
+  }
+}
diff --git a/provisioning/resources/configs/snowplow-stream-enrich.hocon b/provisioning/resources/configs/snowplow-stream-enrich.hocon
index f1ec918e..353095bb 100644
--- a/provisioning/resources/configs/snowplow-stream-enrich.hocon
+++ b/provisioning/resources/configs/snowplow-stream-enrich.hocon
@@ -1,4 +1,4 @@
-# Copyright (c) 2013-2017 Snowplow Analytics Ltd. All rights reserved.
+# Copyright (c) 2013-2018 Snowplow Analytics Ltd. All rights reserved.
 #
 # This program is licensed to you under the Apache License Version 2.0, and
 # you may not use this file except in compliance with the Apache License
@@ -15,33 +15,13 @@
 # configuration options for Stream Enrich.
 enrich {
-  # Sources currently supported are:
-  # 'kinesis' for reading Thrift-serialized records from a Kinesis stream
-  # 'kafka' for reading Thrift-serialized records from a Kafka topic
-  # 'stdin' for reading Base64-encoded Thrift-serialized records from stdin
-  # 'nsq' for reading Base64-encoded Thrift-serialized records from NSQ
-  source = nsq
-
-  # Sinks currently supported are:
-  # 'kinesis' for writing enriched events to one Kinesis stream and invalid events to another.
-  # 'kafka' for writing enriched events to one Kafka topic and invalid events to another.
-  # 'stdouterr' for writing enriched events to stdout and invalid events to stderr.
-  # Using "sbt assembly" and "java -jar" is recommended to disable sbt logging.
-  # 'nsq' for writing enriched events to one NSQ topic and invalid events to another.
-  sink = nsq
-
-  # AWS credentials
-  # If both are set to 'default', use the default AWS credentials provider chain.
-  # If both are set to 'iam', use AWS IAM Roles to provision credentials.
-  # If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
-  aws {
-    accessKey = ""
-    secretKey = ""
-  }
   streams {
-    # Stream/topic where the raw events to be enriched are located
-    in.raw = RawEvents
+
+    in {
+      # Stream/topic where the raw events to be enriched are located
+      raw = RawEvents
+    }
     out {
       # Stream/topic where the events that were successfully enriched will end up
       enriched = EnrichedEvents
       # Stream/topic where the event that failed enrichment will be stored
       bad = BadEnrichedEvents
       # How the output stream/topic will be partitioned.
       # Possible partition keys are: event_id, event_fingerprint, domain_userid, network_userid,
       # user_ipaddress, domain_sessionid, user_fingerprint.
       # Refer to https://github.com/snowplow/snowplow/wiki/canonical-event-model to know what the
-      # possible parittion keys correspond to.
+      # possible partition keys correspond to.
       # Otherwise, the partition key will be a random UUID.
+      # Note: Nsq does not make use of partition key.
       partitionKey = ""
     }
-    kinesis {
-      # Region where the streams are located
-      region = ""
-
-      # Maximum number of records to get from Kinesis per call to GetRecords
-      maxRecords = 10000
-
-      # LATEST: most recent data.
-      # TRIM_HORIZON: oldest available data.
-      # Note: This only effects the first run of this application
-      # on a stream.
-      initialPosition = TRIM_HORIZON
-
-      # Minimum and maximum backoff periods, in milliseconds
-      backoffPolicy {
-        minBackoff = 50
-        maxBackoff = 500
-      }
-    }
-
-    # Kafka configuration
-    kafka {
-      brokers = ""
-
-      # Number of retries to perform before giving up on sending a record
-      retries = 0
-    }
+    sourceSink {
+      enabled = nsq
-    # config for nsq
-    nsq {
-      # Channel name for raw event source
+      # Channel name for nsq source
       rawChannel = StreamEnrichChannel
-      # Host name for nsqd
       host = "127.0.0.1"
-
-      # TCP port for nsqd
+      # TCP port for nsqd, 4150 by default
       port = 4150
-      # Host name for lookupd
       lookupHost = "127.0.0.1"
-
-      # Port for nsqlookupd
+      # HTTP port for nsqlookupd, 4161 by default
       lookupPort = 4161
     }
-    # After enrichment, events are accumulated in a buffer before being sent to Kinesis/Kafka.
-    # NOTE: Buffering is not supported by NSQ.
-    # The buffer is emptied whenever:
-    #   - the number of stored records reaches recordLimit or
-    #   - the combined size of the stored records reaches byteLimit or
-    #   - the time in milliseconds since it was last emptied exceeds timeLimit when
-    #     a new event enters the buffer
     buffer {
       byteLimit = 4500000
-      recordLimit = 500 # Not supported by Kafka; will be ignored
+      recordLimit = 500
       timeLimit = 5000
     }
-    # Used for a DynamoDB table to maintain stream state.
-    # Used as the Kafka consumer group ID.
-    # You can set it automatically using: "SnowplowEnrich-$\\{enrich.streams.in.raw\\}"
     appName = ""
   }
 }
diff --git a/provisioning/resources/init/snowplow_stream_enrich b/provisioning/resources/init/snowplow_stream_enrich
index e6424ce4..f64cfad6 100755
--- a/provisioning/resources/init/snowplow_stream_enrich
+++ b/provisioning/resources/init/snowplow_stream_enrich
@@ -10,7 +10,7 @@
 ### END INIT INFO
 dir="/home/ubuntu/snowplow/bin/"
-cmd="java -jar snowplow-stream-enrich-0.12.0.jar --config /home/ubuntu/snowplow/configs/snowplow-stream-enrich.hocon --resolver file:/home/ubuntu/snowplow/configs/iglu-resolver.json --enrichments file:/home/ubuntu/snowplow/configs/enrichments"
+cmd="java -jar snowplow-stream-enrich-nsq-0.16.1.jar --config /home/ubuntu/snowplow/configs/snowplow-stream-enrich.hocon --resolver file:/home/ubuntu/snowplow/configs/iglu-resolver.json --enrichments file:/home/ubuntu/snowplow/configs/enrichments"
 user="ubuntu"
 name="snowplow_stream_enrich"
diff --git a/provisioning/resources/ui/js/components/Overview.tsx b/provisioning/resources/ui/js/components/Overview.tsx
index bf9f2cdd..7e3ee58d 100644
--- a/provisioning/resources/ui/js/components/Overview.tsx
+++ b/provisioning/resources/ui/js/components/Overview.tsx
@@ -50,7 +50,7 @@ export class Overview extends React.Component<{}, {}> {

The software stack installed:
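
For context, the new pii_enrichment_config.json added above ships disabled; when enabled, its pseudonymize strategy replaces each configured field (user_id, user_fingerprint, and the Mailchimp email/ip_opt JSON paths) with its SHA-256 hash. A minimal TypeScript sketch of that idea, assuming Node's built-in crypto module; the pseudonymize helper and the sample value are hypothetical and not Stream Enrich's actual code:

    import { createHash } from "crypto";

    // Replace a PII field value with its SHA-256 hex digest, mirroring the
    // "pseudonymize" strategy declared in pii_enrichment_config.json.
    function pseudonymize(value: string): string {
      return createHash("sha256").update(value).digest("hex");
    }

    // e.g. an event's user_id field
    console.log(pseudonymize("jane.doe@example.com"));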