-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
RDB Shredder: add tabular data output (close #151)
- Loading branch information
Showing
8 changed files
with
353 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
shredder/src/main/scala/com.snowplowanalytics.snowplow.storage/spark/Shredded.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. | ||
* | ||
* This program is licensed to you under the Apache License Version 2.0, | ||
* and you may not use this file except in compliance with the Apache License Version 2.0. | ||
* You may obtain a copy of the Apache License Version 2.0 at | ||
* http://www.apache.org/licenses/LICENSE-2.0. | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the Apache License Version 2.0 is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the Apache License Version 2.0 for the specific language governing permissions and | ||
* limitations there under. | ||
*/ | ||
package com.snowplowanalytics.snowplow.storage.spark | ||
|
||
import cats.Id | ||
|
||
import com.snowplowanalytics.iglu.client.Resolver | ||
import com.snowplowanalytics.snowplow.storage.spark.ShredJob.Hierarchy | ||
|
||
/**
 * ADT describing the forms in which a shredded entity can be written to blob storage.
 * Each accessor below exposes the fields of exactly one variant as a tuple of
 * `(vendor, name, format, version, data)` and yields `None` for the other variant.
 */
sealed trait Shredded {

  /** Fields of this entity if it is a `Shredded.Json`, `None` if it is a `Shredded.Tabular` */
  def json: Option[(String, String, String, String, String)] =
    this match {
      case j: Shredded.Json    => Some((j.vendor, j.name, j.format, j.version, j.data))
      case _: Shredded.Tabular => None
    }

  /** Fields of this entity if it is a `Shredded.Tabular`, `None` if it is a `Shredded.Json` */
  def tabular: Option[(String, String, String, String, String)] =
    this match {
      case t: Shredded.Tabular => Some((t.vendor, t.name, t.format, t.version, t.data))
      case _: Shredded.Json    => None
    }
}
|
||
object Shredded {

  /** Legacy format: the entity stays JSON and RDB Loader loads it using JSON Paths */
  case class Json(vendor: String, name: String, format: String, version: String, data: String) extends Shredded

  /** The entity is written as TSV, which RDB Loader loads directly */
  case class Tabular(vendor: String, name: String, format: String, version: String, data: String) extends Shredded

  /**
   * Turn a JSON `Hierarchy`, extracted from an enriched event, into a `Shredded` entity,
   * deciding how it should look in the destination: JSON or TSV.
   * If the flattening algorithm fails at any point, the result falls back to the JSON format.
   *
   * @param jsonOnly  when `true` the output is always JSON and no flattening is attempted;
   *                  all downstream components should agree on that
   * @param resolver  Iglu resolver used to request all necessary entities; passed by name
   *                  and only referenced on the non-`jsonOnly` path
   * @param hierarchy actual JSON hierarchy from an enriched event
   * @return `Tabular` when flattening succeeded, `Json` otherwise
   */
  def fromHierarchy(jsonOnly: Boolean, resolver: => Resolver[Id])(hierarchy: Hierarchy): Shredded = {
    val schema = hierarchy.entity.schema
    val (vendor, name, format) = (schema.vendor, schema.name, schema.format)

    // Shared by the `jsonOnly` path and the flattening-failure path;
    // a `def`, so the JSON is dumped only when one of those paths is taken
    def asJson: Shredded =
      Json(vendor, name, format, schema.version.asString, hierarchy.dumpJson)

    if (jsonOnly) {
      asJson
    } else {
      EventUtils.flatten(resolver, hierarchy.entity).value match {
        case Left(_) =>
          asJson
        case Right(columns) =>
          val meta = EventUtils.buildMetadata(hierarchy.eventId, hierarchy.collectorTstamp, schema)
          val row = meta ++ columns
          // Tabular entities carry only the schema's `version.model`, not the full version string
          Tabular(vendor, name, format, schema.version.model.toString, row.mkString("\t"))
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.