From a19c6433b073f19238d1ea9e91313eaaf5dc005b Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Wed, 25 Mar 2020 00:42:35 +0300 Subject: [PATCH] Loader: switch from Free monad to cats-effect IO (close #184) --- build.sbt | 1 + project/plugins.sbt | 2 +- .../snowplow/rdbloader/LoaderA.scala | 171 ------- .../snowplow/rdbloader/LoaderError.scala | 7 +- .../snowplow/rdbloader/Log.scala | 36 -- .../snowplow/rdbloader/Main.scala | 98 ++-- .../snowplow/rdbloader/Security.scala | 52 -- .../snowplow/rdbloader/config/CliConfig.scala | 5 +- .../rdbloader/config/SnowplowConfig.scala | 2 +- .../snowplow/rdbloader/db/Migration.scala | 66 +-- .../rdbloader/discovery/DataDiscovery.scala | 114 +++-- .../discovery/DiscoveryFailure.scala | 5 +- .../rdbloader/discovery/ShreddedType.scala | 73 ++- .../snowplow/rdbloader/dsl/AWS.scala | 188 ++++++++ .../snowplow/rdbloader/dsl/Cache.scala | 40 ++ .../snowplow/rdbloader/dsl/FS.scala | 59 +++ .../snowplow/rdbloader/dsl/Iglu.scala | 41 ++ .../snowplow/rdbloader/dsl/JDBC.scala | 247 ++++++++++ .../snowplow/rdbloader/dsl/Logging.scala | 188 ++++++++ .../snowplow/rdbloader/dsl/RealWorld.scala | 52 ++ .../interpreters/DryRunInterpreter.scala | 148 ------ .../rdbloader/interpreters/Interpreter.scala | 62 --- .../interpreters/RealWorldInterpreter.scala | 237 ---------- .../implementations/JdbcInterpreter.scala | 149 ------ .../implementations/S3Interpreter.scala | 136 ------ .../implementations/SshInterpreter.scala | 77 --- .../implementations/TrackerInterpreter.scala | 181 ------- .../snowplow/rdbloader/loaders/Common.scala | 87 ++-- .../rdbloader/loaders/PostgresqlLoader.scala | 43 +- .../loaders/RedshiftLoadStatements.scala | 1 + .../rdbloader/loaders/RedshiftLoader.scala | 87 ++-- .../snowplow/rdbloader/package.scala | 73 +-- .../snowplow/rdbloader/utils/Common.scala | 22 +- .../snowplow/rdbloader/{ => utils}/S3.scala | 10 +- .../snowplow/rdbloader/utils/SSH.scala | 62 +++ .../snowplow/rdbloader/Interpreters.scala | 39 ++ .../snowplow/rdbloader/S3Spec.scala | 3 +- .../snowplow/rdbloader/SpecHelpers.scala | 11 +- .../snowplow/rdbloader/TestInterpreter.scala | 184 +++++-- .../rdbloader/config/CliConfigSpec.scala | 4 +- .../snowplow/rdbloader/db/MigrationSpec.scala | 46 +- .../discovery/DataDiscoverySpec.scala | 447 +++++------------- .../discovery/ShreddedTypeSpec.scala | 2 +- .../rdbloader/loaders/CommonSpec.scala | 110 ----- .../loaders/RedshiftLoaderSpec.scala | 316 +++++-------- .../snowplow/rdbloader/utils/CommonSpec.scala | 18 +- 46 files changed, 1643 insertions(+), 2359 deletions(-) delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderA.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Log.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Security.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/AWS.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Cache.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/FS.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Iglu.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/JDBC.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Logging.scala create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/RealWorld.scala delete mode 100644 
src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/DryRunInterpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/Interpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/RealWorldInterpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/JdbcInterpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/S3Interpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/SshInterpreter.scala delete mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/TrackerInterpreter.scala rename src/main/scala/com/snowplowanalytics/snowplow/rdbloader/{ => utils}/S3.scala (98%) create mode 100644 src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/SSH.scala create mode 100644 src/test/scala/com/snowplowanalytics/snowplow/rdbloader/Interpreters.scala delete mode 100644 src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/CommonSpec.scala diff --git a/build.sbt b/build.sbt index 663d7a326..7050bd5d7 100755 --- a/build.sbt +++ b/build.sbt @@ -43,6 +43,7 @@ lazy val loader = project.in(file(".")) .settings(BuildSettings.assemblySettings) .settings(resolvers ++= Dependencies.resolutionRepos) .settings( + addCompilerPlugin("com.olegpy" %% "better-monadic-for" % "0.3.1"), libraryDependencies ++= Seq( Dependencies.decline, Dependencies.scalaTracker, diff --git a/project/plugins.sbt b/project/plugins.sbt index c3f680dd5..0dbfaa658 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,4 +1,4 @@ logLevel := Level.Warn addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") -addSbtPlugin("com.localytics" % "sbt-dynamodb" % "2.0.3") \ No newline at end of file +addSbtPlugin("com.localytics" % "sbt-dynamodb" % "2.0.3") diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderA.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderA.scala deleted file mode 100644 index 6e6d3abed..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderA.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader - -import java.nio.file.Path - -import cats.free.Free -import cats.data.EitherT -import cats.implicits._ - -import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList - -// This library -import Security.Tunnel -import loaders.Common.SqlString -import db.Decoder - -/** - * RDB Loader algebra. 
Used to build Free data-structure, - * interpreted to IO-actions - */ -sealed trait LoaderA[A] - -object LoaderA { - - // Discovery ops - case class ListS3(bucket: S3.Folder) extends LoaderA[Either[LoaderError, List[S3.BlobObject]]] - case class KeyExists(key: S3.Key) extends LoaderA[Boolean] - case class DownloadData(path: S3.Folder, dest: Path) extends LoaderA[Either[LoaderError, List[Path]]] - - // Loading ops - case class ExecuteUpdate(sql: SqlString) extends LoaderA[Either[LoaderError, Long]] - case class CopyViaStdin(files: List[Path], sql: SqlString) extends LoaderA[Either[LoaderError, Long]] - - // JDBC ops - case class ExecuteQuery[A](query: SqlString, ev: Decoder[A]) extends LoaderA[Either[LoaderError, A]] - - // FS ops - case object CreateTmpDir extends LoaderA[Either[LoaderError, Path]] - case class DeleteDir(path: Path) extends LoaderA[Either[LoaderError, Unit]] - - // Auxiliary ops - case class Sleep(timeout: Long) extends LoaderA[Unit] - case class Track(exitLog: Log) extends LoaderA[Unit] - case class Dump(key: S3.Key) extends LoaderA[Either[String, S3.Key]] - case class Exit(exitLog: Log, dumpResult: Option[Either[String, S3.Key]]) extends LoaderA[Int] - case class Print(message: String) extends LoaderA[Unit] - - // Cache ops - case class Put(key: String, value: Option[S3.Key]) extends LoaderA[Unit] - case class Get(key: String) extends LoaderA[Option[Option[S3.Key]]] - - // Tunnel ops - case class EstablishTunnel(tunnelConfig: Tunnel) extends LoaderA[Either[LoaderError, Unit]] - case class CloseTunnel() extends LoaderA[Either[LoaderError, Unit]] - - // Security ops - case class GetEc2Property(name: String) extends LoaderA[Either[LoaderError, String]] - - // Iglu ops - case class GetSchemas(vendor: String, name: String, model: Int) extends LoaderA[Either[LoaderError, SchemaList]] - - - def listS3(bucket: S3.Folder): Action[Either[LoaderError, List[S3.BlobObject]]] = - Free.liftF[LoaderA, Either[LoaderError, List[S3.BlobObject]]](ListS3(bucket)) - - /** Check if S3 key exist */ - def keyExists(key: S3.Key): Action[Boolean] = - Free.liftF[LoaderA, Boolean](KeyExists(key)) - - /** Download S3 key into local path */ - def downloadData(source: S3.Folder, dest: Path): LoaderAction[List[Path]] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, List[Path]]](DownloadData(source, dest))) - - /** Execute single SQL statement (against target in interpreter) */ - def executeUpdate(sql: SqlString): LoaderAction[Long] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, Long]](ExecuteUpdate(sql))) - - /** Execute multiple (against target in interpreter) */ - def executeUpdates(queries: List[SqlString]): LoaderAction[Unit] = { - val shortCircuiting = queries.traverse(query => executeUpdate(query)) - EitherT(shortCircuiting.void.value) - } - - /** Execute query and parse results into `A` */ - def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[A] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, A]](ExecuteQuery[A](query, ev))) - - - /** Execute SQL transaction (against target in interpreter) */ - def executeTransaction(queries: List[SqlString]): LoaderAction[Unit] = { - val begin = SqlString.unsafeCoerce("BEGIN") - val commit = SqlString.unsafeCoerce("COMMIT") - val transaction = (begin :: queries) :+ commit - executeUpdates(transaction) - } - - - /** Perform PostgreSQL COPY table FROM STDIN (against target in interpreter) */ - def copyViaStdin(files: List[Path], query: SqlString): LoaderAction[Long] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, 
Long]](CopyViaStdin(files, query))) - - - /** Create tmp directory */ - def createTmpDir: LoaderAction[Path] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, Path]](CreateTmpDir)) - - /** Delete directory */ - def deleteDir(path: Path): LoaderAction[Unit] = - EitherT(Free.liftF[LoaderA, Either[LoaderError, Unit]](DeleteDir(path))) - - - /** Block thread for some time, milliseconds */ - def sleep(timeout: Long): Action[Unit] = - Free.liftF[LoaderA, Unit](Sleep(timeout)) - - /** Track result via Snowplow tracker */ - def track(result: Log): Action[Unit] = - Free.liftF[LoaderA, Unit](Track(result)) - - /** Dump log to S3 */ - def dump(key: S3.Key): Action[Either[String, S3.Key]] = - Free.liftF[LoaderA, Either[String, S3.Key]](Dump(key)) - - /** Close RDB Loader app with appropriate state */ - def exit(result: Log, dumpResult: Option[Either[String, S3.Key]]): Action[Int] = - Free.liftF[LoaderA, Int](Exit(result, dumpResult)) - - /** Print message to stdout */ - def print(message: String): Action[Unit] = - Free.liftF[LoaderA, Unit](Print(message)) - - - /** Put value into cache (stored in interpreter) */ - def putCache(key: String, value: Option[S3.Key]): Action[Unit] = - Free.liftF[LoaderA, Unit](Put(key, value)) - - /** Get value from cache (stored in interpreter) */ - def getCache(key: String): Action[Option[Option[S3.Key]]] = - Free.liftF[LoaderA, Option[Option[S3.Key]]](Get(key)) - - - /** Create SSH tunnel to bastion host */ - def establishTunnel(tunnelConfig: Tunnel): Action[Either[LoaderError, Unit]] = - Free.liftF[LoaderA, Either[LoaderError, Unit]](EstablishTunnel(tunnelConfig)) - - /** Close single available SSH tunnel */ - def closeTunnel(): Action[Either[LoaderError, Unit]] = - Free.liftF[LoaderA, Either[LoaderError, Unit]](CloseTunnel()) - - - /** Retrieve decrypted property from EC2 Parameter Store */ - def getEc2Property(name: String): Action[Either[LoaderError, String]] = - Free.liftF[LoaderA, Either[LoaderError, String]](GetEc2Property(name)) - - - /** Retrieve list of schemas from Iglu Server */ - def getSchemas(vendor: String, name: String, model: Int): Action[Either[LoaderError, SchemaList]] = - Free.liftF[LoaderA, Either[LoaderError, SchemaList]](GetSchemas(vendor, name, model)) -} - diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderError.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderError.scala index 7a97e87c0..062ba90d2 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderError.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/LoaderError.scala @@ -13,13 +13,12 @@ package com.snowplowanalytics.snowplow.rdbloader import cats.Show -import cats.implicits._ import cats.data.ValidatedNel import com.snowplowanalytics.snowplow.rdbloader.discovery.DiscoveryFailure /** Root error type */ -sealed trait LoaderError +sealed trait LoaderError extends Product with Serializable object LoaderError { @@ -63,7 +62,9 @@ object LoaderError { validated.leftMap(errors => DiscoveryError(errors.toList): LoaderError).toEither /** Other errors */ - case class LoaderLocalError(message: String) extends LoaderError + case class LoaderLocalError(message: String) extends Throwable with LoaderError { + override def getMessage: String = message + } /** Error happened during DDL-statements execution. 
Critical */ case class MigrationError(message: String) extends LoaderError diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Log.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Log.scala deleted file mode 100644 index 761836aae..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Log.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader - -/** - * End-of-the-world result type. - * Controls how RDB Loader exits - */ -sealed trait Log - -object Log { - - /** - * Loading succeeded. No messages, 0 exit code - */ - case object LoadingSucceeded extends Log { - override def toString: String = s"Completed successfully" - } - - /** - * Loading failed. Write error message. 1 exit code. - */ - case class LoadingFailed(error: String) extends Log { - override def toString: String = s"Failed:\n$error" - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Main.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Main.scala index a99e98f41..6c9d51a02 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Main.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Main.scala @@ -12,71 +12,69 @@ */ package com.snowplowanalytics.snowplow.rdbloader -import cats.syntax.flatMap._ +import cats.Monad import cats.data.Validated._ +import cats.implicits._ +import cats.effect.{ExitCode, IO, IOApp } -// This project -import interpreters.Interpreter -import config.CliConfig -import loaders.Common.{ load, discover } +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, JDBC, Logging, RealWorld} +import com.snowplowanalytics.snowplow.rdbloader.config.CliConfig +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.{discover, load} +import com.snowplowanalytics.snowplow.rdbloader.utils.{S3, SSH} -/** - * Application entry point - */ -object Main { +object Main extends IOApp { /** * If arguments or config is invalid exit with 1 * and print errors to EMR stdout * If arguments and config are valid, but loading failed * print message to `track` bucket */ - def main(argv: Array[String]): Unit = { + def run(argv: List[String]): IO[ExitCode] = CliConfig.parse(argv) match { case Valid(config) => - val status = run(config) - sys.exit(status) - case Invalid(errors) => - println("Configuration error") - errors.toList.foreach(error => println(error.message)) - sys.exit(1) - } - } + RealWorld.initialize[IO](config).flatMap { dsls => + import dsls._ - /** - * Initialize interpreter from parsed configuration and - * run all IO actions through it. Should never throw exceptions - * - * @param config parsed configuration - * @return exit code status. 
0 for success, 1 if anything went wrong - */ - def run(config: CliConfig): Int = { - val interpreter = Interpreter.initialize(config) + val result = for { + discovery <- discover[IO](config) + jdbc = SSH.resource[IO](config.target.sshTunnel) *> + JDBC.interpreter[IO](config.target, config.dryRun) + _ <- LoaderAction(jdbc.use { implicit conn => load[IO](config, discovery).value }) + } yield () - val actions: Action[Int] = for { - data <- discover(config).flatTap(db.Migration.perform(config.target.schema)).value - result <- data match { - case Right(discovery) => load(config, discovery).value - case Left(LoaderError.StorageTargetError(message)) => - val upadtedMessage = s"$message\n${interpreter.getLastCopyStatements}" - ActionE.liftError(LoaderError.StorageTargetError(upadtedMessage)) - case Left(error) => ActionE.liftError(error) - } - message = utils.Common.interpret(config, result) - _ <- LoaderA.track(message) - status <- close(config.logKey, message) - } yield status - - actions.foldMap(interpreter.run) - } + result + .value + .attempt + .map { // TODO: write shorter; and figure out if unit test is possible + case Left(e) => + e.printStackTrace(System.out) + (LoaderError.LoaderLocalError(e.getMessage): LoaderError).asLeft + case Right(e) => e + } + .flatMap(res => close[IO](config.logKey, res)) + } + case Invalid(errors) => + IO.delay(println("Configuration error")) *> + errors.traverse_(message => IO.delay(println(message))).as(ExitCode.Error) + } /** Get exit status based on all previous steps */ - private def close(logKey: Option[S3.Key], message: Log) = { - logKey match { - case Some(key) => for { - dumpResult <- LoaderA.dump(key) - status <- LoaderA.exit(message, Some(dumpResult)) - } yield status - case None => LoaderA.exit(message, None) + private def close[F[_]: Monad: Logging: AWS](logKey: Option[S3.Key], result: Either[LoaderError, Unit]): F[ExitCode] = { + val dumping = logKey.traverse(Logging[F].dump).flatMap { dumpResult => + (result, dumpResult) match { + case (Right(_), None) => + Logging[F].print(s"INFO: Logs were not dumped to S3").as(ExitCode.Success) + case (Left(_), None) => + Logging[F].print(s"INFO: Logs were not dumped to S3").as(ExitCode.Error) + case (Right(_), Some(Right(key))) => + Logging[F].print(s"INFO: Logs successfully dumped to S3 [$key]").as(ExitCode.Success) + case (Left(_), Some(Right(key))) => + Logging[F].print(s"INFO: Logs successfully dumped to S3 [$key]").as(ExitCode.Error) + case (_, Some(Left(error))) => + Logging[F].print(s"ERROR: Log-dumping failed: [$error]").as(ExitCode.Error) + } } + + Logging[F].track(result) *> dumping } } diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Security.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Security.scala deleted file mode 100644 index 673740a08..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/Security.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader - -import cats.Functor -import cats.data.EitherT -import cats.implicits._ - -import common.StorageTarget.TunnelConfig - -/** Functions working with identities and security layers */ -object Security { - - /** Actual SSH identity data. Both passphrase and key are optional */ - case class Identity(passphrase: Option[Array[Byte]], key: Option[Array[Byte]]) - - /** Tunnel configuration with retrieved identity, ready to be used for establishing tunnel */ - case class Tunnel(config: TunnelConfig, identity: Identity) - - private val F = Functor[Action].compose[Either[LoaderError, ?]] - - /** Convert pure tunnel configuration to configuration with actual key and passphrase */ - def getIdentity(tunnelConfig: TunnelConfig): Action[Either[LoaderError, Identity]] = { - val key = tunnelConfig.bastion.key.map(_.ec2ParameterStore.parameterName).map(LoaderA.getEc2Property) - // Invert Option, Either and Action - val keyBytes: Action[Either[LoaderError, Option[Array[Byte]]]] = key.sequence.map(_.sequence.map(_.map(_.getBytes))) - F.map(keyBytes)(key => Identity(tunnelConfig.bastion.passphrase.map(_.getBytes()), key)) - } - - /** Perform loading and make sure tunnel is closed */ - def bracket(tunnelConfig: Option[TunnelConfig], action: LoaderAction[Unit]): LoaderAction[Unit] = { - tunnelConfig match { - case Some(tunnel) => for { - identity <- EitherT(getIdentity(tunnel)) - _ <- EitherT(LoaderA.establishTunnel(Security.Tunnel(tunnel, identity))) - _ <- action - _ <- EitherT(LoaderA.closeTunnel()) - } yield () - case None => action - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfig.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfig.scala index ffd0aec3f..41d174c33 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfig.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfig.scala @@ -19,11 +19,10 @@ import java.nio.charset.StandardCharsets import cats.Id import cats.data._ import cats.implicits._ - import com.monovore.decline.{Argument, Command, Opts} import com.snowplowanalytics.iglu.client.Client -import com.snowplowanalytics.snowplow.rdbloader.common.{ StorageTarget, StringEnum } - +import com.snowplowanalytics.snowplow.rdbloader.common.{StorageTarget, StringEnum} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 import io.circe.Json import io.circe.parser.{parse => parseJson} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/SnowplowConfig.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/SnowplowConfig.scala index f8b35be6f..6079d8f43 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/SnowplowConfig.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/config/SnowplowConfig.scala @@ -24,7 +24,7 @@ import io.circe.yaml.parser // This project import common.StringEnum -import S3._ +import com.snowplowanalytics.snowplow.rdbloader.utils.S3._ import Semver._ import LoaderError._ diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/db/Migration.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/db/Migration.scala index c0e51cd78..e595c064f 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/db/Migration.scala +++ 
b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/db/Migration.scala @@ -10,21 +10,22 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader.db +package com.snowplowanalytics.snowplow.rdbloader +package db +import cats.{Functor, Monad} import cats.data.EitherT import cats.implicits._ -import com.snowplowanalytics.iglu.core.{ SchemaKey, SchemaMap, SchemaVer } - +import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaMap, SchemaVer} import com.snowplowanalytics.iglu.schemaddl.StringUtils -import com.snowplowanalytics.iglu.schemaddl.migrations.{ FlatSchema, Migration => DMigration, SchemaList => DSchemaList } +import com.snowplowanalytics.iglu.schemaddl.migrations.{FlatSchema, Migration => DMigration, SchemaList => DSchemaList} import com.snowplowanalytics.iglu.schemaddl.redshift.Ddl import com.snowplowanalytics.iglu.schemaddl.redshift.generators.{DdlGenerator, MigrationGenerator} -import com.snowplowanalytics.snowplow.rdbloader.{LoaderA, LoaderAction, LoaderError } import com.snowplowanalytics.snowplow.rdbloader.db.Entities.{Columns, TableState} -import com.snowplowanalytics.snowplow.rdbloader.discovery.{DataDiscovery, ShreddedType, DiscoveryFailure} +import com.snowplowanalytics.snowplow.rdbloader.discovery.{DataDiscovery, DiscoveryFailure, ShreddedType} +import com.snowplowanalytics.snowplow.rdbloader.dsl.{Logging, Iglu, JDBC} import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString object Migration { @@ -33,27 +34,27 @@ object Migration { * latest state on the Iglu Server. Create or update tables in that case. * Do nothing in case there's only legacy JSON data */ - def perform(dbSchema: String)(discoveries: List[DataDiscovery]): LoaderAction[Unit] = + def perform[F[_]: Monad: Logging: Iglu: JDBC](dbSchema: String)(discoveries: List[DataDiscovery]): LoaderAction[F, Unit] = discoveries.flatMap(_.shreddedTypes).traverse_ { case ShreddedType.Tabular(ShreddedType.Info(_, vendor, name, model, _)) => for { - schemas <- EitherT(LoaderA.getSchemas(vendor, name, model)) + schemas <- EitherT(Iglu[F].getSchemas(vendor, name, model)) tableName = StringUtils.getTableName(SchemaMap(SchemaKey(vendor, name, "jsonschema", SchemaVer.Full(model, 0, 0)))) _ <- for { - exists <- tableExists(dbSchema, tableName) + exists <- tableExists[F](dbSchema, tableName) _ <- if (exists) for { - description <- getVersion(dbSchema, tableName, schemas) + description <- getVersion[F](dbSchema, tableName, schemas) matches = schemas.latest.schemaKey == description.version - columns <- getColumns(dbSchema, tableName) - _ <- if (matches) LoaderAction.unit else updateTable(dbSchema, description.version, columns, schemas) - } yield () else createTable(dbSchema, tableName, schemas) + columns <- getColumns[F](dbSchema, tableName) + _ <- if (matches) LoaderAction.unit[F] else updateTable[F](dbSchema, description.version, columns, schemas) + } yield () else createTable[F](dbSchema, tableName, schemas) } yield () } yield () - case ShreddedType.Json(_, _) => LoaderAction.unit + case ShreddedType.Json(_, _) => LoaderAction.unit[F] } /** Find the latest schema version in the table and confirm that it is the latest in `schemas` */ - def getVersion(dbSchema: String, tableName: String, latest: DSchemaList): LoaderAction[TableState] = { + def getVersion[F[_]: Monad: JDBC](dbSchema: String, tableName: String, latest: 
DSchemaList): LoaderAction[F, TableState] = { val query = SqlString.unsafeCoerce( s""" |SELECT obj_description(oid) @@ -64,12 +65,11 @@ object Migration { | WHERE nspname = '$dbSchema') |AND relname = '$tableName' """.stripMargin) - - LoaderA.executeQuery[TableState](query).leftMap(annotateError(dbSchema, tableName)) + JDBC[F].executeQuery[TableState](query).leftMap(annotateError(dbSchema, tableName)) } /** Check if table exists in `dbSchema` */ - def tableExists(dbSchema: String, table: String): LoaderAction[Boolean] = { + def tableExists[F[_]: Functor: JDBC](dbSchema: String, table: String): LoaderAction[F, Boolean] = { val query = SqlString.unsafeCoerce( s""" |SELECT EXISTS ( @@ -79,22 +79,22 @@ object Migration { | AND tablename = '$table') AS exists; """.stripMargin) - LoaderA.executeQuery[Boolean](query).leftMap(annotateError(dbSchema, table)) + JDBC[F].executeQuery[Boolean](query).leftMap(annotateError(dbSchema, table)) } - def createTable(dbSchema: String, name: String, schemas: DSchemaList): LoaderAction[Unit] = { + def createTable[F[_]: Monad: Logging: JDBC](dbSchema: String, name: String, schemas: DSchemaList): LoaderAction[F, Unit] = { val subschemas = FlatSchema.extractProperties(schemas) val tableName = StringUtils.getTableName(schemas.latest) val ddl = DdlGenerator.generateTableDdl(subschemas, tableName, Some(dbSchema), 4096, false) val comment = DdlGenerator.getTableComment(name, Some(dbSchema), schemas.latest) - LoaderA.print(s"Creating $dbSchema.$name table for ${comment.comment}").liftA *> - LoaderA.executeUpdate(ddl.toSql).void *> - LoaderA.executeUpdate(comment.toSql).void *> - LoaderA.print(s"Table created").liftA + Logging[F].print(s"Creating $dbSchema.$name table for ${comment.comment}").liftA *> + JDBC[F].executeUpdate(ddl.toSql).void *> + JDBC[F].executeUpdate(comment.toSql).void *> + Logging[F].print(s"Table created").liftA } /** Update existing table specified by `current` into a final version present in `state` */ - def updateTable(dbSchema: String, current: SchemaKey, columns: Columns, state: DSchemaList): LoaderAction[Unit] = + def updateTable[F[_]: Monad: JDBC: Logging](dbSchema: String, current: SchemaKey, columns: Columns, state: DSchemaList): LoaderAction[F, Unit] = state match { case s: DSchemaList.Full => val migrations = s.extractSegments.map(DMigration.fromSegment) @@ -102,24 +102,24 @@ object Migration { case Some(relevantMigration) => val ddlFile = MigrationGenerator.generateMigration(relevantMigration, 4096, Some(dbSchema)) val ddl = SqlString.unsafeCoerce(ddlFile.render) - LoaderAction.liftA(ddlFile.warnings.traverse_(LoaderA.print)) *> - LoaderAction.liftA(LoaderA.print(s"Executing migration DDL statement: $ddl")) *> - LoaderA.executeUpdate(ddl).void + LoaderAction.liftF(ddlFile.warnings.traverse_(Logging[F].print)) *> + LoaderAction.liftF(Logging[F].print(s"Executing migration DDL statement: $ddl")) *> + JDBC[F].executeUpdate(ddl).void case None => val message = s"Warning: Table's schema key '${current.toSchemaUri}' cannot be found in fetched schemas $state. 
Migration cannot be created" - LoaderAction.liftE[Unit](DiscoveryFailure.IgluError(message).toLoaderError.asLeft) + LoaderAction.liftE[F, Unit](DiscoveryFailure.IgluError(message).toLoaderError.asLeft) } case _: DSchemaList.Single => - LoaderA.print(s"Warning: updateTable executed for a table with single schema\ncolumns: $columns\nstate: $state").liftA + Logging[F].print(s"Warning: updateTable executed for a table with single schema\ncolumns: $columns\nstate: $state").liftA } /** List all columns in the table */ - def getColumns(dbSchema: String, tableName: String): LoaderAction[Columns] = { + def getColumns[F[_]: Monad: JDBC](dbSchema: String, tableName: String): LoaderAction[F, Columns] = { val setSchema = SqlString.unsafeCoerce(s"SET search_path TO $dbSchema;") val getColumns = SqlString.unsafeCoerce(s"""SELECT "column" FROM PG_TABLE_DEF WHERE tablename = '$tableName';""") for { - _ <- LoaderA.executeUpdate(setSchema) - columns <- LoaderA.executeQuery[Columns](getColumns).leftMap(annotateError(dbSchema, tableName)) + _ <- JDBC[F].executeUpdate(setSchema) + columns <- JDBC[F].executeQuery[Columns](getColumns).leftMap(annotateError(dbSchema, tableName)) } yield columns } diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscovery.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscovery.scala index b869da4ff..1d3cb9b9b 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscovery.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscovery.scala @@ -13,15 +13,15 @@ package com.snowplowanalytics.snowplow.rdbloader package discovery -import java.util.UUID - +import scala.concurrent.duration._ import cats._ import cats.data._ -import cats.free.Free import cats.implicits._ - +import cats.effect.Timer import com.snowplowanalytics.snowplow.rdbloader.config.Semver import com.snowplowanalytics.snowplow.rdbloader.LoaderError._ +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, Cache, Logging} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 /** * Result of data discovery in shredded.good folder @@ -103,42 +103,40 @@ object DataDiscovery { * + files with unknown path were found in *any* shred run folder * * @param target either shredded good or specific run folder - * @param id storage target id to avoid "re-discovering" target when using manifest * @param shredJob shred job version to check path pattern * @param region AWS region for S3 buckets * @param assets optional JSONPath assets S3 bucket * @return list (probably empty, but usually with single element) of discover results * (atomic events and shredded types) */ - def discoverFull(target: DiscoveryTarget, - id: UUID, - shredJob: Semver, - region: String, - assets: Option[S3.Folder]): LoaderAction[Discovered] = { - def group(validatedDataKeys: LoaderAction[ValidatedDataKeys]): LoaderAction[Discovered] = + def discover[F[_]: Monad: Cache: Logging: AWS](target: DiscoveryTarget, + shredJob: Semver, + region: String, + assets: Option[S3.Folder]): LoaderAction[F, Discovered] = { + def group(validatedDataKeys: LoaderAction[F, ValidatedDataKeys[F]]): LoaderAction[F, Discovered] = for { keys <- validatedDataKeys - discovery <- groupKeysFull(keys) + discovery <- groupKeysFull[F](keys) } yield discovery val result = target match { case Global(folder) => - val keys: LoaderAction[ValidatedDataKeys] = - listGoodBucket(folder).map(transformKeys(shredJob, region, assets)) + val keys: LoaderAction[F, 
ValidatedDataKeys[F]] = + listGoodBucket[F](folder).map(transformKeys[F](shredJob, region, assets)) group(keys) case InSpecificFolder(folder) => - val keys: LoaderAction[ValidatedDataKeys] = - listGoodBucket(folder).map { keys => + val keys: LoaderAction[F, ValidatedDataKeys[F]] = + listGoodBucket[F](folder).map { keys => if (keys.isEmpty) { val failure = Validated.Invalid(NonEmptyList(DiscoveryFailure.NoDataFailure(folder), Nil)) - Free.pure(failure) - } else transformKeys(shredJob, region, assets)(keys) + Monad[F].pure(failure) + } else transformKeys[F](shredJob, region, assets)(keys) } for { discoveries <- group(keys) _ <- if (discoveries.lengthCompare(1) > 0) { - LoaderAction.liftA(LoaderA.print("More than one folder discovered with `--folder` option")) - } else LoaderAction.unit + LoaderAction.liftF[F, Unit](Logging[F].print("More than one folder discovered with `--folder` option")) + } else LoaderAction.unit[F] } yield discoveries } @@ -146,21 +144,21 @@ object DataDiscovery { } /** Properly set `specificFolder` flag */ - def setSpecificFolder(target: DiscoveryTarget, discovery: LoaderAction[Discovered]): LoaderAction[Discovered] = { - val F = Functor[LoaderAction].compose[List] - F.map(discovery) { d => - target match { - case InSpecificFolder(_) => d.copy(specificFolder = true) - case Global(_) => d.copy(specificFolder = false) + def setSpecificFolder[F[_]: Functor](target: DiscoveryTarget, discovery: LoaderAction[F, Discovered]): LoaderAction[F, Discovered] = + discovery.map { list => + list.map { d => + target match { + case InSpecificFolder(_) => d.copy(specificFolder = true) + case Global(_) => d.copy(specificFolder = false) + } } } - } /** * List whole directory excluding special files */ - def listGoodBucket(folder: S3.Folder): LoaderAction[List[S3.BlobObject]] = - EitherT(LoaderA.listS3(folder)).map(_.filterNot(k => isSpecial(k.key))) + def listGoodBucket[F[_]: Functor: AWS](folder: S3.Folder): LoaderAction[F, List[S3.BlobObject]] = + EitherT(AWS[F].listS3(folder)).map(_.filterNot(k => isSpecial(k.key))) // Full discovery @@ -170,12 +168,12 @@ object DataDiscovery { * @param validatedDataKeys IO-action producing validated list of `FinalDataKey` * @return IO-action producing list of */ - def groupKeysFull(validatedDataKeys: ValidatedDataKeys): LoaderAction[Discovered] = { + def groupKeysFull[F[_]: Applicative](validatedDataKeys: ValidatedDataKeys[F]): LoaderAction[F, Discovered] = { def group(dataKeys: List[DataKeyFinal]): ValidatedNel[DiscoveryFailure, Discovered] = dataKeys.groupBy(_.base).toList.reverse.traverse(validateFolderFull) // Transform into Either with non-empty list of errors - val result: Action[Either[LoaderError, Discovered]] = + val result: F[Either[LoaderError, Discovered]] = validatedDataKeys.map { keys => keys.andThen(group) match { case Validated.Valid(discovery) => @@ -214,11 +212,14 @@ object DataDiscovery { /** * Transform list of S3 keys into list of `DataKeyFinal` for `DataDiscovery` */ - private def transformKeys(shredJob: Semver, region: String, assets: Option[S3.Folder])(keys: List[S3.BlobObject]): ValidatedDataKeys = { + private def transformKeys[F[_]: Monad: Cache: AWS](shredJob: Semver, + region: String, + assets: Option[S3.Folder]) + (keys: List[S3.BlobObject]): ValidatedDataKeys[F] = { // Intermediate keys are keys that passed one check and not yet passed another val intermediateDataKeys = keys.map(parseDataKey(shredJob, _)) // Final keys passed all checks, e.g. 
JSONPaths for shredded data were fetched - val finalDataKeys = intermediateDataKeys.traverse(transformDataKey(_, region, assets)) + val finalDataKeys = intermediateDataKeys.traverse(transformDataKey[F](_, region, assets)) sequenceInF(finalDataKeys, identity[ValidatedNel[DiscoveryFailure, List[DataKeyFinal]]]) } @@ -233,28 +234,25 @@ object DataDiscovery { * @param assets optional JSONPath assets S3 bucket * @return `Action` containing `Validation` - as on next step we can aggregate errors */ - private def transformDataKey( + private def transformDataKey[F[_]: Monad: Cache: AWS]( dataKey: DiscoveryStep[DataKeyIntermediate], region: String, assets: Option[S3.Folder] - ): Action[ValidatedNel[DiscoveryFailure, DataKeyFinal]] = { + ): F[ValidatedNel[DiscoveryFailure, DataKeyFinal]] = { dataKey match { case Right(ShreddedDataKeyIntermediate(fullPath, info)) => - val jsonpathAction = EitherT(ShreddedType.discoverJsonPath(region, assets, info)) + val jsonpathAction: EitherT[F, DiscoveryFailure, S3.Key] = + EitherT(ShreddedType.discoverJsonPath[F](region, assets, info)) val discoveryAction = jsonpathAction.map { jsonpath => ShreddedDataKeyFinal(fullPath, ShreddedType.Json(info, jsonpath)) } discoveryAction.value.map(_.toValidatedNel) case Right(key @ ShreddedDataKeyTabular(_, _)) => - Free.pure(key.toFinal.validNel[DiscoveryFailure]) + Monad[F].pure(key.toFinal.validNel[DiscoveryFailure]) case Right(AtomicDataKey(fullPath, size)) => - val pure: Action[ValidatedNel[DiscoveryFailure, DataKeyFinal]] = - Free.pure(AtomicDataKey(fullPath, size).validNel[DiscoveryFailure]) - pure + Monad[F].pure(AtomicDataKey(fullPath, size).validNel[DiscoveryFailure]) case Left(failure) => - val pure: Action[ValidatedNel[DiscoveryFailure, DataKeyFinal]] = - Free.pure(failure.invalidNel) - pure + Monad[F].pure(failure.invalidNel) } } @@ -300,31 +298,31 @@ object DataDiscovery { * @param originalAction data-discovery action * @return result of same request, but with more guarantees to be consistent */ - def checkConsistency(originalAction: LoaderAction[Discovered]): LoaderAction[Discovered] = { - def check(checkAttempt: Int, last: Option[Either[LoaderError, Discovered]]): ActionE[Discovered] = { - val action = last.map(Free.pure[LoaderA, Either[LoaderError, Discovered]]).getOrElse(originalAction.value) + def checkConsistency[F[_]: Monad: Timer: Logging](originalAction: LoaderAction[F, Discovered]): LoaderAction[F, Discovered] = { + def check(checkAttempt: Int, last: Option[Either[LoaderError, Discovered]]): F[Either[LoaderError, Discovered]] = { + val action = last.map(Monad[F].pure).getOrElse(originalAction.value) for { original <- action - _ <- sleepConsistency(original) + _ <- sleepConsistency[F](original) control <- originalAction.value result <- retry(original, control, checkAttempt + 1) } yield result } - def retry(original: Either[LoaderError, Discovered], control: Either[LoaderError, Discovered], attempt: Int): ActionE[Discovered] = { + def retry(original: Either[LoaderError, Discovered], control: Either[LoaderError, Discovered], attempt: Int): F[Either[LoaderError, Discovered]] = { (original, control) match { case _ if attempt >= ConsistencyChecks => for { - _ <- LoaderA.print(s"Consistency check did not pass after $ConsistencyChecks attempts") - discovered <- Free.pure(control.orElse(original)) + _ <- Logging[F].print(s"Consistency check did not pass after $ConsistencyChecks attempts") + discovered <- Monad[F].pure(control.orElse(original)) } yield discovered case (Right(o), Right(c)) if 
o.sortBy(_.base.toString) == c.sortBy(_.base.toString) => val found = o.map(x => s"+ ${x.show}").mkString("\n") val message = if (found.isEmpty) "No run ids discovered" else s"Following run ids found:\n$found" for { - _ <- LoaderA.print(s"Consistency check passed after ${attempt - 1} attempt. " ++ message) - discovered <- Free.pure(original) + _ <- Logging[F].print(s"Consistency check passed after ${attempt - 1} attempt. " ++ message) + discovered <- Monad[F].pure(original) } yield discovered case (Right(o), Right(c)) => val message = if (attempt == ConsistencyChecks - 1) @@ -332,18 +330,18 @@ object DataDiscovery { else "" for { - _ <- LoaderA.print(s"Consistency check failed. $message") + _ <- Logging[F].print(s"Consistency check failed. $message") next <- check(attempt, Some(control)) } yield next case _ => for { - _ <- LoaderA.print(s"Consistency check failed. Making another attempt") + _ <- Logging[F].print(s"Consistency check failed. Making another attempt") next <- check(attempt, None) } yield next } } - EitherT[Action, LoaderError, Discovered](check(1, None)) + EitherT[F, LoaderError, Discovered](check(1, None)) } def discoveryDiff(original: Discovered, control: Discovered): List[String] = { @@ -365,14 +363,14 @@ object DataDiscovery { /** * Aggregates wait time for all discovered folders or wait 10 sec in case action failed */ - private def sleepConsistency(result: Either[LoaderError, Discovered]): Action[Unit] = { + private def sleepConsistency[F[_]: Timer](result: Either[LoaderError, Discovered]): F[Unit] = { val timeoutMs = result match { case Right(list) => list.map(_.consistencyTimeout).foldLeft(10000L)(_ + _) case Left(_) => 10000L } - LoaderA.sleep(timeoutMs) + Timer[F].sleep(timeoutMs.millis) } @@ -430,5 +428,5 @@ object DataDiscovery { def base: S3.Folder = info.info.base } - private type ValidatedDataKeys = Action[ValidatedNel[DiscoveryFailure, List[DataKeyFinal]]] + private type ValidatedDataKeys[F[_]] = F[ValidatedNel[DiscoveryFailure, List[DataKeyFinal]]] } diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DiscoveryFailure.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DiscoveryFailure.scala index 0ce4169f5..fc3febbd9 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DiscoveryFailure.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DiscoveryFailure.scala @@ -13,9 +13,8 @@ package com.snowplowanalytics.snowplow.rdbloader.discovery import cats.data.NonEmptyList -import cats.syntax.show._ - -import com.snowplowanalytics.snowplow.rdbloader.{S3, LoaderError} +import com.snowplowanalytics.snowplow.rdbloader.LoaderError +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 /** * Discovery failure. Represents failure of single step. 
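The Migration and DataDiscovery hunks above replace the `LoaderA` Free constructors with capability constraints such as `Monad: Logging: Iglu: JDBC`, threading results through `LoaderAction[F, _]`. The sketch below is a hypothetical helper, not part of this patch; it assumes (as the usage above implies) that `LoaderAction[F, A]` is an `EitherT[F, LoaderError, A]` and that `JDBC[F].executeUpdate` returns `LoaderAction[F, Long]`, and shows how such constrained functions compose and short-circuit on the first `LoaderError`:

    import cats.Monad
    import cats.implicits._

    import com.snowplowanalytics.snowplow.rdbloader.LoaderAction
    import com.snowplowanalytics.snowplow.rdbloader.db.Migration
    import com.snowplowanalytics.snowplow.rdbloader.dsl.{JDBC, Logging}
    import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString

    object MigrationSketch {
      // Hypothetical helper: add a COMMENT to a table only if the table exists.
      // Works for any F[_] with Monad, Logging and JDBC instances in scope.
      def ensureComment[F[_]: Monad: Logging: JDBC](dbSchema: String,
                                                    table: String,
                                                    comment: SqlString): LoaderAction[F, Unit] =
        for {
          exists <- Migration.tableExists[F](dbSchema, table)
          _      <- if (exists) JDBC[F].executeUpdate(comment).void
                    else LoaderAction.unit[F]
          _      <- LoaderAction.liftF(Logging[F].print(s"Comment ensured on $dbSchema.$table"))
        } yield ()
    }
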
diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala index 7d59dea80..aee5021f9 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala @@ -13,14 +13,14 @@ package com.snowplowanalytics.snowplow.rdbloader package discovery +import cats.Monad import cats.data._ -import cats.free.Free import cats.implicits._ - -import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SchemaCriterion} - +import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaVer} import com.snowplowanalytics.snowplow.rdbloader.config.Semver +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, Cache} import com.snowplowanalytics.snowplow.rdbloader.utils.Common.toSnakeCase +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 sealed trait ShreddedType { /** raw metadata extracted from S3 Key */ @@ -132,33 +132,29 @@ object ShreddedType { * @param shreddedType some shredded type (self-describing event or context) * @return full valid s3 path (with `s3://` prefix) */ - def discoverJsonPath(region: String, jsonpathAssets: Option[S3.Folder], shreddedType: Info): DiscoveryAction[S3.Key] = { + def discoverJsonPath[F[_]: Monad: Cache: AWS](region: String, jsonpathAssets: Option[S3.Folder], shreddedType: Info): DiscoveryAction[F, S3.Key] = { val filename = s"""${toSnakeCase(shreddedType.name)}_${shreddedType.model}.json""" val key = s"${shreddedType.vendor}/$filename" - LoaderA.getCache(key).flatMap { (value: Option[Option[S3.Key]]) => - value match { - case Some(Some(jsonPath)) => - Free.pure(jsonPath.asRight) - case Some(None) => - Free.pure(DiscoveryFailure.JsonpathDiscoveryFailure(key).asLeft) - case None => - jsonpathAssets match { - case Some(assets) => - val path = S3.Folder.append(assets, shreddedType.vendor) - val s3Key = S3.Key.coerce(path + filename) - LoaderA.keyExists(s3Key).flatMap { - case true => - for { - _ <- LoaderA.putCache(key, Some(s3Key)) - } yield s3Key.asRight - case false => - getSnowplowJsonPath(region, key) - } - case None => - getSnowplowJsonPath(region, key) - } - } + Cache[F].getCache(key).flatMap { + case Some(Some(jsonPath)) => + Monad[F].pure(jsonPath.asRight) + case Some(None) => + Monad[F].pure(DiscoveryFailure.JsonpathDiscoveryFailure(key).asLeft) + case None => + jsonpathAssets match { + case Some(assets) => + val path = S3.Folder.append(assets, shreddedType.vendor) + val s3Key = S3.Key.coerce(path + filename) + AWS[F].keyExists(s3Key).flatMap { + case true => + Cache[F].putCache(key, Some(s3Key)).as(s3Key.asRight) + case false => + getSnowplowJsonPath[F](region, key) + } + case None => + getSnowplowJsonPath[F](region, key) + } } } @@ -178,30 +174,31 @@ object ShreddedType { * @param key vendor dir and filename, e.g. 
`com.acme/event_1` * @return full S3 key if file exists, discovery error otherwise */ - def getSnowplowJsonPath(s3Region: String, key: String): DiscoveryAction[S3.Key] = { + def getSnowplowJsonPath[F[_]: Monad: AWS: Cache](s3Region: String, + key: String): DiscoveryAction[F, S3.Key] = { val hostedAssetsBucket = getHostedAssetsBucket(s3Region) val fullDir = S3.Folder.append(hostedAssetsBucket, JsonpathsPath) val s3Key = S3.Key.coerce(fullDir + key) - LoaderA.keyExists(s3Key).flatMap { + AWS[F].keyExists(s3Key).flatMap { case true => - LoaderA.putCache(key, Some(s3Key)).as(s3Key.asRight) + Cache[F].putCache(key, Some(s3Key)).as(s3Key.asRight) case false => - LoaderA.putCache(key, None).as(DiscoveryFailure.JsonpathDiscoveryFailure(key).asLeft) + Cache[F].putCache(key, None).as(DiscoveryFailure.JsonpathDiscoveryFailure(key).asLeft) } } /** Discover multiple JSONPaths for shredded types at once and turn into `LoaderAction` */ - def discoverBatch(region: String, - jsonpathAssets: Option[S3.Folder], - raw: List[ShreddedType.Info]): LoaderAction[List[ShreddedType]] = { + def discoverBatch[F[_]: Monad: Cache: AWS](region: String, + jsonpathAssets: Option[S3.Folder], + raw: List[ShreddedType.Info]): LoaderAction[F, List[ShreddedType]] = { // Discover data for single item - def discover(info: ShreddedType.Info): Action[ValidatedNel[DiscoveryFailure, ShreddedType]] = { - val jsonpaths = ShreddedType.discoverJsonPath(region, jsonpathAssets, info) + def discover(info: ShreddedType.Info): F[ValidatedNel[DiscoveryFailure, ShreddedType]] = { + val jsonpaths: F[DiscoveryStep[S3.Key]] = ShreddedType.discoverJsonPath[F](region, jsonpathAssets, info) val shreddedType = jsonpaths.map(_.map(s3key => ShreddedType.Json(info, s3key))) shreddedType.map(_.toValidatedNel) } - val action: Action[Either[LoaderError, List[ShreddedType]]] = + val action: F[Either[LoaderError, List[ShreddedType]]] = sequenceInF(raw.traverse(discover), LoaderError.flattenValidated[List[ShreddedType]]) LoaderAction(action) diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/AWS.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/AWS.scala new file mode 100644 index 000000000..637976f80 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/AWS.scala @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import java.io.ByteArrayInputStream +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path, Paths} + +import scala.collection.convert.wrapAsScala._ + +import cats.data.Validated +import cats.implicits._ +import cats.effect.Sync + +import com.amazonaws.AmazonServiceException +import com.amazonaws.services.s3.model._ +import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} +import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagementClientBuilder +import com.amazonaws.services.simplesystemsmanagement.model.{AWSSimpleSystemsManagementException, GetParameterRequest} + +// This project +import com.snowplowanalytics.snowplow.rdbloader.{LoaderError, LoaderAction} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 +import com.snowplowanalytics.snowplow.rdbloader.config.SnowplowConfig.SnowplowAws +import com.snowplowanalytics.snowplow.rdbloader.discovery.DiscoveryFailure.{S3Failure, DownloadFailure} + + +trait AWS[F[_]] { + + /** Recursively list S3 folder */ + def listS3(bucket: S3.Folder): F[Either[LoaderError, List[S3.BlobObject]]] + + /** Check if S3 key exist */ + def keyExists(key: S3.Key): F[Boolean] + + /** Download S3 key into local path */ + def downloadData(source: S3.Folder, dest: Path): LoaderAction[F, List[Path]] + + /** Upload text file */ + def putObject(key: S3.Key, data: String): LoaderAction[F, Unit] + + /** Retrieve decrypted property from EC2 Parameter Store */ + def getEc2Property(name: String): F[Array[Byte]] +} + +object AWS { + def apply[F[_]](implicit ev: AWS[F]): AWS[F] = ev + + /** + * Create S3 client, backed by AWS Java SDK + * + * @param awsConfig Snowplow AWS Configuration + * @return Snowplow-specific S3 client + */ + def getClient[F[_]: Sync](awsConfig: SnowplowAws): F[AmazonS3] = + Sync[F].delay(AmazonS3ClientBuilder.standard().withRegion(awsConfig.s3.region).build()) + + def s3Interpreter[F[_]: Sync](client: AmazonS3): AWS[F] = new AWS[F] { + + def putObject(key: S3.Key, data: String): LoaderAction[F, Unit] = { + val meta = new ObjectMetadata() + meta.setContentLength(data.length) + meta.setContentEncoding("text/plain") + val (bucket, prefix) = S3.splitS3Key(key) + val is = new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)) + val action = Sync[F] + .delay(client.putObject(bucket, prefix, is, meta)) + .attempt + .map { + case Right(_) => ().asRight + case Left(error) => + val message = s"Cannot put S3 object $key, " ++ error.getMessage + (LoaderError.LoaderLocalError(message): LoaderError).asLeft + } + LoaderAction(action) + } + + private def list(str: S3.Folder): LoaderAction[F, List[S3ObjectSummary]] = { + val (bucket, prefix) = S3.splitS3Path(str) + + val req = new ListObjectsV2Request() + .withBucketName(bucket) + .withPrefix(prefix) + + def keyUnfold(result: ListObjectsV2Result): Stream[S3ObjectSummary] = { + if (result.isTruncated) { + val loaded = result.getObjectSummaries() + req.setContinuationToken(result.getNextContinuationToken) + loaded.toStream #::: keyUnfold(client.listObjectsV2(req)) + } else { + result.getObjectSummaries().toStream + } + } + + Sync[F].delay(keyUnfold(client.listObjectsV2(req)).filterNot(_.getSize == 0).toList) + .attemptT + .leftMap(e => LoaderError.DiscoveryError(List(S3Failure(e.toString))): LoaderError) + } + + def listS3(bucket: S3.Folder): F[Either[LoaderError, List[S3.BlobObject]]] = + list(bucket).map(summaries => summaries.map(S3.getKey)).value + + /** + * Check if some `file` exists in S3 
`path` + * + * @param key valid S3 key (without trailing slash) + * @return true if file exists, false if file doesn't exist or not available + */ + def keyExists(key: S3.Key): F[Boolean] = { + val (bucket, s3Key) = S3.splitS3Key(key) + val request = new GetObjectMetadataRequest(bucket, s3Key) + Sync[F].delay(client.getObjectMetadata(request)).as(true).recover { + case _: AmazonServiceException => false + } + } + + /** + * Download contents of S3 folder into `destination` + * + * @param source AWS S3 folder + * @param dest optional local path, tmp dir will be used if not specified + * @return list of downloaded filenames + */ + def downloadData(source: S3.Folder, dest: Path): LoaderAction[F, List[Path]] = + list(source).flatMap { summaries => + val downloads = summaries.traverse { summary => + val bucket = summary.getBucketName + val key = summary.getKey + + val download = for { + s3Object <- Sync[F].delay(client.getObject(new GetObjectRequest(bucket, key))) + destinationFile <- Sync[F].delay(Paths.get(dest.toString, key)) + result <- Sync[F].ifM(Sync[F].delay(Files.exists(destinationFile)))( + Sync[F].pure(DownloadFailure(S3.Key.coerce(s"s3://$bucket/$key"), "File already exist").asLeft[Path]), + for { + _ <- Sync[F].delay(Files.createDirectories(destinationFile.getParent)) + _ <- Sync[F].delay(Files.copy(s3Object.getObjectContent, destinationFile)) + } yield destinationFile.asRight[DownloadFailure] + ) + } yield result + + download + .attempt + .map { + case Left(e) => DownloadFailure(S3.Key.coerce(s"s3://$bucket/$key"), e.toString).asLeft + case Right(e) => e + } + + } + + val result = downloads.map { d => d.map(_.toValidatedNel).sequence match { + case Validated.Valid(paths) => paths.asRight + case Validated.Invalid(failures) => (LoaderError.DiscoveryError(failures.toList): LoaderError).asLeft + } } + + LoaderAction[F, List[Path]](result) + } + + /** + * Get value from AWS EC2 Parameter Store + * @param name systems manager parameter's name with SSH key + * @return decrypted string with key + */ + def getEc2Property(name: String): F[Array[Byte]] = { + val result = for { + client <- Sync[F].delay(AWSSimpleSystemsManagementClientBuilder.defaultClient()) + req: GetParameterRequest = new GetParameterRequest().withName(name).withWithDecryption(true) + par <- Sync[F].delay(client.getParameter(req)) + } yield par.getParameter.getValue.getBytes + + result.recoverWith { + case e: AWSSimpleSystemsManagementException => + Sync[F].raiseError(LoaderError.LoaderLocalError(s"Cannot get $name EC2 property: ${e.getMessage}")) + } + } + } +} + diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Cache.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Cache.scala new file mode 100644 index 000000000..29f2bf980 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Cache.scala @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import cats.implicits._ +import cats.effect.Sync +import cats.effect.concurrent.Ref +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 + +trait Cache[F[_]] { + /** Put value into cache (stored in interpreter) */ + def putCache(key: String, value: Option[S3.Key]): F[Unit] + + /** Get value from cache (stored in interpreter) */ + def getCache(key: String): F[Option[Option[S3.Key]]] +} + +object Cache { + def apply[F[_]](implicit ev: Cache[F]): Cache[F] = ev + + def cacheInterpreter[F[_]: Sync](cache: Ref[F, Map[String, Option[S3.Key]]]): Cache[F] = + new Cache[F] { + def getCache(key: String): F[Option[Option[S3.Key]]] = + cache.get.map(_.get(key)) + + def putCache(key: String, value: Option[S3.Key]): F[Unit] = + cache.update { c => c ++ Map(key -> value) } + } +} + diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/FS.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/FS.scala new file mode 100644 index 000000000..a10ec5cf8 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/FS.scala @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
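The `Ref`-backed `Cache` interpreter above makes the JSONPaths cache safe to share between concurrent discovery steps. A read-through usage sketch, assumed rather than taken from the patch; `discoverJsonPaths` stands in for whatever performs the real lookup:

import cats.Monad
import cats.implicits._

import com.snowplowanalytics.snowplow.rdbloader.dsl.Cache
import com.snowplowanalytics.snowplow.rdbloader.utils.S3

object CacheUsageSketch {
  // Hypothetical read-through helper: consult the cache first, otherwise run
  // the expensive lookup and remember its result, even a miss (None)
  def cached[F[_]: Monad: Cache](key: String)(discoverJsonPaths: F[Option[S3.Key]]): F[Option[S3.Key]] =
    Cache[F].getCache(key).flatMap {
      case Some(previous) => Monad[F].pure(previous)
      case None           => discoverJsonPaths.flatTap(found => Cache[F].putCache(key, found))
    }
}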
+ */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import java.nio.file.attribute.BasicFileAttributes +import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor} + +import cats.syntax.functor._ +import cats.syntax.applicativeError._ +import cats.effect.Sync + +import com.snowplowanalytics.snowplow.rdbloader.{ LoaderAction, LoaderError } + +trait FS[F[_]] { + /** Create tmp directory */ + def createTmpDir: LoaderAction[F, Path] + + /** Delete directory */ + def deleteDir(path: Path): LoaderAction[F, Unit] +} + +object FS { + + def apply[F[_]](implicit ev: FS[F]): FS[F] = ev + + def fileSystemInterpreter[F[_]: Sync]: FS[F] = new FS[F] { + def createTmpDir: LoaderAction[F, Path] = + Sync[F] + .delay(Files.createTempDirectory("rdb-loader")) + .attemptT + .leftMap(e => LoaderError.LoaderLocalError("Cannot create temporary directory.\n" + e.toString): LoaderError) + + def deleteDir(path: Path): LoaderAction[F, Unit] = + Sync[F] + .delay(Files.walkFileTree(path, DeleteVisitor)) + .attemptT + .leftMap(e => LoaderError.LoaderLocalError(s"Cannot delete directory [${path.toString}].\n" + e.toString): LoaderError) + .void + } + + private object DeleteVisitor extends SimpleFileVisitor[Path] { + override def visitFile(file: Path, attrs: BasicFileAttributes) = { + Files.delete(file) + FileVisitResult.CONTINUE + } + } + +} + diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Iglu.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Iglu.scala new file mode 100644 index 000000000..d2388195d --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Iglu.scala @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
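Because both `FS` operations above return `LoaderAction`, temporary-directory handling composes directly with the rest of the loading pipeline. A sketch of a helper one could build on top of them; it is hypothetical, and a production version would likely use `Resource`/`bracket` so the directory is also removed when `use` fails:

import java.nio.file.Path

import cats.Monad

import com.snowplowanalytics.snowplow.rdbloader.LoaderAction
import com.snowplowanalytics.snowplow.rdbloader.dsl.FS

object FsUsageSketch {
  // Hypothetical helper: create a scratch directory, run `use`, then delete it
  // (cleanup is skipped here if `use` fails, see the note above)
  def withTmpDir[F[_]: Monad: FS, A](use: Path => LoaderAction[F, A]): LoaderAction[F, A] =
    for {
      dir    <- FS[F].createTmpDir
      result <- use(dir)
      _      <- FS[F].deleteDir(dir)
    } yield result
}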
+ */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import cats.effect.{ Sync, Clock } + +import io.circe.Json + +import com.snowplowanalytics.iglu.client.Client +import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList + +import com.snowplowanalytics.snowplow.rdbloader.LoaderError +import com.snowplowanalytics.snowplow.rdbloader.common._ +import com.snowplowanalytics.snowplow.rdbloader.discovery.DiscoveryFailure + +trait Iglu[F[_]] { + /** Retrieve list of schemas from Iglu Server */ + def getSchemas(vendor: String, name: String, model: Int): F[Either[LoaderError, SchemaList]] +} + +object Iglu { + def apply[F[_]](implicit ev: Iglu[F]): Iglu[F] = ev + + def igluInterpreter[F[_]: Sync: Clock](client: Client[F, Json]): Iglu[F] = new Iglu[F] { + def getSchemas(vendor: String, name: String, model: Int): F[Either[LoaderError, SchemaList]] = + Flattening.getOrdered(client.resolver, vendor, name, model).leftMap { resolutionError => + val message = s"Cannot get schemas for iglu:$vendor/$name/jsonschema/$model-*-*\n$resolutionError" + LoaderError.DiscoveryError(DiscoveryFailure.IgluError(message)) + }.value + } +} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/JDBC.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/JDBC.scala new file mode 100644 index 000000000..5f1d2a8b5 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/JDBC.scala @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
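The `Iglu.getSchemas` call above returns a plain `Either` inside `F`; call sites that live in `LoaderAction` can lift it with the `LoaderAction` constructor, as the other modules in this patch do. A minimal sketch with a hypothetical helper name:

import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList

import com.snowplowanalytics.snowplow.rdbloader.LoaderAction
import com.snowplowanalytics.snowplow.rdbloader.dsl.Iglu

object IgluUsageSketch {
  // Hypothetical wrapper: lift schema resolution into LoaderAction so it
  // composes with discovery and migration steps
  def orderedSchemas[F[_]: Iglu](vendor: String, name: String, model: Int): LoaderAction[F, SchemaList] =
    LoaderAction(Iglu[F].getSchemas(vendor, name, model))
}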
+ */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import java.io.FileReader +import java.nio.file.Path +import java.sql.{Connection, SQLException} +import java.util.Properties + +import scala.util.control.NonFatal +import scala.concurrent.duration._ + +import cats.Monad +import cats.data.EitherT +import cats.implicits._ +import cats.effect.{ Sync, Timer, Resource } + +import org.postgresql.copy.CopyManager +import org.postgresql.jdbc.PgConnection +import org.postgresql.{Driver => PgDriver} + +import com.amazon.redshift.jdbc42.{Driver => RedshiftDriver} + +import com.snowplowanalytics.snowplow.rdbloader.{LoaderAction, LoaderError} +import com.snowplowanalytics.snowplow.rdbloader.LoaderError.StorageTargetError +import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget +import com.snowplowanalytics.snowplow.rdbloader.db.Decoder +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString + +trait JDBC[F[_]] { + + /** Execute single SQL statement (against target in interpreter) */ + def executeUpdate(sql: SqlString): LoaderAction[F, Long] + + /** Execute multiple (against target in interpreter) */ + def executeUpdates(queries: List[SqlString])(implicit A: Monad[F]): LoaderAction[F, Unit] = + EitherT(queries.traverse_(executeUpdate).value) + + /** Execute query and parse results into `A` */ + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[F, A] + + /** Perform PostgreSQL COPY table FROM STDIN (against target in interpreter) */ + def copyViaStdin(files: List[Path], query: SqlString): LoaderAction[F, Long] + + /** Execute SQL transaction (against target in interpreter) */ + def executeTransaction(queries: List[SqlString])(implicit A: Monad[F]): LoaderAction[F, Unit] = { + val begin = SqlString.unsafeCoerce("BEGIN") + val commit = SqlString.unsafeCoerce("COMMIT") + val transaction = (begin :: queries) :+ commit + executeUpdates(transaction) + } +} + +object JDBC { + + def apply[F[_]](implicit ev: JDBC[F]): JDBC[F] = ev + + /** + * Build a necessary (dry-run or real-world) DB interpreter as a `Resource`, + * which guarantees to close a JDBC connection + */ + def interpreter[F[_]: Sync: Timer: AWS](target: StorageTarget, dryRun: Boolean): Resource[F, JDBC[F]] = + Resource + .make(getConnection[F](target))(conn => Sync[F].delay(conn.close())) + .map { conn => + if (dryRun) JDBC.jdbcDryRunInterpreter[F](conn) else JDBC.jdbcRealInterpreter[F](conn) + } + + /** + * Acquire JDBC connection. 
In case of failure, sleep for 1 minute and retry once + * @param target storage target configuration, either Redshift or Postgres + * @tparam F effect type with `AWS` DSL to get encrypted password + * @return JDBC connection type + */ + def getConnection[F[_]: Sync: Timer: AWS](target: StorageTarget): F[Connection] = { + val password: F[String] = target.password match { + case StorageTarget.PlainText(text) => + Sync[F].pure(text) + case StorageTarget.EncryptedKey(StorageTarget.EncryptedConfig(key)) => + AWS[F].getEc2Property(key.parameterName).map(b => new String(b)) + } + + def connect(props: Properties): F[Connection] = + Sync[F].delay(new RedshiftDriver().connect(s"jdbc:redshift://${target.host}:${target.port}/${target.database}", props)) + + for { + p <- password + props = new Properties() + _ = props.setProperty("user", target.username) + _ = props.setProperty("password", p) + jdbcConnection <- target match { + case r: StorageTarget.RedshiftConfig => + r.jdbc.validation match { + case Left(error) => + Sync[F].raiseError[Connection](new IllegalArgumentException(error.message)) // Should never happen + case Right(propertyUpdaters) => + for { + _ <- Sync[F].delay(propertyUpdaters.foreach(f => f(props))) + firstAttempt <- connect(props).attempt + connection <- firstAttempt match { + case Right(c) => + Sync[F].delay(c) + case Left(e) => + Sync[F].delay(println(s"${e.getMessage} Sleeping and making another attempt")) *> + Timer[F].sleep(60.seconds) *> + connect(props) + } + } yield connection + } + case p: StorageTarget.PostgresqlConfig => + val url = s"jdbc:postgresql://${p.host}:${p.port}/${p.database}" + for { + _ <- Sync[F].delay(props.setProperty("sslmode", p.sslMode.asProperty)) + pgConnection <- Sync[F].delay(new PgDriver().connect(url, props)) + } yield pgConnection + } + } yield jdbcConnection + } + + def copyIn[F[_]: Sync](copyManager: CopyManager, copyStatement: String)(file: Path): LoaderAction[F, Long] = + Sync[F] + .delay(copyManager.copyIn(copyStatement, new FileReader(file.toFile))) + .attemptA(err => StorageTargetError(err.getMessage)) + + def setAutocommit[F[_]: Sync](conn: Connection, autoCommit: Boolean): LoaderAction[F, Unit] = + Sync[F] + .delay[Unit](conn.setAutoCommit(autoCommit)) + .onError { + case e => Sync[F].delay(println("setAutocommit error")) *> + Sync[F].delay(e.printStackTrace(System.out)) + } + .attemptA(err => StorageTargetError(err.toString)) + + /** Real-world (as opposed to dry-run) interpreter */ + def jdbcRealInterpreter[F[_]: Sync](conn: Connection): JDBC[F] = new JDBC[F] { + /** + * Execute a single update statement in the provided connection + * + * @param sql string with valid SQL statement + * @return number of updated rows in case of success, failure otherwise + */ + def executeUpdate(sql: SqlString): LoaderAction[F, Long] = { + val update = Sync[F] + .delay[Long](conn.createStatement().executeUpdate(sql).toLong) + .attempt + .flatMap[Either[LoaderError, Long]] { + case Left(e: SQLException) if Option(e.getMessage).getOrElse("").contains("is not authorized to assume IAM Role") => + (StorageTargetError("IAM Role with S3 Read permissions is not attached to Redshift instance"): LoaderError).asLeft[Long].pure[F] + case Left(e) => + val log = Sync[F].delay(println("RDB Loader unknown error in executeUpdate")) *> + Sync[F].delay(e.printStackTrace(System.out)) + log.as(StorageTargetError(Option(e.getMessage).getOrElse(e.toString)).asLeft[Long]) + case Right(result) => + result.asRight[LoaderError].pure[F] + } + + LoaderAction[F, Long](update) + } + +
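Because `interpreter` above wraps the connection in a `Resource`, whatever `getConnection` opens is closed when the `use` block finishes, whether the load succeeded or not. A sketch of a caller under that assumption; the `ping` helper is hypothetical and the empty transaction is only a connectivity check:

import cats.effect.{IO, Timer}

import com.snowplowanalytics.snowplow.rdbloader.LoaderError
import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget
import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, JDBC}

object JdbcUsageSketch {
  // Hypothetical connectivity check: acquire the connection through the
  // Resource and run an empty transaction (just BEGIN and COMMIT)
  def ping(target: StorageTarget, dryRun: Boolean)
          (implicit aws: AWS[IO], timer: Timer[IO]): IO[Either[LoaderError, Unit]] =
    JDBC.interpreter[IO](target, dryRun).use { jdbc =>
      jdbc.executeTransaction(List.empty).value
    }
}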
def executeQuery[A](sql: SqlString)(implicit ev: Decoder[A]): LoaderAction[F, A] = { + val query = Sync[F] + .delay(conn.createStatement().executeQuery(sql)) + .map { resultSet => + ev.decode(resultSet) match { + case Left(e) => StorageTargetError(s"Cannot decode SQL row: ${e.message}").asLeft + case Right(a) => a.asRight[LoaderError] + } + } + .attempt + .flatMap[Either[LoaderError, A]] { + case Left(e) => + val log = Sync[F].delay(println("RDB Loader unknown error in executeQuery")) *> + Sync[F].delay(e.printStackTrace(System.out)) + log.as(StorageTargetError(Option(e.getMessage).getOrElse(e.toString)).asLeft[A]) + case Right(either) => + either.pure[F] + } + + LoaderAction(query) + } + + def copyViaStdin(files: List[Path], copyStatement: SqlString): LoaderAction[F, Long] = { + val copyManager: LoaderAction[F, CopyManager] = Sync[F] + .delay(new CopyManager(conn.asInstanceOf[PgConnection])) + .attemptA(err => StorageTargetError(err.getMessage)) + + for { + manager <- copyManager + _ <- setAutocommit[F](conn, false) + result = files.traverse(copyIn[F](manager, copyStatement)(_)).map(_.combineAll).value.flatMap[Either[LoaderError, Long]] { + case e @ Right(_) => Sync[F].delay(conn.commit()).as(e) + case e @ Left(_) => Sync[F].delay(conn.rollback()).as(e) + } + endResult <- LoaderAction(result) + _ <- setAutocommit[F](conn, true) + } yield endResult + } + + } + + /** Dry run interpreter, not performing any *destructive* statements */ + def jdbcDryRunInterpreter[F[_]: Sync](conn: Connection): JDBC[F] = new JDBC[F] { + def executeUpdate(sql: SqlString): LoaderAction[F, Long] = + LoaderAction.liftF(Sync[F].delay(println(sql)).as(1L)) + + def executeQuery[A](sql: SqlString)(implicit ev: Decoder[A]): LoaderAction[F, A] = { + val result = try { + val resultSet = conn.createStatement().executeQuery(sql) + ev.decode(resultSet) match { + case Left(e) => StorageTargetError(s"Cannot decode SQL row: ${e.message}").asLeft + case Right(a) => a.asRight[StorageTargetError] + } + } catch { + case NonFatal(e) => + println("RDB Loader unknown error in executeQuery") + e.printStackTrace(System.out) + StorageTargetError(Option(e.getMessage).getOrElse(e.toString)).asLeft[A] + } + + LoaderAction.liftE(result) + } + + def copyViaStdin(files: List[Path], copyStatement: SqlString): LoaderAction[F, Long] = + LoaderAction.liftF(Sync[F].delay(println(copyStatement)).as(1L)) + } + + implicit class SyncOps[F[_]: Sync, A](fa: F[A]) { + def attemptA(handle: Throwable => LoaderError): LoaderAction[F, A] = { + val action = fa.attempt.map { + case Right(a) => a.asRight[LoaderError] + case Left(err) => handle(err).asLeft[A] + } + LoaderAction(action) + } + } +} + diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Logging.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Logging.scala new file mode 100644 index 000000000..96696729b --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/Logging.scala @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
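The `SyncOps.attemptA` extension at the end of JDBC.scala above is the generic bridge from a raw `F[A]` into `LoaderAction`, mapping any thrown exception to a `LoaderError` in one place. An illustrative use outside the JDBC code; the file-size check and error message are hypothetical:

import java.nio.file.{Files, Path}

import cats.effect.Sync

import com.snowplowanalytics.snowplow.rdbloader.{LoaderAction, LoaderError}
import com.snowplowanalytics.snowplow.rdbloader.dsl.JDBC.SyncOps

object AttemptASketch {
  // Hypothetical: wrap a side-effecting call, turning exceptions into LoaderError
  def fileSize[F[_]: Sync](path: Path): LoaderAction[F, Long] =
    Sync[F]
      .delay(Files.size(path))
      .attemptA(err => LoaderError.LoaderLocalError(s"Cannot stat ${path.toString}: ${err.toString}"))
}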
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import java.time.{ Instant, ZoneId } +import java.time.format.DateTimeFormatter + +import org.joda.time.DateTime + +import cats.{Id, Monad} +import cats.data.NonEmptyList +import cats.syntax.option._ +import cats.syntax.apply._ +import cats.syntax.flatMap._ +import cats.syntax.either._ +import cats.syntax.functor._ +import cats.syntax.show._ +import cats.instances.either._ + +import cats.effect.Sync +import cats.effect.concurrent.Ref + +import io.circe.Json + +import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData} + +import com.snowplowanalytics.snowplow.scalatracker.emitters.id.RequestProcessor._ +import com.snowplowanalytics.snowplow.scalatracker.{Emitter, Tracker} +import com.snowplowanalytics.snowplow.scalatracker.emitters.id.{SyncBatchEmitter, SyncEmitter} + +import com.snowplowanalytics.snowplow.rdbloader.LoaderError +import com.snowplowanalytics.snowplow.rdbloader.common._ +import com.snowplowanalytics.snowplow.rdbloader.config.SnowplowConfig.{GetMethod, PostMethod} +import com.snowplowanalytics.snowplow.rdbloader.config.SnowplowConfig.Monitoring +import com.snowplowanalytics.snowplow.rdbloader.utils.{Common, S3} + + +trait Logging[F[_]] { + + /** Get last COPY statement in case of failure */ + def getLastCopyStatements: F[String] + + /** Track result via Snowplow tracker */ + def track(result: Either[LoaderError, Unit]): F[Unit] + + /** Dump log to S3 */ + def dump(key: S3.Key)(implicit S: AWS[F]): F[Either[String, S3.Key]] + + /** Print message to stdout */ + def print(message: String): F[Unit] +} + +object Logging { + def apply[F[_]](implicit ev: Logging[F]): Logging[F] = ev + + val ApplicationContextSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "application_context", "jsonschema", SchemaVer.Full(1,0,0)) + val LoadSucceededSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "load_succeeded", "jsonschema", SchemaVer.Full(1,0,0)) + val LoadFailedSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "load_failed", "jsonschema", SchemaVer.Full(1,0,0)) + + def controlInterpreter[F[_]: Sync](targetConfig: StorageTarget, + messages: Ref[F, List[String]], + tracker: Option[Tracker[Id]]): Logging[F] = + new Logging[F] { + + def getLastCopyStatements: F[String] = + messages.get.map(_.find(_.startsWith("COPY ")).getOrElse("No COPY statements performed")) + + /** Track result via Snowplow tracker */ + def track(result: Either[LoaderError, Unit]): F[Unit] = { + result match { + case Right(_) => + trackEmpty(LoadSucceededSchema) + case Left(error) => + val secrets = List(targetConfig.password.getUnencrypted, targetConfig.username) + val sanitizedMessage = Common.sanitize(error.show, secrets) + trackEmpty(LoadFailedSchema) *> this.print(sanitizedMessage) + } + } + + /** Dump log to S3 */ + def dump(key: S3.Key)(implicit S: AWS[F]): F[Either[String, S3.Key]] = + log(s"Dumping $key") *> + Monad[F].ifM(S.keyExists(key))( + Monad[F].pure(Left(s"S3 log object [$key] already exists")), + for { + logs <- getMessages.map(_.mkString("\n") + "\n") + putResult <- S.putObject(key, logs).value + } yield putResult.as(key).leftMap(_.show) + 
) + + + /** Print message to stdout */ + def print(message: String): F[Unit] = + for { + time <- Sync[F].delay { + DateTimeFormatter.ofPattern("HH:mm:ss.SSS").withZone(ZoneId.systemDefault()).format(Instant.now()) + } + timestamped = s"$time: $message" + _ <- Sync[F].delay(System.out.println(timestamped)) *> log(timestamped) + } yield () + + private def log(message: String): F[Unit] = + messages.update(buf => message :: buf) + + private def trackEmpty(schema: SchemaKey): F[Unit] = + tracker match { + case Some(t) => + Sync[F].delay(t.trackSelfDescribingEvent(SelfDescribingData(schema, Json.obj()))) + case None => + Sync[F].unit + } + + private def getMessages: F[List[String]] = + messages.get.map(_.reverse) + } + + /** + * Initialize Snowplow tracker, if `monitoring` section is properly configured + * + * @param monitoring config.yml `monitoring` section + * @return some tracker if enabled, none otherwise + */ + def initializeTracking[F[_]: Sync](monitoring: Monitoring): F[Option[Tracker[Id]]] = { + monitoring.snowplow.flatMap(_.collector) match { + case Some(Collector((host, port))) => + Sync[F].delay { + val emitter: Emitter[Id] = monitoring.snowplow.flatMap(_.method) match { + case Some(GetMethod) => + SyncEmitter.createAndStart(host, port = Some(port), callback = Some(callback)) + case Some(PostMethod) => + SyncBatchEmitter.createAndStart(host, port = Some(port), bufferSize = 2) + case None => + SyncEmitter.createAndStart(host, port = Some(port), callback = Some(callback)) + } + + val tracker = new Tracker[Id](NonEmptyList.of(emitter), "snowplow-rdb-loader", monitoring.snowplow.flatMap(_.appId).getOrElse("rdb-loader")) + Some(tracker) + } + case Some(_) => Sync[F].pure(none[Tracker[Id]]) + case None => Sync[F].pure(none[Tracker[Id]]) + } + } + + /** Callback for failed */ + private def callback(params: CollectorParams, request: CollectorRequest, response: CollectorResponse): Unit = { + def toMsg(rsp: CollectorResponse, includeHeader: Boolean): String = rsp match { + case CollectorFailure(code) => + val header = if (includeHeader) { s"Snowplow Tracker [${DateTime.now()}]: " } else "" + header ++ s"Cannot deliver event to ${params.getUri}. Collector responded with $code" + case TrackerFailure(error) => + val header = if (includeHeader) { s"Snowplow Tracker [${DateTime.now()}]: " } else "" + header ++ s"Cannot deliver event to ${params.getUri}. Tracker failed due ${error.getMessage}" + case RetriesExceeded(r) => s"Tracker [${DateTime.now()}]: Gave up on trying to deliver event. 
Last error: ${toMsg(r, false)}" + case CollectorSuccess(_) => "" + } + + val message = toMsg(response, true) + + // The only place in interpreters where println used instead of logger as this is async function + if (message.isEmpty) () else println(message) + } + + + /** + * Config helper functions + */ + private object Collector { + def isInt(s: String): Boolean = try { s.toInt; true } catch { case _: NumberFormatException => false } + + def unapply(hostPort: String): Option[(String, Int)] = + hostPort.split(":").toList match { + case host :: port :: Nil if isInt(port) => Some((host, port.toInt)) + case host :: Nil => Some((host, 80)) + case _ => None + } + } +} + diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/RealWorld.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/RealWorld.scala new file mode 100644 index 000000000..5be078402 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/dsl/RealWorld.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.rdbloader.dsl + +import cats.implicits._ +import cats.effect.{Clock, Sync} +import cats.effect.concurrent.Ref +import com.snowplowanalytics.iglu.client.Client +import com.snowplowanalytics.snowplow.rdbloader.config.CliConfig +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 + +/** Container for most of the interpreters to be used in Main + * JDBC will be instantiated only when necessary, and as a `Resource` + */ +class RealWorld[F[_]](cache: Cache[F], logging: Logging[F], iglu: Iglu[F], aws: AWS[F], fs: FS[F]) { + implicit val cacheF: Cache[F] = cache + implicit val loggingF: Logging[F] = logging + implicit val igluF: Iglu[F] = iglu + implicit val awsF: AWS[F] = aws + implicit val fsF: FS[F] = fs +} + +object RealWorld { + def initialize[F[_] : Sync : Clock](config: CliConfig): F[RealWorld[F]] = + for { + cacheMap <- Ref.of[F, Map[String, Option[S3.Key]]](Map.empty) + messages <- Ref.of[F, List[String]](List.empty[String]) + tracker <- Logging.initializeTracking[F](config.configYaml.monitoring) + igluParsed <- Client.parseDefault[F](config.resolverConfig).value + igluClient <- igluParsed match { + case Right(client) => Sync[F].pure(client) + case Left(error) => Sync[F].raiseError(error) // Should never happen because we already validated it + } + amazonS3 <- AWS.getClient[F](config.configYaml.aws) + + cache = Cache.cacheInterpreter[F](cacheMap) + logging = Logging.controlInterpreter[F](config.target, messages, tracker) + iglu = Iglu.igluInterpreter[F](igluClient) + aws = AWS.s3Interpreter[F](amazonS3) + fs = FS.fileSystemInterpreter[F] + } yield new RealWorld[F](cache, logging, iglu, aws, fs) +} \ No newline at end of file diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/DryRunInterpreter.scala
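`RealWorld` exposes the interpreters as implicit members, so Main only needs to import them once to satisfy the `Logging`/`Cache`/`Iglu`/`AWS`/`FS` constraints used throughout the loader. A wiring sketch under that assumption; `process` is a hypothetical stand-in for the real load pipeline and a `Clock[IO]` is assumed to be supplied by the caller:

import cats.effect.{Clock, IO}

import com.snowplowanalytics.snowplow.rdbloader.config.CliConfig
import com.snowplowanalytics.snowplow.rdbloader.dsl.{Logging, RealWorld}

object WiringSketch {
  // Hypothetical stand-in for the real pipeline: it declares only the algebras
  // it needs and receives them implicitly
  def process[F[_]: Logging](config: CliConfig): F[Unit] =
    Logging[F].print(s"RDB Loader has started (dry run: ${config.dryRun})")

  def run(config: CliConfig)(implicit clock: Clock[IO]): IO[Unit] =
    RealWorld.initialize[IO](config).flatMap { dsls =>
      import dsls._
      process[IO](config)
    }
}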
b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/DryRunInterpreter.scala deleted file mode 100644 index b36c40416..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/DryRunInterpreter.scala +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader -package interpreters - -import java.nio.file._ - -import scala.collection.mutable.ListBuffer - -import cats._ -import cats.implicits._ - -import io.circe.Json - -import com.amazonaws.services.s3.AmazonS3 - -import com.snowplowanalytics.iglu.client.Client -import com.snowplowanalytics.snowplow.scalatracker.Tracker - -// This project -import common._ - -import LoaderA._ -import config.CliConfig -import loaders.Common.SqlString -import discovery.DiscoveryFailure -import implementations.{S3Interpreter, TrackerInterpreter} - - -/** - * Interpreter performs all actual side-effecting work, - * interpreting `Action` at the end-of-the-world. - * It contains and handles configuration, connections and mutable state, - * all real-world interactions, except argument parsing - */ -class DryRunInterpreter private[interpreters](cliConfig: CliConfig, - amazonS3: AmazonS3, - tracker: Option[Tracker[Id]], - igluClient: Client[Id, Json]) extends Interpreter { - - private val logQueries = ListBuffer.empty[SqlString] - private val logCopyFiles = ListBuffer.empty[Path] - private val logMessages = ListBuffer.empty[String] - private var sleepTime = 0L - - private val interpreter = this - - /** - * Successfully fetched JSONPaths - * Key: "vendor/filename_1.json"; - * Value: "s3://my-jsonpaths/redshift/vendor/filename_1.json" - */ - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - - def getDryRunLogs: String = { - val sleep = s"Consistency check sleep time: $sleepTime\n" - val queries = - if (logQueries.nonEmpty) "Performed SQL Queries:\n" + logQueries.mkString("\n") - else "No SQL queries performed" - val messages = - if (logMessages.nonEmpty) "Debug messages:\n" + logMessages.mkString("\n") - else "" - val files = - if (logCopyFiles.nonEmpty) "Files loaded via stdin:\n" + logCopyFiles.mkString("\n") - else "" - - List(sleep, queries, messages, files).mkString("\n") - } - - def run: LoaderA ~> Id = new (LoaderA ~> Id) { - - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case ListS3(folder) => - S3Interpreter.list(amazonS3, folder).map(summaries => summaries.map(S3.getKey)) - case KeyExists(key) => - S3Interpreter.keyExists(amazonS3, key) - case DownloadData(source, dest) => - logMessages.append(s"Downloading data from [$source] to [$dest]") - List.empty[Path].asRight[LoaderError] - - case ExecuteUpdate(query) => - logQueries.append(query) - 0L.asRight[LoaderError] - case CopyViaStdin(files, _) => - // Will never work while `DownloadData` is 
noop - logCopyFiles.appendAll(files) - 0L.asRight[LoaderError] - - case ExecuteQuery(_, _) => - None.asRight // All used decoders return something with Option - - case CreateTmpDir => - logMessages.append("Created temporary directory") - Paths.get("tmp").asRight - - case DeleteDir(path) => - logMessages.append(s"Deleted temporary directory [${path.toString}]").asRight - - - case Print(message) => - println(message) - case Sleep(timeout) => - sleepTime = sleepTime + timeout - Thread.sleep(timeout) - case Track(log) => - println(log.toString) - case Dump(key) => - val dryRunResult = "Dry-run action: \n" + getDryRunLogs - TrackerInterpreter.dumpStdout(amazonS3, key, dryRunResult) - case Exit(loadResult, dumpResult) => - println("Dry-run action: \n" + getDryRunLogs) - TrackerInterpreter.exit(loadResult, dumpResult) - - case Get(key: String) => - cache.get(key) - case Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case EstablishTunnel(tunnel) => - Right(logMessages.append(s"Established imaginary SSH tunnel to [${tunnel.config.bastion.host}:${tunnel.config.bastion.port}]")) - case CloseTunnel() => - Right(logMessages.append(s"Closed imaginary SSH tunnel")) - - case GetEc2Property(name) => - logMessages.append(s"Fetched imaginary EC2 [$name] property") - Right(name + " key") - - case GetSchemas(vendor, name, model) => - Flattening.getOrdered(igluClient.resolver, vendor, name, model).leftMap { resolutionError => - val message = s"Cannot get schemas for iglu:$vendor/$name/jsonschema/$model-*-*\n$resolutionError" - LoaderError.DiscoveryError(DiscoveryFailure.IgluError(message)) - }.value - } - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/Interpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/Interpreter.scala deleted file mode 100644 index ec47e4280..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/Interpreter.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
- */ -package com.snowplowanalytics.snowplow.rdbloader -package interpreters - -import cats.{ ~>, Id} -import cats.effect.IO -import cats.syntax.either._ - -import com.snowplowanalytics.iglu.client.Client - -import implementations.{S3Interpreter, TrackerInterpreter} - -// This project -import config.CliConfig - -trait Interpreter { - def run: LoaderA ~> Id - - /** For interpreters performing statements */ - def getLastCopyStatements: String = "" -} - -object Interpreter { - - /** - * Initialize clients/connections for interpreter and interpreter itself - * - * @param cliConfig RDB Loader app configuration - * @return prepared interpreter - */ - def initialize(cliConfig: CliConfig): Interpreter = { - val resolver = Client.parseDefault(cliConfig.resolverConfig).value - .fold(x => throw new RuntimeException(s"Initialization error. Cannot initialize Iglu Resolver. $x"), identity) - val amazonS3 = S3Interpreter.getClient(cliConfig.configYaml.aws) - val tracker = TrackerInterpreter.initializeTracking(cliConfig.configYaml.monitoring) - - if (cliConfig.dryRun) { - new DryRunInterpreter(cliConfig, amazonS3, tracker, resolver) - } else { - new RealWorldInterpreter(cliConfig, amazonS3, tracker, resolver) - } - } - - private[interpreters] def runIO[A](action: IO[Either[LoaderError, A]]): Either[LoaderError, A] = { - val io = action.attempt.map { - case Left(throwable) => LoaderError.LoaderLocalError(throwable.getMessage).asLeft - case Right(either) => either - } - io.unsafeRunSync() - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/RealWorldInterpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/RealWorldInterpreter.scala deleted file mode 100644 index fa745ae87..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/RealWorldInterpreter.scala +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
- */ -package com.snowplowanalytics.snowplow.rdbloader -package interpreters - -import java.io.IOException -import java.nio.file._ -import java.nio.file.attribute.BasicFileAttributes -import java.sql.Connection - -import scala.util.control.NonFatal - -import cats._ -import cats.implicits._ - -import io.circe.Json - -import com.amazonaws.services.s3.AmazonS3 - -import org.joda.time.DateTime - -import com.snowplowanalytics.iglu.client.Client - -import com.snowplowanalytics.snowplow.scalatracker.Tracker - - -// This project -import common._ -import LoaderA._ -import LoaderError.LoaderLocalError -import config.CliConfig -import discovery.DiscoveryFailure -import utils.Common -import implementations._ -import com.snowplowanalytics.snowplow.rdbloader.{ Log => ExitLog } -import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString - -/** - * Interpreter performs all actual side-effecting work, - * interpreting `Action` at the end-of-the-world. - * It contains and handles configuration, connections and mutable state, - * all real-world interactions, except argument parsing - */ -class RealWorldInterpreter private[interpreters](cliConfig: CliConfig, - amazonS3: AmazonS3, - tracker: Option[Tracker[Id]], - igluClient: Client[Id, Json]) extends Interpreter { - - private val interpreter = this - - /** - * Successfully fetched JSONPaths - * Key: "vendor/filename_1.json"; - * Value: "s3://my-jsonpaths/redshift/vendor/filename_1.json" - */ - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - - // dbConnection is Either because not required for log dump - // lazy to wait before tunnel established - private var dbConnection: Either[LoaderError, Connection] = _ - - private def getConnection(force: Boolean = false): Either[LoaderError, Connection] = { - if (dbConnection == null) { - dbConnection = JdbcInterpreter.getConnection(cliConfig.target) - } - if (force) { - System.out.println("Forcing reconnection to DB") - dbConnection = JdbcInterpreter.getConnection(cliConfig.target) - } - dbConnection - } - - // General messages that should be printed both to output and final log - private val messages = collection.mutable.ListBuffer.empty[String] - - // DB messages that should be printed only to output and if failure is DB-related - private val messagesCopy = collection.mutable.ListBuffer.empty[String] - - def executeWithRetry[A](action: Connection => SqlString => Either[LoaderError.StorageTargetError, A])(sql: SqlString) = { - val firstAttempt = for { - conn <- getConnection() - _ <- JdbcInterpreter.setAutocommit(conn, false) - r <- action(conn)(sql) - } yield r - firstAttempt match { - case Left(LoaderError.StorageTargetError(message)) if message.contains("Connection refused") => - System.out.println("Sleeping and making another try") - Thread.sleep(10000) - for { - conn <- getConnection(true) - r <- action(conn)(sql) - } yield r - case other => other - } - } - - def run: LoaderA ~> Id = new (LoaderA ~> Id) { - - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case ListS3(folder) => - log(s"Listing $folder") - S3Interpreter.list(amazonS3, folder).map(summaries => summaries.map(S3.getKey)) - case KeyExists(key) => - S3Interpreter.keyExists(amazonS3, key) - case DownloadData(source, dest) => - S3Interpreter.downloadData(amazonS3, source, dest) - - - case ExecuteUpdate(query) => - if (query.startsWith("COPY ")) { logCopy(query.split(" ").take(2).mkString(" ")) } - executeWithRetry[Long](JdbcInterpreter.executeUpdate)(query).asInstanceOf[Id[A]] - - case 
CopyViaStdin(files, query) => - for { - conn <- getConnection() - _ = log(s"Copying ${files.length} files via stdin") - res <- JdbcInterpreter.copyViaStdin(conn, files, query) - } yield res - - case ExecuteQuery(query, d) => - for { - conn <- getConnection() - res <- JdbcInterpreter.executeQuery(conn)(query)(d) - } yield res - - case CreateTmpDir => - try { - Files.createTempDirectory("rdb-loader").asRight - } catch { - case NonFatal(e) => - LoaderLocalError("Cannot create temporary directory.\n" + e.toString).asLeft - } - case DeleteDir(path) => - try { - Files.walkFileTree(path, RealWorldInterpreter.DeleteVisitor).asRight[LoaderError].void - } catch { - case NonFatal(e) => LoaderLocalError(s"Cannot delete directory [${path.toString}].\n" + e.toString).asLeft - } - - case Print(message) => - log(message) - case Sleep(timeout) => - log(s"Sleeping $timeout milliseconds") - Thread.sleep(timeout) - case Track(result) => - result match { - case ExitLog.LoadingSucceeded => - TrackerInterpreter.trackSuccess(tracker) - log(result.toString) - case ExitLog.LoadingFailed(message) => - val secrets = List(cliConfig.target.password.getUnencrypted, cliConfig.target.username) - val sanitizedMessage = Common.sanitize(message, secrets) - TrackerInterpreter.trackError(tracker) - log(sanitizedMessage) - } - case Dump(key) => - log(s"Dumping $key") - val logs = messages.mkString("\n") + "\n" - TrackerInterpreter.dumpStdout(amazonS3, key, logs) - case Exit(loadResult, dumpResult) => - getConnection().foreach(c => c.close()) - TrackerInterpreter.exit(loadResult, dumpResult) - - - case Get(key: String) => - cache.get(key) - case Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case EstablishTunnel(config) => - log("Establishing SSH tunnel") - SshInterpreter.establishTunnel(config) - case CloseTunnel() => - log("Closing SSH tunnel") - SshInterpreter.closeTunnel() - - case GetEc2Property(name) => - SshInterpreter.getKey(name) - - case GetSchemas(vendor, name, model) => - Flattening.getOrdered(igluClient.resolver, vendor, name, model).leftMap { resolutionError => - val message = s"Cannot get schemas for iglu:$vendor/$name/jsonschema/$model-*-*\n$resolutionError" - LoaderError.DiscoveryError(DiscoveryFailure.IgluError(message)) - }.value - } - } - } - - - override def getLastCopyStatements: String = { - val last = messagesCopy.take(3) - if (last.isEmpty) - "No COPY statements were performed" - else - s"Last ${last.length} COPY statements:\n${last.mkString("\n")}" - } - - private def log(message: String): Unit = { - val endMessage = s"RDB Loader [${DateTime.now()}]: $message" - System.out.println(message) - messages.append(endMessage) - } - - private def logCopy(message: String): Unit = { - val endMessage = s"RDB Loader [${DateTime.now()}]: $message" - System.out.println(endMessage) - messagesCopy.append(endMessage) - } - -} - -object RealWorldInterpreter { - - object DeleteVisitor extends SimpleFileVisitor[Path] { - override def visitFile(file: Path, attrs: BasicFileAttributes) = { - Files.delete(file) - FileVisitResult.CONTINUE - } - - override def postVisitDirectory(dir: Path, exc: IOException): FileVisitResult = { - Files.delete(dir) - FileVisitResult.CONTINUE - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/JdbcInterpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/JdbcInterpreter.scala deleted file mode 100644 index 41817fba3..000000000 --- 
a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/JdbcInterpreter.scala +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader -package interpreters.implementations - -import java.io.FileReader -import java.nio.file.Path -import java.sql.{Connection, SQLException} -import java.util.Properties - -import scala.util.control.NonFatal -import com.amazon.redshift.jdbc42.{Driver => RedshiftDriver} -import cats.implicits._ -import org.postgresql.copy.CopyManager -import org.postgresql.jdbc.PgConnection -import org.postgresql.{Driver => PgDriver} -import LoaderError.StorageTargetError -import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget -import db.Decoder -import loaders.Common.SqlString - -object JdbcInterpreter { - - /** - * Execute a single update-statement in provided Postgres connection - * - * @param conn Postgres connection - * @param sql string with valid SQL statement - * @return number of updated rows in case of success, failure otherwise - */ - def executeUpdate(conn: Connection)(sql: SqlString): Either[StorageTargetError, Long] = - Either.catchNonFatal { - conn.createStatement().executeUpdate(sql).toLong - } leftMap { - case NonFatal(e: java.sql.SQLException) if Option(e.getMessage).getOrElse("").contains("is not authorized to assume IAM Role") => - StorageTargetError("IAM Role with S3 Read permissions is not attached to Redshift instance") - case NonFatal(e) => - println("RDB Loader unknown error in executeUpdate") - e.printStackTrace(System.out) - StorageTargetError(Option(e.getMessage).getOrElse(e.toString)) - } - - def executeQuery[A](conn: Connection)(sql: SqlString)(implicit ev: Decoder[A]): Either[StorageTargetError, A] = - try { - val resultSet = conn.createStatement().executeQuery(sql) - ev.decode(resultSet) match { - case Left(e) => StorageTargetError(s"Cannot decode SQL row: ${e.message}").asLeft - case Right(a) => a.asRight[StorageTargetError] - } - } catch { - case NonFatal(e) => - println("RDB Loader unknown error in executeQuery") - e.printStackTrace(System.out) - StorageTargetError(Option(e.getMessage).getOrElse(e.toString)).asLeft[A] - } - - def setAutocommit(conn: Connection, autoCommit: Boolean): Either[LoaderError, Unit] = - try { - Right(conn.setAutoCommit(autoCommit)) - } catch { - case e: SQLException => - println("setAutocommit error") - e.printStackTrace(System.out) - Left(StorageTargetError(e.toString)) - } - - def copyViaStdin(conn: Connection, files: List[Path], copyStatement: SqlString): Either[LoaderError, Long] = { - val copyManager = Either.catchNonFatal { - new CopyManager(conn.asInstanceOf[PgConnection]) - } leftMap { e => StorageTargetError(e.toString) } - - for { - manager <- copyManager - _ <- setAutocommit(conn, false) - result = files.traverse(copyIn(manager, 
copyStatement)(_)).map(_.combineAll) - _ = result.fold(_ => conn.rollback(), _ => conn.commit()) - _ <- setAutocommit(conn, true) - endResult <- result - } yield endResult - } - - def copyIn(copyManager: CopyManager, copyStatement: String)(file: Path): Either[LoaderError, Long] = - try { - Right(copyManager.copyIn(copyStatement, new FileReader(file.toFile))) - } catch { - case NonFatal(e) => Left(StorageTargetError(e.toString)) - } - - /** - * Get Redshift or Postgres connection - */ - def getConnection(target: StorageTarget): Either[LoaderError, Connection] = { - try { - val password = target.password match { - case StorageTarget.PlainText(text) => text - case StorageTarget.EncryptedKey(StorageTarget.EncryptedConfig(key)) => - SshInterpreter.getKey(key.parameterName).valueOr(error => throw new RuntimeException(s"Cannot retrieve JDBC password from EC2 Parameter Store. ${error.show}")) - } - - val props = new Properties() - props.setProperty("user", target.username) - props.setProperty("password", password) - - target match { - case r: StorageTarget.RedshiftConfig => - def connect() = - Either.catchNonFatal(new RedshiftDriver().connect(s"jdbc:redshift://${target.host}:${target.port}/${target.database}", props)) - - for { - _ <- r.jdbc.validation match { - case Left(error) => LoaderError.ConfigError(error.message).asLeft - case Right(propertyUpdaters) => - propertyUpdaters.foreach(f => f(props)).asRight - } - firstAttempt = connect() - connection <- firstAttempt match { - case Right(c) => - c.asRight - case Left(e) => - println("Error during connection acquisition. Sleeping and making another attempt") - e.printStackTrace(System.out) - Thread.sleep(60000) - connect().leftMap(e2 => LoaderError.StorageTargetError(e2.getMessage)) - } - } yield connection - - case p: StorageTarget.PostgresqlConfig => - val url = s"jdbc:postgresql://${p.host}:${p.port}/${p.database}" - props.setProperty("sslmode", p.sslMode.asProperty) - Right(new PgDriver().connect(url, props)) - } - } catch { - case NonFatal(e) => - println("RDB Loader getConnection error") - e.printStackTrace(System.out) - Left(StorageTargetError(s"Problems with establishing DB connection\n${e.getMessage}")) - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/S3Interpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/S3Interpreter.scala deleted file mode 100644 index f2dac4eea..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/S3Interpreter.scala +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
- */ -package com.snowplowanalytics.snowplow.rdbloader.interpreters.implementations - -import java.nio.file.{Files, Path, Paths} - -import cats.Functor -import cats.implicits._ - -import com.amazonaws.AmazonServiceException -import com.amazonaws.services.s3.model._ -import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} - -import com.snowplowanalytics.snowplow.rdbloader.S3.{Folder, splitS3Key, splitS3Path} -import com.snowplowanalytics.snowplow.rdbloader.config.SnowplowConfig.SnowplowAws -import com.snowplowanalytics.snowplow.rdbloader.{LoaderError, S3} - -import scala.collection.convert.wrapAsScala._ -import scala.util.control.NonFatal - -// This project -import com.snowplowanalytics.snowplow.rdbloader.LoaderError.DiscoveryError -import com.snowplowanalytics.snowplow.rdbloader.discovery.DiscoveryFailure.{S3Failure, DownloadFailure} - - -/** - * Side-effecting functions for interpreting S3 actions - */ -object S3Interpreter { - - val F = Functor[Either[LoaderError, ?]].compose[List] - - /** - * Create S3 client, backed by AWS Java SDK - * - * @param awsConfig Snowplow AWS Configuration - * @return Snowplow-specific S3 client - */ - def getClient(awsConfig: SnowplowAws): AmazonS3 = - AmazonS3ClientBuilder.standard().withRegion(awsConfig.s3.region).build() - - /** - * List all non-empty keys in S3 folder. - * This function will return as many matching keys as exist in bucket - * - * @param client AWS Client - * @param s3folder valid S3 folder (with trailing slash) to list - * @return list of valid S3 keys - */ - def list(client: AmazonS3, s3folder: Folder): Either[DiscoveryError, List[S3ObjectSummary]] = { - val (bucket, prefix) = splitS3Path(s3folder) - - val req = new ListObjectsV2Request() - .withBucketName(bucket) - .withPrefix(prefix) - - def keyUnfold(result: ListObjectsV2Result): Stream[S3ObjectSummary] = { - if (result.isTruncated) { - val loaded = result.getObjectSummaries() - req.setContinuationToken(result.getNextContinuationToken) - loaded.toStream #::: keyUnfold(client.listObjectsV2(req)) - } else { - result.getObjectSummaries().toStream - } - } - - try { - Right(keyUnfold(client.listObjectsV2(req)).filterNot(_.getSize == 0).toList) - } catch { - case NonFatal(e) => Left(DiscoveryError(List(S3Failure(e.toString)))) - } - } - - /** - * Check if some `file` exists in S3 `path` - * - * @param client AWS Client - * @param key valid S3 key (without trailing slash) - * @return true if file exists, false if file doesn't exist or not available - */ - def keyExists(client: AmazonS3, key: S3.Key): Boolean = { - val (bucket, s3Key) = splitS3Key(key) - val request = new GetObjectMetadataRequest(bucket, s3Key) - try { - client.getObjectMetadata(request) - true - } catch { - case _: AmazonServiceException => false - } - } - - /** - * Download contents of S3 folder into `destination` - * - * @param client AWS S3 client - * @param source AWS S3 folder - * @param dest optional local path, tmp dir will be used if not specified - * @return list of downloaded filenames - */ - def downloadData(client: AmazonS3, source: S3.Folder, dest: Path): Either[LoaderError, List[Path]] = { - val files = F.map(list(client, source)) { summary => - val bucket = summary.getBucketName - val key = summary.getKey - try { - val s3Object = client.getObject(new GetObjectRequest(bucket, key)) - val destinationFile = Paths.get(dest.toString, key) - - if (!Files.exists(destinationFile)) { - Files.createDirectories(destinationFile.getParent) - Files.copy(s3Object.getObjectContent, destinationFile) - 
Right(destinationFile) - } else { - Left(DownloadFailure(S3.Key.coerce(s"s3://$bucket/$key"), "File already exist")) - } - } catch { - case NonFatal(e) => - Left(DownloadFailure(S3.Key.coerce(s"s3://$bucket/$key"), e.toString)) - } - } - - files.map(stream => stream.sequence match { - case Left(failure) => Left(DiscoveryError(List(failure))) - case Right(success) => Right(success.toList) - }).flatten - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/SshInterpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/SshInterpreter.scala deleted file mode 100644 index b32eb525f..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/SshInterpreter.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader.interpreters.implementations - -import scala.util.control.NonFatal - -import cats.implicits._ - -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagementClientBuilder -import com.amazonaws.services.simplesystemsmanagement.model.GetParameterRequest - -import com.jcraft.jsch.{JSch, Session} - -import com.snowplowanalytics.snowplow.rdbloader.LoaderError -import com.snowplowanalytics.snowplow.rdbloader.Security.Tunnel - -/** Real-world interpreter, responsible for maintaining **single** SSH session for tunnel */ -object SshInterpreter { - - private val jsch = new JSch() - private var sshSession: Session = _ - - /** - * Create a SSH tunnel to bastion host and set port forwarding to target DB - * @param tunnel SSH-tunnel configuration - * @return either nothing on success and error message on failure - */ - def establishTunnel(tunnel: Tunnel): Either[LoaderError, Unit] = { - if (sshSession != null) Left(LoaderError.LoaderLocalError("Session for SSH tunnel already opened")) - else { - val either = Either.catchNonFatal { - jsch.addIdentity("rdb-loader-tunnel-key", tunnel.identity.key.orNull, null, tunnel.identity.passphrase.orNull) - sshSession = jsch.getSession(tunnel.config.bastion.user, tunnel.config.bastion.host, tunnel.config.bastion.port) - sshSession.setConfig("StrictHostKeyChecking", "no") - sshSession.connect() - val _ = sshSession.setPortForwardingL(tunnel.config.localPort, tunnel.config.destination.host, tunnel.config.destination.port) - () - } - either.leftMap(e => LoaderError.LoaderLocalError(e.getMessage)) - } - } - - /** Try to close SSH tunnel, fail if it was not open */ - def closeTunnel(): Either[LoaderError, Unit] = - if (sshSession == null) Left(LoaderError.LoaderLocalError("Attempted to close nonexistent SSH session")) - else try { - Right(sshSession.disconnect()) - } catch { - case NonFatal(e) => Left(LoaderError.LoaderLocalError(e.getMessage)) - } - - /** - * Get value from AWS EC2 
Parameter Store - * @param name systems manager parameter's name with SSH key - * @return decrypted string with key - */ - def getKey(name: String): Either[LoaderError, String] = { - try { - val client = AWSSimpleSystemsManagementClientBuilder.defaultClient() - val req: GetParameterRequest = new GetParameterRequest().withName(name).withWithDecryption(true) - val par = client.getParameter(req) - Right(par.getParameter.getValue) - } catch { - case NonFatal(e) => Left(LoaderError.LoaderLocalError(e.getMessage)) - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/TrackerInterpreter.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/TrackerInterpreter.scala deleted file mode 100644 index 126a93a94..000000000 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/interpreters/implementations/TrackerInterpreter.scala +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.rdbloader -package interpreters.implementations - -import java.io.ByteArrayInputStream -import java.nio.charset.StandardCharsets - -import scala.util.control.NonFatal - -import cats.Id -import cats.data.NonEmptyList - -import io.circe.Json - -import com.amazonaws.services.s3.AmazonS3 -import com.amazonaws.services.s3.model.ObjectMetadata - -import org.joda.time.DateTime - -import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData} - -import com.snowplowanalytics.snowplow.scalatracker._ -import com.snowplowanalytics.snowplow.scalatracker.emitters.id._ -import com.snowplowanalytics.snowplow.scalatracker.emitters.id.RequestProcessor._ -import com.snowplowanalytics.snowplow.scalatracker.emitters.id.{SyncBatchEmitter, SyncEmitter} - -// This project -import config.SnowplowConfig.{GetMethod, Monitoring, PostMethod} -import common._ - -object TrackerInterpreter { - - val ApplicationContextSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "application_context", "jsonschema", SchemaVer.Full(1,0,0)) - val LoadSucceededSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "load_succeeded", "jsonschema", SchemaVer.Full(1,0,0)) - val LoadFailedSchema = SchemaKey("com.snowplowanalytics.monitoring.batch", "load_failed", "jsonschema", SchemaVer.Full(1,0,0)) - - /** Callback for failed */ - private def callback(params: CollectorParams, request: CollectorRequest, response: CollectorResponse): Unit = { - def toMsg(rsp: CollectorResponse, includeHeader: Boolean): String = rsp match { - case CollectorFailure(code) => - val header = if (includeHeader) { s"Snowplow Tracker [${DateTime.now()}]: " } else "" - header ++ s"Cannot deliver event to ${params.getUri}. 
Collector responded with $code" - case TrackerFailure(error) => - val header = if (includeHeader) { s"Snowplow Tracker [${DateTime.now()}]: " } else "" - header ++ s"Cannot deliver event to ${params.getUri}. Tracker failed due ${error.getMessage}" - case RetriesExceeded(r) => s"Tracker [${DateTime.now()}]: Gave up on trying to deliver event. Last error: ${toMsg(r, false)}" - case CollectorSuccess(_) => "" - } - - val message = toMsg(response, true) - - // The only place in interpreters where println used instead of logger as this is async function - if (message.isEmpty) () else println(message) - } - - /** - * Initialize Snowplow tracker, if `monitoring` section is properly configured - * - * @param monitoring config.yml `monitoring` section - * @return some tracker if enabled, none otherwise - */ - def initializeTracking(monitoring: Monitoring): Option[Tracker[Id]] = { - monitoring.snowplow.flatMap(_.collector) match { - case Some(Collector((host, port))) => - val emitter = monitoring.snowplow.flatMap(_.method) match { - case Some(GetMethod) => - SyncEmitter.createAndStart(host, port = Some(port), callback = Some(callback)) - case Some(PostMethod) => - SyncBatchEmitter.createAndStart(host, port = Some(port), bufferSize = 2) - case None => - SyncEmitter.createAndStart(host, port = Some(port), callback = Some(callback)) - } - val tracker = new Tracker[Id](NonEmptyList.of(emitter), "snowplow-rdb-loader", monitoring.snowplow.flatMap(_.appId).getOrElse("rdb-loader")) - Some(tracker) - case Some(_) => None - case None => None - } - } - - /** - * Track error if `tracker` is enabled. Print error otherwise - * - * @param tracker some tracker if enabled - */ - def trackError(tracker: Option[Tracker[Id]]): Unit = tracker match { - case Some(t) => - t.trackSelfDescribingEvent(SelfDescribingData(LoadFailedSchema, Json.fromFields(List.empty))) - case None => () - } - - /** - * Track error if `tracker` is enabled. 
Do nothing otherwise - * - * @param tracker some tracker if enabled - */ - def trackSuccess(tracker: Option[Tracker[Id]]): Unit = tracker match { - case Some(t) => - t.trackSelfDescribingEvent(SelfDescribingData(LoadSucceededSchema, Json.fromFields(List.empty))) - case None => () - } - - /** - * Dump stdout to S3 logging object to be retrieved by EmrEtlRunner later - * - * @param s3Client AWS S3 client - * @param key S3 object, retrieved from EmrEtlRunner - * @param content plain text to write - */ - def dumpStdout(s3Client: AmazonS3, key: S3.Key, content: String): Either[String, S3.Key] = { - try { - if (S3Interpreter.keyExists(s3Client, key)) { - Left(s"S3 log object [$key] already exists") - } else { - val meta = new ObjectMetadata() - meta.setContentLength(content.length) - meta.setContentEncoding("text/plain") - - val (bucket, prefix) = S3.splitS3Key(key) - val is = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)) - s3Client.putObject(bucket, prefix, is, meta) - Right(key) - } - - } catch { - case NonFatal(e) => - Left(e.toString) - } - } - - /** - * Exit job with appropriate status code and printing - * exit message (same as dumped to S3) to stdout - * - * @param result loading result - * @param dumpResult S3 dumping result, none if loader didn't try to dump - */ - def exit(result: Log, dumpResult: Option[Either[String, S3.Key]]): Int = { - (result, dumpResult) match { - case (Log.LoadingSucceeded, None) => - println(s"INFO: Logs were not dumped to S3") - 0 - case (Log.LoadingFailed(_), None) => - println(s"INFO: Logs were not dumped to S3") - 1 - case (Log.LoadingSucceeded, Some(Right(key))) => - println(s"INFO: Logs successfully dumped to S3 [$key]") - 0 - case (Log.LoadingFailed(_), Some(Right(key))) => - println(s"INFO: Logs successfully dumped to S3 [$key]") - 1 - case (_, Some(Left(error))) => - println(s"ERROR: Log-dumping failed: [$error]") - 1 - } - } - - /** - * Config helper functions - */ - private object Collector { - def isInt(s: String): Boolean = try { s.toInt; true } catch { case _: NumberFormatException => false } - - def unapply(hostPort: String): Option[(String, Int)] = - hostPort.split(":").toList match { - case host :: port :: Nil if isInt(port) => Some((host, port.toInt)) - case host :: Nil => Some((host, 80)) - case _ => None - } - } -} diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/Common.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/Common.scala index b23b1ad94..58bd1aae3 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/Common.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/Common.scala @@ -10,22 +10,26 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
*/ -package com.snowplowanalytics.snowplow.rdbloader -package loaders +package com.snowplowanalytics.snowplow.rdbloader.loaders -import java.sql.{ Timestamp => SqlTimestamp } +import java.sql.{Timestamp => SqlTimestamp} +import scala.concurrent.duration._ +import cats.Monad import cats.implicits._ +import cats.effect.Timer import shapeless.tag import shapeless.tag._ // This project -import common.StorageTarget - -import config.{ CliConfig, Step } -import db.Entities._ -import discovery.DataDiscovery +import com.snowplowanalytics.snowplow.rdbloader._ +import com.snowplowanalytics.snowplow.rdbloader.dsl.{Cache, Logging, FS, AWS, JDBC, Iglu} +import com.snowplowanalytics.snowplow.rdbloader.config.{ CliConfig, Step } +import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget +import com.snowplowanalytics.snowplow.rdbloader.db.Migration +import com.snowplowanalytics.snowplow.rdbloader.db.Entities._ +import com.snowplowanalytics.snowplow.rdbloader.discovery.DataDiscovery object Common { @@ -80,25 +84,24 @@ object Common { * Process any valid storage target, * including discovering step and establishing SSH-tunnel * - * @param cliConfig RDB Loader app configuration + * @param config RDB Loader app configuration */ - def load(cliConfig: CliConfig, discovery: List[DataDiscovery]): LoaderAction[Unit] = { - val loadDb = cliConfig.target match { - case postgresqlTarget: StorageTarget.PostgresqlConfig => - PostgresqlLoader.run(postgresqlTarget, cliConfig.steps, discovery) - case redshiftTarget: StorageTarget.RedshiftConfig => - RedshiftLoader.run(cliConfig.configYaml, redshiftTarget, cliConfig.steps, discovery) + def load[F[_]: Monad: Logging: FS: AWS: Iglu: JDBC](config: CliConfig, + discovery: List[DataDiscovery]): LoaderAction[F, Unit] = + config.target match { + case db: StorageTarget.PostgresqlConfig => + PostgresqlLoader.run[F](db, config.steps, discovery) + case db: StorageTarget.RedshiftConfig => + Migration.perform[F](config.target.schema)(discovery) *> + RedshiftLoader.run[F](config.configYaml, db, config.steps, discovery) } - Security.bracket(cliConfig.target.sshTunnel, loadDb) - } - /** * Choose a discovery strategy and perform it * * @param cliConfig RDB Loader app configuration */ - def discover(cliConfig: CliConfig): LoaderAction[List[DataDiscovery]] = { + def discover[F[_]: Monad: Timer: Cache: Logging: AWS](cliConfig: CliConfig): LoaderAction[F, List[DataDiscovery]] = { // Shortcuts val shredJob = cliConfig.configYaml.storage.versions.rdbShredder val region = cliConfig.configYaml.aws.s3.region @@ -113,13 +116,13 @@ object Common { cliConfig.target match { case _: StorageTarget.RedshiftConfig => - val original = DataDiscovery.discoverFull(target, cliConfig.target.id, shredJob, region, assets) + val original = DataDiscovery.discover[F](target, shredJob, region, assets) if (cliConfig.steps.contains(Step.ConsistencyCheck) && cliConfig.target.processingManifest.isEmpty) - DataDiscovery.checkConsistency(original) + DataDiscovery.checkConsistency[F](original) else original case _: StorageTarget.PostgresqlConfig => // Safe to skip consistency check as whole folder will be downloaded - DataDiscovery.discoverFull(target, cliConfig.target.id, shredJob, region, assets) + DataDiscovery.discover[F](target, shredJob, region, assets) } } @@ -128,22 +131,23 @@ object Common { * @param loadAction set of queries inside a transaction loading atomic and shredded only * (no vacuum or analyze) */ - def retryIfFailed(loadAction: LoaderAction[Unit]): LoaderAction[Unit] = { + def retryIfFailed[F[_]: 
Monad: Timer: Logging: JDBC](loadAction: LoaderAction[F, Unit]): LoaderAction[F, Unit] = { val retry = loadAction.value.flatMap[Either[LoaderError, Unit]] { case Left(LoaderError.StorageTargetError(message)) if message.contains("Connection refused") => for { - _ <- LoaderA.print(s"Loading failed with [$message], making another attempt") - retransact <- (LoaderA.executeUpdate(Common.AbortTransaction) *> LoaderA.executeUpdate(Common.BeginTransaction)).value - _ <- LoaderA.sleep(60000) + _ <- Logging[F].print(s"Loading failed with [$message], making another attempt") + retransact <- (JDBC[F].executeUpdate(Common.AbortTransaction) *> + JDBC[F].executeUpdate(Common.BeginTransaction)).value + _ <- Timer[F].sleep(60.seconds) result <- retransact match { case Right(_) => loadAction.value - case Left(_) => Action.lift(retransact.void) + case Left(_) => Monad[F].pure(retransact.void) } } yield result case e @ Left(_) => - LoaderA.print("Loading failed, no retries will be made") *> Action.lift(e) + Logging[F].print("Loading failed, no retries will be made") *> Monad[F].pure(e) case success => - Action.lift(success) + Monad[F].pure(success) } LoaderAction(retry) } @@ -168,33 +172,35 @@ object Common { * @return true if manifest record exists and atomic data is empty, assuming folder contains no events * and next manifest record shouldn't be written */ - def checkLoadManifest(schema: String, eventsTable: EventsTable, possiblyEmpty: Boolean): LoaderAction[Boolean] = { + def checkLoadManifest[F[_]: Monad: Logging: JDBC](schema: String, + eventsTable: EventsTable, + possiblyEmpty: Boolean): LoaderAction[F, Boolean] = { for { latestAtomicTstamp <- getEtlTstamp(eventsTable) isEmptyLoad <- latestAtomicTstamp match { case Some(timestamp) => for { - _ <- LoaderA.print(s"Load manifest: latest timestamp in ${eventsTable.getDescriptor} is ${timestamp.etlTstamp}").liftA + _ <- Logging[F].print(s"Load manifest: latest timestamp in ${eventsTable.getDescriptor} is ${timestamp.etlTstamp}").liftA item <- getManifestItem(schema, timestamp.etlTstamp) empty <- item match { case Some(manifest) if manifest.etlTstamp == timestamp.etlTstamp && possiblyEmpty => val message = s"Load manifest: record for ${manifest.etlTstamp} already exists, but atomic folder is empty" - for { _ <- LoaderA.print(message).liftA } yield true + for { _ <- Logging[F].print(message).liftA} yield true case Some(manifest) if manifest.etlTstamp == timestamp.etlTstamp => val message = getLoadManifestMessage(manifest) - LoaderAction.liftE[Boolean](LoaderError.LoadManifestError(message).asLeft) + LoaderAction.liftE[F, Boolean](LoaderError.LoadManifestError(message).asLeft) case Some(record) => - for { _ <- LoaderA.print(s"Load manifest: latest record ${record.show}").liftA } yield false + for { _ <- Logging[F].print(s"Load manifest: latest record ${record.show}").liftA} yield false case None => - for { _ <- LoaderA.print(s"Load manifest: no records found").liftA } yield false + for { _ <- Logging[F].print(s"Load manifest: no records found").liftA} yield false } } yield empty - case None => LoaderAction.lift(false) + case None => LoaderAction.rightT[F, Boolean](false) } } yield isEmptyLoad } /** Get ETL timestamp of ongoing load */ - private[loaders] def getEtlTstamp(eventsTable: EventsTable): LoaderAction[Option[Timestamp]] = { + private[loaders] def getEtlTstamp[F[_]: JDBC](eventsTable: EventsTable): LoaderAction[F, Option[Timestamp]] = { val query = s"""SELECT etl_tstamp | FROM ${eventsTable.getDescriptor} @@ -202,11 +208,12 @@ object Common { | ORDER BY 
etl_tstamp DESC | LIMIT 1""".stripMargin - LoaderA.executeQuery[Option[Timestamp]](SqlString.unsafeCoerce(query)) + JDBC[F].executeQuery[Option[Timestamp]](SqlString.unsafeCoerce(query)) } /** Get latest load manifest item */ - private[loaders] def getManifestItem(schema: String, etlTstamp: SqlTimestamp): LoaderAction[Option[LoadManifestItem]] = { + private[loaders] def getManifestItem[F[_]: JDBC](schema: String, + etlTstamp: SqlTimestamp): LoaderAction[F, Option[LoadManifestItem]] = { val query = s"""SELECT * | FROM ${getManifestTable(schema)} @@ -214,7 +221,7 @@ object Common { | ORDER BY etl_tstamp DESC | LIMIT 1""".stripMargin - LoaderA.executeQuery[Option[LoadManifestItem]](SqlString.unsafeCoerce(query)) + JDBC[F].executeQuery[Option[LoadManifestItem]](SqlString.unsafeCoerce(query)) } private def getLoadManifestMessage(manifest: LoadManifestItem): String = diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/PostgresqlLoader.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/PostgresqlLoader.scala index bee1102df..3472e77c7 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/PostgresqlLoader.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/PostgresqlLoader.scala @@ -10,15 +10,16 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader -package loaders +package com.snowplowanalytics.snowplow.rdbloader.loaders +import cats.Monad import cats.implicits._ -import common.StorageTarget.PostgresqlConfig -import LoaderA._ -import config.Step -import discovery.DataDiscovery +import com.snowplowanalytics.snowplow.rdbloader.LoaderAction +import com.snowplowanalytics.snowplow.rdbloader.dsl.{FS, AWS, JDBC} +import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget.PostgresqlConfig +import com.snowplowanalytics.snowplow.rdbloader.config.Step +import com.snowplowanalytics.snowplow.rdbloader.discovery.DataDiscovery object PostgresqlLoader { @@ -30,14 +31,14 @@ object PostgresqlLoader { * @param steps SQL steps * @param discovery discovered data to load */ - def run(target: PostgresqlConfig, steps: Set[Step], discovery: List[DataDiscovery]): LoaderAction[Unit] = { + def run[F[_]: Monad: FS: AWS: JDBC](target: PostgresqlConfig, steps: Set[Step], discovery: List[DataDiscovery]): LoaderAction[F, Unit] = { val eventsTable = Common.getEventsTable(target) val statements = PostgresqlLoadStatements.build(eventsTable, steps) for { - _ <- discovery.traverse(loadFolder(statements)) - _ <- analyze(statements) - _ <- vacuum(statements) + _ <- discovery.traverse(loadFolder[F](statements)) + _ <- analyze[F](statements) + _ <- vacuum[F](statements) } yield () } @@ -48,12 +49,12 @@ object PostgresqlLoader { * @param discovery discovered run folder * @return changed app state */ - def loadFolder(statement: PostgresqlLoadStatements)(discovery: DataDiscovery): LoaderAction[Long] = { + def loadFolder[F[_]: Monad: FS: AWS: JDBC](statement: PostgresqlLoadStatements)(discovery: DataDiscovery): LoaderAction[F, Long] = { for { - tmpdir <- createTmpDir - files <- downloadData(discovery.atomicEvents, tmpdir) - count <- copyViaStdin(files, statement.events) - _ <- deleteDir(tmpdir) + tmpdir <- FS[F].createTmpDir + files <- AWS[F].downloadData(discovery.atomicEvents, tmpdir) + count <- JDBC[F].copyViaStdin(files, 
statement.events) + _ <- FS[F].deleteDir(tmpdir) } yield count } @@ -61,10 +62,10 @@ object PostgresqlLoader { * Return action executing VACUUM statements if there's any vacuum statements, * or noop if no vacuum statements were generated */ - def analyze(statements: PostgresqlLoadStatements): LoaderAction[Unit] = { + def analyze[F[_]: Monad: JDBC](statements: PostgresqlLoadStatements): LoaderAction[F, Unit] = { statements.analyze match { - case Some(analyze) => executeUpdates(List(analyze)).void - case None => LoaderAction.unit + case Some(analyze) => JDBC[F].executeUpdates(List(analyze)).void + case None => LoaderAction.unit[F] } } @@ -72,10 +73,10 @@ object PostgresqlLoader { * Return action executing ANALYZE statements if there's any vacuum statements, * or noop if no vacuum statements were generated */ - def vacuum(statements: PostgresqlLoadStatements): LoaderAction[Unit] = { + def vacuum[F[_]: Monad: JDBC](statements: PostgresqlLoadStatements): LoaderAction[F, Unit] = { statements.vacuum match { - case Some(vacuum) => executeUpdates(List(vacuum)).void - case None => LoaderAction.unit + case Some(vacuum) => JDBC[F].executeUpdates(List(vacuum)).void + case None => LoaderAction.unit[F] } } } diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoadStatements.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoadStatements.scala index 48ea80bcb..7b37daba4 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoadStatements.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoadStatements.scala @@ -14,6 +14,7 @@ package com.snowplowanalytics.snowplow.rdbloader package loaders import cats.implicits._ +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 // This project import common.StorageTarget.RedshiftConfig diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoader.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoader.scala index 518933792..d263ad1ca 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoader.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoader.scala @@ -10,19 +10,19 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
*/ -package com.snowplowanalytics.snowplow.rdbloader -package loaders +package com.snowplowanalytics.snowplow.rdbloader.loaders +import cats.Monad import cats.implicits._ // This project -import common.StorageTarget - -import LoaderA._ -import RedshiftLoadStatements._ -import Common.{ SqlString, EventsTable, checkLoadManifest, AtomicEvents, TransitTable } -import discovery.DataDiscovery -import config.{ SnowplowConfig, Step } +import com.snowplowanalytics.snowplow.rdbloader._ +import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget +import com.snowplowanalytics.snowplow.rdbloader.config.{ SnowplowConfig, Step } +import com.snowplowanalytics.snowplow.rdbloader.discovery.DataDiscovery +import com.snowplowanalytics.snowplow.rdbloader.dsl.{Logging, JDBC} +import com.snowplowanalytics.snowplow.rdbloader.loaders.RedshiftLoadStatements._ +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.{ SqlString, EventsTable, checkLoadManifest, AtomicEvents, TransitTable } /** @@ -45,14 +45,11 @@ object RedshiftLoader { * @param target Redshift storage target configuration * @param steps SQL steps */ - def run(config: SnowplowConfig, - target: StorageTarget.RedshiftConfig, - steps: Set[Step], - discovery: List[DataDiscovery]) = { - val queue = buildQueue(config, target, steps)(discovery) - - queue.traverse(loadFolder(steps)).void - } + def run[F[_]: Monad: Logging: JDBC](config: SnowplowConfig, + target: StorageTarget.RedshiftConfig, + steps: Set[Step], + discovery: List[DataDiscovery]) = + buildQueue(config, target, steps)(discovery).traverse_(loadFolder[F](steps)) /** * Perform data-loading for a single run folder. @@ -60,29 +57,27 @@ object RedshiftLoader { * @param statements prepared load statements * @return application state */ - def loadFolder(steps: Set[Step])(statements: RedshiftLoadStatements): LoaderAction[Unit] = { - import LoaderA._ - + def loadFolder[F[_]: Monad: Logging: JDBC](steps: Set[Step])(statements: RedshiftLoadStatements): LoaderAction[F, Unit] = { val checkManifest = steps.contains(Step.LoadManifestCheck) val loadManifest = steps.contains(Step.LoadManifest) def loadTransaction = for { - empty <- getLoad(checkManifest, statements.dbSchema, statements.atomicCopy, statements.discovery.possiblyEmpty) - _ <- executeUpdates(statements.shredded) - _ <- if (loadManifest && !empty) executeUpdate(statements.manifest) *> LoaderA.print("Load manifest: added new record").liftA - else if (loadManifest && empty) LoaderA.print(EmptyMessage).liftA - else LoaderAction.unit + empty <- getLoad[F](checkManifest, statements.dbSchema, statements.atomicCopy, statements.discovery.possiblyEmpty) + _ <- JDBC[F].executeUpdates(statements.shredded) + _ <- if (loadManifest && !empty) JDBC[F].executeUpdate(statements.manifest) *> Logging[F].print("Load manifest: added new record").liftA + else if (loadManifest && empty) Logging[F].print(EmptyMessage).liftA + else LoaderAction.unit[F] } yield () for { - _ <- LoaderA.print(s"Loading ${statements.base}").liftA + _ <- Logging[F].print(s"Loading ${statements.base}").liftA - _ <- executeUpdate(Common.BeginTransaction) + _ <- JDBC[F].executeUpdate(Common.BeginTransaction) _ <- loadTransaction - _ <- executeUpdate(Common.CommitTransaction) - _ <- LoaderA.print(s"Loading finished for ${statements.base}").liftA - _ <- vacuum(statements) - _ <- analyze(statements) + _ <- JDBC[F].executeUpdate(Common.CommitTransaction) + _ <- Logging[F].print(s"Loading finished for ${statements.base}").liftA + _ <- vacuum[F](statements) + _ <- analyze[F](statements) } 
yield () } @@ -90,24 +85,24 @@ object RedshiftLoader { * Get COPY action, either straight or transit (along with load manifest check) * @return */ - def getLoad(checkManifest: Boolean, dbSchema: String, copy: AtomicCopy, empty: Boolean): LoaderAction[Boolean] = { - def check(eventsTable: EventsTable): LoaderAction[Boolean] = - if (checkManifest) checkLoadManifest(dbSchema, eventsTable, empty) else LoaderAction.lift(false) + def getLoad[F[_]: Monad: Logging: JDBC](checkManifest: Boolean, dbSchema: String, copy: AtomicCopy, empty: Boolean): LoaderAction[F, Boolean] = { + def check(eventsTable: EventsTable): LoaderAction[F, Boolean] = + if (checkManifest) checkLoadManifest(dbSchema, eventsTable, empty) else LoaderAction.rightT(false) copy match { case StraightCopy(copyStatement) => for { - _ <- executeUpdate(copyStatement) + _ <- JDBC[F].executeUpdate(copyStatement) emptyLoad <- check(AtomicEvents(dbSchema)) } yield emptyLoad case TransitCopy(copyStatement) => val create = RedshiftLoadStatements.createTransitTable(dbSchema) val destroy = RedshiftLoadStatements.destroyTransitTable(dbSchema) for { - _ <- executeUpdate(create) + _ <- JDBC[F].executeUpdate(create) // TODO: Transit copy provides more reliable empty-check emptyLoad <- check(TransitTable(dbSchema)) - _ <- executeUpdate(copyStatement) - _ <- executeUpdate(destroy) + _ <- JDBC[F].executeUpdate(copyStatement) + _ <- JDBC[F].executeUpdate(destroy) } yield emptyLoad } } @@ -116,32 +111,32 @@ object RedshiftLoader { * Return action executing VACUUM statements if there's any vacuum statements, * or noop if no vacuum statements were generated */ - def analyze(statements: RedshiftLoadStatements): LoaderAction[Unit] = + def analyze[F[_]: Monad: Logging: JDBC](statements: RedshiftLoadStatements): LoaderAction[F, Unit] = statements.analyze match { case Some(analyze) => for { - _ <- executeTransaction(analyze) - _ <- LoaderA.print("ANALYZE transaction executed").liftA + _ <- JDBC[F].executeTransaction(analyze) + _ <- Logging[F].print("ANALYZE transaction executed").liftA } yield () - case None => LoaderA.print("ANALYZE transaction skipped").liftA + case None => Logging[F].print("ANALYZE transaction skipped").liftA } /** * Return action executing ANALYZE statements if there's any vacuum statements, * or noop if no vacuum statements were generated */ - def vacuum(statements: RedshiftLoadStatements): LoaderAction[Unit] = { + def vacuum[F[_]: Monad: Logging: JDBC](statements: RedshiftLoadStatements): LoaderAction[F, Unit] = { statements.vacuum match { case Some(vacuum) => val block = SqlString.unsafeCoerce("END") :: vacuum val actions = for { statement <- block } yield for { - _ <- LoaderA.print(statement).liftA - _ <- executeUpdate(statement) + _ <- Logging[F].print(statement).liftA + _ <- JDBC[F].executeUpdate(statement) } yield () actions.sequence.void - case None => LoaderA.print("VACUUM queries skipped").liftA + case None => Logging[F].print("VACUUM queries skipped").liftA } } diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/package.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/package.scala index 459ec68a6..ee66235aa 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/package.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/package.scala @@ -14,68 +14,44 @@ package com.snowplowanalytics.snowplow import cats._ import cats.data._ -import cats.free.Free import cats.implicits._ -import cats.effect.Clock - import rdbloader.discovery.DiscoveryFailure package object rdbloader { 
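A minimal usage sketch of the new `LoaderAction[F, A]` alias defined in this package object (the `Example` object and `countAndLog` method are hypothetical, not part of the codebase): capabilities are requested as type-class constraints and the `EitherT` short-circuits on the first `LoaderError`; the real interpreters run it in cats-effect `IO`.

    import cats.Monad
    import cats.implicits._

    import com.snowplowanalytics.snowplow.rdbloader._
    import com.snowplowanalytics.snowplow.rdbloader.dsl.{Logging, JDBC}
    import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString

    object Example {
      // Works for any F with Logging and JDBC capabilities (IO in production, State in tests)
      def countAndLog[F[_]: Monad: Logging: JDBC](statement: SqlString): LoaderAction[F, Long] =
        for {
          _     <- Logging[F].print(s"Executing $statement").liftA // F[Unit] lifted into LoaderAction
          count <- JDBC[F].executeUpdate(statement)                // already a LoaderAction[F, Long]
        } yield count
    }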
- // RDB Loader's algebra defines hierarchy with three types common for all modules - // * Action[A] - IO substitution, end-of-the-world type - // * LoaderAction[A] - Validated and short-circuiting version of Action, equal to exception - // * ActionE[A] - Non-short-circuiting version of LoaderAction for results that can be recovered - - /** - * Main RDB Loader type. Represents all IO happening - * during discovering, loading and monitoring. - * End of the world type, that must be unwrapped and executed - * using one of interpreters - */ - type Action[A] = Free[LoaderA, A] - - object Action { - def lift[A](value: A): Action[A] = - Free.pure[LoaderA, A](value) - } - - /** - * Loading effect, producing value of type `A` with possible `LoaderError` - * - * @tparam A value of computation - */ - type LoaderAction[A] = EitherT[Action, LoaderError, A] + /** Loading effect, producing value of type `A` with possible `LoaderError` */ + type LoaderAction[F[_], A] = EitherT[F, LoaderError, A] /** Lift value into */ object LoaderAction { - def unit: LoaderAction[Unit] = - EitherT.liftF(Free.pure(())) + def unit[F[_]: Applicative]: LoaderAction[F, Unit] = + EitherT.liftF(Applicative[F].unit) - def lift[A](value: A): LoaderAction[A] = - EitherT.liftF(Free.pure(value)) + def rightT[F[_]: Applicative, A](value: A): LoaderAction[F, A] = + EitherT.rightT[F, LoaderError](value) - def liftE[A](either: Either[LoaderError, A]): LoaderAction[A] = - EitherT(Free.pure(either)) + def liftE[F[_]: Applicative, A](either: Either[LoaderError, A]): LoaderAction[F, A] = + EitherT.fromEither[F](either) - def liftA[A](action: Action[A]): LoaderAction[A] = - EitherT(action.map(_.asRight[LoaderError])) + def liftF[F[_]: Applicative, A](action: F[A]): LoaderAction[F, A] = + EitherT.liftF[F, LoaderError, A](action) - def apply[A](actionE: ActionE[A]): LoaderAction[A] = - EitherT[Action, LoaderError, A](actionE) + def apply[F[_], A](actionE: ActionE[F, A]): LoaderAction[F, A] = + EitherT[F, LoaderError, A](actionE) } - implicit class ActionOps[A](a: Action[A]) { - def liftA: LoaderAction[A] = LoaderAction.liftA(a) + implicit class ActionOps[F[_], A](a: F[A]) { + def liftA(implicit F: Applicative[F]): LoaderAction[F, A] = + LoaderAction.liftF(a) } /** Non-short-circuiting version of `TargetLoading` */ - type ActionE[A] = Free[LoaderA, Either[LoaderError, A]] + type ActionE[F[_], A] = F[Either[LoaderError, A]] object ActionE { - def liftError(error: LoaderError): ActionE[Nothing] = - Free.pure(error.asLeft) + def liftError[F[_]: Applicative](error: LoaderError): ActionE[F, Nothing] = + Applicative[F].pure(error.asLeft) } /** @@ -96,20 +72,11 @@ package object rdbloader { J[_], A](f: F[G[H[A]]], ff: H[G[A]] => J[G[A]]): F[J[G[A]]] = f.map(x => ff(x.sequence)) - /** - * IO-free result validation - */ + /** IO-free result validation */ type DiscoveryStep[A] = Either[DiscoveryFailure, A] - /** Single discovery step */ - type DiscoveryAction[A] = Action[DiscoveryStep[A]] - - /** - * Composed functor of IO and discovery step - */ - private[rdbloader] val DiscoveryAction = - Functor[Action].compose[DiscoveryStep] + type DiscoveryAction[F[_], A] = F[DiscoveryStep[A]] implicit class AggregateErrors[A, B](eithers: List[Either[A, B]]) { def aggregatedErrors: ValidatedNel[A, List[B]] = diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/Common.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/Common.scala index 2980e0b47..71b64c005 100644 --- 
a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/Common.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/Common.scala @@ -10,21 +10,14 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader -package utils +package com.snowplowanalytics.snowplow.rdbloader.utils import cats.data._ -import cats.implicits._ import io.circe._ import com.snowplowanalytics.iglu.client.resolver.registries.Registry -// This project -import LoaderError._ -import discovery.DiscoveryFailure -import config.CliConfig - /** * Various common utility functions */ @@ -43,19 +36,6 @@ object Common { result.replace(secret, "x" * secret.length) } - /** - * Generate result for end-of-the-world log message using loading result - * - * @param result loading process state - * @return log entry, which can be interpreted accordingly - */ - def interpret(config: CliConfig, result: Either[LoaderError, Unit]): Log = { - result match { - case Right(_) => Log.LoadingSucceeded - case Left(error) => Log.LoadingFailed(error.show) - } - } - /** * Transforms CamelCase string into snake_case * Also replaces all hyphens with underscores diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/S3.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/S3.scala similarity index 98% rename from src/main/scala/com/snowplowanalytics/snowplow/rdbloader/S3.scala rename to src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/S3.scala index c72fa8711..29ffa29b8 100644 --- a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/S3.scala +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/S3.scala @@ -10,17 +10,15 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader - -import com.amazonaws.services.s3.model.S3ObjectSummary +package com.snowplowanalytics.snowplow.rdbloader.utils import cats.syntax.either._ - +import com.amazonaws.services.s3.model.S3ObjectSummary +import com.snowplowanalytics.snowplow.rdbloader.loaders +import io.circe.Decoder import shapeless.tag import shapeless.tag._ -import io.circe.Decoder - /** * Common types and functions for Snowplow S3 clients */ diff --git a/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/SSH.scala b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/SSH.scala new file mode 100644 index 000000000..6e2f0fc74 --- /dev/null +++ b/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/utils/SSH.scala @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2012-2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.rdbloader.utils + +import cats.Monad +import cats.effect.{Resource, Sync} +import cats.implicits._ + +import com.jcraft.jsch.{JSch, Session} + +import com.snowplowanalytics.snowplow.rdbloader.common.StorageTarget.TunnelConfig +import com.snowplowanalytics.snowplow.rdbloader.dsl.AWS + +object SSH { + + /** Actual SSH identity data. Both passphrase and key are optional */ + case class Identity(passphrase: Option[Array[Byte]], key: Option[Array[Byte]]) + + /** Open SSH tunnel, which will be guaranteed to be closed when application exits */ + def resource[F[_]: Sync: AWS](tunnelConfig: Option[TunnelConfig]): Resource[F, Unit] = + tunnelConfig match { + case Some(tunnel) => + Resource.make(getIdentity[F](tunnel).flatMap(i => createSession(tunnel, i)))(s => Sync[F].delay(s.disconnect())).void + case None => + Resource.pure[F, Unit](()) + } + + /** Convert pure tunnel configuration to configuration with actual key and passphrase */ + def getIdentity[F[_]: Monad: AWS](tunnelConfig: TunnelConfig): F[Identity] = + tunnelConfig + .bastion + .key + .map(_.ec2ParameterStore.parameterName).traverse(AWS[F].getEc2Property) + .map { key => Identity(tunnelConfig.bastion.passphrase.map(_.getBytes), key) } + + /** + * Create a SSH tunnel to bastion host and set port forwarding to target DB + * @param tunnelConfig SSH-tunnel configuration + * @return either nothing on success and error message on failure + */ + def createSession[F[_]: Sync](tunnelConfig: TunnelConfig, identity: Identity): F[Session] = + Sync[F].delay { + val jsch = new JSch() + jsch.addIdentity("rdb-loader-tunnel-key", identity.key.orNull, null, identity.passphrase.orNull) + val sshSession = jsch.getSession(tunnelConfig.bastion.user, tunnelConfig.bastion.host, tunnelConfig.bastion.port) + sshSession.setConfig("StrictHostKeyChecking", "no") + sshSession.connect() + val _ = sshSession.setPortForwardingL(tunnelConfig.localPort, tunnelConfig.destination.host, tunnelConfig.destination.port) + sshSession + } +} + diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/Interpreters.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/Interpreters.scala new file mode 100644 index 000000000..61b0775ae --- /dev/null +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/Interpreters.scala @@ -0,0 +1,39 @@ +package com.snowplowanalytics.snowplow.rdbloader + +import java.nio.file.Path + +import cats.effect.IO +import cats.syntax.all._ +import cats.effect.concurrent.Ref + +import com.snowplowanalytics.snowplow.rdbloader.db.Decoder +import com.snowplowanalytics.snowplow.rdbloader.dsl.JDBC +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString + +object Interpreters { + + + def ioTestJdbcInterpreter(queries: Ref[IO, List[String]]): JDBC[IO] = new JDBC[IO] { + def executeUpdate(sql: SqlString): LoaderAction[IO, Long] = { + val action = queries + .update(qs => sql.split(" ").headOption.toList.map(_.trim) ++ qs) + .as(1L.asRight[LoaderError]) + LoaderAction[IO, Long](action) + } + + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[IO, A] = { + val action = queries + .update(qs => query.split(" ").headOption.toList.map(_.trim) ++ qs) + .as("".asInstanceOf[A].asRight[LoaderError]) + LoaderAction[IO, A](action) + } + + def copyViaStdin(files: List[Path], query: SqlString): LoaderAction[IO, Long] = { + val action = queries + 
.update(qs => query.split(" ").headOption.toList.map(_.trim) ++ qs) + .as(1L.asRight[LoaderError]) + LoaderAction[IO, Long](action) + } + } +} + diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/S3Spec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/S3Spec.scala index 145997dd4..1ab50582b 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/S3Spec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/S3Spec.scala @@ -12,11 +12,10 @@ */ package com.snowplowanalytics.snowplow.rdbloader +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat - import org.scalacheck.Gen - import org.specs2.{ScalaCheck, Specification} diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/SpecHelpers.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/SpecHelpers.scala index 59bb0445d..15eb92cbe 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/SpecHelpers.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/SpecHelpers.scala @@ -13,23 +13,18 @@ package com.snowplowanalytics.snowplow.rdbloader import scala.io.Source.fromInputStream - import java.util.UUID import cats.Id import cats.implicits._ - import io.circe.jawn.parse - import com.snowplowanalytics.iglu.client.Resolver - import com.snowplowanalytics.iglu.core.SelfDescribingData import com.snowplowanalytics.iglu.core.circe.implicits._ - +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 import common.StorageTarget - -import S3.Folder.{coerce => s3} -import config.{ SnowplowConfig, Semver } +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.Folder.{coerce => s3} +import config.{Semver, SnowplowConfig} import config.Semver._ import config.SnowplowConfig._ import loaders.Common.SqlString diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/TestInterpreter.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/TestInterpreter.scala index a4bed8840..84ff21d6f 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/TestInterpreter.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/TestInterpreter.scala @@ -12,66 +12,180 @@ */ package com.snowplowanalytics.snowplow.rdbloader +import java.nio.file.Path + +import scala.concurrent.duration.{FiniteDuration, TimeUnit} import cats.data.{EitherT, State} +import cats.effect.{Clock, Timer} import cats.implicits._ - import io.circe.literal._ - import com.snowplowanalytics.iglu.core._ import com.snowplowanalytics.iglu.schemaddl.IgluSchema import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema import com.snowplowanalytics.iglu.schemaddl.jsonschema.circe.implicits._ -import com.snowplowanalytics.iglu.schemaddl.migrations.{ SchemaList => DSchemaList } - +import com.snowplowanalytics.iglu.schemaddl.migrations.{SchemaList => DSchemaList} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.{Folder, Key} import com.snowplowanalytics.snowplow.rdbloader.db.Decoder -import com.snowplowanalytics.snowplow.rdbloader.Security.Tunnel import com.snowplowanalytics.snowplow.rdbloader.db.Entities.{Columns, TableState} +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, Cache, Iglu, JDBC, Logging} +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 object TestInterpreter { - type Test[A] = State[List[String], A] + case class TestState(genericLog: List[String], cache: Map[String, 
Option[S3.Key]]) { - def fetch(key: SchemaKey): EitherT[Test, String, IgluSchema] = { - val state = State[List[String], IgluSchema] { log => - val result = Schema.parse(json"""{}""").getOrElse(throw new RuntimeException("Not a valid JSON schema")) - val schema = SelfDescribingSchema(SchemaMap(key), result) - (s"Fetch ${key.toSchemaUri}" :: log, schema) - } - EitherT.liftF(state) + def getLog: List[String] = genericLog.reverse.map(trim) + + def log(message: String): TestState = + TestState(message :: genericLog, cache) + + def time: Long = + (genericLog.length + cache.size).toLong + + def cachePut(key: String, value: Option[S3.Key]): TestState = + TestState(genericLog, cache ++ Map(key -> value)) } - def executeUpdate(query: String): Test[Either[LoaderError, Long]] = - State { log => (trim(query) :: log, 1L.asRight) } + object TestState { + val init: TestState = + TestState(List.empty[String], Map.empty[String, Option[S3.Key]]) + } - def print(message: String): Test[Unit] = - State { log => (message :: log, ()) } + type Test[A] = State[TestState, A] - def executeQuery[A](query: String, decoder: Decoder[A]): Test[Either[LoaderError, A]] = { - val result = decoder.name match { - case "TableState" => TableState(SchemaKey("com.acme", "some_context", "jsonschema", SchemaVer.Full(2,0,0))) - case "Boolean" => false - case "Columns" => Columns(List("some_column")) + object Test { + def apply[A](f: TestState => (TestState, A)): Test[A] = State(f) + def liftWith[I, A](f: I => A)(a: I): Test[A] = State { s: TestState => (s, f(a)) } + def pure[A](a: A): Test[A] = State.pure[TestState, A](a) + } + + implicit class StateOps[A](st: Test[Either[LoaderError, A]]) { + def toAction: LoaderAction[Test, A] = LoaderAction(st) + } + + def testClock: Clock[Test] = new Clock[Test] { + def realTime(unit: TimeUnit): Test[Long] = + State { log: TestState => (log.log("TICK REALTIME"), log.time) } + + def monotonic(unit: TimeUnit): Test[Long] = + State { log: TestState => (log.log("TICK MONOTONIC"), log.time) } + } + + case class JDBCResults(executeQuery: SqlString => Decoder[Any] => LoaderAction[Test, Any]) + + object JDBCResults { + val init: JDBCResults = { + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[Test, A] = { + val result = ev.name match { + case "TableState" => TableState(SchemaKey("com.acme", "some_context", "jsonschema", SchemaVer.Full(2,0,0))) + case "Boolean" => false + case "Columns" => Columns(List("some_column")) + } + val state = State { log: TestState => (log.log(query), result.asInstanceOf[A].asRight[LoaderError]) } + state.toAction + } + + JDBCResults(q => e => executeQuery(q)(e)) } - State { log => (trim(query) :: log, Right(result.asInstanceOf[A])) } } - def getEc2Property(name: String): Test[Either[LoaderError, String]] = - State { log => - val value = "EC2 PROPERTY " ++ name ++ " key" - (value :: log, Right(value)) + def stateJdbcInterpreter(results: JDBCResults): JDBC[Test] = new JDBC[Test] { + def executeUpdate(sql: SqlString): LoaderAction[Test, Long] = { + val action = State { s: TestState => (s.log(sql), 1L.asRight[LoaderError]) } + LoaderAction(action) } - def establishTunnel(tunnelConfig: Tunnel): Test[Either[LoaderError, Unit]] = - State { log => ("SSH TUNNEL ESTABLISH" :: log, Right(())) } + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[Test, A] = + results.executeQuery.asInstanceOf[SqlString => Decoder[A] => LoaderAction[Test, A]](query)(ev) + + def copyViaStdin(files: List[Path], query: SqlString): LoaderAction[Test, 
Long] = { + val action = State { log: TestState => (log.log(query), 1L.asRight[LoaderError]) } + LoaderAction(action) + } + } + + + case class ControlResults(print: String => Test[Unit]) + + object ControlResults { + def print(message: String): Test[Unit] = + State.modify[TestState](_.log(message)) - def getSchemas(vendor: String, name: String, model: Int): Test[Either[LoaderError, DSchemaList]] = - SchemaList - .parseStrings(List(s"iglu:$vendor/$name/jsonschema/$model-0-0")) - .map { x => DSchemaList.fromSchemaList(x, TestInterpreter.fetch).value } - .sequence[Test, Either[String, DSchemaList]] - .map { e => e.flatten.leftMap { x => LoaderError.LoaderLocalError(x)} } + def noop(message: String): Test[Unit] = + State.modify[TestState](identity).void + def init: ControlResults = ControlResults(print) + } + + def stateControlInterpreter(results: ControlResults): Logging[Test] = new Logging[Test] { + def getLastCopyStatements: Test[String] = + Test.pure("No COPY in the test") + def track(result: Either[LoaderError, Unit]): Test[Unit] = + Test.pure(()) + def dump(key: Key)(implicit S: AWS[Test]): Test[Either[String, Key]] = + Test.pure(key.asRight) + def print(message: String): Test[Unit] = + results.print(message) + } + + def stateIgluInterpreter: Iglu[Test] = new Iglu[Test] { + def getSchemas(vendor: String, name: String, model: Int): Test[Either[LoaderError, DSchemaList]] = + SchemaList + .parseStrings(List(s"iglu:$vendor/$name/jsonschema/$model-0-0")) + .map { x => DSchemaList.fromSchemaList(x, TestInterpreter.fetch).value } + .sequence[Test, Either[String, DSchemaList]] + .map { e => e.flatten.leftMap { x => LoaderError.LoaderLocalError(x)} } + } + + def stateTimerInterpreter: Timer[Test] = new Timer[Test] { + def clock: Clock[Test] = testClock + def sleep(duration: FiniteDuration): Test[Unit] = + State { log: TestState => (log.log(s"SLEEP $duration"), ()) } + } + + case class AWSResults(listS3: Folder => Test[Either[LoaderError, List[S3.BlobObject]]], keyExists: Key => Boolean) + + object AWSResults { + val init: AWSResults = AWSResults(_ => State.pure(List.empty[S3.BlobObject].asRight), _ => false) + } + + def stateAwsInterpreter(results: AWSResults): AWS[Test] = new AWS[Test] { + def listS3(bucket: Folder): Test[Either[LoaderError, List[S3.BlobObject]]] = + results.listS3(bucket).flatMap { list => + State.modify[TestState](s => s.log(s"LIST $bucket")).as(list) + } + + def keyExists(key: Key): Test[Boolean] = + State.pure(results.keyExists(key)) + + def downloadData(source: Folder, dest: Path): LoaderAction[Test, List[Path]] = + LoaderAction.liftF(Test.pure(List.empty[Path])) + + def putObject(key: Key, data: String): LoaderAction[Test, Unit] = + LoaderAction.liftF(Test.pure(())) + + def getEc2Property(name: String): Test[Array[Byte]] = + Test.pure(Array.empty[Byte]) + } + + def stateCacheInterpreter: Cache[Test] = new Cache[Test] { + def putCache(key: String, value: Option[S3.Key]): Test[Unit] = + State { log: TestState => (log.cachePut(key, value), ()) } + + def getCache(key: String): Test[Option[Option[S3.Key]]] = + State { log: TestState => (log.log(s"GET $key"), log.cache.get(key)) } + } + + private def fetch(key: SchemaKey): EitherT[Test, String, IgluSchema] = { + val state = State[TestState, IgluSchema] { log => + val result = Schema.parse(json"""{}""").getOrElse(throw new RuntimeException("Not a valid JSON schema")) + val schema = SelfDescribingSchema(SchemaMap(key), result) + (log.log(s"Fetch ${key.toSchemaUri}"), schema) + } + EitherT.liftF(state) + } private def trim(s: 
String): String = s.trim.replaceAll("\\s+", " ").replace("\n", " ") diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfigSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfigSpec.scala index 4d61a1df8..ab22570cd 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfigSpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/config/CliConfigSpec.scala @@ -18,8 +18,8 @@ import cats.data.{ Validated, NonEmptyList } // specs2 import org.specs2.Specification -import S3.Key.{coerce => s3} -import S3.Folder.{coerce => dir} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.Key.{coerce => s3} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.Folder.{coerce => dir} import LoaderError.ConfigError class CliConfigSpec extends Specification { def is = s2""" diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/db/MigrationSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/db/MigrationSpec.scala index e9fd0d462..2f3b97245 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/db/MigrationSpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/db/MigrationSpec.scala @@ -12,16 +12,14 @@ */ package com.snowplowanalytics.snowplow.rdbloader.db -import cats.~> - -import com.snowplowanalytics.snowplow.rdbloader.{LoaderA, S3} +import com.snowplowanalytics.snowplow.rdbloader.TestInterpreter +import com.snowplowanalytics.snowplow.rdbloader.TestInterpreter.{ControlResults, JDBCResults, Test, TestState} import com.snowplowanalytics.snowplow.rdbloader.config.Semver import com.snowplowanalytics.snowplow.rdbloader.discovery.{DataDiscovery, ShreddedType} - +import com.snowplowanalytics.snowplow.rdbloader.dsl.{Iglu, JDBC, Logging} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 import org.specs2.Specification -import com.snowplowanalytics.snowplow.rdbloader.TestInterpreter - class MigrationSpec extends Specification { def is = s2""" Perform migration only for ShreddedType.Tabular $e1 """ @@ -55,35 +53,11 @@ class MigrationSpec extends Specification { def is = s2""" "Table created" ) - val action = Migration.perform("public")(input) - val (state, result) = action.value.foldMap(MigrationSpec.interpreter).run(Nil).value - (state.reverse must beEqualTo(expected)).and(result must beRight) - } -} - -object MigrationSpec { + implicit val jdbc: JDBC[Test] = TestInterpreter.stateJdbcInterpreter(JDBCResults.init) + implicit val iglu: Iglu[Test] = TestInterpreter.stateIgluInterpreter + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) - import TestInterpreter.Test - - def interpreter: LoaderA ~> Test = new (LoaderA ~> Test) { - def apply[A](effect: LoaderA[A]): Test[A] = { - effect match { - case LoaderA.Print(message) => - TestInterpreter.print(message) - - case LoaderA.ExecuteUpdate(query) => - TestInterpreter.executeUpdate(query) - - case LoaderA.ExecuteQuery(query, decoder) => - TestInterpreter.executeQuery(query, decoder) - - case LoaderA.GetSchemas(vendor, name, model) => - TestInterpreter.getSchemas(vendor, name, model) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } + val (state, result) = Migration.perform[Test]("public")(input).value.run(TestState.init).value + (state.getLog must beEqualTo(expected)).and(result must beRight) } - -} \ No newline at end of file +} diff --git 
a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscoverySpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscoverySpec.scala index badc5bbd0..ca0c70a84 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscoverySpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/DataDiscoverySpec.scala @@ -10,27 +10,24 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader -package discovery +package com.snowplowanalytics.snowplow.rdbloader.discovery import java.util.UUID -import cats.data.State -import cats.{Id, ~>} import cats.syntax.either._ - import org.specs2.Specification - -import LoaderError.DiscoveryError -import config.Semver -import discovery.ShreddedType._ -import S3.Folder.{coerce => dir} -import S3.Key.{coerce => s3key} +import com.snowplowanalytics.snowplow.rdbloader.{LoaderError, TestInterpreter} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.Folder.{coerce => dir} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.Key.{coerce => s3key} +import com.snowplowanalytics.snowplow.rdbloader.TestInterpreter.{AWSResults, ControlResults, Test, TestState} +import com.snowplowanalytics.snowplow.rdbloader.config.Semver +import com.snowplowanalytics.snowplow.rdbloader.discovery.ShreddedType._ +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, Cache, Logging} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 class DataDiscoverySpec extends Specification { def is = s2""" Successfully discover two run folders at once $e1 - Successfully do eventual consistency check $e2 Fail to proceed with empty target folder $e3 Do not fail to proceed with empty shredded good folder $e4 Successfully discover data in run folder $e5 @@ -43,157 +40,36 @@ class DataDiscoverySpec extends Specification { def is = s2""" val id = UUID.fromString("8ad6fc06-ae5c-4dfc-a14d-f2ae86755179") def e1 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0000"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001"), - - S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/atomic-events/part-0000"), - S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/atomic-events/part-0001"), - S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00000"), - S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00001") - ).map(k => S3.BlobObject(k, 1L))) - - case LoaderA.Get(key: String) => - cache.get(key) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case LoaderA.KeyExists(key) => - if (key == 
"s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json") - true - else - false - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } + def listS3(bucket: S3.Folder) = + List( + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0000"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001"), + + S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/atomic-events/part-0000"), + S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/atomic-events/part-0001"), + S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00000"), + S3.Key.coerce(bucket + "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00001") + ).map(k => S3.BlobObject(k, 1L)).asRight[LoaderError] + + def keyExists(k: S3.Key): Boolean = { + k.toString match { + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" => true + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" => true + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json" => true + case _ => false } } - val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good/") - - val expected = List( - DataDiscovery( - dir("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/"), - Some(2L), - Some(2L), - List( - ShreddedType.Json( - Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/"),"com.mailchimp","email_address_change",2,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json")), - ShreddedType.Json( - Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/"),"com.mailchimp","email_address_change",1,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json")) - ), - specificFolder = false - ), - - DataDiscovery( - dir("s3://runfolder-test/shredded/good/run=2017-05-22-16-00-57/"), - Some(2L), - Some(2L), - List( - ShreddedType.Json( - Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-16-00-57/"), "com.snowplowanalytics.snowplow","add_to_cart",1,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json")) - ), - specificFolder = false - ) - ) - - val discoveryTarget = DataDiscovery.Global(shreddedGood) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None) - val endResult = result.value.foldMap(interpreter) - - endResult must beRight(expected) - } - - def e2 = { - - 
case class RealWorld(requests: Int, waited: List[Long]) { - def increment: RealWorld = this.copy(requests + 1) - } - - type TestState[A] = State[RealWorld, A] - - def interpreter: LoaderA ~> TestState = new (LoaderA ~> TestState) { - // S3 keys to return - val initial = List( - "run=2017-05-22-12-20-57/atomic-events/part-0000", - "run=2017-05-22-12-20-57/atomic-events/part-0001", - "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001", - "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002", - "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001", - - "run=2017-05-22-16-00-57/atomic-events/part-0000", - "run=2017-05-22-16-00-57/atomic-events/part-0001", - "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00000", - "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/add_to_cart/jsonschema/1-0-0/part-00001" - ) - val second = "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/geolocation/jsonschema/1-0-0/part-00001" :: initial - val end = "run=2017-05-22-16-00-57/com.snowplowanalytics.snowplow/custom_context/jsonschema/1-0-0/part-00000" :: second - - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - - def blob(f: S3.Folder, k: String): S3.BlobObject = S3.BlobObject(S3.Key.coerce(f + k), 1L) - - def apply[A](effect: LoaderA[A]): TestState[A] = { - effect match { - case LoaderA.ListS3(bucket) => - State { (realWorld: RealWorld) => - if (realWorld.requests == 0) { - (realWorld.increment, Right(initial.map(blob(bucket, _)))) - } else if (realWorld.requests == 1) { - (realWorld.increment, Right(second.map(blob(bucket, _)))) - } else if (realWorld.requests == 2 || realWorld.requests == 3) { - (realWorld.increment, Right(end.map(blob(bucket, _)))) - } else { - throw new RuntimeException("Invalid test state " + realWorld.toString) - } - } - - case LoaderA.Get(key: String) => - State.pure(cache.get(key)) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - State.pure(()) - - case LoaderA.KeyExists(key) => - if (key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/custom_context_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/geolocation_1.json") - State.pure(true) - else - State.pure(false) - - case LoaderA.Sleep(timeout) => - State.modify((realWorld: RealWorld) => realWorld.copy(waited = timeout :: realWorld.waited)) - - case LoaderA.Print(_) => - State.pure(()) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listS3), keyExists = keyExists)) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good/") + val 
discoveryTarget = DataDiscovery.Global(shreddedGood) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value val expected = List( DataDiscovery( @@ -218,62 +94,35 @@ class DataDiscoverySpec extends Specification { def is = s2""" List( ShreddedType.Json( Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-16-00-57/"), "com.snowplowanalytics.snowplow","add_to_cart",1,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json")), - ShreddedType.Json( - Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-16-00-57/"), "com.snowplowanalytics.snowplow","geolocation",1,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/geolocation_1.json")), - ShreddedType.Json( - Info(dir("s3://runfolder-test/shredded/good/run=2017-05-22-16-00-57/"), "com.snowplowanalytics.snowplow","custom_context",1,Semver(0,11,0,None)), - s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/custom_context_1.json")) + s3key("s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/add_to_cart_1.json")) ), specificFolder = false ) ) - val discoveryTarget = DataDiscovery.Global(shreddedGood) - val request = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None) - val result = DataDiscovery.checkConsistency(request) - val (endState, endResult) = result.value.foldMap(interpreter).run(RealWorld(0, Nil)).value - - val state = endState must beEqualTo(RealWorld(4, List(20000L, 20000L, 20000L))) - val response = endResult must beRight(expected) - state.and(response) + result must beRight(expected) } def e3 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(_) => Right(Nil) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init) val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good/run=2017-08-21-19-18-20") - val expected = DiscoveryError(List(DiscoveryFailure.NoDataFailure(shreddedGood))) + val expected = LoaderError.DiscoveryError(List(DiscoveryFailure.NoDataFailure(shreddedGood))) val discoveryTarget = DataDiscovery.InSpecificFolder(shreddedGood) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None) - val endResult = result.value.foldMap(interpreter) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value - endResult must beLeft(expected) + result must beLeft(expected) } def e4 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(_) => Right(Nil) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + 
implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init) val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good") @@ -281,46 +130,31 @@ class DataDiscoverySpec extends Specification { def is = s2""" // The only difference with e3 val discoveryTarget = DataDiscovery.Global(shreddedGood) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None) - val endResult = result.value.foldMap(interpreter) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value - endResult must beRight(expected) + result must beRight(expected) } def e5 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - S3.Key.coerce(bucket + "atomic-events/part-0000"), - S3.Key.coerce(bucket + "atomic-events/part-0001"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001") - ).map(k => S3.BlobObject(k, 1L))) - - case LoaderA.Get(key: String) => - cache.get(key) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case LoaderA.KeyExists(key) => - if (key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json") - true - else - false - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } + def listS3(bucket: S3.Folder) = + List( + S3.Key.coerce(bucket + "atomic-events/part-0000"), + S3.Key.coerce(bucket + "atomic-events/part-0001"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001") + ).map(k => S3.BlobObject(k, 1L)).asRight[LoaderError] + + def keyExists(k: S3.Key): Boolean = + k.toString match { + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" => true + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" => true + case _ => false } - } - val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/") + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listS3), keyExists = keyExists)) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter val expected = List( DataDiscovery( @@ -339,45 +173,33 @@ class DataDiscoverySpec extends Specification { def is = s2""" ) ) + val shreddedGood = S3.Folder.coerce("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/") val discoveryTarget = DataDiscovery.Global(shreddedGood) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None).value - 
val endResult = result.foldMap(interpreter) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value - endResult must beRight(expected) + result must beRight(expected) } def e6 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - S3.Key.coerce(bucket + "atomic-events/part-0000"), - S3.Key.coerce(bucket + "atomic-events/part-0001"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), - S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001") - ).map(k => S3.BlobObject(k, 1L))) - - case LoaderA.Get(key: String) => - cache.get(key) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case LoaderA.KeyExists(key) => - if (key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json") - true - else - false - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } + def listS3(bucket: S3.Folder) = + List( + S3.Key.coerce(bucket + "atomic-events/part-0000"), + S3.Key.coerce(bucket + "atomic-events/part-0001"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), + S3.Key.coerce(bucket + "com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001") + ).map(k => S3.BlobObject(k, 1L)).asRight[LoaderError] + + def keyExists(k: S3.Key): Boolean = + k.toString match { + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" => true + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" => true + case _ => false } - } + + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listS3), keyExists = keyExists)) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter val targetFolder = S3.Folder.coerce("s3://runfolder-test/shredded/good/run=2017-05-22-12-20-57/") @@ -399,51 +221,35 @@ class DataDiscoverySpec extends Specification { def is = s2""" ) val discoveryTarget = DataDiscovery.InSpecificFolder(targetFolder) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None).value - val endResult = result.foldMap(interpreter) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value - endResult must beRight(expected) + result must beRight(expected) } def e7 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - // One folder - S3.Key.coerce(bucket + 
"run=2017-05-22-12-20-57/atomic-events/part-0000"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001"), - // Another folder - S3.Key.coerce(bucket + "run=2018-10-12-10-20-00/atomic-events/part-0000"), - S3.Key.coerce(bucket + "run=2018-10-12-10-20-00/atomic-events/part-0001") - ).map(k => S3.BlobObject(k, 1L))) - - case LoaderA.Get(key: String) => - cache.get(key) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case LoaderA.KeyExists(key) => - if (key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" || - key == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json") - true - else - false - - case LoaderA.Print("More than one folder discovered with `--folder` option") => - () - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } + def listS3(bucket: S3.Folder) = + List( + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0000"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-0001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/1-0-0/part-00002"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/com.mailchimp/email_address_change/jsonschema/2-0-0/part-00001"), + // Another folder + S3.Key.coerce(bucket + "run=2018-10-12-10-20-00/atomic-events/part-0000"), + S3.Key.coerce(bucket + "run=2018-10-12-10-20-00/atomic-events/part-0001") + ).map(k => S3.BlobObject(k, 1L)).asRight[LoaderError] + + def keyExists(k: S3.Key): Boolean = + k.toString match { + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_1.json" => true + case "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.mailchimp/email_address_change_2.json" => true + case _ => false } - } + + + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listS3), keyExists = keyExists)) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter val targetFolder = S3.Folder.coerce("s3://runfolder-test/shredded/good/") @@ -472,34 +278,27 @@ class DataDiscoverySpec extends Specification { def is = s2""" ) val discoveryTarget = DataDiscovery.InSpecificFolder(targetFolder) - val result = DataDiscovery.discoverFull(discoveryTarget, id, Semver(0,11,0), "us-east-1", None).value - val endResult = result.foldMap(interpreter) + val (state, result) = DataDiscovery.discover[Test](discoveryTarget, Semver(0,11,0), "us-east-1", None).value.run(TestState.init).value - endResult must beRight(expected) + result must beRight(expected) } def e8 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - 
S3.BlobObject(S3.Key.join(bucket, "_SUCCESS"), 0L), - S3.BlobObject(S3.Key.join(bucket, "part-00000-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L), - S3.BlobObject(S3.Key.join(bucket, "part-00001-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L), - S3.BlobObject(S3.Key.join(bucket, "part-00002-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L))) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + def listS3(bucket: S3.Folder) = + List( + S3.BlobObject(S3.Key.join(bucket, "_SUCCESS"), 0L), + S3.BlobObject(S3.Key.join(bucket, "part-00000-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L), + S3.BlobObject(S3.Key.join(bucket, "part-00001-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L), + S3.BlobObject(S3.Key.join(bucket, "part-00002-8e95d7a6-4c5f-4dd3-ab78-6ca8b8cef5d4-c000.txt.gz"), 20L) + ).asRight[LoaderError] + + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listS3))) val prefix = S3.Folder.coerce("s3://sp-com-acme-123987939231-10-batch-archive/main/shredded/good/run=2018-07-05-00-55-16/atomic-events/") - val result = DataDiscovery.listGoodBucket(prefix).value + val (state, result) = DataDiscovery.listGoodBucket[Test](prefix).value.run(TestState.init).value - result.foldMap(interpreter).map(_.length) must beRight(3) + result.map(_.length) must beRight(3) } def e9 = { diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedTypeSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedTypeSpec.scala index 99b8a3245..92dc0ed58 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedTypeSpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedTypeSpec.scala @@ -14,7 +14,7 @@ package com.snowplowanalytics.snowplow.rdbloader package discovery import cats.implicits._ - +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 import org.scalacheck.Gen import org.specs2.{ScalaCheck, Specification} diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/CommonSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/CommonSpec.scala deleted file mode 100644 index b070b9059..000000000 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/CommonSpec.scala +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2012-2019 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
- */ -package com.snowplowanalytics.snowplow.rdbloader -package loaders - -import java.util.UUID - -import cats._ -import org.specs2.Specification - -// This project -import common.StorageTarget -import config.Step -import discovery.DataDiscovery -import S3.Folder - -class CommonSpec extends Specification { def is = s2""" - Check that SSH tunnel gets open and closed if necessary $e1 - """ - - val targetId = UUID.fromString("8ad6fc06-ae5c-4dfc-a14d-f2ae86755179") - - def e1 = { - val expected = List( - "EC2 PROPERTY snowplow.redshift.key key", // Retrieve key - "SSH TUNNEL ESTABLISH", // Open - "BEGIN", "COPY", "SELECT", "INSERT", "COMMIT", "BEGIN", "ANALYZE", "COMMIT", - "SSH TUNNEL CLOSE") // Close - - val actions = collection.mutable.ListBuffer.empty[String] - - // Inputs - val key = StorageTarget.EncryptedConfig(StorageTarget.ParameterStoreConfig("snowplow.redshift.key")) - val TunnelInput = StorageTarget.TunnelConfig( - StorageTarget.BastionConfig("bastion.acme.com", 23, "bastion-user", None, Some(key)), - 15151, - StorageTarget.DestinationConfig("10.0.0.17", 5433)) - val target = StorageTarget.RedshiftConfig( - targetId, - "test-redsfhit-target", - "localhost", - "snowplowdb", - 15151, - SpecHelpers.disableSsl, - "arn:aws:iam::719197435995:role/RedshiftLoadRole", - "update", - "snowplow-loader", - StorageTarget.PlainText("Supersecret1"), - 100, - 1000L, - Some(TunnelInput), - None, - None) - - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ExecuteUpdate(query) => - actions.append(query.split(" ").head.trim) - Right(1L) - - case LoaderA.ExecuteQuery(query, _) => - actions.append(query.split(" ").head.trim) - Right(None) - - case LoaderA.GetEc2Property(name) => - val value = "EC2 PROPERTY " ++ name ++ " key" - actions.append(value) - Right(value) - - case LoaderA.EstablishTunnel(Security.Tunnel(TunnelInput, Security.Identity(None, Some(_)))) => - actions.append("SSH TUNNEL ESTABLISH") - Right(()) - - case LoaderA.CloseTunnel() => - actions.append(s"SSH TUNNEL CLOSE") - Right(()) - - case LoaderA.Print(_) => () - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } - - val cliConfig = config.CliConfig(SpecHelpers.validConfig, target, Step.defaultSteps, None, None, false, SpecHelpers.resolverJson) - val discovery = DataDiscovery( - Folder.coerce(cliConfig.configYaml.aws.s3.buckets.shredded.good ++ "run=2017-10-10-10-30-30/"), - Some(1L), Some(1L), Nil, specificFolder = false) - val state = Common.load(cliConfig, List(discovery)) - val action = state.value - val result = action.foldMap(interpreter) - - val transactionsExpectation = actions.toList must beEqualTo(expected) - val resultExpectation = result must beRight - transactionsExpectation.and(resultExpectation) - } - -} diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoaderSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoaderSpec.scala index 59918a593..7a6e5a334 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoaderSpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/loaders/RedshiftLoaderSpec.scala @@ -10,22 +10,29 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
*/ -package com.snowplowanalytics.snowplow.rdbloader -package loaders +package com.snowplowanalytics.snowplow.rdbloader.loaders -import java.util.UUID -import java.time.Instant +import scala.concurrent.duration.FiniteDuration import java.sql.Timestamp -import cats.{Id, ~>} - +import cats.data.State +import cats.effect.{Clock, Timer} +import cats.syntax.either._ import org.specs2.Specification // This project -import Common.SqlString.{unsafeCoerce => sql} -import S3.{ Folder, Key } -import config.{ CliConfig, Step, Semver } -import discovery.{ DataDiscovery, ShreddedType } +import com.snowplowanalytics.snowplow.rdbloader.{LoaderError, TestInterpreter, SpecHelpers, LoaderAction} +import com.snowplowanalytics.snowplow.rdbloader.dsl.{AWS, Cache, Logging, JDBC} +import com.snowplowanalytics.snowplow.rdbloader.utils.S3 +import com.snowplowanalytics.snowplow.rdbloader.utils.S3.{ Folder, Key } +import com.snowplowanalytics.snowplow.rdbloader.config.{ CliConfig, Step, Semver } +import com.snowplowanalytics.snowplow.rdbloader.db.{ Decoder, Entities } +import com.snowplowanalytics.snowplow.rdbloader.discovery.{ DataDiscovery, ShreddedType } +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString +import com.snowplowanalytics.snowplow.rdbloader.loaders.Common.SqlString.{unsafeCoerce => sql} + +import com.snowplowanalytics.snowplow.rdbloader.SpecHelpers._ +import com.snowplowanalytics.snowplow.rdbloader.TestInterpreter.{AWSResults, JDBCResults, ControlResults, TestState, Test} class RedshiftLoaderSpec extends Specification { def is = s2""" @@ -38,39 +45,29 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" Transit copy creates and deletes a temporary table $e7 """ - import SpecHelpers._ - val noDiscovery = DataDiscovery(Folder.coerce("s3://noop"), None, None, Nil, false) - def newId = UUID.randomUUID() - val time = Instant.now() + def e1 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - // This should succeed for "atomicDiscovery" - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57_$folder$"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/_SUCCESS"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/$folder$"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-02") - ).map(k => S3.BlobObject(k, 1L))) - - case LoaderA.Sleep(_) => () - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + def listBucket(bucket: Folder): Either[LoaderError, List[S3.BlobObject]] = + List( + // This should succeed for "atomicDiscovery" + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57_$folder$"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/_SUCCESS"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/$folder$"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-02") + ).map(k => S3.BlobObject(k, 1L)).asRight + + implicit val timer: Timer[Test] = TestInterpreter.stateTimerInterpreter + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listBucket))) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter + + val (_, result) = Common.discover[Test](CliConfig(validConfig, validTarget, Set.empty, None, None, false, 
SpecHelpers.resolverJson)).value.run(TestState.init).value val expected = List(DataDiscovery(S3.Folder.coerce("s3://snowplow-acme-storage/shredded/good/run=2017-05-22-12-20-57/"), Some(1), Some(1L), Nil, specificFolder = false)) - val action = Common.discover(CliConfig(validConfig, validTarget, Set.empty, None, None, false, SpecHelpers.resolverJson)) - val result = action.value.foldMap(interpreter) - result must beRight(expected) } @@ -135,70 +132,41 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" } def e3 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(_) => Right(Nil) - - case LoaderA.KeyExists(_) => false - - case LoaderA.Sleep(_) => () - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val jdbc: JDBC[Test] = TestInterpreter.stateJdbcInterpreter(JDBCResults.init) val steps: Set[Step] = Step.defaultSteps ++ Set(Step.Vacuum) - val action = RedshiftLoader.run(validConfig, validTarget, steps, Nil) - val result = action.value.foldMap(interpreter) + val (state, result) = RedshiftLoader.run[Test](validConfig, validTarget, steps, Nil).value.run(TestState.init).value result must beRight } def e4 = { - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - - private val cache = collection.mutable.HashMap.empty[String, Option[S3.Key]] - - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ListS3(bucket) => - Right(List( - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00001-dbb35260-7b12-494b-be87-e7a4b1f59906.txt"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00002-cba3a610-0b90-494b-be87-e7a4b1f59906.txt"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00003-fba35670-9b83-494b-be87-e7a4b1f59906.txt"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00004-fba3866a-8b90-494b-be87-e7a4b1fa9906.txt"), - S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00005-aba3568f-7b96-494b-be87-e7a4b1fa9906.txt") - ).map(k => S3.BlobObject(k, 2L))) - - case LoaderA.Get(key: String) => - cache.get(key) - case LoaderA.Put(key: String, value: Option[S3.Key]) => - val _ = cache.put(key, value) - () - - case LoaderA.KeyExists(k) => - if (k == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/submit_form_1.json") { - true - } else false - - case LoaderA.Sleep(time) => - throw new RuntimeException(s"Data-discovery should not sleep with skipped consistency check. 
Sleep called for [$time]") - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } + def keyExists(k: Key): Boolean = + k == "s3://snowplow-hosted-assets-us-east-1/4-storage/redshift-storage/jsonpaths/com.snowplowanalytics.snowplow/submit_form_1.json" + + def listBucket(bucket: Folder): Either[LoaderError, List[S3.BlobObject]] = + List( + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/atomic-events/part-00001"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00001-dbb35260-7b12-494b-be87-e7a4b1f59906.txt"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00002-cba3a610-0b90-494b-be87-e7a4b1f59906.txt"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00003-fba35670-9b83-494b-be87-e7a4b1f59906.txt"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00004-fba3866a-8b90-494b-be87-e7a4b1fa9906.txt"), + S3.Key.coerce(bucket + "run=2017-05-22-12-20-57/shredded-types/vendor=com.snowplowanalytics.snowplow/name=submit_form/format=jsonschema/version=1-0-0/part-00005-aba3568f-7b96-494b-be87-e7a4b1fa9906.txt") + ).map(k => S3.BlobObject(k, 2L)).asRight + + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init) + implicit val aws: AWS[Test] = TestInterpreter.stateAwsInterpreter(AWSResults.init.copy(listS3 = Test.liftWith(listBucket), keyExists = keyExists)) + implicit val cache: Cache[Test] = TestInterpreter.stateCacheInterpreter + implicit val timer: Timer[Test] = new Timer[Test] { + def clock: Clock[Test] = TestInterpreter.testClock + def sleep(duration: FiniteDuration): Test[Unit] = throw new RuntimeException("TODO") } val steps: Set[Step] = (Step.defaultSteps - Step.ConsistencyCheck) ++ Set(Step.Vacuum) - val action = Common.discover(CliConfig(validConfig, validTarget, steps, None, None, false, SpecHelpers.resolverJson)).value - val result: Either[LoaderError, List[DataDiscovery]] = action.foldMap(interpreter) + val (state, result) = Common.discover[Test](CliConfig(validConfig, validTarget, steps, None, None, false, SpecHelpers.resolverJson)).value.run(TestState.init).value val expected = List(DataDiscovery( S3.Folder.coerce("s3://snowplow-acme-storage/shredded/good/run=2017-05-22-12-20-57/"), @@ -216,6 +184,21 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" } def e5 = { + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init.copy(print = ControlResults.noop)) + implicit val jdbc: JDBC[Test] = TestInterpreter.stateJdbcInterpreter(JDBCResults.init) + + val input = RedshiftLoadStatements( + "atomic", + RedshiftLoadStatements.StraightCopy(sql("LOAD INTO atomic MOCK")), + List(sql("LOAD INTO SHRED 1 MOCK"), sql("LOAD INTO SHRED 2 MOCK"), sql("LOAD INTO SHRED 3 MOCK")), + Some(List(sql("VACUUM MOCK"))), // Must be shred cardinality + 1 + Some(List(sql("ANALYZE MOCK"))), + sql("MANIFEST INSERT MOCK"), + noDiscovery + ) + + val (state, result) = 
RedshiftLoader.loadFolder[Test](Step.defaultSteps - Step.LoadManifestCheck)(input).value.run(TestState.init).value + val expected = List( "BEGIN", "LOAD INTO atomic MOCK", @@ -231,83 +214,28 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" "COMMIT" ) - val queries = collection.mutable.ListBuffer.empty[String] - - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ExecuteUpdate(query) => - queries.append(query) - Right(1L) - - case LoaderA.Print(_) => - () - - case LoaderA.ExecuteQuery(_, _) => - Right(None) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } - - val input = RedshiftLoadStatements( - "atomic", - RedshiftLoadStatements.StraightCopy(sql("LOAD INTO atomic MOCK")), - List(sql("LOAD INTO SHRED 1 MOCK"), sql("LOAD INTO SHRED 2 MOCK"), sql("LOAD INTO SHRED 3 MOCK")), - Some(List(sql("VACUUM MOCK"))), // Must be shred cardinality + 1 - Some(List(sql("ANALYZE MOCK"))), - sql("MANIFEST INSERT MOCK"), - noDiscovery - ) - - val state = RedshiftLoader.loadFolder(Step.defaultSteps - Step.LoadManifestCheck)(input) - val action = state.value - val result = action.foldMap(interpreter) - - val transactionsExpectation = queries.toList must beEqualTo(expected) + val transactionsExpectation = state.getLog must beEqualTo(expected) val resultExpectation = result must beRight transactionsExpectation.and(resultExpectation) } def e6 = { - val expected = List( - "BEGIN", - "LOAD INTO atomic.events MOCK", - "SELECT events", - "SELECT manifest" - ) - - val queries = collection.mutable.ListBuffer.empty[String] - - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ExecuteUpdate(query) => - queries.append(query) - Right(1L) - - case LoaderA.Print(_) => - () - - case LoaderA.ExecuteQuery(query, _) if query.contains("FROM atomic.events") => - queries.append("SELECT events") - val time = Timestamp.from(Instant.ofEpochMilli(1519757441133L)) - Right(Some(db.Entities.Timestamp(time))) - - case LoaderA.ExecuteQuery(query, _) if query.contains("FROM atomic.manifest") => - queries.append("SELECT manifest") - val etlTime = Timestamp.from(Instant.ofEpochMilli(1519757441133L)) - val commitTime = Timestamp.from(Instant.ofEpochMilli(1519777441133L)) - Right(Some(db.Entities.LoadManifestItem(etlTime, commitTime, 1000, 5))) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } + val latestTimestamp = Timestamp.valueOf("2018-02-27 00:00:00.01") + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[Test, A] = { + val result: Option[Any] = if (query.contains("FROM atomic.events")) { + Some(Entities.Timestamp(latestTimestamp)) + } else if (query.contains("FROM atomic.manifest")) { + val commitTime = Timestamp.valueOf("2018-02-28 00:00:00.01") + Some(Entities.LoadManifestItem(latestTimestamp, commitTime, 1000, 5)) + } else throw new RuntimeException("TODO") + + val state = State { log: TestState => (log.log(query), result.asInstanceOf[A].asRight[LoaderError]) } + state.toAction } + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init.copy(print = ControlResults.noop)) + implicit val jdbc: JDBC[Test] = TestInterpreter.stateJdbcInterpreter(JDBCResults.init.copy(executeQuery = q => e => executeQuery(q)(e))) + val shouldNot = "SHOULD NOT BE EXECUTED" val input = RedshiftLoadStatements( @@ -320,20 +248,43 @@ 
class RedshiftLoaderSpec extends Specification { def is = s2""" noDiscovery ) - val state = RedshiftLoader.loadFolder(Step.defaultSteps)(input) - val action = state.value - val result = action.foldMap(interpreter) + val (state, result) = RedshiftLoader.loadFolder[Test](Step.defaultSteps)(input).value.run(TestState.init).value - val transactionsExpectation = queries.toList must beEqualTo(expected) + val expected = List( + "BEGIN", + "LOAD INTO atomic.events MOCK", + "SELECT etl_tstamp FROM atomic.events WHERE etl_tstamp IS NOT null ORDER BY etl_tstamp DESC LIMIT 1", + s"SELECT * FROM atomic.manifest WHERE etl_tstamp = '${latestTimestamp.toString}' ORDER BY etl_tstamp DESC LIMIT 1" + ) + + val transactionsExpectation = state.getLog must beEqualTo(expected) val resultExpectation = result must beLeft transactionsExpectation and resultExpectation } def e7 = { + val latestTimestamp = Timestamp.valueOf("2018-02-26 00:00:01.000") + def executeQuery[A](query: SqlString)(implicit ev: Decoder[A]): LoaderAction[Test, A] = { + val result = if (query.contains("SELECT etl_tstamp")) { + Some(Entities.Timestamp(latestTimestamp)) + } else if (query.contains("FROM atomic.manifest")) { + val manifestEtlTime = Timestamp.valueOf("2018-02-27 00:00:01.00") + val commitTime = Timestamp.valueOf("2018-02-28 00:00:01.000") + Some(Entities.LoadManifestItem(manifestEtlTime, commitTime, 1000, 5)) + } else throw new RuntimeException("TODO") + + val state = State { log: TestState => (log.log(query), result.asInstanceOf[A].asRight[LoaderError]) } + state.toAction + } + + implicit val control: Logging[Test] = TestInterpreter.stateControlInterpreter(ControlResults.init.copy(print = ControlResults.noop)) + implicit val jdbc: JDBC[Test] = TestInterpreter.stateJdbcInterpreter(JDBCResults.init.copy(executeQuery = q => e => executeQuery(q)(e))) + val expected = List( "BEGIN", "CREATE TABLE atomic.temp_transit_events ( LIKE atomic.events )", - "SELECT etl_tstamp", "SELECT manifest", + "SELECT etl_tstamp FROM atomic.temp_transit_events WHERE etl_tstamp IS NOT null ORDER BY etl_tstamp DESC LIMIT 1", + s"SELECT * FROM atomic.manifest WHERE etl_tstamp = '${latestTimestamp.toString}' ORDER BY etl_tstamp DESC LIMIT 1", "COPY", "DROP TABLE atomic.temp_transit_events", @@ -341,35 +292,6 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" "VACUUM MOCK", "BEGIN", "ANALYZE MOCK", "COMMIT" ) - val queries = collection.mutable.ListBuffer.empty[String] - - def interpreter: LoaderA ~> Id = new (LoaderA ~> Id) { - def apply[A](effect: LoaderA[A]): Id[A] = { - effect match { - case LoaderA.ExecuteUpdate(query) => - queries.append(query) - Right(1L) - - case LoaderA.Print(_) => - () - - case LoaderA.ExecuteQuery(query, _) if query.contains("FROM atomic.manifest") => - queries.append("SELECT manifest") - val etlTime = Timestamp.from(Instant.ofEpochMilli(1519757441133L)) - val commitTime = Timestamp.from(Instant.ofEpochMilli(1519777441133L)) - Right(Some(db.Entities.LoadManifestItem(etlTime, commitTime, 1000, 5))) - - case LoaderA.ExecuteQuery(query, _) if query.contains("SELECT etl_tstamp") => - queries.append("SELECT etl_tstamp") - val time = Timestamp.from(Instant.ofEpochMilli(1520164735L)) - Right(Some(db.Entities.Timestamp(time))) - - case action => - throw new RuntimeException(s"Unexpected Action [$action]") - } - } - } - val input = RedshiftLoadStatements( "atomic", RedshiftLoadStatements.TransitCopy(sql("COPY")), @@ -380,11 +302,9 @@ class RedshiftLoaderSpec extends Specification { def is = s2""" noDiscovery ) - val state = 
RedshiftLoader.loadFolder(Step.defaultSteps)(input) - val action = state.value - val result = action.foldMap(interpreter) + val (state, result) = RedshiftLoader.loadFolder[Test](Step.defaultSteps)(input).value.run(TestState.init).value - val transactionsExpectation = queries.toList must beEqualTo(expected) + val transactionsExpectation = state.getLog must beEqualTo(expected) val resultExpectation = result must beRight(()) transactionsExpectation.and(resultExpectation) } diff --git a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/utils/CommonSpec.scala b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/utils/CommonSpec.scala index 62ffabb52..d8ef7993d 100644 --- a/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/utils/CommonSpec.scala +++ b/src/test/scala/com/snowplowanalytics/snowplow/rdbloader/utils/CommonSpec.scala @@ -10,21 +10,13 @@ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. */ -package com.snowplowanalytics.snowplow.rdbloader -package utils - -import cats.implicits._ - -import LoaderError._ -import S3.Key.{coerce => s3} -import config.{ CliConfig, Step } +package com.snowplowanalytics.snowplow.rdbloader.utils import org.specs2.Specification class CommonSpec extends Specification { def is = s2""" Sanitize message $e1 Sanitize message that contains invalid regular expression $e2 - Correctly interpret final message $e3 """ def e1 = { @@ -38,12 +30,4 @@ class CommonSpec extends Specification { def is = s2""" val result = Common.sanitize(message, List("""$**^""", "username")) result must beEqualTo("Outputxxxx. Output xxxxxxxx") } - - def e3 = { - val loadResult = StorageTargetError("Some exception").asLeft - val cliConfig = CliConfig(SpecHelpers.validConfig, SpecHelpers.validTarget, Step.defaultSteps, Some(s3("s3://bucket/key")), None, false, SpecHelpers.resolverJson) - - val result = Common.interpret(cliConfig, loadResult) - result must beEqualTo(Log.LoadingFailed("Data loading error Some exception")) - } }
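
A note on the pattern the reworked specs rely on: instead of folding a Free program through a LoaderA ~> Id natural transformation and collecting executed SQL in a mutable ListBuffer, the new tests run the program as a State computation and read the recorded statements back with state.getLog. The sketch below shows that idea in isolation; MiniState, MiniTest and executeUpdate are illustrative stand-ins, not the real TestState, Test and JDBCResults helpers this patch adds in TestInterpreter.scala.

    object StateLogSketch {
      import cats.data.State

      // Immutable test state: every executed statement is prepended to a list.
      final case class MiniState(messages: List[String]) {
        def log(message: String): MiniState = copy(messages = message :: messages)
        def getLog: List[String] = messages.reverse
      }

      // The test effect is plain State over MiniState.
      type MiniTest[A] = State[MiniState, A]

      // Stand-in for an interpreter method such as executeUpdate:
      // it records the query and returns a canned result.
      def executeUpdate(query: String): MiniTest[Either[String, Long]] =
        State(s => (s.log(query), Right(1L)))

      val program: MiniTest[Unit] =
        for {
          _ <- executeUpdate("BEGIN")
          _ <- executeUpdate("COPY")
          _ <- executeUpdate("COMMIT")
        } yield ()

      // Running the State yields the recorded log alongside the result,
      // which is what the specs assert on via state.getLog.
      val (finalState, _) = program.run(MiniState(Nil)).value
      // finalState.getLog == List("BEGIN", "COPY", "COMMIT")
    }

This is why the assertions above compare state.getLog against the expected statement list rather than inspecting a shared mutable buffer: the log travels with the pure computation and is produced by the same run(TestState.init).value call that yields the result.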