From 5d5f06f5533f164bccf6e4f79af30cb4d2f87648 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 4 Sep 2024 15:53:24 -0400 Subject: [PATCH 01/39] Improve doc --- engine/src/main/resources/swagger/cromwell.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/src/main/resources/swagger/cromwell.yaml b/engine/src/main/resources/swagger/cromwell.yaml index 59a202df393..8f4f94fbdd6 100644 --- a/engine/src/main/resources/swagger/cromwell.yaml +++ b/engine/src/main/resources/swagger/cromwell.yaml @@ -556,7 +556,8 @@ paths: in: query - name: expandSubWorkflows description: > - When true, metadata for sub workflows will be fetched and inserted automatically in the metadata response. + When true, metadata for sub workflows will be fetched and inserted automatically in the metadata response. + Requires inclusion of key subworkflowId in metadata results (either included or not excluded). required: false type: boolean in: query From 0024b748d5afa58caa596e768b65ad8bbd1587fc Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 9 Sep 2024 15:06:01 -0400 Subject: [PATCH 02/39] Terrible but functional --- .../webservice/MetadataBuilderActorSpec.scala | 3 +- .../services/metadata/MetadataService.scala | 2 +- .../impl/MetadataDatabaseAccess.scala | 6 +- .../ReadDatabaseMetadataWorkerActor.scala | 19 +- .../impl/builder/MetadataBuilderActor.scala | 213 +++++++++++++++--- 5 files changed, 205 insertions(+), 38 deletions(-) diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 2851f36e013..57a48a4ae6e 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -191,6 +191,7 @@ class MetadataBuilderActorSpec ), MetadataEvent(MetadataKey(workflowId, None, "status"), MetadataValue("Succeeded")) ) + val query = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) val expectedRes = s"""{ @@ -213,7 +214,7 @@ class MetadataBuilderActorSpec val response = mba.ask(action).mapTo[MetadataJsonResponse] mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) mockReadMetadataWorkerActor.reply( - CostResponse(workflowId, workflowState, events, false, false) + CostResponse(workflowId, workflowState, MetadataLookupResponse(query, events), false, false) ) response map { r => r shouldBe a[SuccessfulMetadataJsonResponse] } response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } diff --git a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala index c7439d5560a..e144e6fb15a 100644 --- a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala +++ b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala @@ -180,7 +180,7 @@ object MetadataService { final case class CostResponse(id: WorkflowId, status: WorkflowState, - costMetadata: Seq[MetadataEvent], + metadataResponse: MetadataLookupResponse, includeTaskBreakdown: Boolean, includeSubworkflowBreakdown: Boolean ) extends MetadataServiceResponse diff --git a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala index 7c061637b24..cb596d0153a 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala @@ -272,7 +272,11 @@ trait MetadataDatabaseAccess { def queryCost(id: WorkflowId, timeout: Duration)(implicit ec: ExecutionContext ): Future[Seq[MetadataEvent]] = { - val keys = List("taskStartTime", "taskEndTime", "vmCostPerHour") + val keys = List(CallMetadataKeys.VmStartTime, + CallMetadataKeys.VmEndTime, + CallMetadataKeys.VmCostPerHour, + CallMetadataKeys.SubWorkflowId + ) metadataDatabaseInterface .queryMetadataEntryWithKeyConstraints(id.toString, keys, List.empty, CallOrWorkflowQuery, timeout) .map(metadataToMetadataEvents(id)) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala index 1392f7988b6..4d587c693ee 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala @@ -2,11 +2,12 @@ package cromwell.services.metadata.impl import java.sql.SQLTimeoutException import akka.actor.{Actor, ActorLogging, ActorRef, PoisonPill, Props} +import cats.data.NonEmptyList import cromwell.core.Dispatcher.ServiceDispatcher import cromwell.core.{WorkflowId, WorkflowSubmitted} import cromwell.services.MetadataServicesStore import cromwell.services.metadata.MetadataService._ -import cromwell.services.metadata.{MetadataEvent, MetadataQuery, WorkflowQueryParameters} +import cromwell.services.metadata.{CallMetadataKeys, MetadataEvent, MetadataQuery, WorkflowQueryParameters} import scala.concurrent.Future import scala.concurrent.duration.Duration @@ -164,15 +165,25 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataRea includeTaskBreakdown: Boolean, includeSubworkflowBreakdown: Boolean ): Future[MetadataServiceResponse] = { + + val keys = NonEmptyList.of(CallMetadataKeys.VmStartTime, + CallMetadataKeys.VmEndTime, + CallMetadataKeys.VmCostPerHour, + CallMetadataKeys.SubWorkflowId + ) + val metadataQuery = MetadataQuery(id, None, None, Option(keys), None, expandSubWorkflows = true) + val results = for { status <- getWorkflowStatus(id) - costEvents <- queryCost(id, metadataReadTimeout) + costEvents <- queryMetadata(metadataQuery) } yield (status, costEvents) results.map { case (s, m) => (s, m) match { - case (Some(wfState), metadataEvents) => - CostResponse(id, wfState, metadataEvents, includeTaskBreakdown, includeSubworkflowBreakdown) + case (Some(wfState), resp: MetadataLookupResponse) => + CostResponse(id, wfState, resp, includeTaskBreakdown, includeSubworkflowBreakdown) + // TODO better error handling for other metadata lookup results + case (Some(_), _) => CostFailure(id, new Exception("TODO")) // TODO should this be a failure? case (None, _) => CostFailure(id, new Exception("Couldn't find workflow status")) } diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index 0a0b7cff6ea..e3d8dab8b2d 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -2,7 +2,6 @@ package cromwell.services.metadata.impl.builder import java.time.OffsetDateTime import java.util.concurrent.atomic.AtomicLong - import akka.actor.{ActorRef, LoggingFSM, PoisonPill, Props} import common.collections.EnhancedCollections._ import cromwell.services.metadata.impl.builder.MetadataComponent._ @@ -16,6 +15,7 @@ import mouse.all._ import org.slf4j.LoggerFactory import spray.json._ +import java.time.temporal.ChronoUnit import scala.language.postfixOps object MetadataBuilderActor { @@ -23,6 +23,7 @@ object MetadataBuilderActor { case object Idle extends MetadataBuilderActorState case object WaitingForMetadataService extends MetadataBuilderActorState case object WaitingForSubWorkflows extends MetadataBuilderActorState + case object WaitingForSubWorkflowCost extends MetadataBuilderActorState sealed trait MetadataBuilderActorData @@ -63,6 +64,17 @@ object MetadataBuilderActor { */ private case class MetadataForIndex(index: Int, metadata: List[JsObject]) + /** + * Extract the list of subworkflow ids from a list of metadata events + */ + private def extractSubworkflowIds(events: Seq[MetadataEvent]): Seq[String] = + events + .collect { + case MetadataEvent(key, value, _) if key.key.endsWith(CallMetadataKeys.SubWorkflowId) => value map { _.value } + } + .flatten + .distinct + private def eventsToAttemptMetadata( subWorkflowMetadata: Map[String, JsValue] )(attempt: Int, events: Seq[MetadataEvent]) = { @@ -275,28 +287,6 @@ object MetadataBuilderActor { workflowMetadataResponse(id, updatedEvents, includeCallsIfEmpty = false, Map.empty) } - def processCostResponse(id: WorkflowId, - status: WorkflowState, - eventsList: Seq[MetadataEvent], - includeTaskBreakdown: Boolean, - includeSubworkflowBreakdown: Boolean - ): JsObject = { - // !! add logic to compute real cost here !! - val taskMap = if (includeTaskBreakdown) Map("taskBreakdown" -> JsObject(Map("foo.bar" -> JsNumber(3.5)))) else Map() - val subworkflowMap = - if (includeSubworkflowBreakdown) Map("subworkflowBreakdown" -> JsObject(Map("foo.baz" -> JsNumber(3.5)))) - else Map() - - JsObject( - Map( - WorkflowMetadataKeys.Id -> JsString(id.toString), - WorkflowMetadataKeys.Status -> JsString(status.toString), - "currency" -> JsString("USD"), - "cost" -> JsNumber(3.5) - ) ++ taskMap ++ subworkflowMap - ) - } - def workflowMetadataResponse(workflowId: WorkflowId, eventsList: Seq[MetadataEvent], includeCallsIfEmpty: Boolean, @@ -348,8 +338,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, ) allDone() case Event(CostResponse(w, s, m, t, b), HasWorkData(target, originalRequest)) => - target ! SuccessfulMetadataJsonResponse(originalRequest, processCostResponse(w, s, m, t, b)) - allDone() + processCostResponse(w, s, m, t, b, target, originalRequest) case Event(MetadataLookupResponse(query, metadata), HasWorkData(target, originalRequest)) => processMetadataResponse(query, metadata, target, originalRequest) case Event(FetchFailedJobsMetadataLookupResponse(metadata), HasWorkData(target, originalRequest)) => @@ -386,6 +375,14 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, allDone() } + when(WaitingForSubWorkflowCost) { + case Event(mbr: MetadataJsonResponse, data: HasReceivedEventsData) => + processSubWorkflowCost(mbr, data) + case Event(failure: MetadataServiceFailure, data: HasReceivedEventsData) => + data.target ! FailedMetadataJsonResponse(data.originalRequest, failure.reason) + allDone() + } + whenUnhandled { case Event(message, IdleData) => log.error(s"Received unexpected message $message in state $stateName with $IdleData") @@ -429,6 +426,39 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, failAndDie(new Exception(message), data.target, data.originalRequest) } + def processSubWorkflowCost(metadataResponse: MetadataJsonResponse, data: HasReceivedEventsData) = + metadataResponse match { + case SuccessfulMetadataJsonResponse(GetCost(workflowId, includeTaskBreakdown, includeSubworkflowBreakdown), js) => + val subId: WorkflowId = workflowId + val newData = data.withSubWorkflow(subId.toString, js) + + if (newData.isComplete) { + buildCostAndStop( + subId, + WorkflowSucceeded, // TODO + data.originalEvents, + newData.subWorkflowsMetadata, + includeTaskBreakdown, + includeSubworkflowBreakdown, + data.target, + data.originalRequest + ) + } else { + stay() using newData + } + case FailedMetadataJsonResponse(originalRequest, e) => + failAndDie(new RuntimeException(s"Failed to retrieve cost for a sub workflow ($originalRequest)", e), + data.target, + data.originalRequest + ) + + case other => + val message = + s"Programmer Error: MetadataBuilderActor expected subworkflow metadata response type but got ${other.getClass.getSimpleName}" + log.error(message) + failAndDie(new Exception(message), data.target, data.originalRequest) + } + def failAndDie(reason: Throwable, target: ActorRef, originalRequest: BuildMetadataJsonAction) = { target ! FailedMetadataJsonResponse(originalRequest, reason) context stop self @@ -454,12 +484,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, ) = if (query.expandSubWorkflows) { // Scan events for sub workflow ids - val subWorkflowIds = eventsList - .collect { - case MetadataEvent(key, value, _) if key.key.endsWith(CallMetadataKeys.SubWorkflowId) => value map { _.value } - } - .flatten - .distinct + val subWorkflowIds = extractSubworkflowIds(eventsList) // If none is found just proceed to build metadata if (subWorkflowIds.isEmpty) buildAndStop(query, eventsList, Map.empty, target, originalRequest) @@ -488,6 +513,132 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, buildAndStop(query, eventsList, Map.empty, target, originalRequest) } + def processCostResponse(id: WorkflowId, + status: WorkflowState, + metadataResponse: MetadataLookupResponse, + includeTaskBreakdown: Boolean, + includeSubworkflowBreakdown: Boolean, + target: ActorRef, + originalRequest: BuildMetadataJsonAction + ): State = { + + // Always expand subworkflows for cost + val subWorkflowIds = extractSubworkflowIds(metadataResponse.eventList) + + if (subWorkflowIds.isEmpty) + // If no workflows found, just build cost data + buildCostAndStop(id, + status, + metadataResponse.eventList, + Map.empty, + includeTaskBreakdown, + includeSubworkflowBreakdown, + target, + originalRequest + ) + else { + // Otherwise spin up a metadata builder actor for each sub workflow + subWorkflowIds foreach { subId => + val subMetadataBuilder = context.actorOf(MetadataBuilderActor.props(readMetadataWorkerMaker, + metadataReadRowNumberSafetyThreshold, + isForSubworkflows = true + ), + uniqueActorName(subId) + ) + subMetadataBuilder ! GetCost(WorkflowId.fromString(subId), includeTaskBreakdown, includeSubworkflowBreakdown) + } + goto(WaitingForSubWorkflowCost) using HasReceivedEventsData(target, + originalRequest, + metadataResponse.query, + metadataResponse.eventList, + Map.empty, + subWorkflowIds.size + ) + } + } + + def buildCostAndStop(id: WorkflowId, + status: WorkflowState, + eventsList: Seq[MetadataEvent], + expandedValues: Map[String, JsValue], + includeTaskBreakdown: Boolean, + includeSubworkflowBreakdown: Boolean, + target: ActorRef, + originalRequest: BuildMetadataJsonAction + ): State = { + + val groupedEvents = groupEvents(eventsList) + val metadataEvents = MetadataBuilderActor.parse(groupedEvents, expandedValues) + + // TODO it's really annoying and brittle that we need to parse the JSON here + + def computeCost(jsCall: JsObject): Double = { + // TODO error handling, remove unprotected gets + val startTime = jsCall.fields.getOrElse(CallMetadataKeys.VmStartTime, JsString("")) match { + case start: JsString => OffsetDateTime.parse(start.value) + case _ => throw new RuntimeException("I'm so sad because I have no start time") + } + val endTime = jsCall.fields.getOrElse(CallMetadataKeys.VmEndTime, JsString("")) match { + case end: JsString => OffsetDateTime.parse(end.value) + case _ => throw new RuntimeException("I'm so sad because I have no end time") + } + val costPerHour = jsCall.fields.getOrElse(CallMetadataKeys.VmCostPerHour, JsNumber(0)) match { + case cost: JsNumber => cost.value.toDouble + case _ => throw new RuntimeException("I'm so sad because I have no cost, am I free???") + } + + // TODO is millis the right thing to use here? Should it be seconds? + val vmRuntimeInMillis = startTime.until(endTime, ChronoUnit.MILLIS).toDouble + val c = costPerHour * (vmRuntimeInMillis / (1000 * 60 * 60)) + c + } + val costFields = metadataEvents.fields.getOrElse(id.toString, JsObject.empty) + val wfFields: JsObject = costFields match { + case c: JsObject => c + case _ => JsObject.empty + } + val callsObject = wfFields.fields.getOrElse("calls", JsObject.empty) + + val cost: Double = callsObject match { + case calls: JsObject => + val callFields = calls.fields.values + val d: Iterable[Double] = callFields.map { multiCalls: JsValue => + val dd: Vector[Double] = multiCalls match { + case a: JsArray => + a.elements.map { + case o: JsObject => computeCost(o) + case _ => 0 + } + case _ => Vector(0) + } + dd.sum + } + d.sum + case _ => 0 + } + + val subworkflowCost = expandedValues.values.map { + case o: JsObject => + o.fields.getOrElse("cost", JsNumber(0)) match { + case n: JsNumber => n.value.toDouble + case _ => 0 + } + case _ => 0 + }.sum + + val resp = JsObject( + Map( + WorkflowMetadataKeys.Id -> JsString(id.toString), + WorkflowMetadataKeys.Status -> JsString(status.toString), + "currency" -> JsString("USD"), + "cost" -> JsNumber(cost + subworkflowCost) + ) // ++ taskMap ++ subworkflowMap + ) + + target ! SuccessfulMetadataJsonResponse(originalRequest, resp) + allDone() + } + def processFailedJobsMetadataResponse(eventsList: Seq[MetadataEvent], target: ActorRef, originalRequest: BuildMetadataJsonAction From d0b3164ce591c3505fefc31b8b4ec7afb4dbf0f0 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 9 Sep 2024 16:04:22 -0400 Subject: [PATCH 03/39] Remove support for includeTaskBreakdown, includeSubworkflowBreakdown --- .../routes/MetadataRouteSupport.scala | 13 ++-------- .../webservice/MetadataBuilderActorSpec.scala | 4 ++-- .../routes/CromwellApiServiceSpec.scala | 19 +++++++-------- .../routes/MetadataRouteSupportSpec.scala | 20 ---------------- .../services/metadata/MetadataService.scala | 11 +++------ .../ReadDatabaseMetadataWorkerActor.scala | 13 ++++------ .../impl/builder/MetadataBuilderActor.scala | 24 ++++--------------- 7 files changed, 24 insertions(+), 80 deletions(-) diff --git a/engine/src/main/scala/cromwell/webservice/routes/MetadataRouteSupport.scala b/engine/src/main/scala/cromwell/webservice/routes/MetadataRouteSupport.scala index afa12d871dd..ee2b4cdd483 100644 --- a/engine/src/main/scala/cromwell/webservice/routes/MetadataRouteSupport.scala +++ b/engine/src/main/scala/cromwell/webservice/routes/MetadataRouteSupport.scala @@ -125,17 +125,8 @@ trait MetadataRouteSupport extends HttpInstrumentation { }, path("workflows" / Segment / Segment / "cost") { (_, possibleWorkflowId) => get { - parameters( - (Symbol("includeTaskBreakdown").as[Boolean].?, Symbol("includeSubworkflowBreakdown").as[Boolean].?) - ) { (includeTaskBreakdownOption, includeSubworkflowBreakdownOption) => - val includeTaskBreakdown = includeTaskBreakdownOption.getOrElse(false) - val includeSubworkflowBreakdown = includeSubworkflowBreakdownOption.getOrElse(false) - - metadataLookup( - possibleWorkflowId, - (w: WorkflowId) => GetCost(w, includeTaskBreakdown, includeSubworkflowBreakdown), - serviceRegistryActor - ) + instrumentRequest { + metadataLookup(possibleWorkflowId, (w: WorkflowId) => GetCost(w), serviceRegistryActor) } } }, diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 57a48a4ae6e..9075060add6 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -201,7 +201,7 @@ class MetadataBuilderActorSpec |"status": "${workflowState.toString}" |}""".stripMargin - val action = GetCost(workflowId, false, false) + val action = GetCost(workflowId) val mockReadMetadataWorkerActor = TestProbe("mockReadMetadataWorkerActor") def readMetadataWorkerMaker = () => mockReadMetadataWorkerActor.props @@ -214,7 +214,7 @@ class MetadataBuilderActorSpec val response = mba.ask(action).mapTo[MetadataJsonResponse] mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) mockReadMetadataWorkerActor.reply( - CostResponse(workflowId, workflowState, MetadataLookupResponse(query, events), false, false) + CostResponse(workflowId, workflowState, MetadataLookupResponse(query, events)) ) response map { r => r shouldBe a[SuccessfulMetadataJsonResponse] } response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } diff --git a/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala b/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala index 1ee62199168..7900c02a0c9 100644 --- a/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/routes/CromwellApiServiceSpec.scala @@ -725,19 +725,16 @@ object CromwellApiServiceSpec { ) ) sender() ! SuccessfulMetadataJsonResponse(request, MetadataBuilderActor.processOutputsResponse(id, event)) - case request @ GetCost(id, includeTaskBreakdown, includeSubworkflowBreakdown) => + case request @ GetCost(id) => sender() ! SuccessfulMetadataJsonResponse( request, - MetadataBuilderActor.processCostResponse( - id, - WorkflowSucceeded, - Vector( - MetadataEvent(MetadataKey(id, None, "outputs:test.hello.salutation"), - MetadataValue("Hello foo!", MetadataString) - ) - ), - includeTaskBreakdown, - includeSubworkflowBreakdown + JsObject( + Map( + WorkflowMetadataKeys.Id -> JsString(id.toString), + WorkflowMetadataKeys.Status -> JsString(WorkflowSucceeded.toString), + "currency" -> JsString("USD"), + "cost" -> JsNumber(3.5) + ) ) ) case request @ GetLogs(id) => diff --git a/engine/src/test/scala/cromwell/webservice/routes/MetadataRouteSupportSpec.scala b/engine/src/test/scala/cromwell/webservice/routes/MetadataRouteSupportSpec.scala index 3bbb5674793..2cd8b64517a 100644 --- a/engine/src/test/scala/cromwell/webservice/routes/MetadataRouteSupportSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/routes/MetadataRouteSupportSpec.scala @@ -171,26 +171,6 @@ class MetadataRouteSupportSpec extends AsyncFlatSpec with ScalatestRouteTest wit costResponse.fields("currency") should be(JsString("USD")) costResponse.fields("cost") should be(JsNumber(3.5)) costResponse.fields("status") should be(JsString("Succeeded")) - costResponse.fields.keys should not contain "taskBreakdown" - costResponse.fields.keys should not contain "subworkflowBreakdown" - } - } - - it should "return 200 with basic cost info with task and subworkflow breakdowns" in { - Get( - s"/workflows/$version/${CromwellApiServiceSpec.ExistingWorkflowId}/cost?includeTaskBreakdown=true&includeSubworkflowBreakdown=true" - ) ~> - akkaHttpService.metadataRoutes ~> - check { - status should be(StatusCodes.OK) - - val costResponse = responseAs[JsObject] - costResponse.fields("id") should be(JsString(CromwellApiServiceSpec.ExistingWorkflowId.toString)) - costResponse.fields("currency") should be(JsString("USD")) - costResponse.fields("cost") should be(JsNumber(3.5)) - costResponse.fields("status") should be(JsString("Succeeded")) - costResponse.fields("taskBreakdown") should be(JsObject(Map("foo.bar" -> JsNumber(3.5)))) - costResponse.fields("subworkflowBreakdown") should be(JsObject(Map("foo.baz" -> JsNumber(3.5)))) } } diff --git a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala index e144e6fb15a..6c8bd4c7f59 100644 --- a/services/src/main/scala/cromwell/services/metadata/MetadataService.scala +++ b/services/src/main/scala/cromwell/services/metadata/MetadataService.scala @@ -126,8 +126,7 @@ object MetadataService { extends BuildMetadataJsonAction final case class WorkflowOutputs(workflowId: WorkflowId) extends BuildWorkflowMetadataJsonWithOverridableSourceAction final case class GetLogs(workflowId: WorkflowId) extends BuildWorkflowMetadataJsonWithOverridableSourceAction - final case class GetCost(workflowId: WorkflowId, includeTaskBreakdown: Boolean, includeSubworkflowBreakdown: Boolean) - extends BuildWorkflowMetadataJsonWithOverridableSourceAction + final case class GetCost(workflowId: WorkflowId) extends BuildWorkflowMetadataJsonWithOverridableSourceAction case object RefreshSummary extends MetadataServiceAction case object SendMetadataTableSizeMetrics extends MetadataServiceAction @@ -178,12 +177,8 @@ object MetadataService { final case class LogsResponse(id: WorkflowId, logs: Seq[MetadataEvent]) extends MetadataServiceResponse final case class LogsFailure(id: WorkflowId, reason: Throwable) extends MetadataServiceFailure - final case class CostResponse(id: WorkflowId, - status: WorkflowState, - metadataResponse: MetadataLookupResponse, - includeTaskBreakdown: Boolean, - includeSubworkflowBreakdown: Boolean - ) extends MetadataServiceResponse + final case class CostResponse(id: WorkflowId, status: WorkflowState, metadataResponse: MetadataLookupResponse) + extends MetadataServiceResponse final case class CostFailure(id: WorkflowId, reason: Throwable) extends MetadataServiceFailure final case class MetadataWriteSuccess(events: Iterable[MetadataEvent]) extends MetadataServiceResponse diff --git a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala index 4d587c693ee..932e54ee69d 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala @@ -39,10 +39,8 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataRea case GetRootAndSubworkflowLabels(rootWorkflowId: WorkflowId) => evaluateRespondAndStop(sender(), queryRootAndSubworkflowLabelsAndRespond(rootWorkflowId)) case GetLogs(workflowId) => evaluateRespondAndStop(sender(), queryLogsAndRespond(workflowId)) - case GetCost(workflowId, includeTaskBreakdown, includeSubworkflowBreakdown) => - evaluateRespondAndStop(sender(), - queryCostAndRespond(workflowId, includeTaskBreakdown, includeSubworkflowBreakdown) - ) + case GetCost(workflowId) => + evaluateRespondAndStop(sender(), queryCostAndRespond(workflowId)) case QueryForWorkflowsMatchingParameters(parameters) => evaluateRespondAndStop(sender(), queryWorkflowsAndRespond(parameters)) case WorkflowOutputs(id) => evaluateRespondAndStop(sender(), queryWorkflowOutputsAndRespond(id)) @@ -161,10 +159,7 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataRea LogsFailure(id, t) } - private def queryCostAndRespond(id: WorkflowId, - includeTaskBreakdown: Boolean, - includeSubworkflowBreakdown: Boolean - ): Future[MetadataServiceResponse] = { + private def queryCostAndRespond(id: WorkflowId): Future[MetadataServiceResponse] = { val keys = NonEmptyList.of(CallMetadataKeys.VmStartTime, CallMetadataKeys.VmEndTime, @@ -181,7 +176,7 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataRea results.map { case (s, m) => (s, m) match { case (Some(wfState), resp: MetadataLookupResponse) => - CostResponse(id, wfState, resp, includeTaskBreakdown, includeSubworkflowBreakdown) + CostResponse(id, wfState, resp) // TODO better error handling for other metadata lookup results case (Some(_), _) => CostFailure(id, new Exception("TODO")) // TODO should this be a failure? diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index e3d8dab8b2d..24a5cbb7b5e 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -337,8 +337,8 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, workflowMetadataResponse(w, l, includeCallsIfEmpty = false, Map.empty) ) allDone() - case Event(CostResponse(w, s, m, t, b), HasWorkData(target, originalRequest)) => - processCostResponse(w, s, m, t, b, target, originalRequest) + case Event(CostResponse(w, s, m), HasWorkData(target, originalRequest)) => + processCostResponse(w, s, m, target, originalRequest) case Event(MetadataLookupResponse(query, metadata), HasWorkData(target, originalRequest)) => processMetadataResponse(query, metadata, target, originalRequest) case Event(FetchFailedJobsMetadataLookupResponse(metadata), HasWorkData(target, originalRequest)) => @@ -428,7 +428,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, def processSubWorkflowCost(metadataResponse: MetadataJsonResponse, data: HasReceivedEventsData) = metadataResponse match { - case SuccessfulMetadataJsonResponse(GetCost(workflowId, includeTaskBreakdown, includeSubworkflowBreakdown), js) => + case SuccessfulMetadataJsonResponse(GetCost(workflowId), js) => val subId: WorkflowId = workflowId val newData = data.withSubWorkflow(subId.toString, js) @@ -438,8 +438,6 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, WorkflowSucceeded, // TODO data.originalEvents, newData.subWorkflowsMetadata, - includeTaskBreakdown, - includeSubworkflowBreakdown, data.target, data.originalRequest ) @@ -516,8 +514,6 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, def processCostResponse(id: WorkflowId, status: WorkflowState, metadataResponse: MetadataLookupResponse, - includeTaskBreakdown: Boolean, - includeSubworkflowBreakdown: Boolean, target: ActorRef, originalRequest: BuildMetadataJsonAction ): State = { @@ -527,15 +523,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, if (subWorkflowIds.isEmpty) // If no workflows found, just build cost data - buildCostAndStop(id, - status, - metadataResponse.eventList, - Map.empty, - includeTaskBreakdown, - includeSubworkflowBreakdown, - target, - originalRequest - ) + buildCostAndStop(id, status, metadataResponse.eventList, Map.empty, target, originalRequest) else { // Otherwise spin up a metadata builder actor for each sub workflow subWorkflowIds foreach { subId => @@ -545,7 +533,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, ), uniqueActorName(subId) ) - subMetadataBuilder ! GetCost(WorkflowId.fromString(subId), includeTaskBreakdown, includeSubworkflowBreakdown) + subMetadataBuilder ! GetCost(WorkflowId.fromString(subId)) } goto(WaitingForSubWorkflowCost) using HasReceivedEventsData(target, originalRequest, @@ -561,8 +549,6 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, status: WorkflowState, eventsList: Seq[MetadataEvent], expandedValues: Map[String, JsValue], - includeTaskBreakdown: Boolean, - includeSubworkflowBreakdown: Boolean, target: ActorRef, originalRequest: BuildMetadataJsonAction ): State = { From a29e9b7b627bbc17c7aa03e6e970950e62e16b35 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 11 Sep 2024 14:50:04 -0400 Subject: [PATCH 04/39] Nicer json walking --- .../impl/builder/MetadataBuilderActor.scala | 111 +++++++++--------- 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index 24a5cbb7b5e..bb3a969c7a5 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -17,6 +17,7 @@ import spray.json._ import java.time.temporal.ChronoUnit import scala.language.postfixOps +import scala.util.{Failure, Success, Try} object MetadataBuilderActor { sealed trait MetadataBuilderActorState @@ -431,10 +432,9 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, case SuccessfulMetadataJsonResponse(GetCost(workflowId), js) => val subId: WorkflowId = workflowId val newData = data.withSubWorkflow(subId.toString, js) - if (newData.isComplete) { buildCostAndStop( - subId, + data.originalQuery.workflowId, WorkflowSucceeded, // TODO data.originalEvents, newData.subWorkflowsMetadata, @@ -545,6 +545,42 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, } } + private def extractFromJsAs[A: Manifest](js: JsObject, fieldName: String): Option[A] = + js.fields.get(fieldName) match { + case Some(a: A) => Some(a) + case _ => None + } + + private def computeCost(jsVal: JsValue): Try[BigDecimal] = + jsVal match { + case jsCall: JsObject => + extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmStartTime) map { startTimeVal => + val startTimeTry = Try(OffsetDateTime.parse(startTimeVal.value)) + + val endTimeTry = + extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmEndTime) + .map(v => Try(OffsetDateTime.parse(v.value))) + .getOrElse(Success(OffsetDateTime.now())) + + val rawCostPerHour = extractFromJsAs[JsNumber](jsCall, CallMetadataKeys.VmCostPerHour).map(_.value) + val costPerHourTry = rawCostPerHour match { + case Some(c: BigDecimal) if c >= 0 => Success(c) + // A costPerHour < 0 indicates an error + case Some(_: BigDecimal) => Failure(new RuntimeException("cost error!!!!")) // TODO + case None => Success(BigDecimal(0)) + } + + for { + start <- startTimeTry + end <- endTimeTry + costPerHour <- costPerHourTry + vmRuntimeInMillis = start.until(end, ChronoUnit.MILLIS).toDouble + } yield (vmRuntimeInMillis / (1000 * 60 * 60)) * costPerHour + } getOrElse (Success(0)) + // TODO is millis the right thing to use here? Should it be seconds? + case _ => Success(0) + } + def buildCostAndStop(id: WorkflowId, status: WorkflowState, eventsList: Seq[MetadataEvent], @@ -553,72 +589,33 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, originalRequest: BuildMetadataJsonAction ): State = { - val groupedEvents = groupEvents(eventsList) - val metadataEvents = MetadataBuilderActor.parse(groupedEvents, expandedValues) + val metadataEvents = MetadataBuilderActor.parse(groupEvents(eventsList), expandedValues) - // TODO it's really annoying and brittle that we need to parse the JSON here - - def computeCost(jsCall: JsObject): Double = { - // TODO error handling, remove unprotected gets - val startTime = jsCall.fields.getOrElse(CallMetadataKeys.VmStartTime, JsString("")) match { - case start: JsString => OffsetDateTime.parse(start.value) - case _ => throw new RuntimeException("I'm so sad because I have no start time") - } - val endTime = jsCall.fields.getOrElse(CallMetadataKeys.VmEndTime, JsString("")) match { - case end: JsString => OffsetDateTime.parse(end.value) - case _ => throw new RuntimeException("I'm so sad because I have no end time") - } - val costPerHour = jsCall.fields.getOrElse(CallMetadataKeys.VmCostPerHour, JsNumber(0)) match { - case cost: JsNumber => cost.value.toDouble - case _ => throw new RuntimeException("I'm so sad because I have no cost, am I free???") - } - - // TODO is millis the right thing to use here? Should it be seconds? - val vmRuntimeInMillis = startTime.until(endTime, ChronoUnit.MILLIS).toDouble - val c = costPerHour * (vmRuntimeInMillis / (1000 * 60 * 60)) - c - } - val costFields = metadataEvents.fields.getOrElse(id.toString, JsObject.empty) - val wfFields: JsObject = costFields match { - case c: JsObject => c - case _ => JsObject.empty - } - val callsObject = wfFields.fields.getOrElse("calls", JsObject.empty) - - val cost: Double = callsObject match { - case calls: JsObject => - val callFields = calls.fields.values - val d: Iterable[Double] = callFields.map { multiCalls: JsValue => - val dd: Vector[Double] = multiCalls match { - case a: JsArray => - a.elements.map { - case o: JsObject => computeCost(o) - case _ => 0 - } - case _ => Vector(0) - } - dd.sum - } - d.sum - case _ => 0 - } + // Walk the structured metadata to compute cost for each call in this workflow + val callCosts = for { + wfObj <- extractFromJsAs[JsObject](metadataEvents, id.toString).toList + callsObj <- extractFromJsAs[JsObject](wfObj, "calls").toList + singleCallName <- callsObj.fields.keys.toList + singleCallAttemptList <- extractFromJsAs[JsArray](callsObj, singleCallName).toList + singleCallAttempt <- singleCallAttemptList.elements.toList + singleCallAttemptCost = computeCost(singleCallAttempt).get // TODO + } yield singleCallAttemptCost val subworkflowCost = expandedValues.values.map { case o: JsObject => - o.fields.getOrElse("cost", JsNumber(0)) match { - case n: JsNumber => n.value.toDouble - case _ => 0 - } - case _ => 0 + extractFromJsAs[JsNumber](o, "cost").map(_.value).getOrElse(BigDecimal(0)) + case _ => BigDecimal(0) }.sum + val totalCost = callCosts.sum + subworkflowCost + val resp = JsObject( Map( WorkflowMetadataKeys.Id -> JsString(id.toString), WorkflowMetadataKeys.Status -> JsString(status.toString), "currency" -> JsString("USD"), - "cost" -> JsNumber(cost + subworkflowCost) - ) // ++ taskMap ++ subworkflowMap + "cost" -> JsNumber(totalCost) + ) ) target ! SuccessfulMetadataJsonResponse(originalRequest, resp) From 2ca8c3f06fb2d013076d1778330f6bed83c4cd5a Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 11 Sep 2024 15:30:45 -0400 Subject: [PATCH 05/39] Better status handling --- .../metadata/impl/builder/MetadataBuilderActor.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index bb3a969c7a5..e944d77df71 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -432,10 +432,11 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, case SuccessfulMetadataJsonResponse(GetCost(workflowId), js) => val subId: WorkflowId = workflowId val newData = data.withSubWorkflow(subId.toString, js) + if (newData.isComplete) { buildCostAndStop( data.originalQuery.workflowId, - WorkflowSucceeded, // TODO + extractFromJsAs[JsString](js, "status").map(_.value).getOrElse(""), // should never be empty data.originalEvents, newData.subWorkflowsMetadata, data.target, @@ -523,7 +524,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, if (subWorkflowIds.isEmpty) // If no workflows found, just build cost data - buildCostAndStop(id, status, metadataResponse.eventList, Map.empty, target, originalRequest) + buildCostAndStop(id, status.toString, metadataResponse.eventList, Map.empty, target, originalRequest) else { // Otherwise spin up a metadata builder actor for each sub workflow subWorkflowIds foreach { subId => @@ -582,7 +583,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, } def buildCostAndStop(id: WorkflowId, - status: WorkflowState, + status: String, eventsList: Seq[MetadataEvent], expandedValues: Map[String, JsValue], target: ActorRef, @@ -612,7 +613,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, val resp = JsObject( Map( WorkflowMetadataKeys.Id -> JsString(id.toString), - WorkflowMetadataKeys.Status -> JsString(status.toString), + WorkflowMetadataKeys.Status -> JsString(status), "currency" -> JsString("USD"), "cost" -> JsNumber(totalCost) ) From 9fccf2b81e852f483f25bfe8c96c9797f767b7e1 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 11 Sep 2024 17:46:57 -0400 Subject: [PATCH 06/39] Report cost computation errors --- .../impl/builder/MetadataBuilderActor.scala | 60 ++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index e944d77df71..048c353910a 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -3,7 +3,11 @@ package cromwell.services.metadata.impl.builder import java.time.OffsetDateTime import java.util.concurrent.atomic.AtomicLong import akka.actor.{ActorRef, LoggingFSM, PoisonPill, Props} +import cats.data.Validated.{Invalid, Valid} +import cats.implicits.catsSyntaxValidatedId import common.collections.EnhancedCollections._ +import common.validation.ErrorOr +import common.validation.ErrorOr._ import cromwell.services.metadata.impl.builder.MetadataComponent._ import cromwell.core.ExecutionIndex.ExecutionIndex import cromwell.core._ @@ -17,7 +21,6 @@ import spray.json._ import java.time.temporal.ChronoUnit import scala.language.postfixOps -import scala.util.{Failure, Success, Try} object MetadataBuilderActor { sealed trait MetadataBuilderActorState @@ -552,34 +555,39 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, case _ => None } - private def computeCost(jsVal: JsValue): Try[BigDecimal] = + private def computeCost(callName: String, jsVal: JsValue): ErrorOr[BigDecimal] = jsVal match { case jsCall: JsObject => extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmStartTime) map { startTimeVal => - val startTimeTry = Try(OffsetDateTime.parse(startTimeVal.value)) + val startTimeErrorOr = ErrorOr(OffsetDateTime.parse(startTimeVal.value)) - val endTimeTry = + val endTimeErrorOr = extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmEndTime) - .map(v => Try(OffsetDateTime.parse(v.value))) - .getOrElse(Success(OffsetDateTime.now())) + .map(v => ErrorOr(OffsetDateTime.parse(v.value))) + .getOrElse(OffsetDateTime.now().validNel) val rawCostPerHour = extractFromJsAs[JsNumber](jsCall, CallMetadataKeys.VmCostPerHour).map(_.value) - val costPerHourTry = rawCostPerHour match { - case Some(c: BigDecimal) if c >= 0 => Success(c) + val costPerHourErrorOr = rawCostPerHour match { + case Some(c: BigDecimal) if c >= 0 => c.validNel // A costPerHour < 0 indicates an error - case Some(_: BigDecimal) => Failure(new RuntimeException("cost error!!!!")) // TODO - case None => Success(BigDecimal(0)) + case Some(_: BigDecimal) => + val index = + extractFromJsAs[JsNumber](jsCall, "shardIndex").map(_.value.toString).getOrElse("-1") + val attempt = + extractFromJsAs[JsNumber](jsCall, "attempt").map(_.value.toString).getOrElse("1") + s"Couldn't find valid vmCostPerHour for ${List(callName, index, attempt).mkString(".")}".invalidNel + case None => BigDecimal(0).validNel } for { - start <- startTimeTry - end <- endTimeTry - costPerHour <- costPerHourTry + start <- startTimeErrorOr + end <- endTimeErrorOr + costPerHour <- costPerHourErrorOr vmRuntimeInMillis = start.until(end, ChronoUnit.MILLIS).toDouble } yield (vmRuntimeInMillis / (1000 * 60 * 60)) * costPerHour - } getOrElse (Success(0)) + } getOrElse (BigDecimal(0).validNel) // TODO is millis the right thing to use here? Should it be seconds? - case _ => Success(0) + case _ => BigDecimal(0).validNel } def buildCostAndStop(id: WorkflowId, @@ -592,30 +600,42 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, val metadataEvents = MetadataBuilderActor.parse(groupEvents(eventsList), expandedValues) - // Walk the structured metadata to compute cost for each call in this workflow - val callCosts = for { + // Walk the structured metadata to attempt to compute cost for each call in this workflow + val callCostsWithErrors: List[ErrorOr[BigDecimal]] = for { wfObj <- extractFromJsAs[JsObject](metadataEvents, id.toString).toList callsObj <- extractFromJsAs[JsObject](wfObj, "calls").toList singleCallName <- callsObj.fields.keys.toList singleCallAttemptList <- extractFromJsAs[JsArray](callsObj, singleCallName).toList singleCallAttempt <- singleCallAttemptList.elements.toList - singleCallAttemptCost = computeCost(singleCallAttempt).get // TODO + singleCallAttemptCost = computeCost(singleCallName, singleCallAttempt) } yield singleCallAttemptCost + val callCost: BigDecimal = callCostsWithErrors.collect { case (Valid(b)) => b }.sum + val costErrors: Vector[JsString] = + callCostsWithErrors + .collect { case (Invalid(e)) => e.toList.mkString(", ") } + .map(JsString(_)) + .toVector + val subworkflowCost = expandedValues.values.map { case o: JsObject => extractFromJsAs[JsNumber](o, "cost").map(_.value).getOrElse(BigDecimal(0)) case _ => BigDecimal(0) }.sum - val totalCost = callCosts.sum + subworkflowCost + val subworkflowErrors = expandedValues.values.flatMap { + case o: JsObject => + extractFromJsAs[JsArray](o, "errors").getOrElse(JsArray.empty).elements + case _ => JsArray.empty.elements + } val resp = JsObject( Map( WorkflowMetadataKeys.Id -> JsString(id.toString), WorkflowMetadataKeys.Status -> JsString(status), "currency" -> JsString("USD"), - "cost" -> JsNumber(totalCost) + "cost" -> JsNumber(callCost + subworkflowCost), + "errors" -> JsArray(costErrors ++ subworkflowErrors) ) ) From f5b5b9b7be6b3b072e06f4e5a34b2dfa4cbec405 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 11 Sep 2024 18:25:48 -0400 Subject: [PATCH 07/39] More better error handling --- .../metadata/impl/ReadDatabaseMetadataWorkerActor.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala index 932e54ee69d..68bbc594356 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/ReadDatabaseMetadataWorkerActor.scala @@ -177,10 +177,10 @@ class ReadDatabaseMetadataWorkerActor(metadataReadTimeout: Duration, metadataRea (s, m) match { case (Some(wfState), resp: MetadataLookupResponse) => CostResponse(id, wfState, resp) - // TODO better error handling for other metadata lookup results - case (Some(_), _) => CostFailure(id, new Exception("TODO")) - // TODO should this be a failure? - case (None, _) => CostFailure(id, new Exception("Couldn't find workflow status")) + case (None, resp: MetadataLookupResponse) => + // See note in getStatus above - if we can't find status, it's Submitted + CostResponse(id, WorkflowSubmitted, resp) + case (_, errorMetadataResponse) => errorMetadataResponse } } recover { case t => CostFailure(id, t) From 520ff4ac01ce7fccf979d74b8e39a6c0e34fcf36 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Thu, 12 Sep 2024 09:29:12 -0400 Subject: [PATCH 08/39] Remove dead code --- .../metadata/impl/MetadataDatabaseAccess.scala | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala index cb596d0153a..e0b547370d3 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/MetadataDatabaseAccess.scala @@ -269,19 +269,6 @@ trait MetadataDatabaseAccess { metadataToMetadataEvents(id) } - def queryCost(id: WorkflowId, timeout: Duration)(implicit - ec: ExecutionContext - ): Future[Seq[MetadataEvent]] = { - val keys = List(CallMetadataKeys.VmStartTime, - CallMetadataKeys.VmEndTime, - CallMetadataKeys.VmCostPerHour, - CallMetadataKeys.SubWorkflowId - ) - metadataDatabaseInterface - .queryMetadataEntryWithKeyConstraints(id.toString, keys, List.empty, CallOrWorkflowQuery, timeout) - .map(metadataToMetadataEvents(id)) - } - def refreshWorkflowMetadataSummaries(limit: Int)(implicit ec: ExecutionContext): Future[SummaryResult] = for { increasingProcessed <- metadataDatabaseInterface.summarizeIncreasing( From 8baf04350cd519b4f1d8fb05bd60f9fba61a1242 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 13 Sep 2024 11:18:52 -0400 Subject: [PATCH 09/39] Interpret BigDecimal as a number in metadata --- .../main/scala/cromwell/services/metadata/MetadataQuery.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/src/main/scala/cromwell/services/metadata/MetadataQuery.scala b/services/src/main/scala/cromwell/services/metadata/MetadataQuery.scala index c88a142605b..a8cf1a6fca7 100644 --- a/services/src/main/scala/cromwell/services/metadata/MetadataQuery.scala +++ b/services/src/main/scala/cromwell/services/metadata/MetadataQuery.scala @@ -62,7 +62,7 @@ object MetadataValue { case WomOptionalValue(_, None) => new MetadataValue("", MetadataNull) case value: WomValue => new MetadataValue(value.valueString, MetadataString) case _: Int | Long | _: java.lang.Long | _: java.lang.Integer => new MetadataValue(value.toString, MetadataInt) - case _: Double | Float | _: java.lang.Double | _: java.lang.Float => + case _: Double | Float | _: BigDecimal | _: java.lang.Double | _: java.lang.Float => new MetadataValue(value.toString, MetadataNumber) case _: Boolean | _: java.lang.Boolean => new MetadataValue(value.toString, MetadataBoolean) case offsetDateTime: OffsetDateTime => new MetadataValue(offsetDateTime.toUtcMilliString, MetadataString) From 39f6d3d81f5324f2091aa2f6f4f43643693b7548 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 13 Sep 2024 11:19:06 -0400 Subject: [PATCH 10/39] MetadataBuilderActor unit tests --- .../webservice/MetadataBuilderActorSpec.scala | 223 +++++++++++++++++- 1 file changed, 215 insertions(+), 8 deletions(-) diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 9075060add6..343f72a6977 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -1,7 +1,8 @@ package cromwell.webservice -import java.time.{OffsetDateTime, ZoneOffset} +import akka.actor.Props +import java.time.{OffsetDateTime, ZoneOffset} import akka.pattern.ask import akka.testkit._ import akka.util.Timeout @@ -9,6 +10,7 @@ import cromwell.core._ import cromwell.services._ import cromwell.services.metadata.MetadataService._ import cromwell.services.metadata._ +import cromwell.services.metadata.impl.ReadDatabaseMetadataWorkerActor import cromwell.services.metadata.impl.builder.MetadataBuilderActor import cromwell.util.AkkaTestUtil.EnhancedTestProbe import cromwell.webservice.MetadataBuilderActorSpec._ @@ -18,6 +20,7 @@ import org.scalatest.prop.TableDrivenPropertyChecks import org.scalatest.{Assertion, Succeeded} import spray.json._ +import java.util.UUID import scala.concurrent.Future import scala.concurrent.duration._ import scala.util.Random @@ -181,24 +184,92 @@ class MetadataBuilderActorSpec assertMetadataResponse(queryAction, mdQuery, events, expectedRes, metadataBuilderActorName) } - it should "build workflow cost response" in { - // Note that this is testing dummy behavior and should be updated and expanded when we add real logic + it should "build a basic workflow cost response" in { + val workflowId = WorkflowId.randomId() + val workflowState = WorkflowSucceeded + val events = Seq( + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmStartTime), + MetadataValue("2023-10-30T09:00:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmEndTime), + MetadataValue("2023-10-30T11:00:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmCostPerHour), + MetadataValue(3.5) + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmStartTime), + MetadataValue("2023-10-30T11:01:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmEndTime), + MetadataValue("2023-10-30T12:31:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmCostPerHour), + MetadataValue(0.5) + ) + ) + val query = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) + + val expectedRes = + s"""{ + |"cost": 7.75, + |"currency": "USD", + |"id": "${workflowId}", + |"status": "${workflowState.toString}", + |"errors": [] + |}""".stripMargin + + val action = GetCost(workflowId) + + val mockReadMetadataWorkerActor = TestProbe("mockReadMetadataWorkerActor") + def readMetadataWorkerMaker = () => mockReadMetadataWorkerActor.props + + val mba = system.actorOf( + props = MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000), + name = "mba-cost-builder" + ) + + val response = mba.ask(action).mapTo[MetadataJsonResponse] + mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) + mockReadMetadataWorkerActor.reply( + CostResponse(workflowId, workflowState, MetadataLookupResponse(query, events)) + ) + response map { r => r shouldBe a[SuccessfulMetadataJsonResponse] } + response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } + } + + it should "build a workflow cost response with an error" in { val workflowId = WorkflowId.randomId() val workflowState = WorkflowSucceeded val events = Seq( - MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), "NOT_CHECKED"), - MetadataValue("NOT_CHECKED") + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmStartTime), + MetadataValue("2023-10-30T09:00:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmEndTime), + MetadataValue("2023-10-30T11:00:00Z") + ), + MetadataEvent( + MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmCostPerHour), + MetadataValue(-1) // indicates an error when computing vmCostPerHour ), - MetadataEvent(MetadataKey(workflowId, None, "status"), MetadataValue("Succeeded")) + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmStartTime), + MetadataValue("2023-10-30T11:01:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmEndTime), + MetadataValue("2023-10-30T12:31:00Z") + ), + MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmCostPerHour), + MetadataValue(0.5) + ) ) val query = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) val expectedRes = s"""{ - |"cost": 3.5, + |"cost": 0.75, |"currency": "USD", |"id": "${workflowId}", - |"status": "${workflowState.toString}" + |"status": "${workflowState.toString}", + |"errors": ["Couldn't find valid vmCostPerHour for callA.-1.1"] |}""".stripMargin val action = GetCost(workflowId) @@ -220,6 +291,142 @@ class MetadataBuilderActorSpec response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } } + it should "build an error workflow cost response" in { + val workflowId = WorkflowId.randomId() + + val action = GetCost(workflowId) + + val expectedException = new Exception("Oh nooooo :(") + + val mockReadMetadataWorkerActor = TestProbe("mockReadMetadataWorkerActor") + def readMetadataWorkerMaker = () => mockReadMetadataWorkerActor.props + + val mba = system.actorOf( + props = MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000), + name = "mba-cost-builder" + ) + + val response = mba.ask(action).mapTo[MetadataJsonResponse] + mockReadMetadataWorkerActor.expectMsg(defaultTimeout, action) + mockReadMetadataWorkerActor.reply( + CostFailure(workflowId, expectedException) + ) + response map { r => r shouldBe a[FailedMetadataJsonResponse] } + response.mapTo[FailedMetadataJsonResponse] map { b => + b.reason.getClass shouldBe expectedException.getClass + b.reason.getMessage shouldBe expectedException.getMessage + } + } + + def costMetadataEvents(wfId: WorkflowId, + callName: String, + shardIndex: Option[Int] = None, + attempt: Int = 1, + costPerHour: BigDecimal = BigDecimal(1) + ) = List( + MetadataEvent( + MetadataKey(wfId, Option(MetadataJobKey(callName, shardIndex, attempt)), CallMetadataKeys.VmStartTime), + MetadataValue("2023-10-30T09:00:00Z") + ), + MetadataEvent(MetadataKey(wfId, Option(MetadataJobKey(callName, shardIndex, attempt)), CallMetadataKeys.VmEndTime), + MetadataValue("2023-10-30T10:00:00Z") + ), + MetadataEvent( + MetadataKey(wfId, Option(MetadataJobKey(callName, shardIndex, attempt)), CallMetadataKeys.VmCostPerHour), + MetadataValue(costPerHour) + ) + ) + + it should "build a cost response with subworkflow data" in { + // hard-coding UUIDs to make test failures easier to diagnose + val mainWorkflowId = WorkflowId(UUID.fromString("00000000-f76d-4af3-b371-5ba580916729")) + val subWorkflow1Id = WorkflowId(UUID.fromString("11111111-f76d-4af3-b371-5ba580916729")) + val subWorkflow2Id = WorkflowId(UUID.fromString("22222222-f76d-4af3-b371-5ba580916729")) + val subWorkflow1aId = WorkflowId(UUID.fromString("1a1a1a1a-f76d-4af3-b371-5ba580916729")) + val subWorkflow1bId = WorkflowId(UUID.fromString("1b1b1b1b-f76d-4af3-b371-5ba580916729")) + + val workflowState = WorkflowSucceeded + + val mainEvents = List( + MetadataEvent(MetadataKey(mainWorkflowId, Option(MetadataJobKey("wfMain", None, 1)), "subWorkflowId"), + MetadataValue(subWorkflow1Id) + ), + MetadataEvent(MetadataKey(mainWorkflowId, Option(MetadataJobKey("wfMain", None, 1)), "subWorkflowId"), + MetadataValue(subWorkflow2Id) + ) + ) ++ + costMetadataEvents(mainWorkflowId, "callA", Option(0), 1) ++ + costMetadataEvents(mainWorkflowId, "callA", Option(1), 1) + + val sub1Events = List( + MetadataEvent(MetadataKey(subWorkflow1Id, Option(MetadataJobKey("wfSub1", None, 1)), "subWorkflowId"), + MetadataValue(subWorkflow1aId) + ), + MetadataEvent(MetadataKey(subWorkflow1Id, Option(MetadataJobKey("wfSub1", None, 1)), "subWorkflowId"), + MetadataValue(subWorkflow1bId) + ) + ) + + val sub2Events = + costMetadataEvents(subWorkflow2Id, "call2A") ++ + costMetadataEvents(subWorkflow2Id, "call2B") ++ + costMetadataEvents(subWorkflow2Id, "call2C") + val sub1aEvents = costMetadataEvents(subWorkflow1aId, "call1aA", costPerHour = BigDecimal(-1)) + val sub1bEvents = + costMetadataEvents(subWorkflow1bId, "call1bA", attempt = 1) ++ + costMetadataEvents(subWorkflow1bId, "call1bA", attempt = 2) + + val mainQuery = MetadataQuery(mainWorkflowId, None, None, None, None, expandSubWorkflows = true) + val sub1Query = MetadataQuery(subWorkflow1Id, None, None, None, None, expandSubWorkflows = true) + val sub2Query = MetadataQuery(subWorkflow2Id, None, None, None, None, expandSubWorkflows = true) + val sub1aQuery = MetadataQuery(subWorkflow1aId, None, None, None, None, expandSubWorkflows = true) + val sub1bQuery = MetadataQuery(subWorkflow1bId, None, None, None, None, expandSubWorkflows = true) + val mainAction = GetCost(mainWorkflowId) + + // Mock ReadDatabaseMetadataWorkerActor to return the expected metadata results for each query. + // Would normally have done this with expect/reply on a TestProbe, but that required the messages to + // be sent in a deterministic order, which is not the case here. + class TestReadDatabaseMetadataWorkerActorForCost extends ReadDatabaseMetadataWorkerActor(defaultTimeout, 1000000) { + override def receive: Receive = { + case GetCost(wfId) if wfId == mainWorkflowId => + sender() ! CostResponse(mainWorkflowId, workflowState, MetadataLookupResponse(mainQuery, mainEvents)) + () + case GetCost(wfId) if wfId == subWorkflow1Id => + sender() ! CostResponse(subWorkflow1Id, workflowState, MetadataLookupResponse(sub1Query, sub1Events)) + () + case GetCost(wfId) if wfId == subWorkflow2Id => + sender() ! CostResponse(subWorkflow2Id, workflowState, MetadataLookupResponse(sub2Query, sub2Events)) + () + case GetCost(wfId) if wfId == subWorkflow1aId => + sender() ! CostResponse(subWorkflow1aId, workflowState, MetadataLookupResponse(sub1aQuery, sub1aEvents)) + () + case GetCost(wfId) if wfId == subWorkflow1bId => + sender() ! CostResponse(subWorkflow1bId, workflowState, MetadataLookupResponse(sub1bQuery, sub1bEvents)) + () + case _ => () + } + } + + val expectedRes = + s"""{ + |"cost": 7, + |"currency": "USD", + |"id": "${mainWorkflowId}", + |"status": "${workflowState.toString}", + |"errors": ["Couldn't find valid vmCostPerHour for call1aA.-1.1"] + |}""".stripMargin + + def readMetadataWorkerMaker = () => Props(new TestReadDatabaseMetadataWorkerActorForCost) + val mba = system.actorOf( + props = MetadataBuilderActor.props(readMetadataWorkerMaker, 1000000), + name = "mba-cost-builder" + ) + + val response = mba.ask(mainAction).mapTo[MetadataJsonResponse] + response map { r => r shouldBe a[SuccessfulMetadataJsonResponse] } + response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } + } + it should "build the call list for failed tasks when prompted" in { def makeEvent(workflow: WorkflowId, key: Option[MetadataJobKey]) = From e5a51ae32b9e7b6820d2d4b771b9225e091ae22e Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 13 Sep 2024 11:26:54 -0400 Subject: [PATCH 11/39] Standardize on BigD --- .../scala/cromwell/webservice/MetadataBuilderActorSpec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 343f72a6977..b4e6973ed10 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -249,7 +249,7 @@ class MetadataBuilderActorSpec ), MetadataEvent( MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmCostPerHour), - MetadataValue(-1) // indicates an error when computing vmCostPerHour + MetadataValue(BigDecimal(-1)) // indicates an error when computing vmCostPerHour ), MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmStartTime), MetadataValue("2023-10-30T11:01:00Z") @@ -258,7 +258,7 @@ class MetadataBuilderActorSpec MetadataValue("2023-10-30T12:31:00Z") ), MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmCostPerHour), - MetadataValue(0.5) + MetadataValue(BigDecimal(0.5)) ) ) val query = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) From ceb077c18299f83e0f586b64c7a6dda7b98c0766 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 13 Sep 2024 11:29:25 -0400 Subject: [PATCH 12/39] Oops there's more --- .../scala/cromwell/webservice/MetadataBuilderActorSpec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index b4e6973ed10..6be8138bbfd 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -195,7 +195,7 @@ class MetadataBuilderActorSpec MetadataValue("2023-10-30T11:00:00Z") ), MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callA", None, 1)), CallMetadataKeys.VmCostPerHour), - MetadataValue(3.5) + MetadataValue(BigDecimal(3.5)) ), MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmStartTime), MetadataValue("2023-10-30T11:01:00Z") @@ -204,7 +204,7 @@ class MetadataBuilderActorSpec MetadataValue("2023-10-30T12:31:00Z") ), MetadataEvent(MetadataKey(workflowId, Option(MetadataJobKey("callB", None, 1)), CallMetadataKeys.VmCostPerHour), - MetadataValue(0.5) + MetadataValue(BigDecimal(0.5)) ) ) val query = MetadataQuery(workflowId, None, None, None, None, expandSubWorkflows = false) From cbacb807a9878c40734642d39bde51ab977bfb2d Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 16 Sep 2024 17:27:07 -0400 Subject: [PATCH 13/39] More tests --- .../webservice/MetadataBuilderActorSpec.scala | 49 ++++++++++ .../impl/builder/MetadataBuilderActor.scala | 95 ++++++++++--------- 2 files changed, 101 insertions(+), 43 deletions(-) diff --git a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala index 6be8138bbfd..02ffaefa3b7 100644 --- a/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala +++ b/engine/src/test/scala/cromwell/webservice/MetadataBuilderActorSpec.scala @@ -6,6 +6,7 @@ import java.time.{OffsetDateTime, ZoneOffset} import akka.pattern.ask import akka.testkit._ import akka.util.Timeout +import cats.implicits.catsSyntaxValidatedId import cromwell.core._ import cromwell.services._ import cromwell.services.metadata.MetadataService._ @@ -21,6 +22,7 @@ import org.scalatest.{Assertion, Succeeded} import spray.json._ import java.util.UUID +import scala.BigDecimal import scala.concurrent.Future import scala.concurrent.duration._ import scala.util.Random @@ -427,6 +429,53 @@ class MetadataBuilderActorSpec response.mapTo[SuccessfulMetadataJsonResponse] map { b => b.responseJson shouldBe expectedRes.parseJson } } + it should "compute cost for calls" in { + val costJsValRows = Table( + ("jsInput", "expected"), + ("""{"attempt": 1, "shardIndex": -1, "vmStartTime": "2023-10-30T04:00:00Z", "vmEndTime": "2023-10-30T05:00:00Z", "vmCostPerHour": 0.0567}""", + BigDecimal(0.0567).validNel + ), + ("""{"attempt": 1, "shardIndex": -1, "vmStartTime": "2023-10-30T04:00:00Z", "vmEndTime": "2023-10-30T04:30:00Z", "vmCostPerHour": 0.0567}""", + BigDecimal(0.02835).validNel + ), + ("""{"attempt": 1, "shardIndex": -1, "vmEndTime": "2023-10-30T05:00:00Z", "vmCostPerHour": 0.0567}""", + BigDecimal(0).validNel + ), + ("""{"attempt": 1, "shardIndex": -1, "vmEndTime": "2023-10-30T05:00:00Z", "vmEndTime": "2023-10-30T04:30:00Z"}""", + BigDecimal(0).validNel + ), + (s"""{"attempt": 1, "shardIndex": -1, "vmStartTime": "2023-10-30 05:00:00Z", "vmCostPerHour": 0.0567}""", + "Text '2023-10-30 05:00:00Z' could not be parsed at index 10".invalidNel + ), + (s"""{"attempt": 1, "shardIndex": -1, "vmStartTime": "2023-10-30T05:00:00Z", "vmEndTime": "2023-10-30T04:30:00Z", "vmCostPerHour": -1}""", + "Couldn't find valid vmCostPerHour for foo.-1.1".invalidNel + ) + ) + + forAll(costJsValRows) { case (jsInput: String, expected) => + val jsVal = jsInput.parseJson + val retVal = MetadataBuilderActor.computeCost("foo", jsVal) + retVal shouldBe expected + } + } + + it should "compute cost for calls that are still running" in { + val inputJsVal = + s"""{ + |"attempt": 1, + |"shardIndex": -1, + |"vmStartTime": "${OffsetDateTime.now().minusMinutes(5)}", + |"vmCostPerHour": 12 + |}""".stripMargin.parseJson + + val retVal = MetadataBuilderActor.computeCost("foo", inputJsVal) + + // Just test that the cost is approximately 1 - we don't know + // exactly how long the test takes to run + retVal.getOrElse(BigDecimal(0)) should be > BigDecimal(0.9) + retVal.getOrElse(BigDecimal(100)) should be < BigDecimal(1.1) + } + it should "build the call list for failed tasks when prompted" in { def makeEvent(workflow: WorkflowId, key: Option[MetadataJobKey]) = diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index 048c353910a..13ed99ab227 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -79,6 +79,12 @@ object MetadataBuilderActor { .flatten .distinct + private def extractFromJsAs[A: Manifest](js: JsObject, fieldName: String): Option[A] = + js.fields.get(fieldName) match { + case Some(a: A) => Some(a) + case _ => None + } + private def eventsToAttemptMetadata( subWorkflowMetadata: Map[String, JsValue] )(attempt: Int, events: Seq[MetadataEvent]) = { @@ -301,6 +307,49 @@ object MetadataBuilderActor { .parseWorkflowEvents(includeCallsIfEmpty, expandedValues)(eventsList) .fields + ("id" -> JsString(workflowId.toString)) ) + + /* + * Attempt to the cost of a single call attempt from its metadata json, assuming the correct metadata + * exists to allow that. We depend on vmStartTime and vmCostPerHour being present. We also use vmEndTime, + * or fall back to the current time if it is absent. + * + * If the metadata needed to compute cost is missing, return 0. If the VM cost or any of the dates + * can't be parsed, return an error. We will also return an error if we find a negative value for + * vmCostPerHour - this indicates an error when generating the cost. + */ + def computeCost(callName: String, jsVal: JsValue): ErrorOr[BigDecimal] = + jsVal match { + case jsCall: JsObject => + extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmStartTime) map { startTimeVal => + val startTimeErrorOr = ErrorOr(OffsetDateTime.parse(startTimeVal.value)) + + val endTimeErrorOr = + extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmEndTime) + .map(v => ErrorOr(OffsetDateTime.parse(v.value))) + .getOrElse(OffsetDateTime.now().validNel) + + val rawCostPerHour = extractFromJsAs[JsNumber](jsCall, CallMetadataKeys.VmCostPerHour).map(_.value) + val costPerHourErrorOr = rawCostPerHour match { + case Some(c: BigDecimal) if c >= 0 => c.validNel + // A costPerHour < 0 indicates an error + case Some(_: BigDecimal) => + val index = + extractFromJsAs[JsNumber](jsCall, "shardIndex").map(_.value.toString).getOrElse("-1") + val attempt = + extractFromJsAs[JsNumber](jsCall, "attempt").map(_.value.toString).getOrElse("1") + s"Couldn't find valid vmCostPerHour for ${List(callName, index, attempt).mkString(".")}".invalidNel + case None => BigDecimal(0).validNel + } + + for { + start <- startTimeErrorOr + end <- endTimeErrorOr + costPerHour <- costPerHourErrorOr + vmRuntimeInMillis = start.until(end, ChronoUnit.MILLIS).toDouble + } yield (vmRuntimeInMillis / (1000 * 60 * 60)) * costPerHour + } getOrElse (BigDecimal(0).validNel) + case _ => BigDecimal(0).validNel + } } class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, @@ -526,7 +575,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, val subWorkflowIds = extractSubworkflowIds(metadataResponse.eventList) if (subWorkflowIds.isEmpty) - // If no workflows found, just build cost data + // If no subworkflows found, just build cost data buildCostAndStop(id, status.toString, metadataResponse.eventList, Map.empty, target, originalRequest) else { // Otherwise spin up a metadata builder actor for each sub workflow @@ -549,47 +598,6 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, } } - private def extractFromJsAs[A: Manifest](js: JsObject, fieldName: String): Option[A] = - js.fields.get(fieldName) match { - case Some(a: A) => Some(a) - case _ => None - } - - private def computeCost(callName: String, jsVal: JsValue): ErrorOr[BigDecimal] = - jsVal match { - case jsCall: JsObject => - extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmStartTime) map { startTimeVal => - val startTimeErrorOr = ErrorOr(OffsetDateTime.parse(startTimeVal.value)) - - val endTimeErrorOr = - extractFromJsAs[JsString](jsCall, CallMetadataKeys.VmEndTime) - .map(v => ErrorOr(OffsetDateTime.parse(v.value))) - .getOrElse(OffsetDateTime.now().validNel) - - val rawCostPerHour = extractFromJsAs[JsNumber](jsCall, CallMetadataKeys.VmCostPerHour).map(_.value) - val costPerHourErrorOr = rawCostPerHour match { - case Some(c: BigDecimal) if c >= 0 => c.validNel - // A costPerHour < 0 indicates an error - case Some(_: BigDecimal) => - val index = - extractFromJsAs[JsNumber](jsCall, "shardIndex").map(_.value.toString).getOrElse("-1") - val attempt = - extractFromJsAs[JsNumber](jsCall, "attempt").map(_.value.toString).getOrElse("1") - s"Couldn't find valid vmCostPerHour for ${List(callName, index, attempt).mkString(".")}".invalidNel - case None => BigDecimal(0).validNel - } - - for { - start <- startTimeErrorOr - end <- endTimeErrorOr - costPerHour <- costPerHourErrorOr - vmRuntimeInMillis = start.until(end, ChronoUnit.MILLIS).toDouble - } yield (vmRuntimeInMillis / (1000 * 60 * 60)) * costPerHour - } getOrElse (BigDecimal(0).validNel) - // TODO is millis the right thing to use here? Should it be seconds? - case _ => BigDecimal(0).validNel - } - def buildCostAndStop(id: WorkflowId, status: String, eventsList: Seq[MetadataEvent], @@ -600,7 +608,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, val metadataEvents = MetadataBuilderActor.parse(groupEvents(eventsList), expandedValues) - // Walk the structured metadata to attempt to compute cost for each call in this workflow + // Walk the structured metadata to attempt to compute cost for each call/shard/attempt in this workflow val callCostsWithErrors: List[ErrorOr[BigDecimal]] = for { wfObj <- extractFromJsAs[JsObject](metadataEvents, id.toString).toList callsObj <- extractFromJsAs[JsObject](wfObj, "calls").toList @@ -617,6 +625,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, .map(JsString(_)) .toVector + // Get the cost and error collection for any subworkflows that are part of this workflow val subworkflowCost = expandedValues.values.map { case o: JsObject => extractFromJsAs[JsNumber](o, "cost").map(_.value).getOrElse(BigDecimal(0)) From 984e50e603608f4663a0447de32f9a4ebadcc3bc Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Tue, 17 Sep 2024 13:11:51 -0400 Subject: [PATCH 14/39] Use a real currency --- .../metadata/impl/builder/MetadataBuilderActor.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala index 13ed99ab227..c337676e5b7 100644 --- a/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala +++ b/services/src/main/scala/cromwell/services/metadata/impl/builder/MetadataBuilderActor.scala @@ -20,6 +20,7 @@ import org.slf4j.LoggerFactory import spray.json._ import java.time.temporal.ChronoUnit +import java.util.Currency import scala.language.postfixOps object MetadataBuilderActor { @@ -58,6 +59,8 @@ object MetadataBuilderActor { private val AttemptKey = "attempt" private val ShardKey = "shardIndex" + final private val DefaultCurrency = Currency.getInstance("USD") + /** * Metadata for a call attempt */ @@ -642,7 +645,7 @@ class MetadataBuilderActor(readMetadataWorkerMaker: () => Props, Map( WorkflowMetadataKeys.Id -> JsString(id.toString), WorkflowMetadataKeys.Status -> JsString(status), - "currency" -> JsString("USD"), + "currency" -> JsString(DefaultCurrency.getCurrencyCode), "cost" -> JsNumber(callCost + subworkflowCost), "errors" -> JsArray(costErrors ++ subworkflowErrors) ) From c044228de00625f05bd770b3857867af3dca4641 Mon Sep 17 00:00:00 2001 From: Tom Wiseman Date: Wed, 18 Sep 2024 13:26:29 -0400 Subject: [PATCH 15/39] the gist --- .../PollResultMonitorActor.scala | 22 +++++ core/src/main/resources/reference.conf | 2 +- .../services/cost/GcpCostCatalogService.scala | 83 +++++++++++++++++-- .../services/cost/GcpCostCatalogTypes.scala | 50 +++++++++++ .../cost/GcpCostCatalogServiceSpec.scala | 8 +- .../actors/BatchPollResultMonitorActor.scala | 11 +-- .../common/PapiPollResultMonitorActor.scala | 18 ++-- .../pipelines/common/api/RunStatus.scala | 32 ++++--- .../pipelines/common/CostLookupSpec.scala | 62 ++++++++++++++ .../v2beta/api/request/ErrorReporter.scala | 2 +- .../api/request/GetRequestHandler.scala | 82 +++++++++--------- 11 files changed, 294 insertions(+), 78 deletions(-) create mode 100644 supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index d5b8110d5dc..43d7132f999 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -9,6 +9,7 @@ import cromwell.backend.validation.{ ValidatedRuntimeAttributes } import cromwell.core.logging.JobLogger +import cromwell.services.cost.{GcpCostLookupRequest, GcpCostLookupResponse, InstantiatedVmInfo} import cromwell.services.metadata.CallMetadataKeys import cromwell.services.metrics.bard.BardEventing.BardEventRequest import cromwell.services.metrics.bard.model.TaskSummaryEvent @@ -42,6 +43,9 @@ trait PollResultMonitorActor[PollResultType] extends Actor { // Time that the user VM started spending money. def extractStartTimeFromRunState(pollStatus: PollResultType): Option[OffsetDateTime] + // Used to kick off a cost calculation + def extractVmInfoFromRunState(pollStatus: PollResultType): Option[InstantiatedVmInfo] + // Time that the user VM stopped spending money. def extractEndTimeFromRunState(pollStatus: PollResultType): Option[OffsetDateTime] @@ -99,6 +103,7 @@ trait PollResultMonitorActor[PollResultType] extends Actor { Option.empty private var vmStartTime: Option[OffsetDateTime] = Option.empty private var vmEndTime: Option[OffsetDateTime] = Option.empty + private var vmCostPerHour: Option[BigDecimal] = Option.empty def processPollResult(pollStatus: PollResultType): Unit = { // Make sure jobStartTime remains the earliest event time ever seen @@ -122,6 +127,16 @@ trait PollResultMonitorActor[PollResultType] extends Actor { tellMetadata(Map(CallMetadataKeys.VmEndTime -> end)) } } + // If we don't yet have a cost per hour and we can extract VM info, send a cost request to the catalog service. + // We expect it to reply with an answer, which is handled in receive. + // NB: Due to the nature of async code, we may send a few cost requests before we get a response back. + if (vmCostPerHour.isEmpty) { + val instantiatedVmInfo = extractVmInfoFromRunState(pollStatus) + instantiatedVmInfo.foreach { vmInfo => + val request = GcpCostLookupRequest(vmInfo, self) + params.serviceRegistry ! request + } + } } // When a job finishes, the bard actor needs to know about the timing in order to record metrics. @@ -135,4 +150,11 @@ trait PollResultMonitorActor[PollResultType] extends Actor { vmEndTime = vmEndTime.getOrElse(OffsetDateTime.now()) ) ) + + def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = { + if (vmCostPerHour.isDefined) { return } // Optimization to avoid processing responses after we've stopped caring. + val cost = costLookupResponse.calculatedCost.getOrElse(BigDecimal(-1)) // TODO: better logging here. + vmCostPerHour = Option(cost) + tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> vmCostPerHour)) + } } diff --git a/core/src/main/resources/reference.conf b/core/src/main/resources/reference.conf index 465f43ad7a1..7b776c87e4e 100644 --- a/core/src/main/resources/reference.conf +++ b/core/src/main/resources/reference.conf @@ -607,7 +607,7 @@ services { } } - CostCatalogService { + GcpCostCatalogService { class = "cromwell.services.cost.GcpCostCatalogService" config { catalogExpirySeconds = 86400 diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 35012b7e424..7878c146b97 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -1,22 +1,30 @@ package cromwell.services.cost import akka.actor.{Actor, ActorRef} +import com.google.`type`.Money import com.google.cloud.billing.v1._ import com.typesafe.config.Config import com.typesafe.scalalogging.LazyLogging import common.util.StringUtil.EnhancedToStringable +import cromwell.services.ServiceRegistryActor.ServiceRegistryMessage import cromwell.services.cost.GcpCostCatalogService.{COMPUTE_ENGINE_SERVICE_NAME, DEFAULT_CURRENCY_CODE} import cromwell.util.GracefulShutdownHelper.ShutdownCommand import java.time.{Duration, Instant} import scala.jdk.CollectionConverters.IterableHasAsScala import java.time.temporal.ChronoUnit.SECONDS +import scala.util.{Failure, Success, Try} case class CostCatalogKey(machineType: Option[MachineType], usageType: Option[UsageType], machineCustomization: Option[MachineCustomization], - resourceGroup: Option[ResourceGroup] + resourceGroup: Option[ResourceGroup], + region: String ) +case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { + override def serviceName: String = GcpCostCatalogService.getClass.getSimpleName +} +case class GcpCostLookupResponse(calculatedCost: Option[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) case class ExpiringGcpCostCatalog(catalog: Map[CostCatalogKey, CostCatalogValue], fetchTime: Instant) @@ -88,21 +96,78 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service * Ideally, we don't want to have an entire, unprocessed, cost catalog in memory at once since it's ~20MB. */ private def processCostCatalog(skus: Iterable[Sku]): Map[CostCatalogKey, CostCatalogValue] = - // TODO: Account for key collisions (same key can be in multiple regions) // TODO: reduce memory footprint of returned map (don't store entire SKU object) skus.foldLeft(Map.empty[CostCatalogKey, CostCatalogValue]) { case (acc, sku) => - acc + convertSkuToKeyValuePair(sku) + acc ++ convertSkuToKeyValuePairs(sku) + } + + private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = { + val allAvailableRegions = sku.getServiceRegionsList.asScala.toList + allAvailableRegions.map(region => + CostCatalogKey( + machineType = MachineType.fromSku(sku), + usageType = UsageType.fromSku(sku), + machineCustomization = MachineCustomization.fromSku(sku), + resourceGroup = ResourceGroup.fromSku(sku), + region = region + ) -> CostCatalogValue(sku) + ) + } + + // See: https://cloud.google.com/billing/v1/how-tos/catalog-api + private def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { + val pricingInfo = getMostRecentPricingInfo(cpuSku) + val usageUnit = pricingInfo.getPricingExpression.getUsageUnit + if (usageUnit != "h") { + return Failure(new UnsupportedOperationException(s"Expected usage units of CPUs to be 'h'. Got ${usageUnit}")) } + // Price per hour of a single core + // NB: Ignoring "TieredRates" here (the idea that stuff gets cheaper the more you use). + // Technically, we should write code that determines which tier(s) to use. + // In practice, from what I've seen, CPU cores and RAM don't have more than a single tier. + val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice + val costPerCorePerHour: BigDecimal = + costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + Success(costPerCorePerHour * coreCount) + } + + private def calculateRamPricePerHour(ramSku: Sku, ramMbCount: Int): Try[BigDecimal] = + // TODO + Success(ramMbCount.toLong * 0.25) - private def convertSkuToKeyValuePair(sku: Sku): (CostCatalogKey, CostCatalogValue) = CostCatalogKey( - machineType = MachineType.fromSku(sku), - usageType = UsageType.fromSku(sku), - machineCustomization = MachineCustomization.fromSku(sku), - resourceGroup = ResourceGroup.fromSku(sku) - ) -> CostCatalogValue(sku) + private def getMostRecentPricingInfo(sku: Sku): PricingInfo = { + val mostRecentPricingInfoIndex = sku.getPricingInfoCount - 1 + sku.getPricingInfo(mostRecentPricingInfoIndex) + } + + private def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): Try[BigDecimal] = { + val machineType = MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType) + val usageType = UsageType.fromBoolean(instantiatedVmInfo.preemptible) + val machineCustomization = MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType) + val region = instantiatedVmInfo.region + val coreCount = MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) + val ramMbCount = MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) + + val cpuResourceGroup = Cpu // TODO: Investigate the situation in which the resource group is n1 + val cpuKey = + CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceGroup), region) + val cpuSku = getSku(cpuKey) + val cpuCost = cpuSku.map(sku => calculateCpuPricePerHour(sku.catalogObject, coreCount.get)) // TODO .get + + val ramResourceGroup = Ram + val ramKey = + CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceGroup), region) + val ramSku = getSku(ramKey) + val ramCost = ramSku.map(sku => calculateRamPricePerHour(sku.catalogObject, ramMbCount.get)) // TODO .get + Success(cpuCost.get.get + ramCost.get.get) + } def serviceRegistryActor: ActorRef = serviceRegistry override def receive: Receive = { + case GcpCostLookupRequest(vmInfo, replyTo) => + val calculatedCost = calculateVmCostPerHour(vmInfo).toOption + val response = GcpCostLookupResponse(calculatedCost) + replyTo ! response case ShutdownCommand => googleClient.foreach(client => client.shutdownNow()) context stop self diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index 7507560c810..57fed6d888c 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -2,6 +2,13 @@ package cromwell.services.cost import com.google.cloud.billing.v1.Sku +import java.util.regex.{Matcher, Pattern} +import scala.util.{Failure, Success, Try} + +/* + * Case class that contains information retrieved from Google about a VM that cromwell has started + */ +case class InstantiatedVmInfo(region: String, machineType: String, preemptible: Boolean) /* * These types reflect hardcoded strings found in a google cost catalog. */ @@ -13,6 +20,32 @@ object MachineType { else if (tokenizedDescription.contains(N2d.machineTypeName)) Some(N2d) else Option.empty } + + // expects a string that looks something like "n1-standard-1" or "custom-1-4096" + def fromGoogleMachineTypeString(machineTypeString: String): Option[MachineType] = + if (machineTypeString.startsWith("n1")) Some(N1) + else if (machineTypeString.startsWith("n2d")) Some(N2d) + else if (machineTypeString.startsWith("n2")) Some(N2) + else if (machineTypeString.startsWith("custom")) + None // TODO: should this be n1? Make a 'custom' type? Combine with MachineCustomization? + else { + println(s"Error: Unrecognized machine type: $machineTypeString") + None + } + + def extractCoreCountFromMachineTypeString(machineTypeString: String): Try[Int] = { + val pattern: Pattern = Pattern.compile("-(\\d+)") + val matcher: Matcher = pattern.matcher(machineTypeString) + if (matcher.find()) { + Success(matcher.group(1).toInt) + } else { + Failure(new IllegalArgumentException(s"Could not extract core count from ${machineTypeString}")) + } + } + + def extractRamMbFromMachineTypeString(machineTypeString: String): Try[Int] = + // TODO + Success(4096) } sealed trait MachineType { def machineTypeName: String } case object N1 extends MachineType { override val machineTypeName = "N1" } @@ -26,12 +59,23 @@ object UsageType { case Preemptible.typeName => Some(Preemptible) case _ => Option.empty } + def fromBoolean(isPreemptible: Boolean): UsageType = isPreemptible match { + case true => Preemptible + case false => OnDemand + } + } sealed trait UsageType { def typeName: String } case object OnDemand extends UsageType { override val typeName = "OnDemand" } case object Preemptible extends UsageType { override val typeName = "Preemptible" } object MachineCustomization { + // TODO: I think this is right but I am not 100% sure. Needs testing. + // Do custom machine types always have the word "custom"? + // Does Cromwell ever assign Predefined Machines? + // Is it possible to have a custom machine that *doesn't* contain the word custom? + def fromMachineTypeString(machineTypeString: String): MachineCustomization = + if (machineTypeString.toLowerCase.contains("custom")) Custom else Predefined def fromSku(sku: Sku): Option[MachineCustomization] = { val tokenizedDescription = sku.getDescription.split(" ") if (tokenizedDescription.contains(Predefined.customizationName)) Some(Predefined) @@ -55,4 +99,10 @@ object ResourceGroup { sealed trait ResourceGroup { def groupName: String } case object Cpu extends ResourceGroup { override val groupName = "CPU" } case object Ram extends ResourceGroup { override val groupName = "RAM" } + +// TODO: What is the deal with this? It seems out of place. +// Need to figure out how to reconcile with the Cpu resource group. +// Current theory is that the n1 machines are legacy machines, +// and are therefore categorized differently. +// Unfortunately, N1 is Cromwell's default machine. case object N1Standard extends ResourceGroup { override val groupName = "N1Standard" } diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 19d9e505ac6..2c5298d0cb3 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -75,7 +75,8 @@ class GcpCostCatalogServiceSpec machineType = Some(N2), usageType = Some(Preemptible), machineCustomization = Some(Predefined), - resourceGroup = Some(Cpu) + resourceGroup = Some(Cpu), + region = "europe-west9" ) val freshActor = constructTestActor @@ -103,9 +104,10 @@ class GcpCostCatalogServiceSpec machineType = Some(N2d), usageType = Some(Preemptible), machineCustomization = None, - resourceGroup = Some(Ram) + resourceGroup = Some(Ram), + region = "europe-west9" ) val foundValue = testActorRef.getSku(expectedKey) - foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Instance Ram running in Johannesburg" + foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Instance Ram running in Paris" } } diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala index 8b05bf4057b..0f935b17c79 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala @@ -3,15 +3,10 @@ package cromwell.backend.google.batch.actors import akka.actor.{ActorRef, Props} import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.google.batch.models.RunStatus -import cromwell.backend.standard.pollmonitoring.{ - AsyncJobHasFinished, - PollMonitorParameters, - PollResultMessage, - PollResultMonitorActor, - ProcessThisPollResult -} +import cromwell.backend.standard.pollmonitoring.{AsyncJobHasFinished, PollMonitorParameters, PollResultMessage, PollResultMonitorActor, ProcessThisPollResult} import cromwell.backend.validation.ValidatedRuntimeAttributes import cromwell.core.logging.JobLogger +import cromwell.services.cost.InstantiatedVmInfo import cromwell.services.metadata.CallMetadataKeys import java.time.OffsetDateTime @@ -78,4 +73,6 @@ class BatchPollResultMonitorActor(pollMonitorParameters: PollMonitorParameters) } override def params: PollMonitorParameters = pollMonitorParameters + + override def extractVmInfoFromRunState(pollStatus: RunStatus): Option[InstantiatedVmInfo] = Option.empty //TODO } diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala index 597c0ed8d35..c1bbef4edc2 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala @@ -1,17 +1,12 @@ package cromwell.backend.google.pipelines.common import akka.actor.{ActorRef, Props} -import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.google.pipelines.common.api.RunStatus -import cromwell.backend.standard.pollmonitoring.{ - AsyncJobHasFinished, - PollMonitorParameters, - PollResultMessage, - PollResultMonitorActor, - ProcessThisPollResult -} +import cromwell.backend.standard.pollmonitoring._ import cromwell.backend.validation.ValidatedRuntimeAttributes +import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.core.logging.JobLogger +import cromwell.services.cost.{GcpCostLookupResponse, InstantiatedVmInfo} import cromwell.services.metadata.CallMetadataKeys import java.time.OffsetDateTime @@ -50,7 +45,13 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll case event if event.name == CallMetadataKeys.VmEndTime => event.offsetDateTime } + override def extractVmInfoFromRunState(pollStatus: RunStatus): Option[InstantiatedVmInfo] = + pollStatus.instantiatedVmInfo + + override def params: PollMonitorParameters = parameters + override def receive: Receive = { + case costResponse: GcpCostLookupResponse => handleCostResponse(costResponse) case message: PollResultMessage => message match { case ProcessThisPollResult(pollResult: RunStatus) => processPollResult(pollResult) @@ -75,5 +76,4 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll ) ) } - override def params: PollMonitorParameters = parameters } diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala index 03e49e5c1c1..d5be6707161 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/api/RunStatus.scala @@ -3,20 +3,26 @@ package cromwell.backend.google.pipelines.common.api import _root_.io.grpc.Status import cromwell.backend.google.pipelines.common.PipelinesApiAsyncBackendJobExecutionActor import cromwell.core.ExecutionEvent +import cromwell.services.cost.InstantiatedVmInfo import scala.util.Try - sealed trait RunStatus { def eventList: Seq[ExecutionEvent] def toString: String + + val instantiatedVmInfo: Option[InstantiatedVmInfo] } object RunStatus { - case class Initializing(eventList: Seq[ExecutionEvent]) extends RunStatus { override def toString = "Initializing" } - case class AwaitingCloudQuota(eventList: Seq[ExecutionEvent]) extends RunStatus { + case class Initializing(eventList: Seq[ExecutionEvent], instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty) + extends RunStatus { override def toString = "Initializing" } + case class AwaitingCloudQuota(eventList: Seq[ExecutionEvent], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty + ) extends RunStatus { override def toString = "AwaitingCloudQuota" } - case class Running(eventList: Seq[ExecutionEvent]) extends RunStatus { override def toString = "Running" } + case class Running(eventList: Seq[ExecutionEvent], instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty) + extends RunStatus { override def toString = "Running" } sealed trait TerminalRunStatus extends RunStatus { def machineType: Option[String] @@ -38,7 +44,8 @@ object RunStatus { case class Success(eventList: Seq[ExecutionEvent], machineType: Option[String], zone: Option[String], - instanceName: Option[String] + instanceName: Option[String], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty ) extends TerminalRunStatus { override def toString = "Success" } @@ -88,7 +95,8 @@ object RunStatus { eventList, machineType, zone, - instanceName + instanceName, + Option.empty ) } } @@ -99,7 +107,8 @@ object RunStatus { eventList: Seq[ExecutionEvent], machineType: Option[String], zone: Option[String], - instanceName: Option[String] + instanceName: Option[String], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty ) extends UnsuccessfulRunStatus { override def toString = "Failed" } @@ -113,7 +122,8 @@ object RunStatus { eventList: Seq[ExecutionEvent], machineType: Option[String], zone: Option[String], - instanceName: Option[String] + instanceName: Option[String], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty ) extends UnsuccessfulRunStatus { override def toString = "Cancelled" } @@ -124,7 +134,8 @@ object RunStatus { eventList: Seq[ExecutionEvent], machineType: Option[String], zone: Option[String], - instanceName: Option[String] + instanceName: Option[String], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty ) extends UnsuccessfulRunStatus { override def toString = "Preempted" } @@ -139,7 +150,8 @@ object RunStatus { eventList: Seq[ExecutionEvent], machineType: Option[String], zone: Option[String], - instanceName: Option[String] + instanceName: Option[String], + instantiatedVmInfo: Option[InstantiatedVmInfo] = Option.empty ) extends UnsuccessfulRunStatus { override def toString = "QuotaFailed" } diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala new file mode 100644 index 00000000000..de9490d86e7 --- /dev/null +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala @@ -0,0 +1,62 @@ +package cromwell.backend.google.pipelines.common + +import akka.testkit.{ImplicitSender, TestActorRef, TestProbe} +import cromwell.core.TestKitSuite +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import cromwell.services.cost._ +import org.scalatest.concurrent.Eventually + +class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers with Eventually with ImplicitSender { + behavior of "CostLookup" + + def constructTestActor: GcpCostCatalogServiceTestActor = + TestActorRef( + new GcpCostCatalogServiceTestActor(GcpCostCatalogServiceSpec.config, + GcpCostCatalogServiceSpec.config, + TestProbe().ref + ) + ).underlyingActor + + val testCatalogService = constructTestActor + + it should "find a CPU sku" in { + val machineType = Some(N2) + val usageType = Some(OnDemand) + val customization = Some(Custom) + val resourceGroup = Some(Cpu) + val region = "europe-west9" + val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) + val result = testCatalogService.getSku(key).get.catalogObject.getDescription + result shouldBe "N2 Custom Instance Core running in Paris" + } + + it should "find a RAM sku" in { + val machineType = Some(N2) + val usageType = Some(OnDemand) + val customization = Some(Custom) + val resourceGroup = Some(Ram) + val region = "europe-west9" + val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) + val result = testCatalogService.getSku(key).get.catalogObject.getDescription + result shouldBe "N2 Custom Instance Ram running in Paris" + } + + it should "find CPU skus for all supported machine types" in { + val legalMachineTypes: List[MachineType] = List(N1, N2, N2d) + val legalUsageTypes: List[UsageType] = List(Preemptible, OnDemand) + val legalCustomizations: List[MachineCustomization] = List(Custom, Predefined) + val resourceGroup: Option[ResourceGroup] = Some(Cpu) + val region = "us-west1" + for (machineType <- legalMachineTypes) + for (usageType <- legalUsageTypes) + for (customization <- legalCustomizations) { + val key = CostCatalogKey(Some(machineType), Some(usageType), Some(customization), resourceGroup, region) + val result = testCatalogService.getSku(key) + if (!result.isEmpty) { + println("Success") + } + result.isEmpty shouldBe false + } + } +} diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala index 77e53176df3..1c9be388c7d 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala @@ -86,7 +86,7 @@ class ErrorReporter(machineType: Option[String], // Reverse the list because the first failure (likely the most relevant, will appear last otherwise) val unexpectedExitEvents: List[String] = unexpectedExitStatusErrorStrings(events, actions).reverse - builder(status, None, failed.toList ++ unexpectedExitEvents, executionEvents, machineType, zone, instanceName) + builder(status, None, failed.toList ++ unexpectedExitEvents, executionEvents, machineType, zone, instanceName, Option.empty) } // There's maybe one FailedEvent per operation with a summary error message diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala index 9cfbf62ca4f..d9f622b8af5 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala @@ -9,19 +9,14 @@ import common.validation.Validation._ import cromwell.backend.google.pipelines.common.action.ActionLabels._ import cromwell.backend.google.pipelines.common.api.PipelinesApiRequestManager._ import cromwell.backend.google.pipelines.common.api.RunStatus -import cromwell.backend.google.pipelines.common.api.RunStatus.{ - AwaitingCloudQuota, - Initializing, - Running, - Success, - UnsuccessfulRunStatus -} +import cromwell.backend.google.pipelines.common.api.RunStatus.{AwaitingCloudQuota, Initializing, Running, Success, UnsuccessfulRunStatus} import cromwell.backend.google.pipelines.common.errors.isQuotaMessage import cromwell.backend.google.pipelines.v2beta.PipelinesConversions._ import cromwell.backend.google.pipelines.v2beta.api.Deserialization._ import cromwell.backend.google.pipelines.v2beta.api.request.ErrorReporter._ import cromwell.cloudsupport.gcp.auth.GoogleAuthMode import cromwell.core.ExecutionEvent +import cromwell.services.cost.InstantiatedVmInfo import cromwell.services.metadata.CallMetadataKeys import io.grpc.Status import org.apache.commons.lang3.exception.ExceptionUtils @@ -29,7 +24,7 @@ import org.apache.commons.lang3.exception.ExceptionUtils import scala.jdk.CollectionConverters._ import scala.concurrent.{ExecutionContext, Future} import scala.language.postfixOps -import scala.util.{Failure, Success => TrySuccess, Try} +import scala.util.{Failure, Try, Success => TrySuccess} trait GetRequestHandler { this: RequestHandler => // the Genomics batch endpoint doesn't seem to be able to handle get requests on V2 operations at the moment @@ -81,33 +76,44 @@ trait GetRequestHandler { this: RequestHandler => .toList .flatten val executionEvents = getEventList(metadata, events, actions) + val workerAssignedEvent: Option[WorkerAssignedEvent] = + events.collectFirst { + case event if event.getWorkerAssigned != null => event.getWorkerAssigned + } + val virtualMachineOption = for { + pipelineValue <- pipeline + resources <- Option(pipelineValue.getResources) + virtualMachine <- Option(resources.getVirtualMachine) + } yield virtualMachine + + // Correlate `executionEvents` to `actions` to potentially assign a grouping into the appropriate events. + val machineType = virtualMachineOption.flatMap(virtualMachine => Option(virtualMachine.getMachineType)) + /* + preemptible is only used if the job fails, as a heuristic to guess if the VM was preempted. + If we can't get the value of preempted we still need to return something, returning false will not make the + failure count as a preemption which seems better than saying that it was preemptible when we really don't know + */ + val preemptibleOption = for { + pipelineValue <- pipeline + resources <- Option(pipelineValue.getResources) + virtualMachine <- Option(resources.getVirtualMachine) + preemptible <- Option(virtualMachine.getPreemptible) + } yield preemptible + val preemptible = preemptibleOption.exists(_.booleanValue) + val instanceName = + workerAssignedEvent.flatMap(workerAssignedEvent => Option(workerAssignedEvent.getInstance())) + val zone = workerAssignedEvent.flatMap(workerAssignedEvent => Option(workerAssignedEvent.getZone)) + val region = zone.map { zoneString => + val lastDashIndex = zoneString.lastIndexOf("-") + if (lastDashIndex != -1) zoneString.substring(0, lastDashIndex) else zoneString + } + + val instantiatedVmInfo: Option[InstantiatedVmInfo] = (region, machineType) match { + case (Some(instantiatedRegion), Some(instantiatedMachineType)) => + Option(InstantiatedVmInfo(instantiatedRegion, instantiatedMachineType, preemptible)) + case _ => Option.empty + } if (operation.getDone) { - val workerAssignedEvent: Option[WorkerAssignedEvent] = - events.collectFirst { - case event if event.getWorkerAssigned != null => event.getWorkerAssigned - } - val virtualMachineOption = for { - pipelineValue <- pipeline - resources <- Option(pipelineValue.getResources) - virtualMachine <- Option(resources.getVirtualMachine) - } yield virtualMachine - // Correlate `executionEvents` to `actions` to potentially assign a grouping into the appropriate events. - val machineType = virtualMachineOption.flatMap(virtualMachine => Option(virtualMachine.getMachineType)) - /* - preemptible is only used if the job fails, as a heuristic to guess if the VM was preempted. - If we can't get the value of preempted we still need to return something, returning false will not make the - failure count as a preemption which seems better than saying that it was preemptible when we really don't know - */ - val preemptibleOption = for { - pipelineValue <- pipeline - resources <- Option(pipelineValue.getResources) - virtualMachine <- Option(resources.getVirtualMachine) - preemptible <- Option(virtualMachine.getPreemptible) - } yield preemptible - val preemptible = preemptibleOption.exists(_.booleanValue) - val instanceName = - workerAssignedEvent.flatMap(workerAssignedEvent => Option(workerAssignedEvent.getInstance())) - val zone = workerAssignedEvent.flatMap(workerAssignedEvent => Option(workerAssignedEvent.getZone)) // If there's an error, generate an unsuccessful status. Otherwise, we were successful! Option(operation.getError) match { case Some(error) => @@ -122,14 +128,14 @@ trait GetRequestHandler { this: RequestHandler => pollingRequest.workflowId ) errorReporter.toUnsuccessfulRunStatus(error, events) - case None => Success(executionEvents, machineType, zone, instanceName) + case None => Success(executionEvents, machineType, zone, instanceName, instantiatedVmInfo) } } else if (isQuotaDelayed(events)) { - AwaitingCloudQuota(executionEvents) + AwaitingCloudQuota(executionEvents, instantiatedVmInfo) } else if (operation.hasStarted) { - Running(executionEvents) + Running(executionEvents, instantiatedVmInfo) } else { - Initializing(executionEvents) + Initializing(executionEvents, instantiatedVmInfo) } } catch { case nullPointerException: NullPointerException => From ac5f7311c8920802206f9780d436a53ce0cdadd1 Mon Sep 17 00:00:00 2001 From: Tom Wiseman Date: Thu, 19 Sep 2024 10:23:59 -0400 Subject: [PATCH 16/39] bonus debug logging --- .../pollmonitoring/PollResultMonitorActor.scala | 4 +++- .../services/cost/GcpCostCatalogService.scala | 12 +++++++++++- .../common/PapiPollResultMonitorActor.scala | 4 ++-- .../v2beta/api/request/GetRequestHandler.scala | 2 ++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index 43d7132f999..5da2a784074 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -133,6 +133,7 @@ trait PollResultMonitorActor[PollResultType] extends Actor { if (vmCostPerHour.isEmpty) { val instantiatedVmInfo = extractVmInfoFromRunState(pollStatus) instantiatedVmInfo.foreach { vmInfo => + println(s"Requesting cost info for: ${vmInfo}") val request = GcpCostLookupRequest(vmInfo, self) params.serviceRegistry ! request } @@ -152,7 +153,8 @@ trait PollResultMonitorActor[PollResultType] extends Actor { ) def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = { - if (vmCostPerHour.isDefined) { return } // Optimization to avoid processing responses after we've stopped caring. + println(s"Handling Cost Response from Catalog Service: ${costLookupResponse}") + if (vmCostPerHour.isDefined) { return } // Optimization to avoid processing responses after we've received a valid one. val cost = costLookupResponse.calculatedCost.getOrElse(BigDecimal(-1)) // TODO: better logging here. vmCostPerHour = Option(cost) tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> vmCostPerHour)) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 7878c146b97..22a15b06a51 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -22,7 +22,7 @@ case class CostCatalogKey(machineType: Option[MachineType], region: String ) case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { - override def serviceName: String = GcpCostCatalogService.getClass.getSimpleName + override def serviceName: String = "GcpCostCatalogService" } case class GcpCostLookupResponse(calculatedCost: Option[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) @@ -152,12 +152,22 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val cpuKey = CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceGroup), region) val cpuSku = getSku(cpuKey) + if(cpuSku.isEmpty) { + println(s"Failed to find CPU Sku for ${cpuKey}") + } else { + println(s"Found CPU Sku ${cpuSku.get.catalogObject.getDescription} from key ${cpuKey}") + } val cpuCost = cpuSku.map(sku => calculateCpuPricePerHour(sku.catalogObject, coreCount.get)) // TODO .get val ramResourceGroup = Ram val ramKey = CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceGroup), region) val ramSku = getSku(ramKey) + if(ramSku.isEmpty) { + println(s"Failed to find Ram Sku for ${ramKey}") + } else { + println(s"Found CPU Sku ${ramSku.get.catalogObject.getDescription} from key ${ramKey}") + } val ramCost = ramSku.map(sku => calculateRamPricePerHour(sku.catalogObject, ramMbCount.get)) // TODO .get Success(cpuCost.get.get + ramCost.get.get) } diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala index c1bbef4edc2..0cc86ae3588 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala @@ -69,10 +69,10 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll ) ) } - case _ => + case unexpected => params.logger.foreach(logger => logger.error( - s"Programmer error: Cost Helper received message of type other than CostPollingMessage" + s"Programmer error: Cost Helper received message of unexpected type. Was ${unexpected.getClass.getSimpleName}." ) ) } diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala index d9f622b8af5..7d9775688f4 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala @@ -88,6 +88,7 @@ trait GetRequestHandler { this: RequestHandler => // Correlate `executionEvents` to `actions` to potentially assign a grouping into the appropriate events. val machineType = virtualMachineOption.flatMap(virtualMachine => Option(virtualMachine.getMachineType)) + /* preemptible is only used if the job fails, as a heuristic to guess if the VM was preempted. If we can't get the value of preempted we still need to return something, returning false will not make the @@ -110,6 +111,7 @@ trait GetRequestHandler { this: RequestHandler => val instantiatedVmInfo: Option[InstantiatedVmInfo] = (region, machineType) match { case (Some(instantiatedRegion), Some(instantiatedMachineType)) => + println(s"Machine Type: ${instantiatedMachineType}") Option(InstantiatedVmInfo(instantiatedRegion, instantiatedMachineType, preemptible)) case _ => Option.empty } From 26e841de78b347fc64cdff006742b7bc4008c052 Mon Sep 17 00:00:00 2001 From: Tom Wiseman Date: Thu, 19 Sep 2024 10:45:21 -0400 Subject: [PATCH 17/39] RAM usage --- .../services/cost/GcpCostCatalogService.scala | 24 ++++++++++++++----- .../services/cost/GcpCostCatalogTypes.scala | 14 +++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 22a15b06a51..822a63254ec 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -128,12 +128,23 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice val costPerCorePerHour: BigDecimal = costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + + println(s"Calculated ${coreCount} cpu cores to cost ${costPerCorePerHour * coreCount} per hour") Success(costPerCorePerHour * coreCount) } - private def calculateRamPricePerHour(ramSku: Sku, ramMbCount: Int): Try[BigDecimal] = - // TODO - Success(ramMbCount.toLong * 0.25) + private def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): Try[BigDecimal] = { + val pricingInfo = getMostRecentPricingInfo(ramSku) + val usageUnit = pricingInfo.getPricingExpression.getUsageUnit + if (usageUnit != "GiBy.h") { + return Failure(new UnsupportedOperationException(s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}")) + } + val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice + val costPerGbHour: BigDecimal = + costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + println(s"Calculated ${ramGbCount} GB of ram to cost ${ramGbCount * costPerGbHour} per hour") + Success(costPerGbHour * ramGbCount) + } private def getMostRecentPricingInfo(sku: Sku): PricingInfo = { val mostRecentPricingInfoIndex = sku.getPricingInfoCount - 1 @@ -147,12 +158,13 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val region = instantiatedVmInfo.region val coreCount = MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) val ramMbCount = MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) + val ramGbCount = ramMbCount.getOrElse(0) / 1024 val cpuResourceGroup = Cpu // TODO: Investigate the situation in which the resource group is n1 val cpuKey = CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceGroup), region) val cpuSku = getSku(cpuKey) - if(cpuSku.isEmpty) { + if (cpuSku.isEmpty) { println(s"Failed to find CPU Sku for ${cpuKey}") } else { println(s"Found CPU Sku ${cpuSku.get.catalogObject.getDescription} from key ${cpuKey}") @@ -163,12 +175,12 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramKey = CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceGroup), region) val ramSku = getSku(ramKey) - if(ramSku.isEmpty) { + if (ramSku.isEmpty) { println(s"Failed to find Ram Sku for ${ramKey}") } else { println(s"Found CPU Sku ${ramSku.get.catalogObject.getDescription} from key ${ramKey}") } - val ramCost = ramSku.map(sku => calculateRamPricePerHour(sku.catalogObject, ramMbCount.get)) // TODO .get + val ramCost = ramSku.map(sku => calculateRamPricePerHour(sku.catalogObject, ramGbCount)) // TODO .get Success(cpuCost.get.get + ramCost.get.get) } diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index 57fed6d888c..e0705260f67 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -42,10 +42,16 @@ object MachineType { Failure(new IllegalArgumentException(s"Could not extract core count from ${machineTypeString}")) } } - - def extractRamMbFromMachineTypeString(machineTypeString: String): Try[Int] = - // TODO - Success(4096) + def extractRamMbFromMachineTypeString(machineTypeString: String): Try[Int] = { + // Regular expression to match the number after the second dash + val pattern: Pattern = Pattern.compile(".*?-.*?-(\\d+)") + val matcher: Matcher = pattern.matcher(machineTypeString); + if (matcher.find()) { + Success(matcher.group(1).toInt) + } else { + Failure(new IllegalArgumentException(s"Could not Ram MB count from ${machineTypeString}")) + } + } } sealed trait MachineType { def machineTypeName: String } case object N1 extends MachineType { override val machineTypeName = "N1" } From df7abf7065899f7efb84c39e85d196971bdfd762 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 23 Sep 2024 13:49:57 -0400 Subject: [PATCH 18/39] Bug fixes --- .../standard/pollmonitoring/PollResultMonitorActor.scala | 9 +++++---- .../PipelinesApiAsyncBackendJobExecutionActor.scala | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index 5da2a784074..627ba4b63af 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -154,9 +154,10 @@ trait PollResultMonitorActor[PollResultType] extends Actor { def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = { println(s"Handling Cost Response from Catalog Service: ${costLookupResponse}") - if (vmCostPerHour.isDefined) { return } // Optimization to avoid processing responses after we've received a valid one. - val cost = costLookupResponse.calculatedCost.getOrElse(BigDecimal(-1)) // TODO: better logging here. - vmCostPerHour = Option(cost) - tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> vmCostPerHour)) + if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. + val cost = costLookupResponse.calculatedCost.getOrElse(BigDecimal(-1)) // TODO: better logging here. + vmCostPerHour = Option(cost) + tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) + } } } diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala index 402427dc81b..388ec1df89c 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiAsyncBackendJobExecutionActor.scala @@ -777,7 +777,7 @@ class PipelinesApiAsyncBackendJobExecutionActor(override val standardParams: Sta super[PipelinesApiStatusRequestClient].pollStatus(workflowId, handle.pendingJob) override def checkAndRecordQuotaExhaustion(runStatus: RunStatus): Unit = runStatus match { - case AwaitingCloudQuota(_) => + case AwaitingCloudQuota(_, _) => standardParams.groupMetricsActor ! RecordGroupQuotaExhaustion(googleProject(jobDescriptor.workflowDescriptor)) case _ => } From 21c03764879572153fe15f447f3b1c4c42e954e2 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 25 Sep 2024 13:49:29 -0400 Subject: [PATCH 19/39] Cost catalog massaging --- .../services/cost/GcpCostCatalogTypes.scala | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index e0705260f67..e16959c5e97 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -23,11 +23,10 @@ object MachineType { // expects a string that looks something like "n1-standard-1" or "custom-1-4096" def fromGoogleMachineTypeString(machineTypeString: String): Option[MachineType] = - if (machineTypeString.startsWith("n1")) Some(N1) - else if (machineTypeString.startsWith("n2d")) Some(N2d) - else if (machineTypeString.startsWith("n2")) Some(N2) - else if (machineTypeString.startsWith("custom")) - None // TODO: should this be n1? Make a 'custom' type? Combine with MachineCustomization? + if (machineTypeString.toLowerCase().startsWith("n1-")) Some(N1) + else if (machineTypeString.toLowerCase().startsWith("n2d-")) Some(N2d) + else if (machineTypeString.toLowerCase().startsWith("n2-")) Some(N2) + else if (machineTypeString.toLowerCase().startsWith("custom-")) Some(N1) // by convention else { println(s"Error: Unrecognized machine type: $machineTypeString") None @@ -43,8 +42,9 @@ object MachineType { } } def extractRamMbFromMachineTypeString(machineTypeString: String): Try[Int] = { - // Regular expression to match the number after the second dash - val pattern: Pattern = Pattern.compile(".*?-.*?-(\\d+)") + // Regular expression to match the number after the dash at the end of the string + // TODO add test + val pattern: Pattern = Pattern.compile(".*?-(\\d+)$") val matcher: Matcher = pattern.matcher(machineTypeString); if (matcher.find()) { Success(matcher.group(1).toInt) @@ -76,17 +76,25 @@ case object OnDemand extends UsageType { override val typeName = "OnDemand" } case object Preemptible extends UsageType { override val typeName = "Preemptible" } object MachineCustomization { - // TODO: I think this is right but I am not 100% sure. Needs testing. - // Do custom machine types always have the word "custom"? - // Does Cromwell ever assign Predefined Machines? - // Is it possible to have a custom machine that *doesn't* contain the word custom? def fromMachineTypeString(machineTypeString: String): MachineCustomization = if (machineTypeString.toLowerCase.contains("custom")) Custom else Predefined + + /* + The cost catalog is annoyingly inconsistent and unstructured in this area. + - For N1 machines, only predefined SKUs are included, and they have "Predefined" in their description strings. + We will eventually fall back to using these SKUs for custom machines, but accurately represent them as Predefined here. + - For non-N1 machines, both custom and predefined SKUs are included, custom ones include "Custom" in their description + strings and predefined SKUs are only identifiable by the absence of "Custom." + */ def fromSku(sku: Sku): Option[MachineCustomization] = { val tokenizedDescription = sku.getDescription.split(" ") + + // ex. "N1 Predefined Instance Core running in Montreal" if (tokenizedDescription.contains(Predefined.customizationName)) Some(Predefined) + // ex. "N2 Custom Instance Core running in Paris" else if (tokenizedDescription.contains(Custom.customizationName)) Some(Custom) - else Option.empty + // ex. "N2 Instance Core running in Paris" + else Some(Predefined) } } sealed trait MachineCustomization { def customizationName: String } @@ -106,9 +114,5 @@ sealed trait ResourceGroup { def groupName: String } case object Cpu extends ResourceGroup { override val groupName = "CPU" } case object Ram extends ResourceGroup { override val groupName = "RAM" } -// TODO: What is the deal with this? It seems out of place. -// Need to figure out how to reconcile with the Cpu resource group. -// Current theory is that the n1 machines are legacy machines, -// and are therefore categorized differently. -// Unfortunately, N1 is Cromwell's default machine. +// N1 machine SKUs are structured differently and use this resource group. case object N1Standard extends ResourceGroup { override val groupName = "N1Standard" } From 7c18124603a9b9c1f0ddceb7614379642f08b4ce Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 25 Sep 2024 14:00:06 -0400 Subject: [PATCH 20/39] Standardize resource type --- .../services/cost/GcpCostCatalogService.scala | 13 +++++++------ .../services/cost/GcpCostCatalogTypes.scala | 18 +++++++++--------- .../cost/GcpCostCatalogServiceSpec.scala | 4 ++-- .../pipelines/common/CostLookupSpec.scala | 2 +- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 822a63254ec..3de596996d1 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -18,7 +18,7 @@ import scala.util.{Failure, Success, Try} case class CostCatalogKey(machineType: Option[MachineType], usageType: Option[UsageType], machineCustomization: Option[MachineCustomization], - resourceGroup: Option[ResourceGroup], + resourceType: Option[ResourceType], region: String ) case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { @@ -103,12 +103,13 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = { val allAvailableRegions = sku.getServiceRegionsList.asScala.toList + // TODO jdewar flatMap this and discard any entries that we don't understand allAvailableRegions.map(region => CostCatalogKey( machineType = MachineType.fromSku(sku), usageType = UsageType.fromSku(sku), machineCustomization = MachineCustomization.fromSku(sku), - resourceGroup = ResourceGroup.fromSku(sku), + resourceType = ResourceType.fromSku(sku), region = region ) -> CostCatalogValue(sku) ) @@ -160,9 +161,9 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramMbCount = MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) val ramGbCount = ramMbCount.getOrElse(0) / 1024 - val cpuResourceGroup = Cpu // TODO: Investigate the situation in which the resource group is n1 + val cpuResourceType = Cpu // TODO: Investigate the situation in which the resource group is n1 val cpuKey = - CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceGroup), region) + CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceType), region) val cpuSku = getSku(cpuKey) if (cpuSku.isEmpty) { println(s"Failed to find CPU Sku for ${cpuKey}") @@ -171,9 +172,9 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service } val cpuCost = cpuSku.map(sku => calculateCpuPricePerHour(sku.catalogObject, coreCount.get)) // TODO .get - val ramResourceGroup = Ram + val ramResourceType = Ram val ramKey = - CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceGroup), region) + CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceType), region) val ramSku = getSku(ramKey) if (ramSku.isEmpty) { println(s"Failed to find Ram Sku for ${ramKey}") diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index e16959c5e97..cafe5eb55ed 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -101,18 +101,18 @@ sealed trait MachineCustomization { def customizationName: String } case object Custom extends MachineCustomization { override val customizationName = "Custom" } case object Predefined extends MachineCustomization { override val customizationName = "Predefined" } -object ResourceGroup { - def fromSku(sku: Sku): Option[ResourceGroup] = +object ResourceType { + def fromSku(sku: Sku): Option[ResourceType] = { + val tokenizedDescription = sku.getDescription.split(" ") sku.getCategory.getResourceGroup match { case Cpu.groupName => Some(Cpu) case Ram.groupName => Some(Ram) - case N1Standard.groupName => Some(N1Standard) + case "N1Standard" if tokenizedDescription.contains("Ram") => Some(Ram) + case "N1Standard" if tokenizedDescription.contains("Core") => Some(Cpu) case _ => Option.empty } + } } -sealed trait ResourceGroup { def groupName: String } -case object Cpu extends ResourceGroup { override val groupName = "CPU" } -case object Ram extends ResourceGroup { override val groupName = "RAM" } - -// N1 machine SKUs are structured differently and use this resource group. -case object N1Standard extends ResourceGroup { override val groupName = "N1Standard" } +sealed trait ResourceType { def groupName: String } +case object Cpu extends ResourceType { override val groupName = "CPU" } +case object Ram extends ResourceType { override val groupName = "RAM" } diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 2c5298d0cb3..29910285819 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -75,7 +75,7 @@ class GcpCostCatalogServiceSpec machineType = Some(N2), usageType = Some(Preemptible), machineCustomization = Some(Predefined), - resourceGroup = Some(Cpu), + resourceType = Some(Cpu), region = "europe-west9" ) @@ -104,7 +104,7 @@ class GcpCostCatalogServiceSpec machineType = Some(N2d), usageType = Some(Preemptible), machineCustomization = None, - resourceGroup = Some(Ram), + resourceType = Some(Ram), region = "europe-west9" ) val foundValue = testActorRef.getSku(expectedKey) diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala index de9490d86e7..10d643912f4 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala @@ -46,7 +46,7 @@ class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers wit val legalMachineTypes: List[MachineType] = List(N1, N2, N2d) val legalUsageTypes: List[UsageType] = List(Preemptible, OnDemand) val legalCustomizations: List[MachineCustomization] = List(Custom, Predefined) - val resourceGroup: Option[ResourceGroup] = Some(Cpu) + val resourceGroup: Option[ResourceType] = Some(Cpu) val region = "us-west1" for (machineType <- legalMachineTypes) for (usageType <- legalUsageTypes) From a8ed1122e932d0684f4b1efd7e37c7a010c4569f Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 25 Sep 2024 15:19:57 -0400 Subject: [PATCH 21/39] Drop cost catalog entries we won't use --- .../services/cost/GcpCostCatalogService.scala | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 3de596996d1..7e7e0454cc2 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -15,10 +15,10 @@ import scala.jdk.CollectionConverters.IterableHasAsScala import java.time.temporal.ChronoUnit.SECONDS import scala.util.{Failure, Success, Try} -case class CostCatalogKey(machineType: Option[MachineType], - usageType: Option[UsageType], - machineCustomization: Option[MachineCustomization], - resourceType: Option[ResourceType], +case class CostCatalogKey(machineType: MachineType, + usageType: UsageType, + machineCustomization: MachineCustomization, + resourceType: ResourceType, region: String ) case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { @@ -101,19 +101,24 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service acc ++ convertSkuToKeyValuePairs(sku) } - private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = { - val allAvailableRegions = sku.getServiceRegionsList.asScala.toList - // TODO jdewar flatMap this and discard any entries that we don't understand - allAvailableRegions.map(region => - CostCatalogKey( - machineType = MachineType.fromSku(sku), - usageType = UsageType.fromSku(sku), - machineCustomization = MachineCustomization.fromSku(sku), - resourceType = ResourceType.fromSku(sku), - region = region - ) -> CostCatalogValue(sku) - ) - } + /** + * Convert a SKU into a cost catalog map entry. We drop all SKUs that to not map to known machine types, + * resource types, usage types, etc. + */ + private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = + for { + region <- sku.getServiceRegionsList.asScala.toList + machineType <- MachineType.fromSku(sku) + usageType <- UsageType.fromSku(sku) + machineCustomization <- MachineCustomization.fromSku(sku) + resourceType <- ResourceType.fromSku(sku) + } yield CostCatalogKey( + machineType, + usageType, + machineCustomization, + resourceType, + region + ) -> CostCatalogValue(sku) // See: https://cloud.google.com/billing/v1/how-tos/catalog-api private def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { @@ -161,9 +166,9 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramMbCount = MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) val ramGbCount = ramMbCount.getOrElse(0) / 1024 - val cpuResourceType = Cpu // TODO: Investigate the situation in which the resource group is n1 + val cpuResourceType = Cpu val cpuKey = - CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(cpuResourceType), region) + CostCatalogKey(machineType.get, usageType, machineCustomization, cpuResourceType, region) // TODO .get val cpuSku = getSku(cpuKey) if (cpuSku.isEmpty) { println(s"Failed to find CPU Sku for ${cpuKey}") @@ -174,7 +179,7 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramResourceType = Ram val ramKey = - CostCatalogKey(machineType, Option(usageType), Option(machineCustomization), Option(ramResourceType), region) + CostCatalogKey(machineType.get, usageType, machineCustomization, ramResourceType, region) // TODO .get val ramSku = getSku(ramKey) if (ramSku.isEmpty) { println(s"Failed to find Ram Sku for ${ramKey}") From b1f723771ba2ab022c50b3194ae64015f804a3a5 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 25 Sep 2024 15:39:06 -0400 Subject: [PATCH 22/39] Fall back to predefined sku for N1 custom --- .../services/cost/GcpCostCatalogService.scala | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 7e7e0454cc2..0ecc3465718 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -169,7 +169,15 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val cpuResourceType = Cpu val cpuKey = CostCatalogKey(machineType.get, usageType, machineCustomization, cpuResourceType, region) // TODO .get - val cpuSku = getSku(cpuKey) + + // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt + // to fall back to predefined. + lazy val n1PredefinedCpuSku = (machineType, machineCustomization) match { + case (Some(N1), Custom) => getSku(cpuKey.copy(machineCustomization = Predefined)) + case _ => None + } + + val cpuSku = getSku(cpuKey).orElse(n1PredefinedCpuSku) if (cpuSku.isEmpty) { println(s"Failed to find CPU Sku for ${cpuKey}") } else { @@ -180,7 +188,15 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramResourceType = Ram val ramKey = CostCatalogKey(machineType.get, usageType, machineCustomization, ramResourceType, region) // TODO .get - val ramSku = getSku(ramKey) + + // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt + // to fall back to predefined. + lazy val n1PredefinedRamSku = (machineType, machineCustomization) match { + case (Some(N1), Custom) => getSku(ramKey.copy(machineCustomization = Predefined)) + case _ => None + } + + val ramSku = getSku(ramKey).orElse(n1PredefinedRamSku) if (ramSku.isEmpty) { println(s"Failed to find Ram Sku for ${ramKey}") } else { From d59c321c9ed3982d99e958a4d21f56c4d0cd56cf Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Thu, 26 Sep 2024 10:55:44 -0400 Subject: [PATCH 23/39] Rearrange --- .../services/cost/GcpCostCatalogTypes.scala | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index cafe5eb55ed..ac641edebf1 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -9,9 +9,16 @@ import scala.util.{Failure, Success, Try} * Case class that contains information retrieved from Google about a VM that cromwell has started */ case class InstantiatedVmInfo(region: String, machineType: String, preemptible: Boolean) + /* * These types reflect hardcoded strings found in a google cost catalog. */ + +sealed trait MachineType { def machineTypeName: String } +case object N1 extends MachineType { override val machineTypeName = "N1" } +case object N2 extends MachineType { override val machineTypeName = "N2" } +case object N2d extends MachineType { override val machineTypeName = "N2D" } + object MachineType { def fromSku(sku: Sku): Option[MachineType] = { val tokenizedDescription = sku.getDescription.split(" ") @@ -53,10 +60,10 @@ object MachineType { } } } -sealed trait MachineType { def machineTypeName: String } -case object N1 extends MachineType { override val machineTypeName = "N1" } -case object N2 extends MachineType { override val machineTypeName = "N2" } -case object N2d extends MachineType { override val machineTypeName = "N2D" } + +sealed trait UsageType { def typeName: String } +case object OnDemand extends UsageType { override val typeName = "OnDemand" } +case object Preemptible extends UsageType { override val typeName = "Preemptible" } object UsageType { def fromSku(sku: Sku): Option[UsageType] = @@ -71,9 +78,10 @@ object UsageType { } } -sealed trait UsageType { def typeName: String } -case object OnDemand extends UsageType { override val typeName = "OnDemand" } -case object Preemptible extends UsageType { override val typeName = "Preemptible" } + +sealed trait MachineCustomization { def customizationName: String } +case object Custom extends MachineCustomization { override val customizationName = "Custom" } +case object Predefined extends MachineCustomization { override val customizationName = "Predefined" } object MachineCustomization { def fromMachineTypeString(machineTypeString: String): MachineCustomization = @@ -97,9 +105,10 @@ object MachineCustomization { else Some(Predefined) } } -sealed trait MachineCustomization { def customizationName: String } -case object Custom extends MachineCustomization { override val customizationName = "Custom" } -case object Predefined extends MachineCustomization { override val customizationName = "Predefined" } + +sealed trait ResourceType { def groupName: String } +case object Cpu extends ResourceType { override val groupName = "CPU" } +case object Ram extends ResourceType { override val groupName = "RAM" } object ResourceType { def fromSku(sku: Sku): Option[ResourceType] = { @@ -113,6 +122,3 @@ object ResourceType { } } } -sealed trait ResourceType { def groupName: String } -case object Cpu extends ResourceType { override val groupName = "CPU" } -case object Ram extends ResourceType { override val groupName = "RAM" } From 7b8452fc20b4b88a2200cc349c9b8c347273cea4 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Thu, 26 Sep 2024 11:32:00 -0400 Subject: [PATCH 24/39] Ignore all SKUs except the ones we know we care about --- .../services/cost/GcpCostCatalogService.scala | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 0ecc3465718..4b5a6960675 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -21,6 +21,34 @@ case class CostCatalogKey(machineType: MachineType, resourceType: ResourceType, region: String ) + +object CostCatalogKey { + + // Specifically support only the SKUs that we know we can use. This is brittle and I hate it, but the more structured + // fields available in the SKU don't give us enough information without relying on the human-readable descriptions. + // + // N1: We usually use custom machines but SKUs are only available for predefined; we'll fall back to these SKUs. + // N2 and N2D: We only use custom machines. + + // Use this regex to filter down to just the SKUs we are interested in. + // NB: This should be updated if we add new machine types or the cost catalog descriptions change + final val expectedSku = + (".*?N1 Predefined Instance (Core|Ram) .*|" + + ".*?N2 Custom Instance (Core|Ram) .*|" + + ".*?N2D AMD Custom Instance (Core|Ram) .*").r + + def apply(sku: Sku): List[CostCatalogKey] = + for { + _ <- expectedSku.findFirstIn(sku.getDescription).toList + machineType <- MachineType.fromSku(sku).toList + resourceType <- ResourceType.fromSku(sku).toList + usageType <- UsageType.fromSku(sku).toList + machineCustomization <- MachineCustomization.fromSku(sku).toList + region <- sku.getServiceRegionsList.asScala.toList + } yield CostCatalogKey(machineType, usageType, machineCustomization, resourceType, region) + +} + case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { override def serviceName: String = "GcpCostCatalogService" } @@ -95,6 +123,7 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service * NB: This function takes an iterable so we can take advantage of the pagination provided by googleClient.listSkus. * Ideally, we don't want to have an entire, unprocessed, cost catalog in memory at once since it's ~20MB. */ + // TODO check for collisions when building map private def processCostCatalog(skus: Iterable[Sku]): Map[CostCatalogKey, CostCatalogValue] = // TODO: reduce memory footprint of returned map (don't store entire SKU object) skus.foldLeft(Map.empty[CostCatalogKey, CostCatalogValue]) { case (acc, sku) => @@ -106,19 +135,7 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service * resource types, usage types, etc. */ private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = - for { - region <- sku.getServiceRegionsList.asScala.toList - machineType <- MachineType.fromSku(sku) - usageType <- UsageType.fromSku(sku) - machineCustomization <- MachineCustomization.fromSku(sku) - resourceType <- ResourceType.fromSku(sku) - } yield CostCatalogKey( - machineType, - usageType, - machineCustomization, - resourceType, - region - ) -> CostCatalogValue(sku) + CostCatalogKey(sku).map(k => (k, CostCatalogValue(sku))) // See: https://cloud.google.com/billing/v1/how-tos/catalog-api private def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { From 40ba5fab48f8b4a3c8aafa1d79a61d258136fcac Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Thu, 26 Sep 2024 13:42:09 -0400 Subject: [PATCH 25/39] Check for cost catalog key collisions --- .../services/cost/GcpCostCatalogService.scala | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 4b5a6960675..d65dc59b2d2 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -123,19 +123,18 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service * NB: This function takes an iterable so we can take advantage of the pagination provided by googleClient.listSkus. * Ideally, we don't want to have an entire, unprocessed, cost catalog in memory at once since it's ~20MB. */ - // TODO check for collisions when building map private def processCostCatalog(skus: Iterable[Sku]): Map[CostCatalogKey, CostCatalogValue] = - // TODO: reduce memory footprint of returned map (don't store entire SKU object) skus.foldLeft(Map.empty[CostCatalogKey, CostCatalogValue]) { case (acc, sku) => - acc ++ convertSkuToKeyValuePairs(sku) - } + val keys = CostCatalogKey(sku) - /** - * Convert a SKU into a cost catalog map entry. We drop all SKUs that to not map to known machine types, - * resource types, usage types, etc. - */ - private def convertSkuToKeyValuePairs(sku: Sku): List[(CostCatalogKey, CostCatalogValue)] = - CostCatalogKey(sku).map(k => (k, CostCatalogValue(sku))) + // We expect that every cost catalog key is unique, but changes to the SKUs returned by Google may + // break this assumption. Check and log an error if we find collisions. + val collisions = keys.filter(acc.contains) + if (collisions.nonEmpty) + logger.error(s"Found SKU key collision for ${sku.getDescription}") + + acc ++ keys.map(k => (k, CostCatalogValue(sku))) + } // See: https://cloud.google.com/billing/v1/how-tos/catalog-api private def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { From 8796a94c4cecb70a8a59754123428cf2eea6ee0c Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 27 Sep 2024 13:12:58 -0400 Subject: [PATCH 26/39] Fix tests --- .../cost/GcpCostCatalogServiceSpec.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 29910285819..2cdd1f4154b 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -72,10 +72,10 @@ class GcpCostCatalogServiceSpec it should "cache catalogs properly" in { val testLookupKey = CostCatalogKey( - machineType = Some(N2), - usageType = Some(Preemptible), - machineCustomization = Some(Predefined), - resourceType = Some(Cpu), + machineType = N2, + usageType = Preemptible, + machineCustomization = Predefined, + resourceType = Cpu, region = "europe-west9" ) @@ -101,13 +101,13 @@ class GcpCostCatalogServiceSpec it should "contain an expected SKU" in { val expectedKey = CostCatalogKey( - machineType = Some(N2d), - usageType = Some(Preemptible), - machineCustomization = None, - resourceType = Some(Ram), + machineType = N2d, + usageType = Preemptible, + machineCustomization = Custom, + resourceType = Ram, region = "europe-west9" ) val foundValue = testActorRef.getSku(expectedKey) - foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Instance Ram running in Paris" + foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Custom Instance Ram running in Paris" } } From 7a3b4f580c1ee09d9d695e4c3bcc15308e76eee0 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 27 Sep 2024 15:44:14 -0400 Subject: [PATCH 27/39] Refactor cost calculations --- .../PollResultMonitorActor.scala | 8 +- .../services/cost/GcpCostCatalogService.scala | 159 +++++++++--------- 2 files changed, 88 insertions(+), 79 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index 627ba4b63af..bf146f1ca8f 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -17,6 +17,7 @@ import wdl4s.parser.MemoryUnit import java.time.OffsetDateTime import java.time.temporal.ChronoUnit +import scala.util.{Failure, Success} trait PollResultMessage case class ProcessThisPollResult[PollResultType](pollResult: PollResultType) extends PollResultMessage case class AsyncJobHasFinished[PollResultType](pollResult: PollResultType) extends PollResultMessage @@ -155,7 +156,12 @@ trait PollResultMonitorActor[PollResultType] extends Actor { def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = { println(s"Handling Cost Response from Catalog Service: ${costLookupResponse}") if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. - val cost = costLookupResponse.calculatedCost.getOrElse(BigDecimal(-1)) // TODO: better logging here. + val cost = costLookupResponse.calculatedCost match { + case Success(c) => c + case Failure(e) => + params.logger.foreach(_.error(s"Failed to calculate VM cost per hour. ${e.getMessage}")) + BigDecimal(-1) + } vmCostPerHour = Option(cost) tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) } diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index d65dc59b2d2..79451689bec 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -47,12 +47,22 @@ object CostCatalogKey { region <- sku.getServiceRegionsList.asScala.toList } yield CostCatalogKey(machineType, usageType, machineCustomization, resourceType, region) + def apply(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): Option[CostCatalogKey] = + MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType => + CostCatalogKey( + mType, + UsageType.fromBoolean(instantiatedVmInfo.preemptible), + MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType), + resourceType, + instantiatedVmInfo.region + ) + } } case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { override def serviceName: String = "GcpCostCatalogService" } -case class GcpCostLookupResponse(calculatedCost: Option[BigDecimal]) +case class GcpCostLookupResponse(calculatedCost: Try[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) case class ExpiringGcpCostCatalog(catalog: Map[CostCatalogKey, CostCatalogValue], fetchTime: Instant) @@ -62,6 +72,47 @@ object GcpCostCatalogService { // ISO 4217 https://developers.google.com/adsense/management/appendix/currencies private val DEFAULT_CURRENCY_CODE = "USD" + + def getMostRecentPricingInfo(sku: Sku): PricingInfo = { + val mostRecentPricingInfoIndex = sku.getPricingInfoCount - 1 + sku.getPricingInfo(mostRecentPricingInfoIndex) + } + + // See: https://cloud.google.com/billing/v1/how-tos/catalog-api + def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { + val pricingInfo = getMostRecentPricingInfo(cpuSku) + val usageUnit = pricingInfo.getPricingExpression.getUsageUnit + if (usageUnit == "h") { + // Price per hour of a single core + // NB: Ignoring "TieredRates" here (the idea that stuff gets cheaper the more you use). + // Technically, we should write code that determines which tier(s) to use. + // In practice, from what I've seen, CPU cores and RAM don't have more than a single tier. + val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice + val costPerCorePerHour: BigDecimal = + costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + + println(s"Calculated ${coreCount} cpu cores to cost ${costPerCorePerHour * coreCount} per hour") + Success(costPerCorePerHour * coreCount) + } else { + Failure(new UnsupportedOperationException(s"Expected usage units of CPUs to be 'h'. Got ${usageUnit}")) + } + } + + def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): Try[BigDecimal] = { + val pricingInfo = getMostRecentPricingInfo(ramSku) + val usageUnit = pricingInfo.getPricingExpression.getUsageUnit + if (usageUnit == "GiBy.h") { + val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice + val costPerGbHour: BigDecimal = + costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + println(s"Calculated ${ramGbCount} GB of ram to cost ${ramGbCount * costPerGbHour} per hour") + Success(costPerGbHour * ramGbCount) + } else { + Failure( + new UnsupportedOperationException(s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}") + ) + } + } } /** @@ -136,96 +187,48 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service acc ++ keys.map(k => (k, CostCatalogValue(sku))) } - // See: https://cloud.google.com/billing/v1/how-tos/catalog-api - private def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { - val pricingInfo = getMostRecentPricingInfo(cpuSku) - val usageUnit = pricingInfo.getPricingExpression.getUsageUnit - if (usageUnit != "h") { - return Failure(new UnsupportedOperationException(s"Expected usage units of CPUs to be 'h'. Got ${usageUnit}")) - } - // Price per hour of a single core - // NB: Ignoring "TieredRates" here (the idea that stuff gets cheaper the more you use). - // Technically, we should write code that determines which tier(s) to use. - // In practice, from what I've seen, CPU cores and RAM don't have more than a single tier. - val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice - val costPerCorePerHour: BigDecimal = - costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal - - println(s"Calculated ${coreCount} cpu cores to cost ${costPerCorePerHour * coreCount} per hour") - Success(costPerCorePerHour * coreCount) - } - - private def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): Try[BigDecimal] = { - val pricingInfo = getMostRecentPricingInfo(ramSku) - val usageUnit = pricingInfo.getPricingExpression.getUsageUnit - if (usageUnit != "GiBy.h") { - return Failure(new UnsupportedOperationException(s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}")) - } - val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice - val costPerGbHour: BigDecimal = - costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal - println(s"Calculated ${ramGbCount} GB of ram to cost ${ramGbCount * costPerGbHour} per hour") - Success(costPerGbHour * ramGbCount) - } - - private def getMostRecentPricingInfo(sku: Sku): PricingInfo = { - val mostRecentPricingInfoIndex = sku.getPricingInfoCount - 1 - sku.getPricingInfo(mostRecentPricingInfoIndex) - } - - private def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): Try[BigDecimal] = { - val machineType = MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType) - val usageType = UsageType.fromBoolean(instantiatedVmInfo.preemptible) - val machineCustomization = MachineCustomization.fromMachineTypeString(instantiatedVmInfo.machineType) - val region = instantiatedVmInfo.region - val coreCount = MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) - val ramMbCount = MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) - val ramGbCount = ramMbCount.getOrElse(0) / 1024 - - val cpuResourceType = Cpu - val cpuKey = - CostCatalogKey(machineType.get, usageType, machineCustomization, cpuResourceType, region) // TODO .get + def lookUpSku(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): Try[Sku] = { + val keyOpt = CostCatalogKey(instantiatedVmInfo, resourceType) // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt // to fall back to predefined. - lazy val n1PredefinedCpuSku = (machineType, machineCustomization) match { - case (Some(N1), Custom) => getSku(cpuKey.copy(machineCustomization = Predefined)) - case _ => None + lazy val n1PredefinedSku = keyOpt.flatMap { key => + (key.machineType, key.machineCustomization) match { + case (N1, Custom) => getSku(key.copy(machineCustomization = Predefined)) + case _ => None + } } - - val cpuSku = getSku(cpuKey).orElse(n1PredefinedCpuSku) - if (cpuSku.isEmpty) { - println(s"Failed to find CPU Sku for ${cpuKey}") - } else { - println(s"Found CPU Sku ${cpuSku.get.catalogObject.getDescription} from key ${cpuKey}") + val sku = keyOpt.flatMap(getSku).orElse(n1PredefinedSku).map(_.catalogObject) + sku match { + case Some(s) => Success(s) + case None => Failure(new Exception(s"Failed to look up ${resourceType} SKU for ${instantiatedVmInfo}")) } - val cpuCost = cpuSku.map(sku => calculateCpuPricePerHour(sku.catalogObject, coreCount.get)) // TODO .get + } - val ramResourceType = Ram - val ramKey = - CostCatalogKey(machineType.get, usageType, machineCustomization, ramResourceType, region) // TODO .get + def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): Try[BigDecimal] = { + val cpuPricePerHour: Try[BigDecimal] = for { + cpuSku <- lookUpSku(instantiatedVmInfo, Cpu) + coreCount <- MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) + pricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount) + } yield pricePerHour - // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt - // to fall back to predefined. - lazy val n1PredefinedRamSku = (machineType, machineCustomization) match { - case (Some(N1), Custom) => getSku(ramKey.copy(machineCustomization = Predefined)) - case _ => None - } + val ramPricePerHour: Try[BigDecimal] = for { + ramSku <- lookUpSku(instantiatedVmInfo, Ram) + ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) + ramGbCount = ramMbCount / 1024 + pricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount) + } yield pricePerHour - val ramSku = getSku(ramKey).orElse(n1PredefinedRamSku) - if (ramSku.isEmpty) { - println(s"Failed to find Ram Sku for ${ramKey}") - } else { - println(s"Found CPU Sku ${ramSku.get.catalogObject.getDescription} from key ${ramKey}") - } - val ramCost = ramSku.map(sku => calculateRamPricePerHour(sku.catalogObject, ramGbCount)) // TODO .get - Success(cpuCost.get.get + ramCost.get.get) + for { + cpuPrice <- cpuPricePerHour + ramPrice <- ramPricePerHour + } yield cpuPrice + ramPrice } def serviceRegistryActor: ActorRef = serviceRegistry override def receive: Receive = { case GcpCostLookupRequest(vmInfo, replyTo) => - val calculatedCost = calculateVmCostPerHour(vmInfo).toOption + val calculatedCost = calculateVmCostPerHour(vmInfo) val response = GcpCostLookupResponse(calculatedCost) replyTo ! response case ShutdownCommand => From 6467f049db1faa91df2d746437327761f053ddce Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 27 Sep 2024 16:29:44 -0400 Subject: [PATCH 28/39] Try -> ErrorOr --- .../PollResultMonitorActor.scala | 9 +-- .../services/cost/GcpCostCatalogService.scala | 59 ++++++++++--------- .../services/cost/GcpCostCatalogTypes.scala | 32 +++++----- 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index bf146f1ca8f..172fd81c8a4 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -1,5 +1,6 @@ package cromwell.backend.standard.pollmonitoring import akka.actor.{Actor, ActorRef} +import cats.data.Validated.{Invalid, Valid} import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.validation.{ CpuValidation, @@ -17,7 +18,6 @@ import wdl4s.parser.MemoryUnit import java.time.OffsetDateTime import java.time.temporal.ChronoUnit -import scala.util.{Failure, Success} trait PollResultMessage case class ProcessThisPollResult[PollResultType](pollResult: PollResultType) extends PollResultMessage case class AsyncJobHasFinished[PollResultType](pollResult: PollResultType) extends PollResultMessage @@ -157,9 +157,10 @@ trait PollResultMonitorActor[PollResultType] extends Actor { println(s"Handling Cost Response from Catalog Service: ${costLookupResponse}") if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. val cost = costLookupResponse.calculatedCost match { - case Success(c) => c - case Failure(e) => - params.logger.foreach(_.error(s"Failed to calculate VM cost per hour. ${e.getMessage}")) + case Valid(c) => c + case Invalid(errors) => + // TODO contextualizeErrors + params.logger.foreach(_.error(s"Failed to calculate VM cost per hour. ${errors.toList.mkString(", ")}")) BigDecimal(-1) } vmCostPerHour = Option(cost) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 79451689bec..2da60940842 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -1,11 +1,14 @@ package cromwell.services.cost import akka.actor.{Actor, ActorRef} +import cats.implicits.catsSyntaxValidatedId import com.google.`type`.Money import com.google.cloud.billing.v1._ import com.typesafe.config.Config import com.typesafe.scalalogging.LazyLogging import common.util.StringUtil.EnhancedToStringable +import common.validation.ErrorOr._ +import common.validation.ErrorOr.ErrorOr import cromwell.services.ServiceRegistryActor.ServiceRegistryMessage import cromwell.services.cost.GcpCostCatalogService.{COMPUTE_ENGINE_SERVICE_NAME, DEFAULT_CURRENCY_CODE} import cromwell.util.GracefulShutdownHelper.ShutdownCommand @@ -13,7 +16,6 @@ import cromwell.util.GracefulShutdownHelper.ShutdownCommand import java.time.{Duration, Instant} import scala.jdk.CollectionConverters.IterableHasAsScala import java.time.temporal.ChronoUnit.SECONDS -import scala.util.{Failure, Success, Try} case class CostCatalogKey(machineType: MachineType, usageType: UsageType, @@ -47,7 +49,7 @@ object CostCatalogKey { region <- sku.getServiceRegionsList.asScala.toList } yield CostCatalogKey(machineType, usageType, machineCustomization, resourceType, region) - def apply(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): Option[CostCatalogKey] = + def apply(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): ErrorOr[CostCatalogKey] = MachineType.fromGoogleMachineTypeString(instantiatedVmInfo.machineType).map { mType => CostCatalogKey( mType, @@ -62,7 +64,7 @@ object CostCatalogKey { case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { override def serviceName: String = "GcpCostCatalogService" } -case class GcpCostLookupResponse(calculatedCost: Try[BigDecimal]) +case class GcpCostLookupResponse(calculatedCost: ErrorOr[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) case class ExpiringGcpCostCatalog(catalog: Map[CostCatalogKey, CostCatalogValue], fetchTime: Instant) @@ -79,7 +81,7 @@ object GcpCostCatalogService { } // See: https://cloud.google.com/billing/v1/how-tos/catalog-api - def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): Try[BigDecimal] = { + def calculateCpuPricePerHour(cpuSku: Sku, coreCount: Int): ErrorOr[BigDecimal] = { val pricingInfo = getMostRecentPricingInfo(cpuSku) val usageUnit = pricingInfo.getPricingExpression.getUsageUnit if (usageUnit == "h") { @@ -92,25 +94,26 @@ object GcpCostCatalogService { costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal println(s"Calculated ${coreCount} cpu cores to cost ${costPerCorePerHour * coreCount} per hour") - Success(costPerCorePerHour * coreCount) + val result = costPerCorePerHour * coreCount + result.validNel } else { - Failure(new UnsupportedOperationException(s"Expected usage units of CPUs to be 'h'. Got ${usageUnit}")) + s"Expected usage units of CPUs to be 'h'. Got ${usageUnit}".invalidNel } } - def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): Try[BigDecimal] = { + def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): ErrorOr[BigDecimal] = { val pricingInfo = getMostRecentPricingInfo(ramSku) val usageUnit = pricingInfo.getPricingExpression.getUsageUnit if (usageUnit == "GiBy.h") { val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice val costPerGbHour: BigDecimal = costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal + println(s"Calculated ${ramGbCount} GB of ram to cost ${ramGbCount * costPerGbHour} per hour") - Success(costPerGbHour * ramGbCount) + val result = costPerGbHour * ramGbCount + result.validNel } else { - Failure( - new UnsupportedOperationException(s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}") - ) + s"Expected usage units of RAM to be 'GiBy.h'. Got ${usageUnit}".invalidNel } } } @@ -187,32 +190,30 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service acc ++ keys.map(k => (k, CostCatalogValue(sku))) } - def lookUpSku(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): Try[Sku] = { - val keyOpt = CostCatalogKey(instantiatedVmInfo, resourceType) - - // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt - // to fall back to predefined. - lazy val n1PredefinedSku = keyOpt.flatMap { key => - (key.machineType, key.machineCustomization) match { - case (N1, Custom) => getSku(key.copy(machineCustomization = Predefined)) - case _ => None + def lookUpSku(instantiatedVmInfo: InstantiatedVmInfo, resourceType: ResourceType): ErrorOr[Sku] = + CostCatalogKey(instantiatedVmInfo, resourceType).flatMap { key => + // As of Sept 2024 the cost catalog does not contain entries for custom N1 machines. If we're using N1, attempt + // to fall back to predefined. + lazy val n1PredefinedKey = + (key.machineType, key.machineCustomization) match { + case (N1, Custom) => Option(key.copy(machineCustomization = Predefined)) + case _ => None + } + val sku = getSku(key).orElse(n1PredefinedKey.flatMap(getSku)).map(_.catalogObject) + sku match { + case Some(sku) => sku.validNel + case None => s"Failed to look up ${resourceType} SKU for ${instantiatedVmInfo}".invalidNel } } - val sku = keyOpt.flatMap(getSku).orElse(n1PredefinedSku).map(_.catalogObject) - sku match { - case Some(s) => Success(s) - case None => Failure(new Exception(s"Failed to look up ${resourceType} SKU for ${instantiatedVmInfo}")) - } - } - def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): Try[BigDecimal] = { - val cpuPricePerHour: Try[BigDecimal] = for { + def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = { + val cpuPricePerHour: ErrorOr[BigDecimal] = for { cpuSku <- lookUpSku(instantiatedVmInfo, Cpu) coreCount <- MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) pricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount) } yield pricePerHour - val ramPricePerHour: Try[BigDecimal] = for { + val ramPricePerHour: ErrorOr[BigDecimal] = for { ramSku <- lookUpSku(instantiatedVmInfo, Ram) ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) ramGbCount = ramMbCount / 1024 diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index ac641edebf1..9705c64794f 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -1,9 +1,10 @@ package cromwell.services.cost +import cats.implicits.catsSyntaxValidatedId import com.google.cloud.billing.v1.Sku +import common.validation.ErrorOr.ErrorOr import java.util.regex.{Matcher, Pattern} -import scala.util.{Failure, Success, Try} /* * Case class that contains information retrieved from Google about a VM that cromwell has started @@ -29,34 +30,33 @@ object MachineType { } // expects a string that looks something like "n1-standard-1" or "custom-1-4096" - def fromGoogleMachineTypeString(machineTypeString: String): Option[MachineType] = - if (machineTypeString.toLowerCase().startsWith("n1-")) Some(N1) - else if (machineTypeString.toLowerCase().startsWith("n2d-")) Some(N2d) - else if (machineTypeString.toLowerCase().startsWith("n2-")) Some(N2) - else if (machineTypeString.toLowerCase().startsWith("custom-")) Some(N1) // by convention - else { - println(s"Error: Unrecognized machine type: $machineTypeString") - None - } + def fromGoogleMachineTypeString(machineTypeString: String): ErrorOr[MachineType] = { + val mType = machineTypeString.toLowerCase() + if (mType.startsWith("n1-")) N1.validNel + else if (mType.startsWith("n2d-")) N2d.validNel + else if (mType.startsWith("n2-")) N2.validNel + else if (mType.startsWith("custom-")) N1.validNel // by convention + else s"Error: Unrecognized machine type: $machineTypeString".invalidNel + } - def extractCoreCountFromMachineTypeString(machineTypeString: String): Try[Int] = { + def extractCoreCountFromMachineTypeString(machineTypeString: String): ErrorOr[Int] = { val pattern: Pattern = Pattern.compile("-(\\d+)") val matcher: Matcher = pattern.matcher(machineTypeString) if (matcher.find()) { - Success(matcher.group(1).toInt) + matcher.group(1).toInt.validNel } else { - Failure(new IllegalArgumentException(s"Could not extract core count from ${machineTypeString}")) + s"Could not extract core count from ${machineTypeString}".invalidNel } } - def extractRamMbFromMachineTypeString(machineTypeString: String): Try[Int] = { + def extractRamMbFromMachineTypeString(machineTypeString: String): ErrorOr[Int] = { // Regular expression to match the number after the dash at the end of the string // TODO add test val pattern: Pattern = Pattern.compile(".*?-(\\d+)$") val matcher: Matcher = pattern.matcher(machineTypeString); if (matcher.find()) { - Success(matcher.group(1).toInt) + matcher.group(1).toInt.validNel } else { - Failure(new IllegalArgumentException(s"Could not Ram MB count from ${machineTypeString}")) + s"Could not Ram MB count from ${machineTypeString}".invalidNel } } } From 30df0985edbddbfa25db8ccbd770c1edeafa934a Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Fri, 27 Sep 2024 16:36:46 -0400 Subject: [PATCH 29/39] Require logger --- .../PollResultMonitorActor.scala | 4 +- .../actors/BatchPollResultMonitorActor.scala | 39 +++++++++---------- .../common/PapiPollResultMonitorActor.scala | 29 +++++--------- 3 files changed, 30 insertions(+), 42 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index 172fd81c8a4..3a9dddbb0bb 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -28,7 +28,7 @@ case class PollMonitorParameters( jobDescriptor: BackendJobDescriptor, validatedRuntimeAttributes: ValidatedRuntimeAttributes, platform: Option[Platform], - logger: Option[JobLogger] + logger: JobLogger ) /** @@ -160,7 +160,7 @@ trait PollResultMonitorActor[PollResultType] extends Actor { case Valid(c) => c case Invalid(errors) => // TODO contextualizeErrors - params.logger.foreach(_.error(s"Failed to calculate VM cost per hour. ${errors.toList.mkString(", ")}")) + params.logger.error(s"Failed to calculate VM cost per hour. ${errors.toList.mkString(", ")}") BigDecimal(-1) } vmCostPerHour = Option(cost) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala index 0f935b17c79..a313ab49b05 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala @@ -3,7 +3,13 @@ package cromwell.backend.google.batch.actors import akka.actor.{ActorRef, Props} import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.google.batch.models.RunStatus -import cromwell.backend.standard.pollmonitoring.{AsyncJobHasFinished, PollMonitorParameters, PollResultMessage, PollResultMonitorActor, ProcessThisPollResult} +import cromwell.backend.standard.pollmonitoring.{ + AsyncJobHasFinished, + PollMonitorParameters, + PollResultMessage, + PollResultMonitorActor, + ProcessThisPollResult +} import cromwell.backend.validation.ValidatedRuntimeAttributes import cromwell.core.logging.JobLogger import cromwell.services.cost.InstantiatedVmInfo @@ -20,13 +26,7 @@ object BatchPollResultMonitorActor { logger: JobLogger ): Props = Props( new BatchPollResultMonitorActor( - PollMonitorParameters(serviceRegistry, - workflowDescriptor, - jobDescriptor, - runtimeAttributes, - platform, - Option(logger) - ) + PollMonitorParameters(serviceRegistry, workflowDescriptor, jobDescriptor, runtimeAttributes, platform, logger) ) ) } @@ -51,28 +51,25 @@ class BatchPollResultMonitorActor(pollMonitorParameters: PollMonitorParameters) message match { case ProcessThisPollResult(pollResult: RunStatus) => processPollResult(pollResult) case ProcessThisPollResult(result) => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Received Poll Result of unknown type. Expected ${RunStatus.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." - ) + params.logger.error( + s"Programmer error: Received Poll Result of unknown type. Expected ${RunStatus.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) + case AsyncJobHasFinished(pollResult: RunStatus) => handleAsyncJobFinish(pollResult.getClass.getSimpleName) case AsyncJobHasFinished(result) => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Received Poll Result of unknown type. Expected ${AsyncJobHasFinished.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." - ) + params.logger.error( + s"Programmer error: Received Poll Result of unknown type. Expected ${AsyncJobHasFinished.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) + } case _ => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Cost Helper received message of type other than CostPollingMessage" - ) + params.logger.error( + s"Programmer error: Cost Helper received message of type other than CostPollingMessage" ) + } override def params: PollMonitorParameters = pollMonitorParameters - override def extractVmInfoFromRunState(pollStatus: RunStatus): Option[InstantiatedVmInfo] = Option.empty //TODO + override def extractVmInfoFromRunState(pollStatus: RunStatus): Option[InstantiatedVmInfo] = Option.empty // TODO } diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala index 0cc86ae3588..1206ac32e1d 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala @@ -20,13 +20,7 @@ object PapiPollResultMonitorActor { logger: JobLogger ): Props = Props( new PapiPollResultMonitorActor( - PollMonitorParameters(serviceRegistry, - workflowDescriptor, - jobDescriptor, - runtimeAttributes, - platform, - Option(logger) - ) + PollMonitorParameters(serviceRegistry, workflowDescriptor, jobDescriptor, runtimeAttributes, platform, logger) ) ) } @@ -56,24 +50,21 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll message match { case ProcessThisPollResult(pollResult: RunStatus) => processPollResult(pollResult) case ProcessThisPollResult(result) => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Received Poll Result of unknown type. Expected ${RunStatus.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." - ) + params.logger.error( + s"Programmer error: Received Poll Result of unknown type. Expected ${RunStatus.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) + case AsyncJobHasFinished(pollResult: RunStatus) => handleAsyncJobFinish(pollResult.getClass.getSimpleName) case AsyncJobHasFinished(result) => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Received Poll Result of unknown type. Expected ${AsyncJobHasFinished.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." - ) + params.logger.error( + s"Programmer error: Received Poll Result of unknown type. Expected ${AsyncJobHasFinished.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) + } case unexpected => - params.logger.foreach(logger => - logger.error( - s"Programmer error: Cost Helper received message of unexpected type. Was ${unexpected.getClass.getSimpleName}." - ) + params.logger.error( + s"Programmer error: Cost Helper received message of unexpected type. Was ${unexpected.getClass.getSimpleName}." ) + } } From 248e16dabfb294885ae8c66c388a6d51d523f3eb Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 11:44:21 -0400 Subject: [PATCH 30/39] Fixes --- .../services/cost/GcpCostCatalogService.scala | 4 ++-- .../cromwell/services/cost/GcpCostCatalogTypes.scala | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 2da60940842..c4eb1f682d6 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -101,7 +101,7 @@ object GcpCostCatalogService { } } - def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Int): ErrorOr[BigDecimal] = { + def calculateRamPricePerHour(ramSku: Sku, ramGbCount: Double): ErrorOr[BigDecimal] = { val pricingInfo = getMostRecentPricingInfo(ramSku) val usageUnit = pricingInfo.getPricingExpression.getUsageUnit if (usageUnit == "GiBy.h") { @@ -216,7 +216,7 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service val ramPricePerHour: ErrorOr[BigDecimal] = for { ramSku <- lookUpSku(instantiatedVmInfo, Ram) ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) - ramGbCount = ramMbCount / 1024 + ramGbCount = ramMbCount / 1024d pricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount) } yield pricePerHour diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index 9705c64794f..728c814f1f6 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -36,11 +36,12 @@ object MachineType { else if (mType.startsWith("n2d-")) N2d.validNel else if (mType.startsWith("n2-")) N2.validNel else if (mType.startsWith("custom-")) N1.validNel // by convention - else s"Error: Unrecognized machine type: $machineTypeString".invalidNel + else s"Unrecognized machine type: $machineTypeString".invalidNel } def extractCoreCountFromMachineTypeString(machineTypeString: String): ErrorOr[Int] = { - val pattern: Pattern = Pattern.compile("-(\\d+)") + // Regex to capture second-to-last hyphen-delimited token as number + val pattern: Pattern = Pattern.compile("-(\\d+)-[^-]+$") val matcher: Matcher = pattern.matcher(machineTypeString) if (matcher.find()) { matcher.group(1).toInt.validNel @@ -49,14 +50,13 @@ object MachineType { } } def extractRamMbFromMachineTypeString(machineTypeString: String): ErrorOr[Int] = { - // Regular expression to match the number after the dash at the end of the string - // TODO add test - val pattern: Pattern = Pattern.compile(".*?-(\\d+)$") + // Regular expression to match the number after a hyphen at the end of the string + val pattern: Pattern = Pattern.compile("-(\\d+)$") val matcher: Matcher = pattern.matcher(machineTypeString); if (matcher.find()) { matcher.group(1).toInt.validNel } else { - s"Could not Ram MB count from ${machineTypeString}".invalidNel + s"Could not extract Ram MB count from ${machineTypeString}".invalidNel } } } From f88ebff092f49303be8cdfd5217ce4d5017a6aa9 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 11:45:26 -0400 Subject: [PATCH 31/39] Unit tests --- .../cost/GcpCostCatalogServiceSpec.scala | 163 +++++++++++++++++- .../pipelines/common/CostLookupSpec.scala | 73 +++++--- 2 files changed, 211 insertions(+), 25 deletions(-) diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 2cdd1f4154b..86ad7a68656 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -9,6 +9,7 @@ import cromwell.util.GracefulShutdownHelper.ShutdownCommand import org.scalatest.concurrent.Eventually import org.scalatest.flatspec.AnyFlatSpecLike import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks import java.time.Duration import java.io.{File, FileInputStream, FileOutputStream} @@ -53,7 +54,8 @@ class GcpCostCatalogServiceSpec with AnyFlatSpecLike with Matchers with Eventually - with ImplicitSender { + with ImplicitSender + with TableDrivenPropertyChecks { behavior of "CostCatalogService" def constructTestActor: GcpCostCatalogServiceTestActor = @@ -110,4 +112,163 @@ class GcpCostCatalogServiceSpec val foundValue = testActorRef.getSku(expectedKey) foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Custom Instance Ram running in Paris" } + + it should "find the skus for a VM when appropriate" in { + val lookupRows = Table( + ("instantiatedVmInfo", "resource", "skuDescription"), + (InstantiatedVmInfo("europe-west9", "custom-16-32768", false), + Cpu, + "N1 Predefined Instance Core running in Paris" + ), + (InstantiatedVmInfo("europe-west9", "custom-16-32768", false), + Ram, + "N1 Predefined Instance Ram running in Paris" + ), + (InstantiatedVmInfo("us-central1", "custom-4-4096", true), + Cpu, + "Spot Preemptible N1 Predefined Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "custom-4-4096", true), + Ram, + "Spot Preemptible N1 Predefined Instance Ram running in Americas" + ), + (InstantiatedVmInfo("europe-west9", "n1-custom-16-32768", false), + Cpu, + "N1 Predefined Instance Core running in Paris" + ), + (InstantiatedVmInfo("europe-west9", "n1-custom-16-32768", false), + Ram, + "N1 Predefined Instance Ram running in Paris" + ), + (InstantiatedVmInfo("us-central1", "n1-custom-4-4096", true), + Cpu, + "Spot Preemptible N1 Predefined Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n1-custom-4-4096", true), + Ram, + "Spot Preemptible N1 Predefined Instance Ram running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", true), + Cpu, + "Spot Preemptible N2 Custom Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", true), + Ram, + "Spot Preemptible N2 Custom Instance Ram running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", false), + Cpu, + "N2 Custom Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", false), Ram, "N2 Custom Instance Ram running in Americas"), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", true), + Cpu, + "Spot Preemptible N2D AMD Custom Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", true), + Ram, + "Spot Preemptible N2D AMD Custom Instance Ram running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", false), + Cpu, + "N2D AMD Custom Instance Core running in Americas" + ), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", false), + Ram, + "N2D AMD Custom Instance Ram running in Americas" + ) + ) + + forAll(lookupRows) { case (instantiatedVmInfo: InstantiatedVmInfo, resource: ResourceType, expectedSku: String) => + val skuOr = testActorRef.lookUpSku(instantiatedVmInfo, resource) + skuOr.isValid shouldBe true + skuOr.map(sku => sku.getDescription shouldEqual expectedSku) + } + } + + it should "fail to find the skus for a VM when appropriate" in { + val lookupRows = Table( + ("instantiatedVmInfo", "resource", "errors"), + (InstantiatedVmInfo("us-central1", "custooooooom-4-4096", true), + Cpu, + List("Unrecognized machine type: custooooooom-4-4096") + ), + (InstantiatedVmInfo("us-central1", "n2custom-4-4096", true), + Cpu, + List("Unrecognized machine type: n2custom-4-4096") + ), + (InstantiatedVmInfo("us-central1", "standard-4-4096", true), + Cpu, + List("Unrecognized machine type: standard-4-4096") + ), + (InstantiatedVmInfo("planet-mars1", "custom-4-4096", true), + Cpu, + List("Failed to look up Cpu SKU for InstantiatedVmInfo(planet-mars1,custom-4-4096,true)") + ) + ) + + forAll(lookupRows) { + case (instantiatedVmInfo: InstantiatedVmInfo, resource: ResourceType, expectedErrors: List[String]) => + val skuOr = testActorRef.lookUpSku(instantiatedVmInfo, resource) + skuOr.isValid shouldBe false + skuOr.leftMap(errors => errors.toList shouldEqual expectedErrors) + } + } + + it should "calculate the cost per hour for a VM" in { + // Create BigDecimals from strings to avoid inequality due to floating point shenanigans + val lookupRows = Table( + ("instantiatedVmInfo", "costPerHour"), + (InstantiatedVmInfo("us-central1", "custom-4-4096", true), BigDecimal(".361")), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", true), BigDecimal(".42544000000000004")), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", true), BigDecimal(".2371600000000000024")), + (InstantiatedVmInfo("us-central1", "custom-4-4096", false), BigDecimal("1.43392")), + (InstantiatedVmInfo("us-central1", "n2-custom-4-4096", false), BigDecimal("1.50561600000000012")), + (InstantiatedVmInfo("us-central1", "n2d-custom-4-4096", false), BigDecimal("1.309896")), + (InstantiatedVmInfo("europe-west9", "custom-4-4096", true), BigDecimal(".3501808")), + (InstantiatedVmInfo("europe-west9", "n2-custom-4-4096", true), BigDecimal("0.49532")), + (InstantiatedVmInfo("europe-west9", "n2d-custom-4-4096", true), BigDecimal("0.30608")), + (InstantiatedVmInfo("europe-west9", "custom-4-4096", false), BigDecimal("1.663347200000000016")), + (InstantiatedVmInfo("europe-west9", "n2-custom-4-4352", false), BigDecimal("1.75941630500000012")), + (InstantiatedVmInfo("europe-west9", "n2d-custom-4-4096", false), BigDecimal("1.51947952")) + ) + + forAll(lookupRows) { case (instantiatedVmInfo: InstantiatedVmInfo, expectedCostPerHour: BigDecimal) => + val costOr = testActorRef.calculateVmCostPerHour(instantiatedVmInfo) + costOr.isValid shouldBe true + costOr.map(cost => cost shouldEqual expectedCostPerHour) + } + } + + it should "fail to calculate the cost oer hour for a VM" in { + + val lookupRows = Table( + ("instantiatedVmInfo", "errors"), + (InstantiatedVmInfo("us-central1", "custooooooom-4-4096", true), + List("Unrecognized machine type: custooooooom-4-4096") + ), + (InstantiatedVmInfo("us-central1", "n2_custom_4_4096", true), + List("Unrecognized machine type: n2_custom_4_4096") + ), + (InstantiatedVmInfo("us-central1", "custom-foo-4096", true), + List("Could not extract core count from custom-foo-4096") + ), + (InstantiatedVmInfo("us-central1", "custom-16-bar", true), + List("Could not extract Ram MB count from custom-16-bar") + ), + (InstantiatedVmInfo("us-central1", "123-456-789", true), List("Unrecognized machine type: 123-456-789")), + (InstantiatedVmInfo("us-central1", "n2-16-4096", true), + List("Failed to look up Cpu SKU for InstantiatedVmInfo(us-central1,n2-16-4096,true)") + ), + (InstantiatedVmInfo("planet-mars1", "n2-custom-4-4096", true), + List("Failed to look up Cpu SKU for InstantiatedVmInfo(planet-mars1,n2-custom-4-4096,true)") + ) + ) + + forAll(lookupRows) { case (instantiatedVmInfo: InstantiatedVmInfo, expectedErrors: List[String]) => + val costOr = testActorRef.calculateVmCostPerHour(instantiatedVmInfo) + costOr.isValid shouldBe false + costOr.leftMap(errors => errors.toList shouldEqual expectedErrors) + } + } } diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala index 10d643912f4..8437a211fa7 100644 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala +++ b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala @@ -6,8 +6,15 @@ import org.scalatest.flatspec.AnyFlatSpecLike import org.scalatest.matchers.should.Matchers import cromwell.services.cost._ import org.scalatest.concurrent.Eventually +import org.scalatest.prop.TableDrivenPropertyChecks -class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers with Eventually with ImplicitSender { +class CostLookupSpec + extends TestKitSuite + with AnyFlatSpecLike + with Matchers + with Eventually + with ImplicitSender + with TableDrivenPropertyChecks { behavior of "CostLookup" def constructTestActor: GcpCostCatalogServiceTestActor = @@ -21,10 +28,10 @@ class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers wit val testCatalogService = constructTestActor it should "find a CPU sku" in { - val machineType = Some(N2) - val usageType = Some(OnDemand) - val customization = Some(Custom) - val resourceGroup = Some(Cpu) + val machineType = N2 + val usageType = OnDemand + val customization = Custom + val resourceGroup = Cpu val region = "europe-west9" val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) val result = testCatalogService.getSku(key).get.catalogObject.getDescription @@ -32,10 +39,10 @@ class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers wit } it should "find a RAM sku" in { - val machineType = Some(N2) - val usageType = Some(OnDemand) - val customization = Some(Custom) - val resourceGroup = Some(Ram) + val machineType = N2 + val usageType = OnDemand + val customization = Custom + val resourceGroup = Ram val region = "europe-west9" val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) val result = testCatalogService.getSku(key).get.catalogObject.getDescription @@ -43,20 +50,38 @@ class CostLookupSpec extends TestKitSuite with AnyFlatSpecLike with Matchers wit } it should "find CPU skus for all supported machine types" in { - val legalMachineTypes: List[MachineType] = List(N1, N2, N2d) - val legalUsageTypes: List[UsageType] = List(Preemptible, OnDemand) - val legalCustomizations: List[MachineCustomization] = List(Custom, Predefined) - val resourceGroup: Option[ResourceType] = Some(Cpu) - val region = "us-west1" - for (machineType <- legalMachineTypes) - for (usageType <- legalUsageTypes) - for (customization <- legalCustomizations) { - val key = CostCatalogKey(Some(machineType), Some(usageType), Some(customization), resourceGroup, region) - val result = testCatalogService.getSku(key) - if (!result.isEmpty) { - println("Success") - } - result.isEmpty shouldBe false - } + val lookupRows = Table( + ("machineType", "usage", "customization", "resource", "region", "exists"), + (N1, Preemptible, Predefined, Cpu, "us-west1", true), + (N1, Preemptible, Predefined, Ram, "us-west1", true), + (N1, OnDemand, Predefined, Cpu, "us-west1", true), + (N1, OnDemand, Predefined, Ram, "us-west1", true), + (N1, Preemptible, Custom, Cpu, "us-west1", false), + (N1, Preemptible, Custom, Ram, "us-west1", false), + (N1, OnDemand, Custom, Cpu, "us-west1", false), + (N1, OnDemand, Custom, Ram, "us-west1", false), + (N2, Preemptible, Predefined, Cpu, "us-west1", false), + (N2, Preemptible, Predefined, Ram, "us-west1", false), + (N2, OnDemand, Predefined, Cpu, "us-west1", false), + (N2, OnDemand, Predefined, Ram, "us-west1", false), + (N2, Preemptible, Custom, Cpu, "us-west1", true), + (N2, Preemptible, Custom, Ram, "us-west1", true), + (N2, OnDemand, Custom, Cpu, "us-west1", true), + (N2, OnDemand, Custom, Ram, "us-west1", true), + (N2d, Preemptible, Predefined, Cpu, "us-west1", false), + (N2d, Preemptible, Predefined, Ram, "us-west1", false), + (N2d, OnDemand, Predefined, Cpu, "us-west1", false), + (N2d, OnDemand, Predefined, Ram, "us-west1", false), + (N2d, Preemptible, Custom, Cpu, "us-west1", true), + (N2d, Preemptible, Custom, Ram, "us-west1", true), + (N2d, OnDemand, Custom, Cpu, "us-west1", true), + (N2d, OnDemand, Custom, Ram, "us-west1", true) + ) + + forAll(lookupRows) { case (machineType, usage, customization, resource, region, exists: Boolean) => + val key = CostCatalogKey(machineType, usage, customization, resource, region) + val result = testCatalogService.getSku(key) + result.nonEmpty shouldBe exists + } } } From 2baf2c06664a695b314409c2f058ff4e3dc4ad60 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 13:44:26 -0400 Subject: [PATCH 32/39] Logging improvements --- .../PollResultMonitorActor.scala | 14 +++---- .../services/cost/GcpCostCatalogService.scala | 37 ++++++++----------- .../api/request/GetRequestHandler.scala | 11 ++++-- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index 3a9dddbb0bb..f8e7b52ec5e 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -134,7 +134,6 @@ trait PollResultMonitorActor[PollResultType] extends Actor { if (vmCostPerHour.isEmpty) { val instantiatedVmInfo = extractVmInfoFromRunState(pollStatus) instantiatedVmInfo.foreach { vmInfo => - println(s"Requesting cost info for: ${vmInfo}") val request = GcpCostLookupRequest(vmInfo, self) params.serviceRegistry ! request } @@ -153,18 +152,19 @@ trait PollResultMonitorActor[PollResultType] extends Actor { ) ) - def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = { - println(s"Handling Cost Response from Catalog Service: ${costLookupResponse}") + def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. val cost = costLookupResponse.calculatedCost match { - case Valid(c) => c + case Valid(c) => + params.logger.info(s"vmCostPerHour for ${costLookupResponse.vmInfo} is ${c}") + c case Invalid(errors) => - // TODO contextualizeErrors - params.logger.error(s"Failed to calculate VM cost per hour. ${errors.toList.mkString(", ")}") + params.logger.error( + s"Failed to calculate VM cost per hour for ${costLookupResponse.vmInfo}. ${errors.toList.mkString(", ")}" + ) BigDecimal(-1) } vmCostPerHour = Option(cost) tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) } - } } diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index c4eb1f682d6..78e3bb2c329 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -64,7 +64,7 @@ object CostCatalogKey { case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) extends ServiceRegistryMessage { override def serviceName: String = "GcpCostCatalogService" } -case class GcpCostLookupResponse(calculatedCost: ErrorOr[BigDecimal]) +case class GcpCostLookupResponse(vmInfo: InstantiatedVmInfo, calculatedCost: ErrorOr[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) case class ExpiringGcpCostCatalog(catalog: Map[CostCatalogKey, CostCatalogValue], fetchTime: Instant) @@ -92,8 +92,6 @@ object GcpCostCatalogService { val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice val costPerCorePerHour: BigDecimal = costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal - - println(s"Calculated ${coreCount} cpu cores to cost ${costPerCorePerHour * coreCount} per hour") val result = costPerCorePerHour * coreCount result.validNel } else { @@ -108,8 +106,6 @@ object GcpCostCatalogService { val costPerUnit: Money = pricingInfo.getPricingExpression.getTieredRates(0).getUnitPrice val costPerGbHour: BigDecimal = costPerUnit.getUnits + (costPerUnit.getNanos * 10e-9) // Same as above, but as a big decimal - - println(s"Calculated ${ramGbCount} GB of ram to cost ${ramGbCount * costPerGbHour} per hour") val result = costPerGbHour * ramGbCount result.validNel } else { @@ -206,31 +202,30 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service } } - def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = { - val cpuPricePerHour: ErrorOr[BigDecimal] = for { + // TODO consider caching this, answers won't change until we reload the SKUs + def calculateVmCostPerHour(instantiatedVmInfo: InstantiatedVmInfo): ErrorOr[BigDecimal] = + for { cpuSku <- lookUpSku(instantiatedVmInfo, Cpu) coreCount <- MachineType.extractCoreCountFromMachineTypeString(instantiatedVmInfo.machineType) - pricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount) - } yield pricePerHour - - val ramPricePerHour: ErrorOr[BigDecimal] = for { + cpuPricePerHour <- GcpCostCatalogService.calculateCpuPricePerHour(cpuSku, coreCount) ramSku <- lookUpSku(instantiatedVmInfo, Ram) ramMbCount <- MachineType.extractRamMbFromMachineTypeString(instantiatedVmInfo.machineType) - ramGbCount = ramMbCount / 1024d - pricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount) - } yield pricePerHour - - for { - cpuPrice <- cpuPricePerHour - ramPrice <- ramPricePerHour - } yield cpuPrice + ramPrice - } + ramGbCount = ramMbCount / 1024d // need sub-integer resolution + ramPricePerHour <- GcpCostCatalogService.calculateRamPricePerHour(ramSku, ramGbCount) + totalCost = cpuPricePerHour + ramPricePerHour + _ = logger.info( + s"Calculated vmCostPerHour of ${totalCost} " + + s"(CPU ${coreCount} x ${cpuPricePerHour} [${cpuSku.getDescription}], " + + s"RAM ${ramGbCount} x ${ramPricePerHour} [${ramSku.getDescription}]) " + + s"for ${instantiatedVmInfo}" + ) + } yield totalCost def serviceRegistryActor: ActorRef = serviceRegistry override def receive: Receive = { case GcpCostLookupRequest(vmInfo, replyTo) => val calculatedCost = calculateVmCostPerHour(vmInfo) - val response = GcpCostLookupResponse(calculatedCost) + val response = GcpCostLookupResponse(vmInfo, calculatedCost) replyTo ! response case ShutdownCommand => googleClient.foreach(client => client.shutdownNow()) diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala index 7d9775688f4..4a1d780685f 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/GetRequestHandler.scala @@ -9,7 +9,13 @@ import common.validation.Validation._ import cromwell.backend.google.pipelines.common.action.ActionLabels._ import cromwell.backend.google.pipelines.common.api.PipelinesApiRequestManager._ import cromwell.backend.google.pipelines.common.api.RunStatus -import cromwell.backend.google.pipelines.common.api.RunStatus.{AwaitingCloudQuota, Initializing, Running, Success, UnsuccessfulRunStatus} +import cromwell.backend.google.pipelines.common.api.RunStatus.{ + AwaitingCloudQuota, + Initializing, + Running, + Success, + UnsuccessfulRunStatus +} import cromwell.backend.google.pipelines.common.errors.isQuotaMessage import cromwell.backend.google.pipelines.v2beta.PipelinesConversions._ import cromwell.backend.google.pipelines.v2beta.api.Deserialization._ @@ -24,7 +30,7 @@ import org.apache.commons.lang3.exception.ExceptionUtils import scala.jdk.CollectionConverters._ import scala.concurrent.{ExecutionContext, Future} import scala.language.postfixOps -import scala.util.{Failure, Try, Success => TrySuccess} +import scala.util.{Failure, Success => TrySuccess, Try} trait GetRequestHandler { this: RequestHandler => // the Genomics batch endpoint doesn't seem to be able to handle get requests on V2 operations at the moment @@ -111,7 +117,6 @@ trait GetRequestHandler { this: RequestHandler => val instantiatedVmInfo: Option[InstantiatedVmInfo] = (region, machineType) match { case (Some(instantiatedRegion), Some(instantiatedMachineType)) => - println(s"Machine Type: ${instantiatedMachineType}") Option(InstantiatedVmInfo(instantiatedRegion, instantiatedMachineType, preemptible)) case _ => Option.empty } From 6efa7df3900cb59c08dc7d9121952ffe7a6b8c37 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 13:57:29 -0400 Subject: [PATCH 33/39] More informative collision checking --- .../cromwell/services/cost/GcpCostCatalogService.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 78e3bb2c329..253cb323e94 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -179,9 +179,11 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service // We expect that every cost catalog key is unique, but changes to the SKUs returned by Google may // break this assumption. Check and log an error if we find collisions. - val collisions = keys.filter(acc.contains) + val collisions = keys.flatMap(acc.get(_).toList).map(_.catalogObject.getDescription) if (collisions.nonEmpty) - logger.error(s"Found SKU key collision for ${sku.getDescription}") + logger.error( + s"Found SKU key collision when adding ${sku.getDescription}, collides with ${collisions.mkString(", ")}" + ) acc ++ keys.map(k => (k, CostCatalogValue(sku))) } From 2f51c508caec8f8c7356c588cebd276177f8a792 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 15:53:18 -0400 Subject: [PATCH 34/39] Prevent future hair-tearing due to case changes --- .../services/cost/GcpCostCatalogTypes.scala | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala index 728c814f1f6..d189de43f1b 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogTypes.scala @@ -16,13 +16,13 @@ case class InstantiatedVmInfo(region: String, machineType: String, preemptible: */ sealed trait MachineType { def machineTypeName: String } -case object N1 extends MachineType { override val machineTypeName = "N1" } -case object N2 extends MachineType { override val machineTypeName = "N2" } -case object N2d extends MachineType { override val machineTypeName = "N2D" } +case object N1 extends MachineType { override val machineTypeName = "n1" } +case object N2 extends MachineType { override val machineTypeName = "n2" } +case object N2d extends MachineType { override val machineTypeName = "n2d" } object MachineType { def fromSku(sku: Sku): Option[MachineType] = { - val tokenizedDescription = sku.getDescription.split(" ") + val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") if (tokenizedDescription.contains(N1.machineTypeName)) Some(N1) else if (tokenizedDescription.contains(N2.machineTypeName)) Some(N2) else if (tokenizedDescription.contains(N2d.machineTypeName)) Some(N2d) @@ -31,7 +31,7 @@ object MachineType { // expects a string that looks something like "n1-standard-1" or "custom-1-4096" def fromGoogleMachineTypeString(machineTypeString: String): ErrorOr[MachineType] = { - val mType = machineTypeString.toLowerCase() + val mType = machineTypeString.toLowerCase if (mType.startsWith("n1-")) N1.validNel else if (mType.startsWith("n2d-")) N2d.validNel else if (mType.startsWith("n2-")) N2.validNel @@ -62,12 +62,12 @@ object MachineType { } sealed trait UsageType { def typeName: String } -case object OnDemand extends UsageType { override val typeName = "OnDemand" } -case object Preemptible extends UsageType { override val typeName = "Preemptible" } +case object OnDemand extends UsageType { override val typeName = "ondemand" } +case object Preemptible extends UsageType { override val typeName = "preemptible" } object UsageType { def fromSku(sku: Sku): Option[UsageType] = - sku.getCategory.getUsageType match { + sku.getCategory.getUsageType.toLowerCase match { case OnDemand.typeName => Some(OnDemand) case Preemptible.typeName => Some(Preemptible) case _ => Option.empty @@ -80,8 +80,8 @@ object UsageType { } sealed trait MachineCustomization { def customizationName: String } -case object Custom extends MachineCustomization { override val customizationName = "Custom" } -case object Predefined extends MachineCustomization { override val customizationName = "Predefined" } +case object Custom extends MachineCustomization { override val customizationName = "custom" } +case object Predefined extends MachineCustomization { override val customizationName = "predefined" } object MachineCustomization { def fromMachineTypeString(machineTypeString: String): MachineCustomization = @@ -95,7 +95,7 @@ object MachineCustomization { strings and predefined SKUs are only identifiable by the absence of "Custom." */ def fromSku(sku: Sku): Option[MachineCustomization] = { - val tokenizedDescription = sku.getDescription.split(" ") + val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") // ex. "N1 Predefined Instance Core running in Montreal" if (tokenizedDescription.contains(Predefined.customizationName)) Some(Predefined) @@ -107,17 +107,17 @@ object MachineCustomization { } sealed trait ResourceType { def groupName: String } -case object Cpu extends ResourceType { override val groupName = "CPU" } -case object Ram extends ResourceType { override val groupName = "RAM" } +case object Cpu extends ResourceType { override val groupName = "cpu" } +case object Ram extends ResourceType { override val groupName = "ram" } object ResourceType { def fromSku(sku: Sku): Option[ResourceType] = { - val tokenizedDescription = sku.getDescription.split(" ") - sku.getCategory.getResourceGroup match { + val tokenizedDescription = sku.getDescription.toLowerCase.split(" ") + sku.getCategory.getResourceGroup.toLowerCase match { case Cpu.groupName => Some(Cpu) case Ram.groupName => Some(Ram) - case "N1Standard" if tokenizedDescription.contains("Ram") => Some(Ram) - case "N1Standard" if tokenizedDescription.contains("Core") => Some(Cpu) + case "n1standard" if tokenizedDescription.contains("ram") => Some(Ram) + case "n1standard" if tokenizedDescription.contains("core") => Some(Cpu) case _ => Option.empty } } From fbcd64913c78d1c8bdd4330b5d141757e29b4c89 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 15:56:32 -0400 Subject: [PATCH 35/39] Condense tests --- .../cost/GcpCostCatalogServiceSpec.scala | 42 +++++++-- .../pipelines/common/CostLookupSpec.scala | 87 ------------------- 2 files changed, 33 insertions(+), 96 deletions(-) delete mode 100644 supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 86ad7a68656..087e7a89512 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -101,16 +101,40 @@ class GcpCostCatalogServiceSpec freshActor.getCatalogAge.toNanos should (be < shortDuration.toNanos) } - it should "contain an expected SKU" in { - val expectedKey = CostCatalogKey( - machineType = N2d, - usageType = Preemptible, - machineCustomization = Custom, - resourceType = Ram, - region = "europe-west9" + it should "find CPU and RAM skus for all supported machine types" in { + val lookupRows = Table( + ("machineType", "usage", "customization", "resource", "region", "exists"), + (N1, Preemptible, Predefined, Cpu, "us-west1", true), + (N1, Preemptible, Predefined, Ram, "us-west1", true), + (N1, OnDemand, Predefined, Cpu, "us-west1", true), + (N1, OnDemand, Predefined, Ram, "us-west1", true), + (N1, Preemptible, Custom, Cpu, "us-west1", false), + (N1, Preemptible, Custom, Ram, "us-west1", false), + (N1, OnDemand, Custom, Cpu, "us-west1", false), + (N1, OnDemand, Custom, Ram, "us-west1", false), + (N2, Preemptible, Predefined, Cpu, "us-west1", false), + (N2, Preemptible, Predefined, Ram, "us-west1", false), + (N2, OnDemand, Predefined, Cpu, "us-west1", false), + (N2, OnDemand, Predefined, Ram, "us-west1", false), + (N2, Preemptible, Custom, Cpu, "us-west1", true), + (N2, Preemptible, Custom, Ram, "us-west1", true), + (N2, OnDemand, Custom, Cpu, "us-west1", true), + (N2, OnDemand, Custom, Ram, "us-west1", true), + (N2d, Preemptible, Predefined, Cpu, "us-west1", false), + (N2d, Preemptible, Predefined, Ram, "us-west1", false), + (N2d, OnDemand, Predefined, Cpu, "us-west1", false), + (N2d, OnDemand, Predefined, Ram, "us-west1", false), + (N2d, Preemptible, Custom, Cpu, "us-west1", true), + (N2d, Preemptible, Custom, Ram, "us-west1", true), + (N2d, OnDemand, Custom, Cpu, "us-west1", true), + (N2d, OnDemand, Custom, Ram, "us-west1", true) ) - val foundValue = testActorRef.getSku(expectedKey) - foundValue.get.catalogObject.getDescription shouldBe "Spot Preemptible N2D AMD Custom Instance Ram running in Paris" + + forAll(lookupRows) { case (machineType, usage, customization, resource, region, exists: Boolean) => + val key = CostCatalogKey(machineType, usage, customization, resource, region) + val result = testActorRef.getSku(key) + result.nonEmpty shouldBe exists + } } it should "find the skus for a VM when appropriate" in { diff --git a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala b/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala deleted file mode 100644 index 8437a211fa7..00000000000 --- a/supportedBackends/google/pipelines/common/src/test/scala/cromwell/backend/google/pipelines/common/CostLookupSpec.scala +++ /dev/null @@ -1,87 +0,0 @@ -package cromwell.backend.google.pipelines.common - -import akka.testkit.{ImplicitSender, TestActorRef, TestProbe} -import cromwell.core.TestKitSuite -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers -import cromwell.services.cost._ -import org.scalatest.concurrent.Eventually -import org.scalatest.prop.TableDrivenPropertyChecks - -class CostLookupSpec - extends TestKitSuite - with AnyFlatSpecLike - with Matchers - with Eventually - with ImplicitSender - with TableDrivenPropertyChecks { - behavior of "CostLookup" - - def constructTestActor: GcpCostCatalogServiceTestActor = - TestActorRef( - new GcpCostCatalogServiceTestActor(GcpCostCatalogServiceSpec.config, - GcpCostCatalogServiceSpec.config, - TestProbe().ref - ) - ).underlyingActor - - val testCatalogService = constructTestActor - - it should "find a CPU sku" in { - val machineType = N2 - val usageType = OnDemand - val customization = Custom - val resourceGroup = Cpu - val region = "europe-west9" - val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) - val result = testCatalogService.getSku(key).get.catalogObject.getDescription - result shouldBe "N2 Custom Instance Core running in Paris" - } - - it should "find a RAM sku" in { - val machineType = N2 - val usageType = OnDemand - val customization = Custom - val resourceGroup = Ram - val region = "europe-west9" - val key = CostCatalogKey(machineType, usageType, customization, resourceGroup, region) - val result = testCatalogService.getSku(key).get.catalogObject.getDescription - result shouldBe "N2 Custom Instance Ram running in Paris" - } - - it should "find CPU skus for all supported machine types" in { - val lookupRows = Table( - ("machineType", "usage", "customization", "resource", "region", "exists"), - (N1, Preemptible, Predefined, Cpu, "us-west1", true), - (N1, Preemptible, Predefined, Ram, "us-west1", true), - (N1, OnDemand, Predefined, Cpu, "us-west1", true), - (N1, OnDemand, Predefined, Ram, "us-west1", true), - (N1, Preemptible, Custom, Cpu, "us-west1", false), - (N1, Preemptible, Custom, Ram, "us-west1", false), - (N1, OnDemand, Custom, Cpu, "us-west1", false), - (N1, OnDemand, Custom, Ram, "us-west1", false), - (N2, Preemptible, Predefined, Cpu, "us-west1", false), - (N2, Preemptible, Predefined, Ram, "us-west1", false), - (N2, OnDemand, Predefined, Cpu, "us-west1", false), - (N2, OnDemand, Predefined, Ram, "us-west1", false), - (N2, Preemptible, Custom, Cpu, "us-west1", true), - (N2, Preemptible, Custom, Ram, "us-west1", true), - (N2, OnDemand, Custom, Cpu, "us-west1", true), - (N2, OnDemand, Custom, Ram, "us-west1", true), - (N2d, Preemptible, Predefined, Cpu, "us-west1", false), - (N2d, Preemptible, Predefined, Ram, "us-west1", false), - (N2d, OnDemand, Predefined, Cpu, "us-west1", false), - (N2d, OnDemand, Predefined, Ram, "us-west1", false), - (N2d, Preemptible, Custom, Cpu, "us-west1", true), - (N2d, Preemptible, Custom, Ram, "us-west1", true), - (N2d, OnDemand, Custom, Cpu, "us-west1", true), - (N2d, OnDemand, Custom, Ram, "us-west1", true) - ) - - forAll(lookupRows) { case (machineType, usage, customization, resource, region, exists: Boolean) => - val key = CostCatalogKey(machineType, usage, customization, resource, region) - val result = testCatalogService.getSku(key) - result.nonEmpty shouldBe exists - } - } -} From 2c760b4471815a08d1b601511bb6e12a018aaf7b Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 16:56:11 -0400 Subject: [PATCH 36/39] Don't be concerned with GCP-specific things in PollResultMonitorActor --- .../PollResultMonitorActor.scala | 30 ++++--------------- .../actors/BatchPollResultMonitorActor.scala | 24 ++++++++++++++- .../common/PapiPollResultMonitorActor.scala | 27 +++++++++++++++-- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala index f8e7b52ec5e..c6d615cda0f 100644 --- a/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/pollmonitoring/PollResultMonitorActor.scala @@ -1,6 +1,6 @@ package cromwell.backend.standard.pollmonitoring + import akka.actor.{Actor, ActorRef} -import cats.data.Validated.{Invalid, Valid} import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.validation.{ CpuValidation, @@ -10,7 +10,7 @@ import cromwell.backend.validation.{ ValidatedRuntimeAttributes } import cromwell.core.logging.JobLogger -import cromwell.services.cost.{GcpCostLookupRequest, GcpCostLookupResponse, InstantiatedVmInfo} +import cromwell.services.cost.InstantiatedVmInfo import cromwell.services.metadata.CallMetadataKeys import cromwell.services.metrics.bard.BardEventing.BardEventRequest import cromwell.services.metrics.bard.model.TaskSummaryEvent @@ -104,7 +104,7 @@ trait PollResultMonitorActor[PollResultType] extends Actor { Option.empty private var vmStartTime: Option[OffsetDateTime] = Option.empty private var vmEndTime: Option[OffsetDateTime] = Option.empty - private var vmCostPerHour: Option[BigDecimal] = Option.empty + protected var vmCostPerHour: Option[BigDecimal] = Option.empty def processPollResult(pollStatus: PollResultType): Unit = { // Make sure jobStartTime remains the earliest event time ever seen @@ -132,14 +132,12 @@ trait PollResultMonitorActor[PollResultType] extends Actor { // We expect it to reply with an answer, which is handled in receive. // NB: Due to the nature of async code, we may send a few cost requests before we get a response back. if (vmCostPerHour.isEmpty) { - val instantiatedVmInfo = extractVmInfoFromRunState(pollStatus) - instantiatedVmInfo.foreach { vmInfo => - val request = GcpCostLookupRequest(vmInfo, self) - params.serviceRegistry ! request - } + extractVmInfoFromRunState(pollStatus).foreach(handleVmCostLookup) } } + def handleVmCostLookup(vmInfo: InstantiatedVmInfo): Unit + // When a job finishes, the bard actor needs to know about the timing in order to record metrics. // Cost related metadata should already have been handled in processPollResult. def handleAsyncJobFinish(terminalStateName: String): Unit = @@ -151,20 +149,4 @@ trait PollResultMonitorActor[PollResultType] extends Actor { vmEndTime = vmEndTime.getOrElse(OffsetDateTime.now()) ) ) - - def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = - if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. - val cost = costLookupResponse.calculatedCost match { - case Valid(c) => - params.logger.info(s"vmCostPerHour for ${costLookupResponse.vmInfo} is ${c}") - c - case Invalid(errors) => - params.logger.error( - s"Failed to calculate VM cost per hour for ${costLookupResponse.vmInfo}. ${errors.toList.mkString(", ")}" - ) - BigDecimal(-1) - } - vmCostPerHour = Option(cost) - tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) - } } diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala index a313ab49b05..0f5c00fa834 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchPollResultMonitorActor.scala @@ -1,6 +1,7 @@ package cromwell.backend.google.batch.actors import akka.actor.{ActorRef, Props} +import cats.data.Validated.{Invalid, Valid} import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.backend.google.batch.models.RunStatus import cromwell.backend.standard.pollmonitoring.{ @@ -12,7 +13,7 @@ import cromwell.backend.standard.pollmonitoring.{ } import cromwell.backend.validation.ValidatedRuntimeAttributes import cromwell.core.logging.JobLogger -import cromwell.services.cost.InstantiatedVmInfo +import cromwell.services.cost.{GcpCostLookupRequest, GcpCostLookupResponse, InstantiatedVmInfo} import cromwell.services.metadata.CallMetadataKeys import java.time.OffsetDateTime @@ -46,6 +47,27 @@ class BatchPollResultMonitorActor(pollMonitorParameters: PollMonitorParameters) case event if event.name == CallMetadataKeys.VmEndTime => event.offsetDateTime } + override def handleVmCostLookup(vmInfo: InstantiatedVmInfo) = { + val request = GcpCostLookupRequest(vmInfo, self) + params.serviceRegistry ! request + } + + def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = + if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. + val cost = costLookupResponse.calculatedCost match { + case Valid(c) => + params.logger.info(s"vmCostPerHour for ${costLookupResponse.vmInfo} is ${c}") + c + case Invalid(errors) => + params.logger.error( + s"Failed to calculate VM cost per hour for ${costLookupResponse.vmInfo}. ${errors.toList.mkString(", ")}" + ) + BigDecimal(-1) + } + vmCostPerHour = Option(cost) + tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) + } + override def receive: Receive = { case message: PollResultMessage => message match { diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala index 1206ac32e1d..3e9d55b9e1b 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PapiPollResultMonitorActor.scala @@ -1,12 +1,13 @@ package cromwell.backend.google.pipelines.common import akka.actor.{ActorRef, Props} +import cats.data.Validated.{Invalid, Valid} import cromwell.backend.google.pipelines.common.api.RunStatus import cromwell.backend.standard.pollmonitoring._ import cromwell.backend.validation.ValidatedRuntimeAttributes import cromwell.backend.{BackendJobDescriptor, BackendWorkflowDescriptor, Platform} import cromwell.core.logging.JobLogger -import cromwell.services.cost.{GcpCostLookupResponse, InstantiatedVmInfo} +import cromwell.services.cost.{GcpCostLookupRequest, GcpCostLookupResponse, InstantiatedVmInfo} import cromwell.services.metadata.CallMetadataKeys import java.time.OffsetDateTime @@ -29,6 +30,7 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll override def extractEarliestEventTimeFromRunState(pollStatus: RunStatus): Option[OffsetDateTime] = pollStatus.eventList.minByOption(_.offsetDateTime).map(e => e.offsetDateTime) + override def extractStartTimeFromRunState(pollStatus: RunStatus): Option[OffsetDateTime] = pollStatus.eventList.collectFirst { case event if event.name == CallMetadataKeys.VmStartTime => event.offsetDateTime @@ -42,8 +44,29 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll override def extractVmInfoFromRunState(pollStatus: RunStatus): Option[InstantiatedVmInfo] = pollStatus.instantiatedVmInfo + override def handleVmCostLookup(vmInfo: InstantiatedVmInfo) = { + val request = GcpCostLookupRequest(vmInfo, self) + params.serviceRegistry ! request + } + override def params: PollMonitorParameters = parameters + def handleCostResponse(costLookupResponse: GcpCostLookupResponse): Unit = + if (vmCostPerHour.isEmpty) { // Optimization to avoid processing responses after we've received a valid one. + val cost = costLookupResponse.calculatedCost match { + case Valid(c) => + params.logger.info(s"vmCostPerHour for ${costLookupResponse.vmInfo} is ${c}") + c + case Invalid(errors) => + params.logger.error( + s"Failed to calculate VM cost per hour for ${costLookupResponse.vmInfo}. ${errors.toList.mkString(", ")}" + ) + BigDecimal(-1) + } + vmCostPerHour = Option(cost) + tellMetadata(Map(CallMetadataKeys.VmCostPerHour -> cost)) + } + override def receive: Receive = { case costResponse: GcpCostLookupResponse => handleCostResponse(costResponse) case message: PollResultMessage => @@ -53,13 +76,11 @@ class PapiPollResultMonitorActor(parameters: PollMonitorParameters) extends Poll params.logger.error( s"Programmer error: Received Poll Result of unknown type. Expected ${RunStatus.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) - case AsyncJobHasFinished(pollResult: RunStatus) => handleAsyncJobFinish(pollResult.getClass.getSimpleName) case AsyncJobHasFinished(result) => params.logger.error( s"Programmer error: Received Poll Result of unknown type. Expected ${AsyncJobHasFinished.getClass.getSimpleName} but got ${result.getClass.getSimpleName}." ) - } case unexpected => params.logger.error( From e3086f925814c1706189d883cfbaf141b37c5bbe Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Mon, 30 Sep 2024 16:58:28 -0400 Subject: [PATCH 37/39] Scalafmt --- .../pipelines/v2beta/api/request/ErrorReporter.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala index 1c9be388c7d..bda6981084b 100644 --- a/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala +++ b/supportedBackends/google/pipelines/v2beta/src/main/scala/cromwell/backend/google/pipelines/v2beta/api/request/ErrorReporter.scala @@ -86,7 +86,15 @@ class ErrorReporter(machineType: Option[String], // Reverse the list because the first failure (likely the most relevant, will appear last otherwise) val unexpectedExitEvents: List[String] = unexpectedExitStatusErrorStrings(events, actions).reverse - builder(status, None, failed.toList ++ unexpectedExitEvents, executionEvents, machineType, zone, instanceName, Option.empty) + builder(status, + None, + failed.toList ++ unexpectedExitEvents, + executionEvents, + machineType, + zone, + instanceName, + Option.empty + ) } // There's maybe one FailedEvent per operation with a summary error message From e93b0784496e8095dc0abc75a761096a27c533e8 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Tue, 1 Oct 2024 10:53:28 -0400 Subject: [PATCH 38/39] Clearer log message --- .../scala/cromwell/services/cost/GcpCostCatalogService.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 253cb323e94..7753d186107 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -217,8 +217,8 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service totalCost = cpuPricePerHour + ramPricePerHour _ = logger.info( s"Calculated vmCostPerHour of ${totalCost} " + - s"(CPU ${coreCount} x ${cpuPricePerHour} [${cpuSku.getDescription}], " + - s"RAM ${ramGbCount} x ${ramPricePerHour} [${ramSku.getDescription}]) " + + s"(CPU ${cpuPricePerHour} for ${coreCount} cores [${cpuSku.getDescription}], " + + s"RAM ${ramPricePerHour} for ${ramGbCount} Gb [${ramSku.getDescription}]) " + s"for ${instantiatedVmInfo}" ) } yield totalCost From de345c2c3d7f5375528b50fa75ea06e0e8d77ee3 Mon Sep 17 00:00:00 2001 From: Janet Gainer-Dewar Date: Wed, 2 Oct 2024 15:43:50 -0400 Subject: [PATCH 39/39] Enable disablement of cost catalog, only keep google client open as needed --- core/src/main/resources/reference.conf | 2 + .../services/cost/CostCatalogConfig.scala | 5 ++- .../services/cost/GcpCostCatalogService.scala | 41 +++++++++++-------- .../cost/GcpCostCatalogServiceSpec.scala | 28 ++++++++----- 4 files changed, 45 insertions(+), 31 deletions(-) diff --git a/core/src/main/resources/reference.conf b/core/src/main/resources/reference.conf index e508a8f9424..9da59f16b7b 100644 --- a/core/src/main/resources/reference.conf +++ b/core/src/main/resources/reference.conf @@ -607,9 +607,11 @@ services { } } + // When enabled, Cromwell will store vmCostPerHour metadata for GCP tasks GcpCostCatalogService { class = "cromwell.services.cost.GcpCostCatalogService" config { + enabled = false catalogExpirySeconds = 86400 } } diff --git a/services/src/main/scala/cromwell/services/cost/CostCatalogConfig.scala b/services/src/main/scala/cromwell/services/cost/CostCatalogConfig.scala index 47b78245920..ae342ea84e5 100644 --- a/services/src/main/scala/cromwell/services/cost/CostCatalogConfig.scala +++ b/services/src/main/scala/cromwell/services/cost/CostCatalogConfig.scala @@ -3,8 +3,9 @@ package cromwell.services.cost import com.typesafe.config.Config import net.ceedubs.ficus.Ficus._ -final case class CostCatalogConfig(catalogExpirySeconds: Int) +final case class CostCatalogConfig(enabled: Boolean, catalogExpirySeconds: Int) object CostCatalogConfig { - def apply(config: Config): CostCatalogConfig = CostCatalogConfig(config.as[Int]("catalogExpirySeconds")) + def apply(config: Config): CostCatalogConfig = + CostCatalogConfig(config.as[Boolean]("enabled"), config.as[Int]("catalogExpirySeconds")) } diff --git a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala index 7753d186107..4c715cd3628 100644 --- a/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala +++ b/services/src/main/scala/cromwell/services/cost/GcpCostCatalogService.scala @@ -16,6 +16,7 @@ import cromwell.util.GracefulShutdownHelper.ShutdownCommand import java.time.{Duration, Instant} import scala.jdk.CollectionConverters.IterableHasAsScala import java.time.temporal.ChronoUnit.SECONDS +import scala.util.Using case class CostCatalogKey(machineType: MachineType, usageType: UsageType, @@ -67,6 +68,9 @@ case class GcpCostLookupRequest(vmInfo: InstantiatedVmInfo, replyTo: ActorRef) e case class GcpCostLookupResponse(vmInfo: InstantiatedVmInfo, calculatedCost: ErrorOr[BigDecimal]) case class CostCatalogValue(catalogObject: Sku) case class ExpiringGcpCostCatalog(catalog: Map[CostCatalogKey, CostCatalogValue], fetchTime: Instant) +object ExpiringGcpCostCatalog { + def empty: ExpiringGcpCostCatalog = ExpiringGcpCostCatalog(Map.empty, Instant.MIN) +} object GcpCostCatalogService { // Can be gleaned by using googleClient.listServices @@ -122,37 +126,38 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service extends Actor with LazyLogging { - private val maxCatalogLifetime: Duration = - Duration.of(CostCatalogConfig(serviceConfig).catalogExpirySeconds.longValue, SECONDS) + private val costCatalogConfig = CostCatalogConfig(serviceConfig) - private var googleClient: Option[CloudCatalogClient] = Option.empty + private val maxCatalogLifetime: Duration = + Duration.of(costCatalogConfig.catalogExpirySeconds.longValue, SECONDS) // Cached catalog. Refreshed lazily when older than maxCatalogLifetime. - private var costCatalog: Option[ExpiringGcpCostCatalog] = Option.empty + private var costCatalog: ExpiringGcpCostCatalog = ExpiringGcpCostCatalog.empty /** * Returns the SKU for a given key, if it exists */ def getSku(key: CostCatalogKey): Option[CostCatalogValue] = getOrFetchCachedCatalog().get(key) - protected def fetchNewCatalog: Iterable[Sku] = { - if (googleClient.isEmpty) { - // We use option rather than lazy here so that the client isn't created when it is told to shutdown (see receive override) - googleClient = Some(CloudCatalogClient.create) + protected def fetchSkuIterable(googleClient: CloudCatalogClient): Iterable[Sku] = + makeInitialWebRequest(googleClient).iterateAll().asScala + + private def fetchNewCatalog: ExpiringGcpCostCatalog = + Using.resource(CloudCatalogClient.create) { googleClient => + val skus = fetchSkuIterable(googleClient) + ExpiringGcpCostCatalog(processCostCatalog(skus), Instant.now()) } - makeInitialWebRequest(googleClient.get).iterateAll().asScala - } - def getCatalogAge: Duration = - Duration.between(costCatalog.map(c => c.fetchTime).getOrElse(Instant.ofEpochMilli(0)), Instant.now()) - private def isCurrentCatalogExpired: Boolean = getCatalogAge.toNanos > maxCatalogLifetime.toNanos + def getCatalogAge: Duration = Duration.between(costCatalog.fetchTime, Instant.now()) + + private def isCurrentCatalogExpired: Boolean = getCatalogAge.toSeconds > maxCatalogLifetime.toSeconds private def getOrFetchCachedCatalog(): Map[CostCatalogKey, CostCatalogValue] = { - if (costCatalog.isEmpty || isCurrentCatalogExpired) { + if (isCurrentCatalogExpired) { logger.info("Fetching a new GCP public cost catalog.") - costCatalog = Some(ExpiringGcpCostCatalog(processCostCatalog(fetchNewCatalog), Instant.now())) + costCatalog = fetchNewCatalog } - costCatalog.map(expiringCatalog => expiringCatalog.catalog).getOrElse(Map.empty) + costCatalog.catalog } /** @@ -225,12 +230,12 @@ class GcpCostCatalogService(serviceConfig: Config, globalConfig: Config, service def serviceRegistryActor: ActorRef = serviceRegistry override def receive: Receive = { - case GcpCostLookupRequest(vmInfo, replyTo) => + case GcpCostLookupRequest(vmInfo, replyTo) if costCatalogConfig.enabled => val calculatedCost = calculateVmCostPerHour(vmInfo) val response = GcpCostLookupResponse(vmInfo, calculatedCost) replyTo ! response + case GcpCostLookupRequest(_, _) => // do nothing if we're disabled case ShutdownCommand => - googleClient.foreach(client => client.shutdownNow()) context stop self case other => logger.error( diff --git a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala index 087e7a89512..9973aaa4082 100644 --- a/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala +++ b/services/src/test/scala/cromwell/services/cost/GcpCostCatalogServiceSpec.scala @@ -2,7 +2,7 @@ package cromwell.services.cost import akka.actor.ActorRef import akka.testkit.{ImplicitSender, TestActorRef, TestProbe} -import com.google.cloud.billing.v1.Sku +import com.google.cloud.billing.v1.{CloudCatalogClient, Sku} import com.typesafe.config.{Config, ConfigFactory} import cromwell.core.TestKitSuite import cromwell.util.GracefulShutdownHelper.ShutdownCommand @@ -14,10 +14,13 @@ import org.scalatest.prop.TableDrivenPropertyChecks import java.time.Duration import java.io.{File, FileInputStream, FileOutputStream} import scala.collection.mutable.ListBuffer +import scala.util.Using object GcpCostCatalogServiceSpec { val catalogExpirySeconds: Long = 1 // Short duration so we can do a cache expiry test - val config: Config = ConfigFactory.parseString(s"catalogExpirySeconds = $catalogExpirySeconds") + val config: Config = ConfigFactory.parseString( + s"catalogExpirySeconds = $catalogExpirySeconds, enabled = true" + ) val mockTestDataFilePath: String = "services/src/test/scala/cromwell/services/cost/serializedSkuList.testData" } class GcpCostCatalogServiceTestActor(serviceConfig: Config, globalConfig: Config, serviceRegistry: ActorRef) @@ -35,18 +38,21 @@ class GcpCostCatalogServiceTestActor(serviceConfig: Config, globalConfig: Config fis.close() skuList.toSeq } - def saveMockData(): Unit = { - val fetchedData = super.fetchNewCatalog - val fos = new FileOutputStream(new File(GcpCostCatalogServiceSpec.mockTestDataFilePath)) - fetchedData.foreach { sku => - sku.writeDelimitedTo(fos) + def saveMockData(): Unit = + Using.resources(CloudCatalogClient.create, + new FileOutputStream(new File(GcpCostCatalogServiceSpec.mockTestDataFilePath)) + ) { (googleClient, fos) => + val skus = super.fetchSkuIterable(googleClient) + skus.foreach { sku => + sku.writeDelimitedTo(fos) + } } - fos.close() - } + override def receive: Receive = { case ShutdownCommand => context stop self } - override def fetchNewCatalog: Iterable[Sku] = loadMockData + + override def fetchSkuIterable(client: CloudCatalogClient): Iterable[Sku] = loadMockData } class GcpCostCatalogServiceSpec @@ -90,7 +96,7 @@ class GcpCostCatalogServiceSpec freshActor.getCatalogAge.toNanos should (be < shortDuration.toNanos) // Simulate the cached catalog living longer than its lifetime - Thread.sleep(shortDuration.toMillis) + Thread.sleep(shortDuration.plus(shortDuration).toMillis) // Confirm that the catalog is old freshActor.getCatalogAge.toNanos should (be > shortDuration.toNanos)