Skip to content

Commit

Permalink
Merge pull request #6406 from fitzcao/issues_6400
Browse files Browse the repository at this point in the history
feat: task失败可以立即重试 #6400
  • Loading branch information
irwinsun authored Mar 29, 2022
2 parents 6cbb03c + 29ce8d7 commit 705d562
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,20 @@ object ModelUtils {
val jobStatus = BuildStatus.parse(c.status)
c.canRetry = jobStatus.isFailure() || jobStatus.isCancel()
}

val failElements = mutableListOf<Element>()
c.elements.forEach { e ->
refreshElement(element = e, failElements = failElements)
if (c.canRetry == true) {
refreshContainer(c)
}
}
}
}

private fun refreshContainer(container: Container) {
val failElements = mutableListOf<Element>()
container.elements.forEach { e ->
refreshElement(element = e, failElements = failElements)
}
}

private fun refreshElement(element: Element, failElements: MutableList<Element>) {

val additionalOptions = element.additionalOptions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ object ProcessMessageCode {
const val ERROR_TIMEOUT_IN_BUILD_QUEUE = "2101904" // 排队超时,取消运行! [{0}]
const val ERROR_PARUS_PIEPLINE_IS_RUNNINT = "2101905" // 暂停的流水线已开始运行
const val ERROR_ELEMENT_TOO_LONG = "2101906" // {0} element大小越界
const val ERROR_JOB_RUNNING = "2101907" // job非完成态,不能进行重试

const val ERROR_NO_BUILD_EXISTS_BY_ID = "2101100" // 流水线构建[{0}]不存在
const val ERROR_NO_PIPELINE_EXISTS_BY_ID = "2101101" // 流水线[{0}]不存在
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ class PipelineInfoDao {
fun listByProject(dslContext: DSLContext, projectCode: String): Result<Record2<String, Long>> {
return with(T_PIPELINE_INFO) {
dslContext.select(PIPELINE_ID.`as`("pipelineId"), ID.`as`("id")).from(this)
.where(PROJECT_ID.eq(projectCode)).fetch()
.where(PROJECT_ID.eq(projectCode).and(DELETE.eq(false))).fetch()
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,8 @@ class PipelineBuildDetailService @Autowired constructor(
// 判断需要刷新状态,目前只会改变canRetry & canSkip 状态
if (refreshStatus) {
// #4245 仅当在有限时间内并已经失败或者取消(终态)的构建上可尝试重试或跳过
if (checkPassDays(buildInfo.startTime) &&
(buildInfo.status.isFailure() || buildInfo.status.isCancel())
) {
// #6400 无需流水线是终态就可以进行task重试
if (checkPassDays(buildInfo.startTime)) {
ModelUtils.refreshCanRetry(model)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ class PipelineBuildFacadeService(
private val paramFacadeService: ParamFacadeService,
private val buildLogPrinter: BuildLogPrinter,
private val buildParamCompatibilityTransformer: BuildParametersCompatibilityTransformer,
private val pipelineRedisService: PipelineRedisService
private val pipelineRedisService: PipelineRedisService,
private val pipelineRetryFacadeService: PipelineRetryFacadeService
) {

@Value("\${pipeline.build.cancel.intervalLimitTime:60}")
Expand Down Expand Up @@ -329,17 +330,30 @@ class PipelineBuildFacadeService(
params = arrayOf(buildId)
)

if (buildInfo.pipelineId != pipelineId) {
throw ErrorCodeException(errorCode = ProcessMessageCode.ERROR_PIPLEINE_INPUT)
}

// 运行中的task重试走全新的处理逻辑
if (!buildInfo.status.isFinish()) {
if (pipelineRetryFacadeService.runningBuildTaskRetry(
userId = userId,
projectId = projectId,
pipelineId = pipelineId,
buildId = buildId,
taskId = taskId
)) {
return buildId
}
}

if (!buildInfo.status.isFinish() && buildInfo.status != BuildStatus.STAGE_SUCCESS) {
throw ErrorCodeException(
errorCode = ProcessMessageCode.ERROR_DUPLICATE_BUILD_RETRY_ACT,
defaultMessage = "重试已经启动,忽略重复的请求"
)
}

if (buildInfo.pipelineId != pipelineId) {
throw ErrorCodeException(errorCode = ProcessMessageCode.ERROR_PIPLEINE_INPUT)
}

val readyToBuildPipelineInfo =
pipelineRepositoryService.getPipelineInfo(projectId, pipelineId, channelCode)
?: throw ErrorCodeException(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*
* Tencent is pleased to support the open source community by making BK-CI 蓝鲸持续集成平台 available.
*
* Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
*
* BK-CI 蓝鲸持续集成平台 is licensed under the MIT license.
*
* A copy of the MIT License is included in this file.
*
*
* Terms of the MIT License:
* ---------------------------------------------------
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of
* the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
* LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
* NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package com.tencent.devops.process.service.builds

import com.tencent.devops.common.api.exception.ErrorCodeException
import com.tencent.devops.common.api.exception.ParamBlankException
import com.tencent.devops.common.event.dispatcher.pipeline.PipelineEventDispatcher
import com.tencent.devops.common.event.enums.ActionType
import com.tencent.devops.common.log.utils.BuildLogPrinter
import com.tencent.devops.common.pipeline.enums.BuildStatus
import com.tencent.devops.common.service.utils.MessageCodeUtil
import com.tencent.devops.process.constant.ProcessMessageCode
import com.tencent.devops.process.engine.common.VMUtils
import com.tencent.devops.process.engine.pojo.PipelineBuildContainer
import com.tencent.devops.process.engine.pojo.PipelineBuildTask
import com.tencent.devops.process.engine.pojo.event.PipelineBuildContainerEvent
import com.tencent.devops.process.engine.service.PipelineContainerService
import com.tencent.devops.process.engine.service.PipelineTaskService
import com.tencent.devops.process.engine.service.detail.TaskBuildDetailService
import org.jooq.DSLContext
import org.slf4j.LoggerFactory
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.stereotype.Service
import javax.ws.rs.core.Response

@Service
class PipelineRetryFacadeService @Autowired constructor(
val dslContext: DSLContext,
val pipelineEventDispatcher: PipelineEventDispatcher,
val pipelineTaskService: PipelineTaskService,
val pipelineContainerService: PipelineContainerService,
val taskBuildDetailService: TaskBuildDetailService,
private val buildLogPrinter: BuildLogPrinter
) {
fun runningBuildTaskRetry(
userId: String,
projectId: String,
pipelineId: String,
buildId: String,
taskId: String? = null
): Boolean {
logger.info("runningBuildTaskRetry $userId|$projectId|$pipelineId|$buildId|$taskId}")
if (taskId.isNullOrEmpty()) {
// 抛异常
throw ParamBlankException("Invalid taskId")
}
val taskInfo = pipelineTaskService.getBuildTask(projectId, buildId, taskId)
if (taskInfo == null) {
logger.warn("runningBuild task retry task empty $projectId|$pipelineId|$buildId|$taskId")
// 抛异常
throw ErrorCodeException(
statusCode = Response.Status.NOT_FOUND.statusCode,
errorCode = ProcessMessageCode.ERROR_NO_BUILD_EXISTS_BY_ID,
defaultMessage = "构建任务${buildId}不存在",
params = arrayOf(buildId)
)
}
// 判断待重试task所属job是否为终态。 非终态判断是否是关机未完成。其他task直接报错
// 此处请求可能早于关机到达。 若还未关机就点击重试,提示用户稍后再试
val containerInfo = pipelineContainerService.getContainer(
projectId = projectId,
buildId = buildId,
containerId = taskInfo.containerId,
stageId = null
)

// 校验当前job的关机事件是否有完成
checkStopTask(projectId, buildId, containerInfo!!)
// 刷新当前job的开关机以及job状态, container状态, detail数据
refreshTaskAndJob(userId, projectId, buildId, taskId, containerInfo)
// 发送container Refreash事件,重新开始task对应的调度
sendContainerEvent(taskInfo, userId)
buildLogPrinter.addYellowLine(
buildId = buildId,
message = "$userId retry fail task ${taskInfo.taskName} when running",
tag = taskId,
jobId = containerInfo.containerId,
executeCount = taskInfo.executeCount ?: 1
)
return true
}

// 发送container刷新事件,将重置后的job重新丢入引擎中调度
private fun sendContainerEvent(taskInfo: PipelineBuildTask, userId: String) {
pipelineEventDispatcher.dispatch(
PipelineBuildContainerEvent(
source = "runningBuildRetry${taskInfo.buildId}|${taskInfo.taskId}",
containerId = taskInfo.containerId,
containerHashId = taskInfo.containerHashId,
stageId = taskInfo.stageId,
pipelineId = taskInfo.pipelineId,
buildId = taskInfo.buildId,
userId = userId,
projectId = taskInfo.projectId,
actionType = ActionType.REFRESH,
containerType = ""
)
)
}

// 若当前job状态还未完成,报错让用户稍后再重试。否则会因为开机比关机早到引发引擎调度问题
private fun checkStopTask(
projectId: String,
buildId: String,
containerInfo: PipelineBuildContainer
) {
if (!containerInfo.status.isFinish()) {
logger.warn("retry runningJob: $projectId|$buildId${containerInfo.containerId} is running")
throw ErrorCodeException(
errorCode = ProcessMessageCode.ERROR_JOB_RUNNING,
defaultMessage = MessageCodeUtil.getCodeLanMessage(ProcessMessageCode.ERROR_JOB_RUNNING)
)
}
}

// 刷新要重试task所属job的开关机状态,task状态。 container状态,detail相关信息
private fun refreshTaskAndJob(
userId: String,
projectId: String,
buildId: String,
taskId: String,
containerInfo: PipelineBuildContainer
) {
val taskRecords = pipelineTaskService.listContainerBuildTasks(projectId, buildId, containerInfo.containerId)
// 待重试task所属job对应的startVm,stopVm,endTask,对应task状态重置为Queue
val startAndEndTask = mutableListOf<PipelineBuildTask>()
taskRecords.forEach { task ->
if (task.taskId == taskId) {
startAndEndTask.add(task)
} else if (task.taskName.startsWith(VMUtils.getCleanVmLabel()) &&
task.taskId.startsWith(VMUtils.getStopVmLabel())) {
startAndEndTask.add(task)
} else if (task.taskName.startsWith(VMUtils.getPrepareVmLabel()) &&
task.taskId.startsWith(VMUtils.getStartVmLabel())) {
startAndEndTask.add(task)
} else if (task.taskName.startsWith(VMUtils.getWaitLabel()) &&
task.taskId.startsWith(VMUtils.getEndLabel())) {
startAndEndTask.add(task)
} else if (task.status == BuildStatus.UNEXEC) {
startAndEndTask.add(task)
}
}
startAndEndTask.forEach {
pipelineTaskService.updateTaskStatus(task = it, userId = userId, buildStatus = BuildStatus.QUEUE)
}
// 修改容器状态位运行
pipelineContainerService.updateContainerStatus(
projectId = containerInfo.projectId,
buildId = containerInfo.buildId,
stageId = containerInfo.stageId,
containerId = containerInfo.containerId,
buildStatus = BuildStatus.QUEUE
)
}

companion object {
private val logger = LoggerFactory.getLogger(PipelineRetryFacadeService::class.java)
}
}
1 change: 1 addition & 0 deletions support-files/sql/5001_ci_project-init_dml_mysql.sql
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESS
REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESSAGE_DETAIL_ZH_CN`, `MESSAGE_DETAIL_EN`) VALUES ('b6d9420993954802884f474828cc0db1', '2101904', '01', '流水线:排队超时,取消运行! [{0}]', 'Pipeline: Queue timed out, canceled running! [{0}]');
REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESSAGE_DETAIL_ZH_CN`, `MESSAGE_DETAIL_EN`) VALUES ('7d8f2d20b939448f91550967d5ff63b6', '2101905', '01', '流水线:暂停的流水线已开始运行', 'Pipeline: Pipeline: puase pipeline running now');
REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESSAGE_DETAIL_ZH_CN`, `MESSAGE_DETAIL_EN`) VALUES ('4e6d9a9c3c874c16b2fcff32b6a731fd', '2101906', '01', '流水线:{0} element大小越界', 'Pipeline: Pipeline: {0} element size too long');
REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESSAGE_DETAIL_ZH_CN`, `MESSAGE_DETAIL_EN`) VALUES ('894102da867c431baf9fca6bfd6bc640', '2101907', '01', '流水线: job非完成态,不能进行重试', 'Pipeline: job is running, forbid retry');


REPLACE INTO `T_MESSAGE_CODE_DETAIL` (`ID`, `MESSAGE_CODE`, `MODULE_CODE`, `MESSAGE_DETAIL_ZH_CN`, `MESSAGE_DETAIL_EN`) VALUES ('a1d8bb8bc3e54b0daaf6d22daef17650', '2101908', '01', '流水线:质量红线(准入)配置有误:Fail to find quality gate intercept element', 'Pipeline: The quality red line (access) is incorrectly configured: Fail to find quality gate intercept element');
Expand Down

0 comments on commit 705d562

Please sign in to comment.