|
| 1 | +import { chunk } from 'lodash' |
| 2 | +import { raw } from 'objection' |
| 3 | + |
| 4 | +import { DEFAULT_JOB_OPTIONS } from '@/helpers/default-job-configuration' |
| 5 | +import logger from '@/helpers/logger' |
| 6 | +import Execution from '@/models/execution' |
| 7 | +import ExecutionStep from '@/models/execution-step' |
| 8 | +import { enqueueActionJob, getActionJob } from '@/queues/action' |
| 9 | + |
| 10 | +import type { MutationResolvers } from '../__generated__/types.generated' |
| 11 | + |
/** Maximum number of failed iterations that are retried concurrently in one batch. */
const CHUNK_SIZE = 100

/**
 * Fetches the most recent attempt of every (step, iteration) pair belonging to
 * an execution and returns only those attempts that did not succeed.
 *
 * The `latest_attempts` CTE keeps one row per `step_id` + `metadata->>'iteration'`
 * combination; because rows are ordered by `created_at` descending within each
 * combination, the row kept is the newest attempt.
 *
 * @param executionId - ID of the execution whose iterations are inspected.
 * @returns Execution steps whose latest attempt has a status other than `success`.
 */
async function getAllFailedIterations(executionId: string) {
  // A new raw expression is built for each use inside the CTE.
  const iterationColumn = () => raw("metadata->>'iteration'")

  const selectedColumns = [
    'id',
    'execution_id',
    'step_id',
    'status',
    'job_id',
    'app_key',
    'key',
    'metadata',
  ]

  return await ExecutionStep.query()
    .with('latest_attempts', (latestAttempts) => {
      latestAttempts
        .distinctOn([raw('step_id'), iterationColumn()])
        .select('*')
        .from('execution_steps')
        .where('execution_id', executionId)
        .orderBy('step_id')
        .orderBy(iterationColumn())
        .orderBy('created_at', 'desc')
    })
    .select(...selectedColumns)
    .from('latest_attempts')
    .where('status', '!=', 'success')
    // NOTE(review): presumably disables the model's soft-delete filter for the
    // outer query — confirm against the ExecutionStep query builder.
    .withSoftDeleted()
}
| 42 | + |
/**
 * Retries a single failed iteration by replacing its failed queue job with a
 * freshly enqueued copy.
 *
 * Steps:
 *  1. Look up the old job; if it is gone, clear the step's job ID and fail.
 *  2. Verify that the old job is in the `failed` state.
 *  3. Remove the old job, enqueue a new one with the same name and data,
 *     reset the execution status and store the new job ID on the step.
 *
 * @param executionStep - Latest (failed) attempt of the iteration to retry.
 * @throws Error when the job no longer exists, is not in a failed state, or
 *   when any queue/database operation fails. Errors are logged before being
 *   rethrown.
 */
async function retryFailedIteration(
  executionStep: ExecutionStep,
): Promise<void> {
  const {
    id: executionStepId,
    executionId,
    jobId,
    appKey,
    metadata,
  } = executionStep

  const defaultLoggerMetadata = {
    executionId,
    executionStepId,
    iteration: metadata.iteration,
  }

  const job = await getActionJob(jobId)
  if (!job) {
    // if job cannot be found anymore, remove the job id from the execution step so it cannot be retried again
    await executionStep.$query().patch({ jobId: null })
    logger.error('Bulk retrying iteration - no job', {
      event: 'bulk-retry-iteration-no-job',
      ...defaultLoggerMetadata,
      oldJobId: jobId,
    })
    throw new Error(
      `Job for ${executionId}-${executionStepId}-${metadata.iteration} not found or has expired`,
    )
  }

  // Only genuine getState() failures are reported as "get state error".
  // The "job not failed" case below is handled outside this handler so it is
  // not logged a second time under a misleading event name.
  const jobState = await job.getState().catch((error: unknown) => {
    logger.error('Bulk retrying execution step - job get state error', {
      event: 'bulk-retry-iteration-job-getstate-error',
      ...defaultLoggerMetadata,
      oldJobData: job.data,
      oldJobId: job.id,
      error,
    })

    throw error
  })

  if (jobState !== 'failed') {
    logger.warn(
      `Bulk retrying iteration ${metadata.iteration} - job not failed`,
      {
        event: 'bulk-retry-iteration-job-not-failed',
        ...defaultLoggerMetadata,
        jobId,
        jobState,
      },
    )
    throw new Error(
      `Job for ${executionId}-${executionStepId}-${metadata.iteration} (JOB: ${jobId}) is not in a failed state`,
    )
  }

  logger.info('Bulk retrying execution step - start', {
    event: 'bulk-retry-iteration-start',
    ...defaultLoggerMetadata,
    oldJobData: job.data,
    oldJobId: job.id,
  })

  try {
    await job.remove()

    const newJob = await enqueueActionJob({
      appKey,
      jobName: job.name,
      jobData: job.data,
      jobOptions: DEFAULT_JOB_OPTIONS,
    })
    // NOTE(review): clearing the status presumably marks the execution as
    // in progress again — confirm against the Execution model.
    await Execution.query().findById(executionId).patch({ status: null })
    await executionStep.$query().patch({ jobId: newJob.id })

    logger.info('Bulk retrying iterations - done', {
      event: 'bulk-retry-iteration-done',
      ...defaultLoggerMetadata,
      oldJobData: job.data,
      oldJobId: job.id,
      // Log the real ID of the re-enqueued job (previously a hard-coded placeholder).
      newJobId: newJob.id,
    })
  } catch (error) {
    logger.error('Bulk retrying iterations - ERROR', {
      event: 'bulk-retry-iteration-failed',
      ...defaultLoggerMetadata,
      oldJobData: job.data,
      oldJobId: job.id,
      error,
    })

    throw error
  }
}

/**
 * GraphQL mutation resolver that retries every failed iteration of an
 * execution.
 *
 * The latest attempt of each (step, iteration) pair is loaded; attempts that
 * are not in the `failure` state, have no job ID, or belong to a different
 * execution are skipped. The remaining iterations are retried in batches of
 * `CHUNK_SIZE`, each batch running concurrently.
 *
 * @param _parent - Unused parent resolver value.
 * @param params - Mutation arguments; `params.input.executionId` is required.
 * @returns `numFailedIterations` (number of iterations a retry was attempted
 *   for) and `allSuccessfullyRetried` (true when no retry attempt rejected).
 * @throws Error when no execution ID is supplied.
 */
const bulkRetryIterations: MutationResolvers['bulkRetryIterations'] = async (
  _parent,
  params,
) => {
  if (!params.input.executionId) {
    throw new Error('Execution ID is required')
  }

  let failedExecutionSteps = await getAllFailedIterations(
    params.input.executionId,
  )

  /**
   * NOTE: this filters out execution steps that are not failed or have no job id
   * if there is no job id, we will skip the retry
   */
  failedExecutionSteps = failedExecutionSteps.filter((executionStep) => {
    const {
      id: executionStepId,
      executionId,
      status,
      jobId,
      metadata,
    } = executionStep

    const defaultLoggerMetadata = {
      executionId,
      executionStepId,
      iteration: metadata.iteration,
    }

    if (status !== 'failure') {
      logger.error(
        'Latest execution step is not failed for a failed execution',
        {
          event: 'bulk-retry-iteration-step-status-mismatch',
          ...defaultLoggerMetadata,
        },
      )
      return false
    }

    // Explicit null/undefined check instead of a falsy check:
    // for fresh per-app queues, job ID can be 0.
    if (jobId === null || jobId === undefined) {
      logger.error('Latest execution step does not have a job ID', {
        event: 'bulk-retry-iteration-step-no-job-id',
        ...defaultLoggerMetadata,
      })
      return false
    }

    // Defensive guard: only retry steps of the requested execution.
    return executionId === params.input.executionId
  })

  // Nothing to do if no steps to retry
  if (failedExecutionSteps.length === 0) {
    return {
      numFailedIterations: 0,
      allSuccessfullyRetried: true,
    }
  }

  // Retry each failed iteration, one batch at a time. allSettled ensures a
  // single failing retry does not abort the rest of its batch.
  const retryAttempts: PromiseSettledResult<void>[] = []

  for (const currChunk of chunk(failedExecutionSteps, CHUNK_SIZE)) {
    const currRetryAttempts = await Promise.allSettled(
      currChunk.map((executionStep) => retryFailedIteration(executionStep)),
    )
    retryAttempts.push(...currRetryAttempts)
  }

  const allSuccessfullyRetried = retryAttempts.every(
    (attempt) => attempt.status === 'fulfilled',
  )

  if (!allSuccessfullyRetried) {
    // Actually we can do some more processing to see which IDs failed but nvm.
    logger.warn('Some attempts in bulk iteration retry failed', {
      event: 'bulk-retry-iteration-some-attempts-failed',
      executionId: params.input.executionId,
    })
  } else {
    logger.info('Bulk iteration retry succeeded', {
      event: 'bulk-retry-iteration-success',
      executionId: params.input.executionId,
      numRetried: failedExecutionSteps.length,
    })
  }

  return {
    numFailedIterations: failedExecutionSteps.length,
    allSuccessfullyRetried,
  }
}
| 236 | + |
| 237 | +export default bulkRetryIterations |
0 commit comments