@@ -217,9 +217,7 @@ class BatchIndexer {
217217 process . stdout . write ( ` → flushing ${ toSend . length } docs: ${ label } \n` ) ;
218218 }
219219
220- const index = this . client . index ( this . indexName ) ;
221- const task = await index . addDocuments ( toSend ) ;
222- await this . client . tasks . waitForTask ( task . taskUid , { timeout : 300_000 } ) ;
220+ await this . flushWithRetry ( toSend ) ;
223221
224222 this . totalSent += toSend . length ;
225223 this . batchesSent ++ ;
@@ -231,6 +229,64 @@ class BatchIndexer {
231229 }
232230 }
233231
232+ // Meilisearch can silently restart mid-task under memory pressure, causing
233+ // ECONNRESET on either addDocuments POST or waitForTask polling. Submitted
234+ // tasks are persisted in LMDB and typically resume on server recovery, so we
235+ // wait for /health to return "available" and retry — rather than giving up
236+ // after a short backoff that can easily expire inside one crash cycle
237+ // (observed ~60s between crashes). waitForTask reuses the original taskUid
238+ // so we wait for the already-enqueued task rather than resubmitting.
239+ private async flushWithRetry ( toSend : SearchDocument [ ] ) : Promise < void > {
240+ const maxAttempts = 5 ;
241+ const healthWaitMs = 180_000 ;
242+ const index = this . client . index ( this . indexName ) ;
243+ let taskUid : number | null = null ;
244+
245+ for ( let attempt = 1 ; attempt <= maxAttempts ; attempt ++ ) {
246+ try {
247+ if ( taskUid === null ) {
248+ const task = await index . addDocuments ( toSend ) ;
249+ taskUid = task . taskUid ;
250+ }
251+ await this . client . tasks . waitForTask ( taskUid , { timeout : 300_000 } ) ;
252+ return ;
253+ } catch ( err ) {
254+ if ( attempt === maxAttempts ) throw err ;
255+ const message = err instanceof Error ? err . message : String ( err ) ;
256+ const firstId = toSend [ 0 ] ?. id ?? "" ;
257+ const context = taskUid !== null ? `waitForTask(${ taskUid } )` : "addDocuments" ;
258+ process . stdout . write (
259+ ` ⟳ attempt ${ attempt } /${ maxAttempts - 1 } — ${ context } failed (${ message } ) for batch starting ${ firstId } \n` ,
260+ ) ;
261+ const recovered = await this . waitForMeiliHealth ( healthWaitMs ) ;
262+ if ( ! recovered ) {
263+ process . stdout . write ( ` ⟳ Meilisearch did not recover within ${ healthWaitMs / 1000 } s — giving up this batch\n` ) ;
264+ throw err ;
265+ }
266+ // Small grace period after recovery lets Meilisearch finish its startup.
267+ await new Promise ( ( resolve ) => setTimeout ( resolve , 3000 ) ) ;
268+ }
269+ }
270+ }
271+
272+ private async waitForMeiliHealth ( maxWaitMs : number ) : Promise < boolean > {
273+ const deadline = Date . now ( ) + maxWaitMs ;
274+ const pollMs = 5000 ;
275+ while ( Date . now ( ) < deadline ) {
276+ try {
277+ const health = await this . client . health ( ) ;
278+ if ( health . status === "available" ) {
279+ process . stdout . write ( ` ⟳ Meilisearch healthy — resuming\n` ) ;
280+ return true ;
281+ }
282+ } catch {
283+ // Connection refused / reset — Meilisearch still down or restarting
284+ }
285+ await new Promise ( ( resolve ) => setTimeout ( resolve , pollMs ) ) ;
286+ }
287+ return false ;
288+ }
289+
/** Total number of documents sent so far across all flushed batches. */
get total(): number {
  const { totalSent } = this;
  return totalSent;
}
0 commit comments