livepeer · leszko · Apr 23, 2025 · Apr 23, 2025 · Apr 23, 2025 · Apr 23, 2025
diff --git a/cmd/livepeer/livepeer.go b/cmd/livepeer/livepeer.go
@@ -165,6 +165,7 @@ func parseLivepeerConfig() starter.LivepeerConfig {
 	cfg.AIVerboseLogs = flag.Bool("aiVerboseLogs", *cfg.AIVerboseLogs, "Set to true to enable verbose logs for the AI runner containers created by the worker")
 	cfg.AIRunnerImageOverrides = flag.String("aiRunnerImageOverrides", *cfg.AIRunnerImageOverrides, `Specify overrides for the Docker images used by the AI runner. Example: '{"default": "livepeer/ai-runner:v1.0", "batch": {"text-to-speech": "livepeer/ai-runner:text-to-speech-v1.0"}, "live": {"another-pipeline": "livepeer/ai-runner:another-pipeline-v1.0"}}'`)
 	cfg.AIProcessingRetryTimeout = flag.Duration("aiProcessingRetryTimeout", *cfg.AIProcessingRetryTimeout, "Timeout for retrying to initiate AI processing request")
+	cfg.AIStartupOrchSwapTimeout = flag.Duration("aiStartupOrchSwapTimeout", *cfg.AIStartupOrchSwapTimeout, "Timeout to wait for the Orchestrator to return the first output segment")
 	cfg.AIRunnerContainersPerGPU = flag.Int("aiRunnerContainersPerGPU", *cfg.AIRunnerContainersPerGPU, "Number of AI runner containers to run per GPU; default to 1")
 
 	// Live AI:

diff --git a/cmd/livepeer/starter/starter.go b/cmd/livepeer/starter/starter.go
@@ -167,6 +167,7 @@
 	AIRunnerImageOverrides     *string
 	AIVerboseLogs              *bool
 	AIProcessingRetryTimeout   *time.Duration
+	AIStartupOrchSwapTimeout   *time.Duration
 	AIRunnerContainersPerGPU   *int
 	KafkaBootstrapServers      *string
 	KafkaUsername              *string
@@ -220,6 +221,7 @@
 	defaultAIRunnerImage := "livepeer/ai-runner:latest"
 	defaultAIVerboseLogs := false
 	defaultAIProcessingRetryTimeout := 2 * time.Second
+	defaultAIStartupOrchSwapTimeout := 15 * time.Second
 	defaultAIRunnerContainersPerGPU := 1
 	defaultAIRunnerImageOverrides := ""
 	defaultLiveAIAuthWebhookURL := ""
@@ -333,6 +335,7 @@
 		AIRunnerImage:            &defaultAIRunnerImage,
 		AIVerboseLogs:            &defaultAIVerboseLogs,
 		AIProcessingRetryTimeout: &defaultAIProcessingRetryTimeout,
+		AIStartupOrchSwapTimeout: &defaultAIStartupOrchSwapTimeout,
 		AIRunnerContainersPerGPU: &defaultAIRunnerContainersPerGPU,
 		AIRunnerImageOverrides:   &defaultAIRunnerImageOverrides,
 		LiveAIAuthWebhookURL:     &defaultLiveAIAuthWebhookURL,
@@ -523,6 +526,7 @@
 		glog.Errorf("Error creating livepeer node: %v", err)
 	}
 	n.AIProcesssingRetryTimeout = *cfg.AIProcessingRetryTimeout
+	n.AIStartupOrchSwapTimeout = *cfg.AIStartupOrchSwapTimeout
 
 	if *cfg.OrchSecret != "" {
 		n.OrchSecret, _ = common.ReadFromFile(*cfg.OrchSecret)

diff --git a/core/livepeernode.go b/core/livepeernode.go
@@ -122,6 +122,7 @@ type LivepeerNode struct {
 	AIWorker                  AI
 	AIWorkerManager           *RemoteAIWorkerManager
 	AIProcesssingRetryTimeout time.Duration
+	AIStartupOrchSwapTimeout  time.Duration
 
 	// Transcoder public fields
 	SegmentChans       map[ManifestID]SegmentChan

diff --git a/server/ai_live_video.go b/server/ai_live_video.go
@@ -27,10 +27,11 @@
 
 func startTricklePublish(ctx context.Context, url *url.URL, params aiRequestParams, sess *AISession) {
 	ctx = clog.AddVal(ctx, "url", url.Redacted())
+	mid := extractMid(url.Path)
 	publisher, err := trickle.NewTricklePublisher(url.String())
 	if err != nil {
 		clog.Infof(ctx, "error publishing trickle. err=%s", err)
-		params.liveParams.stopPipeline(fmt.Errorf("Error publishing trickle %w", err))
+		params.liveParams.stop(mid, fmt.Errorf("Error publishing trickle %w", err))
 		return
 	}
 
@@ -45,7 +46,7 @@
 				sess:      sess.BroadcastSession,
 				inPixels:  inPixels,
 				priceInfo: priceInfo,
-				mid:       extractMid(url.Path),
+				mid:       mid,
 			})
 		}
 		paymentProcessor = NewLivePaymentProcessor(ctx, params.liveParams.paymentProcessInterval, sendPaymentFunc)
@@ -69,7 +70,7 @@
 		thisSeq, atMax := slowOrchChecker.BeginSegment()
 		if atMax {
 			clog.Infof(ctx, "Orchestrator is slow - terminating")
-			params.liveParams.stopPipeline(fmt.Errorf("slow orchestrator"))
+			params.liveParams.stop(mid, fmt.Errorf("slow orchestrator"))
 			cancel()
 			return
 			// TODO switch orchestrators
@@ -119,7 +120,7 @@
 				}
 				if errors.Is(err, trickle.StreamNotFoundErr) {
 					clog.Infof(ctx, "Stream no longer exists on orchestrator; terminating")
-					params.liveParams.stopPipeline(fmt.Errorf("Stream does not exist"))
+					params.liveParams.stop(mid, fmt.Errorf("Stream does not exist"))
 					return
 				}
 				// Retry segment only if nothing has been sent yet
@@ -170,16 +171,17 @@
 }
 
 func startTrickleSubscribe(ctx context.Context, url *url.URL, params aiRequestParams, sess *AISession, onFistSegment func()) {
+	mid := extractMid(url.Path)
 	// subscribe to the outputs and send them into LPMS
 	subscriber := trickle.NewTrickleSubscriber(url.String())
 	r, w, err := os.Pipe()
 	if err != nil {
-		params.liveParams.stopPipeline(fmt.Errorf("error getting pipe for trickle-ffmpeg. url=%s %w", url, err))
+		params.liveParams.stop(mid, fmt.Errorf("error getting pipe for trickle-ffmpeg. url=%s %w", url, err))
 		return
 	}
 	rMediaMTX, wMediaMTX, err := os.Pipe()
 	if err != nil {
-		params.liveParams.stopPipeline(fmt.Errorf("error getting pipe for MediaMTX trickle-ffmpeg. url=%s %w", url, err))
+		params.liveParams.stop(mid, fmt.Errorf("error getting pipe for MediaMTX trickle-ffmpeg. url=%s %w", url, err))
 		return
 	}
 	ctx = clog.AddVal(ctx, "url", url.Redacted())
@@ -210,7 +212,7 @@
 			segment, err = subscriber.Read()
 			if err != nil {
 				if errors.Is(err, trickle.EOS) || errors.Is(err, trickle.StreamNotFoundErr) {
-					params.liveParams.stopPipeline(fmt.Errorf("trickle subscribe end of stream: %w", err))
+					params.liveParams.stop(mid, fmt.Errorf("trickle subscribe end of stream: %w", err))
 					return
 				}
 				var sequenceNonexistent *trickle.SequenceNonexistent
@@ -222,7 +224,7 @@
 				err = fmt.Errorf("trickle subscribe error reading: %w", err)
 				clog.Infof(ctx, "%s", err)
 				if retries > maxRetries {
-					params.liveParams.stopPipeline(err)
+					params.liveParams.stop(mid, err)
 					return
 				}
 				retries++
@@ -236,7 +238,7 @@
 
 			n, err := copySegment(segment, multiWriter)
 			if err != nil {
-				params.liveParams.stopPipeline(fmt.Errorf("trickle subscribe error copying: %w", err))
+				params.liveParams.stop(mid, fmt.Errorf("trickle subscribe error copying: %w", err))
 				return
 			}
 			if firstSegment {
@@ -266,14 +268,14 @@
 
 	// Studio Output ffmpeg process
 	if params.liveParams.outputRTMPURL != "" {
-		go ffmpegOutput(ctx, params.liveParams.outputRTMPURL, r, params)
+		go ffmpegOutput(ctx, mid, params.liveParams.outputRTMPURL, r, params)
 	}
 
 	// MediaMTX Output ffmpeg process
-	go ffmpegOutput(ctx, params.liveParams.mediaMTXOutputRTMPURL, rMediaMTX, params)
+	go ffmpegOutput(ctx, mid, params.liveParams.mediaMTXOutputRTMPURL, rMediaMTX, params)
 }
 
-func ffmpegOutput(ctx context.Context, outputUrl string, r io.ReadCloser, params aiRequestParams) {
+func ffmpegOutput(ctx context.Context, mid string, outputUrl string, r io.ReadCloser, params aiRequestParams) {
 	ctx = clog.AddVal(ctx, "rtmpOut", outputUrl)
 	defer func() {
 		r.Close()
@@ -284,7 +286,7 @@
 				err = errors.New("unknown error")
 			}
 			clog.Errorf(ctx, "LPMS panic err=%v", err)
-			params.liveParams.stopPipeline(fmt.Errorf("LPMS panic %w", err))
+			params.liveParams.stop(mid, fmt.Errorf("LPMS panic %w", err))
 		}
 	}()
 	for {
@@ -294,7 +296,7 @@
 			break
 		}
 
-		cmd := exec.Command("ffmpeg",
+		cmd := exec.CommandContext(ctx, "ffmpeg",
 			"-analyzeduration", "2500000", // 2.5 seconds
 			"-i", "pipe:0",
 			"-c:a", "copy",
@@ -377,6 +379,7 @@
 const clearStreamDelay = 1 * time.Minute
 
 func startEventsSubscribe(ctx context.Context, url *url.URL, params aiRequestParams, sess *AISession) {
+	mid := extractMid(url.Path)
 	subscriber := trickle.NewTrickleSubscriber(url.String())
 	stream := params.liveParams.stream
 	streamId := params.liveParams.streamID
@@ -425,7 +428,7 @@
 				if retries > maxRetries {
 					clog.Infof(ctx, "Too many errors reading events; stopping subscription err=%v", err)
 					err = fmt.Errorf("Error reading subscription: %w", err)
-					params.liveParams.stopPipeline(err)
+					params.liveParams.stop(mid, err)
 					return
 				}
 				clog.Infof(ctx, "Error reading events subscription: err=%v retry=%d", err, retries)
@@ -526,7 +529,7 @@
 				eventTime := lastEvent
 				lastEventMu.Unlock()
 				if time.Now().Sub(eventTime) > maxEventGap {
-					params.liveParams.stopPipeline(errors.New("timeout waiting for events"))
+					params.liveParams.stop(mid, errors.New("timeout waiting for events"))
 					eventTicker.Stop()
 					return
 				}

diff --git a/server/ai_mediaserver.go b/server/ai_mediaserver.go
@@ -591,6 +591,7 @@
 				streamID:               streamID,
 				pipelineID:             pipelineID,
 				stopPipeline:           stopPipeline,
+				initCompleted:          make(chan struct{}),
 				sendErrorEvent:         sendErrorEvent,
 			},
 		}
@@ -883,6 +884,7 @@
 					streamID:               streamID,
 					pipelineID:             pipelineID,
 					stopPipeline:           stopPipeline,
+					initCompleted:          make(chan struct{}),
 					sendErrorEvent:         sendErrorEvent,
 					orchestrator:           orchestrator,
 				},

diff --git a/server/ai_process.go b/server/ai_process.go
@@ -15,6 +15,7 @@
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/livepeer/go-livepeer/ai/worker"
@@ -115,12 +116,33 @@
 	paymentProcessInterval time.Duration
 
 	// Stops the pipeline with an error. Also kicks the input
-	stopPipeline func(error)
+	stopPipeline  func(error)
+	lastMid       string
+	initCompleted chan struct{}
+	mu            sync.Mutex
 
 	// Report an error event
 	sendErrorEvent func(error)
 }
 
+// start records mid as the manifest ID of the latest attempt (pointer receiver so the write persists).
+func (p *liveRequestParams) start(mid string) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.lastMid = mid
+}
+
+// stop waits for initCompleted to close, then calls stopPipeline(err) only if mid is still lastMid.
+func (p *liveRequestParams) stop(mid string, err error) {
+	<-p.initCompleted // blocks until the channel is closed; a nil channel blocks forever
+	p.mu.Lock()
+	lastMid := p.lastMid
+	p.mu.Unlock()
+	if mid == lastMid { // a mid superseded by a later start() is ignored
+		p.stopPipeline(err)
+	}
+}
+
 // CalculateTextToImageLatencyScore computes the time taken per pixel for an text-to-image request.
 func CalculateTextToImageLatencyScore(took time.Duration, req worker.GenTextToImageJSONRequestBody, outPixels int64) float64 {
 	if outPixels <= 0 {
@@ -1088,6 +1110,9 @@
 		return nil, fmt.Errorf("invalid events URL: %w", err)
 	}
 	clog.V(common.VERBOSE).Infof(ctx, "pub %s sub %s control %s events %s", pub, sub, control, events)
+	firstSegmentReceived := make(chan struct{}, 1)
+	params.liveParams.start(extractMid(pub.Path))
+	ctx, cancelCtx := context.WithCancel(ctx)
 
 	startControlPublish(ctx, control, params)
 	startTricklePublish(ctx, pub, params, sess)
@@ -1108,10 +1133,21 @@
 			})
 		}
 		clog.V(common.VERBOSE).Infof(ctx, "First Segment delay=%dms streamID=%s", delayMs, params.liveParams.streamID)
+		select {
+		case firstSegmentReceived <- struct{}{}:
+		default:
+		}
 
 	})
 	startEventsSubscribe(ctx, events, params, sess)
-	return resp, nil
+	select {
+	case <-firstSegmentReceived:
+		return resp, nil
+	case <-time.After(params.node.AIStartupOrchSwapTimeout):
+		cancelCtx()
+		return nil, errors.New("timeout waiting for first segment")
+	}
+
 }
 
 // extractMid extracts the mid (manifest ID) from the publish URL
@@ -1500,6 +1536,7 @@
 	cctx, cancel := context.WithTimeout(ctx, processingRetryTimeout)
 	defer cancel()
 
+	defer completeAIRequest(params)
 	tries := 0
 	var retryableSessions []*AISession
 	for tries < maxTries {
@@ -1579,6 +1616,13 @@
 	return resp, nil
 }
 
+// completeAIRequest closes initCompleted, releasing stop() callers; no-op if liveParams or the channel is nil.
+func completeAIRequest(params aiRequestParams) {
+	if params.liveParams != nil && params.liveParams.initCompleted != nil {
+		close(params.liveParams.initCompleted)
+	}
+}
+
 // isRetryableError checks if the error is a transient error that can be retried.
 func isRetryableError(err error) bool {
 	return errContainsMsg(err, "ticketparams expired")