@@ -60,57 +60,97 @@ class KyuubiBatchService(
6060 }
6161
6262 override def start (): Unit = {
63+ val UNINITIALIZED_BATCH_ID = " UNINITIALIZED_BATCH_ID"
6364 assert(running.compareAndSet(false , true ))
6465 val submitTask : Runnable = () => {
6566 restFrontend.waitForServerStarted()
6667 while (running.get) {
67- metadataManager.pickBatchForSubmitting(kyuubiInstance) match {
68- case None => Thread .sleep(1000 )
69- case Some (metadata) =>
70- val batchId = metadata.identifier
71- info(s " $batchId is picked for submission. " )
72- val batchSession = sessionManager.createBatchSession(
73- metadata.username,
74- " anonymous" ,
75- metadata.ipAddress,
76- metadata.requestConf,
77- metadata.engineType,
78- Option (metadata.requestName),
79- metadata.resource,
80- metadata.className,
81- metadata.requestArgs,
82- Some (metadata),
83- fromRecovery = false )
84- sessionManager.openBatchSession(batchSession)
85- var submitted = false
86- while (! submitted) { // block until batch job submitted
87- submitted = metadataManager.getBatchSessionMetadata(batchId) match {
88- case Some (metadata) if OperationState .isTerminal(metadata.opState) =>
89- true
90- case Some (metadata) if metadata.opState == OperationState .RUNNING =>
91- metadata.appState match {
92- // app that is not submitted to resource manager
93- case None | Some (ApplicationState .NOT_FOUND ) => false
94- // app that is pending in resource manager while the local startup
95- // process is alive. For example, in Spark YARN cluster mode, if set
96- // spark.yarn.submit.waitAppCompletion=false, the local spark-submit
97- // process exits immediately once Application goes ACCEPTED status,
98- // even no resource could be allocated for the AM container.
99- case Some (ApplicationState .PENDING ) if batchSession.startupProcessAlive =>
100- false
101- // not sure, added for safe
102- case Some (ApplicationState .UNKNOWN ) => false
103- case _ => true
104- }
105- case Some (_) =>
106- false
107- case None =>
108- error(s " $batchId does not existed in metastore, assume it is finished " )
109- true
68+ var batchId = UNINITIALIZED_BATCH_ID
69+ try {
70+ metadataManager.pickBatchForSubmitting(kyuubiInstance) match {
71+ case None => Thread .sleep(1000 )
72+ case Some (metadata) =>
73+ batchId = metadata.identifier
74+ info(s " $batchId is picked for submission. " )
75+ val batchSession = sessionManager.createBatchSession(
76+ metadata.username,
77+ " anonymous" ,
78+ metadata.ipAddress,
79+ metadata.requestConf,
80+ metadata.engineType,
81+ Option (metadata.requestName),
82+ metadata.resource,
83+ metadata.className,
84+ metadata.requestArgs,
85+ Some (metadata),
86+ fromRecovery = false )
87+ sessionManager.openBatchSession(batchSession)
88+ var submitted = false
89+ while (! submitted) { // block until batch job submitted
90+ submitted = metadataManager.getBatchSessionMetadata(batchId) match {
91+ case Some (metadata) if OperationState .isTerminal(metadata.opState) =>
92+ true
93+ case Some (metadata) if metadata.opState == OperationState .RUNNING =>
94+ metadata.appState match {
95+ // app that is not submitted to resource manager
96+ case None | Some (ApplicationState .NOT_FOUND ) => false
97+ // app that is pending in resource manager while the local startup
98+ // process is alive. For example, in Spark YARN cluster mode, if set
99+ // spark.yarn.submit.waitAppCompletion=false, the local spark-submit
100+ // process exits immediately once Application goes ACCEPTED status,
101+ // even no resource could be allocated for the AM container.
102+ case Some (ApplicationState .PENDING ) if batchSession.startupProcessAlive =>
103+ false
104+ // not sure, added for safe
105+ case Some (ApplicationState .UNKNOWN ) => false
106+ case _ => true
107+ }
108+ case Some (_) =>
109+ false
110+ case None =>
111+ error(s " $batchId does not exist in metastore, assume it is finished " )
112+ true
113+ }
114+ if (! submitted) Thread .sleep(1000 )
115+ }
116+ info(s " $batchId is submitted or finished. " )
117+ }
118+ } catch {
119+ case e : InterruptedException =>
120+ if (batchId == UNINITIALIZED_BATCH_ID ) {
121+ error(s " Interrupted while picking batch for submission " , e)
122+ } else {
123+ error(s " Interrupted while opening batch session for $batchId" , e)
124+ try {
125+ metadataManager.failScheduledBatch(batchId)
126+ } catch {
127+ case ex : Exception =>
128+ error(
129+ s " Unable to modify metadata for $batchId to ERROR; " +
130+ " an administrator may need to reset the batch state manually." ,
131+ ex)
132+ }
133+ }
134+ throw e
135+ // If the batch session failed to open, reinitialize the batch state to ERROR
136+ // This can be due to a DB error or batch_connection_limits exceeded
137+ case e : Exception =>
138+ if (batchId == UNINITIALIZED_BATCH_ID ) {
139+ error(s " Error picking batch for submission " , e)
140+ } else {
141+ error(s " Error opening batch session for $batchId" , e)
142+ try {
143+ metadataManager.failScheduledBatch(batchId)
144+ } catch {
145+ case ex : Exception =>
146+ error(
147+ s " Unable to modify metadata for $batchId to ERROR; " +
148+ " an administrator may need to reset the batch state manually." ,
149+ ex)
110150 }
111- if (! submitted) Thread .sleep(1000 )
112151 }
113- info(s " $batchId is submitted or finished. " )
152+ // sleep 1 second to avoid excessive retries during transient network/DB failures
153+ Thread .sleep(1000 )
114154 }
115155 }
116156 }
0 commit comments