@@ -8,16 +8,16 @@ use crate::app_manager::request_context::{
88 RequireBufferContext , ShuffleResult , WritingViewContext ,
99} ;
1010use crate :: block_id_manager:: { get_block_id_manager, BlockIdManager } ;
11- use crate :: client_configs:: STORAGE_CAPACITY_PARTITION_SPLIT_ENABLED ;
11+ use crate :: client_configs:: HARD_SPLIT_ENABLED ;
1212use crate :: config:: Config ;
1313use crate :: config_reconfigure:: ReconfigurableConfManager ;
1414use crate :: config_ref:: { ByteString , ConfigOption } ;
1515use crate :: constant:: ALL_LABEL ;
1616use crate :: ddashmap:: DDashMap ;
1717use crate :: error:: WorkerError ;
1818use crate :: metric:: {
19- BLOCK_ID_NUMBER , GAUGE_HUGE_PARTITION_NUMBER , GAUGE_PARTITION_NUMBER , RESIDENT_BYTES ,
20- TOTAL_HUGE_PARTITION_NUMBER , TOTAL_HUGE_PARTITION_REQUIRE_BUFFER_FAILED ,
19+ BLOCK_ID_NUMBER , GAUGE_HUGE_PARTITION_NUMBER , GAUGE_PARTITION_NUMBER , HARD_SPLIT_COUNTER ,
20+ RESIDENT_BYTES , TOTAL_HUGE_PARTITION_NUMBER , TOTAL_HUGE_PARTITION_REQUIRE_BUFFER_FAILED ,
2121 TOTAL_PARTITION_NUMBER , TOTAL_READ_DATA , TOTAL_READ_DATA_FROM_LOCALFILE ,
2222 TOTAL_READ_DATA_FROM_MEMORY , TOTAL_READ_INDEX_FROM_LOCALFILE , TOTAL_RECEIVED_DATA ,
2323 TOTAL_REQUIRE_BUFFER_FAILED ,
@@ -348,15 +348,17 @@ impl App {
348348 if self
349349 . app_config_options
350350 . client_configs
351- . get ( & STORAGE_CAPACITY_PARTITION_SPLIT_ENABLED )
351+ . get ( & HARD_SPLIT_ENABLED )
352352 . unwrap_or ( false )
353+ // TODO: If the store is corrupted and only a single replica exists, fail the job fast instead of performing a hard split.
353354 && !self . store . is_healthy ( ) . await ?
354355 {
355356 warn ! (
356- "[{}] writing is limited due to the unhealthy storage" ,
357+ "Hard split is activated for [{}] due to the unhealthy storage" ,
357358 & app_id. to_string( )
358359 ) ;
359- return Err ( WorkerError :: WRITE_LIMITED_BY_STORAGE_STATE ) ;
360+ HARD_SPLIT_COUNTER . inc ( ) ;
361+ return Err ( WorkerError :: HARD_SPLIT_BY_UNHEALTHY_STORAGE ) ;
360362 }
361363
362364 let mut partition_split_candidates = HashSet :: new ( ) ;
@@ -516,7 +518,7 @@ mod tests {
516518 use crate :: app_manager:: application_identifier:: ApplicationId ;
517519 use crate :: app_manager:: partition_identifier:: PartitionUId ;
518520 use crate :: app_manager:: request_context:: RequireBufferContext ;
519- use crate :: client_configs:: { ClientRssConf , STORAGE_CAPACITY_PARTITION_SPLIT_ENABLED } ;
521+ use crate :: client_configs:: { ClientRssConf , HARD_SPLIT_ENABLED } ;
520522 use crate :: config:: StorageType ;
521523 use crate :: config:: StorageType :: LOCALFILE ;
522524 use crate :: config_reconfigure:: ReconfigurableConfManager ;
@@ -537,10 +539,7 @@ mod tests {
537539 let app_id = ApplicationId :: YARN ( 1 , 1 , 1 ) ;
538540
539541 let mut hmap = HashMap :: new ( ) ;
540- hmap. insert (
541- "spark.rss.riffle.storageCapacityPartitionSplitEnabled" . to_string ( ) ,
542- "true" . to_string ( ) ,
543- ) ;
542+ hmap. insert ( HARD_SPLIT_ENABLED . get_key ( ) , "true" . to_string ( ) ) ;
544543 let conf = ClientRssConf :: from ( hmap) ;
545544 let options = AppConfigOptions :: new ( DataDistribution :: NORMAL , 1 , None , conf) ;
546545
@@ -596,7 +595,7 @@ mod tests {
596595 healthy_tag. store ( false , Ordering :: SeqCst ) ;
597596
598597 match runtime. block_on ( async { app. require_buffer ( ctx. clone ( ) ) . await } ) {
599- Err ( WorkerError :: WRITE_LIMITED_BY_STORAGE_STATE ) => {
598+ Err ( WorkerError :: HARD_SPLIT_BY_UNHEALTHY_STORAGE ) => {
600599 // pass
601600 }
602601 _ => {
0 commit comments