@@ -74,7 +74,7 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls(
7474///
7575/// This should be given the logical type in order to communicate Polars datatype information down
7676/// into the row encoding / decoding.
77- pub fn get_row_encoding_context ( dtype : & DataType ) -> Option < RowEncodingContext > {
77+ pub fn get_row_encoding_context ( dtype : & DataType , ordered : bool ) -> Option < RowEncodingContext > {
7878 match dtype {
7979 DataType :: Boolean
8080 | DataType :: UInt8
@@ -108,67 +108,86 @@ pub fn get_row_encoding_context(dtype: &DataType) -> Option<RowEncodingContext>
108108 } ,
109109
110110 #[ cfg( feature = "dtype-array" ) ]
111- DataType :: Array ( dtype, _) => get_row_encoding_context ( dtype) ,
112- DataType :: List ( dtype) => get_row_encoding_context ( dtype) ,
111+ DataType :: Array ( dtype, _) => get_row_encoding_context ( dtype, ordered ) ,
112+ DataType :: List ( dtype) => get_row_encoding_context ( dtype, ordered ) ,
113113 #[ cfg( feature = "dtype-categorical" ) ]
114114 DataType :: Categorical ( revmap, ordering) | DataType :: Enum ( revmap, ordering) => {
115- let revmap = revmap. as_ref ( ) . unwrap ( ) ;
116-
117- let ( num_known_categories, lexical_sort_idxs) = match revmap. as_ref ( ) {
118- RevMapping :: Global ( map, _, _) => {
119- let num_known_categories = map. keys ( ) . max ( ) . copied ( ) . map_or ( 0 , |m| m + 1 ) ;
120-
121- // @TODO: This should probably be cached.
122- let lexical_sort_idxs =
123- matches ! ( ordering, CategoricalOrdering :: Lexical ) . then ( || {
124- let read_map = crate :: STRING_CACHE . read_map ( ) ;
125- let payloads = read_map. get_current_payloads ( ) ;
126- assert ! ( payloads. len( ) >= num_known_categories as usize ) ;
127-
128- let mut idxs = ( 0 ..num_known_categories) . collect :: < Vec < u32 > > ( ) ;
129- idxs. sort_by_key ( |& k| payloads[ k as usize ] . as_str ( ) ) ;
130- let mut sort_idxs = vec ! [ 0 ; num_known_categories as usize ] ;
131- for ( i, idx) in idxs. into_iter ( ) . enumerate_u32 ( ) {
132- sort_idxs[ idx as usize ] = i;
133- }
134- sort_idxs
135- } ) ;
136-
137- ( num_known_categories, lexical_sort_idxs)
115+ let is_enum = dtype. is_enum ( ) ;
116+ let ctx = match revmap {
117+ Some ( revmap) => {
118+ let ( num_known_categories, lexical_sort_idxs) = match revmap. as_ref ( ) {
119+ RevMapping :: Global ( map, _, _) => {
120+ let num_known_categories =
121+ map. keys ( ) . max ( ) . copied ( ) . map_or ( 0 , |m| m + 1 ) ;
122+
123+ // @TODO: This should probably be cached.
124+ let lexical_sort_idxs = ( ordered
125+ && matches ! ( ordering, CategoricalOrdering :: Lexical ) )
126+ . then ( || {
127+ let read_map = crate :: STRING_CACHE . read_map ( ) ;
128+ let payloads = read_map. get_current_payloads ( ) ;
129+ assert ! ( payloads. len( ) >= num_known_categories as usize ) ;
130+
131+ let mut idxs = ( 0 ..num_known_categories) . collect :: < Vec < u32 > > ( ) ;
132+ idxs. sort_by_key ( |& k| payloads[ k as usize ] . as_str ( ) ) ;
133+ let mut sort_idxs = vec ! [ 0 ; num_known_categories as usize ] ;
134+ for ( i, idx) in idxs. into_iter ( ) . enumerate_u32 ( ) {
135+ sort_idxs[ idx as usize ] = i;
136+ }
137+ sort_idxs
138+ } ) ;
139+
140+ ( num_known_categories, lexical_sort_idxs)
141+ } ,
142+ RevMapping :: Local ( values, _) => {
143+ // @TODO: This should probably be cached.
144+ let lexical_sort_idxs = ( ordered
145+ && matches ! ( ordering, CategoricalOrdering :: Lexical ) )
146+ . then ( || {
147+ assert_eq ! ( values. null_count( ) , 0 ) ;
148+ let values: Vec < & str > = values. values_iter ( ) . collect ( ) ;
149+
150+ let mut idxs = ( 0 ..values. len ( ) as u32 ) . collect :: < Vec < u32 > > ( ) ;
151+ idxs. sort_by_key ( |& k| values[ k as usize ] ) ;
152+ let mut sort_idxs = vec ! [ 0 ; values. len( ) ] ;
153+ for ( i, idx) in idxs. into_iter ( ) . enumerate_u32 ( ) {
154+ sort_idxs[ idx as usize ] = i;
155+ }
156+ sort_idxs
157+ } ) ;
158+
159+ ( values. len ( ) as u32 , lexical_sort_idxs)
160+ } ,
161+ } ;
162+
163+ RowEncodingCategoricalContext {
164+ num_known_categories,
165+ is_enum,
166+ lexical_sort_idxs,
167+ }
138168 } ,
139- RevMapping :: Local ( values, _) => {
140- // @TODO: This should probably be cached.
141- let lexical_sort_idxs =
142- matches ! ( ordering, CategoricalOrdering :: Lexical ) . then ( || {
143- assert_eq ! ( values. null_count( ) , 0 ) ;
144- let values: Vec < & str > = values. values_iter ( ) . collect ( ) ;
145-
146- let mut idxs = ( 0 ..values. len ( ) as u32 ) . collect :: < Vec < u32 > > ( ) ;
147- idxs. sort_by_key ( |& k| values[ k as usize ] ) ;
148- let mut sort_idxs = vec ! [ 0 ; values. len( ) ] ;
149- for ( i, idx) in idxs. into_iter ( ) . enumerate_u32 ( ) {
150- sort_idxs[ idx as usize ] = i;
151- }
152- sort_idxs
153- } ) ;
154-
155- ( values. len ( ) as u32 , lexical_sort_idxs)
169+ None => {
170+ let num_known_categories = u32:: MAX ;
171+
172+ if matches ! ( ordering, CategoricalOrdering :: Lexical ) && ordered {
173+ panic ! ( "lexical ordering not yet supported if rev-map not given" ) ;
174+ }
175+ RowEncodingCategoricalContext {
176+ num_known_categories,
177+ is_enum,
178+ lexical_sort_idxs : None ,
179+ }
156180 } ,
157181 } ;
158182
159- let ctx = RowEncodingCategoricalContext {
160- num_known_categories,
161- is_enum : matches ! ( dtype, DataType :: Enum ( _, _) ) ,
162- lexical_sort_idxs,
163- } ;
164183 Some ( RowEncodingContext :: Categorical ( ctx) )
165184 } ,
166185 #[ cfg( feature = "dtype-struct" ) ]
167186 DataType :: Struct ( fs) => {
168187 let mut ctxts = Vec :: new ( ) ;
169188
170189 for ( i, f) in fs. iter ( ) . enumerate ( ) {
171- if let Some ( ctxt) = get_row_encoding_context ( f. dtype ( ) ) {
190+ if let Some ( ctxt) = get_row_encoding_context ( f. dtype ( ) , ordered ) {
172191 ctxts. reserve ( fs. len ( ) ) ;
173192 ctxts. extend ( std:: iter:: repeat_n ( None , i) ) ;
174193 ctxts. push ( Some ( ctxt) ) ;
@@ -183,7 +202,7 @@ pub fn get_row_encoding_context(dtype: &DataType) -> Option<RowEncodingContext>
183202 ctxts. extend (
184203 fs[ ctxts. len ( ) ..]
185204 . iter ( )
186- . map ( |f| get_row_encoding_context ( f. dtype ( ) ) ) ,
205+ . map ( |f| get_row_encoding_context ( f. dtype ( ) , ordered ) ) ,
187206 ) ;
188207
189208 Some ( RowEncodingContext :: Struct ( ctxts) )
@@ -214,7 +233,7 @@ pub fn _get_rows_encoded_unordered(by: &[Column]) -> PolarsResult<RowsEncoded> {
214233 let by = by. as_materialized_series ( ) ;
215234 let arr = by. to_physical_repr ( ) . rechunk ( ) . chunks ( ) [ 0 ] . to_boxed ( ) ;
216235 let opt = RowEncodingOptions :: new_unsorted ( ) ;
217- let ctxt = get_row_encoding_context ( by. dtype ( ) ) ;
236+ let ctxt = get_row_encoding_context ( by. dtype ( ) , false ) ;
218237
219238 cols. push ( arr) ;
220239 opts. push ( opt) ;
@@ -245,7 +264,7 @@ pub fn _get_rows_encoded(
245264 let by = by. as_materialized_series ( ) ;
246265 let arr = by. to_physical_repr ( ) . rechunk ( ) . chunks ( ) [ 0 ] . to_boxed ( ) ;
247266 let opt = RowEncodingOptions :: new_sorted ( * desc, * null_last) ;
248- let ctxt = get_row_encoding_context ( by. dtype ( ) ) ;
267+ let ctxt = get_row_encoding_context ( by. dtype ( ) , true ) ;
249268
250269 cols. push ( arr) ;
251270 opts. push ( opt) ;
0 commit comments