Skip to content

Commit e336b3a

Browse files
authored
pqarrow/arrowutils: Add sorting support for Struct, RunEndEncoded, FixedSizeBinary (#936)
* pqarrow/arrowutils: Add FixedSizeBinaryDictionary support
* pqarrow/arrowutils: Add support for Struct and RunEndEncoded
* pqarrow/arrowutils: Handle empty structs correctly
* pqarrow/arrowutils: Retain record if unmodified
* query/physicalplan: Sampler requires LessOrEqual 1024 bytes allocations
* remove trailing newline
* pqarrow/arrowutils: Limit by using the arrowutils Take helper
  This will also allow us to limit all the newly supported column types: timestamp, struct and runendencoded.
* query/physicalplan: Guard against s.size==0 panic
  Please take a look at this fix, @thorfour. I'm not sure why this started panicking now.
* pqarrow/arrowutils: Release List ValueBuilder
1 parent 8ee3beb commit e336b3a

File tree

5 files changed

+428
-85
lines changed

5 files changed

+428
-85
lines changed

pqarrow/arrowutils/sort.go

Lines changed: 160 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,10 @@ func Take(ctx context.Context, r arrow.Record, indices *array.Int32) (arrow.Reco
7272
// does not have these columns.
7373
var customTake bool
7474
for i := 0; i < int(r.NumCols()); i++ {
75-
if r.Column(i).DataType().ID() == arrow.DICTIONARY || r.Column(i).DataType().ID() == arrow.LIST {
75+
if r.Column(i).DataType().ID() == arrow.DICTIONARY ||
76+
r.Column(i).DataType().ID() == arrow.RUN_END_ENCODED ||
77+
r.Column(i).DataType().ID() == arrow.LIST ||
78+
r.Column(i).DataType().ID() == arrow.STRUCT {
7679
customTake = true
7780
break
7881
}
@@ -108,8 +111,12 @@ func Take(ctx context.Context, r arrow.Record, indices *array.Int32) (arrow.Reco
108111
switch arr := r.Column(i).(type) {
109112
case *array.Dictionary:
110113
g.Go(func() error { return TakeDictColumn(ctx, arr, i, resArr, indices) })
114+
case *array.RunEndEncoded:
115+
g.Go(func() error { return TakeRunEndEncodedColumn(ctx, arr, i, resArr, indices) })
111116
case *array.List:
112117
g.Go(func() error { return TakeListColumn(ctx, arr, i, resArr, indices) })
118+
case *array.Struct:
119+
g.Go(func() error { return TakeStructColumn(ctx, arr, i, resArr, indices) })
113120
default:
114121
g.Go(func() error { return TakeColumn(ctx, col, i, resArr, indices) })
115122
}
@@ -140,22 +147,91 @@ func TakeColumn(ctx context.Context, a arrow.Array, idx int, arr []arrow.Array,
140147
}
141148

142149
func TakeDictColumn(ctx context.Context, a *array.Dictionary, idx int, arr []arrow.Array, indices *array.Int32) error {
143-
r := array.NewDictionaryBuilderWithDict(
144-
compute.GetAllocator(ctx), a.DataType().(*arrow.DictionaryType), a.Dictionary(),
145-
).(*array.BinaryDictionaryBuilder)
146-
defer r.Release()
150+
switch a.Dictionary().(type) {
151+
case *array.String, *array.Binary:
152+
r := array.NewDictionaryBuilderWithDict(
153+
compute.GetAllocator(ctx), a.DataType().(*arrow.DictionaryType), a.Dictionary(),
154+
).(*array.BinaryDictionaryBuilder)
155+
defer r.Release()
156+
157+
r.Reserve(indices.Len())
158+
idxBuilder := r.IndexBuilder()
159+
for _, i := range indices.Int32Values() {
160+
if a.IsNull(int(i)) {
161+
r.AppendNull()
162+
continue
163+
}
164+
idxBuilder.Append(a.GetValueIndex(int(i)))
165+
}
147166

148-
r.Reserve(indices.Len())
149-
idxBuilder := r.IndexBuilder()
150-
for _, i := range indices.Int32Values() {
151-
if a.IsNull(int(i)) {
152-
r.AppendNull()
167+
arr[idx] = r.NewArray()
168+
return nil
169+
case *array.FixedSizeBinary:
170+
r := array.NewDictionaryBuilderWithDict(
171+
compute.GetAllocator(ctx), a.DataType().(*arrow.DictionaryType), a.Dictionary(),
172+
).(*array.FixedSizeBinaryDictionaryBuilder)
173+
defer r.Release()
174+
175+
r.Reserve(indices.Len())
176+
idxBuilder := r.IndexBuilder()
177+
for _, i := range indices.Int32Values() {
178+
if a.IsNull(int(i)) {
179+
r.AppendNull()
180+
continue
181+
}
182+
// TODO: Improve this by not copying actual values.
183+
idxBuilder.Append(a.GetValueIndex(int(i)))
184+
}
185+
186+
arr[idx] = r.NewArray()
187+
return nil
188+
}
189+
190+
return nil
191+
}
192+
193+
func TakeRunEndEncodedColumn(ctx context.Context, a *array.RunEndEncoded, idx int, arr []arrow.Array, indices *array.Int32) error {
194+
expandedIndexBuilder := array.NewInt32Builder(compute.GetAllocator(ctx))
195+
defer expandedIndexBuilder.Release()
196+
197+
dict := a.Values().(*array.Dictionary)
198+
for i := 0; i < a.Len(); i++ {
199+
if dict.IsNull(a.GetPhysicalIndex(i)) {
200+
expandedIndexBuilder.AppendNull()
201+
} else {
202+
expandedIndexBuilder.Append(int32(dict.GetValueIndex(a.GetPhysicalIndex(i))))
203+
}
204+
}
205+
expandedIndex := expandedIndexBuilder.NewInt32Array()
206+
defer expandedIndex.Release()
207+
208+
expandedReorderedArr := make([]arrow.Array, 1)
209+
if err := TakeColumn(ctx, expandedIndex, 0, expandedReorderedArr, indices); err != nil {
210+
return err
211+
}
212+
expandedReordered := expandedReorderedArr[0].(*array.Int32)
213+
defer expandedReordered.Release()
214+
215+
b := array.NewRunEndEncodedBuilder(
216+
compute.GetAllocator(ctx), a.RunEndsArr().DataType(), a.Values().DataType(),
217+
)
218+
defer b.Release()
219+
b.Reserve(indices.Len())
220+
221+
dictValues := dict.Dictionary().(*array.String)
222+
for i := 0; i < expandedReordered.Len(); i++ {
223+
if expandedReordered.IsNull(i) {
224+
b.AppendNull()
153225
continue
154226
}
155-
idxBuilder.Append(a.GetValueIndex(int(i)))
227+
reorderedIndex := expandedReordered.Value(i)
228+
v := dictValues.Value(int(reorderedIndex))
229+
if err := b.AppendValueFromString(v); err != nil {
230+
return err
231+
}
156232
}
157233

158-
arr[idx] = r.NewArray()
234+
arr[idx] = b.NewRunEndEncodedArray()
159235
return nil
160236
}
161237

@@ -165,6 +241,7 @@ func TakeListColumn(ctx context.Context, a *array.List, idx int, arr []arrow.Arr
165241
if !ok {
166242
return fmt.Errorf("unexpected value builder type %T for list column", r.ValueBuilder())
167243
}
244+
defer valueBuilder.Release()
168245

169246
listValues := a.ListValues().(*array.Dictionary)
170247
switch dictV := listValues.Dictionary().(type) {
@@ -200,6 +277,54 @@ func TakeListColumn(ctx context.Context, a *array.List, idx int, arr []arrow.Arr
200277
return nil
201278
}
202279

280+
func TakeStructColumn(ctx context.Context, a *array.Struct, idx int, arr []arrow.Array, indices *array.Int32) error {
281+
aType := a.Data().DataType().(*arrow.StructType)
282+
283+
// Immediately, return this struct if it has no fields/columns
284+
if a.NumField() == 0 {
285+
// If the original record is released and this is released once more,
286+
// as usually done, we want to retain it once more.
287+
a.Retain()
288+
arr[idx] = a
289+
return nil
290+
}
291+
292+
cols := make([]arrow.Array, a.NumField())
293+
names := make([]string, a.NumField())
294+
defer func() {
295+
for _, col := range cols {
296+
if col != nil {
297+
col.Release()
298+
}
299+
}
300+
}()
301+
302+
for i := 0; i < a.NumField(); i++ {
303+
names[i] = aType.Field(i).Name
304+
305+
switch f := a.Field(i).(type) {
306+
case *array.RunEndEncoded:
307+
err := TakeRunEndEncodedColumn(ctx, f, i, cols, indices)
308+
if err != nil {
309+
return err
310+
}
311+
default:
312+
err := TakeColumn(ctx, f, i, cols, indices)
313+
if err != nil {
314+
return err
315+
}
316+
}
317+
}
318+
319+
takeStruct, err := array.NewStructArray(cols, names)
320+
if err != nil {
321+
return err
322+
}
323+
324+
arr[idx] = takeStruct
325+
return nil
326+
}
327+
203328
type multiColSorter struct {
204329
indices *builder.OptInt32Builder
205330
comparisons []comparator
@@ -263,13 +388,21 @@ func newMultiColSorter(
263388
},
264389
bytes.Compare,
265390
)
391+
case *array.FixedSizeBinary:
392+
ms.comparisons[i] = newOrderedSorter[[]byte](
393+
&fixedSizeBinaryDictionary{
394+
dict: e,
395+
elem: elem,
396+
},
397+
bytes.Compare,
398+
)
266399
default:
267400
ms.Release()
268-
return nil, fmt.Errorf("unsupported dictionary column type for sorting %T", e)
401+
return nil, fmt.Errorf("unsupported dictionary column type for sorting %T for column %s", e, r.Schema().Field(col.Index).Name)
269402
}
270403
default:
271404
ms.Release()
272-
return nil, fmt.Errorf("unsupported column type for sorting %T", e)
405+
return nil, fmt.Errorf("unsupported column type for sorting %T for column %s", e, r.Schema().Field(col.Index).Name)
273406
}
274407
}
275408
return ms, nil
@@ -417,3 +550,16 @@ func (s *binaryDictionary) IsNull(i int) bool {
417550
// Value returns the bytes of row i by first resolving the row's dictionary
// value index and then reading that entry from the dictionary's values.
func (s *binaryDictionary) Value(i int) []byte {
	valueIndex := s.dict.GetValueIndex(i)
	return s.elem.Value(valueIndex)
}
553+
554+
type fixedSizeBinaryDictionary struct {
555+
dict *array.Dictionary
556+
elem *array.FixedSizeBinary
557+
}
558+
559+
func (s *fixedSizeBinaryDictionary) IsNull(i int) bool {
560+
return s.dict.IsNull(i)
561+
}
562+
563+
func (s *fixedSizeBinaryDictionary) Value(i int) []byte {
564+
return s.elem.Value(s.dict.GetValueIndex(i))
565+
}

0 commit comments

Comments
 (0)