Skip to content

Commit 3bb2724

Browse files
committed
feat: add split_part, chr, and translate string functions
Add three new string manipulation functions to the extensions: - split_part: Split a string using a delimiter and return the nth substring (1-indexed). Returns empty string if field index exceeds available substrings. - chr: Convert an integer codepoint to its corresponding Unicode character. Behavior undefined for invalid Unicode scalar values. - translate: Character-by-character replacement similar to Unix tr command. Maps characters from 'from' string to corresponding positions in 'to' string, removing extra characters if 'to' is shorter.
1 parent 793c64b commit 3bb2724

File tree

6 files changed

+138
-14
lines changed

6 files changed

+138
-14
lines changed

extensions/functions_string.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,60 @@ scalar_functions:
15581558
dotall:
15591559
values: [ DOTALL_DISABLED, DOTALL_ENABLED ]
15601560
return: "List<string>"
1561+
-
1562+
name: split_part
1563+
description: >-
1564+
Split a string using a delimiter and return the `field`-th substring (starting at 1). If `field`
1565+
is larger than the number of substrings, an empty string is returned.
1566+
impls:
1567+
- args:
1568+
- value: "varchar<L1>"
1569+
name: "input"
1570+
- value: "varchar<L2>"
1571+
name: "delimiter"
1572+
- value: i32
1573+
name: "field"
1574+
return: "varchar<L1>"
1575+
- args:
1576+
- value: "string"
1577+
name: "input"
1578+
- value: "string"
1579+
name: "delimiter"
1580+
- value: i32
1581+
name: "field"
1582+
return: "string"
1583+
-
1584+
name: chr
1585+
description: >-
1586+
Return a single character whose codepoint is the specified integer. Behaviour is undefined if
1587+
the `codepoint` does not correspond to a valid Unicode scalar value.
1588+
impls:
1589+
- args:
1590+
- value: i64
1591+
name: "codepoint"
1592+
return: "string"
1593+
-
1594+
name: translate
1595+
description: >-
1596+
Replace each character of `input` that appears in `from` with the character at the same position in `to`.
1597+
If `to` is shorter than `from`, characters of `from` that have no corresponding character in `to` are removed from the result. Similar to the Unix `tr` command.
1598+
impls:
1599+
- args:
1600+
- value: "varchar<L1>"
1601+
name: "input"
1602+
- value: "varchar<L2>"
1603+
name: "from"
1604+
- value: "varchar<L3>"
1605+
name: "to"
1606+
return: "varchar<L1>"
1607+
- args:
1608+
- value: "string"
1609+
name: "input"
1610+
- value: "string"
1611+
name: "from"
1612+
- value: "string"
1613+
name: "to"
1614+
return: "string"
15611615

15621616
aggregate_functions:
15631617

tests/baseline.json

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
{
2-
"registry": {
3-
"dependency_count": 13,
4-
"extension_count": 13,
5-
"function_count": 165,
6-
"num_aggregate_functions": 29,
7-
"num_scalar_functions": 158,
8-
"num_window_functions": 11,
9-
"num_function_overloads": 517
10-
},
11-
"coverage": {
12-
"total_test_count": 1086,
13-
"num_function_variants": 517,
14-
"num_covered_function_variants": 229
15-
}
2+
"registry": {
3+
"dependency_count": 13,
4+
"extension_count": 13,
5+
"function_count": 165,
6+
"num_aggregate_functions": 29,
7+
"num_scalar_functions": 158,
8+
"num_window_functions": 11,
9+
"num_function_overloads": 517
10+
},
11+
"coverage": {
12+
"total_test_count": 1164,
13+
"num_function_variants": 532,
14+
"num_covered_function_variants": 242
15+
}
1616
}

tests/cases/string/chr.test

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
### SUBSTRAIT_SCALAR_TEST: v1.0
2+
### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
3+
4+
# basic: Basic examples without any special cases
5+
chr(65::i64) = 'A'::str
6+
chr(97::i64) = 'a'::str
7+
chr(48::i64) = '0'::str
8+
chr(8364::i64) = '€'::str
9+
chr(128512::i64) = '😀'::str
10+
11+
# null_input: Examples with null as input
12+
chr(null::i64) = null::str

tests/cases/string/split_part.test

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
### SUBSTRAIT_SCALAR_TEST: v1.0
2+
### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
3+
4+
# basic: Basic examples, no special cases
5+
split_part('abc,def,ghi'::str, ','::str, 1::i32) = 'abc'::str
6+
split_part('abc,def,ghi'::str, ','::str, 2::i32) = 'def'::str
7+
split_part('abc,def,ghi'::str, ','::str, 3::i32) = 'ghi'::str
8+
split_part('abc,def,ghi'::str, ','::str, 4::i32) = ''::str
9+
split_part('a|b|c|d'::str, '|'::str, 1::i32) = 'a'::str
10+
split_part('a|b|c|d'::str, '|'::str, 2::i32) = 'b'::str
11+
split_part('a|b|c|d'::str, '|'::str, 3::i32) = 'c'::str
12+
split_part('a|b|c|d'::str, '|'::str, 4::i32) = 'd'::str
13+
split_part('a|b|c|d'::str, '|'::str, 5::i32) = ''::str
14+
split_part('hello world test'::str, ' '::str, 1::i32) = 'hello'::str
15+
split_part('hello world test'::str, ' '::str, 2::i32) = 'world'::str
16+
split_part('hello world test'::str, ' '::str, 3::i32) = 'test'::str
17+
18+
# basic_delimiters: Basic examples without any special cases, multi-character delimiters
19+
split_part('abc~@~def~@~ghi'::str, '~@~'::str, 1::i32) = 'abc'::str
20+
split_part('abc~@~def~@~ghi'::str, '~@~'::str, 2::i32) = 'def'::str
21+
split_part('abc~@~def~@~ghi'::str, '~@~'::str, 3::i32) = 'ghi'::str
22+
split_part('abc~@~def~@~ghi'::str, '~@~'::str, 4::i32) = ''::str
23+
24+
# missing_delimiter: Examples where delimiter not present
25+
split_part('abc'::str, ','::str, 1::i32) = 'abc'::str
26+
split_part('abc'::str, ','::str, 2::i32) = ''::str
27+
28+
# null_input: Examples with null as input
29+
split_part(null::str, ','::str, 1::i32) = null::str
30+
split_part('abc,def'::str, null::str, 1::i32) = null::str
31+
split_part('abc,def'::str, ','::str, null::i32) = null::str

tests/cases/string/translate.test

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
### SUBSTRAIT_SCALAR_TEST: v1.0
2+
### SUBSTRAIT_INCLUDE: '/extensions/functions_string.yaml'
3+
4+
# basic: Basic examples without any special cases
5+
translate('banana'::str, 'an'::str, 'oy'::str) = 'boyoyo'::str
6+
translate('Hello World!'::str, ' !'::str, 'x'::str) = 'HelloxWorld'::str
7+
8+
# removal: Examples where replacement string shorter than source, resulting in removal
9+
translate('hello'::str, 'aeiou'::str, ''::str) = 'hll'::str
10+
translate('aabbcc'::str, 'abc'::str, 'a'::str) = 'aa'::str
11+
12+
# null_input: Examples with null as input
13+
translate(null::str, 'a'::str, 'b'::str) = null::str
14+
translate('hello'::str, null::str, 'b'::str) = null::str
15+
translate('hello'::str, 'l'::str, null::str) = null::str
16+
17+
# unicode: Examples with unicode characters
18+
translate('àéà'::str, 'à'::str, 'a'::str) = 'aéa'::str

tests/coverage/test_coverage.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,15 @@ def test_parse_errors_with_bad_aggregate_testcases(input_func_test, expected_mes
426426
"bitwise_and(-31766::dec<5, 0>, 900::dec<3, 0>) = 896::dec<5, 0>",
427427
"or(true::bool, true::bool) = true::bool",
428428
"between(5::i8, 0::i8, 127::i8) = true::bool",
429+
"split_part('a,b,c'::str, ','::str, 2::i32) = 'b'::str",
430+
"split_part('hello world'::varchar<20>, ' '::varchar<5>, 1::i32) = 'hello'::varchar<20>",
431+
"split_part('one|two|three'::string, '|'::string, 3::i32) = 'three'::string",
432+
"chr(65::i64) = 'A'::string",
433+
"chr(8364::i64) = '€'::string",
434+
"chr(128512::i64) = '😀'::string",
435+
"translate('hello'::str, 'el'::str, 'XY'::str) = 'hXYYo'::str",
436+
"translate('abcdef'::varchar<10>, 'ace'::varchar<5>, 'XYZ'::varchar<5>) = 'XbYdZf'::varchar<10>",
437+
"translate('test'::string, 'ts'::string, 'XY'::string) = 'XeYX'::string",
429438
],
430439
)
431440
def test_parse_various_scalar_func_argument_types(input_func_test):

0 commit comments

Comments
 (0)