Skip to content

Commit 2052478

Browse files
committed
Add isValidASCII function with tests
1 parent c8c7929 commit 2052478

File tree

7 files changed

+266
-13
lines changed

7 files changed

+266
-13
lines changed

ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
personal_ws-1.1 en 3286
1+
personal_ws-1.1 en 3452
22
AArch
33
ACLs
44
ALTERs
@@ -495,25 +495,25 @@ JSONEachRow
495495
JSONEachRowWithProgress
496496
JSONExtract
497497
JSONExtractArrayRaw
498-
JSONExtractBool
499-
JSONExtractFloat
500-
JSONExtractInt
501-
JSONExtractKeys
502-
JSONExtractKeysAndValues
503-
JSONExtractKeysAndValuesRaw
504-
JSONExtractRaw
505-
JSONExtractString
506-
JSONExtractUInt
507498
JSONExtractArrayRawCaseInsensitive
499+
JSONExtractBool
508500
JSONExtractBoolCaseInsensitive
509501
JSONExtractCaseInsensitive
502+
JSONExtractFloat
510503
JSONExtractFloatCaseInsensitive
504+
JSONExtractInt
511505
JSONExtractIntCaseInsensitive
506+
JSONExtractKeys
507+
JSONExtractKeysAndValues
512508
JSONExtractKeysAndValuesCaseInsensitive
509+
JSONExtractKeysAndValuesRaw
513510
JSONExtractKeysAndValuesRawCaseInsensitive
514511
JSONExtractKeysCaseInsensitive
512+
JSONExtractRaw
515513
JSONExtractRawCaseInsensitive
514+
JSONExtractString
516515
JSONExtractStringCaseInsensitive
516+
JSONExtractUInt
517517
JSONExtractUIntCaseInsensitive
518518
JSONHas
519519
JSONLength
@@ -1258,6 +1258,7 @@ YYYYMMDDToDate
12581258
YYYYMMDDhhmmssToDateTime
12591259
Yandex
12601260
Yasm
1261+
YTsaurus
12611262
ZCurve
12621263
ZSTDQAT
12631264
Zabbix
@@ -1953,6 +1954,7 @@ gRPC
19531954
gaugehistogram
19541955
gccMurmurHash
19551956
gcem
1957+
gdb's
19561958
generateRandom
19571959
generateRandomStructure
19581960
generateSerialID
@@ -2166,6 +2168,7 @@ isNotDistinctFrom
21662168
isNotNull
21672169
isNull
21682170
isNullable
2171+
isValidASCII
21692172
isValidJSON
21702173
isValidUTF
21712174
isZeroOrNull
@@ -2732,8 +2735,8 @@ quantiletdigest
27322735
quantiletdigestweighted
27332736
quantiletiming
27342737
quantiletimingweighted
2735-
quantized
27362738
quantizations
2739+
quantized
27372740
quartile
27382741
queryID
27392742
queryString
@@ -3354,10 +3357,10 @@ updateNode
33543357
uploader
33553358
uploaders
33563359
upperUTF
3357-
uptime
3358-
uptrace
33593360
upsert
33603361
upserts
3362+
uptime
3363+
uptrace
33613364
uring
33623365
url
33633366
urlCluster
@@ -3441,6 +3444,7 @@ xz
34413444
yaml
34423445
yandex
34433446
youtube
3447+
ytsaurus
34443448
zLib
34453449
zLinux
34463450
zabbix

src/Functions/isValidASCII.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include <DataTypes/DataTypeString.h>
2+
#include <DataTypes/DataTypesNumber.h>
3+
#include <Functions/FunctionFactory.h>
4+
#include <Functions/IFunction.h>
5+
#include <Functions/FunctionHelpers.h>
6+
#include <Columns/ColumnString.h>
7+
#include <Columns/ColumnFixedString.h>
8+
#include <Columns/ColumnArray.h>
9+
#include <Columns/ColumnsNumber.h>
10+
#include <Common/Exception.h>
11+
#include <base/types.h>
12+
13+
namespace DB
14+
{
15+
16+
namespace ErrorCodes
17+
{
18+
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
19+
}
20+
21+
namespace
22+
{
23+
24+
/// Scans the byte range [data, data + len) and reports whether it is pure 7-bit ASCII.
/// Returns 1 when every byte lies in 0x00..0x7F (an empty range counts as valid)
/// and 0 as soon as a byte with the high bit set is found.
UInt8 isValidASCII(const UInt8 * data, UInt64 len)
{
    for (const UInt8 * pos = data, * end = data + len; pos < end; ++pos)
    {
        /// For an unsigned byte, "high bit set" is the same condition as "> 0x7F".
        if (*pos & 0x80)
            return 0;
    }

    return 1;
}
37+
38+
}
39+
40+
class FunctionIsValidASCII : public IFunction
41+
{
42+
public:
43+
static constexpr auto name = "isValidASCII";
44+
45+
static FunctionPtr create(ContextPtr)
46+
{
47+
return std::make_shared<DB::FunctionIsValidASCII>();
48+
}
49+
50+
String getName() const override
51+
{
52+
return name;
53+
}
54+
55+
size_t getNumberOfArguments() const override
56+
{
57+
return 1;
58+
}
59+
60+
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
61+
{
62+
return std::make_shared<DataTypeUInt8>();
63+
}
64+
65+
bool useDefaultImplementationForConstants() const override
66+
{
67+
return true;
68+
}
69+
70+
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
71+
{
72+
const auto & column = arguments[0];
73+
74+
if (const auto * col_str = checkAndGetColumn<ColumnString>(column.column.get()))
75+
{
76+
return executeString(col_str, input_rows_count);
77+
}
78+
else if (const auto * col_fixed_str = checkAndGetColumn<ColumnFixedString>(column.column.get()))
79+
{
80+
return executeFixedString(col_fixed_str, input_rows_count);
81+
}
82+
else
83+
{
84+
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
85+
"Cannot apply function {} to column {}, expected String or FixedString",
86+
getName(), column.column->getName());
87+
}
88+
}
89+
90+
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
91+
{
92+
return true;
93+
}
94+
95+
private:
96+
ColumnPtr executeString(const ColumnString * col_str, size_t input_rows_count) const
97+
{
98+
auto result = ColumnUInt8::create(input_rows_count);
99+
auto & result_data = result->getData();
100+
101+
const auto & chars = col_str->getChars();
102+
const auto & offsets = col_str->getOffsets();
103+
104+
size_t prev_offset = 0;
105+
for (size_t i = 0; i < input_rows_count; ++i)
106+
{
107+
size_t current_offset = offsets[i];
108+
size_t string_size = current_offset - prev_offset;
109+
result_data[i] = isValidASCII(chars.data() + prev_offset, string_size);
110+
prev_offset = current_offset;
111+
}
112+
113+
return result;
114+
}
115+
116+
ColumnPtr executeFixedString(const ColumnFixedString * col_fixed_str, size_t input_rows_count) const
117+
{
118+
auto result = ColumnUInt8::create(input_rows_count);
119+
auto & result_data = result->getData();
120+
121+
const auto & chars = col_fixed_str->getChars();
122+
size_t string_size = col_fixed_str->getN();
123+
124+
for (size_t i = 0; i < input_rows_count; ++i)
125+
{
126+
result_data[i] = isValidASCII(chars.data() + i * string_size, string_size);
127+
}
128+
129+
return result;
130+
}
131+
};
132+
133+
}
134+
135+
/// Registers isValidASCII together with its documentation in the function factory
/// and adds the case-sensitive alias isASCII.
REGISTER_FUNCTION(IsValidASCII)
{
    factory.registerFunction<DB::FunctionIsValidASCII>(DB::FunctionDocumentation{
        .description = R"(Returns 1 if the input String or FixedString contains only ASCII bytes (0x00–0x7F), otherwise 0.)",
        .examples = {{"isValidASCII", "SELECT isValidASCII('hello') AS is_ascii, isValidASCII('你好') AS is_not_ascii", ""}},
        .introduced_in = {25, 8},
        /// The function inspects the bytes of a string, so it is documented with the
        /// string functions rather than with the conditional ones.
        .category = DB::FunctionDocumentation::Category::String,
    });
    factory.registerAlias("isASCII", "isValidASCII", DB::FunctionFactory::Case::Sensitive);
}

tests/performance/isValidASCII.xml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
<?xml version="1.0"?>
<!-- Performance test for isValidASCII over String columns of different shapes. -->
<clickhouse>
    <!-- Table setup and teardown use the dedicated create/fill/drop tags so that they are
         treated as preparation steps and not as measured benchmark queries. -->
    <create_query>CREATE TABLE IF NOT EXISTS test_ascii_data (
        id UInt32,
        ascii_string String,
        mixed_string String,
        long_string String,
        empty_string String
    ) ENGINE = Memory</create_query>

    <!-- mixed_string ends with the non-ASCII byte 0x80, so the whole value is scanned
         before it is rejected; the other columns are pure ASCII or empty. -->
    <fill_query>INSERT INTO test_ascii_data SELECT
        number as id,
        repeat('A', 100) as ascii_string,
        concat(repeat('Hello World!', 50), '\x80') as mixed_string,
        repeat('Pure ASCII 123 !@#', 30) as long_string,
        '' as empty_string
    FROM numbers(1000000)</fill_query>

    <query>SELECT isValidASCII(ascii_string) FROM test_ascii_data FORMAT Null</query>
    <query>SELECT isValidASCII(mixed_string) FROM test_ascii_data FORMAT Null</query>
    <query>SELECT isValidASCII(long_string) FROM test_ascii_data FORMAT Null</query>
    <query>SELECT isValidASCII(empty_string) FROM test_ascii_data FORMAT Null</query>

    <!-- materialize() turns the literal into a full column so the function is evaluated
         per row instead of once on a constant. -->
    <query>SELECT isValidASCII(materialize('Mixed\x80Content')) FROM test_ascii_data LIMIT 1000000 FORMAT Null</query>

    <drop_query>DROP TABLE IF EXISTS test_ascii_data</drop_query>
</clickhouse>
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
1
2+
1
3+
1
4+
1
5+
1
6+
1
7+
1
8+
1
9+
1
10+
1
11+
1
12+
1
13+
1
14+
1
15+
1
16+
1
17+
1
18+
1
19+
1
20+
1
21+
1
22+
1
23+
1
24+
1
25+
1
26+
1
27+
1
28+
1
29+
1
30+
1
31+
1
32+
1
33+
1
34+
1
35+
1
36+
1
37+
1
38+
1
39+
1
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
-- Each comparison below evaluates to 1 when isValidASCII returns the expected value.
-- Group 1: constant String arguments, including the empty string, the boundary bytes
-- 0x00 and 0x7F, and multi-byte UTF-8 text whose bytes are all above 0x7F.
SELECT 1 = isValidASCII('') FROM system.numbers LIMIT 1;
2+
SELECT 1 = isValidASCII('some text') FROM system.numbers LIMIT 1;
3+
SELECT 1 = isValidASCII('\x00') FROM system.numbers LIMIT 1;
4+
SELECT 1 = isValidASCII('\x66') FROM system.numbers LIMIT 1;
5+
SELECT 1 = isValidASCII('\x7F') FROM system.numbers LIMIT 1;
6+
SELECT 1 = isValidASCII('\x00\x7F') FROM system.numbers LIMIT 1;
7+
SELECT 1 = isValidASCII('\x7F\x00') FROM system.numbers LIMIT 1;
8+
SELECT 0 = isValidASCII('какой-то текст') FROM system.numbers LIMIT 1;
9+
SELECT 0 = isValidASCII('\xC2\x80') FROM system.numbers LIMIT 1;
10+
SELECT 1 = isValidASCII('hello world!') FROM system.numbers LIMIT 1;
11+
12+
-- Group 2: the same inputs through the isASCII alias, which must behave identically.
SELECT 1 = isASCII('') FROM system.numbers LIMIT 1;
13+
SELECT 1 = isASCII('some text') FROM system.numbers LIMIT 1;
14+
SELECT 1 = isASCII('\x00') FROM system.numbers LIMIT 1;
15+
SELECT 1 = isASCII('\x66') FROM system.numbers LIMIT 1;
16+
SELECT 1 = isASCII('\x7F') FROM system.numbers LIMIT 1;
17+
SELECT 1 = isASCII('\x00\x7F') FROM system.numbers LIMIT 1;
18+
SELECT 1 = isASCII('\x7F\x00') FROM system.numbers LIMIT 1;
19+
SELECT 0 = isASCII('какой-то текст') FROM system.numbers LIMIT 1;
20+
SELECT 0 = isASCII('\xC2\x80') FROM system.numbers LIMIT 1;
21+
SELECT 1 = isASCII('hello world!') FROM system.numbers LIMIT 1;
22+
23+
-- Group 3: non-constant input. The decimal text of the numbers 0 through 9 is ASCII,
-- so every selected row yields 1.
SELECT isValidASCII(toString(number)) FROM system.numbers WHERE number < 10;
24+
25+
-- Group 4: single-byte boundaries. 0x00 and 0x7F are accepted, 0x80 and 0xFF are rejected.
SELECT 1 = isValidASCII('\x00') FROM system.numbers LIMIT 1;
26+
SELECT 1 = isValidASCII('\x7F') FROM system.numbers LIMIT 1;
27+
SELECT 0 = isValidASCII('\x80') FROM system.numbers LIMIT 1;
28+
SELECT 0 = isValidASCII('\xFF') FROM system.numbers LIMIT 1;
29+
30+
-- Group 5: a non-ASCII byte embedded in otherwise ASCII text makes the whole value invalid.
SELECT 0 = isValidASCII('Hello\x80World') FROM system.numbers LIMIT 1;
31+
SELECT 0 = isValidASCII('ASCII\xC2\x80Text') FROM system.numbers LIMIT 1;
32+
SELECT 1 = isValidASCII('Pure ASCII 123 !@#') FROM system.numbers LIMIT 1;
33+
34+
-- Group 6: FixedString arguments, one pure ASCII and one ending with the byte 0x80.
SELECT 1 = isValidASCII(toFixedString('ASCII', 5)) FROM system.numbers LIMIT 1;
35+
SELECT 0 = isValidASCII(toFixedString('ASCII\x80', 6)) FROM system.numbers LIMIT 1;

tests/queries/0_stateless/03594_is_valid_ascii_errors.reference

Whitespace-only changes.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Arguments that are neither String nor FixedString (an array, a UUID, an IPv6 and an
-- IPv4 value) must each fail with the server error ILLEGAL_TYPE_OF_ARGUMENT.
SELECT isValidASCII([1, 2, 3]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
2+
SELECT isValidASCII(toUUID('00000000-0000-0000-0000-000000000000')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
3+
SELECT isValidASCII(toIPv6('::1')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
4+
SELECT isValidASCII(toIPv4('127.0.0.1')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }

0 commit comments

Comments
 (0)