Skip to content

Commit 90b6440

Browse files
committed
{171428666}: column character encoding
This patch introduces a new `ENCODING` keyword that specifies the character set of a column. Data that is not valid in the column's encoding is rejected. So far only `"utf8"` and `NONE` are supported. This allows users to conveniently create an indexable UTF-8 cstring column. It is implemented as a check constraint using the `utf8_validate()` function. Signed-off-by: Rivers Zhang <[email protected]>
1 parent 2dc04c8 commit 90b6440

17 files changed

+182
-21
lines changed

Diff for: db/types.c

+19-13
Original file line numberDiff line numberDiff line change
@@ -3695,6 +3695,21 @@ TYPES_INLINE int CLIENT_BLOB_to_CLIENT_PSTR2(
36953695
return -1;
36963696
}
36973697

3698+
/**
 * Checks that the `max`-byte buffer `u` holds a valid UTF-8 string that is
 * NUL-terminated inside the buffer and followed by nothing but NUL padding.
 *
 * utf8_validate() stops at the first NUL byte and reports in `valid_len` how
 * far it got, so every byte from there up to and including the last byte of
 * the buffer must be NUL.
 *
 * @param u    buffer to validate
 * @param max  number of bytes in `u`, including the terminator and padding
 * @return 0 if the buffer is acceptable, -1 otherwise
 */
static int utf8_validate_permitting_trailing_zeros(const char *u, int max)
{
    int valid_len;

    if (utf8_validate(u, max, &valid_len) != 0)
        return -1;

    /* The callers this helper replaced required the terminator to sit inside
     * the buffer (valid_len == len - 1). Keep rejecting input that runs to
     * the end of the buffer without one; this presumes utf8_validate()
     * reports valid_len == max in that case. */
    if (valid_len >= max)
        return -1;

    /* Permit trailing zeros, but inspect every remaining byte, including the
     * final one at u[max - 1], so that garbage after the first NUL is not
     * silently accepted. */
    for (; valid_len < max; ++valid_len) {
        if (u[valid_len] != '\0')
            return -1;
    }
    return 0;
}
3712+
36983713
/**
36993714
* Finds out where the input vutf8 string is stored and then determines where it
37003715
* should be copied and copies it. Doesn't deal with NULLs.
@@ -3717,7 +3732,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
37173732
blob_buffer_t *inblob,
37183733
blob_buffer_t *outblob, int *outdtsz)
37193734
{
3720-
int valid_len;
37213735
if (out_len > 0)
37223736
memset(out, 0, out_len);
37233737

@@ -3742,10 +3756,8 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
37423756
/* validate input blob */
37433757
assert(inblob->length == len);
37443758

3745-
if (utf8_validate(inblob->data, inblob->length, &valid_len) ||
3746-
valid_len != len - 1) {
3759+
if (utf8_validate_permitting_trailing_zeros(inblob->data, inblob->length))
37473760
return -1;
3748-
}
37493761

37503762
memcpy(outblob, inblob, sizeof(blob_buffer_t));
37513763
bzero(inblob, sizeof(blob_buffer_t));
@@ -3767,8 +3779,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
37673779

37683780
/* if the string isn't empty, validate the string and make sure its
37693781
* length matches len (minus 1 for the NUL byte) */
3770-
if (len > 0 &&
3771-
(utf8_validate(in, len, &valid_len) || valid_len != len - 1))
3782+
if (len > 0 && utf8_validate_permitting_trailing_zeros(in, len))
37723783
return -1;
37733784

37743785
memcpy(out, in, len);
@@ -3785,7 +3796,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
37853796
* fit in the out buffer, then the string needs to be copied from the in
37863797
* buffer to a new out blob */
37873798
else if (len <= in_len) {
3788-
int valid_len;
37893799

37903800
if (outblob) {
37913801
if (len > gbl_blob_sz_thresh_bytes)
@@ -3800,8 +3810,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
38003810

38013811
/* if the string isn't empty, validate the string and make sure its
38023812
* length matches len (minus 1 for the NUL byte) */
3803-
if (len > 0 &&
3804-
(utf8_validate(in, len, &valid_len) || valid_len != len - 1))
3813+
if (len > 0 && utf8_validate_permitting_trailing_zeros(in, len))
38053814
return -1;
38063815

38073816
memcpy(outblob->data, in, len);
@@ -3821,8 +3830,6 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
38213830
* blob to the out buffer */
38223831
else /* len <= out_len */
38233832
{
3824-
int valid_len;
3825-
38263833
/* Do not attempt to convert a blob placeholder (i.e., length == -2) */
38273834
if (inblob && inblob->length != OSQL_BLOB_FILLER_LENGTH) {
38283835
if (!inblob->exists || !inblob->data) {
@@ -3832,8 +3839,7 @@ static TYPES_INLINE int vutf8_convert(int len, const void *in, int in_len,
38323839

38333840
/* if the string isn't empty, validate the string and make sure its
38343841
* length matches len (minus 1 for the NUL byte) */
3835-
if (len > 0 && (utf8_validate(inblob->data, len, &valid_len) ||
3836-
valid_len != len - 1))
3842+
if (len > 0 && utf8_validate_permitting_trailing_zeros(inblob->data, len))
38373843
return -1;
38383844

38393845
memcpy(out, inblob->data, len);

Diff for: docs/images/alter-table-ddl.gif

2.84 KB
Loading

Diff for: docs/images/column-constraint.gif

1.21 KB
Loading

Diff for: docs/src/sqlitegen/bubble-generator-data.tcl

+12
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,12 @@ stack
724724
}
725725
}
726726
{line OPTION DBPAD = signed-number }
727+
{line ENCODING
728+
{or
729+
{line /string-literal}
730+
{line NONE}
731+
}
732+
}
727733
}
728734

729735
table-constraint {
@@ -828,6 +834,12 @@ stack
828834
}
829835
NOT NULL
830836
}
837+
{line ENCODING
838+
{or
839+
{line /string-literal}
840+
{line NONE}
841+
}
842+
}
831843
}
832844
}
833845
{line OPTIONS ( table-options ) }

Diff for: schemachange/sc_records.c

+5-2
Original file line numberDiff line numberDiff line change
@@ -545,12 +545,12 @@ static int prepare_and_verify_newdb_record(struct convert_record_data *data,
545545
if (rc < 0) {
546546
logmsg(LOGMSG_DEBUG, "%s:%d internal error during CHECK constraint\n",
547547
__func__, __LINE__);
548-
return ERR_CONSTR;
548+
return ERR_CHECK_CONSTRAINT;
549549
} else if (rc > 0) {
550550
logmsg(LOGMSG_DEBUG, "%s:%d CHECK constraint failed for '%s'\n",
551551
__func__, __LINE__,
552552
data->iq.usedb->check_constraints[rc - 1].consname);
553-
return ERR_CONSTR;
553+
return ERR_CHECK_CONSTRAINT;
554554
}
555555

556556
rc = verify_record_constraint(&data->iq, data->to, data->trans, p_buf_data,
@@ -1123,6 +1123,9 @@ static int convert_record(struct convert_record_data *data)
11231123
} else if (rc == ERR_VERIFY_PI) {
11241124
sc_client_error(data->s, "Error verifying partial indexes! rrn %d genid 0x%llx", rrn, genid);
11251125
return -2;
1126+
} else if (rc == ERR_CHECK_CONSTRAINT) {
1127+
sc_client_error(data->s, "Record violates check constraints rrn %d genid 0x%llx", rrn, genid);
1128+
return -2;
11261129
} else if (rc != 0) {
11271130
sc_client_error(data->s,
11281131
"Error adding record rcode %d opfailcode %d ixfailnum %d rrn %d genid 0x%llx, stripe %d", rc,

Diff for: sqlite/src/comdb2build.c

+66-4
Original file line numberDiff line numberDiff line change
@@ -6406,7 +6406,7 @@ void comdb2DeferForeignKey(Parse *pParse, int isDeferred)
64066406
return;
64076407
}
64086408

6409-
static void drop_constraint(Parse *pParse, Token *pName, int type)
6409+
static void drop_constraint(Parse *pParse, Token *pName, int type, int hush)
64106410
{
64116411
if (comdb2IsPrepareOnly(pParse))
64126412
return;
@@ -6433,7 +6433,7 @@ static void drop_constraint(Parse *pParse, Token *pName, int type)
64336433
if (cons) {
64346434
/* Mark it as dropped. */
64356435
cons->flags |= CONS_DELETED;
6436-
} else {
6436+
} else if (!hush) {
64376437
pParse->rc = SQLITE_ERROR;
64386438
sqlite3ErrorMsg(pParse, "Constraint '%s' not found.", name);
64396439
goto cleanup;
@@ -6454,15 +6454,15 @@ void comdb2DropForeignKey(Parse *pParse, /* Parser context */
64546454
Token *pName /* Foreign key name */
64556455
)
64566456
{
6457-
drop_constraint(pParse, pName, CONS_FKEY);
6457+
drop_constraint(pParse, pName, CONS_FKEY, 0);
64586458
return;
64596459
}
64606460

64616461
void comdb2DropConstraint(Parse *pParse, /* Parser context */
64626462
Token *pName /* Foreign key name */
64636463
)
64646464
{
6465-
drop_constraint(pParse, pName, CONS_ALL);
6465+
drop_constraint(pParse, pName, CONS_ALL, 0);
64666466
return;
64676467
}
64686468

@@ -7690,3 +7690,65 @@ void create_default_consumer_sp(Parse *p, char *spname)
76907690
comdb2prepareNoRows(v, p, 0, sc, &comdb2SqlSchemaChange, (vdbeFuncArgFree)&free_schema_change_type);
76917691

76927692
}
7693+
7694+
/**
 * Handles the `ENCODING <string>` / `ENCODING NONE` column clause.
 *
 * A character set is enforced through a generated CHECK constraint named
 * "$" GEN_CONS_PREFIX "_CHAR_ENC_<column>" whose expression is
 * `utf8_validate(<column>)=0`. `ENCODING NONE` drops that constraint if it
 * exists (a missing constraint is not an error).
 *
 * @param pParse  parser context
 * @param t       charset name token, or NULL for `ENCODING NONE`
 * @param alter   non-zero when invoked from ALTER COLUMN (operates on
 *                ctx->alter_column); zero when invoked while defining a
 *                column (operates on the last column of the schema)
 */
void comdb2ChangeCharacterSet(Parse *pParse, Token *t, int alter)
{
    struct comdb2_ddl_context *ctx;
    struct comdb2_column *column;
    sqlite3 *db = pParse->db;

    char *charset = NULL;
    char expr[MAXCOLNAME + sizeof("utf8_validate()=0")];
    char constraint_name[MAXCOLNAME + sizeof("$" GEN_CONS_PREFIX "_CHAR_ENC_")];
    int nw = 0;

    Token colToken;
    Token funcToken;
    ExprList *arg;
    Expr *func;
    Expr *zero;
    Expr *equality;

    if (t != NULL) {
        charset = sqlite3NameFromToken(db, t);
        if (charset == NULL)
            return;

        /* so far only utf8 is supported */
        if (strcasecmp(charset, "utf8") != 0 && strcasecmp(charset, "utf-8") != 0) {
            setError(pParse, SQLITE_MISUSE, "unknown charset");
            goto out;
        }
    }

    /* Without a DDL context there is nothing to attach the constraint to. */
    ctx = pParse->comdb2_ddl_ctx;
    if (ctx == NULL)
        goto out;

    if (alter)
        column = ctx->alter_column;
    else
        column = (struct comdb2_column *)LISTC_BOT(&ctx->schema->column_list);

    if (column == NULL) {
        /* Keep an earlier diagnostic if one was already recorded. */
        if (pParse->rc == 0)
            setError(pParse, SQLITE_MISUSE, "no column to apply character encoding to");
        goto out;
    }

    if (column->type != SQL_TYPE_CSTRING && column->type != SQL_TYPE_VARCHAR && column->type != SQL_TYPE_CHAR) {
        setError(pParse, SQLITE_MISUSE, "invalid column type to use character encoding");
        goto out;
    }

    /* Format both strings up front so that a truncated name is reported
     * before any expression nodes are allocated. */
    nw = snprintf(constraint_name, sizeof(constraint_name), "$" GEN_CONS_PREFIX "_CHAR_ENC_%s", column->name);
    if (nw < 0 || nw >= (int)sizeof(constraint_name)) {
        setError(pParse, SQLITE_MISUSE, "column name is too long for a character encoding constraint");
        goto out;
    }

    if (t != NULL) {
        nw = snprintf(expr, sizeof(expr), "utf8_validate(%s)=0", column->name);
        if (nw < 0 || nw >= (int)sizeof(expr)) {
            /* expr + nw + 1 below would point outside the buffer. */
            setError(pParse, SQLITE_MISUSE, "column name is too long for a character encoding constraint");
            goto out;
        }
    }

    sqlite3TokenInit(&pParse->constraintName, constraint_name);

    if (t == NULL) {
        drop_constraint(pParse, &pParse->constraintName, CONS_CHECK, 1);
    } else {
        sqlite3TokenInit(&colToken, column->name);
        sqlite3TokenInit(&funcToken, "utf8_validate");

        arg = sqlite3ExprListAppend(pParse, NULL, sqlite3ExprAlloc(db, TK_ID, &colToken, 0));
        func = sqlite3ExprFunction(pParse, arg, &funcToken, 0);
        zero = sqlite3ExprAlloc(db, TK_INTEGER, &sqlite3IntTokens[0], 0);

        equality = sqlite3PExpr(pParse, TK_EQ, func, zero);
        comdb2AddCheckConstraint(pParse, equality, expr, expr + nw + 1);
    }

    /* constraint_name lives on this stack frame: do not leave the parser's
     * pending constraint name pointing at it once we return. This presumes
     * the callees above consume the name during the call. */
    pParse->constraintName.z = 0;
    pParse->constraintName.n = 0;

out:
    sqlite3DbFree(db, charset);
}

Diff for: sqlite/src/comdb2build.h

+1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ void comdb2AddIndex(Parse *, Token *, ExprList *, int, Expr *, const char *,
8080
const char *, int, u8, int, ExprList *);
8181
void comdb2AddDbpad(Parse *, int);
8282
void comdb2AddCheckConstraint(Parse *, Expr *, const char *, const char *);
83+
void comdb2ChangeCharacterSet(Parse *pParse, Token *, int);
8384
void comdb2CreateIndex(Parse *, Token *, Token *, SrcList *, ExprList *, int,
8485
Token *, Expr *, const char *, const char *, int, int,
8586
u8, int, ExprList *, int);

Diff for: sqlite/src/func.c

+31
Original file line numberDiff line numberDiff line change
@@ -1418,6 +1418,36 @@ static void uncompressGzipFunc(
14181418
return;
14191419
}
14201420

1421+
/* Return 0 if payload is utf8. Return (-N - 1), where N is the index
1422+
* of the first malformed character */
1423+
int utf8_validate(const char *str, int len, int *valid_len);
1424+
static void comdb2Utf8ValidateFunc(
1425+
sqlite3_context *context,
1426+
int argc,
1427+
sqlite3_value **argv
1428+
){
1429+
int valid_len, rc, len;
1430+
const char *z;
1431+
assert(argc == 1);
1432+
UNUSED_PARAMETER(argc);
1433+
1434+
switch( sqlite3_value_type(argv[0]) ){
1435+
case SQLITE_BLOB:
1436+
len = sqlite3_value_bytes(argv[0]);
1437+
z = sqlite3_value_blob(argv[0]);
1438+
rc = utf8_validate(z, len, &valid_len);
1439+
break;
1440+
case SQLITE_TEXT:
1441+
len = sqlite3_value_bytes(argv[0]) + 1; /* +1 for \0 */
1442+
z = (const char *)sqlite3_value_text(argv[0]);
1443+
rc = utf8_validate(z, len, &valid_len);
1444+
break;
1445+
default:
1446+
rc = -1;
1447+
break;
1448+
}
1449+
sqlite3_result_int(context, rc == 0 ? rc : (-valid_len - 1));
1450+
}
14211451
#endif /* defined(SQLITE_BUILDING_FOR_COMDB2) */
14221452

14231453
/*
@@ -3093,6 +3123,7 @@ void sqlite3RegisterBuiltinFunctions(void){
30933123
FUNCTION(comdb2_starttime, 0, 0, 0, comdb2StartTimeFunc),
30943124
FUNCTION(comdb2_user, 0, 0, 0, comdb2UserFunc),
30953125
FUNCTION(comdb2_last_cost, 0, 0, 0, comdb2LastCostFunc),
3126+
FUNCTION(utf8_validate, 1, 0, 0, comdb2Utf8ValidateFunc),
30963127
FUNCTION(checksum_md5, 1, 0, 0, md5Func),
30973128
FUNCTION(compress, 1, 0, 0, compressFunc),
30983129
FUNCTION(uncompress, 1, 0, 0, uncompressFunc),

Diff for: sqlite/src/parse.y

+8
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,8 @@ ccons ::= PRIMARY KEY sortorder(Z) onconf(R) autoinc(I).
542542
{sqlite3AddPrimaryKey(pParse,0,R,I,Z);}
543543
%endif !SQLITE_BUILDING_FOR_COMDB2
544544
%ifdef SQLITE_BUILDING_FOR_COMDB2
545+
ccons ::= ENCODING STRING(H). {comdb2ChangeCharacterSet(pParse,&H,0);}
546+
ccons ::= ENCODING NONE. {comdb2ChangeCharacterSet(pParse,NULL,0);}
545547
ccons ::= UNIQUE onconf(R). {
546548
comdb2AddIndex(pParse, 0, 0, R, 0, 0, 0, SQLITE_SO_ASC,
547549
SQLITE_IDXTYPE_UNIQUE, 0, 0);
@@ -2037,6 +2039,12 @@ alter_table_alter_column_cmd ::= SET NOT NULL. {
20372039
alter_table_alter_column_cmd ::= DROP NOT NULL. {
20382040
comdb2AlterColumnDropNotNull(pParse);
20392041
}
2042+
alter_table_alter_column_cmd ::= ENCODING STRING(H). {
2043+
comdb2ChangeCharacterSet(pParse,&H,1);
2044+
}
2045+
alter_table_alter_column_cmd ::= ENCODING NONE. {
2046+
comdb2ChangeCharacterSet(pParse,NULL,1);
2047+
}
20402048
alter_table_alter_column ::= alter_table_alter_column_start
20412049
alter_table_alter_column_cmd. {
20422050
comdb2AlterColumnEnd(pParse);

Diff for: sqlite/tool/mkkeywordhash.c

+1
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ static Keyword aKeywordTable[] = {
211211
{ "DISTINCT", "TK_DISTINCT", ALWAYS },
212212
{ "DO", "TK_DO", UPSERT },
213213
{ "DROP", "TK_DROP", ALWAYS },
214+
{ "ENCODING", "TK_ENCODING", ALWAYS },
214215
{ "END", "TK_END", ALWAYS },
215216
{ "EACH", "TK_EACH", TRIGGER },
216217
{ "ELSE", "TK_ELSE", ALWAYS },

Diff for: tests/auth.test/t09.expected

+2
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
(candidate='EACH')
6666
(candidate='ELSE')
6767
(candidate='ENABLE')
68+
(candidate='ENCODING')
6869
(candidate='END')
6970
(candidate='ESCAPE')
7071
(candidate='EXCEPT')
@@ -377,6 +378,7 @@
377378
(candidate='unlikely()')
378379
(candidate='upper()')
379380
(candidate='usleep()')
381+
(candidate='utf8_validate()')
380382
(candidate='zeroblob()')
381383
(username='user1')
382384
(username='user2')

Diff for: tests/comdb2sys.test/comdb2sys.expected

+2-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@
8383
[select * from comdb2_tablesizes order by tablename] rc 0
8484
(KEYWORDS_COUNT=223)
8585
[SELECT COUNT(*) AS KEYWORDS_COUNT FROM comdb2_keywords] rc 0
86-
(RESERVED_KW=66)
86+
(RESERVED_KW=67)
8787
[SELECT COUNT(*) AS RESERVED_KW FROM comdb2_keywords WHERE reserved = 'Y'] rc 0
8888
(NONRESERVED_KW=157)
8989
[SELECT COUNT(*) AS NONRESERVED_KW FROM comdb2_keywords WHERE reserved = 'N'] rc 0
@@ -104,6 +104,7 @@
104104
(name='DISTINCT', reserved='Y')
105105
(name='DROP', reserved='Y')
106106
(name='ELSE', reserved='Y')
107+
(name='ENCODING', reserved='Y')
107108
(name='ESCAPE', reserved='Y')
108109
(name='EXCEPT', reserved='Y')
109110
(name='EXISTS', reserved='Y')

Diff for: tests/ddl_no_csc2.test/t09_check.expected

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
')
1212
(rows inserted=1)
1313
(rows inserted=1)
14-
[ALTER TABLE t1 ADD CONSTRAINT valid_colors CHECK (color IN ('red', 'green', 'blue'))] failed with rc 240 Record violates foreign constraints rrn xx genid xx
14+
[ALTER TABLE t1 ADD CONSTRAINT valid_colors CHECK (color IN ('red', 'green', 'blue'))] failed with rc 240 Record violates check constraints rrn xx genid xx
1515
(csc2='schema
1616
{
1717
cstring color[11] null = yes

Diff for: tests/ddl_no_csc2.test/t15_encoding.expected

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[CREATE TABLE t15(a INTEGER ENCODING 'ascii')] failed with rc -3 unknown charset
2+
[CREATE TABLE t15(a INTEGER ENCODING 'utf8')] failed with rc -3 invalid column type to use character encoding
3+
[CREATE TABLE t15(a TEXT ENCODING 'utf8')] failed with rc -3 invalid column type to use character encoding
4+
(csc2='schema
5+
{
6+
cstring a[11] null = yes
7+
}
8+
constraints
9+
{
10+
check "$CONSTRAINT_CHAR_ENC_a" = {where utf8_validate(a)=0}
11+
}
12+
')
13+
[INSERT INTO t15 VALUES (CAST(x'616263FF616263' AS TEXT))] failed with rc 403 CHECK constraint violation CHECK constraint failed for '$CONSTRAINT_CHAR_ENC_a' unable to add record rc = 320
14+
(COUNT(*)=0)
15+
(rows inserted=1)
16+
(COUNT(*)=1)
17+
[ALTER TABLE t15 ALTER COLUMN a ENCODING 'utf8'] failed with rc 240 Record violates check constraints rrn xx genid xx

Diff for: tests/ddl_no_csc2.test/t15_encoding.sql

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
DROP TABLE IF EXISTS t15
2+
CREATE TABLE t15(a INTEGER ENCODING 'ascii')$$
3+
CREATE TABLE t15(a INTEGER ENCODING 'utf8')$$
4+
CREATE TABLE t15(a TEXT ENCODING 'utf8')$$
5+
CREATE TABLE t15(a CHAR(10) ENCODING 'utf8')$$
6+
SELECT csc2 FROM sqlite_master WHERE name='t15'
7+
INSERT INTO t15 VALUES (CAST(x'616263FF616263' AS TEXT))
8+
SELECT COUNT(*) FROM t15
9+
ALTER TABLE t15 ALTER COLUMN a ENCODING NONE$$
10+
INSERT INTO t15 VALUES (CAST(x'616263FF616263' AS TEXT))
11+
SELECT COUNT(*) FROM t15
12+
ALTER TABLE t15 ALTER COLUMN a ENCODING 'utf8'$$
13+
DROP TABLE t15

0 commit comments

Comments
 (0)