Skip to content

Commit 2f8254f

Browse files
Krishna Paifacebook-github-bot
authored andcommitted
feat: Fix casting of invalid row positions to Json (facebookincubator#15828)
Summary: When casting nested ROW types to JSON, castToJsonFromRow would process all child elements including those at null positions. Since Velox vectors can have invalid data at null positions (e.g., Timestamp with nanos > 1 billion), this could cause crashes during JSON serialization. The fix is to filter out null rows from each child's SelectivityVector before creating AsJson. When all rows are null for a child, store std::nullopt and output "null" directly during rendering. This prevents accessing potentially invalid underlying data at null positions. Reviewed By: Yuhta Differential Revision: D89577815
1 parent a3c52a5 commit 2f8254f

File tree

2 files changed

+194
-20
lines changed

2 files changed

+194
-20
lines changed

velox/functions/prestosql/tests/JsonCastTest.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1837,3 +1837,130 @@ TEST_F(JsonCastTest, uniqueErrorContextMessage) {
18371837
makeRowVector({makeNullableFlatVector<JsonNativeType>({"\"abc\""_sv})}),
18381838
"Top-level Expression: cast((c0) as BIGINT)");
18391839
}
1840+
1841+
TEST_F(JsonCastTest, fromNestedRowWithNullInnerRowFieldNames) {
1842+
// Same test as above but with field names enabled in JSON output.
1843+
// Uses TIMESTAMP as the innermost type.
1844+
setFieldNamesInJsonCast(true);
1845+
1846+
// Inner type is now a TIMESTAMP
1847+
auto innerRowType = ROW({"ts"}, {TIMESTAMP()});
1848+
auto outerRowType = ROW({"x", "y"}, {innerRowType, DOUBLE()});
1849+
auto arrayType = ARRAY(outerRowType);
1850+
1851+
// Create timestamp values: epoch, 10M seconds after epoch, etc.
1852+
// Row 0: {ts: Timestamp{0, 0}} = "1970-01-01 00:00:00.000"
1853+
// Row 1: null (entire inner row is null) - will have bad nanos value
1854+
// Row 2: {ts: Timestamp{10000000, 0}} = "1970-04-26 17:46:40.000"
1855+
// Row 3: null (entire inner row is null) - will have bad nanos value
1856+
// Note: Rows 1 and 3 have invalid nanos (> 1,000,000,000) but since the
1857+
// entire inner row is null, these bad values should be ignored during
1858+
// JSON serialization.
1859+
auto innerRowChild = makeNullableFlatVector<Timestamp>(
1860+
{Timestamp{0, 0},
1861+
Timestamp{0, 0},
1862+
Timestamp{10000000, 0},
1863+
Timestamp{0, 0}});
1864+
1865+
// Set bad nanos values directly in the raw buffer for null positions.
1866+
// This bypasses Timestamp validation to test that nulls are properly handled.
1867+
auto* rawTimestamps =
1868+
innerRowChild->asFlatVector<Timestamp>()->mutableRawValues();
1869+
rawTimestamps[1] = Timestamp::fromMillisNoError(0);
1870+
reinterpret_cast<uint64_t*>(&rawTimestamps[1])[1] = 2000000000ULL;
1871+
rawTimestamps[3] = Timestamp::fromMillisNoError(0);
1872+
reinterpret_cast<uint64_t*>(&rawTimestamps[3])[1] = 5000000000ULL;
1873+
1874+
auto innerRowVector = makeRowVector({"ts"}, {innerRowChild});
1875+
innerRowVector->setNull(1, true);
1876+
innerRowVector->setNull(3, true);
1877+
1878+
auto outerRowChild2 = makeNullableFlatVector<double>({1.1, 2.2, 3.3, 4.4});
1879+
auto outerRowVector =
1880+
makeRowVector({"x", "y"}, {innerRowVector, outerRowChild2});
1881+
1882+
auto offsets = allocateOffsets(2, pool());
1883+
auto sizes = allocateSizes(2, pool());
1884+
auto rawOffsets = offsets->asMutable<vector_size_t>();
1885+
auto rawSizes = sizes->asMutable<vector_size_t>();
1886+
rawOffsets[0] = 0;
1887+
rawSizes[0] = 2;
1888+
rawOffsets[1] = 2;
1889+
rawSizes[1] = 2;
1890+
1891+
auto arrayVector = std::make_shared<ArrayVector>(
1892+
pool(), arrayType, nullptr, 2, offsets, sizes, outerRowVector);
1893+
1894+
// With field names enabled, output uses object notation:
1895+
// Row 0: [{"x":{"ts":"1970-01-01 00:00:00.000"},"y":1.1},{"x":null,"y":2.2}]
1896+
// Row 1: [{"x":{"ts":"1970-04-26 17:46:40.000"},"y":3.3},{"x":null,"y":4.4}]
1897+
auto expectedVector = makeNullableFlatVector<JsonNativeType>(
1898+
{R"([{"x":{"ts":"1970-01-01 00:00:00.000"},"y":1.1},{"x":null,"y":2.2}])"_sv,
1899+
R"([{"x":{"ts":"1970-04-26 17:46:40.000"},"y":3.3},{"x":null,"y":4.4}])"_sv},
1900+
JSON());
1901+
1902+
testCast(arrayVector, expectedVector);
1903+
1904+
setFieldNamesInJsonCast(false);
1905+
}
1906+
1907+
TEST_F(JsonCastTest, fromDeeplyNestedRowWithNulls) {
1908+
// Test ARRAY(ROW(ROW(ROW(...)))) - 3 levels of nesting with nulls at
1909+
// different levels.
1910+
1911+
// Level 3 (innermost): ROW(val: BIGINT)
1912+
auto level3Type = ROW({"val"}, {BIGINT()});
1913+
1914+
// Level 2: ROW(inner: level3Type)
1915+
auto level2Type = ROW({"inner"}, {level3Type});
1916+
1917+
// Level 1 (outermost row): ROW(mid: level2Type, extra: VARCHAR)
1918+
auto level1Type = ROW({"mid", "extra"}, {level2Type, VARCHAR()});
1919+
1920+
// Array type
1921+
auto arrayType = ARRAY(level1Type);
1922+
1923+
// Create level 3 vectors (4 elements, some null):
1924+
// [0]: {val: 100}
1925+
// [1]: null (entire level3 row is null)
1926+
// [2]: {val: 300}
1927+
// [3]: {val: 400}
1928+
auto level3Child =
1929+
makeNullableFlatVector<int64_t>({100, std::nullopt, 300, 400});
1930+
auto level3Vector = makeRowVector({"val"}, {level3Child});
1931+
level3Vector->setNull(1, true);
1932+
1933+
// Create level 2 vectors:
1934+
// [0]: {inner: {val: 100}}
1935+
// [1]: {inner: null} <- level3 is null, but level2 is not null
1936+
// [2]: null <- entire level2 row is null
1937+
// [3]: {inner: {val: 400}}
1938+
auto level2Vector = makeRowVector({"inner"}, {level3Vector});
1939+
level2Vector->setNull(2, true);
1940+
1941+
// Create level 1 vectors:
1942+
auto level1Child2 =
1943+
makeNullableFlatVector<StringView>({"a"_sv, "b"_sv, "c"_sv, "d"_sv});
1944+
auto level1Vector =
1945+
makeRowVector({"mid", "extra"}, {level2Vector, level1Child2});
1946+
1947+
// Create array with 2 rows, 2 elements each:
1948+
auto offsets = allocateOffsets(2, pool());
1949+
auto sizes = allocateSizes(2, pool());
1950+
auto rawOffsets = offsets->asMutable<vector_size_t>();
1951+
auto rawSizes = sizes->asMutable<vector_size_t>();
1952+
rawOffsets[0] = 0;
1953+
rawSizes[0] = 2;
1954+
rawOffsets[1] = 2;
1955+
rawSizes[1] = 2;
1956+
1957+
auto arrayVector = std::make_shared<ArrayVector>(
1958+
pool(), arrayType, nullptr, 2, offsets, sizes, level1Vector);
1959+
1960+
auto expectedVector = makeNullableFlatVector<JsonNativeType>(
1961+
{R"([[[[100]],"a"],[[null],"b"]])"_sv,
1962+
R"([[null,"c"],[[[400]],"d"]])"_sv},
1963+
JSON());
1964+
1965+
testCast(arrayVector, expectedVector);
1966+
}

velox/functions/prestosql/types/JsonCastOperator.cpp

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -306,15 +306,28 @@ struct AsJson {
306306

307307
// Appends the json string of the value at i to a string writer.
308308
void append(vector_size_t i, exec::StringWriter& proxy) const {
309-
if (fieldName_.has_value()) {
310-
proxy.append(fmt::format("\"{}\":", fieldName_.value()));
311-
}
312-
313309
if (decoded_->isNullAt(i)) {
314-
proxy.append("null");
310+
appendString(proxy, fieldName_);
315311
} else {
316-
proxy.append(this->at(i));
312+
appendString(proxy, fieldName_, this->at(i));
313+
}
314+
}
315+
316+
// Utility function to append string to a string writer.
317+
static void appendString(
318+
exec::StringWriter& proxy,
319+
const std::optional<std::string>& fieldName) {
320+
appendString(proxy, fieldName, "null"_sv);
321+
}
322+
323+
static void appendString(
324+
exec::StringWriter& proxy,
325+
const std::optional<std::string>& fieldName,
326+
const StringView& str) {
327+
if (fieldName.has_value()) {
328+
proxy.append(fmt::format("\"{}\":", fieldName.value()));
317329
}
330+
proxy.append(str);
318331
}
319332

320333
private:
@@ -556,7 +569,11 @@ void castToJsonFromRow(
556569
const SelectivityVector& rows,
557570
FlatVector<StringView>& flatResult,
558571
const std::shared_ptr<exec::CastHooks>& hooks) {
559-
using NameJsonPair = std::pair<std::string, AsJson>;
572+
// NameJsonPair also stores index of the child in the row vector.
573+
// since we are sorting the children based on their field names, we need
574+
// to store the index of the child in the row vector.
575+
using NameJsonPair =
576+
std::pair<std::pair<std::string, vector_size_t>, std::optional<AsJson>>;
560577
// input is guaranteed to be in flat encoding when passed in.
561578
VELOX_CHECK_EQ(input.encoding(), VectorEncoding::Simple::ROW);
562579
auto inputRow = input.as<RowVector>();
@@ -577,23 +594,43 @@ void castToJsonFromRow(
577594
std::optional<std::string> fieldName =
578595
fieldNamesInJsonCastEnabled ? std::optional{name} : std::nullopt;
579596

580-
jsonChildren.emplace_back(
581-
name,
582-
AsJson{
583-
context,
584-
inputRow->childAt(i),
585-
rows,
586-
nullptr,
587-
hooks,
588-
false,
589-
std::move(fieldName)});
597+
auto inputRowChild = inputRow->childAt(i);
598+
599+
// Use a pointer to avoid copying SelectivityVector when there are no nulls.
600+
// Only create a new SelectivityVector when the child has nulls that need
601+
// to be filtered out.
602+
SelectivityVector childRowsHolder;
603+
const SelectivityVector* childRowsPtr = &rows;
604+
if (inputRowChild->mayHaveNulls()) {
605+
childRowsHolder = rows;
606+
childRowsHolder.deselectNulls(
607+
inputRowChild->rawNulls(), rows.begin(), rows.end());
608+
childRowsPtr = &childRowsHolder;
609+
}
610+
const SelectivityVector& childRows = *childRowsPtr;
611+
612+
if (childRows.hasSelections()) {
613+
jsonChildren.emplace_back(
614+
std::pair{name, i},
615+
AsJson{
616+
context,
617+
inputRowChild,
618+
childRows,
619+
nullptr,
620+
hooks,
621+
false,
622+
std::move(fieldName)});
623+
} else {
624+
// All rows are null for this child - store nullopt.
625+
jsonChildren.emplace_back(std::pair{name, i}, std::nullopt);
626+
}
590627

591628
context.applyToSelectedNoThrow(rows, [&](auto row) {
592-
if (inputRow->isNullAt(row)) {
629+
if (inputRow->isNullAt(row) || inputRowChild->isNullAt(row)) {
593630
// "null" will be inlined in the StringView.
594631
return;
595632
}
596-
childrenStringSize += jsonChildren[i].second.lengthAt(row);
633+
childrenStringSize += jsonChildren[i].second->lengthAt(row);
597634
});
598635
}
599636

@@ -634,7 +671,17 @@ void castToJsonFromRow(
634671
if (i > 0) {
635672
proxy.append(","_sv);
636673
}
637-
jsonChildren[i].second.append(row, proxy);
674+
675+
auto nameIndex = jsonChildren[i].first;
676+
if (!jsonChildren[i].second.has_value() ||
677+
inputRow->childAt(nameIndex.second)->isNullAt(row)) {
678+
AsJson::appendString(
679+
proxy,
680+
fieldNamesInJsonCastEnabled ? std::make_optional(nameIndex.first)
681+
: std::nullopt);
682+
continue;
683+
}
684+
jsonChildren[i].second->append(row, proxy);
638685
}
639686

640687
if (fieldNamesInJsonCastEnabled) {

0 commit comments

Comments
 (0)