Skip to content

Commit 89c5105

Browse files
goldvitalycopybara-github
authored andcommitted
The first version of support of removed values in the dict.
dict_item_test.py is an illustration for corner case. Benchmarks are positive, because I am also fixing up usage of SliceBuilder in the way it was expected to be used. It was done before to keep old behavior for removed values. Now we change behavior everywhere (as far as we can be sure). ``` name old cpu/op new cpu/op delta get_from_dict/size:1/fallback_count:0 1.34µs ± 5% 1.42µs ± 2% ~ (p=0.056 n=5+5) get_from_dict/size:8/fallback_count:0 1.48µs ± 6% 1.50µs ± 2% ~ (p=0.222 n=5+5) get_from_dict/size:64/fallback_count:0 2.50µs ± 4% 2.31µs ± 2% -7.56% (p=0.008 n=5+5) get_from_dict/size:512/fallback_count:0 10.9µs ±13% 8.5µs ± 2% -22.11% (p=0.008 n=5+5) get_from_dict/size:4096/fallback_count:0 74.5µs ± 3% 57.9µs ± 2% -22.26% (p=0.008 n=5+5) get_from_dict/size:10000/fallback_count:0 187µs ± 5% 144µs ± 2% -22.62% (p=0.008 n=5+5) get_from_dict/size:1/fallback_count:1 2.07µs ±15% 1.86µs ± 2% -9.97% (p=0.016 n=5+4) get_from_dict/size:8/fallback_count:1 2.57µs ±10% 2.30µs ±17% ~ (p=0.095 n=5+5) get_from_dict/size:64/fallback_count:1 5.94µs ± 2% 4.69µs ±15% -21.01% (p=0.008 n=5+5) get_from_dict/size:512/fallback_count:1 34.3µs ±13% 24.0µs ±15% -30.08% (p=0.008 n=5+5) get_from_dict/size:4096/fallback_count:1 253µs ± 2% 170µs ± 2% -32.95% (p=0.008 n=5+5) get_from_dict/size:10000/fallback_count:1 630µs ± 2% 427µs ± 1% -32.22% (p=0.008 n=5+5) get_from_dict/size:1/fallback_count:8 5.46µs ± 2% 4.38µs ± 2% -19.85% (p=0.008 n=5+5) get_from_dict/size:8/fallback_count:8 10.8µs ± 2% 6.2µs ± 3% -43.29% (p=0.008 n=5+5) get_from_dict/size:64/fallback_count:8 29.8µs ± 3% 19.1µs ± 4% -35.96% (p=0.008 n=5+5) get_from_dict/size:512/fallback_count:8 179µs ± 3% 120µs ± 4% -32.71% (p=0.008 n=5+5) get_from_dict/size:4096/fallback_count:8 1.36ms ± 2% 0.94ms ± 3% -30.91% (p=0.008 n=5+5) get_from_dict/size:10000/fallback_count:8 3.33ms ± 2% 2.29ms ± 3% -31.41% (p=0.008 n=5+5) get_from_dict/size:1/fallback_count:16 9.32µs ± 2% 7.17µs ± 3% -23.01% (p=0.008 n=5+5) get_from_dict/size:8/fallback_count:16 16.7µs ± 3% 10.5µs ± 2% -37.08% (p=0.008 n=5+5) get_from_dict/size:64/fallback_count:16 55.0µs ± 2% 36.3µs ± 3% -34.03% (p=0.008 n=5+5) get_from_dict/size:512/fallback_count:16 336µs ± 3% 238µs ± 3% -29.14% (p=0.008 n=5+5) get_from_dict/size:4096/fallback_count:16 2.55ms ± 3% 1.86ms ± 3% -27.07% (p=0.008 n=5+5) get_from_dict/size:10000/fallback_count:16 6.20ms ± 2% 4.50ms ± 3% -27.47% (p=0.008 n=5+5) ``` PiperOrigin-RevId: 721738482 Change-Id: I9ba4bc4da61f79b9fbc5f1c539c93eb191a29913
1 parent e849b56 commit 89c5105

File tree

15 files changed

+305
-200
lines changed

15 files changed

+305
-200
lines changed

koladata/internal/data_bag.cc

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2110,46 +2110,47 @@ absl::StatusOr<DataSliceImpl> DataBagImpl::GetDictSize(
21102110
}
21112111

21122112
template <class AllocCheckFn>
2113-
absl::StatusOr<DataSliceImpl> DataBagImpl::GetFromDictNoFallback(
2114-
const arolla::DenseArray<ObjectId>& dicts,
2115-
const DataSliceImpl& keys) const {
2113+
absl::Status DataBagImpl::GetFromDictNoFallback(
2114+
const arolla::DenseArray<ObjectId>& dicts, const DataSliceImpl& keys,
2115+
SliceBuilder& bldr) const {
21162116
if (dicts.size() != keys.size()) {
21172117
return absl::InvalidArgumentError(
21182118
absl::StrFormat("dicts and keys sizes don't match: %d vs %d",
21192119
dicts.size(), keys.size()));
21202120
}
21212121

21222122
ReadOnlyDictGetter<AllocCheckFn> dict_getter(this);
2123-
SliceBuilder bldr(dicts.size());
21242123

21252124
if (keys.is_mixed_dtype()) {
21262125
dicts.ForEachPresent([&](int64_t offset, ObjectId dict_id) {
2127-
bldr.InsertIfNotSetAndUpdateAllocIds(
2128-
offset, dict_getter(dict_id).Get(keys[offset]));
2126+
if (auto val = dict_getter(dict_id).Get(keys[offset]); val.has_value()) {
2127+
bldr.InsertIfNotSetAndUpdateAllocIds(offset, *val);
2128+
}
21292129
});
21302130
} else {
21312131
absl::Status status = absl::OkStatus();
21322132
keys.VisitValues([&](const auto& vec) {
21332133
using T = typename std::decay_t<decltype(vec)>::base_type;
21342134
status = arolla::DenseArraysForEachPresent(
21352135
[&](int64_t offset, ObjectId dict_id, arolla::view_type_t<T> key) {
2136-
bldr.InsertIfNotSetAndUpdateAllocIds(
2137-
offset, dict_getter(dict_id).Get(DataItem::View<T>{key}));
2136+
if (auto val = dict_getter(dict_id).Get(DataItem::View<T>{key});
2137+
val.has_value()) {
2138+
bldr.InsertIfNotSetAndUpdateAllocIds(offset, *val);
2139+
}
21382140
},
21392141
dicts, vec);
21402142
});
21412143
RETURN_IF_ERROR(status);
21422144
}
21432145

2144-
RETURN_IF_ERROR(dict_getter.status());
2145-
return std::move(bldr).Build();
2146+
return dict_getter.status();
21462147
}
21472148

21482149
template <class AllocCheckFn>
2149-
absl::StatusOr<DataSliceImpl> DataBagImpl::GetFromDictNoFallback(
2150-
const arolla::DenseArray<ObjectId>& dicts, const DataItem& keys) const {
2150+
absl::Status DataBagImpl::GetFromDictNoFallback(
2151+
const arolla::DenseArray<ObjectId>& dicts, const DataItem& keys,
2152+
SliceBuilder& bldr) const {
21512153
ReadOnlyDictGetter<AllocCheckFn> dict_getter(this);
2152-
SliceBuilder bldr(dicts.size());
21532154

21542155
keys.VisitValue([&](const auto& val) {
21552156
using T = typename std::decay_t<decltype(val)>;
@@ -2159,29 +2160,25 @@ absl::StatusOr<DataSliceImpl> DataBagImpl::GetFromDictNoFallback(
21592160
});
21602161
});
21612162

2162-
RETURN_IF_ERROR(dict_getter.status());
2163-
return std::move(bldr).Build();
2163+
return dict_getter.status();
21642164
}
21652165

21662166
template <class AllocCheckFn, class DataSliceImplT>
21672167
inline absl::StatusOr<DataSliceImpl> DataBagImpl::GetFromDictImpl(
21682168
const DataSliceImpl& dicts, const DataSliceImplT& keys,
21692169
FallbackSpan fallbacks) const {
21702170
const auto& dict_objects = dicts.values<ObjectId>();
2171-
ASSIGN_OR_RETURN(auto result,
2172-
GetFromDictNoFallback<AllocCheckFn>(dict_objects, keys));
2173-
if (fallbacks.empty()) {
2174-
return result;
2175-
}
2176-
2171+
SliceBuilder bldr(dicts.size());
2172+
RETURN_IF_ERROR(
2173+
GetFromDictNoFallback<AllocCheckFn>(dict_objects, keys, bldr));
21772174
for (const DataBagImpl* fallback : fallbacks) {
2178-
// TODO: avoid requesting already known objects.
2179-
ASSIGN_OR_RETURN(
2180-
auto fb_result,
2181-
fallback->GetFromDictNoFallback<AllocCheckFn>(dict_objects, keys));
2182-
ASSIGN_OR_RETURN(result, PresenceOrOp{}(result, fb_result));
2175+
if (bldr.is_finalized()) {
2176+
break;
2177+
}
2178+
RETURN_IF_ERROR(fallback->GetFromDictNoFallback<AllocCheckFn>(dict_objects,
2179+
keys, bldr));
21832180
}
2184-
return result;
2181+
return std::move(bldr).Build();
21852182
}
21862183

21872184
absl::StatusOr<DataSliceImpl> DataBagImpl::GetFromDict(
@@ -2387,7 +2384,10 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE DataItem
23872384
DataBagImpl::GetFromDictObject(ObjectId dict_id, const Key& key) const {
23882385
AllocationId alloc_id(dict_id);
23892386
if (const auto* dicts = GetConstDictsOrNull(alloc_id); dicts != nullptr) {
2390-
return (**dicts)[dict_id.Offset()].Get(key);
2387+
if (auto res_opt = (**dicts)[dict_id.Offset()].Get(key);
2388+
res_opt.has_value()) {
2389+
return *res_opt;
2390+
}
23912391
}
23922392
return DataItem();
23932393
}
@@ -2401,21 +2401,20 @@ DataBagImpl::GetFromDictObjectWithFallbacks(ObjectId dict_id, const Key& key,
24012401
}
24022402
AllocationId alloc_id(dict_id);
24032403
size_t alloc_hash = absl::HashOf(alloc_id);
2404-
size_t key_hash = DataItem::Hash()(key);
24052404
int64_t offset = dict_id.Offset();
24062405
if (const auto* dicts = GetConstDictsOrNull(alloc_id, alloc_hash);
24072406
dicts != nullptr) {
2408-
DataItem res = (**dicts)[offset].Get(key, key_hash);
2407+
auto res = (**dicts)[offset].Get(key);
24092408
if (res.has_value() || fallbacks.empty()) {
2410-
return res;
2409+
return *res;
24112410
}
24122411
}
24132412
for (const DataBagImpl* fallback : fallbacks) {
24142413
if (const auto* dicts = fallback->GetConstDictsOrNull(alloc_id, alloc_hash);
24152414
dicts != nullptr) {
2416-
DataItem res = (**dicts)[offset].Get(key, key_hash);
2415+
auto res = (**dicts)[offset].Get(key);
24172416
if (res.has_value()) {
2418-
return res;
2417+
return *res;
24192418
}
24202419
}
24212420
}
@@ -3177,23 +3176,24 @@ absl::Status DataBagImpl::MergeDictsInplace(const DataBagImpl& other,
31773176
auto& this_dict = this_dicts[i];
31783177
for (const DataItem& key : other_dict.GetKeys()) {
31793178
if (conflict_policy == MergeOptions::kOverwrite) {
3180-
this_dict.Set(key, other_dict.Get(key));
3179+
this_dict.Set(key, *other_dict.Get(key));
31813180
continue;
31823181
}
3183-
const DataItem& other_value = other_dict.Get(key);
3182+
auto other_value = other_dict.Get(key);
31843183
if (!other_value.has_value()) {
31853184
continue;
31863185
}
3187-
const DataItem& this_value = this_dict.GetOrAssign(key, other_value);
3186+
const DataItem& this_value =
3187+
this_dict.GetOrAssign(key, other_value->get());
31883188
if (conflict_policy == MergeOptions::kRaiseOnConflict &&
3189-
this_value != other_value) {
3189+
this_value != other_value->get()) {
31903190
internal::ObjectId object_id = alloc_id.ObjectByOffset(i);
31913191
return internal::WithErrorPayload(
31923192
absl::FailedPreconditionError(absl::StrCat(
31933193
"conflicting dict values for ", object_id, " key", key,
3194-
": ", this_value, " vs ", other_value)),
3194+
": ", this_value, " vs ", *other_value)),
31953195
MakeSchemaOrDictMergeError(object_id, key, this_value,
3196-
other_value));
3196+
*other_value));
31973197
}
31983198
}
31993199
}
@@ -3318,7 +3318,7 @@ void DataBagImpl::AddDictToContent(
33183318
std::vector<DataItem> values;
33193319
values.reserve(keys.size());
33203320
for (size_t ki = 0; ki < keys.size(); ++ki) {
3321-
values.push_back(dict.Get(keys[ki]));
3321+
values.push_back(*dict.Get(keys[ki]));
33223322
}
33233323
res.push_back({dict_id, std::move(keys), std::move(values)});
33243324
}

koladata/internal/data_bag.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -689,16 +689,18 @@ class DataBagImpl : public arolla::RefcountedBase {
689689
// `AllocCheckFn` is a customization to provide which allocation type `dicts`
690690
// ObjectIds belong to (e.g. IsDictsAlloc or IsSchemasAlloc).
691691
template <class AllocCheckFn>
692-
absl::StatusOr<DataSliceImpl> GetFromDictNoFallback(
693-
const arolla::DenseArray<ObjectId>& dicts, const DataSliceImpl& keys) const;
692+
absl::Status GetFromDictNoFallback(const arolla::DenseArray<ObjectId>& dicts,
693+
const DataSliceImpl& keys,
694+
SliceBuilder& bldr) const;
694695

695696
// Lower level utility for batch GetFromDict with a single key without
696697
// fallbacks support.
697698
// `AllocCheckFn` is a customization to provide which allocation type `dicts`
698699
// ObjectIds belong to (e.g. IsDictsAlloc or IsSchemasAlloc).
699700
template <class AllocCheckFn>
700-
absl::StatusOr<DataSliceImpl> GetFromDictNoFallback(
701-
const arolla::DenseArray<ObjectId>& dicts, const DataItem& keys) const;
701+
absl::Status GetFromDictNoFallback(const arolla::DenseArray<ObjectId>& dicts,
702+
const DataItem& keys,
703+
SliceBuilder& bldr) const;
702704

703705
// Implementation of GetFromDict that can be customized with `AllocCheckFn`.
704706
// Returns one value from each dict. Dicts get be quieried by a single key or

koladata/internal/data_bag_dict_test.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ namespace {
4141
using ::absl_testing::IsOkAndHolds;
4242
using ::absl_testing::StatusIs;
4343
using ::arolla::DenseArrayEdge;
44+
using ::testing::_;
4445
using ::testing::ElementsAre;
4546
using ::testing::ElementsAreArray;
4647
using ::testing::Eq;
@@ -407,5 +408,59 @@ TEST(DataBagTest, DictFallbacks) {
407408
}
408409
}
409410

411+
TEST(DataBagTest, DictFallbacksSingleItemWithRemovedValues) {
412+
auto dict = DataItem(AllocateSingleDict());
413+
414+
auto db = DataBagImpl::CreateEmptyDatabag();
415+
ASSERT_OK(db->SetInDict(dict, DataItem(1), DataItem()));
416+
auto fb_db = DataBagImpl::CreateEmptyDatabag();
417+
ASSERT_OK(fb_db->SetInDict(dict, DataItem(1), DataItem(7)));
418+
419+
EXPECT_THAT(db->GetFromDict(dict, DataItem(1), {fb_db.get()}),
420+
IsOkAndHolds(DataItem()));
421+
EXPECT_THAT(db->GetDictKeys(dict, {fb_db.get()}),
422+
IsOkAndHolds(Pair(ElementsAre(DataItem(1)), _)));
423+
EXPECT_THAT(db->GetDictValues(dict, {fb_db.get()}),
424+
IsOkAndHolds(Pair(ElementsAre(DataItem()), _)));
425+
426+
ASSERT_OK(db->SetInDict(dict, DataItem(2), DataItem(7)));
427+
auto fork_db = db->PartiallyPersistentFork();
428+
ASSERT_OK(fork_db->SetInDict(dict, DataItem(2), DataItem()));
429+
EXPECT_THAT(fork_db->GetFromDict(dict, DataItem(1), {}),
430+
IsOkAndHolds(DataItem()));
431+
EXPECT_THAT(fork_db->GetFromDict(dict, DataItem(2), {}),
432+
IsOkAndHolds(DataItem()));
433+
EXPECT_THAT(
434+
fork_db->GetDictKeys(dict, {fb_db.get()}),
435+
IsOkAndHolds(Pair(UnorderedElementsAre(DataItem(1), DataItem(2)), _)));
436+
EXPECT_THAT(fork_db->GetDictValues(dict, {fb_db.get()}),
437+
IsOkAndHolds(Pair(ElementsAre(DataItem(), DataItem()), _)));
438+
}
439+
440+
TEST(DataBagTest, DictFallbacksSingleItemManyKeysWithRemovedValues) {
441+
auto dict = DataItem(AllocateSingleDict());
442+
443+
auto db = DataBagImpl::CreateEmptyDatabag();
444+
ASSERT_OK(db->SetInDict(DataSliceImpl::Create({dict, dict}),
445+
DataSliceImpl::Create({DataItem(1), DataItem(2)}),
446+
DataSliceImpl::Create({DataItem(), DataItem(7)})));
447+
auto fb_db = DataBagImpl::CreateEmptyDatabag();
448+
ASSERT_OK(
449+
fb_db->SetInDict(DataSliceImpl::Create({dict, dict}),
450+
DataSliceImpl::Create({DataItem(1), DataItem(2)}),
451+
DataSliceImpl::Create({DataItem(9), DataItem(1)})));
452+
453+
EXPECT_THAT(db->GetFromDict(DataSliceImpl::Create({dict, dict}),
454+
DataSliceImpl::Create({DataItem(1), DataItem(2)}),
455+
{fb_db.get()}),
456+
IsOkAndHolds(ElementsAre(DataItem(), DataItem(7))));
457+
EXPECT_THAT(
458+
db->GetDictKeys(dict, {fb_db.get()}),
459+
IsOkAndHolds(Pair(UnorderedElementsAre(DataItem(1), DataItem(2)), _)));
460+
EXPECT_THAT(
461+
db->GetDictValues(dict, {fb_db.get()}),
462+
IsOkAndHolds(Pair(UnorderedElementsAre(DataItem(), DataItem(7)), _)));
463+
}
464+
410465
} // namespace
411466
} // namespace koladata::internal

koladata/internal/data_bag_extract_content_test.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <bit>
1616
#include <cstddef>
1717
#include <cstdint>
18+
#include <utility>
1819
#include <vector>
1920

2021
#include "gmock/gmock.h"
@@ -30,6 +31,8 @@ namespace {
3031

3132
using ::testing::ElementsAre;
3233
using ::testing::ElementsAreArray;
34+
using ::testing::Pair;
35+
using ::testing::UnorderedElementsAre;
3336

3437
namespace {
3538

@@ -103,6 +106,26 @@ TEST(DataBagTest, ExtractRemovedValues) {
103106
EXPECT_THAT(allocs[0].values, ElementsAreArray(ds_b_values));
104107
}
105108

109+
TEST(DataBagTest, ExtractDictRemovedValues) {
110+
auto dict = DataItem(AllocateSingleDict());
111+
112+
auto db = DataBagImpl::CreateEmptyDatabag();
113+
ASSERT_OK(db->SetInDict(dict, DataItem(1), DataItem()));
114+
ASSERT_OK(db->SetInDict(dict, DataItem(2), DataItem(3)));
115+
116+
ASSERT_OK_AND_ASSIGN(auto content, db->ExtractContent());
117+
const auto& dicts = content.dicts;
118+
ASSERT_EQ(dicts.size(), 1);
119+
const auto& d = dicts[0];
120+
EXPECT_EQ(d.dict_id, dict.value<ObjectId>());
121+
ASSERT_THAT(d.keys, UnorderedElementsAre(DataItem(1), DataItem(2)));
122+
ASSERT_THAT(d.values, UnorderedElementsAre(DataItem(), DataItem(3)));
123+
ASSERT_THAT((std::vector<std::pair<DataItem, DataItem>>{
124+
{d.keys[0], d.values[0]}, {d.keys[1], d.values[1]}}),
125+
UnorderedElementsAre(Pair(DataItem(1), DataItem()),
126+
Pair(DataItem(2), DataItem(3))));
127+
}
128+
106129
} // namespace
107130

108131
} // namespace

koladata/internal/data_bag_schema_test.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,8 @@ TEST(DataBagTest, DelSchemaAttr_Item) {
434434
EXPECT_THAT(db->GetSchemaAttr(schema, "a"),
435435
StatusIs(absl::StatusCode::kInvalidArgument,
436436
HasSubstr("the attribute 'a' is missing")));
437-
EXPECT_THAT(db->GetSchemaAttrs(schema), IsOkAndHolds(ElementsAre()));
437+
EXPECT_THAT(db->GetSchemaAttrs(schema),
438+
IsOkAndHolds(ElementsAre(DataItem(arolla::Text("a")))));
438439

439440
// Deleting on an empty object is a no-op.
440441
ASSERT_OK(db->DelSchemaAttr(DataItem(), "a"));

koladata/internal/data_bag_test.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1764,8 +1764,8 @@ TEST(DataBagTest, GetStatistics_Dicts) {
17641764
{5, 6, 7, 8, std::nullopt}))));
17651765

17661766
ASSERT_OK_AND_ASSIGN(auto stats, db->GetStatistics());
1767-
EXPECT_EQ(stats.total_non_empty_dicts, 3);
1768-
EXPECT_EQ(stats.total_items_in_dicts, 3);
1767+
EXPECT_EQ(stats.total_non_empty_dicts, 4);
1768+
EXPECT_EQ(stats.total_items_in_dicts, 4);
17691769
}
17701770
{
17711771
DataBagImplPtr db = DataBagImpl::CreateEmptyDatabag();
@@ -1780,17 +1780,17 @@ TEST(DataBagTest, GetStatistics_Dicts) {
17801780
DataSliceImpl sub_dicts =
17811781
DataSliceImpl::ObjectsFromAllocation(dict_alloc_id, 3);
17821782
ASSERT_OK_AND_ASSIGN(auto stats, db->GetStatistics());
1783-
EXPECT_EQ(stats.total_non_empty_dicts, 3);
1784-
EXPECT_EQ(stats.total_items_in_dicts, 3);
1783+
EXPECT_EQ(stats.total_non_empty_dicts, 4);
1784+
EXPECT_EQ(stats.total_items_in_dicts, 4);
17851785

17861786
ASSERT_OK(db->SetInDict(sub_dicts,
17871787
DataSliceImpl::Create(arolla::CreateDenseArray<int>(
17881788
{7, std::nullopt, 8})),
17891789
DataSliceImpl::Create(arolla::CreateDenseArray<int>(
17901790
{7, 8, std::nullopt}))));
17911791
ASSERT_OK_AND_ASSIGN(stats, db->GetStatistics());
1792-
EXPECT_EQ(stats.total_non_empty_dicts, 3);
1793-
EXPECT_EQ(stats.total_items_in_dicts, 4);
1792+
EXPECT_EQ(stats.total_non_empty_dicts, 4);
1793+
EXPECT_EQ(stats.total_items_in_dicts, 6);
17941794
}
17951795
{
17961796
DataBagImplPtr db = DataBagImpl::CreateEmptyDatabag();

0 commit comments

Comments
 (0)