|
3 | 3 | #include <AggregateFunctions/IAggregateFunction.h> |
4 | 4 | #include <AggregateFunctions/UniqVariadicHash.h> |
5 | 5 | #include <DataTypes/DataTypesNumber.h> |
| 6 | +#include <base/arithmeticOverflow.h> |
6 | 7 | #include <Common/Exception.h> |
7 | 8 | #include <Common/HashTable/HashMap.h> |
8 | 9 | #include <Common/VectorWithMemoryTracking.h> |
@@ -82,6 +83,84 @@ struct CrossTabCountsState |
82 | 83 | count_a.read(buf); |
83 | 84 | count_b.read(buf); |
84 | 85 | count_ab.read(buf); |
| 86 | + |
| 87 | + validateDeserialized(count, count_a, count_b, count_ab); |
| 88 | + } |
| 89 | + |
| 90 | + /// Aggregate function states can be constructed from untrusted data, e.g. by `CAST` from `String`, |
| 91 | + /// so deserialization has to check all invariants that the calculations rely on |
| 92 | + /// (a genuine state produced by `add` and `merge` satisfies them by construction): |
| 93 | + /// - all joint counts in `count_ab` are positive; |
| 94 | + /// - `count_a` and `count_b` are exactly the marginal sums of `count_ab`; |
| 95 | + /// - the joint counts sum up to `count`. |
| 96 | + /// These invariants guarantee that the state describes a valid contingency table, which in turn |
| 97 | + /// guarantees the theoretical bounds asserted during finalization (e.g. 0 <= φ² <= min(|A|, |B|) - 1). |
| 98 | + static void validateDeserialized(UInt64 count, const auto & count_a, const auto & count_b, const auto & count_ab) |
| 99 | + { |
| 100 | + UInt64 total = 0; |
| 101 | + HashMapWithStackMemory<UInt64, UInt64, TrivialHash, 4> marginal_a; |
| 102 | + HashMapWithStackMemory<UInt64, UInt64, TrivialHash, 4> marginal_b; |
| 103 | + |
| 104 | + for (const auto & [key, value] : count_ab) |
| 105 | + { |
| 106 | + if (value == 0) |
| 107 | + throw Exception( |
| 108 | + ErrorCodes::CORRUPTED_DATA, |
| 109 | + "Corrupted aggregate function state: the joint count of the value pair with hashes ({}, {}) is zero", |
| 110 | + key.items[UInt128::_impl::little(0)], |
| 111 | + key.items[UInt128::_impl::little(1)]); |
| 112 | + |
| 113 | + if (common::addOverflow(total, value, total)) |
| 114 | + throw Exception(ErrorCodes::CORRUPTED_DATA, "Corrupted aggregate function state: the sum of joint counts overflows UInt64"); |
| 115 | + |
| 116 | + /// The marginal sums cannot overflow if the total sum does not: each of them is bounded by `total`. |
| 117 | + marginal_a[key.items[UInt128::_impl::little(0)]] += value; |
| 118 | + marginal_b[key.items[UInt128::_impl::little(1)]] += value; |
| 119 | + } |
| 120 | + |
| 121 | + if (total != count) |
| 122 | + throw Exception( |
| 123 | + ErrorCodes::CORRUPTED_DATA, |
| 124 | + "Corrupted aggregate function state: the joint counts sum up to {}, while the total count is {}", |
| 125 | + total, |
| 126 | + count); |
| 127 | + |
| 128 | + auto check_marginals = [](const auto & expected, const auto & stored, const char * side) |
| 129 | + { |
| 130 | + if (expected.size() != stored.size()) |
| 131 | + throw Exception( |
| 132 | + ErrorCodes::CORRUPTED_DATA, |
| 133 | + "Corrupted aggregate function state: there are {} distinct values of the {} argument, " |
| 134 | + "while the joint counts imply {}", |
| 135 | + stored.size(), |
| 136 | + side, |
| 137 | + expected.size()); |
| 138 | + |
| 139 | + for (const auto & [key, value] : expected) |
| 140 | + { |
| 141 | + const auto * it = stored.find(key); |
| 142 | + if (it == stored.end()) |
| 143 | + throw Exception( |
| 144 | + ErrorCodes::CORRUPTED_DATA, |
| 145 | + "Corrupted aggregate function state: the value with hash {} of the {} argument is present " |
| 146 | + "in the joint counts but has no marginal count", |
| 147 | + key, |
| 148 | + side); |
| 149 | + |
| 150 | + if (it->getMapped() != value) |
| 151 | + throw Exception( |
| 152 | + ErrorCodes::CORRUPTED_DATA, |
| 153 | + "Corrupted aggregate function state: the marginal count of the value with hash {} " |
| 154 | + "of the {} argument is {}, while its joint counts sum up to {}", |
| 155 | + key, |
| 156 | + side, |
| 157 | + it->getMapped(), |
| 158 | + value); |
| 159 | + } |
| 160 | + }; |
| 161 | + |
| 162 | + check_marginals(marginal_a, count_a, "first"); |
| 163 | + check_marginals(marginal_b, count_b, "second"); |
85 | 164 | } |
86 | 165 | }; |
87 | 166 |
|
@@ -507,6 +586,8 @@ struct CrossTabPhiSquaredWindowData |
507 | 586 | count_ab.size(), |
508 | 587 | INVALID_EDGE_IDX - 1); |
509 | 588 |
|
| 589 | + CrossTabCountsState::validateDeserialized(count, count_a, count_b, count_ab); |
| 590 | + |
510 | 591 | /// Update the internal states |
511 | 592 | a_hash_by_index.reserve(count_a.size()); |
512 | 593 | a_marginal_count.reserve(count_a.size()); |
@@ -561,9 +642,7 @@ struct CrossTabPhiSquaredWindowData |
561 | 642 | const Float64 a = static_cast<Float64>(a_marginal_count[a_idx]); |
562 | 643 | const Float64 b = static_cast<Float64>(b_marginal_count[b_idx]); |
563 | 644 |
|
564 | | - if (unlikely(a <= 0 || b <= 0)) |
565 | | - throw Exception( |
566 | | - ErrorCodes::CORRUPTED_DATA, "Corrupted aggregate function state: value frequency must be positive (a={}, b={})", a, b); |
| 645 | + chassert(a > 0 && b > 0 && "value frequencies are positive after validation"); |
567 | 646 |
|
568 | 647 | phi_term_sum += phiTerm(cnt_ab, a, b); |
569 | 648 | } |
|
0 commit comments