Skip to content

Commit 853c625

Browse files
Krishna Pai authored and meta-codesync[bot] committed
feat: Add Spatial Join Benchmark (facebookincubator#15323)
Summary: Pull Request resolved: facebookincubator#15323

This is a simple benchmark for Spatial Joins. It runs ST_Intersects and ST_Contains joins between point geometries (probe side) and polygon geometries (build side) whose positions are either clustered or uniformly distributed. Here are the results:

```
============================================================================
[...]c/benchmarks/SpatialJoinBenchmark.cpp     relative  time/iter   iters/s
============================================================================
1000x1000_ST_Intersects_Inner_uniform                       11.59ms     86.31
1000x1000_ST_Intersects_Inner_clustered                      7.90ms    126.51
10000x1000_ST_Intersects_Inner_uniform                      59.27ms     16.87
10000x1000_ST_Intersects_Inner_clustered                    39.32ms     25.43
5000x1000_ST_Intersects_Left_uniform                        33.96ms     29.45
5000x1000_ST_Intersects_Left_clustered                      22.63ms     44.19
5000x1000_ST_Contains_Inner_uniform                         31.48ms     31.77
20000x2000_ST_Intersects_Inner_uniform                     195.11ms      5.13
```

Reviewed By: jagill

Differential Revision: D85805886

fbshipit-source-id: be4567286824c2922ae67208a6db7224b357b4ea
1 parent 855fff3 commit 853c625

File tree

2 files changed

+378
-0
lines changed

2 files changed

+378
-0
lines changed

velox/exec/benchmarks/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,21 @@ target_link_libraries(
162162
add_executable(velox_atomics_benchmark AtomicsBench.cpp)
163163

164164
target_link_libraries(velox_atomics_benchmark Folly::follybenchmark)
165+
166+
# The spatial join benchmark target is only defined when VELOX_ENABLE_GEO is
# set.
if(VELOX_ENABLE_GEO)
167+
add_executable(velox_spatial_join_benchmark SpatialJoinBenchmark.cpp)
168+
169+
# Define VELOX_ENABLE_GEO for this target's own sources only (PRIVATE).
target_compile_definitions(velox_spatial_join_benchmark PRIVATE VELOX_ENABLE_GEO)
170+
171+
target_link_libraries(
172+
velox_spatial_join_benchmark
173+
velox_memory
174+
velox_exec
175+
velox_exec_test_lib
176+
velox_parse_parser
177+
velox_presto_types
178+
velox_vector_test_lib
179+
velox_functions_prestosql
180+
Folly::follybenchmark
181+
)
182+
endif()
Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,360 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <folly/Benchmark.h>
18+
#include <folly/init/Init.h>
19+
20+
#include "velox/common/memory/Memory.h"
21+
#include "velox/exec/tests/utils/AssertQueryBuilder.h"
22+
#include "velox/exec/tests/utils/PlanBuilder.h"
23+
#include "velox/functions/prestosql/registration/RegistrationFunctions.h"
24+
#include "velox/parse/TypeResolver.h"
25+
#include "velox/vector/tests/utils/VectorTestBase.h"
26+
27+
/// Benchmark for SpatialJoin operator, which implements a nested-loop join
28+
/// with spatial predicates (e.g., ST_INTERSECTS, ST_CONTAINS, ST_WITHIN).
29+
///
30+
/// This benchmark measures the performance of spatial joins under different
31+
/// conditions:
32+
/// - Different build and probe side sizes (cross join cardinality)
33+
/// - Different spatial predicates
34+
/// - Different data distributions (uniform vs clustered geometries)
35+
/// - Inner vs Left join types
36+
///
37+
/// The benchmark creates synthetic geometric data and measures the throughput
38+
/// of spatial join operations. The focus is on understanding how the nested
39+
/// loop pattern performs with varying data sizes and selectivity.
40+
41+
using namespace facebook::velox;
42+
using namespace facebook::velox::exec;
43+
using namespace facebook::velox::exec::test;
44+
45+
namespace {
46+
47+
/// Describes how generated geometries are laid out in space.
enum class Distribution {
  /// Coordinates are drawn at random across the whole coordinate range.
  kUniform,
  /// Geometries are grouped around a fixed set of cluster centers.
  kClustered,
};
52+
53+
// Tunables that shape the generated geometries.

// When null generation is requested, rows whose index is a multiple of this
// value are null.
constexpr int32_t kNullPatternModulo{13};

// Uniform coordinates are computed as
// (random % kRandomCoordinateMax) / kCoordinateScaleDivisor, i.e. values in
// [0, 1000) in steps of 0.1.
constexpr int32_t kRandomCoordinateMax{10000};
constexpr double kCoordinateScaleDivisor{10.0};

// Clustered rows belong to cluster (row % kNumClusters). Cluster c is centered
// at c * kClusterSpacing + kClusterCenterOffset on both the x and y axis.
constexpr int32_t kNumClusters{5};
constexpr double kClusterSpacing{200.0};
constexpr double kClusterCenterOffset{100.0};

// Clustered points are displaced from their cluster center by
// (random % kClusterSpreadRange) - kClusterSpreadHalf on each axis.
constexpr int32_t kClusterSpreadRange{100};
constexpr int32_t kClusterSpreadHalf{50};

// Half of the side length of the axis-aligned square polygons.
constexpr double kPolygonSize{10.0};
63+
64+
// Sizes used to configure the benchmark cases.

// Upper bound on the number of rows placed in a single input batch.
constexpr int32_t kDefaultBatchSize{10000};

// Row counts (probe side / build side) for the registered benchmark cases.
constexpr int32_t kSmallBenchmarkSize{1000};
constexpr int32_t kMediumProbeBenchmarkSize{10000};
constexpr int32_t kMediumBuildBenchmarkSize{1000};
constexpr int32_t kLargeProbeBenchmarkSize{20000};
constexpr int32_t kLargeBuildBenchmarkSize{2000};
71+
72+
/// Parameters for a spatial join benchmark test case.
73+
struct SpatialJoinBenchmarkParams {
74+
/// Number of rows on the probe (left) side.
75+
int32_t probeSize;
76+
77+
/// Number of rows on the build (right) side.
78+
int32_t buildSize;
79+
80+
/// Spatial predicate to use (e.g., "ST_Intersects", "ST_Contains").
81+
std::string predicate;
82+
83+
/// Join type (kInner or kLeft).
84+
core::JoinType joinType;
85+
86+
/// Spatial distribution pattern for geometry generation.
87+
Distribution distribution;
88+
89+
/// Description for benchmark naming.
90+
std::string toString() const {
91+
std::string joinTypeStr =
92+
(joinType == core::JoinType::kInner) ? "Inner" : "Left";
93+
std::string distributionStr =
94+
(distribution == Distribution::kUniform) ? "uniform" : "clustered";
95+
return fmt::format(
96+
"{}x{}_{}_{}_{}",
97+
probeSize,
98+
buildSize,
99+
predicate,
100+
joinTypeStr,
101+
distributionStr);
102+
}
103+
};
104+
105+
class SpatialJoinBenchmark : public facebook::velox::test::VectorTestBase {
106+
public:
107+
/// Seeds the geometry random engine from std::random_device, so generated
/// data is not pinned to a fixed seed.
SpatialJoinBenchmark() : rng_{std::random_device{}()} {}
108+
109+
/// Creates a vector of POINT geometries with specified distribution.
110+
VectorPtr
111+
makePointVector(int32_t size, Distribution distribution, bool nulls = false) {
112+
return makeFlatVector<std::string>(
113+
size,
114+
[&](vector_size_t row) {
115+
if (nulls && (row % kNullPatternModulo == 0)) {
116+
return std::string("");
117+
}
118+
double x, y;
119+
if (distribution == Distribution::kUniform) {
120+
x = (folly::Random::rand32(rng_) % kRandomCoordinateMax) /
121+
kCoordinateScaleDivisor;
122+
y = (folly::Random::rand32(rng_) % kRandomCoordinateMax) /
123+
kCoordinateScaleDivisor;
124+
} else {
125+
int cluster = row % kNumClusters;
126+
double centerX = (cluster * kClusterSpacing) + kClusterCenterOffset;
127+
double centerY = (cluster * kClusterSpacing) + kClusterCenterOffset;
128+
x = centerX +
129+
((folly::Random::rand32(rng_) % kClusterSpreadRange) -
130+
kClusterSpreadHalf);
131+
y = centerY +
132+
((folly::Random::rand32(rng_) % kClusterSpreadRange) -
133+
kClusterSpreadHalf);
134+
}
135+
return fmt::format("POINT ({} {})", x, y);
136+
},
137+
[&](vector_size_t row) {
138+
return nulls && (row % kNullPatternModulo == 0);
139+
});
140+
}
141+
142+
/// Creates a vector of POLYGON geometries with specified distribution.
143+
VectorPtr makePolygonVector(
144+
int32_t size,
145+
Distribution distribution,
146+
bool nulls = false) {
147+
return makeFlatVector<std::string>(
148+
size,
149+
[&](vector_size_t row) {
150+
if (nulls && (row % kNullPatternModulo == 0)) {
151+
return std::string("");
152+
}
153+
double centerX, centerY;
154+
if (distribution == Distribution::kUniform) {
155+
centerX = (folly::Random::rand32(rng_) % kRandomCoordinateMax) /
156+
kCoordinateScaleDivisor;
157+
centerY = (folly::Random::rand32(rng_) % kRandomCoordinateMax) /
158+
kCoordinateScaleDivisor;
159+
} else {
160+
int cluster = row % kNumClusters;
161+
centerX = (cluster * kClusterSpacing) + kClusterCenterOffset;
162+
centerY = (cluster * kClusterSpacing) + kClusterCenterOffset;
163+
}
164+
return fmt::format(
165+
"POLYGON (({} {}, {} {}, {} {}, {} {}, {} {}))",
166+
centerX - kPolygonSize,
167+
centerY - kPolygonSize,
168+
centerX + kPolygonSize,
169+
centerY - kPolygonSize,
170+
centerX + kPolygonSize,
171+
centerY + kPolygonSize,
172+
centerX - kPolygonSize,
173+
centerY + kPolygonSize,
174+
centerX - kPolygonSize,
175+
centerY - kPolygonSize);
176+
},
177+
[&](vector_size_t row) {
178+
return nulls && (row % kNullPatternModulo == 0);
179+
});
180+
}
181+
182+
/// Runs a Values -> Project plan over 'input' and returns a copy of the
/// result. The projection keeps the "<prefix>_id" column and replaces
/// "<prefix>_geom" with ST_GeometryFromText(<prefix>_geom).
///
/// @param prefix Column name prefix, e.g. "probe" or "build".
/// @param input Row vector with columns "<prefix>_id" and "<prefix>_geom".
RowVectorPtr createProjectionVector(
    const std::string& prefix,
    RowVectorPtr input) {
  const std::string idColumn = fmt::format("{}_id", prefix);
  const std::string geometryExpr =
      fmt::format("ST_GeometryFromText({}_geom) AS {}_geom", prefix, prefix);

  PlanBuilder builder(std::make_shared<core::PlanNodeIdGenerator>());
  builder.values({input}).project({idColumn, geometryExpr});

  return AssertQueryBuilder(builder.planNode()).copyResults(pool_.get());
}
196+
197+
/// Creates test data for the specified parameters.
198+
std::pair<std::vector<RowVectorPtr>, std::vector<RowVectorPtr>> makeTestData(
199+
const SpatialJoinBenchmarkParams& params) {
200+
// Create probe side data (points)
201+
std::vector<RowVectorPtr> probeVectors;
202+
const int32_t batchSize = std::min(params.probeSize, kDefaultBatchSize);
203+
const int32_t numBatches = (params.probeSize + batchSize - 1) / batchSize;
204+
205+
for (int32_t i = 0; i < numBatches; ++i) {
206+
int32_t currentBatchSize =
207+
std::min(batchSize, params.probeSize - (i * batchSize));
208+
auto geomVector =
209+
makePointVector(currentBatchSize, params.distribution, false);
210+
auto idVector = makeFlatVector<int64_t>(
211+
currentBatchSize,
212+
[i, batchSize](vector_size_t row) { return (i * batchSize) + row; });
213+
probeVectors.push_back(createProjectionVector(
214+
"probe",
215+
makeRowVector({"probe_id", "probe_geom"}, {idVector, geomVector})));
216+
}
217+
218+
// Create build side data (polygons)
219+
std::vector<RowVectorPtr> buildVectors;
220+
const int32_t buildBatchSize =
221+
std::min(params.buildSize, kDefaultBatchSize);
222+
const int32_t numBuildBatches =
223+
(params.buildSize + buildBatchSize - 1) / buildBatchSize;
224+
225+
for (int32_t i = 0; i < numBuildBatches; ++i) {
226+
int32_t currentBatchSize =
227+
std::min(buildBatchSize, params.buildSize - (i * buildBatchSize));
228+
auto geomVector =
229+
makePolygonVector(currentBatchSize, params.distribution, false);
230+
auto idVector = makeFlatVector<int64_t>(
231+
currentBatchSize, [i, buildBatchSize](vector_size_t row) {
232+
return (i * buildBatchSize) + row;
233+
});
234+
buildVectors.push_back(createProjectionVector(
235+
"build",
236+
makeRowVector({"build_id", "build_geom"}, {idVector, geomVector})));
237+
}
238+
239+
return {probeVectors, buildVectors};
240+
}
241+
242+
/// Creates a spatial join plan with the specified parameters.
243+
std::shared_ptr<const core::PlanNode> makeSpatialJoinPlan(
244+
std::vector<RowVectorPtr>&& probeVectors,
245+
std::vector<RowVectorPtr>&& buildVectors,
246+
const SpatialJoinBenchmarkParams& params) {
247+
const auto planNodeIdGenerator =
248+
std::make_shared<core::PlanNodeIdGenerator>();
249+
return PlanBuilder(planNodeIdGenerator)
250+
.values(probeVectors)
251+
.spatialJoin(
252+
PlanBuilder(planNodeIdGenerator).values(buildVectors).planNode(),
253+
fmt::format("{}(probe_geom, build_geom)", params.predicate),
254+
"probe_geom",
255+
"build_geom",
256+
std::nullopt,
257+
{"probe_id", "probe_geom", "build_id", "build_geom"},
258+
params.joinType)
259+
.planNode();
260+
}
261+
262+
/// Runs a single benchmark iteration.
263+
uint64_t run(
264+
std::shared_ptr<const core::PlanNode> plan,
265+
const SpatialJoinBenchmarkParams& params) {
266+
auto result = AssertQueryBuilder(plan).copyResults(pool_.get());
267+
return result->size();
268+
}
269+
270+
/// Adds a benchmark for the given parameters.
271+
void addBenchmark(const SpatialJoinBenchmarkParams& params) {
272+
auto name = params.toString();
273+
folly::addBenchmark(__FILE__, name, [this, params]() {
274+
std::shared_ptr<const core::PlanNode> plan;
275+
BENCHMARK_SUSPEND {
276+
auto [probeVectors, buildVectors] = makeTestData(params);
277+
plan = makeSpatialJoinPlan(
278+
std::move(probeVectors), std::move(buildVectors), params);
279+
}
280+
281+
run(plan, params);
282+
return 1;
283+
});
284+
}
285+
286+
private:
287+
std::default_random_engine rng_;
288+
};
289+
290+
} // namespace
291+
292+
int main(int argc, char** argv) {
293+
folly::Init init{&argc, &argv};
294+
memory::initializeMemoryManager(memory::MemoryManager::Options{});
295+
parse::registerTypeResolver();
296+
functions::prestosql::registerAllScalarFunctions();
297+
298+
SpatialJoinBenchmark bm;
299+
300+
// Small scale benchmarks (1K x 1K)
301+
bm.addBenchmark(
302+
{kSmallBenchmarkSize,
303+
kSmallBenchmarkSize,
304+
"ST_Intersects",
305+
core::JoinType::kInner,
306+
Distribution::kUniform});
307+
bm.addBenchmark(
308+
{kSmallBenchmarkSize,
309+
kSmallBenchmarkSize,
310+
"ST_Intersects",
311+
core::JoinType::kInner,
312+
Distribution::kClustered});
313+
314+
// Medium scale benchmarks (10K x 1K)
315+
bm.addBenchmark(
316+
{kMediumProbeBenchmarkSize,
317+
kMediumBuildBenchmarkSize,
318+
"ST_Intersects",
319+
core::JoinType::kInner,
320+
Distribution::kUniform});
321+
bm.addBenchmark(
322+
{kMediumProbeBenchmarkSize,
323+
kMediumBuildBenchmarkSize,
324+
"ST_Intersects",
325+
core::JoinType::kInner,
326+
Distribution::kClustered});
327+
328+
// Left join benchmarks (5K x 1K)
329+
bm.addBenchmark(
330+
{kMediumProbeBenchmarkSize / 2,
331+
kMediumBuildBenchmarkSize,
332+
"ST_Intersects",
333+
core::JoinType::kLeft,
334+
Distribution::kUniform});
335+
bm.addBenchmark(
336+
{kMediumProbeBenchmarkSize / 2,
337+
kMediumBuildBenchmarkSize,
338+
"ST_Intersects",
339+
core::JoinType::kLeft,
340+
Distribution::kClustered});
341+
342+
// Contains predicate benchmarks (5K x 1K)
343+
bm.addBenchmark(
344+
{kMediumProbeBenchmarkSize / 2,
345+
kMediumBuildBenchmarkSize,
346+
"ST_Contains",
347+
core::JoinType::kInner,
348+
Distribution::kUniform});
349+
350+
// Large scale benchmark (20K x 2K)
351+
bm.addBenchmark(
352+
{kLargeProbeBenchmarkSize,
353+
kLargeBuildBenchmarkSize,
354+
"ST_Intersects",
355+
core::JoinType::kInner,
356+
Distribution::kUniform});
357+
358+
folly::runBenchmarks();
359+
return 0;
360+
}

0 commit comments

Comments
 (0)