From 3702c45d72fca5bcbb5b2a8b043804052797d740 Mon Sep 17 00:00:00 2001 From: dshkol Date: Mon, 10 Nov 2025 21:25:27 -0800 Subject: [PATCH 1/2] Phase 2: Optimize gh_covering performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminated redundant encode-decode cycle in gh_covering by decoding geohashes once and building spatial polygons directly. Before: Grid → encode → gh_to_sf → gh_to_spdf → gh_to_sp → decode → polygons After: Grid → encode → decode → polygons (direct) Performance improvements (100 points, 0.5° spread): - Precision 4: 2.17x faster (10.9ms → 5.0ms) - Precision 5: 2.41x faster (15.9ms → 6.6ms) - Precision 6: 3.13x faster (180.5ms → 57.8ms) - Precision 7: 3.07x faster (5528.6ms → 1802.5ms) Median speedup: ~2.7x across typical use cases All existing tests pass - no breaking changes to API or behavior. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- NEWS.md | 6 ++ R/gis_tools.R | 25 ++++- benchmarks/gh_covering_bench_simple.R | 111 ++++++++++++++++++++++ benchmarks/gh_covering_benchmark.R | 88 +++++++++++++++++ benchmarks/gh_covering_comparison.R | 132 ++++++++++++++++++++++++++ 5 files changed, 359 insertions(+), 3 deletions(-) create mode 100644 benchmarks/gh_covering_bench_simple.R create mode 100644 benchmarks/gh_covering_benchmark.R create mode 100644 benchmarks/gh_covering_comparison.R diff --git a/NEWS.md b/NEWS.md index f7c6c8c..65ca7ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # `geohashTools` NEWS +## v0.3.4 (Development) + +### PERFORMANCE + + 1. Optimized `gh_covering` by eliminating redundant encode-decode cycle. The function now decodes geohashes once and builds spatial polygons directly, rather than going through `gh_to_sf` → `gh_to_spdf` → `gh_to_sp` → `gh_decode`. Benchmarks show 2-3× speedup across typical use cases, with larger improvements for higher precision values. + ## v0.3.3 Drop references to deprecated rgdal. diff --git a/R/gis_tools.R b/R/gis_tools.R index 501b5b1..8c297b7 100644 --- a/R/gis_tools.R +++ b/R/gis_tools.R @@ -73,14 +73,33 @@ gh_covering = function(SP, precision = 6L, minimal = FALSE) { SP$id = rownames(SP@data) bb = sp::bbox(SP) delta = 2.0 * gh_delta(precision) - # TODO: actually goes through an encode-decode cycle -- more efficient to - # just build the cells directly by rounding to the precision's grid + + # Build grid and encode to geohashes gh = with(expand.grid( latitude = seq(bb[2L, 'min'], bb[2L, 'max'] + delta[1L], by = delta[1L]), longitude = seq(bb[1L, 'min'], bb[1L, 'max'] + delta[2L], by = delta[2L]) ), gh_encode(latitude, longitude, precision)) + + # Optimization: decode once and build polygons directly + # instead of going through gh_to_sf -> gh_to_spdf -> gh_to_sp -> gh_decode + gh_xy = gh_decode(gh, include_delta = TRUE) + + # Build SpatialPolygons directly from decoded coordinates + cover_sp = sp::SpatialPolygons(lapply(seq_along(gh), function(ii) { + sp::Polygons(list(sp::Polygon(cbind( + # the four corners of the current geohash + gh_xy$longitude[ii] + c(-1.0, -1.0, 1.0, 1.0, -1.0) * gh_xy$delta_longitude[ii], + gh_xy$latitude[ii] + c(-1.0, 1.0, 1.0, -1.0, -1.0) * gh_xy$delta_latitude[ii] + ))), ID = gh[ii]) + }), proj4string = wgs()) + + # Convert to SPDF + cover = sp::SpatialPolygonsDataFrame( + cover_sp, + data = data.frame(row.names = gh, ID = seq_along(gh)) + ) + if (is.na(prj4 <- sp::proj4string(SP))) sp::proj4string(SP) = (prj4 <- wgs()) - cover = methods::as(gh_to_sf(gh), 'Spatial') sp::proj4string(cover) = prj4 if (minimal) { # slightly more efficient to use rgeos, but there's a bug preventing diff --git a/benchmarks/gh_covering_bench_simple.R b/benchmarks/gh_covering_bench_simple.R new file mode 100644 index 0000000..2f2dd1b --- /dev/null +++ b/benchmarks/gh_covering_bench_simple.R @@ -0,0 +1,111 @@ +# Simple benchmark comparing old vs new gh_covering +# Loads both versions and compares directly + +library(microbenchmark) +library(ggplot2) +library(sp) + +# Define old version (before optimization) +gh_covering_old = function(SP, precision = 6L, minimal = FALSE) { + if (sf_input <- inherits(SP, 'sf')) { + SP = sf::as_Spatial(SP) + } + if (!inherits(SP, 'Spatial')) + stop('Object to cover must be Spatial (or subclass)') + if (inherits(SP, 'SpatialPointsDataFrame') && !NCOL(SP)) + SP$id = rownames(SP@data) + bb = sp::bbox(SP) + delta = 2.0 * geohashTools::gh_delta(precision) + + # OLD: goes through encode-decode cycle + gh = with(expand.grid( + latitude = seq(bb[2L, 'min'], bb[2L, 'max'] + delta[1L], by = delta[1L]), + longitude = seq(bb[1L, 'min'], bb[1L, 'max'] + delta[2L], by = delta[2L]) + ), geohashTools::gh_encode(latitude, longitude, precision)) + + wgs_crs = sp::CRS('+proj=longlat +datum=WGS84', doCheckCRSArgs = FALSE) + if (is.na(prj4 <- sp::proj4string(SP))) sp::proj4string(SP) = (prj4 <- wgs_crs) + + # OLD: calls gh_to_sf which calls gh_to_spdf which calls gh_to_sp which decodes + cover = methods::as(geohashTools::gh_to_sf(gh), 'Spatial') + sp::proj4string(cover) = prj4 + + if (minimal) { + n_in_cover = vapply(sp::over(cover, SP, returnList=TRUE), NROW, integer(1L)) + cover = cover[which(n_in_cover > 0L), ] + sp::proj4string(cover) = prj4 + } + return(if (sf_input) sf::st_as_sf(cover) else cover) +} + +# Load new version +devtools::load_all(".", quiet = TRUE) + +# Test data +set.seed(123) +test_points <- sp::SpatialPoints(cbind( + runif(100, 114.5, 115.0), + runif(100, -3.4, -3.2) +)) + +cat("Benchmarking gh_covering: Old vs New\n") +cat("Test data: 100 points, 0.5 degree spread\n\n") + +# Test different precisions +precisions <- c(4, 5, 6, 7, 8) +results <- list() + +for (prec in precisions) { + cat(sprintf("Precision %d...\n", prec)) + + bm <- microbenchmark( + old = gh_covering_old(test_points, precision = prec), + new = gh_covering(test_points, precision = prec), + times = 10L + ) + + results[[as.character(prec)]] <- data.frame( + precision = prec, + method = c("old", "new"), + median_ms = c( + median(bm$time[bm$expr == "old"]) / 1e6, + median(bm$time[bm$expr == "new"]) / 1e6 + ) + ) + + speedup <- results[[as.character(prec)]]$median_ms[1] / + results[[as.character(prec)]]$median_ms[2] + cat(sprintf(" Old: %.1f ms, New: %.1f ms, Speedup: %.2fx\n\n", + results[[as.character(prec)]]$median_ms[1], + results[[as.character(prec)]]$median_ms[2], + speedup)) +} + +results_df <- do.call(rbind, results) +results_df$speedup <- with(results_df[results_df$method == "old", ], + median_ms) / results_df$median_ms[results_df$method == "new"] + +# Create plot +p <- ggplot(results_df, aes(x = precision, y = median_ms, color = method)) + + geom_line(size = 1) + + geom_point(size = 3) + + scale_y_log10() + + labs( + title = "gh_covering Performance Comparison", + subtitle = "100 random points, 0.5° spread", + x = "Precision", + y = "Median Time (ms, log scale)", + color = "Method" + ) + + theme_minimal() + + theme(legend.position = "top") + +ggsave("benchmarks/gh_covering_simple_comparison.png", p, width = 8, height = 6, dpi = 150) +cat("\nSaved plot to benchmarks/gh_covering_simple_comparison.png\n") + +# Print summary +cat("\n=== Summary ===\n") +print(results_df) + +cat(sprintf("\nMedian speedup across all precisions: %.2fx\n", + median(results_df$speedup[results_df$method == "new"]))) diff --git a/benchmarks/gh_covering_benchmark.R b/benchmarks/gh_covering_benchmark.R new file mode 100644 index 0000000..5e0527a --- /dev/null +++ b/benchmarks/gh_covering_benchmark.R @@ -0,0 +1,88 @@ +# Benchmark script for gh_covering optimization +# Compares current implementation vs. optimized direct grid generation + +# Load package from source +devtools::load_all(".", quiet = TRUE) + +library(microbenchmark) +library(ggplot2) +library(sp) + +# Create test data of varying sizes +create_test_points <- function(n_points = 100, spread = 1.0) { + sp::SpatialPoints(cbind( + runif(n_points, 114.5, 114.5 + spread), + runif(n_points, -3.4, -3.4 + spread) + )) +} + +# Test different grid sizes by varying point spread and precision +test_cases <- expand.grid( + n_points = c(10, 100), + spread = c(0.1, 0.5, 1.0), # degrees - affects grid size + precision = c(4, 6, 8, 10), + stringsAsFactors = FALSE +) + +cat("Running benchmarks for gh_covering...\n") +cat("Test cases:", nrow(test_cases), "\n\n") + +results <- vector("list", nrow(test_cases)) + +for (i in seq_len(nrow(test_cases))) { + tc <- test_cases[i, ] + cat(sprintf("Test %d/%d: n_points=%d, spread=%.1f, precision=%d\n", + i, nrow(test_cases), tc$n_points, tc$spread, tc$precision)) + + test_points <- create_test_points(tc$n_points, tc$spread) + + # Run benchmark + bm <- microbenchmark( + current = gh_covering(test_points, precision = tc$precision), + times = 20L, + unit = "ms" + ) + + results[[i]] <- data.frame( + n_points = tc$n_points, + spread = tc$spread, + precision = tc$precision, + median_ms = median(bm$time) / 1e6, + mean_ms = mean(bm$time) / 1e6, + min_ms = min(bm$time) / 1e6, + max_ms = max(bm$time) / 1e6 + ) + + cat(sprintf(" Median: %.2f ms\n\n", results[[i]]$median_ms)) +} + +results_df <- do.call(rbind, results) + +# Save results +saveRDS(results_df, "benchmarks/gh_covering_baseline_results.rds") +cat("Saved baseline results to benchmarks/gh_covering_baseline_results.rds\n") + +# Create visualization +p <- ggplot(results_df, aes(x = precision, y = median_ms, + color = factor(spread), + shape = factor(n_points))) + + geom_line() + + geom_point(size = 3) + + scale_y_log10() + + labs( + title = "gh_covering Performance (Baseline)", + subtitle = "Current encode-decode implementation", + x = "Precision", + y = "Median Time (ms, log scale)", + color = "Spread (degrees)", + shape = "Points" + ) + + theme_minimal() + + theme(legend.position = "right") + +ggsave("benchmarks/gh_covering_baseline.png", p, width = 10, height = 6, dpi = 150) +cat("Saved plot to benchmarks/gh_covering_baseline.png\n") + +# Print summary +cat("\n=== Summary Statistics ===\n") +print(results_df[order(results_df$median_ms, decreasing = TRUE), ]) diff --git a/benchmarks/gh_covering_comparison.R b/benchmarks/gh_covering_comparison.R new file mode 100644 index 0000000..76d799c --- /dev/null +++ b/benchmarks/gh_covering_comparison.R @@ -0,0 +1,132 @@ +# Comparison benchmark for gh_covering optimization +# Tests optimized version vs results from baseline + +library(microbenchmark) +library(ggplot2) +library(sp) + +# Load the optimized version +devtools::load_all(".", quiet = TRUE) + +# Create test data +create_test_points <- function(n_points = 100, spread = 1.0) { + sp::SpatialPoints(cbind( + runif(n_points, 114.5, 114.5 + spread), + runif(n_points, -3.4, -3.4 + spread) + )) +} + +# Smaller, faster test suite for comparison +test_cases <- expand.grid( + n_points = c(10, 100), + spread = c(0.1, 0.5, 1.0), + precision = c(4, 6, 8), + stringsAsFactors = FALSE +) + +cat("Running comparison benchmarks...\n") +cat("Testing optimized gh_covering implementation\n") +cat("Test cases:", nrow(test_cases), "\n\n") + +results <- vector("list", nrow(test_cases)) + +for (i in seq_len(nrow(test_cases))) { + tc <- test_cases[i, ] + cat(sprintf("Test %d/%d: n_points=%d, spread=%.1f, precision=%d\n", + i, nrow(test_cases), tc$n_points, tc$spread, tc$precision)) + + test_points <- create_test_points(tc$n_points, tc$spread) + + # Run benchmark on optimized version + bm <- microbenchmark( + optimized = gh_covering(test_points, precision = tc$precision), + times = 20L, + unit = "ms" + ) + + results[[i]] <- data.frame( + n_points = tc$n_points, + spread = tc$spread, + precision = tc$precision, + median_ms = median(bm$time) / 1e6, + mean_ms = mean(bm$time) / 1e6, + min_ms = min(bm$time) / 1e6, + max_ms = max(bm$time) / 1e6 + ) + + cat(sprintf(" Median: %.2f ms\n\n", results[[i]]$median_ms)) +} + +optimized_df <- do.call(rbind, results) + +# Save optimized results +saveRDS(optimized_df, "benchmarks/gh_covering_optimized_results.rds") +cat("\nSaved optimized results to benchmarks/gh_covering_optimized_results.rds\n") + +# Load baseline if it exists and compare +if (file.exists("benchmarks/gh_covering_baseline_results.rds")) { + baseline_df <- readRDS("benchmarks/gh_covering_baseline_results.rds") + + # Merge results + baseline_df$version <- "baseline" + optimized_df$version <- "optimized" + combined_df <- rbind(baseline_df[, names(optimized_df)], optimized_df) + + # Calculate speedup + comparison <- merge( + baseline_df[, c("n_points", "spread", "precision", "median_ms")], + optimized_df[, c("n_points", "spread", "precision", "median_ms")], + by = c("n_points", "spread", "precision"), + suffixes = c("_baseline", "_optimized") + ) + comparison$speedup <- comparison$median_ms_baseline / comparison$median_ms_optimized + + cat("\n=== Performance Comparison ===\n") + print(comparison[order(-comparison$speedup), ]) + + cat(sprintf("\nMedian speedup: %.2fx\n", median(comparison$speedup))) + cat(sprintf("Mean speedup: %.2fx\n", mean(comparison$speedup))) + cat(sprintf("Max speedup: %.2fx\n", max(comparison$speedup))) + + # Create comparison plot + p <- ggplot(combined_df, aes(x = precision, y = median_ms, + color = version, + linetype = factor(spread))) + + geom_line() + + geom_point(size = 2) + + facet_wrap(~n_points, labeller = label_both) + + scale_y_log10() + + labs( + title = "gh_covering Performance: Baseline vs Optimized", + x = "Precision", + y = "Median Time (ms, log scale)", + color = "Version", + linetype = "Spread (degrees)" + ) + + theme_minimal() + + theme(legend.position = "bottom") + + ggsave("benchmarks/gh_covering_comparison.png", p, width = 12, height = 6, dpi = 150) + cat("\nSaved comparison plot to benchmarks/gh_covering_comparison.png\n") + + # Speedup plot + p2 <- ggplot(comparison, aes(x = precision, y = speedup, + color = factor(spread), + shape = factor(n_points))) + + geom_line() + + geom_point(size = 3) + + geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") + + labs( + title = "gh_covering Speedup: Optimized vs Baseline", + subtitle = "Values > 1.0 indicate optimized is faster", + x = "Precision", + y = "Speedup Factor", + color = "Spread (degrees)", + shape = "Points" + ) + + theme_minimal() + + theme(legend.position = "right") + + ggsave("benchmarks/gh_covering_speedup.png", p2, width = 10, height = 6, dpi = 150) + cat("Saved speedup plot to benchmarks/gh_covering_speedup.png\n") +} From a8e48dbb01c45c6f9fd25c64c1941793d580027d Mon Sep 17 00:00:00 2001 From: dshkol Date: Mon, 10 Nov 2025 22:06:32 -0800 Subject: [PATCH 2/2] Add benchmarks to .Rbuildignore --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 81fcff8..b3b4933 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,6 +12,7 @@ # CI & codecov-related ^\.travis\.yml$ ^\.lintr$ +^benchmarks$ ^logo_maker.R$ ^_pkgdown\.yml$