From 3702c45d72fca5bcbb5b2a8b043804052797d740 Mon Sep 17 00:00:00 2001
From: dshkol <shkolnikd@gmail.com>
Date: Mon, 10 Nov 2025 21:25:27 -0800
Subject: [PATCH 1/2] Phase 2: Optimize gh_covering performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminated redundant encode-decode cycle in gh_covering by decoding
geohashes once and building spatial polygons directly.

Before: Grid → encode → gh_to_sf → gh_to_spdf → gh_to_sp → decode → polygons
After: Grid → encode → decode → polygons (direct)

Performance improvements (100 points, 0.5° spread):
- Precision 4: 2.17x faster (10.9ms → 5.0ms)
- Precision 5: 2.41x faster (15.9ms → 6.6ms)
- Precision 6: 3.13x faster (180.5ms → 57.8ms)
- Precision 7: 3.07x faster (5528.6ms → 1802.5ms)

Median speedup: ~2.7x across typical use cases

All existing tests pass - no breaking changes to API or behavior.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 NEWS.md                               |   6 ++
 R/gis_tools.R                         |  25 ++++-
 benchmarks/gh_covering_bench_simple.R | 111 ++++++++++++++++++++++
 benchmarks/gh_covering_benchmark.R    |  88 +++++++++++++++++
 benchmarks/gh_covering_comparison.R   | 132 ++++++++++++++++++++++++++
 5 files changed, 359 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/gh_covering_bench_simple.R
 create mode 100644 benchmarks/gh_covering_benchmark.R
 create mode 100644 benchmarks/gh_covering_comparison.R

diff --git a/NEWS.md b/NEWS.md
index f7c6c8c..65ca7ad 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # `geohashTools` NEWS
 
+## v0.3.4 (Development)
+
+### PERFORMANCE
+
+ 1. Optimized `gh_covering` by eliminating redundant encode-decode cycle. The function now decodes geohashes once and builds spatial polygons directly, rather than going through `gh_to_sf` → `gh_to_spdf` → `gh_to_sp` → `gh_decode`. Benchmarks show 2-3× speedup across typical use cases, with larger improvements for higher precision values.
+
 ## v0.3.3
 
 Drop references to deprecated rgdal.
diff --git a/R/gis_tools.R b/R/gis_tools.R
index 501b5b1..8c297b7 100644
--- a/R/gis_tools.R
+++ b/R/gis_tools.R
@@ -73,14 +73,33 @@ gh_covering = function(SP, precision = 6L, minimal = FALSE) {
     SP$id = rownames(SP@data)
   bb = sp::bbox(SP)
   delta = 2.0 * gh_delta(precision)
-  # TODO: actually goes through an encode-decode cycle -- more efficient to
-  #   just build the cells directly by rounding to the precision's grid
+
+  # Build grid and encode to geohashes
   gh = with(expand.grid(
     latitude = seq(bb[2L, 'min'], bb[2L, 'max'] + delta[1L], by = delta[1L]),
     longitude = seq(bb[1L, 'min'], bb[1L, 'max'] + delta[2L], by = delta[2L])
   ), gh_encode(latitude, longitude, precision))
+
+  # Optimization: decode once and build polygons directly
+  # instead of going through gh_to_sf -> gh_to_spdf -> gh_to_sp -> gh_decode
+  gh_xy = gh_decode(gh, include_delta = TRUE)
+
+  # Build SpatialPolygons directly from decoded coordinates
+  cover_sp = sp::SpatialPolygons(lapply(seq_along(gh), function(ii) {
+    sp::Polygons(list(sp::Polygon(cbind(
+      # the four corners of the current geohash
+      gh_xy$longitude[ii] + c(-1.0, -1.0, 1.0, 1.0, -1.0) * gh_xy$delta_longitude[ii],
+      gh_xy$latitude[ii] + c(-1.0, 1.0, 1.0, -1.0, -1.0) * gh_xy$delta_latitude[ii]
+    ))), ID = gh[ii])
+  }), proj4string = wgs())
+
+  # Convert to SPDF
+  cover = sp::SpatialPolygonsDataFrame(
+    cover_sp,
+    data = data.frame(row.names = gh, ID = seq_along(gh))
+  )
+
   if (is.na(prj4 <- sp::proj4string(SP))) sp::proj4string(SP) = (prj4 <- wgs())
-  cover = methods::as(gh_to_sf(gh), 'Spatial')
   sp::proj4string(cover) = prj4
   if (minimal) {
     # slightly more efficient to use rgeos, but there's a bug preventing
diff --git a/benchmarks/gh_covering_bench_simple.R b/benchmarks/gh_covering_bench_simple.R
new file mode 100644
index 0000000..2f2dd1b
--- /dev/null
+++ b/benchmarks/gh_covering_bench_simple.R
@@ -0,0 +1,111 @@
+# Simple benchmark comparing old vs new gh_covering
+# Loads both versions and compares directly
+
+library(microbenchmark)
+library(ggplot2)
+library(sp)
+
+# Define old version (before optimization)
+gh_covering_old = function(SP, precision = 6L, minimal = FALSE) {
+  if (sf_input <- inherits(SP, 'sf')) {
+    SP = sf::as_Spatial(SP)
+  }
+  if (!inherits(SP, 'Spatial'))
+    stop('Object to cover must be Spatial (or subclass)')
+  if (inherits(SP, 'SpatialPointsDataFrame') && !NCOL(SP))
+    SP$id = rownames(SP@data)
+  bb = sp::bbox(SP)
+  delta = 2.0 * geohashTools::gh_delta(precision)
+
+  # OLD: goes through encode-decode cycle
+  gh = with(expand.grid(
+    latitude = seq(bb[2L, 'min'], bb[2L, 'max'] + delta[1L], by = delta[1L]),
+    longitude = seq(bb[1L, 'min'], bb[1L, 'max'] + delta[2L], by = delta[2L])
+  ), geohashTools::gh_encode(latitude, longitude, precision))
+
+  wgs_crs = sp::CRS('+proj=longlat +datum=WGS84', doCheckCRSArgs = FALSE)
+  if (is.na(prj4 <- sp::proj4string(SP))) sp::proj4string(SP) = (prj4 <- wgs_crs)
+
+  # OLD: calls gh_to_sf which calls gh_to_spdf which calls gh_to_sp which decodes
+  cover = methods::as(geohashTools::gh_to_sf(gh), 'Spatial')
+  sp::proj4string(cover) = prj4
+
+  if (minimal) {
+    n_in_cover = vapply(sp::over(cover, SP, returnList=TRUE), NROW, integer(1L))
+    cover = cover[which(n_in_cover > 0L), ]
+    sp::proj4string(cover) = prj4
+  }
+  return(if (sf_input) sf::st_as_sf(cover) else cover)
+}
+
+# Load new version
+devtools::load_all(".", quiet = TRUE)
+
+# Test data
+set.seed(123)
+test_points <- sp::SpatialPoints(cbind(
+  runif(100, 114.5, 115.0),
+  runif(100, -3.4, -3.2)
+))
+
+cat("Benchmarking gh_covering: Old vs New\n")
+cat("Test data: 100 points, 0.5 degree spread\n\n")
+
+# Test different precisions
+precisions <- c(4, 5, 6, 7, 8)
+results <- list()
+
+for (prec in precisions) {
+  cat(sprintf("Precision %d...\n", prec))
+
+  bm <- microbenchmark(
+    old = gh_covering_old(test_points, precision = prec),
+    new = gh_covering(test_points, precision = prec),
+    times = 10L
+  )
+
+  results[[as.character(prec)]] <- data.frame(
+    precision = prec,
+    method = c("old", "new"),
+    median_ms = c(
+      median(bm$time[bm$expr == "old"]) / 1e6,
+      median(bm$time[bm$expr == "new"]) / 1e6
+    )
+  )
+
+  speedup <- results[[as.character(prec)]]$median_ms[1] /
+             results[[as.character(prec)]]$median_ms[2]
+  cat(sprintf("  Old: %.1f ms, New: %.1f ms, Speedup: %.2fx\n\n",
+              results[[as.character(prec)]]$median_ms[1],
+              results[[as.character(prec)]]$median_ms[2],
+              speedup))
+}
+
+results_df <- do.call(rbind, results)
+results_df$speedup <- with(results_df[results_df$method == "old", ],
+                             median_ms) / results_df$median_ms[results_df$method == "new"]
+
+# Create plot
+p <- ggplot(results_df, aes(x = precision, y = median_ms, color = method)) +
+  geom_line(size = 1) +
+  geom_point(size = 3) +
+  scale_y_log10() +
+  labs(
+    title = "gh_covering Performance Comparison",
+    subtitle = "100 random points, 0.5° spread",
+    x = "Precision",
+    y = "Median Time (ms, log scale)",
+    color = "Method"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "top")
+
+ggsave("benchmarks/gh_covering_simple_comparison.png", p, width = 8, height = 6, dpi = 150)
+cat("\nSaved plot to benchmarks/gh_covering_simple_comparison.png\n")
+
+# Print summary
+cat("\n=== Summary ===\n")
+print(results_df)
+
+cat(sprintf("\nMedian speedup across all precisions: %.2fx\n",
+            median(results_df$speedup[results_df$method == "new"])))
diff --git a/benchmarks/gh_covering_benchmark.R b/benchmarks/gh_covering_benchmark.R
new file mode 100644
index 0000000..5e0527a
--- /dev/null
+++ b/benchmarks/gh_covering_benchmark.R
@@ -0,0 +1,88 @@
+# Benchmark script for gh_covering optimization
+# Compares current implementation vs. optimized direct grid generation
+
+# Load package from source
+devtools::load_all(".", quiet = TRUE)
+
+library(microbenchmark)
+library(ggplot2)
+library(sp)
+
+# Create test data of varying sizes
+create_test_points <- function(n_points = 100, spread = 1.0) {
+  sp::SpatialPoints(cbind(
+    runif(n_points, 114.5, 114.5 + spread),
+    runif(n_points, -3.4, -3.4 + spread)
+  ))
+}
+
+# Test different grid sizes by varying point spread and precision
+test_cases <- expand.grid(
+  n_points = c(10, 100),
+  spread = c(0.1, 0.5, 1.0),  # degrees - affects grid size
+  precision = c(4, 6, 8, 10),
+  stringsAsFactors = FALSE
+)
+
+cat("Running benchmarks for gh_covering...\n")
+cat("Test cases:", nrow(test_cases), "\n\n")
+
+results <- vector("list", nrow(test_cases))
+
+for (i in seq_len(nrow(test_cases))) {
+  tc <- test_cases[i, ]
+  cat(sprintf("Test %d/%d: n_points=%d, spread=%.1f, precision=%d\n",
+              i, nrow(test_cases), tc$n_points, tc$spread, tc$precision))
+
+  test_points <- create_test_points(tc$n_points, tc$spread)
+
+  # Run benchmark
+  bm <- microbenchmark(
+    current = gh_covering(test_points, precision = tc$precision),
+    times = 20L,
+    unit = "ms"
+  )
+
+  results[[i]] <- data.frame(
+    n_points = tc$n_points,
+    spread = tc$spread,
+    precision = tc$precision,
+    median_ms = median(bm$time) / 1e6,
+    mean_ms = mean(bm$time) / 1e6,
+    min_ms = min(bm$time) / 1e6,
+    max_ms = max(bm$time) / 1e6
+  )
+
+  cat(sprintf("  Median: %.2f ms\n\n", results[[i]]$median_ms))
+}
+
+results_df <- do.call(rbind, results)
+
+# Save results
+saveRDS(results_df, "benchmarks/gh_covering_baseline_results.rds")
+cat("Saved baseline results to benchmarks/gh_covering_baseline_results.rds\n")
+
+# Create visualization
+p <- ggplot(results_df, aes(x = precision, y = median_ms,
+                             color = factor(spread),
+                             shape = factor(n_points))) +
+  geom_line() +
+  geom_point(size = 3) +
+  scale_y_log10() +
+  labs(
+    title = "gh_covering Performance (Baseline)",
+    subtitle = "Current encode-decode implementation",
+    x = "Precision",
+    y = "Median Time (ms, log scale)",
+    color = "Spread (degrees)",
+    shape = "Points"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "right")
+
+ggsave("benchmarks/gh_covering_baseline.png", p, width = 10, height = 6, dpi = 150)
+cat("Saved plot to benchmarks/gh_covering_baseline.png\n")
+
+# Print summary
+cat("\n=== Summary Statistics ===\n")
+print(results_df[order(results_df$median_ms, decreasing = TRUE), ])
diff --git a/benchmarks/gh_covering_comparison.R b/benchmarks/gh_covering_comparison.R
new file mode 100644
index 0000000..76d799c
--- /dev/null
+++ b/benchmarks/gh_covering_comparison.R
@@ -0,0 +1,132 @@
+# Comparison benchmark for gh_covering optimization
+# Tests optimized version vs results from baseline
+
+library(microbenchmark)
+library(ggplot2)
+library(sp)
+
+# Load the optimized version
+devtools::load_all(".", quiet = TRUE)
+
+# Create test data
+create_test_points <- function(n_points = 100, spread = 1.0) {
+  sp::SpatialPoints(cbind(
+    runif(n_points, 114.5, 114.5 + spread),
+    runif(n_points, -3.4, -3.4 + spread)
+  ))
+}
+
+# Smaller, faster test suite for comparison
+test_cases <- expand.grid(
+  n_points = c(10, 100),
+  spread = c(0.1, 0.5, 1.0),
+  precision = c(4, 6, 8),
+  stringsAsFactors = FALSE
+)
+
+cat("Running comparison benchmarks...\n")
+cat("Testing optimized gh_covering implementation\n")
+cat("Test cases:", nrow(test_cases), "\n\n")
+
+results <- vector("list", nrow(test_cases))
+
+for (i in seq_len(nrow(test_cases))) {
+  tc <- test_cases[i, ]
+  cat(sprintf("Test %d/%d: n_points=%d, spread=%.1f, precision=%d\n",
+              i, nrow(test_cases), tc$n_points, tc$spread, tc$precision))
+
+  test_points <- create_test_points(tc$n_points, tc$spread)
+
+  # Run benchmark on optimized version
+  bm <- microbenchmark(
+    optimized = gh_covering(test_points, precision = tc$precision),
+    times = 20L,
+    unit = "ms"
+  )
+
+  results[[i]] <- data.frame(
+    n_points = tc$n_points,
+    spread = tc$spread,
+    precision = tc$precision,
+    median_ms = median(bm$time) / 1e6,
+    mean_ms = mean(bm$time) / 1e6,
+    min_ms = min(bm$time) / 1e6,
+    max_ms = max(bm$time) / 1e6
+  )
+
+  cat(sprintf("  Median: %.2f ms\n\n", results[[i]]$median_ms))
+}
+
+optimized_df <- do.call(rbind, results)
+
+# Save optimized results
+saveRDS(optimized_df, "benchmarks/gh_covering_optimized_results.rds")
+cat("\nSaved optimized results to benchmarks/gh_covering_optimized_results.rds\n")
+
+# Load baseline if it exists and compare
+if (file.exists("benchmarks/gh_covering_baseline_results.rds")) {
+  baseline_df <- readRDS("benchmarks/gh_covering_baseline_results.rds")
+
+  # Merge results
+  baseline_df$version <- "baseline"
+  optimized_df$version <- "optimized"
+  combined_df <- rbind(baseline_df[, names(optimized_df)], optimized_df)
+
+  # Calculate speedup
+  comparison <- merge(
+    baseline_df[, c("n_points", "spread", "precision", "median_ms")],
+    optimized_df[, c("n_points", "spread", "precision", "median_ms")],
+    by = c("n_points", "spread", "precision"),
+    suffixes = c("_baseline", "_optimized")
+  )
+  comparison$speedup <- comparison$median_ms_baseline / comparison$median_ms_optimized
+
+  cat("\n=== Performance Comparison ===\n")
+  print(comparison[order(-comparison$speedup), ])
+
+  cat(sprintf("\nMedian speedup: %.2fx\n", median(comparison$speedup)))
+  cat(sprintf("Mean speedup: %.2fx\n", mean(comparison$speedup)))
+  cat(sprintf("Max speedup: %.2fx\n", max(comparison$speedup)))
+
+  # Create comparison plot
+  p <- ggplot(combined_df, aes(x = precision, y = median_ms,
+                                color = version,
+                                linetype = factor(spread))) +
+    geom_line() +
+    geom_point(size = 2) +
+    facet_wrap(~n_points, labeller = label_both) +
+    scale_y_log10() +
+    labs(
+      title = "gh_covering Performance: Baseline vs Optimized",
+      x = "Precision",
+      y = "Median Time (ms, log scale)",
+      color = "Version",
+      linetype = "Spread (degrees)"
+    ) +
+    theme_minimal() +
+    theme(legend.position = "bottom")
+
+  ggsave("benchmarks/gh_covering_comparison.png", p, width = 12, height = 6, dpi = 150)
+  cat("\nSaved comparison plot to benchmarks/gh_covering_comparison.png\n")
+
+  # Speedup plot
+  p2 <- ggplot(comparison, aes(x = precision, y = speedup,
+                                color = factor(spread),
+                                shape = factor(n_points))) +
+    geom_line() +
+    geom_point(size = 3) +
+    geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
+    labs(
+      title = "gh_covering Speedup: Optimized vs Baseline",
+      subtitle = "Values > 1.0 indicate optimized is faster",
+      x = "Precision",
+      y = "Speedup Factor",
+      color = "Spread (degrees)",
+      shape = "Points"
+    ) +
+    theme_minimal() +
+    theme(legend.position = "right")
+
+  ggsave("benchmarks/gh_covering_speedup.png", p2, width = 10, height = 6, dpi = 150)
+  cat("Saved speedup plot to benchmarks/gh_covering_speedup.png\n")
+}

From a8e48dbb01c45c6f9fd25c64c1941793d580027d Mon Sep 17 00:00:00 2001
From: dshkol <shkolnikd@gmail.com>
Date: Mon, 10 Nov 2025 22:06:32 -0800
Subject: [PATCH 2/2] Add benchmarks to .Rbuildignore

---
 .Rbuildignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.Rbuildignore b/.Rbuildignore
index 81fcff8..b3b4933 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -12,6 +12,7 @@
 # CI & codecov-related
 ^\.travis\.yml$
 ^\.lintr$
+^benchmarks$
 
 ^logo_maker.R$
 ^_pkgdown\.yml$