MichaelChirico · dshkol · Nov 11, 2025 · Nov 11, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -12,6 +12,7 @@
 # CI & codecov-related
 ^\.travis\.yml$
 ^\.lintr$
+^benchmarks$
 
 ^logo_maker.R$
 ^_pkgdown\.yml$

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,11 @@
 # `geohashTools` NEWS
 
+## v0.3.4 (Development)
+
+### PERFORMANCE
+
+ 1. Optimized duplicate detection in `gh_to_sp`, `gh_to_spdf.default`, and `gh_to_spdf.data.frame` by using single-pass algorithm instead of double-scan (`anyDuplicated` + `duplicated`). Benchmarks show 1.25-1.76× speedup depending on input size and duplicate ratio, with median improvement of ~1.44×.
+
 ## v0.3.3
 
 Drop references to deprecated rgdal.

diff --git a/R/gis_tools.R b/R/gis_tools.R
@@ -12,10 +12,10 @@ check_suggested = function(pkg) {
 gh_to_sp = function(geohashes) {
   check_suggested('sp')
   gh = tolower(geohashes)
-  if (anyDuplicated(gh) > 0L) {
-    idx = which(duplicated(gh))
-    warning('Detected ', length(idx), ' duplicate input geohashes; removing')
-    gh = gh[-idx]
+  dup_idx = duplicated(gh)
+  if (any(dup_idx)) {
+    warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
+    gh = gh[!dup_idx]
   }
   gh_xy = gh_decode(gh, include_delta = TRUE)
   sp::SpatialPolygons(lapply(seq_along(gh), function(ii) {
@@ -34,10 +34,10 @@ gh_to_spdf = function(...) {
 }
 
 gh_to_spdf.default = function(geohashes, ...) {
-  if (anyDuplicated(geohashes) > 0L) {
-    idx = which(duplicated(geohashes))
-    warning('Detected ', length(idx), ' duplicate input geohashes; removing')
-    geohashes = geohashes[-idx]
+  dup_idx = duplicated(geohashes)
+  if (any(dup_idx)) {
+    warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
+    geohashes = geohashes[!dup_idx]
   }
   sp::SpatialPolygonsDataFrame(
     gh_to_sp(geohashes),
@@ -49,11 +49,11 @@ gh_to_spdf.data.frame = function(gh_df, gh_col = 'gh', ...) {
   if (is.na(idx <- match(gh_col, names(gh_df))))
     stop('Searched for geohashes at a column named "', gh_col, '", but found nothing.')
   gh = gh_df[[idx]]
-  if (anyDuplicated(gh) > 0L) {
-    idx = which(duplicated(gh))
-    warning('Detected ', length(idx), ' duplicate input geohashes; removing')
-    gh = gh[-idx]
-    gh_df = gh_df[-idx, , drop = FALSE]
+  dup_idx = duplicated(gh)
+  if (any(dup_idx)) {
+    warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
+    gh = gh[!dup_idx]
+    gh_df = gh_df[!dup_idx, , drop = FALSE]
   }
   sp::SpatialPolygonsDataFrame(
     gh_to_sp(gh), data = gh_df, match.ID = FALSE

diff --git a/benchmarks/dedup_absolute.png b/benchmarks/dedup_absolute.png
diff --git a/benchmarks/dedup_benchmark.R b/benchmarks/dedup_benchmark.R
@@ -0,0 +1,138 @@
+# Benchmark for duplicate detection optimization
+# Compares double-scan (anyDuplicated + duplicated) vs single-pass (duplicated only)
+
+library(microbenchmark)
+library(ggplot2)
+
+# Define old double-scan approach
+dedup_old <- function(x) {
+  if (anyDuplicated(x) > 0L) {
+    idx = which(duplicated(x))
+    x = x[-idx]
+  }
+  x
+}
+
+# Define new single-pass approach
+dedup_new <- function(x) {
+  dup_idx = duplicated(x)
+  if (any(dup_idx)) {
+    x = x[!dup_idx]
+  }
+  x
+}
+
+# Test with varying input sizes and duplicate ratios
+test_cases <- expand.grid(
+  n = c(100, 1000, 10000, 100000),
+  dup_ratio = c(0.0, 0.1, 0.5, 0.9),
+  stringsAsFactors = FALSE
+)
+
+cat("Benchmarking duplicate detection methods\n")
+cat("Test cases:", nrow(test_cases), "\n\n")
+
+results <- vector("list", nrow(test_cases))
+
+for (i in seq_len(nrow(test_cases))) {
+  tc <- test_cases[i, ]
+  cat(sprintf("Test %d/%d: n=%d, dup_ratio=%.1f\n",
+              i, nrow(test_cases), tc$n, tc$dup_ratio))
+
+  # Create test data with specified duplicate ratio
+  n_unique <- ceiling(tc$n * (1 - tc$dup_ratio))
+  test_data <- sample(paste0("gh", seq_len(n_unique)), tc$n, replace = TRUE)
+
+  # Run benchmark
+  bm <- microbenchmark(
+    old = dedup_old(test_data),
+    new = dedup_new(test_data),
+    times = 50L,
+    unit = "us"
+  )
+
+  old_median <- median(bm$time[bm$expr == "old"]) / 1e3
+  new_median <- median(bm$time[bm$expr == "new"]) / 1e3
+
+  results[[i]] <- data.frame(
+    n = tc$n,
+    dup_ratio = tc$dup_ratio,
+    old_median_us = old_median,
+    new_median_us = new_median,
+    speedup = old_median / new_median
+  )
+
+  cat(sprintf("  Old: %.1f µs, New: %.1f µs, Speedup: %.2fx\n\n",
+              old_median, new_median, results[[i]]$speedup))
+}
+
+results_df <- do.call(rbind, results)
+
+# Save results
+saveRDS(results_df, "benchmarks/dedup_results.rds")
+cat("\nSaved results to benchmarks/dedup_results.rds\n")
+
+# Create visualization
+p1 <- ggplot(results_df, aes(x = n, y = new_median_us / old_median_us,
+                              color = factor(dup_ratio),
+                              group = factor(dup_ratio))) +
+  geom_line(size = 1) +
+  geom_point(size = 3) +
+  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
+  scale_x_log10(labels = scales::comma) +
+  labs(
+    title = "Duplicate Detection: Single-Pass vs Double-Scan",
+    subtitle = "Values < 1.0 indicate single-pass is faster",
+    x = "Input Size (log scale)",
+    y = "Relative Speed (New / Old)",
+    color = "Duplicate Ratio"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "right")
+
+ggsave("benchmarks/dedup_speedup.png", p1, width = 10, height = 6, dpi = 150)
+cat("Saved speedup plot to benchmarks/dedup_speedup.png\n")
+
+# Absolute performance plot
+results_long <- reshape2::melt(
+  results_df,
+  id.vars = c("n", "dup_ratio"),
+  measure.vars = c("old_median_us", "new_median_us"),
+  variable.name = "method",
+  value.name = "time_us"
+)
+results_long$method <- factor(results_long$method,
+                                levels = c("old_median_us", "new_median_us"),
+                                labels = c("Double-scan (old)", "Single-pass (new)"))
+
+p2 <- ggplot(results_long, aes(x = n, y = time_us,
+                                color = method,
+                                linetype = factor(dup_ratio))) +
+  geom_line(size = 0.8) +
+  geom_point(size = 2) +
+  scale_x_log10(labels = scales::comma) +
+  scale_y_log10(labels = scales::comma) +
+  labs(
+    title = "Duplicate Detection Performance",
+    x = "Input Size (log scale)",
+    y = "Median Time (µs, log scale)",
+    color = "Method",
+    linetype = "Duplicate Ratio"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "bottom")
+
+ggsave("benchmarks/dedup_absolute.png", p2, width = 10, height = 6, dpi = 150)
+cat("Saved absolute performance plot to benchmarks/dedup_absolute.png\n")
+
+# Summary statistics
+cat("\n=== Summary Statistics ===\n")
+cat(sprintf("Overall median speedup: %.2fx\n", median(results_df$speedup)))
+cat(sprintf("Mean speedup: %.2fx\n", mean(results_df$speedup)))
+cat(sprintf("Best case speedup: %.2fx (n=%d, dup_ratio=%.1f)\n",
+            max(results_df$speedup),
+            results_df$n[which.max(results_df$speedup)],
+            results_df$dup_ratio[which.max(results_df$speedup)]))
+
+cat("\nBy duplicate ratio:\n")
+aggregate(speedup ~ dup_ratio, results_df, function(x) sprintf("%.2fx", median(x)))
diff --git a/benchmarks/dedup_results.rds b/benchmarks/dedup_results.rds
diff --git a/benchmarks/dedup_speedup.png b/benchmarks/dedup_speedup.png