diff --git a/.Rbuildignore b/.Rbuildignore index 81fcff8..b3b4933 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,6 +12,7 @@ # CI & codecov-related ^\.travis\.yml$ ^\.lintr$ +^benchmarks$ ^logo_maker.R$ ^_pkgdown\.yml$ diff --git a/NEWS.md b/NEWS.md index f7c6c8c..e442d4e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # `geohashTools` NEWS +## v0.3.4 (Development) + +### PERFORMANCE + + 1. Optimized duplicate detection in `gh_to_sp`, `gh_to_spdf.default`, and `gh_to_spdf.data.frame` by using single-pass algorithm instead of double-scan (`anyDuplicated` + `duplicated`). Benchmarks show 1.25-1.76× speedup depending on input size and duplicate ratio, with median improvement of ~1.44×. + ## v0.3.3 Drop references to deprecated rgdal. diff --git a/R/gis_tools.R b/R/gis_tools.R index 501b5b1..2827ae6 100644 --- a/R/gis_tools.R +++ b/R/gis_tools.R @@ -12,10 +12,10 @@ check_suggested = function(pkg) { gh_to_sp = function(geohashes) { check_suggested('sp') gh = tolower(geohashes) - if (anyDuplicated(gh) > 0L) { - idx = which(duplicated(gh)) - warning('Detected ', length(idx), ' duplicate input geohashes; removing') - gh = gh[-idx] + dup_idx = duplicated(gh) + if (any(dup_idx)) { + warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing') + gh = gh[!dup_idx] } gh_xy = gh_decode(gh, include_delta = TRUE) sp::SpatialPolygons(lapply(seq_along(gh), function(ii) { @@ -34,10 +34,10 @@ gh_to_spdf = function(...) { } gh_to_spdf.default = function(geohashes, ...) { - if (anyDuplicated(geohashes) > 0L) { - idx = which(duplicated(geohashes)) - warning('Detected ', length(idx), ' duplicate input geohashes; removing') - geohashes = geohashes[-idx] + dup_idx = duplicated(geohashes) + if (any(dup_idx)) { + warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing') + geohashes = geohashes[!dup_idx] } sp::SpatialPolygonsDataFrame( gh_to_sp(geohashes), @@ -49,11 +49,11 @@ gh_to_spdf.data.frame = function(gh_df, gh_col = 'gh', ...) { if (is.na(idx <- match(gh_col, names(gh_df)))) stop('Searched for geohashes at a column named "', gh_col, '", but found nothing.') gh = gh_df[[idx]] - if (anyDuplicated(gh) > 0L) { - idx = which(duplicated(gh)) - warning('Detected ', length(idx), ' duplicate input geohashes; removing') - gh = gh[-idx] - gh_df = gh_df[-idx, , drop = FALSE] + dup_idx = duplicated(gh) + if (any(dup_idx)) { + warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing') + gh = gh[!dup_idx] + gh_df = gh_df[!dup_idx, , drop = FALSE] } sp::SpatialPolygonsDataFrame( gh_to_sp(gh), data = gh_df, match.ID = FALSE diff --git a/benchmarks/dedup_absolute.png b/benchmarks/dedup_absolute.png new file mode 100644 index 0000000..56cf6ce Binary files /dev/null and b/benchmarks/dedup_absolute.png differ diff --git a/benchmarks/dedup_benchmark.R b/benchmarks/dedup_benchmark.R new file mode 100644 index 0000000..0b03a03 --- /dev/null +++ b/benchmarks/dedup_benchmark.R @@ -0,0 +1,138 @@ +# Benchmark for duplicate detection optimization +# Compares double-scan (anyDuplicated + duplicated) vs single-pass (duplicated only) + +library(microbenchmark) +library(ggplot2) + +# Define old double-scan approach +dedup_old <- function(x) { + if (anyDuplicated(x) > 0L) { + idx = which(duplicated(x)) + x = x[-idx] + } + x +} + +# Define new single-pass approach +dedup_new <- function(x) { + dup_idx = duplicated(x) + if (any(dup_idx)) { + x = x[!dup_idx] + } + x +} + +# Test with varying input sizes and duplicate ratios +test_cases <- expand.grid( + n = c(100, 1000, 10000, 100000), + dup_ratio = c(0.0, 0.1, 0.5, 0.9), + stringsAsFactors = FALSE +) + +cat("Benchmarking duplicate detection methods\n") +cat("Test cases:", nrow(test_cases), "\n\n") + +results <- vector("list", nrow(test_cases)) + +for (i in seq_len(nrow(test_cases))) { + tc <- test_cases[i, ] + cat(sprintf("Test %d/%d: n=%d, dup_ratio=%.1f\n", + i, nrow(test_cases), tc$n, tc$dup_ratio)) + + # Create test data with specified duplicate ratio + n_unique <- ceiling(tc$n * (1 - tc$dup_ratio)) + test_data <- sample(paste0("gh", seq_len(n_unique)), tc$n, replace = TRUE) + + # Run benchmark + bm <- microbenchmark( + old = dedup_old(test_data), + new = dedup_new(test_data), + times = 50L, + unit = "us" + ) + + old_median <- median(bm$time[bm$expr == "old"]) / 1e3 + new_median <- median(bm$time[bm$expr == "new"]) / 1e3 + + results[[i]] <- data.frame( + n = tc$n, + dup_ratio = tc$dup_ratio, + old_median_us = old_median, + new_median_us = new_median, + speedup = old_median / new_median + ) + + cat(sprintf(" Old: %.1f µs, New: %.1f µs, Speedup: %.2fx\n\n", + old_median, new_median, results[[i]]$speedup)) +} + +results_df <- do.call(rbind, results) + +# Save results +saveRDS(results_df, "benchmarks/dedup_results.rds") +cat("\nSaved results to benchmarks/dedup_results.rds\n") + +# Create visualization +p1 <- ggplot(results_df, aes(x = n, y = new_median_us / old_median_us, + color = factor(dup_ratio), + group = factor(dup_ratio))) + + geom_line(size = 1) + + geom_point(size = 3) + + geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") + + scale_x_log10(labels = scales::comma) + + labs( + title = "Duplicate Detection: Single-Pass vs Double-Scan", + subtitle = "Values < 1.0 indicate single-pass is faster", + x = "Input Size (log scale)", + y = "Relative Speed (New / Old)", + color = "Duplicate Ratio" + ) + + theme_minimal() + + theme(legend.position = "right") + +ggsave("benchmarks/dedup_speedup.png", p1, width = 10, height = 6, dpi = 150) +cat("Saved speedup plot to benchmarks/dedup_speedup.png\n") + +# Absolute performance plot +results_long <- reshape2::melt( + results_df, + id.vars = c("n", "dup_ratio"), + measure.vars = c("old_median_us", "new_median_us"), + variable.name = "method", + value.name = "time_us" +) +results_long$method <- factor(results_long$method, + levels = c("old_median_us", "new_median_us"), + labels = c("Double-scan (old)", "Single-pass (new)")) + +p2 <- ggplot(results_long, aes(x = n, y = time_us, + color = method, + linetype = factor(dup_ratio))) + + geom_line(size = 0.8) + + geom_point(size = 2) + + scale_x_log10(labels = scales::comma) + + scale_y_log10(labels = scales::comma) + + labs( + title = "Duplicate Detection Performance", + x = "Input Size (log scale)", + y = "Median Time (µs, log scale)", + color = "Method", + linetype = "Duplicate Ratio" + ) + + theme_minimal() + + theme(legend.position = "bottom") + +ggsave("benchmarks/dedup_absolute.png", p2, width = 10, height = 6, dpi = 150) +cat("Saved absolute performance plot to benchmarks/dedup_absolute.png\n") + +# Summary statistics +cat("\n=== Summary Statistics ===\n") +cat(sprintf("Overall median speedup: %.2fx\n", median(results_df$speedup))) +cat(sprintf("Mean speedup: %.2fx\n", mean(results_df$speedup))) +cat(sprintf("Best case speedup: %.2fx (n=%d, dup_ratio=%.1f)\n", + max(results_df$speedup), + results_df$n[which.max(results_df$speedup)], + results_df$dup_ratio[which.max(results_df$speedup)])) + +cat("\nBy duplicate ratio:\n") +aggregate(speedup ~ dup_ratio, results_df, function(x) sprintf("%.2fx", median(x))) diff --git a/benchmarks/dedup_results.rds b/benchmarks/dedup_results.rds new file mode 100644 index 0000000..1ae9172 Binary files /dev/null and b/benchmarks/dedup_results.rds differ diff --git a/benchmarks/dedup_speedup.png b/benchmarks/dedup_speedup.png new file mode 100644 index 0000000..cd7b14d Binary files /dev/null and b/benchmarks/dedup_speedup.png differ