Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# CI & codecov-related
^\.travis\.yml$
^\.lintr$
^benchmarks$

^logo_maker.R$
^_pkgdown\.yml$
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# `geohashTools` NEWS

## v0.3.4 (Development)

### PERFORMANCE

1. Optimized duplicate detection in `gh_to_sp`, `gh_to_spdf.default`, and `gh_to_spdf.data.frame` by using a single-pass algorithm (`duplicated` only) instead of a double scan (`anyDuplicated` + `duplicated`). Benchmarks show a 1.25-1.76× speedup depending on input size and duplicate ratio, with a median improvement of ~1.44×.

## v0.3.3

Drop references to deprecated rgdal.
Expand Down
26 changes: 13 additions & 13 deletions R/gis_tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ check_suggested = function(pkg) {
gh_to_sp = function(geohashes) {
check_suggested('sp')
gh = tolower(geohashes)
if (anyDuplicated(gh) > 0L) {
idx = which(duplicated(gh))
warning('Detected ', length(idx), ' duplicate input geohashes; removing')
gh = gh[-idx]
dup_idx = duplicated(gh)
if (any(dup_idx)) {
warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
gh = gh[!dup_idx]
}
gh_xy = gh_decode(gh, include_delta = TRUE)
sp::SpatialPolygons(lapply(seq_along(gh), function(ii) {
Expand All @@ -34,10 +34,10 @@ gh_to_spdf = function(...) {
}

gh_to_spdf.default = function(geohashes, ...) {
if (anyDuplicated(geohashes) > 0L) {
idx = which(duplicated(geohashes))
warning('Detected ', length(idx), ' duplicate input geohashes; removing')
geohashes = geohashes[-idx]
dup_idx = duplicated(geohashes)
if (any(dup_idx)) {
warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
geohashes = geohashes[!dup_idx]
}
sp::SpatialPolygonsDataFrame(
gh_to_sp(geohashes),
Expand All @@ -49,11 +49,11 @@ gh_to_spdf.data.frame = function(gh_df, gh_col = 'gh', ...) {
if (is.na(idx <- match(gh_col, names(gh_df))))
stop('Searched for geohashes at a column named "', gh_col, '", but found nothing.')
gh = gh_df[[idx]]
if (anyDuplicated(gh) > 0L) {
idx = which(duplicated(gh))
warning('Detected ', length(idx), ' duplicate input geohashes; removing')
gh = gh[-idx]
gh_df = gh_df[-idx, , drop = FALSE]
dup_idx = duplicated(gh)
if (any(dup_idx)) {
warning('Detected ', sum(dup_idx), ' duplicate input geohashes; removing')
gh = gh[!dup_idx]
gh_df = gh_df[!dup_idx, , drop = FALSE]
}
sp::SpatialPolygonsDataFrame(
gh_to_sp(gh), data = gh_df, match.ID = FALSE
Expand Down
Binary file added benchmarks/dedup_absolute.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
138 changes: 138 additions & 0 deletions benchmarks/dedup_benchmark.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Benchmark for duplicate detection optimization
# Compares double-scan (anyDuplicated + duplicated) vs single-pass (duplicated only)

library(microbenchmark)
library(ggplot2)

# Baseline (pre-optimization) approach: two scans of the input.
# anyDuplicated() acts as a guard; only when it reports a duplicate is
# duplicated() run to find the positions that get dropped.
# Returns `x` with repeated values removed (first occurrence kept).
dedup_old <- function(x) {
  if (anyDuplicated(x) > 0L) {
    dup_pos <- which(duplicated(x))
    x <- x[-dup_pos]
  }
  x
}

# Optimized approach: a single duplicated() scan.
# The logical mask from duplicated() serves both as the "any duplicates?"
# check and as the filter used to drop them.
# Returns `x` with repeated values removed (first occurrence kept).
dedup_new <- function(x) {
  is_dup <- duplicated(x)
  if (any(is_dup)) {
    x <- x[!is_dup]
  }
  x
}

# Benchmark grid: every combination of input size and duplicate ratio.
test_cases <- expand.grid(
  n = c(100, 1000, 10000, 100000),
  dup_ratio = c(0.0, 0.1, 0.5, 0.9),
  stringsAsFactors = FALSE
)

cat("Benchmarking duplicate detection methods\n")
cat("Test cases:", nrow(test_cases), "\n\n")

# Fix the RNG seed so the generated inputs are identical on every run.
set.seed(20240101L)

# Preallocate one slot per test case; rows are bound together after the loop.
results <- vector("list", nrow(test_cases))

for (i in seq_len(nrow(test_cases))) {
  tc <- test_cases[i, ]
  cat(sprintf("Test %d/%d: n=%d, dup_ratio=%.1f\n",
              i, nrow(test_cases), tc$n, tc$dup_ratio))

  # Build an input that really has the requested duplicate ratio: every one
  # of the `n_unique` distinct values appears at least once, and the remaining
  # `n - n_unique` slots are filled with repeats drawn from that same pool.
  # (Sampling all `n` values with replacement would leave ~37% duplicates even
  # for dup_ratio = 0, so the no-duplicate path would never be measured.)
  n_unique <- ceiling(tc$n * (1 - tc$dup_ratio))
  pool <- paste0("gh", seq_len(n_unique))
  repeats <- sample(pool, tc$n - n_unique, replace = TRUE)
  test_data <- sample(c(pool, repeats))  # shuffle so repeats are interleaved

  # Run benchmark
  bm <- microbenchmark(
    old = dedup_old(test_data),
    new = dedup_new(test_data),
    times = 50L,
    unit = "us"
  )

  # microbenchmark stores raw timings in nanoseconds; convert to microseconds.
  old_median <- median(bm$time[bm$expr == "old"]) / 1e3
  new_median <- median(bm$time[bm$expr == "new"]) / 1e3

  results[[i]] <- data.frame(
    n = tc$n,
    dup_ratio = tc$dup_ratio,
    old_median_us = old_median,
    new_median_us = new_median,
    speedup = old_median / new_median
  )

  cat(sprintf("  Old: %.1f µs, New: %.1f µs, Speedup: %.2fx\n\n",
              old_median, new_median, results[[i]]$speedup))
}

results_df <- do.call(rbind, results)

# Save results (path is relative to the package root)
saveRDS(results_df, "benchmarks/dedup_results.rds")
cat("\nSaved results to benchmarks/dedup_results.rds\n")

# Create visualization
# The y-axis is the ratio of median times (new / old), so points below the
# dashed reference line at 1 mean the single-pass version took less time.
p1 <- ggplot(results_df, aes(x = n, y = new_median_us / old_median_us,
                             color = factor(dup_ratio),
                             group = factor(dup_ratio))) +
  # `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  scale_x_log10(labels = scales::comma) +
  labs(
    title = "Duplicate Detection: Single-Pass vs Double-Scan",
    subtitle = "Values < 1.0 indicate single-pass is faster",
    x = "Input Size (log scale)",
    # The plotted quantity is a time ratio, not a speed ratio.
    y = "Relative Time (New / Old)",
    color = "Duplicate Ratio"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

ggsave("benchmarks/dedup_speedup.png", p1, width = 10, height = 6, dpi = 150)
cat("Saved speedup plot to benchmarks/dedup_speedup.png\n")

# Absolute performance plot
# Reshape to long form (one row per test case and method) with base R, so the
# script does not need the retired reshape2 package for a single call.
# Rows are ordered as all "old" timings followed by all "new" timings.
results_long <- data.frame(
  n = rep(results_df$n, times = 2L),
  dup_ratio = rep(results_df$dup_ratio, times = 2L),
  method = factor(
    rep(c("old_median_us", "new_median_us"), each = nrow(results_df)),
    levels = c("old_median_us", "new_median_us"),
    labels = c("Double-scan (old)", "Single-pass (new)")
  ),
  time_us = c(results_df$old_median_us, results_df$new_median_us)
)

p2 <- ggplot(results_long, aes(x = n, y = time_us,
                               color = method,
                               linetype = factor(dup_ratio))) +
  # `linewidth` replaces the deprecated `size` for lines (ggplot2 >= 3.4.0).
  geom_line(linewidth = 0.8) +
  geom_point(size = 2) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  labs(
    title = "Duplicate Detection Performance",
    x = "Input Size (log scale)",
    y = "Median Time (µs, log scale)",
    color = "Method",
    linetype = "Duplicate Ratio"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggsave("benchmarks/dedup_absolute.png", p2, width = 10, height = 6, dpi = 150)
cat("Saved absolute performance plot to benchmarks/dedup_absolute.png\n")

# Summary statistics
cat("\n=== Summary Statistics ===\n")
cat(sprintf("Overall median speedup: %.2fx\n", median(results_df$speedup)))
cat(sprintf("Mean speedup: %.2fx\n", mean(results_df$speedup)))

# Locate the best-performing test case once and reuse the row index.
best <- which.max(results_df$speedup)
cat(sprintf("Best case speedup: %.2fx (n=%d, dup_ratio=%.1f)\n",
            results_df$speedup[best],
            results_df$n[best],
            results_df$dup_ratio[best]))

cat("\nBy duplicate ratio:\n")
# Explicit print(): a bare top-level expression is not auto-printed when the
# script is run through source(), which would silently drop this table.
print(aggregate(speedup ~ dup_ratio, results_df,
                function(x) sprintf("%.2fx", median(x))))
Binary file added benchmarks/dedup_results.rds
Binary file not shown.
Binary file added benchmarks/dedup_speedup.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.