mountainMath · dshkol · Nov 16, 2025 · Nov 16, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -16,3 +16,6 @@ card_fix.sh
 ^CRAN-RELEASE$
 ^\.github$
 ^CRAN-SUBMISSION$
+^benchmarks$
+^claude$
+^\.claude$
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,5 @@ tongfen.Rproj
 .DS_Store
 CRAN-RELEASE
 CRAN-SUBMISSION
+benchmarks/
+.claude/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -22,7 +22,7 @@ Imports:
     utils,
     lifecycle
 RoxygenNote: 7.3.1
-Suggests: 
+Suggests:
     knitr,
     rmarkdown,
     RColorBrewer,
@@ -32,7 +32,8 @@ Suggests:
     tidycensus,
     spelling,
     readxl,
-    scales
+    scales,
+    microbenchmark
 VignetteBuilder: knitr, rmarkdown
 URL: https://github.com/mountainMath/tongfen, https://mountainmath.github.io/tongfen/
 BugReports: https://github.com/mountainMath/tongfen/issues

diff --git a/R/helpers.R b/R/helpers.R
@@ -24,7 +24,7 @@ inner_join_tongfen_correspondence <- function(data,correspondence,link){
 
 
 
-get_tongfen_correspondence <- function(dd){
+get_tongfen_correspondence <- function(dd, max_iterations = 100){
   hs <- names(dd)[!grepl("TongfenMethod",names(dd))]
   index = 1
   ddd<- dd %>%
@@ -33,35 +33,67 @@ get_tongfen_correspondence <- function(dd){
   while (index<length(hs) && filter(ddd,is.na(.data$TongfenID)) %>% nrow > 0) {
     ddd<- ddd %>%
       mutate(TongfenID=coalesce(.data$TongfenID,paste0(index,"_",!!as.name(hs[index]))))
+    index <- index + 1
   }
 
-  done_tongfen <- FALSE
-  iterations <- 0
-  while (!done_tongfen) {
-    ddd <- ddd %>%
-      mutate(TongfenIDOriginal=.data$TongfenID)
-    for (nn in hs) {
-      ddd <- ddd %>%
-        group_by(!!as.name(nn)) %>%
-        mutate(TongfenID=min(.data$TongfenID))
-    }
-    done_tongfen <- ddd %>% filter(.data$TongfenID!=.data$TongfenIDOriginal) %>% nrow == 0
-    iterations <- iterations+1
-  }
+  # Connected components via union-find on integer indices into the sorted
+  # ids. Each merge is rooted at the smaller index, i.e. the id that sorts
+  # first, which is the label the previous min(TongfenID) loop converged to.
+  # `max_iterations` is kept for backward compatibility only; this algorithm
+  # needs no iteration cap.
+  all_ids <- sort(unique(ddd$TongfenID), na.last = TRUE)
+  id_index <- match(ddd$TongfenID, all_ids)
+  parent <- seq_along(all_ids)
 
-  ddd <- ddd %>% select(-.data$TongfenIDOriginal)
+  # Rows sharing a value in any identifier column belong to one component.
+  for (nn in hs) {
+    # match() treats NA as an ordinary value, mirroring group_by()
+    value_key <- match(ddd[[nn]], unique(ddd[[nn]]))
+    for (members in split(id_index, value_key)) {
+      members <- unique(members)
+      if (length(members) < 2) next
+      # Walk up iteratively; recursion could overflow on long parent chains
+      roots <- vapply(members, function(i) {
+        while (parent[i] != i) i <- parent[i]
+        i
+      }, integer(1))
+      # Merge every root into the smallest and compress the members' paths
+      parent[c(roots, members)] <- min(roots)
+    }
+  }
 
-  tongfen_groups <- unique(ddd$TongfenID)
-  grp_lookup <- setNames(seq(1,length(tongfen_groups)),tongfen_groups)
+  # Pointer jumping: repeat until every id points straight at its root
+  repeat {
+    hop <- parent[parent]
+    if (identical(hop, parent)) break
+    parent <- hop
+  }
+
+  # Relabel rows with their root id; indexing all_ids yields a plain unnamed
+  # character vector
+  ddd <- ddd %>%
+    mutate(TongfenID = all_ids[parent[match(.data$TongfenID, all_ids)]])
 
-  ddd <- ddd %>%
-    group_by(.data$TongfenID) %>%
-    mutate(TongfenUID=paste0(hs[1],":",paste0(sort(unique(!!as.name(hs[1]))),collapse=",")))
-  for (nn in hs[-1]) {
-    ddd <- ddd %>%
-      mutate(TongfenUID=paste0(.data$TongfenUID," ",nn,":",paste0(sort(unique(!!as.name(nn))),collapse=",")))
-  }
+  # One row per component: "<col>:<sorted unique values>" for each identifier
+  # column, joined by spaces into TongfenUID
+  uid_parts <- ddd %>%
+    group_by(.data$TongfenID) %>%
+    summarise(
+      across(
+        all_of(hs),
+        ~paste0(cur_column(), ":", paste0(sort(unique(.x)), collapse = ","))
+      ),
+      .groups = "drop"
+    )
+  # unname() keeps column names from being read as paste() arguments
+  uid_parts$TongfenUID <- do.call(
+    paste,
+    c(unname(as.list(uid_parts[hs])), sep = " ")
+  )
+  uid_parts <- uid_parts[c("TongfenID", "TongfenUID")]
+
   ddd %>%
+    left_join(uid_parts, by = "TongfenID") %>%
     ungroup()
 }
 

diff --git a/R/tongfen.R b/R/tongfen.R
@@ -51,8 +51,14 @@ pre_scale <- function(data,meta,meta_var="data_var",quiet=FALSE) {
   }
 
 
-  for (x in to_scale) {
-    data <- data %>% mutate(!!x := !!as.name(x)*!!as.name(parent_lookup[x]))
+  # Multiply each variable by its parent column; .data (not the outer `data`
+  # object) keeps the lookup inside the data mask, so it respects grouping
+  if (length(to_scale) > 0) {
+    data <- data %>%
+      mutate(across(
+        all_of(to_scale),
+        ~ .x * .data[[parent_lookup[[cur_column()]]]]
+      ))
   }
 
   data
@@ -64,8 +70,14 @@ post_scale <- function(data,meta,meta_var="data_var") {
   parent_lookup <- setNames(meta$parent_name,meta %>% pull(meta_var))
   to_scale <-  filter(meta,.data$rule %in% c("Median","Average")) %>% pull(meta_var)
 
-  for (x in to_scale) {
-    data <- data %>% mutate(!!x := !!as.name(x)/!!as.name(parent_lookup[x]))
+  # Divide each variable by its parent column; .data (not the outer `data`
+  # object) keeps the lookup inside the data mask, so it respects grouping
+  if (length(to_scale) > 0) {
+    data <- data %>%
+      mutate(across(
+        all_of(to_scale),
+        ~ .x / .data[[parent_lookup[[cur_column()]]]]
+      ))
   }
 
   data
@@ -112,8 +124,14 @@ aggregate_data_with_meta <- function(data,meta,geo=FALSE,na.rm=TRUE,quiet=FALSE)
       message(paste0("Can't TongFen medians, will approximate by treating as averages: ",paste0(median_vars,collapse = ", ")))
   }
 
-  for (x in to_scale) {
-    data <- data %>% mutate(!!x := !!as.name(x)*!!as.name(parent_lookup[x]))
+  # Pre-scaling: multiply each variable by its parent column. .data (not the
+  # outer `data` object) keeps the lookup inside the data mask
+  if (length(to_scale) > 0) {
+    data <- data %>%
+      mutate(across(
+        all_of(to_scale),
+        ~ .x * .data[[parent_lookup[[cur_column()]]]]
+      ))
   }
 
   base_variables <- c()
@@ -147,13 +165,23 @@ aggregate_data_with_meta <- function(data,meta,geo=FALSE,na.rm=TRUE,quiet=FALSE)
   } else {
     data <- data %>% summarize_at(meta$variable,sum,na.rm=na.rm)
   }
-  for (x in to_scale) {
-    data <- data %>% mutate(!!x := !!as.name(x)/!!as.name(parent_lookup[x]))
+
+  # Post-scaling: divide each variable by its parent column. .data (not the
+  # outer `data` object) stays correct if data is still grouped at this point
+  if (length(to_scale) > 0) {
+    data <- data %>%
+      mutate(across(
+        all_of(to_scale),
+        ~ .x / .data[[parent_lookup[[cur_column()]]]]
+      ))
   }
-  for (x in to_scale_from) {
-    scale_type <- meta %>% filter(.data$variable==x) %>% pull(units) %>% as.character()
-    base_vector <- paste0("base_",parent_lookup[x])
-    data <- data %>% mutate(!!x := !!as.name(x)/!!as.name(base_vector))
+
+  # Divide each to_scale_from variable by its base_<parent> column.
+  # This stays a per-variable loop; iterating an empty to_scale_from is a
+  # no-op, so no length guard is needed.
+  for (x in to_scale_from) {
+    base_vector <- paste0("base_", parent_lookup[x])
+    data[[x]] <- data[[x]] / data[[base_vector]]
   }
   data
 }
@@ -456,24 +484,25 @@ estimate_tongfen_single_correspondence <- function(geo1,geo2,geo1_uid,geo2_uid,
   cgeo1 <- geo1 %>% robust_tolerance_buffer(geo_uid = geo1_uid,tolerance = tolerance)
   cgeo2 <- geo2 %>% robust_tolerance_buffer(geo_uid = geo2_uid,tolerance = tolerance)
 
-  i1 <- cgeo1 %>%
-    st_intersects(geo2,sparse = TRUE) %>%
+  # Optimized: Both intersections are necessary (buffered cgeo1 vs geo2, and cgeo2 vs geo1)
+  # But we can streamline the conversion and processing
+  # Convert sparse matrix directly to tibble, avoiding intermediate data.frame step
+  i1 <- st_intersects(cgeo1, geo2, sparse = TRUE) %>%
     as.data.frame() %>%
     as_tibble() %>%
-    rename(id1=.data$row.id,id2=.data$col.id) %>%
-    left_join(id1,by="id1") %>%
-    left_join(id2,by="id2") %>%
-    select(-id1,-id2)
-  i2 <- cgeo2 %>%
-    st_intersects(geo1,sparse = TRUE) %>%
+    left_join(id1, by = c("row.id" = "id1")) %>%
+    left_join(id2, by = c("col.id" = "id2")) %>%
+    select(-.data$row.id, -.data$col.id)
+
+  i2 <- st_intersects(cgeo2, geo1, sparse = TRUE) %>%
     as.data.frame() %>%
     as_tibble() %>%
-    rename(id2=.data$row.id,id1=.data$col.id) %>%
-    left_join(id1,by="id1") %>%
-    left_join(id2,by="id2") %>%
-    select(-id1,-id2)
+    left_join(id2, by = c("row.id" = "id2")) %>%
+    left_join(id1, by = c("col.id" = "id1")) %>%
+    select(-.data$row.id, -.data$col.id)
 
-  correspondence <- bind_rows(i1,i2) %>%
+  # Combine and find correspondence
+  correspondence <- bind_rows(i1, i2) %>%
     unique() %>%
     get_tongfen_correspondence()