From b302274033c4f1dfe0e44b592c479273727c26be Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Mon, 21 Jun 2021 00:49:46 -0700
Subject: [PATCH 1/5] initial work on team splitting tool

---
 NAMESPACE                                     |  2 +-
 R/translate_package.R                         | 69 ++++++++++++++++++-
 .../test_packages/r_msg/po/R-rMsg.pot         |  2 +-
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index c6290da8..51e0df5f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,7 +2,7 @@ importFrom(data.table, data.table, foverlaps, fread, rbindlist, uniqueN, "%chin%
 importFrom(data.table, is.data.table, as.data.table)
 importFrom(data.table, set, setcolorder, setindexv, setkeyv, setnames, setorderv)
 importFrom(data.table, `:=`, .BY, .N, .SD)
-importFrom(data.table, fifelse)
+importFrom(data.table, fifelse, frank)
 
 importFrom(utils, getParseData, globalVariables, head, tail, type.convert)
 
diff --git a/R/translate_package.R b/R/translate_package.R
index 7ab56f67..60111333 100644
--- a/R/translate_package.R
+++ b/R/translate_package.R
@@ -3,6 +3,8 @@ translate_package = function(
   diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src),
   src_translation_macros = c("_", "N_"),
   use_base_rules = package %chin% .potools$base_package_names,
+  team_size = 1L, team_id = 1L,
+  team_split_rule = c("equalize_char", "equalize_files"),
   copyright = NULL, bugs = NULL, verbose = FALSE
 ) {
   check_sys_reqs()
@@ -14,9 +16,18 @@ translate_package = function(
     "'diagnostics' should be empty, a function, or list of functions" =
       is.null(diagnostics)
       || is.function(diagnostics)
-      || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L))))
+      || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))),
+    "'team_size' should be >=1 and 'team_id' should be between 1 and 'team_size'" =
+      is.numeric(team_size) && is.numeric(team_id)
+      && team_size >= 1 && team_id >= 1 && team_id <= team_size,
+    "When using team splitting, only translate one language at a time" =
+      team_size == 1L || length(languages) == 1L
   )
 
+  team_split_rule = match.arg(team_split_rule)
+  team_size = as.integer(team_size)
+  team_id = as.integer(team_id)
+
   # en-list singleton diagnostic for convenience
   if (is.function(diagnostics)) diagnostics = list(diagnostics)
 
@@ -133,6 +144,7 @@ translate_package = function(
     }
 
     new_idx = message_data[
+      !is_repeat &
       is_marked_for_translation & (
         fuzzy == 1L
         | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE))
@@ -141,6 +153,61 @@ translate_package = function(
       which = TRUE
     ]
 
+    if (team_size > 1L) {
+      if ((n_files <- uniqueN(message_data[(new_idx)]$file)) < team_size) warning(domain=NA, gettextf(
+        "Requested to split %d files among %d translators, which will exclude %d translators; setting team_split_rule = 'equalize_char' instead",
+        n_files, team_size, team_size - n_files
+      ))
+      browser()
+      new_idx = switch(
+        team_split_rule,
+        equalize_char = {
+          message_data[
+            (new_idx),
+            {
+              msg_size = fifelse(
+                type == "singular",
+                nchar(msgid),
+                vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L))
+              )
+              char_rank = frank(msg_size, ties.method = "random")
+              assign_idx = which((char_rank %% team_size) == (team_id - 1L))
+              if (verbose) message(domain=NA, gettextf(
+                "Assigning team %d %d messages for translation totalling %d characters",
+                team_id, length(assign_idx), sum(msg_size[assign_idx])
+              ))
+              new_idx[assign_idx]
+            }
+          ]
+        },
+        equalize_files = {
+          assigned_files = message_data[
+            (new_idx),
+            by = file,
+            {
+              msg_size = fifelse(
+                type == "singular",
+                nchar(msgid),
+                vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L))
+              )
+              .(file_size = sum(msg_size))
+            }
+          ][
+            order(file_size),
+            {
+              assign_idx = (.I %% team_size) == (team_id - 1L)
+              if (verbose) message(domain=NA, gettextf(
+                "Assigning team %d %d files for translation totalling %d characters",
+                team_id, assign_idx, sum(file_size[assign_idx])
+              ))
+              file[assign_idx]
+            }
+          ]
+          message_data[(new_idx), new_idx[which(file %chin% assigned_files)]]
+        }
+      )
+    }
+
     if (!length(new_idx)) {
       if (verbose) message(domain=NA, gettextf(
         'Translations for %s are up to date! Skipping.',
diff --git a/tests/testthat/test_packages/r_msg/po/R-rMsg.pot b/tests/testthat/test_packages/r_msg/po/R-rMsg.pot
index 781a62a8..8494ad3d 100644
--- a/tests/testthat/test_packages/r_msg/po/R-rMsg.pot
+++ b/tests/testthat/test_packages/r_msg/po/R-rMsg.pot
@@ -6,7 +6,7 @@
 msgid ""
 msgstr ""
 "Project-Id-Version: rMsg 0.0.1\n"
-"POT-Creation-Date: 2021-06-20 05:58:19\n"
+"POT-Creation-Date: 2021-06-21 07:46:19\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"

From 0df94f145eda56dbda709724de5b7336d9416efd Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Mon, 21 Jun 2021 23:25:55 -0700
Subject: [PATCH 2/5] add a new option, add documentation, resist using any
 randomization

---
 R/translate_package.R    | 17 ++++++++++++++---
 man/translate_package.Rd | 21 +++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/R/translate_package.R b/R/translate_package.R
index 60111333..a2e83491 100644
--- a/R/translate_package.R
+++ b/R/translate_package.R
@@ -4,7 +4,7 @@ translate_package = function(
   src_translation_macros = c("_", "N_"),
   use_base_rules = package %chin% .potools$base_package_names,
   team_size = 1L, team_id = 1L,
-  team_split_rule = c("equalize_char", "equalize_files"),
+  team_split_rule = c("equalize_char", "equalize_count", "equalize_files"),
   copyright = NULL, bugs = NULL, verbose = FALSE
 ) {
   check_sys_reqs()
@@ -18,7 +18,7 @@ translate_package = function(
       || is.function(diagnostics)
       || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))),
     "'team_size' should be >=1 and 'team_id' should be between 1 and 'team_size'" =
-      is.numeric(team_size) && is.numeric(team_id)
+      is.numeric(team_size) && length(team_size) == 1L && is.numeric(team_id) && length(team_id) == 1L
       && team_size >= 1 && team_id >= 1 && team_id <= team_size,
     "When using team splitting, only translate one language at a time" =
       team_size == 1L || length(languages) == 1L
@@ -170,7 +170,14 @@ translate_package = function(
                 nchar(msgid),
                 vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L))
               )
-              char_rank = frank(msg_size, ties.method = "random")
+
+              # NB: use order instead of frank because we don't care about ties. ties.method='random'
+              #   also won't work because of the difficulty in matching seed across machines for
+              #   distributed translation teams. even if we do set.seed(team_size), say, we can't
+              #   guarantee the same RNG generator is being used; ensuring this is much more complication
+              #   than it's worth. I think there is some risk based on collation order that msg_size may
+              #   be different, but this _should_ be static owing to data.table's consistent sorting rules.
+              char_rank = order(msg_size)
               assign_idx = which((char_rank %% team_size) == (team_id - 1L))
               if (verbose) message(domain=NA, gettextf(
                 "Assigning team %d %d messages for translation totalling %d characters",
@@ -180,6 +187,10 @@ translate_package = function(
             }
           ]
         },
+        equalize_count = {
+          # plain & simple
+          new_idx[seq(team_id, length(new_idx), by = team_size)]
+        }
         equalize_files = {
           assigned_files = message_data[
             (new_idx),
diff --git a/man/translate_package.Rd b/man/translate_package.Rd
index c5a6041c..cc55d476 100644
--- a/man/translate_package.Rd
+++ b/man/translate_package.Rd
@@ -16,6 +16,8 @@ translate_package(
   diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src),
   src_translation_macros = c("_", "N_"),
   use_base_rules = package \%chin\% .potools$base_package_names,
+  team_size = 1L, team_id = 1L,
+  team_split_rule = c("equalize_char", "equalize_count", "equalize_files"),
   copyright = NULL, bugs = NULL, verbose=FALSE
 )
 }
@@ -25,6 +27,9 @@ translate_package(
   \item{diagnostics}{ A \code{list} of diagnostic functions to be run on the package's message data. See Details.}
   \item{src_translation_macros}{ Character, the macro used to indicate which \code{char} arrays are to be marked for translation in C/C++ files. The default, \code{_} and \code{N_}, is shared by R itself & recommended in R-exts and R-ints (See references). }
   \item{use_base_rules}{ Logical; Should internal behavior match base behavior as strictly as possible? \code{TRUE} if being run on a base package (i.e., \code{base} or one of the default packages like \code{utils}, \code{graphics}, etc.). See Details. }
+  \item{team_size}{ Integer; how many translators are there for the (singular) language? See Details for this, \code{team_id}, and \code{team_split_rule}. }
+  \item{team_id}{ Integer; which translator is currently working? }
+  \item{team_split_rule}{ Character; how should the message base be split up among the teams? }
   \item{copyright}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. }
   \item{bugs}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. }
   \item{verbose}{ Logical, default \code{FALSE}. Should extra information about progress, etc. be reported? }
@@ -53,6 +58,22 @@ directory (which is created if it does not yet exist).
 
 There are some discrepancies in the default behavior of \code{translate_package} and the translation workflow used to generate the \file{.po}/\file{.pot} files for R itself (mainly, the suite of functions from \code{tools}, \code{\link[tools]{update_pkg_po}}, \code{\link[tools]{xgettext2pot}}, \code{\link[tools]{xgettext}}, and \code{\link[tools]{xngettext}}). They should only be superficial (e.g., whitespace or comments), but nevertheless may represent a barrier to smoothly submitting patchings to R Core. To make the process of translating base R and the default packages (\code{tools}, \code{utils}, \code{stats}, etc.) as smooth as possible, set the \code{use_base_rules} argument to \code{TRUE} and your resulting \file{.po}/\file{.pot}/\file{.mo} file will match base's.
 
+\bold{Teams:}
+
+For packages with larger message bases to tackle (e.g., R itself or a large, currently-untranslated package), a divide-and-conquer approach may be preferable if a suitable team can be assembled. The arguments \code{team_size}, \code{team_id}, and \code{team_split_rule} are meant to facilitate the work of translation in this case. If \code{team_size > 1}, first the set of messages that need translation is divided "roughly"" equally into \code{team_size} parts, each of which is assigned an "ID" from \code{1} to \code{team_size}. You can select which block of messages you'd like to translate by passing your \code{team_id} (the ID for each translator will need to be coordinated amongst the team members).
+
+There are three ways the translation set can be split "equally", controlled by the \code{team_split_rule} argument:
+
+\enumerate{
+  \item \code{"equalize_char"}: Roughly assign each translator the same number of \emph{characters} of messages to translate (i.e., according to \code{\link{nchar}}).
+  \item \code{"equalize_count"}: Roughly assign each translator the same number of \emph{messages} to translate. Specific implementation is analogous to \code{"equalize_char"}.
+  \item \code{"equalize_files"}: Roughly assign each translator the same number of \emph{soure files} to translate. The thinking here is to try and give one translator many similar messages on the hope that there are some efficiency gains from autocorrelation in the messages.
+}
+
+NB: \code{"equalize_char"} and \code{"equalize_files"} are implemented by sorting the and slicing. For example, if \code{team_size=3}, for \code{"equalize_char"}, the first translator (\code{team_id=1}) will get the 1st, 4th, 7th, ... largest messages, the second (\code{team_id=2}) will get the 2nd, 5th, 8th, ... largest, and the third (\code{team_id=3}) will get the 3rd, 6th, 9th, ... largest. For \code{"equalize_count"}, messages are simply taken in alternating order, sorted as they are from \code{\link{get_message_data}}.
+
+NB: this option only applies when a single language is specified for translation.
+
 \bold{Diagnostics:}
 
 A diagnostic is a function which takes as input a \code{data.table} summarizing the translatable strings in

From 9459e34b1c9cab979daefeb12a208f8aee0367fe Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Mon, 21 Jun 2021 23:45:30 -0700
Subject: [PATCH 3/5] debugging...

---
 R/msgmerge.R                            | 2 ++
 R/onLoad.R                              | 2 +-
 R/translate_package.R                   | 4 ++--
 tests/testthat/test-translate-package.R | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/msgmerge.R b/R/msgmerge.R
index c4553d93..0f3c6b6d 100644
--- a/R/msgmerge.R
+++ b/R/msgmerge.R
@@ -15,7 +15,9 @@ run_msgmerge = function(po_file, pot_file) {
 
 run_msgfmt = function(po_file, mo_file, verbose) {
   use_stats <- if (verbose) '--statistics' else ''
+  browser()
   if (system(sprintf("msgfmt -c %s -o %s %s", use_stats, shQuote(mo_file), shQuote(po_file))) != 0L) {
+    browser()
     warning(domain = NA, gettextf("running msgfmt on %s failed", basename(po_file)), immediate. = TRUE)
   }
   return(invisible())
diff --git a/R/onLoad.R b/R/onLoad.R
index d6fffc94..4bec955c 100644
--- a/R/onLoad.R
+++ b/R/onLoad.R
@@ -11,7 +11,7 @@ globalVariables(
     'source_location', 'c_fmt_tag', 'msgid_plural_str', 'msgstr_plural_str', 'suggested_call',
     'array_start', 'i.array_start', 'array_end', 'i.array_end', 'call_start', 'x.call_start', 'i.call_start',
     'paren_start', 'i.paren_start', 'paren_end', 'i.paren_end', 'x.paren_end',
-    'i.msgid', 'replacement', 'non_spurious', 'str_arg',
+    'i.msgid', 'replacement', 'non_spurious', 'str_arg', 'file_size',
     '.'
   ),
   package = 'potools'
diff --git a/R/translate_package.R b/R/translate_package.R
index a2e83491..daeee8cb 100644
--- a/R/translate_package.R
+++ b/R/translate_package.R
@@ -190,7 +190,7 @@ translate_package = function(
         equalize_count = {
           # plain & simple
           new_idx[seq(team_id, length(new_idx), by = team_size)]
-        }
+        },
         equalize_files = {
           assigned_files = message_data[
             (new_idx),
@@ -206,7 +206,7 @@ translate_package = function(
           ][
             order(file_size),
             {
-              assign_idx = (.I %% team_size) == (team_id - 1L)
+              assign_idx = (seq_len(.N) %% team_size) == (team_id - 1L)
               if (verbose) message(domain=NA, gettextf(
                 "Assigning team %d %d files for translation totalling %d characters",
                 team_id, assign_idx, sum(file_size[assign_idx])
diff --git a/tests/testthat/test-translate-package.R b/tests/testthat/test-translate-package.R
index 2d7f6427..b524688b 100644
--- a/tests/testthat/test-translate-package.R
+++ b/tests/testthat/test-translate-package.R
@@ -177,7 +177,7 @@ test_that("Packages with src code work correctly", {
       expect_true("po/rSrcMsg.pot" %in% pkg_files)
       expect(
         any(grepl("inst/po/zh_CN/LC_MESSAGES/rSrcMsg.mo", pkg_files, fixed = TRUE)),
-        "Didn't find rSrcMsg.mo; found %s", toString(pkg_files)
+        sprintf("Didn't find rSrcMsg.mo; found %s", toString(pkg_files))
       )
 
       # NB: paste(readLines(), collapse="\n") instead of readChar() for platform robustness

From f3a4ac648600f4e45278946d1b99bd63f45dceeb Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Tue, 22 Jun 2021 00:13:54 -0700
Subject: [PATCH 4/5] fix bug

---
 NAMESPACE             | 1 +
 R/msgmerge.R          | 4 +---
 R/translate_package.R | 3 +--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 51e0df5f..9bbaca7a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ importFrom(data.table, set, setcolorder, setindexv, setkeyv, setnames, setorderv
 importFrom(data.table, `:=`, .BY, .N, .SD)
 importFrom(data.table, fifelse, frank)
 
+importFrom(tools, checkPoFile)
 importFrom(utils, getParseData, globalVariables, head, tail, type.convert)
 
 export(translate_package)
diff --git a/R/msgmerge.R b/R/msgmerge.R
index 0f3c6b6d..9f35c045 100644
--- a/R/msgmerge.R
+++ b/R/msgmerge.R
@@ -5,7 +5,7 @@ run_msgmerge = function(po_file, pot_file) {
     warning(domain = NA, gettextf("Running msgmerge on '%s' failed.", po_file))
   }
 
-  res <- tools::checkPoFile(po_file, strictPlural = TRUE)
+  res <- checkPoFile(po_file, strictPlural = TRUE)
   if (nrow(res)) {
     warning(domain = NA, gettextf("tools::checkPoFile() found some issues in %s", po_file))
     print(res)
@@ -15,9 +15,7 @@ run_msgmerge = function(po_file, pot_file) {
 
 run_msgfmt = function(po_file, mo_file, verbose) {
   use_stats <- if (verbose) '--statistics' else ''
-  browser()
   if (system(sprintf("msgfmt -c %s -o %s %s", use_stats, shQuote(mo_file), shQuote(po_file))) != 0L) {
-    browser()
     warning(domain = NA, gettextf("running msgfmt on %s failed", basename(po_file)), immediate. = TRUE)
   }
   return(invisible())
diff --git a/R/translate_package.R b/R/translate_package.R
index daeee8cb..45c2cff1 100644
--- a/R/translate_package.R
+++ b/R/translate_package.R
@@ -144,10 +144,9 @@ translate_package = function(
     }
 
     new_idx = message_data[
-      !is_repeat &
       is_marked_for_translation & (
         fuzzy == 1L
-        | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE))
+        | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE) & !is_repeat)
         | (type == 'plural' & !vapply(msgstr_plural, function(x) all(nzchar(x)), logical(1L)))
       ),
       which = TRUE

From 06d97f4819041ffe272d568a4906e3cda6c06f94 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Tue, 22 Jun 2021 00:30:20 -0700
Subject: [PATCH 5/5] remove browser()

---
 R/translate_package.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/translate_package.R b/R/translate_package.R
index 45c2cff1..2c946827 100644
--- a/R/translate_package.R
+++ b/R/translate_package.R
@@ -157,7 +157,6 @@ translate_package = function(
         "Requested to split %d files among %d translators, which will exclude %d translators; setting team_split_rule = 'equalize_char' instead",
         n_files, team_size, team_size - n_files
       ))
-      browser()
       new_idx = switch(
         team_split_rule,
         equalize_char = {