MichaelChirico · MichaelChirico · Jun 21, 2021 · Jun 22, 2021 · Jun 22, 2021 · Jun 22, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,8 +2,9 @@ importFrom(data.table, data.table, foverlaps, fread, rbindlist, uniqueN, "%chin%
 importFrom(data.table, is.data.table, as.data.table)
 importFrom(data.table, set, setcolorder, setindexv, setkeyv, setnames, setorderv)
 importFrom(data.table, `:=`, .BY, .N, .SD)
-importFrom(data.table, fifelse)
+importFrom(data.table, fifelse, frank)
 
+importFrom(tools, checkPoFile)
 importFrom(utils, getParseData, globalVariables, head, tail, type.convert)
 
 export(translate_package)

diff --git a/R/msgmerge.R b/R/msgmerge.R
@@ -5,7 +5,7 @@ run_msgmerge = function(po_file, pot_file) {
     warning(domain = NA, gettextf("Running msgmerge on '%s' failed.", po_file))
   }
 
-  res <- tools::checkPoFile(po_file, strictPlural = TRUE)
+  res <- checkPoFile(po_file, strictPlural = TRUE)
   if (nrow(res)) {
     warning(domain = NA, gettextf("tools::checkPoFile() found some issues in %s", po_file))
     print(res)

diff --git a/R/onLoad.R b/R/onLoad.R
@@ -11,7 +11,7 @@ globalVariables(
     'source_location', 'c_fmt_tag', 'msgid_plural_str', 'msgstr_plural_str', 'suggested_call',
     'array_start', 'i.array_start', 'array_end', 'i.array_end', 'call_start', 'x.call_start', 'i.call_start',
     'paren_start', 'i.paren_start', 'paren_end', 'i.paren_end', 'x.paren_end',
-    'i.msgid', 'replacement', 'non_spurious', 'str_arg',
+    'i.msgid', 'replacement', 'non_spurious', 'str_arg', 'file_size',
     '.'
   ),
   package = 'potools'

diff --git a/R/translate_package.R b/R/translate_package.R
@@ -3,6 +3,8 @@ translate_package = function(
   diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src),
   src_translation_macros = c("_", "N_"),
   use_base_rules = package %chin% .potools$base_package_names,
+  team_size = 1L, team_id = 1L,
+  team_split_rule = c("equalize_char", "equalize_count", "equalize_files"),
   copyright = NULL, bugs = NULL, verbose = FALSE
 ) {
   check_sys_reqs()
@@ -14,9 +16,18 @@ translate_package = function(
     "'diagnostics' should be empty, a function, or list of functions" =
       is.null(diagnostics)
       || is.function(diagnostics)
-      || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L))))
+      || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))),
+    "'team_size' should be >=1 and 'team_id' should be between 1 and 'team_size'" =
+      is.numeric(team_size) && length(team_size) == 1L && is.numeric(team_id) && length(team_id) == 1L
+      && team_size >= 1 && team_id >= 1 && team_id <= team_size,
+    "When using team splitting, only translate one language at a time" =
+      team_size == 1L || length(languages) == 1L
   )
 
+  team_split_rule = match.arg(team_split_rule)
+  team_size = as.integer(team_size)
+  team_id = as.integer(team_id)
+
   # en-list singleton diagnostic for convenience
   if (is.function(diagnostics)) diagnostics = list(diagnostics)
 
@@ -135,12 +146,77 @@ translate_package = function(
     new_idx = message_data[
       is_marked_for_translation & (
         fuzzy == 1L
-        | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE))
+        | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE) & !is_repeat)
         | (type == 'plural' & !vapply(msgstr_plural, function(x) all(nzchar(x)), logical(1L)))
       ),
       which = TRUE
     ]
 
+    if (team_size > 1L) {
+      if ((n_files <- uniqueN(message_data[(new_idx)]$file)) < team_size) warning(domain=NA, gettextf(
+        "Requested to split %d files among %d translators, which will exclude %d translators; setting team_split_rule = 'equalize_char' instead",
+        n_files, team_size, team_size - n_files
+      ))
+      new_idx = switch(
+        team_split_rule,
+        equalize_char = {
+          message_data[
+            (new_idx),
+            {
+              msg_size = fifelse(
+                type == "singular",
+                nchar(msgid),
+                vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L))
+              )
+
+              # NB: use order instead of frank because we don't care about ties. ties.method='random'
+              #   also won't work because of the difficulty in matching seed across machines for
+              #   distributed translation teams. even if we do set.seed(team_size), say, we can't
+              #   guarantee the same RNG generator is being used; ensuring this is much more complication
+              #   than it's worth. I think there is some risk based on collation order that msg_size may
+              #   be different, but this _should_ be static owing to data.table's consistent sorting rules.
+              char_rank = order(msg_size)
+              assign_idx = which((char_rank %% team_size) == (team_id - 1L))
+              if (verbose) message(domain=NA, gettextf(
+                "Assigning team %d %d messages for translation totalling %d characters",
+                team_id, length(assign_idx), sum(msg_size[assign_idx])
+              ))
+              new_idx[assign_idx]
+            }
+          ]
+        },
+        equalize_count = {
+          # plain & simple
+          new_idx[seq(team_id, length(new_idx), by = team_size)]
+        },
+        equalize_files = {
+          assigned_files = message_data[
+            (new_idx),
+            by = file,
+            {
+              msg_size = fifelse(
+                type == "singular",
+                nchar(msgid),
+                vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L))
+              )
+              .(file_size = sum(msg_size))
+            }
+          ][
+            order(file_size),
+            {
+              assign_idx = (seq_len(.N) %% team_size) == (team_id - 1L)
+              if (verbose) message(domain=NA, gettextf(
+                "Assigning team %d %d files for translation totalling %d characters",
+                team_id, assign_idx, sum(file_size[assign_idx])
+              ))
+              file[assign_idx]
+            }
+          ]
+          message_data[(new_idx), new_idx[which(file %chin% assigned_files)]]
+        }
+      )
+    }
+
     if (!length(new_idx)) {
       if (verbose) message(domain=NA, gettextf(
         'Translations for %s are up to date! Skipping.',

diff --git a/man/translate_package.Rd b/man/translate_package.Rd
@@ -16,6 +16,8 @@ translate_package(
   diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src),
   src_translation_macros = c("_", "N_"),
   use_base_rules = package \%chin\% .potools$base_package_names,
+  team_size = 1L, team_id = 1L,
+  team_split_rule = c("equalize_char", "equalize_count", "equalize_files"),
   copyright = NULL, bugs = NULL, verbose=FALSE
 )
 }
@@ -25,6 +27,9 @@ translate_package(
   \item{diagnostics}{ A \code{list} of diagnostic functions to be run on the package's message data. See Details.}
   \item{src_translation_macros}{ Character, the macro used to indicate which \code{char} arrays are to be marked for translation in C/C++ files. The default, \code{_} and \code{N_}, is shared by R itself & recommended in R-exts and R-ints (See references). }
   \item{use_base_rules}{ Logical; Should internal behavior match base behavior as strictly as possible? \code{TRUE} if being run on a base package (i.e., \code{base} or one of the default packages like \code{utils}, \code{graphics}, etc.). See Details. }
+  \item{team_size}{ Integer; how many translators are there for the (singular) language? See Details for this, \code{team_id}, and \code{team_split_rule}. }
+  \item{team_id}{ Integer; which translator is currently working? }
+  \item{team_split_rule}{ Character; how should the message base be split up among the teams? }
   \item{copyright}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. }
   \item{bugs}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. }
   \item{verbose}{ Logical, default \code{FALSE}. Should extra information about progress, etc. be reported? }
@@ -53,6 +58,22 @@ directory (which is created if it does not yet exist).
 
 There are some discrepancies in the default behavior of \code{translate_package} and the translation workflow used to generate the \file{.po}/\file{.pot} files for R itself (mainly, the suite of functions from \code{tools}, \code{\link[tools]{update_pkg_po}}, \code{\link[tools]{xgettext2pot}}, \code{\link[tools]{xgettext}}, and \code{\link[tools]{xngettext}}). They should only be superficial (e.g., whitespace or comments), but nevertheless may represent a barrier to smoothly submitting patchings to R Core. To make the process of translating base R and the default packages (\code{tools}, \code{utils}, \code{stats}, etc.) as smooth as possible, set the \code{use_base_rules} argument to \code{TRUE} and your resulting \file{.po}/\file{.pot}/\file{.mo} file will match base's.
 
+\bold{Teams:}
+
+For packages with larger message bases to tackle (e.g., R itself or a large, currently-untranslated package), a divide-and-conquer approach may be preferable if a suitable team can be assembled. The arguments \code{team_size}, \code{team_id}, and \code{team_split_rule} are meant to facilitate the work of translation in this case. If \code{team_size > 1}, first the set of messages that need translation is divided "roughly"" equally into \code{team_size} parts, each of which is assigned an "ID" from \code{1} to \code{team_size}. You can select which block of messages you'd like to translate by passing your \code{team_id} (the ID for each translator will need to be coordinated amongst the team members).
+
+There are three ways the translation set can be split "equally", controlled by the \code{team_split_rule} argument:
+
+\enumerate{
+  \item \code{"equalize_char"}: Roughly assign each translator the same number of \emph{characters} of messages to translate (i.e., according to \code{\link{nchar}}).
+  \item \code{"equalize_count"}: Roughly assign each translator the same number of \emph{messages} to translate. Specific implementation is analogous to \code{"equalize_char"}.
+  \item \code{"equalize_files"}: Roughly assign each translator the same number of \emph{soure files} to translate. The thinking here is to try and give one translator many similar messages on the hope that there are some efficiency gains from autocorrelation in the messages.
+}
+
+NB: \code{"equalize_char"} and \code{"equalize_files"} are implemented by sorting the and slicing. For example, if \code{team_size=3}, for \code{"equalize_char"}, the first translator (\code{team_id=1}) will get the 1st, 4th, 7th, ... largest messages, the second (\code{team_id=2}) will get the 2nd, 5th, 8th, ... largest, and the third (\code{team_id=3}) will get the 3rd, 6th, 9th, ... largest. For \code{"equalize_count"}, messages are simply taken in alternating order, sorted as they are from \code{\link{get_message_data}}.
+
+NB: this option only applies when a single language is specified for translation.
+
 \bold{Diagnostics:}
 
 A diagnostic is a function which takes as input a \code{data.table} summarizing the translatable strings in

diff --git a/tests/testthat/test-translate-package.R b/tests/testthat/test-translate-package.R
@@ -177,7 +177,7 @@ test_that("Packages with src code work correctly", {
       expect_true("po/rSrcMsg.pot" %in% pkg_files)
       expect(
         any(grepl("inst/po/zh_CN/LC_MESSAGES/rSrcMsg.mo", pkg_files, fixed = TRUE)),
-        "Didn't find rSrcMsg.mo; found %s", toString(pkg_files)
+        sprintf("Didn't find rSrcMsg.mo; found %s", toString(pkg_files))
       )
 
       # NB: paste(readLines(), collapse="\n") instead of readChar() for platform robustness

diff --git a/tests/testthat/test_packages/r_msg/po/R-rMsg.pot b/tests/testthat/test_packages/r_msg/po/R-rMsg.pot
@@ -6,7 +6,7 @@
 msgid ""
 msgstr ""
 "Project-Id-Version: rMsg 0.0.1\n"
-"POT-Creation-Date: 2021-06-20 05:58:19\n"
+"POT-Creation-Date: 2021-06-21 07:46:19\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"