From b302274033c4f1dfe0e44b592c479273727c26be Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 00:49:46 -0700 Subject: [PATCH 1/5] initial work on team splitting tool --- NAMESPACE | 2 +- R/translate_package.R | 69 ++++++++++++++++++- .../test_packages/r_msg/po/R-rMsg.pot | 2 +- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c6290da8..51e0df5f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,7 @@ importFrom(data.table, data.table, foverlaps, fread, rbindlist, uniqueN, "%chin% importFrom(data.table, is.data.table, as.data.table) importFrom(data.table, set, setcolorder, setindexv, setkeyv, setnames, setorderv) importFrom(data.table, `:=`, .BY, .N, .SD) -importFrom(data.table, fifelse) +importFrom(data.table, fifelse, frank) importFrom(utils, getParseData, globalVariables, head, tail, type.convert) diff --git a/R/translate_package.R b/R/translate_package.R index 7ab56f67..60111333 100644 --- a/R/translate_package.R +++ b/R/translate_package.R @@ -3,6 +3,8 @@ translate_package = function( diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src), src_translation_macros = c("_", "N_"), use_base_rules = package %chin% .potools$base_package_names, + team_size = 1L, team_id = 1L, + team_split_rule = c("equalize_char", "equalize_files"), copyright = NULL, bugs = NULL, verbose = FALSE ) { check_sys_reqs() @@ -14,9 +16,18 @@ translate_package = function( "'diagnostics' should be empty, a function, or list of functions" = is.null(diagnostics) || is.function(diagnostics) - || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))) + || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))), + "'team_size' should be >=1 and 'team_id' should be between 1 and 'team_size'" = + is.numeric(team_size) && is.numeric(team_id) + && team_size >= 1 && team_id >= 1 && team_id <= team_size, + "When using team splitting, only translate one language at a time" = + team_size == 1L || length(languages) == 1L ) + team_split_rule = match.arg(team_split_rule) + team_size = as.integer(team_size) + team_id = as.integer(team_id) + # en-list singleton diagnostic for convenience if (is.function(diagnostics)) diagnostics = list(diagnostics) @@ -133,6 +144,7 @@ translate_package = function( } new_idx = message_data[ + !is_repeat & is_marked_for_translation & ( fuzzy == 1L | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE)) @@ -141,6 +153,61 @@ translate_package = function( which = TRUE ] + if (team_size > 1L) { + if ((n_files <- uniqueN(message_data[(new_idx)]$file)) < team_size) warning(domain=NA, gettextf( + "Requested to split %d files among %d translators, which will exclude %d translators; setting team_split_rule = 'equalize_char' instead", + n_files, team_size, team_size - n_files + )) + browser() + new_idx = switch( + team_split_rule, + equalize_char = { + message_data[ + (new_idx), + { + msg_size = fifelse( + type == "singular", + nchar(msgid), + vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L)) + ) + char_rank = frank(msg_size, ties.method = "random") + assign_idx = which((char_rank %% team_size) == (team_id - 1L)) + if (verbose) message(domain=NA, gettextf( + "Assigning team %d %d messages for translation totalling %d characters", + team_id, length(assign_idx), sum(msg_size[assign_idx]) + )) + new_idx[assign_idx] + } + ] + }, + equalize_files = { + assigned_files = message_data[ + (new_idx), + by = file, + { + msg_size = fifelse( + type == "singular", + nchar(msgid), + vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L)) + ) + .(file_size = sum(msg_size)) + } + ][ + order(file_size), + { + assign_idx = (.I %% team_size) == (team_id - 1L) + if (verbose) message(domain=NA, gettextf( + "Assigning team %d %d files for translation totalling %d characters", + team_id, assign_idx, sum(file_size[assign_idx]) + )) + file[assign_idx] + } + ] + message_data[(new_idx), new_idx[which(file %chin% assigned_files)]] + } + ) + } + if (!length(new_idx)) { if (verbose) message(domain=NA, gettextf( 'Translations for %s are up to date! Skipping.', diff --git a/tests/testthat/test_packages/r_msg/po/R-rMsg.pot b/tests/testthat/test_packages/r_msg/po/R-rMsg.pot index 781a62a8..8494ad3d 100644 --- a/tests/testthat/test_packages/r_msg/po/R-rMsg.pot +++ b/tests/testthat/test_packages/r_msg/po/R-rMsg.pot @@ -6,7 +6,7 @@ msgid "" msgstr "" "Project-Id-Version: rMsg 0.0.1\n" -"POT-Creation-Date: 2021-06-20 05:58:19\n" +"POT-Creation-Date: 2021-06-21 07:46:19\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" From 0df94f145eda56dbda709724de5b7336d9416efd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 23:25:55 -0700 Subject: [PATCH 2/5] add a new option, add documentation, resist using any randomization --- R/translate_package.R | 17 ++++++++++++++--- man/translate_package.Rd | 21 +++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/R/translate_package.R b/R/translate_package.R index 60111333..a2e83491 100644 --- a/R/translate_package.R +++ b/R/translate_package.R @@ -4,7 +4,7 @@ translate_package = function( src_translation_macros = c("_", "N_"), use_base_rules = package %chin% .potools$base_package_names, team_size = 1L, team_id = 1L, - team_split_rule = c("equalize_char", "equalize_files"), + team_split_rule = c("equalize_char", "equalize_count", "equalize_files"), copyright = NULL, bugs = NULL, verbose = FALSE ) { check_sys_reqs() @@ -18,7 +18,7 @@ translate_package = function( || is.function(diagnostics) || (is.list(diagnostics) && all(vapply(diagnostics, is.function, logical(1L)))), "'team_size' should be >=1 and 'team_id' should be between 1 and 'team_size'" = - is.numeric(team_size) && is.numeric(team_id) + is.numeric(team_size) && length(team_size) == 1L && is.numeric(team_id) && length(team_id) == 1L && team_size >= 1 && team_id >= 1 && team_id <= team_size, "When using team splitting, only translate one language at a time" = team_size == 1L || length(languages) == 1L @@ -170,7 +170,14 @@ translate_package = function( nchar(msgid), vapply(msgid_plural, function(x) sum(nchar(x)), numeric(1L)) ) - char_rank = frank(msg_size, ties.method = "random") + + # NB: use order instead of frank because we don't care about ties. ties.method='random' + # also won't work because of the difficulty in matching seed across machines for + # distributed translation teams. even if we do set.seed(team_size), say, we can't + # guarantee the same RNG generator is being used; ensuring this is much more complication + # than it's worth. I think there is some risk based on collation order that msg_size may + # be different, but this _should_ be static owing to data.table's consistent sorting rules. + char_rank = order(msg_size) assign_idx = which((char_rank %% team_size) == (team_id - 1L)) if (verbose) message(domain=NA, gettextf( "Assigning team %d %d messages for translation totalling %d characters", @@ -180,6 +187,10 @@ translate_package = function( } ] }, + equalize_count = { + # plain & simple + new_idx[seq(team_id, length(new_idx), by = team_size)] + } equalize_files = { assigned_files = message_data[ (new_idx), diff --git a/man/translate_package.Rd b/man/translate_package.Rd index c5a6041c..cc55d476 100644 --- a/man/translate_package.Rd +++ b/man/translate_package.Rd @@ -16,6 +16,8 @@ translate_package( diagnostics = list(check_cracked_messages, check_untranslated_cat, check_untranslated_src), src_translation_macros = c("_", "N_"), use_base_rules = package \%chin\% .potools$base_package_names, + team_size = 1L, team_id = 1L, + team_split_rule = c("equalize_char", "equalize_count", "equalize_files"), copyright = NULL, bugs = NULL, verbose=FALSE ) } @@ -25,6 +27,9 @@ translate_package( \item{diagnostics}{ A \code{list} of diagnostic functions to be run on the package's message data. See Details.} \item{src_translation_macros}{ Character, the macro used to indicate which \code{char} arrays are to be marked for translation in C/C++ files. The default, \code{_} and \code{N_}, is shared by R itself & recommended in R-exts and R-ints (See references). } \item{use_base_rules}{ Logical; Should internal behavior match base behavior as strictly as possible? \code{TRUE} if being run on a base package (i.e., \code{base} or one of the default packages like \code{utils}, \code{graphics}, etc.). See Details. } + \item{team_size}{ Integer; how many translators are there for the (singular) language? See Details for this, \code{team_id}, and \code{team_split_rule}. } + \item{team_id}{ Integer; which translator is currently working? } + \item{team_split_rule}{ Character; how should the message base be split up among the teams? } \item{copyright}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. } \item{bugs}{ Character; passed on to \code{\link[tools]{update_pkg_po}}. } \item{verbose}{ Logical, default \code{FALSE}. Should extra information about progress, etc. be reported? } @@ -53,6 +58,22 @@ directory (which is created if it does not yet exist). There are some discrepancies in the default behavior of \code{translate_package} and the translation workflow used to generate the \file{.po}/\file{.pot} files for R itself (mainly, the suite of functions from \code{tools}, \code{\link[tools]{update_pkg_po}}, \code{\link[tools]{xgettext2pot}}, \code{\link[tools]{xgettext}}, and \code{\link[tools]{xngettext}}). They should only be superficial (e.g., whitespace or comments), but nevertheless may represent a barrier to smoothly submitting patchings to R Core. To make the process of translating base R and the default packages (\code{tools}, \code{utils}, \code{stats}, etc.) as smooth as possible, set the \code{use_base_rules} argument to \code{TRUE} and your resulting \file{.po}/\file{.pot}/\file{.mo} file will match base's. +\bold{Teams:} + +For packages with larger message bases to tackle (e.g., R itself or a large, currently-untranslated package), a divide-and-conquer approach may be preferable if a suitable team can be assembled. The arguments \code{team_size}, \code{team_id}, and \code{team_split_rule} are meant to facilitate the work of translation in this case. If \code{team_size > 1}, first the set of messages that need translation is divided "roughly"" equally into \code{team_size} parts, each of which is assigned an "ID" from \code{1} to \code{team_size}. You can select which block of messages you'd like to translate by passing your \code{team_id} (the ID for each translator will need to be coordinated amongst the team members). + +There are three ways the translation set can be split "equally", controlled by the \code{team_split_rule} argument: + +\enumerate{ + \item \code{"equalize_char"}: Roughly assign each translator the same number of \emph{characters} of messages to translate (i.e., according to \code{\link{nchar}}). + \item \code{"equalize_count"}: Roughly assign each translator the same number of \emph{messages} to translate. Specific implementation is analogous to \code{"equalize_char"}. + \item \code{"equalize_files"}: Roughly assign each translator the same number of \emph{soure files} to translate. The thinking here is to try and give one translator many similar messages on the hope that there are some efficiency gains from autocorrelation in the messages. +} + +NB: \code{"equalize_char"} and \code{"equalize_files"} are implemented by sorting the and slicing. For example, if \code{team_size=3}, for \code{"equalize_char"}, the first translator (\code{team_id=1}) will get the 1st, 4th, 7th, ... largest messages, the second (\code{team_id=2}) will get the 2nd, 5th, 8th, ... largest, and the third (\code{team_id=3}) will get the 3rd, 6th, 9th, ... largest. For \code{"equalize_count"}, messages are simply taken in alternating order, sorted as they are from \code{\link{get_message_data}}. + +NB: this option only applies when a single language is specified for translation. + \bold{Diagnostics:} A diagnostic is a function which takes as input a \code{data.table} summarizing the translatable strings in From 9459e34b1c9cab979daefeb12a208f8aee0367fe Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 21 Jun 2021 23:45:30 -0700 Subject: [PATCH 3/5] debugging... --- R/msgmerge.R | 2 ++ R/onLoad.R | 2 +- R/translate_package.R | 4 ++-- tests/testthat/test-translate-package.R | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/msgmerge.R b/R/msgmerge.R index c4553d93..0f3c6b6d 100644 --- a/R/msgmerge.R +++ b/R/msgmerge.R @@ -15,7 +15,9 @@ run_msgmerge = function(po_file, pot_file) { run_msgfmt = function(po_file, mo_file, verbose) { use_stats <- if (verbose) '--statistics' else '' + browser() if (system(sprintf("msgfmt -c %s -o %s %s", use_stats, shQuote(mo_file), shQuote(po_file))) != 0L) { + browser() warning(domain = NA, gettextf("running msgfmt on %s failed", basename(po_file)), immediate. = TRUE) } return(invisible()) diff --git a/R/onLoad.R b/R/onLoad.R index d6fffc94..4bec955c 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -11,7 +11,7 @@ globalVariables( 'source_location', 'c_fmt_tag', 'msgid_plural_str', 'msgstr_plural_str', 'suggested_call', 'array_start', 'i.array_start', 'array_end', 'i.array_end', 'call_start', 'x.call_start', 'i.call_start', 'paren_start', 'i.paren_start', 'paren_end', 'i.paren_end', 'x.paren_end', - 'i.msgid', 'replacement', 'non_spurious', 'str_arg', + 'i.msgid', 'replacement', 'non_spurious', 'str_arg', 'file_size', '.' ), package = 'potools' diff --git a/R/translate_package.R b/R/translate_package.R index a2e83491..daeee8cb 100644 --- a/R/translate_package.R +++ b/R/translate_package.R @@ -190,7 +190,7 @@ translate_package = function( equalize_count = { # plain & simple new_idx[seq(team_id, length(new_idx), by = team_size)] - } + }, equalize_files = { assigned_files = message_data[ (new_idx), @@ -206,7 +206,7 @@ translate_package = function( ][ order(file_size), { - assign_idx = (.I %% team_size) == (team_id - 1L) + assign_idx = (seq_len(.N) %% team_size) == (team_id - 1L) if (verbose) message(domain=NA, gettextf( "Assigning team %d %d files for translation totalling %d characters", team_id, assign_idx, sum(file_size[assign_idx]) diff --git a/tests/testthat/test-translate-package.R b/tests/testthat/test-translate-package.R index 2d7f6427..b524688b 100644 --- a/tests/testthat/test-translate-package.R +++ b/tests/testthat/test-translate-package.R @@ -177,7 +177,7 @@ test_that("Packages with src code work correctly", { expect_true("po/rSrcMsg.pot" %in% pkg_files) expect( any(grepl("inst/po/zh_CN/LC_MESSAGES/rSrcMsg.mo", pkg_files, fixed = TRUE)), - "Didn't find rSrcMsg.mo; found %s", toString(pkg_files) + sprintf("Didn't find rSrcMsg.mo; found %s", toString(pkg_files)) ) # NB: paste(readLines(), collapse="\n") instead of readChar() for platform robustness From f3a4ac648600f4e45278946d1b99bd63f45dceeb Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 22 Jun 2021 00:13:54 -0700 Subject: [PATCH 4/5] fix bug --- NAMESPACE | 1 + R/msgmerge.R | 4 +--- R/translate_package.R | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 51e0df5f..9bbaca7a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ importFrom(data.table, set, setcolorder, setindexv, setkeyv, setnames, setorderv importFrom(data.table, `:=`, .BY, .N, .SD) importFrom(data.table, fifelse, frank) +importFrom(tools, checkPoFile) importFrom(utils, getParseData, globalVariables, head, tail, type.convert) export(translate_package) diff --git a/R/msgmerge.R b/R/msgmerge.R index 0f3c6b6d..9f35c045 100644 --- a/R/msgmerge.R +++ b/R/msgmerge.R @@ -5,7 +5,7 @@ run_msgmerge = function(po_file, pot_file) { warning(domain = NA, gettextf("Running msgmerge on '%s' failed.", po_file)) } - res <- tools::checkPoFile(po_file, strictPlural = TRUE) + res <- checkPoFile(po_file, strictPlural = TRUE) if (nrow(res)) { warning(domain = NA, gettextf("tools::checkPoFile() found some issues in %s", po_file)) print(res) @@ -15,9 +15,7 @@ run_msgmerge = function(po_file, pot_file) { run_msgfmt = function(po_file, mo_file, verbose) { use_stats <- if (verbose) '--statistics' else '' - browser() if (system(sprintf("msgfmt -c %s -o %s %s", use_stats, shQuote(mo_file), shQuote(po_file))) != 0L) { - browser() warning(domain = NA, gettextf("running msgfmt on %s failed", basename(po_file)), immediate. = TRUE) } return(invisible()) diff --git a/R/translate_package.R b/R/translate_package.R index daeee8cb..45c2cff1 100644 --- a/R/translate_package.R +++ b/R/translate_package.R @@ -144,10 +144,9 @@ translate_package = function( } new_idx = message_data[ - !is_repeat & is_marked_for_translation & ( fuzzy == 1L - | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE)) + | (type == 'singular' & !nzchar(msgstr) & nzchar(msgid, keepNA = TRUE) & !is_repeat) | (type == 'plural' & !vapply(msgstr_plural, function(x) all(nzchar(x)), logical(1L))) ), which = TRUE From 06d97f4819041ffe272d568a4906e3cda6c06f94 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 22 Jun 2021 00:30:20 -0700 Subject: [PATCH 5/5] remove browser() --- R/translate_package.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/translate_package.R b/R/translate_package.R index 45c2cff1..2c946827 100644 --- a/R/translate_package.R +++ b/R/translate_package.R @@ -157,7 +157,6 @@ translate_package = function( "Requested to split %d files among %d translators, which will exclude %d translators; setting team_split_rule = 'equalize_char' instead", n_files, team_size, team_size - n_files )) - browser() new_idx = switch( team_split_rule, equalize_char = {