Skip to content

Commit d0a50cc

Browse files
authored
fix(content-translator): translate rich text as marker units and align by input index (#166)
1 parent 0cf1f06 commit d0a50cc

7 files changed

Lines changed: 358 additions & 42 deletions

File tree

content-translator/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## Unreleased
44

5+
- fix: translate rich text block-level elements as one unit using segment markers so inline formatting spans stay aligned and word order can change across languages
6+
- fix: reconstruct OpenAI translations by input index so a merged, dropped, or reordered entry no longer shifts later translations into the wrong fields; missing entries keep their original text
7+
- fix: abort a translation when the resolver returns a different number of texts than were sent, and guard against non-string values reaching `he.decode`
58
- fix: translate each entry of `hasMany` text fields individually so keyword/tag lists are translated instead of crashing
69
- fix: translate fields inside unnamed (presentational) groups instead of throwing an "Unnamed groups are currently not supported" error
710
- fix: skip fields and tabs named `__proto__`, `constructor`, or `prototype` during traversal to avoid prototype-polluting writes when a user-supplied Payload config contains such a name

content-translator/src/resolvers/openAI.ts

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,23 @@ type OpenAIResponse = {
3232
}
3333

3434
const defaultPrompt: OpenAIPrompt = ({ localeFrom, localeTo, texts }) => {
35-
return `Translate the following array of strings from the language with ISO 639 code "${localeFrom}" to the language with ISO 639 code "${localeTo}".
35+
const indexed: Record<string, string> = {}
36+
texts.forEach((text, i) => {
37+
indexed[String(i)] = text
38+
})
3639

37-
IMPORTANT: You must return ONLY a valid JSON object with a "translations" key containing the array of translated strings. The array must maintain the exact same length and order as the input. Properly escape all special characters including quotes, newlines, and backslashes according to JSON standards.
40+
return `Translate the values of the following JSON object from the language with ISO 639 code "${localeFrom}" to the language with ISO 639 code "${localeTo}".
3841
39-
Input array to translate:
40-
${JSON.stringify(texts, null, 2)}
42+
IMPORTANT: Return ONLY a valid JSON object with a "translations" key whose value is an object using the EXACT SAME KEYS as the input. Translate each value independently and keep it under its own key. Never merge, split, drop, reorder, or add entries — even if two adjacent values look like fragments of the same sentence, they MUST stay as separate keys. Preserve leading and trailing whitespace of each value. Properly escape all special characters including quotes, newlines, and backslashes according to JSON standards.
43+
44+
Some values contain segment markers of the form ⟦0⟧, ⟦1⟧, ⟦2⟧ (a number enclosed in the brackets ⟦ ⟧). These markers separate inline formatting spans within one text. In your translation, keep every marker exactly as it appears — same characters, same numbers, each marker exactly once — and place each marker immediately before the translated words that belong to its segment. You may move words across markers when grammar requires it, but never add, remove, renumber, duplicate, or translate the markers themselves.
45+
46+
Input object to translate:
47+
${JSON.stringify(indexed, null, 2)}
4148
4249
Expected response format:
4350
{
44-
"translations": ["translated string 1", "translated string 2", ...]
51+
"translations": { "0": "translated value 0", "1": "translated value 1" }
4552
}`
4653
}
4754

@@ -63,6 +70,7 @@ export const openAIResolver = ({
6370
try {
6471
const response: {
6572
data: OpenAIResponse
73+
inputTexts: string[]
6674
success: boolean
6775
}[] = await Promise.all(
6876
chunkArray(texts, chunkLength).map(async (texts) => {
@@ -94,6 +102,7 @@ export const openAIResolver = ({
94102

95103
return {
96104
data,
105+
inputTexts: texts,
97106
success: res.ok,
98107
}
99108
})
@@ -102,7 +111,7 @@ export const openAIResolver = ({
102111

103112
const translated: string[] = []
104113

105-
for (const { data, success } of response) {
114+
for (const { data, inputTexts, success } of response) {
106115
if (!success) {
107116
return {
108117
success: false as const,
@@ -121,12 +130,12 @@ export const openAIResolver = ({
121130
}
122131
}
123132

124-
let translatedChunk: string[] = []
133+
let translationsObj: unknown
125134

126135
try {
127136
const parsedResponse = JSON.parse(content)
128137

129-
// Extract translations array from the response object
138+
// Extract the translations from the response object
130139
if (!parsedResponse || typeof parsedResponse !== 'object') {
131140
req.payload.logger.error({
132141
fullContent: content,
@@ -152,7 +161,7 @@ export const openAIResolver = ({
152161
}
153162
}
154163

155-
translatedChunk = parsedResponse.translations
164+
translationsObj = parsedResponse.translations
156165
} catch (e) {
157166
req.payload.logger.error({
158167
error: e instanceof Error ? e.message : String(e),
@@ -165,35 +174,47 @@ export const openAIResolver = ({
165174
}
166175
}
167176

168-
if (!Array.isArray(translatedChunk)) {
177+
// "translations" must be an object (keyed by index) or an array.
178+
// A bare string would otherwise be indexed character by character
179+
// (e.g. "abc"["0"] === "a"), producing garbage that still looks
180+
// like a success - so reject anything that is not an object/array.
181+
if (translationsObj === null || typeof translationsObj !== 'object') {
169182
req.payload.logger.error({
170-
data: translatedChunk,
171183
fullContent: content,
172184
message:
173-
'An error occurred when trying to translate the data using OpenAI API - parsed content is not an array',
185+
'An error occurred when trying to parse the content - "translations" is not an object or array',
186+
translations: translationsObj,
174187
})
175188

176189
return {
177190
success: false as const,
178191
}
179192
}
180193

181-
for (const text of translatedChunk) {
182-
if (text && typeof text !== 'string') {
183-
req.payload.logger.error({
184-
chunkData: translatedChunk,
185-
data: text,
186-
fullContent: content,
194+
// The model is asked to return an object keyed by the input index.
195+
// Reconstruct the output strictly from the input indices so the
196+
// result always has the same length and order as the input. A
197+
// missing / merged / non-string key keeps the original text (left
198+
// untranslated in its own slot) instead of shifting every later
199+
// value into the wrong field. An array response is tolerated too,
200+
// for backwards compatibility with custom prompts.
201+
for (let i = 0; i < inputTexts.length; i++) {
202+
const value = Array.isArray(translationsObj)
203+
? translationsObj[i]
204+
: (translationsObj as Record<string, unknown>)[String(i)]
205+
206+
if (typeof value === 'string') {
207+
translated.push(value)
208+
} else {
209+
req.payload.logger.warn({
210+
index: i,
187211
message:
188-
'An error occurred when trying to translate the data using OpenAI API - parsed content is not a string',
212+
'Translation missing or not a string for input index - keeping original text',
213+
original: inputTexts[i],
189214
})
190215

191-
return {
192-
success: false as const,
193-
}
216+
translated.push(inputTexts[i])
194217
}
195-
196-
translated.push(text)
197218
}
198219
}
199220

content-translator/src/translate/operation.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,29 @@ export const translateOperation = async (args: TranslateOperationArgs) => {
7979
let result: TranslateResult
8080

8181
if (!resolveResult.success) {
82+
result = {
83+
success: false,
84+
}
85+
} else if (resolveResult.translatedTexts.length !== valuesToTranslate.length) {
86+
// Defense in depth: the resolver must return exactly one translation per
87+
// input value, in order. A count mismatch means index-based write-back
88+
// would shift translations into the wrong fields, so fail instead.
89+
req.payload.logger.error({
90+
inputCount: valuesToTranslate.length,
91+
message: 'Translation aborted: resolver returned a different number of texts than were sent',
92+
outputCount: resolveResult.translatedTexts.length,
93+
})
94+
8295
result = {
8396
success: false,
8497
}
8598
} else {
8699
resolveResult.translatedTexts.forEach((translated, index) => {
87-
const formattedValue = he.decode(translated)
100+
// he.decode() calls String.prototype.replace internally, so a
101+
// non-string value (e.g. an array slipping through from a hasMany
102+
// field) would throw "e.replace is not a function". Guard against it
103+
// and pass non-strings through untouched.
104+
const formattedValue = typeof translated === 'string' ? he.decode(translated) : translated
88105

89106
valuesToTranslate[index].onTranslate(formattedValue)
90107
})

content-translator/src/translate/traverseFields.ts

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -263,14 +263,6 @@ export const traverseFields = ({
263263
if (root) {
264264
traverseRichText({
265265
emptyOnly,
266-
onText: (siblingData, key) => {
267-
valuesToTranslate.push({
268-
onTranslate: (translated: string) => {
269-
siblingData[key] = translated
270-
},
271-
value: siblingData[key],
272-
})
273-
},
274266
payloadConfig,
275267
root,
276268
translatedData,

content-translator/src/translate/traverseRichText.ts

Lines changed: 108 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,95 @@ import type { ValueToTranslate } from './types.js'
44

55
import { traverseFields } from './traverseFields.js'
66

7+
// Markers delimit the individual text nodes inside a single block-level element
// (paragraph, heading, list item, quote, ...) so the element can be translated
// as ONE unit. This lets the translator reorder words across nodes — required
// for languages like German -> English where the verb moves — while each
// formatting span (bold, italic, ...) still receives its own translated text.
// Translating each text node in isolation made the model merge adjacent
// sentence fragments and shift content (and formatting) into the wrong nodes.
const MARKER_OPEN = '⟦'
const MARKER_CLOSE = '⟧'
const buildMarker = (i: number) => `${MARKER_OPEN}${i}${MARKER_CLOSE}`
// Matches one COMPLETE marker (open bracket, digits, close bracket) and
// captures its number. Matching bare digits instead would treat ordinary
// numbers in the translated text ("3 apples", "2024") as markers and would
// leave the bracket characters inside the segments written back to the nodes.
// Neither bracket character is special in a regular expression, so the
// constants can be interpolated without escaping.
const markerRegex = new RegExp(`${MARKER_OPEN}(\\d+)${MARKER_CLOSE}`, 'g')

/**
 * Queue a maximal run of consecutive text-node siblings for translation.
 *
 * - A run with no translatable text (empty run, or every node's `text` is
 *   empty) queues nothing.
 * - A single node is sent as plain text (no markers).
 * - Two or more nodes are joined into one marker-delimited value and split
 *   back apart, by marker number, once translated.
 *
 * @param run - Consecutive sibling nodes that each carry a string `text`; the
 *   nodes are mutated in place when the translation arrives.
 * @param valuesToTranslate - Collector that receives at most one new entry.
 */
const pushTextRun = (run: Record<string, any>[], valuesToTranslate: ValueToTranslate[]) => {
  // Nothing worth sending: avoids pushing a marker-only string such as "⟦0⟧⟦1⟧"
  // and keeps the multi-node path consistent with the single-node path.
  if (run.every((node) => !node.text)) {
    return
  }

  if (run.length === 1) {
    const node = run[0]

    valuesToTranslate.push({
      onTranslate: (translated: string) => {
        node.text = translated
      },
      value: node.text,
    })

    return
  }

  const combined = run.map((node, index) => buildMarker(index) + node.text).join('')

  valuesToTranslate.push({
    onTranslate: (translated: string) => {
      const matches: { end: number; index: number; start: number }[] = []
      let match: null | RegExpExecArray
      // The regex is global and shared, so rewind it before every scan.
      markerRegex.lastIndex = 0

      while ((match = markerRegex.exec(translated)) !== null) {
        matches.push({
          end: markerRegex.lastIndex,
          index: parseInt(match[1], 10),
          start: match.index,
        })
      }

      if (matches.length === 0) {
        // No markers survived translation - keep the originals rather than
        // writing the whole translated blob into the first node.
        return
      }

      // Each segment is the text following its marker up to the next marker
      // in textual order, so reordering across markers is handled correctly.
      // If a marker number occurs more than once, the last occurrence wins.
      const segments = new Map<number, string>()

      matches.forEach((current, position) => {
        const next = matches[position + 1]
        // Text the model emitted before the first marker belongs to no
        // segment; attach it to the first one instead of dropping it.
        const leading = position === 0 ? translated.slice(0, current.start) : ''

        segments.set(
          current.index,
          leading + translated.slice(current.end, next ? next.start : translated.length),
        )
      })

      // Nodes whose marker is missing keep their original text; marker
      // numbers that do not correspond to a node in the run are ignored.
      run.forEach((node, index) => {
        const segment = segments.get(index)

        if (typeof segment === 'string') {
          node.text = segment
        }
      })
    },
    value: combined,
  })
}
86+
787
export const traverseRichText = ({
888
emptyOnly,
9-
onText,
1089
payloadConfig,
1190
root,
1291
siblingData,
1392
translatedData,
1493
valuesToTranslate,
1594
}: {
1695
emptyOnly: boolean
17-
onText: (siblingData: Record<string, unknown>, key: string) => void
1896
payloadConfig: SanitizedConfig
1997
root: Record<string, unknown>
2098
siblingData?: Record<string, unknown>
@@ -23,10 +101,6 @@ export const traverseRichText = ({
23101
}) => {
24102
siblingData = siblingData ?? root
25103

26-
if (siblingData.text) {
27-
onText(siblingData, 'text')
28-
}
29-
30104
if (siblingData.type === 'block') {
31105
if (
32106
'fields' in siblingData &&
@@ -58,17 +132,42 @@ export const traverseRichText = ({
58132
} else {
59133
console.warn('Could not find fields and blockType in block', siblingData)
60134
}
61-
} else if (Array.isArray(siblingData?.children)) {
62-
for (const child of siblingData.children) {
135+
136+
return
137+
}
138+
139+
if (!Array.isArray(siblingData?.children)) {
140+
return
141+
}
142+
143+
const children = siblingData.children as Record<string, any>[]
144+
let i = 0
145+
146+
while (i < children.length) {
147+
const child = children[i]
148+
149+
if (child && typeof child.text === 'string') {
150+
// Gather a maximal run of consecutive text nodes and translate them
151+
// together so a sentence split across formatting spans stays aligned.
152+
const run: Record<string, any>[] = []
153+
154+
while (i < children.length && children[i] && typeof children[i].text === 'string') {
155+
run.push(children[i])
156+
i++
157+
}
158+
159+
pushTextRun(run, valuesToTranslate)
160+
} else {
63161
traverseRichText({
64162
emptyOnly,
65-
onText,
66163
payloadConfig,
67164
root,
68165
siblingData: child,
69166
translatedData,
70167
valuesToTranslate,
71168
})
169+
170+
i++
72171
}
73172
}
74173
}

0 commit comments

Comments
 (0)