Skip to content

Commit f3d8c7b

Browse files
feat: rich text formatting improvement in google docs AI parser and hooks in json as a doc
1 parent 37315ca commit f3d8c7b

File tree

6 files changed

+303
-54
lines changed

6 files changed

+303
-54
lines changed

apps/google-docs/functions/agents/documentParserAgent/documentParser.agent.ts

Lines changed: 76 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,23 @@
1111
import { createOpenAI } from '@ai-sdk/openai';
1212
import { generateObject } from 'ai';
1313
import { ContentTypeProps } from 'contentful-management';
14-
import { fetchGoogleDoc } from '../../service/googleDriveService';
1514
import { FinalEntriesResultSchema, FinalEntriesResult } from './schema';
1615

1716
/**
1817
* Configuration for the document parser
1918
*/
2019
export interface DocumentParserConfig {
2120
openAiApiKey: string;
22-
googleDocUrl: string;
21+
document: unknown; // JSON document from Google Docs API or test data
2322
contentTypes: ContentTypeProps[];
2423
locale?: string;
2524
}
2625

2726
/**
28-
* AI Agent that parses a Google Doc and extracts structured entries
27+
* AI Agent that parses a Google Doc JSON and extracts structured entries
2928
* based on provided Contentful content type definitions.
3029
*
31-
* @param config - Parser configuration including API key, document URL, and content types
30+
* @param config - Parser configuration including API key, document JSON, and content types
3231
* @returns Promise resolving to entries ready for CMA client
3332
*/
3433
export async function createDocument(config: DocumentParserConfig): Promise<FinalEntriesResult> {
@@ -37,15 +36,16 @@ export async function createDocument(config: DocumentParserConfig): Promise<Fina
3736
const modelVersion = 'gpt-4o';
3837
const temperature = 0.3;
3938

40-
const { googleDocUrl, openAiApiKey, contentTypes, locale = 'en-US' } = config;
41-
const googleDocContent = await fetchGoogleDoc(googleDocUrl);
39+
const { document, openAiApiKey, contentTypes, locale = 'en-US' } = config;
40+
41+
// Extract text content from Google Docs JSON structure
42+
const documentContent = extractTextFromGoogleDocsJson(document);
4243

4344
const openaiClient = createOpenAI({
4445
apiKey: openAiApiKey,
4546
});
4647

47-
const prompt = buildExtractionPrompt({ contentTypes, googleDocContent, locale });
48-
48+
const prompt = buildExtractionPrompt({ contentTypes, documentContent, locale });
4949
const result = await generateObject({
5050
model: openaiClient(modelVersion),
5151
schema: FinalEntriesResultSchema,
@@ -80,7 +80,10 @@ CRITICAL FIELD TYPE RULES - READ CAREFULLY:
8080
- Array (of Link): ❌ NEVER USE - these reference other entries, skip entirely
8181
Example: DO NOT create [{ title: "x", content: "y" }] - this will FAIL
8282
- Link/Reference: ❌ NEVER USE - skip these fields (they reference other entries)
83-
- RichText: ❌ NEVER USE - complex format not supported
83+
- RichText: Provide a Markdown string preserving inline styles:
84+
- Bold: **bold**
85+
- Italic: *italic*
86+
- Underline: _underline_ (or <u>underline</u>)
8487
8588
FIELD FORMAT RULES:
8689
- Each entry must have a contentTypeId that matches one of the provided content types
@@ -103,13 +106,71 @@ EXTRACTION GUIDELINES:
103106
- Focus on simple fields: Symbol, Text, Number, Boolean, Date`;
104107
}
105108

109+
/**
 * Extracts the plain text of a Google Docs API document JSON.
 *
 * The title (when present) is emitted first on its own line, followed by the
 * content of every text run found in the document body. Text runs are
 * concatenated verbatim: each run already carries its own whitespace and the
 * trailing newline of its paragraph, so inserting a separator between runs
 * would split words whose styling changes mid-word (e.g. "Hel" + bold "lo").
 *
 * Sources that are walked:
 * - `tabs[].documentTab.body.content`, including nested `childTabs`
 * - the top-level `body.content`, used only when the document has no tabs
 * - paragraphs, and table cells (recursively) inside either of the above
 *
 * Anything that does not have the expected shape is skipped silently, so the
 * function never throws on malformed or partial input.
 *
 * @param document - Parsed document JSON; any other value is tolerated.
 * @returns The extracted text with leading/trailing whitespace trimmed, or an
 *   empty string when `document` is not an object or contains no text.
 */
function extractTextFromGoogleDocsJson(document: unknown): string {
  /** Narrows a value to a plain (non-array) object, or `undefined`. */
  function asRecord(value: unknown): Record<string, unknown> | undefined {
    return typeof value === 'object' && value !== null && !Array.isArray(value)
      ? (value as Record<string, unknown>)
      : undefined;
  }

  /** Narrows a value to an array, falling back to an empty one. */
  function asArray(value: unknown): unknown[] {
    return Array.isArray(value) ? value : [];
  }

  const doc = asRecord(document);
  if (!doc) {
    return '';
  }

  const textParts: string[] = [];

  /** Collects text from a list of structural elements (paragraphs, tables). */
  function collectStructuralElements(content: unknown): void {
    for (const item of asArray(content)) {
      const element = asRecord(item);
      if (!element) {
        continue;
      }

      const paragraph = asRecord(element.paragraph);
      for (const paragraphElement of asArray(paragraph?.elements)) {
        const text = asRecord(asRecord(paragraphElement)?.textRun)?.content;
        if (typeof text === 'string') {
          textParts.push(text);
        }
      }

      // Table cells hold their own list of structural elements.
      const table = asRecord(element.table);
      for (const row of asArray(table?.tableRows)) {
        for (const cell of asArray(asRecord(row)?.tableCells)) {
          collectStructuralElements(asRecord(cell)?.content);
        }
      }
    }
  }

  /** Collects text from a tab and, depth-first, from its child tabs. */
  function collectTab(tab: unknown): void {
    const tabObj = asRecord(tab);
    if (!tabObj) {
      return;
    }
    collectStructuralElements(asRecord(asRecord(tabObj.documentTab)?.body)?.content);
    for (const childTab of asArray(tabObj.childTabs)) {
      collectTab(childTab);
    }
  }

  // Keep the title on its own line so it does not run into the first paragraph.
  if (typeof doc.title === 'string' && doc.title.length > 0) {
    textParts.push(`${doc.title}\n`);
  }

  const tabs = asArray(doc.tabs);
  if (tabs.length > 0) {
    for (const tab of tabs) {
      collectTab(tab);
    }
  } else {
    // Documents fetched without tab content expose the body at the top level.
    collectStructuralElements(asRecord(doc.body)?.content);
  }

  return textParts.join('').trim();
}
166+
106167
function buildExtractionPrompt({
107168
contentTypes,
108-
googleDocContent,
169+
documentContent,
109170
locale,
110171
}: {
111172
contentTypes: ContentTypeProps[];
112-
googleDocContent: string;
173+
documentContent: string;
113174
locale: string;
114175
}): string {
115176
const contentTypeList = contentTypes.map((ct) => `${ct.name} (ID: ${ct.sys.id})`).join(', ');
@@ -121,8 +182,7 @@ function buildExtractionPrompt({
121182
ct.fields?.map((field) => {
122183
const isLinkType = field.type === 'Link';
123184
const isArrayOfLinks = field.type === 'Array' && (field.items as any)?.type === 'Link';
124-
const isRichText = field.type === 'RichText';
125-
const shouldSkip = isLinkType || isArrayOfLinks || isRichText;
185+
const shouldSkip = isLinkType || isArrayOfLinks;
126186

127187
return {
128188
id: field.id,
@@ -137,9 +197,7 @@ function buildExtractionPrompt({
137197
SKIP_REASON: shouldSkip
138198
? isLinkType
139199
? 'Link/Reference field - cannot be populated without entry IDs'
140-
: isArrayOfLinks
141-
? 'Array of Links - cannot be populated without entry IDs'
142-
: 'RichText field - complex format not supported'
200+
: 'Array of Links - cannot be populated without entry IDs'
143201
: undefined,
144202
};
145203
}) || [];
@@ -163,7 +221,7 @@ CONTENT TYPE DEFINITIONS:
163221
${JSON.stringify(contentTypeDefinitions, null, 2)}
164222
165223
DOCUMENT CONTENT:
166-
${googleDocContent}
224+
${documentContent}
167225
168226
CRITICAL INSTRUCTIONS:
169227
1. **SKIP ALL FIELDS WHERE "SKIP": true** - Do NOT include these fields in your output
@@ -176,6 +234,7 @@ CRITICAL INSTRUCTIONS:
176234
8. Match field types exactly:
177235
- Symbol: string (max 256 chars)
178236
- Text: string (any length)
237+
- RichText: string in Markdown (preserve bold **, italics *, underline _)
179238
- Number: number
180239
- Boolean: boolean
181240
- Date: ISO 8601 string

apps/google-docs/functions/createEntriesFromDocument.ts

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import { createEntries } from './service/entryService';
1313

1414
export type AppActionParameters = {
1515
contentTypeIds: string[];
16-
googleDocUrl: string;
16+
document: unknown; // JSON document from Google Docs API or test data
1717
};
1818

1919
interface AppInstallationParameters {
@@ -31,8 +31,17 @@ export const handler: FunctionEventHandler<
3131
event: AppActionRequest<'Custom', AppActionParameters>,
3232
context: FunctionEventContext
3333
) => {
34-
const { contentTypeIds, googleDocUrl } = event.body;
34+
const { contentTypeIds, document } = event.body;
3535
const { openAiApiKey } = context.appInstallationParameters as AppInstallationParameters;
36+
37+
if (!document) {
38+
throw new Error('Document is required');
39+
}
40+
41+
if (!contentTypeIds || contentTypeIds.length === 0) {
42+
throw new Error('At least one content type ID is required');
43+
}
44+
3645
const cma = initContentfulManagementClient(context);
3746
const contentTypes = await fetchContentTypes(cma, new Set<string>(contentTypeIds));
3847

@@ -43,7 +52,7 @@ export const handler: FunctionEventHandler<
4352
// createContentTypeObservationsFromLLMResponse()
4453

4554
const aiDocumentResponse = await createDocument({
46-
googleDocUrl,
55+
document,
4756
openAiApiKey,
4857
contentTypes,
4958
});
@@ -56,6 +65,7 @@ export const handler: FunctionEventHandler<
5665
const creationResult = await createEntries(cma, aiDocumentResponse.entries, {
5766
spaceId: context.spaceId,
5867
environmentId: context.environmentId,
68+
contentTypes,
5969
});
6070

6171
// INTEG-3265: Create the assets in Contentful using the asset service

0 commit comments

Comments
 (0)