1111import { createOpenAI } from '@ai-sdk/openai' ;
1212import { generateObject } from 'ai' ;
1313import { ContentTypeProps } from 'contentful-management' ;
14- import { fetchGoogleDoc } from '../../service/googleDriveService' ;
1514import { FinalEntriesResultSchema , FinalEntriesResult } from './schema' ;
1615
1716/**
1817 * Configuration for the document parser
1918 */
2019export interface DocumentParserConfig {
2120 openAiApiKey : string ;
22- googleDocUrl : string ;
21+ document : unknown ; // JSON document from Google Docs API or test data
2322 contentTypes : ContentTypeProps [ ] ;
2423 locale ?: string ;
2524}
2625
2726/**
28- * AI Agent that parses a Google Doc and extracts structured entries
27+ * AI Agent that parses a Google Doc JSON and extracts structured entries
2928 * based on provided Contentful content type definitions.
3029 *
31- * @param config - Parser configuration including API key, document URL , and content types
30+ * @param config - Parser configuration including API key, document JSON , and content types
3231 * @returns Promise resolving to entries ready for CMA client
3332 */
3433export async function createDocument ( config : DocumentParserConfig ) : Promise < FinalEntriesResult > {
@@ -37,15 +36,16 @@ export async function createDocument(config: DocumentParserConfig): Promise<Fina
3736 const modelVersion = 'gpt-4o' ;
3837 const temperature = 0.3 ;
3938
40- const { googleDocUrl, openAiApiKey, contentTypes, locale = 'en-US' } = config ;
41- const googleDocContent = await fetchGoogleDoc ( googleDocUrl ) ;
39+ const { document, openAiApiKey, contentTypes, locale = 'en-US' } = config ;
40+
41+ // Extract text content from Google Docs JSON structure
42+ const documentContent = extractTextFromGoogleDocsJson ( document ) ;
4243
4344 const openaiClient = createOpenAI ( {
4445 apiKey : openAiApiKey ,
4546 } ) ;
4647
47- const prompt = buildExtractionPrompt ( { contentTypes, googleDocContent, locale } ) ;
48-
48+ const prompt = buildExtractionPrompt ( { contentTypes, documentContent, locale } ) ;
4949 const result = await generateObject ( {
5050 model : openaiClient ( modelVersion ) ,
5151 schema : FinalEntriesResultSchema ,
@@ -80,7 +80,10 @@ CRITICAL FIELD TYPE RULES - READ CAREFULLY:
8080- Array (of Link): ❌ NEVER USE - these reference other entries, skip entirely
8181 Example: DO NOT create [{ title: "x", content: "y" }] - this will FAIL
8282- Link/Reference: ❌ NEVER USE - skip these fields (they reference other entries)
83- - RichText: ❌ NEVER USE - complex format not supported
83+ - RichText: Provide a Markdown string preserving inline styles:
84+ - Bold: **bold**
85+ - Italic: *italic*
86+ - Underline: _underline_ (or <u>underline</u>)
8487
8588FIELD FORMAT RULES:
8689- Each entry must have a contentTypeId that matches one of the provided content types
@@ -103,13 +106,71 @@ EXTRACTION GUIDELINES:
103106- Focus on simple fields: Symbol, Text, Number, Boolean, Date` ;
104107}
105108
/**
 * Extracts plain text content from a Google Docs JSON structure.
 *
 * The input is untyped (API response or test data), so every level is
 * narrowed at runtime and anything with an unexpected shape is skipped
 * rather than throwing.
 *
 * Text is collected from:
 * - the document `title`, when it is a string;
 * - every tab's `documentTab.body.content`, including nested `childTabs`;
 * - the top-level `body.content` when the document carries no tabs;
 * - paragraphs nested inside table cells.
 *
 * All text runs of one paragraph are concatenated without a separator, so a
 * word that is split across several runs (e.g. partially bold) stays intact.
 * The title and the individual paragraphs are joined with a single space.
 *
 * @param document - Parsed Google Docs JSON (or any other value)
 * @returns The collected text, trimmed; an empty string when nothing is found
 */
function extractTextFromGoogleDocsJson(document: unknown): string {
  if (!isJsonRecord(document)) {
    return '';
  }

  const textParts: string[] = [];

  // Extract title if available
  if (typeof document.title === 'string') {
    textParts.push(document.title);
  }

  if (Array.isArray(document.tabs) && document.tabs.length > 0) {
    // Navigate through tabs -> documentTab -> body -> content
    collectTabText(document.tabs, textParts);
  } else if (isJsonRecord(document.body)) {
    // No tabs present: fall back to the top-level body so the text is not lost.
    collectStructuralText(document.body.content, textParts);
  }

  return textParts.join(' ').trim();
}

/** Type guard: true for any non-null object, allowing keyed access on unknown JSON. */
function isJsonRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/** Collects text from each tab's body and recurses into its child tabs. */
function collectTabText(tabs: unknown, textParts: string[]): void {
  if (!Array.isArray(tabs)) {
    return;
  }

  for (const tab of tabs as unknown[]) {
    if (!isJsonRecord(tab)) {
      continue;
    }

    const documentTab = tab.documentTab;
    if (isJsonRecord(documentTab)) {
      const body = documentTab.body;
      if (isJsonRecord(body)) {
        collectStructuralText(body.content, textParts);
      }
    }

    collectTabText(tab.childTabs, textParts);
  }
}

/** Collects text from a list of structural elements (paragraphs and tables). */
function collectStructuralText(content: unknown, textParts: string[]): void {
  if (!Array.isArray(content)) {
    return;
  }

  for (const element of content as unknown[]) {
    if (!isJsonRecord(element)) {
      continue;
    }

    const paragraph = element.paragraph;
    if (isJsonRecord(paragraph)) {
      const paragraphText = readParagraphText(paragraph);
      if (paragraphText !== '') {
        textParts.push(paragraphText);
      }
    }

    const table = element.table;
    if (isJsonRecord(table)) {
      collectTableText(table, textParts);
    }
  }
}

/** Concatenates the text runs of one paragraph without inserting separators. */
function readParagraphText(paragraph: Record<string, unknown>): string {
  if (!Array.isArray(paragraph.elements)) {
    return '';
  }

  let paragraphText = '';
  for (const element of paragraph.elements as unknown[]) {
    if (!isJsonRecord(element)) {
      continue;
    }
    const textRun = element.textRun;
    if (isJsonRecord(textRun) && typeof textRun.content === 'string') {
      paragraphText += textRun.content;
    }
  }
  return paragraphText;
}

/** Collects text from every cell of a table; cells hold structural elements again. */
function collectTableText(table: Record<string, unknown>, textParts: string[]): void {
  if (!Array.isArray(table.tableRows)) {
    return;
  }

  for (const row of table.tableRows as unknown[]) {
    if (!isJsonRecord(row) || !Array.isArray(row.tableCells)) {
      continue;
    }
    for (const cell of row.tableCells as unknown[]) {
      if (isJsonRecord(cell)) {
        collectStructuralText(cell.content, textParts);
      }
    }
  }
}
166+
106167function buildExtractionPrompt ( {
107168 contentTypes,
108- googleDocContent ,
169+ documentContent ,
109170 locale,
110171} : {
111172 contentTypes : ContentTypeProps [ ] ;
112- googleDocContent : string ;
173+ documentContent : string ;
113174 locale : string ;
114175} ) : string {
115176 const contentTypeList = contentTypes . map ( ( ct ) => `${ ct . name } (ID: ${ ct . sys . id } )` ) . join ( ', ' ) ;
@@ -121,8 +182,7 @@ function buildExtractionPrompt({
121182 ct . fields ?. map ( ( field ) => {
122183 const isLinkType = field . type === 'Link' ;
123184 const isArrayOfLinks = field . type === 'Array' && ( field . items as any ) ?. type === 'Link' ;
124- const isRichText = field . type === 'RichText' ;
125- const shouldSkip = isLinkType || isArrayOfLinks || isRichText ;
185+ const shouldSkip = isLinkType || isArrayOfLinks ;
126186
127187 return {
128188 id : field . id ,
@@ -137,9 +197,7 @@ function buildExtractionPrompt({
137197 SKIP_REASON : shouldSkip
138198 ? isLinkType
139199 ? 'Link/Reference field - cannot be populated without entry IDs'
140- : isArrayOfLinks
141- ? 'Array of Links - cannot be populated without entry IDs'
142- : 'RichText field - complex format not supported'
200+ : 'Array of Links - cannot be populated without entry IDs'
143201 : undefined ,
144202 } ;
145203 } ) || [ ] ;
@@ -163,7 +221,7 @@ CONTENT TYPE DEFINITIONS:
163221${ JSON . stringify ( contentTypeDefinitions , null , 2 ) }
164222
165223DOCUMENT CONTENT:
166- ${ googleDocContent }
224+ ${ documentContent }
167225
168226CRITICAL INSTRUCTIONS:
1692271. **SKIP ALL FIELDS WHERE "SKIP": true** - Do NOT include these fields in your output
@@ -176,6 +234,7 @@ CRITICAL INSTRUCTIONS:
1762348. Match field types exactly:
177235 - Symbol: string (max 256 chars)
178236 - Text: string (any length)
237+ - RichText: string in Markdown (preserve bold **, italics *, underline _)
179238 - Number: number
180239 - Boolean: boolean
181240 - Date: ISO 8601 string
0 commit comments