-
Notifications
You must be signed in to change notification settings - Fork 166
[INTEG-3264] Google docs backend can create entries #10279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| /** | ||
| * INTEG-3263: Document Parser Agent | ||
| * Document Parser Agent | ||
| * | ||
| * Agent that takes a Google Doc URL and content type definitions, | ||
| * then uses OpenAI to extract structured entries from the document | ||
|
|
@@ -45,6 +45,7 @@ export async function createDocument(config: DocumentParserConfig): Promise<Fina | |
| }); | ||
|
|
||
| const prompt = buildExtractionPrompt({ contentTypes, googleDocContent, locale }); | ||
|
|
||
| const result = await generateObject({ | ||
| model: openaiClient(modelVersion), | ||
| schema: FinalEntriesResultSchema, | ||
|
|
@@ -56,6 +57,7 @@ export async function createDocument(config: DocumentParserConfig): Promise<Fina | |
| return result.object as FinalEntriesResult; | ||
| } | ||
|
|
||
| // These should be improved by having an example prompt on top of this zero shot prompt | ||
| function buildSystemPrompt(): string { | ||
| return `You are an expert content extraction AI that analyzes documents and extracts structured content based on Contentful content type definitions. | ||
|
|
||
|
|
@@ -65,15 +67,40 @@ Your role is to: | |
| 3. Extract relevant information from the document that matches the content type structure | ||
| 4. Create properly formatted entries that are ready to be created in Contentful via the CMA API | ||
|
|
||
| Important guidelines: | ||
| CRITICAL FIELD TYPE RULES - READ CAREFULLY: | ||
| - Symbol: Short text (max 256 characters) - use for titles, names, IDs ✓ | ||
| - Text: Long text (any length) - use for descriptions, content ✓ | ||
| - Number: Integer or decimal values only ✓ | ||
| - Boolean: true or false only ✓ | ||
| - Date: ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:mm:ss.sssZ) ✓ | ||
| - Location: { lat: number, lon: number } ✓ | ||
| - Object: JSON object (use sparingly, check validations) ✓ | ||
| - Array (of Symbol/Text/Number): Array of PRIMITIVE values ONLY ✓ | ||
| Example: ["value1", "value2"] or [1, 2, 3] | ||
| - Array (of Link): ❌ NEVER USE - these reference other entries, skip entirely | ||
| Example: DO NOT create [{ title: "x", content: "y" }] - this will FAIL | ||
| - Link/Reference: ❌ NEVER USE - skip these fields (they reference other entries) | ||
| - RichText: ❌ NEVER USE - complex format not supported | ||
|
|
||
| FIELD FORMAT RULES: | ||
|
Comment on lines
+72
to
+85
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's try one-shot prompting here (include a worked example in the prompt). |
||
| - Each entry must have a contentTypeId that matches one of the provided content types | ||
| - Fields must be in the correct format: { "fieldId": { "locale": value } } | ||
| - Respect field types (Text, Symbol, RichText, Number, Boolean, Date, Reference, etc.) | ||
| - Fields must be in the format: { "fieldId": { "locale": value } } | ||
| - Only include fields that exist in the content type definition | ||
| - NEVER include Reference/Link fields (type: "Link") | ||
| - NEVER include fields with type "Array" if items.type is "Link" | ||
| - NEVER create arrays of objects like [{ title: "x", content: "y" }] - this will FAIL | ||
| - If a field type is unclear or complex, SKIP it rather than guess | ||
|
|
||
| COMMON MISTAKES TO AVOID: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think we need any locale comments in here? Locale handling seems like a common gotcha. |
||
| ❌ WRONG: { "sections": { "en-US": [{ "title": "...", "content": "..." }] } } | ||
| ✓ CORRECT: Skip "sections" field entirely if it's an Array of Links | ||
| ✓ CORRECT: { "tags": { "en-US": ["tag1", "tag2", "tag3"] } } (if tags is Array of Symbol) | ||
|
|
||
| EXTRACTION GUIDELINES: | ||
| - Extract all relevant content from the document - don't skip entries | ||
| - If a field is required in the content type, ensure it's populated | ||
| - For rich text fields, extract formatted content when possible | ||
| - Be thorough and extract as many valid entries as you can find in the document`; | ||
| - If a required field cannot be populated from the document, use a sensible default or placeholder | ||
| - Be thorough and extract as many valid entries as you can find | ||
| - Focus on simple fields: Symbol, Text, Number, Boolean, Date`; | ||
| } | ||
|
|
||
| function buildExtractionPrompt({ | ||
|
|
@@ -88,21 +115,42 @@ function buildExtractionPrompt({ | |
| const contentTypeList = contentTypes.map((ct) => `${ct.name} (ID: ${ct.sys.id})`).join(', '); | ||
| const totalFields = contentTypes.reduce((sum, ct) => sum + (ct.fields?.length || 0), 0); | ||
|
|
||
| // Create a simplified view of content types for the prompt | ||
| const contentTypeDefinitions = contentTypes.map((ct) => ({ | ||
| id: ct.sys.id, | ||
| name: ct.name, | ||
| description: ct.description, | ||
| fields: | ||
| ct.fields?.map((field) => ({ | ||
| id: field.id, | ||
| name: field.name, | ||
| type: field.type, | ||
| required: field.required, | ||
| localized: field.localized, | ||
| validations: field.validations, | ||
| })) || [], | ||
| })); | ||
| // Create a detailed view of content types, filtering out unsupported field types | ||
| const contentTypeDefinitions = contentTypes.map((ct) => { | ||
| const fields = | ||
| ct.fields?.map((field) => { | ||
| const isLinkType = field.type === 'Link'; | ||
| const isArrayOfLinks = field.type === 'Array' && (field.items as any)?.type === 'Link'; | ||
| const isRichText = field.type === 'RichText'; | ||
| const shouldSkip = isLinkType || isArrayOfLinks || isRichText; | ||
|
|
||
| return { | ||
| id: field.id, | ||
| name: field.name, | ||
| type: field.type, | ||
| linkType: (field as any).linkType, | ||
| items: field.type === 'Array' ? (field.items as any) : undefined, | ||
| required: field.required, | ||
| localized: field.localized, | ||
| validations: field.validations, | ||
| SKIP: shouldSkip, | ||
| SKIP_REASON: shouldSkip | ||
| ? isLinkType | ||
| ? 'Link/Reference field - cannot be populated without entry IDs' | ||
| : isArrayOfLinks | ||
| ? 'Array of Links - cannot be populated without entry IDs' | ||
| : 'RichText field - complex format not supported' | ||
| : undefined, | ||
| }; | ||
| }) || []; | ||
|
|
||
| return { | ||
| id: ct.sys.id, | ||
| name: ct.name, | ||
| description: ct.description, | ||
| fields, | ||
| }; | ||
| }); | ||
|
|
||
| return `Extract structured entries from the following document based on the provided Contentful content type definitions. | ||
|
|
||
|
|
@@ -117,13 +165,25 @@ ${JSON.stringify(contentTypeDefinitions, null, 2)} | |
| DOCUMENT CONTENT: | ||
| ${googleDocContent} | ||
|
|
||
| INSTRUCTIONS: | ||
| 1. Analyze the document and identify content that matches the provided content type structures | ||
| 2. Extract all relevant entries from the document | ||
| 3. For each entry, use the contentTypeId that best matches the content | ||
| 4. Format fields correctly: { "fieldId": { "${locale}": value } } | ||
| 5. Ensure all required fields are populated | ||
| 6. Be thorough - extract all valid content from the document | ||
| CRITICAL INSTRUCTIONS: | ||
| 1. **SKIP ALL FIELDS WHERE "SKIP": true** - Do NOT include these fields in your output | ||
| 2. Look at each field definition - if it has "SKIP": true, completely ignore that field | ||
| 3. Only include fields where "SKIP" is false or not present | ||
| 4. Analyze the document and identify content that matches the provided content type structures | ||
| 5. Extract all relevant entries from the document | ||
| 6. For each entry, use the contentTypeId that best matches the content | ||
| 7. Format fields correctly: { "fieldId": { "${locale}": value } } | ||
| 8. Match field types exactly: | ||
| - Symbol: string (max 256 chars) | ||
| - Text: string (any length) | ||
| - Number: number | ||
| - Boolean: boolean | ||
| - Date: ISO 8601 string | ||
| - Array: array of primitives (strings or numbers ONLY) | ||
| - Object: JSON object | ||
| 9. For required fields (required: true) that are NOT marked SKIP: true, ensure they are populated | ||
| 10. If you cannot populate a required field from the document, use a sensible default or placeholder | ||
| 11. Be thorough - extract all valid content from the document | ||
|
|
||
| Return the extracted entries in the specified JSON schema format.`; | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -9,6 +9,7 @@ import { createDocument } from './agents/documentParserAgent/documentParser.agen | |||
| import { fetchContentTypes } from './service/contentTypeService'; | ||||
| import { initContentfulManagementClient } from './service/initCMAClient'; | ||||
| import { fetchGoogleDoc } from './service/googleDriveService'; | ||||
| import { createEntries } from './service/entryService'; | ||||
|
|
||||
| export type AppActionParameters = { | ||||
| contentTypeIds: string[]; | ||||
|
|
@@ -32,18 +33,15 @@ export const handler: FunctionEventHandler< | |||
| ) => { | ||||
| const { contentTypeIds, googleDocUrl } = event.body; | ||||
| const { openAiApiKey } = context.appInstallationParameters as AppInstallationParameters; | ||||
| // INTEG-3262 and INTEG-3263: Take in Content Type, Prompt, and Upload File from user | ||||
| const cma = initContentfulManagementClient(context); | ||||
| const contentTypes = await fetchContentTypes(cma, new Set<string>(contentTypeIds)); | ||||
|
|
||||
| const contentTypeParserAgentResult = await analyzeContentTypes({ contentTypes, openAiApiKey }); | ||||
| // console.log('contentTypeParserAgentResult', contentTypeParserAgentResult); | ||||
| // Commented out to preserve as much time as possible due to the 30-second limit for App functions | ||||
| // const contentTypeParserAgentResult = await analyzeContentTypes({ contentTypes, openAiApiKey }); | ||||
|
|
||||
| // INTEG-3261: Pass the ai content type response to the observer for analysis | ||||
| // createContentTypeObservationsFromLLMResponse() | ||||
|
|
||||
| // INTEG-3263: Implement the document parser agent | ||||
| // Pass the content types to the document parser so it can extract entries based on the structure | ||||
| const aiDocumentResponse = await createDocument({ | ||||
| googleDocUrl, | ||||
| openAiApiKey, | ||||
|
|
@@ -55,16 +53,26 @@ export const handler: FunctionEventHandler< | |||
|
|
||||
| // INTEG-3264: Create the entries in Contentful using the entry service | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's probably good to remove the Jira references as we complete these tickets.
Suggested change
|
||||
| // The aiDocumentResponse.entries is now ready to be passed to the CMA client | ||||
| // await createEntries(aiDocumentResponse.entries, { spaceId, environmentId, accessToken }); | ||||
| const creationResult = await createEntries(cma, aiDocumentResponse.entries, { | ||||
| spaceId: context.spaceId, | ||||
| environmentId: context.environmentId, | ||||
| }); | ||||
|
|
||||
| // INTEG-3265: Create the assets in Contentful using the asset service | ||||
| // await createAssets() | ||||
|
|
||||
| return { | ||||
| success: true, | ||||
| response: { | ||||
| contentTypeParserAgentResult, | ||||
| entriesReadyForCreation: aiDocumentResponse.entries, | ||||
| // contentTypeParserAgentResult, | ||||
| summary: aiDocumentResponse.summary, | ||||
| totalEntriesExtracted: aiDocumentResponse.totalEntries, | ||||
| createdEntries: creationResult.createdEntries.map((entry) => ({ | ||||
| id: entry.sys.id, | ||||
| contentType: entry.sys.contentType.sys.id, | ||||
| })), | ||||
| errors: creationResult.errors, | ||||
| successRate: `${creationResult.createdEntries.length}/${aiDocumentResponse.totalEntries}`, | ||||
| }, | ||||
| }; | ||||
| }; | ||||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -8,43 +8,62 @@ import { EntryToCreate } from '../agents/documentParserAgent/schema'; | |||||||||||
| * and creates them in Contentful using the CMA client. | ||||||||||||
| */ | ||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * Creates a single entry in Contentful | ||||||||||||
| * | ||||||||||||
| * @param cma - Contentful Management API client | ||||||||||||
| * @param entry - Entry data from Document Parser Agent (matches EntryToCreate schema) | ||||||||||||
| * @returns Promise resolving to the created entry | ||||||||||||
| */ | ||||||||||||
| export async function createEntry(cma: PlainClientAPI, entry: EntryToCreate): Promise<EntryProps> { | ||||||||||||
| // TODO: Implement entry creation using the CMA client | ||||||||||||
| // Example implementation: | ||||||||||||
| // const createdEntry = await cma.entry.create( | ||||||||||||
| // { spaceId, environmentId }, | ||||||||||||
| // { | ||||||||||||
| // contentTypeId: entry.contentTypeId, | ||||||||||||
| // fields: entry.fields | ||||||||||||
| // } | ||||||||||||
| // ); | ||||||||||||
| // return createdEntry; | ||||||||||||
| throw new Error('Not implemented'); | ||||||||||||
| export interface EntryCreationResult { | ||||||||||||
| createdEntries: EntryProps[]; | ||||||||||||
| errors: Array<{ | ||||||||||||
| contentTypeId: string; | ||||||||||||
| error: string; | ||||||||||||
| details?: any; | ||||||||||||
| }>; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * Creates multiple entries in Contentful | ||||||||||||
| * | ||||||||||||
| * @param cma - Contentful Management API client | ||||||||||||
| * @param entries - Array of entries from Document Parser Agent output | ||||||||||||
| * @returns Promise resolving to array of created entries | ||||||||||||
| * @param config - Space and environment configuration | ||||||||||||
| * @returns Promise resolving to creation results with entries and errors | ||||||||||||
| */ | ||||||||||||
| export async function createEntries( | ||||||||||||
| cma: PlainClientAPI, | ||||||||||||
| entries: EntryToCreate[] | ||||||||||||
| ): Promise<EntryProps[]> { | ||||||||||||
| // TODO: Implement batch entry creation | ||||||||||||
| // Consider implementing with proper error handling and rate limiting | ||||||||||||
| // const createdEntries = await Promise.all( | ||||||||||||
| // entries.map(entry => createEntry(cma, entry)) | ||||||||||||
| // ); | ||||||||||||
| // return createdEntries; | ||||||||||||
| throw new Error('Not implemented'); | ||||||||||||
| entries: EntryToCreate[], | ||||||||||||
| config: { spaceId: string; environmentId: string } | ||||||||||||
| ): Promise<EntryCreationResult> { | ||||||||||||
| const { spaceId, environmentId } = config; | ||||||||||||
| const createdEntries: EntryProps[] = []; | ||||||||||||
| const errors: Array<{ contentTypeId: string; error: string; details?: any }> = []; | ||||||||||||
|
|
||||||||||||
| // Create entries sequentially to avoid rate limiting issues | ||||||||||||
| // In production, you may want to implement batching and retry logic | ||||||||||||
| for (let i = 0; i < entries.length; i++) { | ||||||||||||
| const entry = entries[i]; | ||||||||||||
|
|
||||||||||||
| try { | ||||||||||||
| const createdEntry = await cma.entry.create( | ||||||||||||
| { spaceId, environmentId, contentTypeId: entry.contentTypeId }, | ||||||||||||
| { | ||||||||||||
| fields: entry.fields, | ||||||||||||
| } | ||||||||||||
| ); | ||||||||||||
|
|
||||||||||||
| // Optionally publish the entry immediately | ||||||||||||
| // const publishedEntry = await cma.entry.publish( | ||||||||||||
| // { spaceId, environmentId, entryId: createdEntry.sys.id }, | ||||||||||||
| // createdEntry | ||||||||||||
| // ); | ||||||||||||
|
Comment on lines
+50
to
+54
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
💨 |
||||||||||||
|
|
||||||||||||
| createdEntries.push(createdEntry); | ||||||||||||
| } catch (error) { | ||||||||||||
| const errorMessage = error instanceof Error ? error.message : String(error); | ||||||||||||
| console.error(`✗ Failed to create entry of type ${entry.contentTypeId}:`, error); | ||||||||||||
| errors.push({ | ||||||||||||
| contentTypeId: entry.contentTypeId, | ||||||||||||
| error: errorMessage, | ||||||||||||
| details: error, | ||||||||||||
| }); | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| return { createdEntries, errors }; | ||||||||||||
| } | ||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"read" 🤖