Skip to content

Commit 449ba7a

Browse files
feat: ai document parser now produces an output that can be consumed by the cma to create entries (#10277)
1 parent 2a3ffef commit 449ba7a

File tree

7 files changed

+190
-59
lines changed

7 files changed

+190
-59
lines changed

apps/google-docs/functions/agents/contentTypeParserAgent/contentTypeParser.agent.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,6 @@ export async function analyzeContentTypes({
4343
return finalAnalysis;
4444
}
4545

46-
/**
47-
* Builds the system prompt for the AI
48-
*/
4946
function buildSystemPrompt(): string {
5047
return `You are an expert Contentful content modeling analyst. Your role is to analyze Contentful content type definitions and provide clear, actionable summaries.
5148
@@ -61,9 +58,6 @@ Focus on clarity and actionability. Your summaries will be used by:
6158
- Content strategists planning content architecture`;
6259
}
6360

64-
/**
65-
* Builds the analysis prompt from content type data
66-
*/
6761
function buildAnalysisPrompt(contentTypes: ContentTypeProps[]): string {
6862
const contentTypeList = contentTypes.map((ct) => ct.name).join(', ');
6963
const totalFields = contentTypes.reduce((sum, ct) => sum + (ct.fields?.length || 0), 0);

apps/google-docs/functions/agents/contentTypeParserAgent/schema.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,4 @@ export const FinalContentTypesAnalysisSchema = z.object({
2020
complexity: z.string(),
2121
});
2222

23-
export type ContentTypeSummary = z.infer<typeof ContentTypeAnalysisSchema>;
2423
export type FinalContentTypesResultSummary = z.infer<typeof FinalContentTypesAnalysisSchema>;
Lines changed: 106 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,129 @@
11
/**
2-
* INTEG-3263: Content Type Parser Agent
2+
* INTEG-3263: Document Parser Agent
33
*
4-
* Simple agent that takes JSON input and sends it to OpenAI/ChatGPT
5-
* to generate Contentful content type definitions.
4+
* Agent that takes a Google Doc URL and content type definitions,
5+
* then uses OpenAI to extract structured entries from the document
6+
* that can be directly created in Contentful.
67
* See https://contentful.atlassian.net/wiki/spaces/ECO/pages/5850955777/RFC+Google+Docs+V1+AI-Gen
78
* for more details.
89
*/
910

10-
import { openai } from '@ai-sdk/openai';
11-
import { generateText } from 'ai';
11+
import { createOpenAI } from '@ai-sdk/openai';
12+
import { generateObject } from 'ai';
1213
import { ContentTypeProps } from 'contentful-management';
1314
import { fetchGoogleDoc } from '../../service/googleDriveService';
15+
import { FinalEntriesResultSchema, FinalEntriesResult } from './schema';
1416

1517
/**
16-
* Configuration for the content type parser
18+
* Configuration for the document parser
1719
*/
18-
interface DocumentParserConfig {
19-
// contentTypes: ContentTypeProps[];
20+
export interface DocumentParserConfig {
21+
openAiApiKey: string;
2022
googleDocUrl: string;
23+
contentTypes: ContentTypeProps[];
24+
locale?: string;
2125
}
2226

2327
/**
24-
* @param jsonData - JSON data to analyze
25-
* @param config - Parser configuration
26-
* @returns Promise resolving to LLM response
28+
* AI Agent that parses a Google Doc and extracts structured entries
29+
* based on provided Contentful content type definitions.
30+
*
31+
* @param config - Parser configuration including API key, document URL, and content types
32+
* @returns Promise resolving to entries ready for CMA client
2733
*/
28-
export async function createDocument(config: DocumentParserConfig) {
29-
const { googleDocUrl } = config;
34+
export async function createDocument(config: DocumentParserConfig): Promise<FinalEntriesResult> {
35+
// TODO: Double-check the model and temperature below; not every user's API key
36+
// will have access to every model
37+
const modelVersion = 'gpt-4o';
38+
const temperature = 0.3;
39+
40+
const { googleDocUrl, openAiApiKey, contentTypes, locale = 'en-US' } = config;
3041
const googleDocContent = await fetchGoogleDoc(googleDocUrl);
31-
// const prompt = buildPrompt(jsonData);
32-
// return await callOpenAI(prompt, modelVersion, openaiApiKey);
33-
return googleDocContent as string;
42+
43+
const openaiClient = createOpenAI({
44+
apiKey: openAiApiKey,
45+
});
46+
47+
const prompt = buildExtractionPrompt({ contentTypes, googleDocContent, locale });
48+
const result = await generateObject({
49+
model: openaiClient(modelVersion),
50+
schema: FinalEntriesResultSchema,
51+
temperature,
52+
system: buildSystemPrompt(),
53+
prompt,
54+
});
55+
56+
return result.object as FinalEntriesResult;
3457
}
3558

36-
function buildPrompt(jsonData: any): string {
37-
// TODO: Create prompt template for the AI to consume
38-
// 1. Add instructions for content type generation
39-
// 2. Include JSON data
40-
// 3. Specify output format
41-
return `Parse the document and create a json object that represents the document for the Contetful API to consume: ${JSON.stringify(
42-
jsonData,
43-
null,
44-
2
45-
)}`;
59+
function buildSystemPrompt(): string {
60+
return `You are an expert content extraction AI that analyzes documents and extracts structured content based on Contentful content type definitions.
61+
62+
Your role is to:
63+
1. Carefully read and understand the document content
64+
2. Analyze the provided Contentful content type definitions (their fields, types, and validations)
65+
3. Extract relevant information from the document that matches the content type structure
66+
4. Create properly formatted entries that are ready to be created in Contentful via the CMA API
67+
68+
Important guidelines:
69+
- Each entry must have a contentTypeId that matches one of the provided content types
70+
- Fields must be in the correct format: { "fieldId": { "locale": value } }
71+
- Respect field types (Text, Symbol, RichText, Number, Boolean, Date, Reference, etc.)
72+
- Only include fields that exist in the content type definition
73+
- Extract all relevant content from the document - don't skip entries
74+
- If a field is required in the content type, ensure it's populated
75+
- For rich text fields, extract formatted content when possible
76+
- Be thorough and extract as many valid entries as you can find in the document`;
4677
}
4778

48-
async function callOpenAI(prompt: string, modelVersion: string, openaiApiKey: string) {
49-
const model = openai(modelVersion);
79+
function buildExtractionPrompt({
80+
contentTypes,
81+
googleDocContent,
82+
locale,
83+
}: {
84+
contentTypes: ContentTypeProps[];
85+
googleDocContent: string;
86+
locale: string;
87+
}): string {
88+
const contentTypeList = contentTypes.map((ct) => `${ct.name} (ID: ${ct.sys.id})`).join(', ');
89+
const totalFields = contentTypes.reduce((sum, ct) => sum + (ct.fields?.length || 0), 0);
5090

51-
// ai-sdk documentation: https://ai-sdk.dev/docs/reference/ai-sdk-core/generate-text
52-
// Select the appropriate core function from the list of options
53-
const result = await generateText({
54-
model,
55-
prompt,
56-
});
57-
return result;
91+
// Create a simplified view of content types for the prompt
92+
const contentTypeDefinitions = contentTypes.map((ct) => ({
93+
id: ct.sys.id,
94+
name: ct.name,
95+
description: ct.description,
96+
fields:
97+
ct.fields?.map((field) => ({
98+
id: field.id,
99+
name: field.name,
100+
type: field.type,
101+
required: field.required,
102+
localized: field.localized,
103+
validations: field.validations,
104+
})) || [],
105+
}));
106+
107+
return `Extract structured entries from the following document based on the provided Contentful content type definitions.
108+
109+
AVAILABLE CONTENT TYPES: ${contentTypeList}
110+
TOTAL CONTENT TYPES: ${contentTypes.length}
111+
TOTAL FIELDS ACROSS ALL TYPES: ${totalFields}
112+
LOCALE TO USE: ${locale}
113+
114+
CONTENT TYPE DEFINITIONS:
115+
${JSON.stringify(contentTypeDefinitions, null, 2)}
116+
117+
DOCUMENT CONTENT:
118+
${googleDocContent}
119+
120+
INSTRUCTIONS:
121+
1. Analyze the document and identify content that matches the provided content type structures
122+
2. Extract all relevant entries from the document
123+
3. For each entry, use the contentTypeId that best matches the content
124+
4. Format fields correctly: { "fieldId": { "${locale}": value } }
125+
5. Ensure all required fields are populated
126+
6. Be thorough - extract all valid content from the document
127+
128+
Return the extracted entries in the specified JSON schema format.`;
58129
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import { z } from 'zod';
2+
3+
// Schema Definitions for the Document Parser Agent
4+
5+
// Schema for a single field value in Contentful format (locale-specific)
6+
// Contentful expects fields in format: { 'en-US': value }
7+
const LocalizedFieldSchema = z.record(z.string(), z.any());
8+
9+
// Schema for a single entry that will be created in Contentful
10+
export const EntryToCreateSchema = z.object({
11+
contentTypeId: z.string().describe('The ID of the content type for this entry'),
12+
fields: z
13+
.record(z.string(), LocalizedFieldSchema)
14+
.describe(
15+
'Fields with localized values, e.g., { "title": { "en-US": "My Title" }, "body": { "en-US": "Content..." } }'
16+
),
17+
});
18+
19+
// The final output schema - array of entries ready for CMA client
20+
export const FinalEntriesResultSchema = z.object({
21+
entries: z
22+
.array(EntryToCreateSchema)
23+
.describe('Array of entries extracted from the document, ready to be created in Contentful'),
24+
summary: z.string().describe('Brief summary of what was extracted from the document'),
25+
totalEntries: z.number().describe('Total number of entries extracted'),
26+
});
27+
28+
export type EntryToCreate = z.infer<typeof EntryToCreateSchema>;
29+
export type FinalEntriesResult = z.infer<typeof FinalEntriesResultSchema>;

apps/google-docs/functions/createEntriesFromDocument.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ export const handler: FunctionEventHandler<
3333
const { contentTypeIds, googleDocUrl } = event.body;
3434
const { openAiApiKey } = context.appInstallationParameters as AppInstallationParameters;
3535
// INTEG-3262 and INTEG-3263: Take in Content Type, Prompt, and Upload File from user
36-
3736
const cma = initContentfulManagementClient(context);
3837
const contentTypes = await fetchContentTypes(cma, new Set<string>(contentTypeIds));
3938

@@ -44,16 +43,28 @@ export const handler: FunctionEventHandler<
4443
// createContentTypeObservationsFromLLMResponse()
4544

4645
// INTEG-3263: Implement the document parser agent
47-
const aiDocumentResponse = await createDocument({ googleDocUrl });
46+
// Pass the content types to the document parser so it can extract entries based on the structure
47+
const aiDocumentResponse = await createDocument({
48+
googleDocUrl,
49+
openAiApiKey,
50+
contentTypes,
51+
});
4852

4953
// INTEG-3261: Pass the ai document response to the observer for analysis
5054
// createDocumentObservationsFromLLMResponse()
5155

5256
// INTEG-3264: Create the entries in Contentful using the entry service
53-
// await createEntries();
57+
// aiDocumentResponse.entries is now ready to be passed to the CMA client
58+
// await createEntries(cma, aiDocumentResponse.entries);
5459

5560
// INTEG-3265: Create the assets in Contentful using the asset service
5661
// await createAssets()
5762

58-
return { success: true, response: { contentTypeParserAgentResult, aiDocumentResponse } };
63+
return {
64+
success: true,
65+
response: {
66+
contentTypeParserAgentResult,
67+
entriesReadyForCreation: aiDocumentResponse.entries,
68+
},
69+
};
5970
};
Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,50 @@
1-
import { EntryProps, ContentTypeProps } from 'contentful-management';
1+
import { PlainClientAPI, EntryProps } from 'contentful-management';
2+
import { EntryToCreate } from '../agents/documentParserAgent/schema';
23

34
/**
4-
* INTEG-3264: Implement this file to create entries in Contentful using the Contentful Management API
5+
* INTEG-3264: Service for creating entries in Contentful using the Contentful Management API
6+
*
7+
* This service takes the output from the Document Parser Agent (which extracts entries from documents)
8+
* and creates them in Contentful using the CMA client.
59
*/
6-
interface EntryServiceParams {
7-
spaceId: string;
8-
environmentId: string;
9-
accessToken: string;
10-
}
1110

12-
export async function createEntry(aiDocumentResponse: any): Promise<EntryProps> {
11+
/**
12+
* Creates a single entry in Contentful
13+
*
14+
* @param cma - Contentful Management API client
15+
* @param entry - Entry data from Document Parser Agent (matches EntryToCreate schema)
16+
* @returns Promise resolving to the created entry
17+
*/
18+
export async function createEntry(cma: PlainClientAPI, entry: EntryToCreate): Promise<EntryProps> {
19+
// TODO: Implement entry creation using the CMA client
20+
// Example implementation:
21+
// const createdEntry = await cma.entry.create(
22+
// // contentTypeId is a request parameter, not part of the entry payload
23+
// { spaceId, environmentId, contentTypeId: entry.contentTypeId },
24+
// {
25+
// fields: entry.fields
26+
// }
27+
// );
28+
// return createdEntry;
1329
throw new Error('Not implemented');
1430
}
1531

32+
/**
33+
* Creates multiple entries in Contentful
34+
*
35+
* @param cma - Contentful Management API client
36+
* @param entries - Array of entries from Document Parser Agent output
37+
* @returns Promise resolving to array of created entries
38+
*/
1639
export async function createEntries(
17-
entries: Array<{ contentTypeId: string; fields: Record<string, any> }>,
18-
config: EntryServiceParams
40+
cma: PlainClientAPI,
41+
entries: EntryToCreate[]
1942
): Promise<EntryProps[]> {
43+
// TODO: Implement batch entry creation
44+
// Consider implementing with proper error handling and rate limiting
45+
// const createdEntries = await Promise.all(
46+
// entries.map(entry => createEntry(cma, entry))
47+
// );
48+
// return createdEntries;
2049
throw new Error('Not implemented');
2150
}

apps/google-docs/src/utils/appFunctionUtils.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@ export async function getAppActionId(
1515
environmentId: sdk.ids.environment,
1616
spaceId: sdk.ids.space,
1717
});
18-
1918
const appAction = appActions.items.find((action) => action.name === actionName);
20-
2119
if (!appAction) {
2220
throw new Error(`App action "${actionName}" not found`);
2321
}
@@ -38,7 +36,6 @@ export const createEntriesFromDocumentAction = async (
3836
}
3937

4038
const appActionId = await getAppActionId(sdk, 'createEntriesFromDocumentAction');
41-
4239
const result = await sdk.cma.appActionCall.createWithResult(
4340
{
4441
appDefinitionId,
@@ -55,6 +52,7 @@ export const createEntriesFromDocumentAction = async (
5552

5653
return result;
5754
} catch (error) {
55+
console.error('Error creating entries from document', error);
5856
throw new Error(error instanceof Error ? error.message : 'Failed to analyze content types');
5957
}
6058
};

0 commit comments

Comments
 (0)