Skip to content

Commit eb42555

Browse files
committed
Add document formatting utility functions for Firestore and OpenAI integration
1 parent ecd15c7 commit eb42555

File tree

3 files changed

+499
-297
lines changed

3 files changed

+499
-297
lines changed
Lines changed: 30 additions & 297 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,10 @@
1-
const OpenAI = require('openai');
21
const functions = require('firebase-functions');
32
const admin = require('firebase-admin');
43
const { DOCUMENT_TYPES, normalizeDocumentType } = require('./utils/documentUtils');
5-
const { getOpenAIApiKey } = require('./utils/apiUtils');
4+
const { formatSyllabusData, formatGradesData, formatTranscriptData, combineFormattedData } = require('./utils/documentProcessors');
65

76
/**
8-
* Store debugging data in Firestore
9-
* @param {string} userId - User ID
10-
* @param {string} prompt - OpenAI prompt
11-
* @param {string} response - OpenAI response
12-
*/
13-
async function storeDebugData(userId, prompt, response) {
14-
try {
15-
const db = admin.firestore();
16-
const debugRef = db.collection('users').doc(userId).collection('debug').doc();
17-
18-
await debugRef.set({
19-
prompt,
20-
response,
21-
timestamp: admin.firestore.FieldValue.serverTimestamp()
22-
});
23-
24-
console.log(`Debug data stored with ID: ${debugRef.id}`);
25-
return debugRef.id;
26-
} catch (error) {
27-
console.error("Error storing debug data:", error);
28-
// Don't throw - this is just for debugging
29-
}
30-
}
31-
32-
/**
33-
* Formats all document data using a single OpenAI API call to ensure consistent structure
7+
* Formats document data one type at a time (syllabus, then grades, then transcript) via the documentProcessors helpers, then combines the results
348
* @param {string} userId - The user ID
359
* @param {boolean} forceProcess - Whether to force processing regardless of conditions
3610
* @returns {Promise<Object>} Formatted data for calculations and predictions
@@ -79,193 +53,49 @@ exports.formatDocumentsData = async (userId, forceProcess = false) => {
7953

8054
console.log(`Found documents by type: ${Object.keys(documentsByType).join(', ')}`);
8155

82-
// Check if we have a syllabus document - case insensitive
83-
const hasSyllabus = Object.entries(documentsByType).some(([type]) =>
84-
normalizeDocumentType(type) === DOCUMENT_TYPES.SYLLABUS
85-
);
56+
// Process each document type in sequence
8657

87-
// Get the syllabus document if available
88-
const syllabusDoc = hasSyllabus ? Object.entries(documentsByType).find(([type]) =>
89-
normalizeDocumentType(type) === DOCUMENT_TYPES.SYLLABUS
90-
)?.[1] : null;
91-
92-
// Log available document types
93-
console.log('Available document types:', Object.keys(documentsByType));
94-
if (!hasSyllabus) {
95-
console.log('No syllabus document found - proceeding with limited formatting');
58+
// 1. Process syllabus first (if available)
59+
let syllabusData = null;
60+
if (documentsByType[DOCUMENT_TYPES.SYLLABUS]) {
61+
console.log('Processing syllabus document');
62+
syllabusData = await formatSyllabusData(userId, documentsByType[DOCUMENT_TYPES.SYLLABUS].text);
63+
} else {
64+
console.log('No syllabus document found - using default values');
9665
}
9766

98-
// Get OpenAI API key
99-
const apiKey = getOpenAIApiKey();
100-
101-
// Initialize OpenAI client
102-
const openai = new OpenAI({
103-
apiKey: apiKey
104-
});
105-
106-
// Create a unified prompt with all document texts
107-
const prompt = createFormattingPrompt(documentsByType);
108-
109-
// Call OpenAI API
110-
console.log("===== OPENAI PROMPT =====");
111-
console.log(prompt);
112-
console.log('Calling OpenAI for unified data formatting');
113-
const response = await openai.chat.completions.create({
114-
model: "gpt-4o-mini",
115-
messages: [
116-
{
117-
role: "system",
118-
content: "You are a precise data formatting assistant. You MUST respond with ONLY valid JSON that exactly matches the requested structure. Include NO explanatory text outside the JSON object."
119-
},
120-
{ role: "user", content: prompt }
121-
],
122-
temperature: 0.1, // Low temperature for consistency
123-
response_format: { type: "json_object" } // Enforces JSON response
124-
});
125-
126-
// Extract and parse the JSON response
127-
const responseContent = response.choices[0].message.content;
128-
console.log("===== OPENAI RESPONSE =====");
129-
console.log(responseContent);
130-
131-
// Store debug data
132-
await storeDebugData(userId, prompt, responseContent);
67+
// 2. Process grades next (note: syllabusData is not passed to formatGradesData here; it receives only userId and the grades text)
68+
let gradesData = null;
69+
if (documentsByType[DOCUMENT_TYPES.GRADES]) {
70+
console.log('Processing grades document');
71+
gradesData = await formatGradesData(userId, documentsByType[DOCUMENT_TYPES.GRADES].text);
72+
} else {
73+
console.log('No grades document found - using default values');
74+
}
13375

134-
const formattedData = JSON.parse(responseContent);
135-
console.log("===== PARSED FORMATTED DATA =====");
136-
console.log(JSON.stringify(formattedData, null, 2));
76+
// 3. Process transcript last
77+
let transcriptData = null;
78+
if (documentsByType[DOCUMENT_TYPES.TRANSCRIPT]) {
79+
console.log('Processing transcript document');
80+
transcriptData = await formatTranscriptData(userId, documentsByType[DOCUMENT_TYPES.TRANSCRIPT].text);
81+
} else {
82+
console.log('No transcript document found - using default values');
83+
}
13784

138-
// Store the formatted data in the user's data document
139-
await storeFormattedData(userId, formattedData);
85+
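// 4. Combine all formatted data (combineFormattedData receives only userId; the syllabusData/gradesData/transcriptData locals above are not passed in)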
86+
const formattedData = await combineFormattedData(userId);
14087

14188
// Update the status of all processed documents
14289
const updateResult = await updateDocumentStatus(userId, snapshot.docs);
14390
console.log(`Document status update result: ${updateResult}`);
14491

14592
return formattedData;
14693
} catch (error) {
147-
console.error('Error formatting data with OpenAI:', error);
148-
149-
// Return a fallback format in case of error
150-
return createFallbackFormattedData(documentsByType);
94+
console.error('Error formatting documents:', error);
95+
throw error;
15196
}
15297
};
15398

154-
/**
155-
* Creates the prompt for OpenAI formatting
156-
* @param {Object} documentsByType - Documents organized by type
157-
* @returns {string} Formatted prompt
158-
*/
159-
function createFormattingPrompt(documentsByType) {
160-
// Extract the document texts
161-
const syllabusText = documentsByType[DOCUMENT_TYPES.SYLLABUS]?.text || '';
162-
const gradesText = documentsByType[DOCUMENT_TYPES.GRADES]?.text || '';
163-
const transcriptText = documentsByType[DOCUMENT_TYPES.TRANSCRIPT]?.text || '';
164-
165-
return `
166-
I need you to format educational document data into a consistent structure for grade calculations and predictions.
167-
Here is the raw text from different document types:
168-
169-
${syllabusText ? `SYLLABUS DATA:
170-
${syllabusText}` : 'SYLLABUS DATA: Not available'}
171-
172-
${gradesText ? `GRADES DATA:
173-
${gradesText}` : 'GRADES DATA: Not available'}
174-
175-
${transcriptText ? `TRANSCRIPT DATA:
176-
${transcriptText}` : 'TRANSCRIPT DATA: Not available'}
177-
178-
Please format this data into the following exact JSON structure. If a syllabus is not available, provide best-effort values based on available data:
179-
{
180-
"course": {
181-
"name": "Course name (from syllabus if available, otherwise derive from grades/transcript)",
182-
"instructor": "Instructor name if available, otherwise 'Unknown'",
183-
"creditHours": "Credit hours if available, otherwise '3'"
184-
},
185-
"gradeWeights": [
186-
{
187-
"name": "Category name (from syllabus or inferred from grades)",
188-
"weight": 0.3 // Decimal weight, ensure all weights sum to 1.0
189-
}
190-
],
191-
"completedAssignments": [
192-
{
193-
"name": "Assignment name from grades",
194-
"grade": 95, // Numeric grade
195-
"maxPoints": 100, // Maximum possible points
196-
"category": "Best matching category based on name"
197-
}
198-
],
199-
"remainingAssignments": [
200-
{
201-
"name": "Assignment name from syllabus if available",
202-
"category": "Best matching category"
203-
}
204-
],
205-
"dueDates": [
206-
{
207-
"assignment": "Assignment name",
208-
"due_date": "Due date if available"
209-
}
210-
],
211-
"gpa": "Overall GPA from transcript, or 'N/A' if not available",
212-
"academicHistory": {
213-
"relevantCourses": [
214-
{
215-
"course_code": "Course code if available",
216-
"course_name": "Course name",
217-
"grade": "Letter grade if available",
218-
"numerical_grade": "Numerical equivalent if available",
219-
"relevance": "High/Medium/Low based on available context"
220-
}
221-
]
222-
}
223-
}
224-
225-
Processing Instructions:
226-
1. If syllabus is available:
227-
- Use exact grade weights and assignments
228-
- Match completed assignments to syllabus categories
229-
- List remaining assignments from syllabus
230-
231-
2. If only grades are available:
232-
- Infer categories from assignment names
233-
- Create approximate grade weights based on assignment counts
234-
- Leave remainingAssignments empty
235-
236-
3. If transcript is available:
237-
- Include GPA and relevant course history
238-
- Use course names to determine relevance
239-
240-
For the academicHistory.relevantCourses, analyze the transcript to find courses that are relevant to the current course:
241-
- High relevance: Same department (e.g., PHY for Physics courses), prerequisites, or similar keywords
242-
- Medium relevance: Related departments (e.g., MATH for Physics courses), general science courses
243-
- Low relevance: Other STEM courses or courses that might indirectly impact performance
244-
`;
245-
}
246-
247-
/**
248-
* Stores the formatted data in Firestore
249-
* @param {string} userId - The user ID
250-
* @param {Object} formattedData - The formatted data
251-
* @returns {Promise<void>}
252-
*/
253-
async function storeFormattedData(userId, formattedData) {
254-
console.log(`Storing formatted data for user ${userId}`);
255-
const db = admin.firestore();
256-
257-
try {
258-
await db.collection('users').doc(userId).collection('data').doc('formatted_data').set({
259-
formatted_data: formattedData,
260-
lastUpdated: admin.firestore.FieldValue.serverTimestamp()
261-
});
262-
263-
console.log('Successfully stored formatted data');
264-
} catch (error) {
265-
console.error('Error storing formatted data:', error);
266-
throw error;
267-
}
268-
}
26999

270100
/**
271101
* Updates the status of processed documents
@@ -316,100 +146,3 @@ async function updateDocumentStatus(userId, documents) {
316146
return false;
317147
}
318148
}
319-
320-
/**
321-
* Creates a fallback formatted data structure if OpenAI fails
322-
* @param {Object} documentsByType - Documents organized by type
323-
* @returns {Object} Fallback formatted data
324-
*/
325-
function createFallbackFormattedData(documentsByType) {
326-
console.log('Creating fallback formatted data');
327-
328-
// Extract basic information using regex patterns
329-
const syllabusText = documentsByType[DOCUMENT_TYPES.SYLLABUS]?.text || '';
330-
331-
// Extract course name
332-
const courseNameMatch = syllabusText.match(/course(?:\s+title)?:?\s*([^\n]+)/i);
333-
const courseName = courseNameMatch ? courseNameMatch[1].trim() : "Unknown Course";
334-
335-
// Extract instructor
336-
const instructorMatch = syllabusText.match(/instructor:?\s*([^\n]+)/i);
337-
const instructor = instructorMatch ? instructorMatch[1].trim() : "Unknown Instructor";
338-
339-
// Extract credit hours
340-
const creditHoursMatch = syllabusText.match(/credit\s+hours:?\s*(\d+)/i);
341-
const creditHours = creditHoursMatch ? creditHoursMatch[1].trim() : "3";
342-
343-
// Extract grade weights using regex
344-
const gradeWeights = extractGradeWeights(syllabusText);
345-
346-
// Extract GPA from transcript
347-
const transcriptText = documentsByType[DOCUMENT_TYPES.TRANSCRIPT]?.text || '';
348-
const gpaMatch = transcriptText.match(/gpa:?\s*([\d\.]+)/i);
349-
const gpa = gpaMatch ? gpaMatch[1].trim() : "3.0";
350-
351-
return {
352-
course: {
353-
name: courseName,
354-
instructor: instructor,
355-
creditHours: creditHours
356-
},
357-
gradeWeights: gradeWeights.length > 0 ? gradeWeights : [
358-
{ name: "Assignments", weight: 0.4 },
359-
{ name: "Exams", weight: 0.6 }
360-
],
361-
completedAssignments: [],
362-
remainingAssignments: [],
363-
dueDates: [],
364-
gpa: gpa,
365-
academicHistory: {
366-
relevantCourses: []
367-
}
368-
};
369-
}
370-
371-
/**
372-
* Extract grade weights using regex patterns
373-
* @param {string} text - Text to extract grade weights from
374-
* @returns {Array} Array of {name, weight} objects
375-
*/
376-
function extractGradeWeights(text) {
377-
try {
378-
// Try different patterns to catch various formatting styles
379-
const patterns = [
380-
/([A-Za-z\s&-]+):\s*(\d+(?:\.\d+)?)%/g,
381-
/([A-Za-z\s&-]+)\s*=\s*(\d+(?:\.\d+)?)%/g,
382-
/([A-Za-z\s&-]+)\s*\((\d+(?:\.\d+)?)%\)/g
383-
];
384-
385-
const results = [];
386-
387-
for (const pattern of patterns) {
388-
let match;
389-
while ((match = pattern.exec(text)) !== null) {
390-
const name = match[1].trim();
391-
const weight = parseFloat(match[2]) / 100;
392-
393-
// Check for duplicates
394-
if (!results.some(r => r.name === name)) {
395-
results.push({ name, weight });
396-
}
397-
}
398-
}
399-
400-
// Normalize weights to sum to 1.0
401-
if (results.length > 0) {
402-
const totalWeight = results.reduce((sum, item) => sum + item.weight, 0);
403-
if (totalWeight > 0 && totalWeight !== 1.0) {
404-
results.forEach(item => {
405-
item.weight = item.weight / totalWeight;
406-
});
407-
}
408-
}
409-
410-
return results;
411-
} catch (error) {
412-
console.error("Error extracting grade weights:", error);
413-
return [];
414-
}
415-
}

0 commit comments

Comments
 (0)