|
1 | | -const OpenAI = require('openai'); |
2 | 1 | const functions = require('firebase-functions'); |
3 | 2 | const admin = require('firebase-admin'); |
4 | 3 | const { DOCUMENT_TYPES, normalizeDocumentType } = require('./utils/documentUtils'); |
5 | | -const { getOpenAIApiKey } = require('./utils/apiUtils'); |
| 4 | +const { formatSyllabusData, formatGradesData, formatTranscriptData, combineFormattedData } = require('./utils/documentProcessors'); |
6 | 5 |
|
7 | 6 | /** |
8 | | - * Store debugging data in Firestore |
9 | | - * @param {string} userId - User ID |
10 | | - * @param {string} prompt - OpenAI prompt |
11 | | - * @param {string} response - OpenAI response |
12 | | - */ |
13 | | -async function storeDebugData(userId, prompt, response) { |
14 | | - try { |
15 | | - const db = admin.firestore(); |
16 | | - const debugRef = db.collection('users').doc(userId).collection('debug').doc(); |
17 | | - |
18 | | - await debugRef.set({ |
19 | | - prompt, |
20 | | - response, |
21 | | - timestamp: admin.firestore.FieldValue.serverTimestamp() |
22 | | - }); |
23 | | - |
24 | | - console.log(`Debug data stored with ID: ${debugRef.id}`); |
25 | | - return debugRef.id; |
26 | | - } catch (error) { |
27 | | - console.error("Error storing debug data:", error); |
28 | | - // Don't throw - this is just for debugging |
29 | | - } |
30 | | -} |
31 | | - |
32 | | -/** |
33 | | - * Formats all document data using a single OpenAI API call to ensure consistent structure |
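| 7 | + * Formats all document data using multiple OpenAI API calls: each document type (syllabus, grades, transcript) is processed separately and the results are combined into one consistent structure |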
34 | 8 | * @param {string} userId - The user ID |
35 | 9 | * @param {boolean} forceProcess - Whether to force processing regardless of conditions |
36 | 10 | * @returns {Promise<Object>} Formatted data for calculations and predictions |
@@ -79,193 +53,49 @@ exports.formatDocumentsData = async (userId, forceProcess = false) => { |
79 | 53 |
|
80 | 54 | console.log(`Found documents by type: ${Object.keys(documentsByType).join(', ')}`); |
81 | 55 |
|
82 | | - // Check if we have a syllabus document - case insensitive |
83 | | - const hasSyllabus = Object.entries(documentsByType).some(([type]) => |
84 | | - normalizeDocumentType(type) === DOCUMENT_TYPES.SYLLABUS |
85 | | - ); |
| 56 | + // Process each document type in sequence |
86 | 57 |
|
87 | | - // Get the syllabus document if available |
88 | | - const syllabusDoc = hasSyllabus ? Object.entries(documentsByType).find(([type]) => |
89 | | - normalizeDocumentType(type) === DOCUMENT_TYPES.SYLLABUS |
90 | | - )?.[1] : null; |
91 | | - |
92 | | - // Log available document types |
93 | | - console.log('Available document types:', Object.keys(documentsByType)); |
94 | | - if (!hasSyllabus) { |
95 | | - console.log('No syllabus document found - proceeding with limited formatting'); |
| 58 | + // 1. Process syllabus first (if available) |
| 59 | + let syllabusData = null; |
| 60 | + if (documentsByType[DOCUMENT_TYPES.SYLLABUS]) { |
| 61 | + console.log('Processing syllabus document'); |
| 62 | + syllabusData = await formatSyllabusData(userId, documentsByType[DOCUMENT_TYPES.SYLLABUS].text); |
| 63 | + } else { |
| 64 | + console.log('No syllabus document found - using default values'); |
96 | 65 | } |
97 | 66 |
|
98 | | - // Get OpenAI API key |
99 | | - const apiKey = getOpenAIApiKey(); |
100 | | - |
101 | | - // Initialize OpenAI client |
102 | | - const openai = new OpenAI({ |
103 | | - apiKey: apiKey |
104 | | - }); |
105 | | - |
106 | | - // Create a unified prompt with all document texts |
107 | | - const prompt = createFormattingPrompt(documentsByType); |
108 | | - |
109 | | - // Call OpenAI API |
110 | | - console.log("===== OPENAI PROMPT ====="); |
111 | | - console.log(prompt); |
112 | | - console.log('Calling OpenAI for unified data formatting'); |
113 | | - const response = await openai.chat.completions.create({ |
114 | | - model: "gpt-4o-mini", |
115 | | - messages: [ |
116 | | - { |
117 | | - role: "system", |
118 | | - content: "You are a precise data formatting assistant. You MUST respond with ONLY valid JSON that exactly matches the requested structure. Include NO explanatory text outside the JSON object." |
119 | | - }, |
120 | | - { role: "user", content: prompt } |
121 | | - ], |
122 | | - temperature: 0.1, // Low temperature for consistency |
123 | | - response_format: { type: "json_object" } // Enforces JSON response |
124 | | - }); |
125 | | - |
126 | | - // Extract and parse the JSON response |
127 | | - const responseContent = response.choices[0].message.content; |
128 | | - console.log("===== OPENAI RESPONSE ====="); |
129 | | - console.log(responseContent); |
130 | | - |
131 | | - // Store debug data |
132 | | - await storeDebugData(userId, prompt, responseContent); |
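| 67 | + // 2. Process grades next. NOTE(review): syllabusData is not passed to formatGradesData here, so any use of syllabus data to improve categorization must happen inside that helper - TODO confirm |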
| 68 | + let gradesData = null; |
| 69 | + if (documentsByType[DOCUMENT_TYPES.GRADES]) { |
| 70 | + console.log('Processing grades document'); |
| 71 | + gradesData = await formatGradesData(userId, documentsByType[DOCUMENT_TYPES.GRADES].text); |
| 72 | + } else { |
| 73 | + console.log('No grades document found - using default values'); |
| 74 | + } |
133 | 75 |
|
134 | | - const formattedData = JSON.parse(responseContent); |
135 | | - console.log("===== PARSED FORMATTED DATA ====="); |
136 | | - console.log(JSON.stringify(formattedData, null, 2)); |
| 76 | + // 3. Process transcript last |
| 77 | + let transcriptData = null; |
| 78 | + if (documentsByType[DOCUMENT_TYPES.TRANSCRIPT]) { |
| 79 | + console.log('Processing transcript document'); |
| 80 | + transcriptData = await formatTranscriptData(userId, documentsByType[DOCUMENT_TYPES.TRANSCRIPT].text); |
| 81 | + } else { |
| 82 | + console.log('No transcript document found - using default values'); |
| 83 | + } |
137 | 84 |
|
138 | | - // Store the formatted data in the user's data document |
139 | | - await storeFormattedData(userId, formattedData); |
| 85 | + // 4. Combine all formatted data |
| 86 | + const formattedData = await combineFormattedData(userId); |
140 | 87 |
|
141 | 88 | // Update the status of all processed documents |
142 | 89 | const updateResult = await updateDocumentStatus(userId, snapshot.docs); |
143 | 90 | console.log(`Document status update result: ${updateResult}`); |
144 | 91 |
|
145 | 92 | return formattedData; |
146 | 93 | } catch (error) { |
147 | | - console.error('Error formatting data with OpenAI:', error); |
148 | | - |
149 | | - // Return a fallback format in case of error |
150 | | - return createFallbackFormattedData(documentsByType); |
| 94 | + console.error('Error formatting documents:', error); |
| 95 | + throw error; |
151 | 96 | } |
152 | 97 | }; |
153 | 98 |
|
154 | | -/** |
155 | | - * Creates the prompt for OpenAI formatting |
156 | | - * @param {Object} documentsByType - Documents organized by type |
157 | | - * @returns {string} Formatted prompt |
158 | | - */ |
159 | | -function createFormattingPrompt(documentsByType) { |
160 | | - // Extract the document texts |
161 | | - const syllabusText = documentsByType[DOCUMENT_TYPES.SYLLABUS]?.text || ''; |
162 | | - const gradesText = documentsByType[DOCUMENT_TYPES.GRADES]?.text || ''; |
163 | | - const transcriptText = documentsByType[DOCUMENT_TYPES.TRANSCRIPT]?.text || ''; |
164 | | - |
165 | | - return ` |
166 | | -I need you to format educational document data into a consistent structure for grade calculations and predictions. |
167 | | -Here is the raw text from different document types: |
168 | | -
|
169 | | -${syllabusText ? `SYLLABUS DATA: |
170 | | -${syllabusText}` : 'SYLLABUS DATA: Not available'} |
171 | | -
|
172 | | -${gradesText ? `GRADES DATA: |
173 | | -${gradesText}` : 'GRADES DATA: Not available'} |
174 | | -
|
175 | | -${transcriptText ? `TRANSCRIPT DATA: |
176 | | -${transcriptText}` : 'TRANSCRIPT DATA: Not available'} |
177 | | -
|
178 | | -Please format this data into the following exact JSON structure. If a syllabus is not available, provide best-effort values based on available data: |
179 | | -{ |
180 | | - "course": { |
181 | | - "name": "Course name (from syllabus if available, otherwise derive from grades/transcript)", |
182 | | - "instructor": "Instructor name if available, otherwise 'Unknown'", |
183 | | - "creditHours": "Credit hours if available, otherwise '3'" |
184 | | - }, |
185 | | - "gradeWeights": [ |
186 | | - { |
187 | | - "name": "Category name (from syllabus or inferred from grades)", |
188 | | - "weight": 0.3 // Decimal weight, ensure all weights sum to 1.0 |
189 | | - } |
190 | | - ], |
191 | | - "completedAssignments": [ |
192 | | - { |
193 | | - "name": "Assignment name from grades", |
194 | | - "grade": 95, // Numeric grade |
195 | | - "maxPoints": 100, // Maximum possible points |
196 | | - "category": "Best matching category based on name" |
197 | | - } |
198 | | - ], |
199 | | - "remainingAssignments": [ |
200 | | - { |
201 | | - "name": "Assignment name from syllabus if available", |
202 | | - "category": "Best matching category" |
203 | | - } |
204 | | - ], |
205 | | - "dueDates": [ |
206 | | - { |
207 | | - "assignment": "Assignment name", |
208 | | - "due_date": "Due date if available" |
209 | | - } |
210 | | - ], |
211 | | - "gpa": "Overall GPA from transcript, or 'N/A' if not available", |
212 | | - "academicHistory": { |
213 | | - "relevantCourses": [ |
214 | | - { |
215 | | - "course_code": "Course code if available", |
216 | | - "course_name": "Course name", |
217 | | - "grade": "Letter grade if available", |
218 | | - "numerical_grade": "Numerical equivalent if available", |
219 | | - "relevance": "High/Medium/Low based on available context" |
220 | | - } |
221 | | - ] |
222 | | - } |
223 | | -} |
224 | | -
|
225 | | -Processing Instructions: |
226 | | -1. If syllabus is available: |
227 | | - - Use exact grade weights and assignments |
228 | | - - Match completed assignments to syllabus categories |
229 | | - - List remaining assignments from syllabus |
230 | | -
|
231 | | -2. If only grades are available: |
232 | | - - Infer categories from assignment names |
233 | | - - Create approximate grade weights based on assignment counts |
234 | | - - Leave remainingAssignments empty |
235 | | -
|
236 | | -3. If transcript is available: |
237 | | - - Include GPA and relevant course history |
238 | | - - Use course names to determine relevance |
239 | | -
|
240 | | -For the academicHistory.relevantCourses, analyze the transcript to find courses that are relevant to the current course: |
241 | | -- High relevance: Same department (e.g., PHY for Physics courses), prerequisites, or similar keywords |
242 | | -- Medium relevance: Related departments (e.g., MATH for Physics courses), general science courses |
243 | | -- Low relevance: Other STEM courses or courses that might indirectly impact performance |
244 | | -`; |
245 | | -} |
246 | | - |
247 | | -/** |
248 | | - * Stores the formatted data in Firestore |
249 | | - * @param {string} userId - The user ID |
250 | | - * @param {Object} formattedData - The formatted data |
251 | | - * @returns {Promise<void>} |
252 | | - */ |
253 | | -async function storeFormattedData(userId, formattedData) { |
254 | | - console.log(`Storing formatted data for user ${userId}`); |
255 | | - const db = admin.firestore(); |
256 | | - |
257 | | - try { |
258 | | - await db.collection('users').doc(userId).collection('data').doc('formatted_data').set({ |
259 | | - formatted_data: formattedData, |
260 | | - lastUpdated: admin.firestore.FieldValue.serverTimestamp() |
261 | | - }); |
262 | | - |
263 | | - console.log('Successfully stored formatted data'); |
264 | | - } catch (error) { |
265 | | - console.error('Error storing formatted data:', error); |
266 | | - throw error; |
267 | | - } |
268 | | -} |
269 | 99 |
|
270 | 100 | /** |
271 | 101 | * Updates the status of processed documents |
@@ -316,100 +146,3 @@ async function updateDocumentStatus(userId, documents) { |
316 | 146 | return false; |
317 | 147 | } |
318 | 148 | } |
319 | | - |
320 | | -/** |
321 | | - * Creates a fallback formatted data structure if OpenAI fails |
322 | | - * @param {Object} documentsByType - Documents organized by type |
323 | | - * @returns {Object} Fallback formatted data |
324 | | - */ |
325 | | -function createFallbackFormattedData(documentsByType) { |
326 | | - console.log('Creating fallback formatted data'); |
327 | | - |
328 | | - // Extract basic information using regex patterns |
329 | | - const syllabusText = documentsByType[DOCUMENT_TYPES.SYLLABUS]?.text || ''; |
330 | | - |
331 | | - // Extract course name |
332 | | - const courseNameMatch = syllabusText.match(/course(?:\s+title)?:?\s*([^\n]+)/i); |
333 | | - const courseName = courseNameMatch ? courseNameMatch[1].trim() : "Unknown Course"; |
334 | | - |
335 | | - // Extract instructor |
336 | | - const instructorMatch = syllabusText.match(/instructor:?\s*([^\n]+)/i); |
337 | | - const instructor = instructorMatch ? instructorMatch[1].trim() : "Unknown Instructor"; |
338 | | - |
339 | | - // Extract credit hours |
340 | | - const creditHoursMatch = syllabusText.match(/credit\s+hours:?\s*(\d+)/i); |
341 | | - const creditHours = creditHoursMatch ? creditHoursMatch[1].trim() : "3"; |
342 | | - |
343 | | - // Extract grade weights using regex |
344 | | - const gradeWeights = extractGradeWeights(syllabusText); |
345 | | - |
346 | | - // Extract GPA from transcript |
347 | | - const transcriptText = documentsByType[DOCUMENT_TYPES.TRANSCRIPT]?.text || ''; |
348 | | - const gpaMatch = transcriptText.match(/gpa:?\s*([\d\.]+)/i); |
349 | | - const gpa = gpaMatch ? gpaMatch[1].trim() : "3.0"; |
350 | | - |
351 | | - return { |
352 | | - course: { |
353 | | - name: courseName, |
354 | | - instructor: instructor, |
355 | | - creditHours: creditHours |
356 | | - }, |
357 | | - gradeWeights: gradeWeights.length > 0 ? gradeWeights : [ |
358 | | - { name: "Assignments", weight: 0.4 }, |
359 | | - { name: "Exams", weight: 0.6 } |
360 | | - ], |
361 | | - completedAssignments: [], |
362 | | - remainingAssignments: [], |
363 | | - dueDates: [], |
364 | | - gpa: gpa, |
365 | | - academicHistory: { |
366 | | - relevantCourses: [] |
367 | | - } |
368 | | - }; |
369 | | -} |
370 | | - |
371 | | -/** |
372 | | - * Extract grade weights using regex patterns |
373 | | - * @param {string} text - Text to extract grade weights from |
374 | | - * @returns {Array} Array of {name, weight} objects |
375 | | - */ |
376 | | -function extractGradeWeights(text) { |
377 | | - try { |
378 | | - // Try different patterns to catch various formatting styles |
379 | | - const patterns = [ |
380 | | - /([A-Za-z\s&-]+):\s*(\d+(?:\.\d+)?)%/g, |
381 | | - /([A-Za-z\s&-]+)\s*=\s*(\d+(?:\.\d+)?)%/g, |
382 | | - /([A-Za-z\s&-]+)\s*\((\d+(?:\.\d+)?)%\)/g |
383 | | - ]; |
384 | | - |
385 | | - const results = []; |
386 | | - |
387 | | - for (const pattern of patterns) { |
388 | | - let match; |
389 | | - while ((match = pattern.exec(text)) !== null) { |
390 | | - const name = match[1].trim(); |
391 | | - const weight = parseFloat(match[2]) / 100; |
392 | | - |
393 | | - // Check for duplicates |
394 | | - if (!results.some(r => r.name === name)) { |
395 | | - results.push({ name, weight }); |
396 | | - } |
397 | | - } |
398 | | - } |
399 | | - |
400 | | - // Normalize weights to sum to 1.0 |
401 | | - if (results.length > 0) { |
402 | | - const totalWeight = results.reduce((sum, item) => sum + item.weight, 0); |
403 | | - if (totalWeight > 0 && totalWeight !== 1.0) { |
404 | | - results.forEach(item => { |
405 | | - item.weight = item.weight / totalWeight; |
406 | | - }); |
407 | | - } |
408 | | - } |
409 | | - |
410 | | - return results; |
411 | | - } catch (error) { |
412 | | - console.error("Error extracting grade weights:", error); |
413 | | - return []; |
414 | | - } |
415 | | -} |
0 commit comments