@@ -52,6 +52,7 @@ def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
5252
5353 doc = pymupdf .open (pdf_path )
5454 pages = range (doc .page_count )
# Convert the selected PDF pages to Markdown with the enhanced pymupdf_rag `to_markdown` helper.
5556 resume_text = to_markdown (
5657 doc ,
5758 pages = pages ,
@@ -67,6 +68,8 @@ def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
6768 def _call_llm_for_section (
6869 self , section_name : str , text_content : str , prompt : str , return_model = None
6970 ) -> Optional [Dict ]:
# Sends one piece of resume text to the LLM together with its rendered prompt and parses the JSON reply.
# Callers are expected to pass a small, pre-separated section of the resume rather than the whole document.
7073 try :
7174 start_time = time .time ()
7275 logger .debug (
@@ -103,9 +106,7 @@ def _call_llm_for_section(
103106 if return_model :
104107 kwargs ["format" ] = return_model .model_json_schema ()
105108
106- # Use the appropriate provider to make the API call
107109 response = self .provider .chat (** chat_params , ** kwargs )
108-
109110 response_text = response ["message" ]["content" ]
110111
111112 try :
@@ -123,40 +124,30 @@ def _call_llm_for_section(
123124 logger .debug (
124125 f"⏱️ Total time for separate section extraction: { total_time :.2f} seconds"
125126 )
126-
127127 return transformed_data
128128 except json .JSONDecodeError as e :
129129 logger .error (f"❌ Error parsing JSON for { section_name } section: { e } " )
130130 logger .error (f"Raw response: { response_text } " )
131131 return None
132-
133132 except Exception as e :
134133 logger .error (f"❌ Error calling LLM for { section_name } section: { e } " )
135134 return None
136135
# --- Per-section extractors: each renders its prompt template and delegates to _call_llm_for_section ---
def extract_basics_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "basics" section from resume text via the LLM.

    Renders the "basics" prompt template with ``resume_text`` and forwards
    the prompt to ``_call_llm_for_section`` using the ``BasicsSection``
    model as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template(
        "basics", text_content=resume_text
    )
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render basics template")
        return None
    return self._call_llm_for_section("basics", resume_text, prompt, BasicsSection)
145142
def extract_work_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "work" section from resume text via the LLM.

    Renders the "work" prompt template with ``resume_text`` and forwards
    the prompt to ``_call_llm_for_section`` using the ``WorkSection`` model
    as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template("work", text_content=resume_text)
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render work template")
        return None
    return self._call_llm_for_section("work", resume_text, prompt, WorkSection)
152146
def extract_education_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "education" section from resume text via the LLM.

    Renders the "education" prompt template with ``resume_text`` and
    forwards the prompt to ``_call_llm_for_section`` using the
    ``EducationSection`` model as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template(
        "education", text_content=resume_text
    )
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render education template")
        return None
    return self._call_llm_for_section(
        "education", resume_text, prompt, EducationSection
    )
def extract_skills_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "skills" section from resume text via the LLM.

    Renders the "skills" prompt template with ``resume_text`` and forwards
    the prompt to ``_call_llm_for_section`` using the ``SkillsSection``
    model as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template(
        "skills", text_content=resume_text
    )
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render skills template")
        return None
    return self._call_llm_for_section("skills", resume_text, prompt, SkillsSection)
172160
def extract_projects_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "projects" section from resume text via the LLM.

    Renders the "projects" prompt template with ``resume_text`` and
    forwards the prompt to ``_call_llm_for_section`` using the
    ``ProjectsSection`` model as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template(
        "projects", text_content=resume_text
    )
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render projects template")
        return None
    return self._call_llm_for_section(
        "projects", resume_text, prompt, ProjectsSection
    )
def extract_awards_section(self, resume_text: str) -> Optional[Dict]:
    """Extract the "awards" section from resume text via the LLM.

    Renders the "awards" prompt template with ``resume_text`` and forwards
    the prompt to ``_call_llm_for_section`` using the ``AwardsSection``
    model as the expected response schema.

    Args:
        resume_text: The text to analyse.

    Returns:
        The result of ``_call_llm_for_section``, or ``None`` when the
        prompt template could not be rendered.
    """
    prompt = self.template_manager.render_template(
        "awards", text_content=resume_text
    )
    if not prompt:
        # Treat a falsy prompt as a render failure: skip the LLM call
        # instead of sending it an empty or missing prompt.
        logger.error("❌ Failed to render awards template")
        return None
    return self._call_llm_for_section("awards", resume_text, prompt, AwardsSection)
192174
# --- Public entry points: build a JSONResume from raw resume text or from a PDF file ---
def extract_json_from_text(self, resume_text: str) -> Optional[JSONResume]:
    """Build a ``JSONResume`` from already-extracted resume text.

    Args:
        resume_text: Resume content as text (markdown is expected by the
            section splitter used downstream).

    Returns:
        The value returned by ``_extract_all_sections_separately``, or
        ``None`` when ``resume_text`` is empty or only whitespace.
    """
    if not resume_text or not resume_text.strip():
        # Mirror extract_json_from_pdf: with no text there is nothing to
        # split or send to the LLM, so report the failure and stop early.
        logger.error("❌ No resume text provided for JSON extraction")
        return None
    return self._extract_all_sections_separately(resume_text)
199178
def extract_json_from_pdf(self, pdf_path: str) -> Optional[JSONResume]:
    """Build a ``JSONResume`` from a PDF file.

    The PDF is first converted to text with ``extract_text_from_pdf``; the
    text is then handed to ``_extract_all_sections_separately``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The value returned by ``_extract_all_sections_separately``, or
        ``None`` when no text could be extracted from the PDF.
    """
    logger.debug(f"📄 Extracting text from PDF: {pdf_path}")
    text_content = self.extract_text_from_pdf(pdf_path)
    if not text_content:
        # Log the failure so a None result can be traced back to the PDF
        # text extraction step rather than to the LLM stage.
        logger.error("❌ Failed to extract text from PDF")
        return None

    logger.debug(
        f"✅ Successfully extracted {len(text_content)} characters from PDF"
    )
    return self._extract_all_sections_separately(text_content)
219184
220185 def _extract_section_data (
221186 self , text_content : str , section_name : str , return_model = None
@@ -228,85 +193,111 @@ def _extract_section_data(
228193 "projects" : self .extract_projects_section ,
229194 "awards" : self .extract_awards_section ,
230195 }
196+ if section_name in section_extractors :
197+ return section_extractors [section_name ](text_content )
198+ return None
231199
232- if section_name not in section_extractors :
233- logger .error (f"❌ Invalid section name: { section_name } " )
234- logger .error (f"Valid sections: { list (section_extractors .keys ())} " )
235- return None
236-
237- return section_extractors [section_name ](text_content )
# --- Markdown section splitter ---
# Helper method of the PDFHandler class that pre-splits the resume markdown by its section headers.
202+ def _split_markdown_by_headers (self , markdown_text : str ) -> dict :
203+ """
204+ Splits a markdown string into a dictionary based on H2 headers.
205+ This is a robust, deterministic way to parse the resume structure.
206+ """
207+ sections = {}
208+ lines = markdown_text .strip ().split ("\n " )
209+
210+ current_header_key = "basics"
211+ current_content = []
212+
213+ known_headers = {
214+ "academic details" : "education" ,
215+ "work experience" : "work" ,
216+ "projects" : "projects" ,
217+ "technical skills" : "skills" ,
218+ "relevant courses" : "skills" , # Group courses with skills
219+ "achievements" : "awards" ,
220+ }
238221
239- def _extract_single_section (
240- self , text_content : str , section_name : str , return_model = None
241- ) -> Optional [Dict ]:
242- section_data = self ._extract_section_data (
243- text_content , section_name , return_model
244- )
245- if section_data :
246- complete_resume = {
247- "basics" : None ,
248- "work" : None ,
249- "volunteer" : None ,
250- "education" : None ,
251- "awards" : None ,
252- "certificates" : None ,
253- "publications" : None ,
254- "skills" : None ,
255- "languages" : None ,
256- "interests" : None ,
257- "references" : None ,
258- "projects" : None ,
259- "meta" : None ,
260- }
222+ for line in lines :
223+ if line .strip ().startswith ("##" ):
224+ cleaned_line = line .strip ().replace ("##" , "" ).strip ().lower ()
225+
226+ if cleaned_line in known_headers :
227+ # Save the content of the previous section
228+ if current_header_key and current_content :
229+ sections [current_header_key ] = "\n " .join (
230+ current_content
231+ ).strip ()
232+
233+ # Start the new section
234+ current_header_key = known_headers [cleaned_line ]
235+ current_content = []
236+ else :
237+ # If it's a header we don't recognize, treat it as content
238+ current_content .append (line )
239+ else :
240+ current_content .append (line )
261241
262- complete_resume .update (section_data )
263- return complete_resume
242+ # Save the very last section
243+ if current_header_key and current_content :
244+ sections [current_header_key ] = "\n " .join (current_content ).strip ()
264245
265- return None
246+ return sections
266247
# --- Section-by-section extraction: split the markdown by headers first, then send each section to the LLM separately ---
267249 def _extract_all_sections_separately (
268250 self , text_content : str
269251 ) -> Optional [JSONResume ]:
270252 start_time = time .time ()
271253
272- sections = ["basics" , "work" , "education" , "skills" , "projects" , "awards" ]
254+ # Step 1: Reliably split the document into sections using our new helper function.
255+ sectioned_text = self ._split_markdown_by_headers (text_content )
273256
274257 complete_resume = {
275258 "basics" : None ,
276259 "work" : None ,
277- "volunteer" : None ,
278260 "education" : None ,
279261 "awards" : None ,
280- "certificates" : None ,
281- "publications" : None ,
282262 "skills" : None ,
283- "languages" : None ,
284- "interests" : None ,
285- "references" : None ,
286263 "projects" : None ,
287- "meta" : None ,
288264 }
289265
290- for section_name in sections :
291- section_data = self ._extract_section_data (text_content , section_name )
292-
293- if section_data :
294- complete_resume .update (section_data )
295- logger .debug (f"✅ Successfully extracted { section_name } section" )
296- else :
297- logger .error (f"⚠️ Failed to extract { section_name } section" )
266+ # Step 2: Loop through the pre-separated sections and send them to the LLM for analysis.
267+ for section_name , section_content in sectioned_text .items ():
268+ if section_name in complete_resume and section_content :
269+ # Pass the specific section_content, not the whole resume text.
270+ section_data = self ._extract_section_data (section_content , section_name )
271+
272+ if section_data :
273+ # If a section already has data (e.g. skills + courses), merge them
274+ if complete_resume .get (section_name ):
275+ # This is a simplified merge, more complex logic can be added if needed
276+ if isinstance (complete_resume [section_name ], list ):
277+ complete_resume [section_name ].extend (
278+ section_data .get (section_name , [])
279+ )
280+ else :
281+ complete_resume .update (section_data )
282+ logger .debug (f"✅ Successfully extracted { section_name } section" )
283+ else :
284+ logger .error (
285+ f"⚠️ Failed to extract { section_name } section using LLM"
286+ )
298287
299288 try :
300289 if complete_resume .get ("basics" ) and isinstance (
301290 complete_resume ["basics" ], dict
302291 ):
303- try :
304- complete_resume ["basics" ] = Basics (** complete_resume ["basics" ])
305- except Exception as e :
306- logger .error (f"❌ Error creating Basics object: { e } " )
307- complete_resume ["basics" ] = None
292+ complete_resume ["basics" ] = Basics (** complete_resume ["basics" ])
293+
294+ # Filter out keys not expected by JSONResume before creating the object
295+ valid_keys = JSONResume .model_fields .keys ()
296+ filtered_resume_data = {
297+ k : v for k , v in complete_resume .items () if k in valid_keys
298+ }
308299
309- json_resume = JSONResume (** complete_resume )
300+ json_resume = JSONResume (** filtered_resume_data )
310301
311302 end_time = time .time ()
312303 total_time = end_time - start_time
0 commit comments