1515
1616load_dotenv ()
1717
18+
def collect_quran_threads(output_dir: str = "analyzed_data_v2") -> List[Dict]:
    """Collect all Quran-related threads from v2 analysis.

    Scans every ``analyzed_threads_batch_*.json`` file directly inside
    ``output_dir`` (in sorted path order) and keeps each entry of the file's
    ``"results"`` list whose ``"topic"`` equals ``"quran"``.

    Args:
        output_dir: Directory containing the batch files. Defaults to
            ``"analyzed_data_v2"`` relative to the current working directory.

    Returns:
        One dict per matching result with the keys ``user_input``,
        ``language``, ``thread_id`` (each defaulting to ``""`` when absent)
        and ``pii_confidence`` (defaulting to ``0.0``).

    A file that cannot be read or parsed is reported on stdout and the scan
    continues with the next file; results appended before the failure are kept.
    """
    quran_threads = []

    print("Scanning batch files for Quran threads...")
    completed_files = sorted(Path(output_dir).glob("analyzed_threads_batch_*.json"))

    for file in completed_files:
        try:
            # Decode explicitly as UTF-8 so non-ASCII questions load the same
            # way on every platform instead of using the locale's default.
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
            for result in data.get("results", []):
                if result.get("topic") != "quran":
                    continue
                quran_threads.append(
                    {
                        "user_input": result.get("user_input", ""),
                        "language": result.get("language", ""),
                        "thread_id": result.get("thread_id", ""),
                        "pii_confidence": result.get("pii_confidence", 0.0),
                    }
                )
        except Exception as e:
            # Deliberately broad: the scan is best-effort, so one unreadable or
            # malformed batch must not abort the whole collection.
            print(f"Error reading {file}: {e}")

    return quran_threads
4245
46+
4347def analyze_500_samples (sample_threads : List [Dict ]) -> Dict :
4448 """Analyze 500 samples with Gemini 2.5 Flash."""
45-
46- api_key = os .getenv (' GOOGLE_API_KEY' )
49+
50+ api_key = os .getenv (" GOOGLE_API_KEY" )
4751 if not api_key :
4852 raise ValueError ("GOOGLE_API_KEY environment variable not set" )
49-
53+
5054 client = genai .Client (api_key = api_key )
51-
55+
5256 # Prepare samples in batches to avoid context limits
5357 batch_size = 100
5458 all_clusters = []
55-
59+
5660 for i in range (0 , len (sample_threads ), batch_size ):
57- batch = sample_threads [i : i + batch_size ]
61+ batch = sample_threads [i : i + batch_size ]
5862 batch_num = i // batch_size + 1
5963 print (f"\n Analyzing batch { batch_num } ({ len (batch )} questions)..." )
60-
61- sample_text = "\n \n " .join ([
62- f"Q{ j + i + 1 } : { thread ['user_input' ]} "
63- for j , thread in enumerate (batch )
64- ])
65-
64+
65+ sample_text = "\n \n " .join ([f"Q{ j + i + 1 } : { thread ['user_input' ]} " for j , thread in enumerate (batch )])
66+
6667 prompt = f"""Analyze these { len (batch )} Quran-related questions (part of a 500-question sample).
6768
6869Identify natural subtopic clusters. Be specific and consistent.
@@ -94,42 +95,40 @@ def analyze_500_samples(sample_threads: List[Dict]) -> Dict:
9495 ...
9596 ]
9697}}"""
97-
98+
9899 try :
99- response = client .models .generate_content (
100- model = 'gemini-2.5-flash' ,
101- contents = [prompt ]
102- )
103-
100+ response = client .models .generate_content (model = "gemini-2.5-flash" , contents = [prompt ])
101+
104102 response_text = response .text .strip ()
105- if response_text .startswith (' ```json' ):
103+ if response_text .startswith (" ```json" ):
106104 response_text = response_text [7 :- 3 ].strip ()
107- elif response_text .startswith (' ```' ):
105+ elif response_text .startswith (" ```" ):
108106 response_text = response_text [3 :- 3 ].strip ()
109-
107+
110108 batch_result = json .loads (response_text )
111109 all_clusters .extend (batch_result .get ("question_clusters" , []))
112-
110+
113111 except Exception as e :
114112 print (f"Error analyzing batch { batch_num } : { e } " )
115-
113+
116114 return all_clusters
117115
116+
118117def generate_comprehensive_analysis (all_clusters : List [Dict ], sample_threads : List [Dict ]) -> Dict :
119118 """Generate comprehensive analysis from all cluster assignments."""
120-
119+
121120 # Count cluster frequencies
122121 cluster_counts = Counter ()
123122 for item in all_clusters :
124123 cluster_counts [item .get ("cluster" , "Unknown" )] += 1
125-
124+
126125 # Calculate percentages
127126 total = len (all_clusters )
128127 cluster_stats = []
129-
128+
130129 for cluster , count in cluster_counts .most_common ():
131130 percentage = (count / total * 100 ) if total > 0 else 0
132-
131+
133132 # Get example questions for this cluster
134133 examples = []
135134 for i , item in enumerate (all_clusters ):
@@ -139,129 +138,123 @@ def generate_comprehensive_analysis(all_clusters: List[Dict], sample_threads: Li
139138 idx = int (q_id [1 :]) - 1
140139 if 0 <= idx < len (sample_threads ):
141140 examples .append (sample_threads [idx ]["user_input" ][:150 ])
142-
143- cluster_stats .append ({
144- "name" : cluster ,
145- "count" : count ,
146- "percentage" : percentage ,
147- "examples" : examples
148- })
149-
141+
142+ cluster_stats .append ({"name" : cluster , "count" : count , "percentage" : percentage , "examples" : examples })
143+
150144 # Language distribution
151145 lang_counts = Counter (thread ["language" ] for thread in sample_threads )
152-
146+
153147 return {
154148 "sample_size" : len (sample_threads ),
155149 "cluster_distribution" : cluster_stats ,
156150 "language_distribution" : dict (lang_counts .most_common ()),
157- "analysis_timestamp" : datetime .now ().isoformat ()
151+ "analysis_timestamp" : datetime .now ().isoformat (),
158152 }
159153
154+
def _print_banner(title, leading_blank=True):
    """Print ``title`` framed by two 80-character ``=`` rules."""
    rule = "=" * 80
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _print_category(heading, clusters, lower, upper):
    """Print ``heading`` followed by each cluster with ``lower <= percentage < upper``."""
    print(heading)
    for cluster in clusters:
        if lower <= cluster["percentage"] < upper:
            print(f"  • {cluster['name']}: {cluster['percentage']:.1f}%")


def main():
    """Main function to analyze 500 Quran samples.

    Steps, in order:
      1. Collect Quran threads via ``collect_quran_threads``.
      2. Draw a random sample of at most 500 threads.
      3. Cluster the sample via ``analyze_500_samples``.
      4. Aggregate via ``generate_comprehensive_analysis`` and print the
         cluster, summary and language reports.
      5. Write the analysis and the sampled threads to
         ``quran_500_analysis.json`` in the current working directory.
      6. Print the static comparison notes and the categorised
         recommendations.

    Returns early, without writing the output file, when no threads were
    collected or when the clustering step returned nothing.
    """
    _print_banner("QURAN SUBTOPIC ANALYSIS - 500 SAMPLE ANALYSIS", leading_blank=False)

    # Collect all Quran threads
    quran_threads = collect_quran_threads()
    print(f"Total Quran threads found: {len(quran_threads):,}")

    # Nothing to sample: stop here instead of invoking the model step with an
    # empty sample and then reporting a misleading clustering failure.
    if not quran_threads:
        print("No Quran threads found; nothing to analyze")
        return

    if len(quran_threads) < 500:
        print(f"Warning: Only {len(quran_threads)} threads available")
    sample_size = min(500, len(quran_threads))

    # Random sample
    sample = random.sample(quran_threads, sample_size)
    print(f"Randomly sampled: {sample_size} threads")

    # Analyze with Gemini
    print("\nAnalyzing with Gemini 2.5 Flash...")
    all_clusters = analyze_500_samples(sample)

    if not all_clusters:
        print("Failed to get cluster analysis")
        return

    # Generate comprehensive analysis
    print("\nGenerating comprehensive analysis...")
    analysis = generate_comprehensive_analysis(all_clusters, sample)
    clusters = analysis["cluster_distribution"]

    # Display results
    _print_banner("QURAN SUBTOPIC CLUSTERS (500 SAMPLES)")

    for i, cluster in enumerate(clusters, 1):
        print(f"\n{i:2d}. {cluster['name']}")
        print(f"    Count: {cluster['count']} ({cluster['percentage']:.1f}%)")
        if cluster["examples"]:
            print("    Examples:")
            for example in cluster["examples"]:
                print(f"      • {example}...")

    # Summary statistics
    _print_banner("SUMMARY STATISTICS")

    top_5 = clusters[:5]
    top_5_pct = sum(c["percentage"] for c in top_5)

    print(f"\n📊 Top 5 clusters account for {top_5_pct:.1f}% of all questions:")
    for cluster in top_5:
        print(f"  • {cluster['name']}: {cluster['percentage']:.1f}%")

    # Language distribution (only the first five entries are shown)
    print("\n🌍 Language Distribution:")
    lang_dist = analysis["language_distribution"]
    for lang, count in list(lang_dist.items())[:5]:
        pct = count / sample_size * 100
        print(f"  • {lang}: {count} ({pct:.1f}%)")

    # Save results
    output_file = Path("quran_500_analysis.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump({"analysis": analysis, "sample_threads": sample}, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Full analysis saved to: {output_file}")

    # Comparison with 100-sample analysis (static notes, not computed here)
    _print_banner("COMPARISON: 500 vs 100 SAMPLE ANALYSIS")

    print("\n📈 Key Differences with 500 samples:")
    print("  • More granular subcategories emerge")
    print("  • Percentages stabilize with larger sample")
    print("  • Rare categories become visible")
    print("  • Better representation of language diversity")

    # Recommendations
    _print_banner("RECOMMENDATIONS FOR QURAN CONTENT ORGANIZATION")

    # The primary bucket includes clusters at exactly 10%, so label it ">=".
    _print_category("\n✅ Primary Categories (>=10%):", clusters, 10, float("inf"))
    _print_category("\n📝 Secondary Categories (5-10%):", clusters, 5, 10)
    _print_category("\n💡 Specialized Categories (<5%):", clusters, float("-inf"), 5)
265257
258+
266259if __name__ == "__main__" :
267- main ()
260+ main ()
0 commit comments