Skip to content

Commit fb5dcb3

Browse files
committed
Refactor: Resolve ruff linting and formatting issues
- Fix E501 line too long errors by extracting long expressions to variables
- Fix E722 bare except by specifying exception types (json.JSONDecodeError, OSError)
- Fix F841 unused variables by adding noqa comments or commenting out code
- Apply ruff format across all files
1 parent 934a488 commit fb5dcb3

33 files changed

Lines changed: 1028 additions & 1143 deletions

analysis/scripts/analyze_quran_500.py

Lines changed: 74 additions & 81 deletions
Original file line number | Diff line number | Diff line change
@@ -15,54 +15,55 @@
1515

1616
load_dotenv()
1717

18+
1819
def collect_quran_threads() -> List[Dict]:
1920
"""Collect all Quran-related threads from v2 analysis."""
2021
output_dir = Path("analyzed_data_v2")
2122
quran_threads = []
22-
23-
print(f"Scanning batch files for Quran threads...")
23+
24+
print("Scanning batch files for Quran threads...")
2425
completed_files = sorted(output_dir.glob("analyzed_threads_batch_*.json"))
25-
26+
2627
for file in completed_files:
2728
try:
2829
with open(file) as f:
2930
data = json.load(f)
3031
for result in data.get("results", []):
3132
if result.get("topic") == "quran":
32-
quran_threads.append({
33-
"user_input": result.get("user_input", ""),
34-
"language": result.get("language", ""),
35-
"thread_id": result.get("thread_id", ""),
36-
"pii_confidence": result.get("pii_confidence", 0.0)
37-
})
33+
quran_threads.append(
34+
{
35+
"user_input": result.get("user_input", ""),
36+
"language": result.get("language", ""),
37+
"thread_id": result.get("thread_id", ""),
38+
"pii_confidence": result.get("pii_confidence", 0.0),
39+
}
40+
)
3841
except Exception as e:
3942
print(f"Error reading {file}: {e}")
40-
43+
4144
return quran_threads
4245

46+
4347
def analyze_500_samples(sample_threads: List[Dict]) -> Dict:
4448
"""Analyze 500 samples with Gemini 2.5 Flash."""
45-
46-
api_key = os.getenv('GOOGLE_API_KEY')
49+
50+
api_key = os.getenv("GOOGLE_API_KEY")
4751
if not api_key:
4852
raise ValueError("GOOGLE_API_KEY environment variable not set")
49-
53+
5054
client = genai.Client(api_key=api_key)
51-
55+
5256
# Prepare samples in batches to avoid context limits
5357
batch_size = 100
5458
all_clusters = []
55-
59+
5660
for i in range(0, len(sample_threads), batch_size):
57-
batch = sample_threads[i:i+batch_size]
61+
batch = sample_threads[i : i + batch_size]
5862
batch_num = i // batch_size + 1
5963
print(f"\nAnalyzing batch {batch_num} ({len(batch)} questions)...")
60-
61-
sample_text = "\n\n".join([
62-
f"Q{j+i+1}: {thread['user_input']}"
63-
for j, thread in enumerate(batch)
64-
])
65-
64+
65+
sample_text = "\n\n".join([f"Q{j + i + 1}: {thread['user_input']}" for j, thread in enumerate(batch)])
66+
6667
prompt = f"""Analyze these {len(batch)} Quran-related questions (part of a 500-question sample).
6768
6869
Identify natural subtopic clusters. Be specific and consistent.
@@ -94,42 +95,40 @@ def analyze_500_samples(sample_threads: List[Dict]) -> Dict:
9495
...
9596
]
9697
}}"""
97-
98+
9899
try:
99-
response = client.models.generate_content(
100-
model='gemini-2.5-flash',
101-
contents=[prompt]
102-
)
103-
100+
response = client.models.generate_content(model="gemini-2.5-flash", contents=[prompt])
101+
104102
response_text = response.text.strip()
105-
if response_text.startswith('```json'):
103+
if response_text.startswith("```json"):
106104
response_text = response_text[7:-3].strip()
107-
elif response_text.startswith('```'):
105+
elif response_text.startswith("```"):
108106
response_text = response_text[3:-3].strip()
109-
107+
110108
batch_result = json.loads(response_text)
111109
all_clusters.extend(batch_result.get("question_clusters", []))
112-
110+
113111
except Exception as e:
114112
print(f"Error analyzing batch {batch_num}: {e}")
115-
113+
116114
return all_clusters
117115

116+
118117
def generate_comprehensive_analysis(all_clusters: List[Dict], sample_threads: List[Dict]) -> Dict:
119118
"""Generate comprehensive analysis from all cluster assignments."""
120-
119+
121120
# Count cluster frequencies
122121
cluster_counts = Counter()
123122
for item in all_clusters:
124123
cluster_counts[item.get("cluster", "Unknown")] += 1
125-
124+
126125
# Calculate percentages
127126
total = len(all_clusters)
128127
cluster_stats = []
129-
128+
130129
for cluster, count in cluster_counts.most_common():
131130
percentage = (count / total * 100) if total > 0 else 0
132-
131+
133132
# Get example questions for this cluster
134133
examples = []
135134
for i, item in enumerate(all_clusters):
@@ -139,129 +138,123 @@ def generate_comprehensive_analysis(all_clusters: List[Dict], sample_threads: Li
139138
idx = int(q_id[1:]) - 1
140139
if 0 <= idx < len(sample_threads):
141140
examples.append(sample_threads[idx]["user_input"][:150])
142-
143-
cluster_stats.append({
144-
"name": cluster,
145-
"count": count,
146-
"percentage": percentage,
147-
"examples": examples
148-
})
149-
141+
142+
cluster_stats.append({"name": cluster, "count": count, "percentage": percentage, "examples": examples})
143+
150144
# Language distribution
151145
lang_counts = Counter(thread["language"] for thread in sample_threads)
152-
146+
153147
return {
154148
"sample_size": len(sample_threads),
155149
"cluster_distribution": cluster_stats,
156150
"language_distribution": dict(lang_counts.most_common()),
157-
"analysis_timestamp": datetime.now().isoformat()
151+
"analysis_timestamp": datetime.now().isoformat(),
158152
}
159153

154+
160155
def main():
161156
"""Main function to analyze 500 Quran samples."""
162-
157+
163158
print("=" * 80)
164159
print("QURAN SUBTOPIC ANALYSIS - 500 SAMPLE ANALYSIS")
165160
print("=" * 80)
166-
161+
167162
# Collect all Quran threads
168163
quran_threads = collect_quran_threads()
169164
print(f"Total Quran threads found: {len(quran_threads):,}")
170-
165+
171166
if len(quran_threads) < 500:
172167
print(f"Warning: Only {len(quran_threads)} threads available")
173168
sample_size = len(quran_threads)
174169
else:
175170
sample_size = 500
176-
171+
177172
# Random sample
178173
sample = random.sample(quran_threads, sample_size)
179174
print(f"Randomly sampled: {sample_size} threads")
180-
175+
181176
# Analyze with Gemini
182-
print(f"\nAnalyzing with Gemini 2.5 Flash...")
177+
print("\nAnalyzing with Gemini 2.5 Flash...")
183178
all_clusters = analyze_500_samples(sample)
184-
179+
185180
if not all_clusters:
186181
print("Failed to get cluster analysis")
187182
return
188-
183+
189184
# Generate comprehensive analysis
190-
print(f"\nGenerating comprehensive analysis...")
185+
print("\nGenerating comprehensive analysis...")
191186
analysis = generate_comprehensive_analysis(all_clusters, sample)
192-
187+
193188
# Display results
194189
print("\n" + "=" * 80)
195190
print("QURAN SUBTOPIC CLUSTERS (500 SAMPLES)")
196191
print("=" * 80)
197-
192+
198193
for i, cluster in enumerate(analysis["cluster_distribution"], 1):
199194
print(f"\n{i:2d}. {cluster['name']}")
200195
print(f" Count: {cluster['count']} ({cluster['percentage']:.1f}%)")
201-
if cluster['examples']:
202-
print(f" Examples:")
203-
for example in cluster['examples']:
196+
if cluster["examples"]:
197+
print(" Examples:")
198+
for example in cluster["examples"]:
204199
print(f" • {example}...")
205-
200+
206201
# Summary statistics
207202
print("\n" + "=" * 80)
208203
print("SUMMARY STATISTICS")
209204
print("=" * 80)
210-
205+
211206
top_5 = analysis["cluster_distribution"][:5]
212207
top_5_pct = sum(c["percentage"] for c in top_5)
213-
208+
214209
print(f"\n📊 Top 5 clusters account for {top_5_pct:.1f}% of all questions:")
215210
for cluster in top_5:
216211
print(f" • {cluster['name']}: {cluster['percentage']:.1f}%")
217-
212+
218213
# Language distribution
219-
print(f"\n🌍 Language Distribution:")
214+
print("\n🌍 Language Distribution:")
220215
lang_dist = analysis["language_distribution"]
221216
for lang, count in list(lang_dist.items())[:5]:
222-
pct = (count / sample_size * 100)
217+
pct = count / sample_size * 100
223218
print(f" • {lang}: {count} ({pct:.1f}%)")
224-
219+
225220
# Save results
226221
output_file = Path("quran_500_analysis.json")
227-
with open(output_file, 'w', encoding='utf-8') as f:
228-
json.dump({
229-
"analysis": analysis,
230-
"sample_threads": sample
231-
}, f, indent=2, ensure_ascii=False)
232-
222+
with open(output_file, "w", encoding="utf-8") as f:
223+
json.dump({"analysis": analysis, "sample_threads": sample}, f, indent=2, ensure_ascii=False)
224+
233225
print(f"\n💾 Full analysis saved to: {output_file}")
234-
226+
235227
# Comparison with 100-sample analysis
236228
print("\n" + "=" * 80)
237229
print("COMPARISON: 500 vs 100 SAMPLE ANALYSIS")
238230
print("=" * 80)
239-
231+
240232
print("\n📈 Key Differences with 500 samples:")
241233
print(" • More granular subcategories emerge")
242234
print(" • Percentages stabilize with larger sample")
243235
print(" • Rare categories become visible")
244236
print(" • Better representation of language diversity")
245-
237+
246238
# Recommendations
247239
print("\n" + "=" * 80)
248240
print("RECOMMENDATIONS FOR QURAN CONTENT ORGANIZATION")
249241
print("=" * 80)
250-
242+
251243
print("\n✅ Primary Categories (>10%):")
252244
for cluster in analysis["cluster_distribution"]:
253245
if cluster["percentage"] >= 10:
254246
print(f" • {cluster['name']}: {cluster['percentage']:.1f}%")
255-
247+
256248
print("\n📝 Secondary Categories (5-10%):")
257249
for cluster in analysis["cluster_distribution"]:
258250
if 5 <= cluster["percentage"] < 10:
259251
print(f" • {cluster['name']}: {cluster['percentage']:.1f}%")
260-
252+
261253
print("\n💡 Specialized Categories (<5%):")
262254
for cluster in analysis["cluster_distribution"]:
263255
if cluster["percentage"] < 5:
264256
print(f" • {cluster['name']}: {cluster['percentage']:.1f}%")
265257

258+
266259
if __name__ == "__main__":
267-
main()
260+
main()

0 commit comments

Comments
 (0)