-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexamples.py
More file actions
278 lines (214 loc) · 8.97 KB
/
examples.py
File metadata and controls
278 lines (214 loc) · 8.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Script d'exemple avancé montrant toutes les fonctionnalités
avec optimisations de performance.
"""
from src.pipeline import DarijaDatasetPipeline
from src.logger import setup_logger
from src.optimization import performance_monitor, cache_manager
import time
logger = setup_logger(__name__)
def example_basic_usage():
    """Basic example: feed comments directly into the pipeline.

    Builds a small mixed-language sample (Darija, MSA, English),
    processes it, saves the dataset and prints statistics.

    Returns:
        DarijaDatasetPipeline: the pipeline instance, so callers can
        inspect its state after processing.
    """
    print("\n" + "="*60)
    print("EXEMPLE 1: Traitement Basique de Commentaires")
    print("="*60 + "\n")
    pipeline = DarijaDatasetPipeline(use_scraper=False)
    # Sample comments: Darija, MSA and English — the English one is
    # expected to be filtered out by the pipeline.
    comments = [
        {
            "text": "واح البدر يا سيدي! شحال ديال الجمالة في هذا الوقت",
            "url": "https://www.facebook.com/post/123"
        },
        {
            "text": "مليح البزاف يا لخوي! غادي نشوفك قريب إن شاء الله",
            "url": "https://www.facebook.com/post/456"
        },
        {
            "text": "نحن ندعم اللغة العربية الفصحى والدارجة المغربية",
            "url": "https://www.facebook.com/post/789"
        },
        {
            "text": "This is an English comment that will be filtered",
            "url": "https://www.facebook.com/post/123"
        },
        {
            "text": "السلام عليكم ورحمة الله وبركاته، كيفاش تاع الحوال",
            "url": "https://www.facebook.com/post/1011"
        },
    ]
    # Lazy %-formatting: the message is only built if the record is emitted.
    logger.info("Traitement de %d commentaires...", len(comments))
    # The returned Darija count was previously bound to an unused local;
    # statistics are printed below instead, so the value is discarded.
    pipeline.process_comments_batch(comments)
    pipeline.save_dataset()
    pipeline.print_statistics()
    return pipeline
def example_with_batching():
    """Demonstrate optimized batch cleaning of a large list of texts.

    Generates 600 sample texts, cleans them in batches of 100 through
    OptimizedBatchProcessor, and reports elapsed time and throughput.
    """
    print("\n" + "="*60)
    print("EXEMPLE 2: Traitement Optimisé par Lots")
    print("="*60 + "\n")
    from src.optimization import OptimizedBatchProcessor
    from src.cleaner import DataCleaner

    data_cleaner = DataCleaner()
    # Six base texts repeated 100 times -> 600 texts total.
    base_texts = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي! غادي نشوفك قريب",
        "This is English text",
        "نحن ندعم اللغة العربية الفصحى",
        "Bonjour mon ami, ça va?",
        "السلام عليكم ورحمة الله وبركاته",
    ]
    sample_texts = base_texts * 100
    print(f"Nettoyage de {len(sample_texts)} textes par lots...\n")

    t0 = time.time()
    cleaned = OptimizedBatchProcessor.process_with_batching(
        data_cleaner.clean_batch,
        sample_texts,
        batch_size=100,
    )
    elapsed = time.time() - t0

    print(f"✓ {len(cleaned)} textes nettoyés en {elapsed:.2f}s")
    print(f"✓ Performance: {len(sample_texts)/elapsed:.0f} textes/seconde\n")
def example_language_detection():
    """Demonstrate language detection, including cache hits on repeats.

    Runs the detector over a list that deliberately repeats two texts,
    then prints the cache manager's statistics.
    """
    print("\n" + "="*60)
    print("EXEMPLE 3: Détection de Langue avec Cache")
    print("="*60 + "\n")
    from src.language_detector import LanguageDetector

    lang_detector = LanguageDetector()
    samples = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي",
        "This is an English text",
        "Bonjour mon ami",
        # Repeated entries below exercise the detection cache.
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي",
    ]

    print("Détection de langue:")
    for sample in samples:
        lang, confidence = lang_detector.detect_language(sample)
        is_darija = lang_detector.is_darija(sample)
        print(f" {sample[:40]:<42} → {lang} (conf: {confidence:.2f}, Darija: {is_darija})")

    print("\n" + "Cache Statistics:")
    stats = cache_manager.get_stats()
    print(f" Size: {stats['size']}")
    print(f" Hits: {stats['hits']}")
    print(f" Misses: {stats['misses']}")
    print(f" Hit Rate: {stats['hit_rate']:.2f}%\n")
def example_csv_operations():
    """Demonstrate CSV manager operations: add records, save, report stats."""
    print("\n" + "="*60)
    print("EXEMPLE 4: Opérations CSV")
    print("="*60 + "\n")
    from src.csv_manager import CSVManager

    manager = CSVManager("data/example_output.csv")
    # A handful of records to persist.
    sample_records = [
        {"text": "واح البدر يا سيدي!", "url": "https://facebook.com/1"},
        {"text": "مليح البزاف", "url": "https://facebook.com/2"},
        {"text": "نحن ندعم الدارجة", "url": "https://facebook.com/3"},
        {"text": "كيفاش تاع الحوال", "url": "https://facebook.com/4"},
    ]
    manager.add_records(sample_records)
    print(f"Ajouté {len(sample_records)} enregistrements\n")

    print("Sauvegarde du CSV...")
    saved_path = manager.save_to_csv()
    print(f"✓ Fichier sauvegardé: {saved_path}\n")

    # Summary statistics over the persisted records.
    summary = manager.get_statistics()
    print("Statistiques du CSV:")
    print(f" Total records: {summary['total_records']}")
    print(f" Unique URLs: {summary['unique_urls']}")
    print(f" Avg text length: {summary['avg_text_length']:.2f}\n")
def example_full_pipeline():
    """Run the complete optimized pipeline end to end.

    Builds 50 synthetic Darija comments (10 base texts x 5, spread over
    5 post URLs), processes them, saves the dataset, and reports the
    total time and throughput.
    """
    print("\n" + "="*60)
    print("EXEMPLE 5: Pipeline Complet Optimisé")
    print("="*60 + "\n")
    start_time = time.time()
    pipeline = DarijaDatasetPipeline(use_scraper=False)
    sample_texts_darija = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي! غادي نشوفك قريب",
        "نحن ندعم اللغة الدارجة المغربية",
        "السلام عليكم ورحمة الله وبركاته",
        "كيفاش تاع الحوال يا صديقي",
        "واش كاين شي مشاكل؟",
        "ولاه يا سيدي، كلشي مليح",
        "غادي نتلاقاو قريب إن شاء الله",
        "شنو الأخبار يا صحابي",
        "حنا فقراء وعندنا لقمة العيش",
    ]
    # 50 comments total; each text is tagged with its index and assigned
    # to one of 5 post URLs (comprehension replaces the append loop).
    comments = [
        {
            "text": f"{text} (comment {i+1})",
            "url": f"https://www.facebook.com/post/{i % 5 + 1}"
        }
        for i, text in enumerate(sample_texts_darija * 5)
    ]
    print(f"Traitement de {len(comments)} commentaires...")
    # Fixed: this line had an f-string prefix with no placeholders.
    print("(Chaque texte sera nettoyé, filtré et vérifié)")
    print()
    # Return values (darija count, output path) were bound to unused
    # locals before; statistics are printed below, so they are discarded.
    pipeline.process_comments_batch(comments)
    pipeline.save_dataset()
    elapsed = time.time() - start_time
    pipeline.print_statistics()
    print(f"\n✓ Temps total: {elapsed:.2f}s")
    print(f"✓ Vitesse de traitement: {len(comments)/elapsed:.0f} comments/sec\n")
def example_performance_analysis():
    """Benchmark text cleaning and language detection (1000 iterations each).

    Uses time.perf_counter(), the recommended monotonic high-resolution
    clock for benchmarking (time.time() can jump and has coarse
    resolution on some platforms). The redundant function-local
    `import time` was removed — `time` is already imported at module level.
    """
    print("\n" + "="*60)
    print("EXEMPLE 6: Analyse de Performance")
    print("="*60 + "\n")
    from src.cleaner import DataCleaner
    from src.language_detector import LanguageDetector
    cleaner = DataCleaner()
    detector = LanguageDetector()
    # Test text containing punctuation and an emoji to exercise cleaning.
    test_text = "واح البدر يا سيدي! شحال ديال الجمالة في هذا الوقت 😊"
    # Benchmark cleaning.
    start = time.perf_counter()
    for _ in range(1000):
        cleaner.clean(test_text)
    clean_time = time.perf_counter() - start
    # Benchmark detection on the cleaned text.
    cleaned = cleaner.clean(test_text)
    start = time.perf_counter()
    for _ in range(1000):
        detector.detect_language(cleaned)
    detect_time = time.perf_counter() - start
    print("Benchmark (1000 itérations):")
    print(f" Nettoyage: {clean_time:.2f}s ({1000/clean_time:.0f} ops/sec)")
    print(f" Détection: {detect_time:.2f}s ({1000/detect_time:.0f} ops/sec)")
    print()
def main():
    """Run all examples in order; log with traceback and re-raise on failure."""
    print("\n" + "="*70)
    print(" "*15 + "DARIJA DATASET BUILDER - EXEMPLES AVANCÉS")
    print("="*70)
    try:
        # Example 1: Basic usage
        example_basic_usage()
        # Example 2: Optimized batching
        example_with_batching()
        # Example 3: Language detection with cache
        example_language_detection()
        # Example 4: CSV operations
        example_csv_operations()
        # Example 5: Full pipeline
        example_full_pipeline()
        # Example 6: Performance analysis
        example_performance_analysis()
        print("\n" + "="*70)
        print("✓ Tous les exemples ont été exécutés avec succès!")
        print("="*70 + "\n")
    except Exception as e:
        # logger.exception records the full traceback (logger.error with an
        # f-string did not); lazy %-args avoid eager message formatting.
        logger.exception("Erreur lors de l'exécution des exemples: %s", e)
        raise
if __name__ == "__main__":
main()