-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexamples.py
More file actions
278 lines (214 loc) · 8.97 KB
/
examples.py
File metadata and controls
278 lines (214 loc) · 8.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Script d'exemple avancé montrant toutes les fonctionnalités
avec optimisations de performance.
"""
from src.pipeline import DarijaDatasetPipeline
from src.logger import setup_logger
from src.optimization import performance_monitor, cache_manager
import time
logger = setup_logger(__name__)
def example_basic_usage():
    """Basic example: feed comments directly into the pipeline.

    Builds a small mixed-language sample (Darija, MSA, English),
    processes it, saves the dataset and prints statistics.

    Returns:
        DarijaDatasetPipeline: the pipeline instance, so callers can
        inspect its state after processing.
    """
    print("\n" + "="*60)
    print("EXEMPLE 1: Traitement Basique de Commentaires")
    print("="*60 + "\n")
    pipeline = DarijaDatasetPipeline(use_scraper=False)
    # Sample comments: Darija, MSA and English — the English one is
    # expected to be filtered out by the pipeline.
    comments = [
        {
            "text": "واح البدر يا سيدي! شحال ديال الجمالة في هذا الوقت",
            "url": "https://www.facebook.com/post/123"
        },
        {
            "text": "مليح البزاف يا لخوي! غادي نشوفك قريب إن شاء الله",
            "url": "https://www.facebook.com/post/456"
        },
        {
            "text": "نحن ندعم اللغة العربية الفصحى والدارجة المغربية",
            "url": "https://www.facebook.com/post/789"
        },
        {
            "text": "This is an English comment that will be filtered",
            "url": "https://www.facebook.com/post/123"
        },
        {
            "text": "السلام عليكم ورحمة الله وبركاته، كيفاش تاع الحوال",
            "url": "https://www.facebook.com/post/1011"
        },
    ]
    # Lazy %-formatting: the message is only built if the record is emitted.
    logger.info("Traitement de %d commentaires...", len(comments))
    # The returned Darija count was previously bound to an unused local;
    # statistics are printed below instead, so the value is discarded.
    pipeline.process_comments_batch(comments)
    pipeline.save_dataset()
    pipeline.print_statistics()
    return pipeline
def example_with_batching():
    """Demonstrate optimized batch cleaning of a large list of texts.

    Generates 600 sample texts, cleans them in batches of 100 through
    OptimizedBatchProcessor, and reports elapsed time and throughput.
    """
    print("\n" + "="*60)
    print("EXEMPLE 2: Traitement Optimisé par Lots")
    print("="*60 + "\n")
    from src.optimization import OptimizedBatchProcessor
    from src.cleaner import DataCleaner

    data_cleaner = DataCleaner()
    # Six base texts repeated 100 times -> 600 texts total.
    base_texts = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي! غادي نشوفك قريب",
        "This is English text",
        "نحن ندعم اللغة العربية الفصحى",
        "Bonjour mon ami, ça va?",
        "السلام عليكم ورحمة الله وبركاته",
    ]
    sample_texts = base_texts * 100
    print(f"Nettoyage de {len(sample_texts)} textes par lots...\n")

    t0 = time.time()
    cleaned = OptimizedBatchProcessor.process_with_batching(
        data_cleaner.clean_batch,
        sample_texts,
        batch_size=100,
    )
    elapsed = time.time() - t0

    print(f"✓ {len(cleaned)} textes nettoyés en {elapsed:.2f}s")
    print(f"✓ Performance: {len(sample_texts)/elapsed:.0f} textes/seconde\n")
def example_language_detection():
    """Demonstrate language detection, including cache hits on repeats.

    Runs the detector over a list that deliberately repeats two texts,
    then prints the cache manager's statistics.
    """
    print("\n" + "="*60)
    print("EXEMPLE 3: Détection de Langue avec Cache")
    print("="*60 + "\n")
    from src.language_detector import LanguageDetector

    lang_detector = LanguageDetector()
    samples = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي",
        "This is an English text",
        "Bonjour mon ami",
        # Repeated entries below exercise the detection cache.
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي",
    ]

    print("Détection de langue:")
    for sample in samples:
        lang, confidence = lang_detector.detect_language(sample)
        is_darija = lang_detector.is_darija(sample)
        print(f" {sample[:40]:<42} → {lang} (conf: {confidence:.2f}, Darija: {is_darija})")

    print("\n" + "Cache Statistics:")
    stats = cache_manager.get_stats()
    print(f" Size: {stats['size']}")
    print(f" Hits: {stats['hits']}")
    print(f" Misses: {stats['misses']}")
    print(f" Hit Rate: {stats['hit_rate']:.2f}%\n")
def example_csv_operations():
    """Demonstrate CSV manager operations: add records, save, report stats."""
    print("\n" + "="*60)
    print("EXEMPLE 4: Opérations CSV")
    print("="*60 + "\n")
    from src.csv_manager import CSVManager

    manager = CSVManager("data/example_output.csv")
    # A handful of records to persist.
    sample_records = [
        {"text": "واح البدر يا سيدي!", "url": "https://facebook.com/1"},
        {"text": "مليح البزاف", "url": "https://facebook.com/2"},
        {"text": "نحن ندعم الدارجة", "url": "https://facebook.com/3"},
        {"text": "كيفاش تاع الحوال", "url": "https://facebook.com/4"},
    ]
    manager.add_records(sample_records)
    print(f"Ajouté {len(sample_records)} enregistrements\n")

    print("Sauvegarde du CSV...")
    saved_path = manager.save_to_csv()
    print(f"✓ Fichier sauvegardé: {saved_path}\n")

    # Summary statistics over the persisted records.
    summary = manager.get_statistics()
    print("Statistiques du CSV:")
    print(f" Total records: {summary['total_records']}")
    print(f" Unique URLs: {summary['unique_urls']}")
    print(f" Avg text length: {summary['avg_text_length']:.2f}\n")
def example_full_pipeline():
    """Run the complete optimized pipeline end to end.

    Builds 50 synthetic Darija comments (10 base texts x 5, spread over
    5 post URLs), processes them, saves the dataset, and reports the
    total time and throughput.
    """
    print("\n" + "="*60)
    print("EXEMPLE 5: Pipeline Complet Optimisé")
    print("="*60 + "\n")
    start_time = time.time()
    pipeline = DarijaDatasetPipeline(use_scraper=False)
    sample_texts_darija = [
        "واح البدر يا سيدي! شحال ديال الجمالة",
        "مليح البزاف يا لخوي! غادي نشوفك قريب",
        "نحن ندعم اللغة الدارجة المغربية",
        "السلام عليكم ورحمة الله وبركاته",
        "كيفاش تاع الحوال يا صديقي",
        "واش كاين شي مشاكل؟",
        "ولاه يا سيدي، كلشي مليح",
        "غادي نتلاقاو قريب إن شاء الله",
        "شنو الأخبار يا صحابي",
        "حنا فقراء وعندنا لقمة العيش",
    ]
    # 50 comments total; each text is tagged with its index and assigned
    # to one of 5 post URLs (comprehension replaces the append loop).
    comments = [
        {
            "text": f"{text} (comment {i+1})",
            "url": f"https://www.facebook.com/post/{i % 5 + 1}"
        }
        for i, text in enumerate(sample_texts_darija * 5)
    ]
    print(f"Traitement de {len(comments)} commentaires...")
    # Fixed: this line had an f-string prefix with no placeholders.
    print("(Chaque texte sera nettoyé, filtré et vérifié)")
    print()
    # Return values (darija count, output path) were bound to unused
    # locals before; statistics are printed below, so they are discarded.
    pipeline.process_comments_batch(comments)
    pipeline.save_dataset()
    elapsed = time.time() - start_time
    pipeline.print_statistics()
    print(f"\n✓ Temps total: {elapsed:.2f}s")
    print(f"✓ Vitesse de traitement: {len(comments)/elapsed:.0f} comments/sec\n")
def example_performance_analysis():
    """Benchmark text cleaning and language detection (1000 iterations each).

    Uses time.perf_counter(), the recommended monotonic high-resolution
    clock for benchmarking (time.time() can jump and has coarse
    resolution on some platforms). The redundant function-local
    `import time` was removed — `time` is already imported at module level.
    """
    print("\n" + "="*60)
    print("EXEMPLE 6: Analyse de Performance")
    print("="*60 + "\n")
    from src.cleaner import DataCleaner
    from src.language_detector import LanguageDetector
    cleaner = DataCleaner()
    detector = LanguageDetector()
    # Test text containing punctuation and an emoji to exercise cleaning.
    test_text = "واح البدر يا سيدي! شحال ديال الجمالة في هذا الوقت 😊"
    # Benchmark cleaning.
    start = time.perf_counter()
    for _ in range(1000):
        cleaner.clean(test_text)
    clean_time = time.perf_counter() - start
    # Benchmark detection on the cleaned text.
    cleaned = cleaner.clean(test_text)
    start = time.perf_counter()
    for _ in range(1000):
        detector.detect_language(cleaned)
    detect_time = time.perf_counter() - start
    print("Benchmark (1000 itérations):")
    print(f" Nettoyage: {clean_time:.2f}s ({1000/clean_time:.0f} ops/sec)")
    print(f" Détection: {detect_time:.2f}s ({1000/detect_time:.0f} ops/sec)")
    print()
def main():
    """Run all examples in order; log with traceback and re-raise on failure."""
    print("\n" + "="*70)
    print(" "*15 + "DARIJA DATASET BUILDER - EXEMPLES AVANCÉS")
    print("="*70)
    try:
        # Example 1: Basic usage
        example_basic_usage()
        # Example 2: Optimized batching
        example_with_batching()
        # Example 3: Language detection with cache
        example_language_detection()
        # Example 4: CSV operations
        example_csv_operations()
        # Example 5: Full pipeline
        example_full_pipeline()
        # Example 6: Performance analysis
        example_performance_analysis()
        print("\n" + "="*70)
        print("✓ Tous les exemples ont été exécutés avec succès!")
        print("="*70 + "\n")
    except Exception as e:
        # logger.exception records the full traceback (logger.error with an
        # f-string did not); lazy %-args avoid eager message formatting.
        logger.exception("Erreur lors de l'exécution des exemples: %s", e)
        raise
if __name__ == "__main__":
main()