2626 " \n " ,
2727 " # TODO: A remplacer par vos propres identifiants Garage\n " ,
2828 " GARAGE_ENDPOINT = \" http://garage:3900\"\n " ,
29- " GARAGE_ACCESS_KEY = \" GK907b22f51dc0d0c5164474f2 \"\n " ,
30- " GARAGE_SECRET_KEY = \" 6cf587853042d92d2cf6bb85b7c46a6a2400a47822e9baae32f9be0b7c5c9663 \"\n " ,
29+ " GARAGE_ACCESS_KEY = \" GK2ae23cad2bbbf648143b1b8c \"\n " ,
30+ " GARAGE_SECRET_KEY = \" 997e31832cbc9c78a2d919897f1cc9d63ad2c628464a7fba3a55f972c31790ee \"\n " ,
3131 " \n " ,
3232 " SILVER_PATH = \" s3a://silver/hackernews\"\n " ,
3333 " GOLD_PATH = \" s3a://gold/hackernews\"\n " ,
6464 "execution_count" : null ,
6565 "metadata" : {},
6666 "outputs" : [],
67- "source" : " import sparknlp\n from sparknlp.base import DocumentAssembler, Finisher\n from sparknlp.annotator import (\n Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n SentimentDLModel, NerDLModel, NerConverter,\n SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n )\n from pyspark.ml import Pipeline\n from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\n from pyspark.sql.functions import sum as sum_\n from pyspark.sql.window import Window\n\n print(f\" Spark NLP version: {sparknlp.version()}\" )"
67+ "source" : [
68+ " import sparknlp\n " ,
69+ " from sparknlp.base import DocumentAssembler, Finisher\n " ,
70+ " from sparknlp.annotator import (\n " ,
71+ " Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n " ,
72+ " SentimentDLModel, NerDLModel, NerConverter,\n " ,
73+ " SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n " ,
74+ " )\n " ,
75+ " from pyspark.ml import Pipeline\n " ,
76+ " from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\n " ,
77+ " from pyspark.sql.functions import sum as sum_\n " ,
78+ " from pyspark.sql.window import Window\n " ,
79+ " \n " ,
80+ " print(f\" Spark NLP version: {sparknlp.version()}\" )"
81+ ]
6882 },
6983 {
7084 "cell_type" : " markdown" ,
342356 "cell_type" : " markdown" ,
343357 "metadata" : {},
344358 "source" : [
345- " ## 6b. Window Function - Classement des auteurs\n " ,
359+ " ### 6b. Window Function - Classement des auteurs\n " ,
346360 " \n " ,
347361 " Utilisation de RANK() pour classer les auteurs par leur ratio de commentaires positifs."
348362 ]
406420 "source" : [
407421 " sentiment_windowed = comments_with_sentiment \\\n " ,
408422 " .withColumn(\" sentiment_result\" , explode(col(\" sentiment.result\" ))) \\\n " ,
423+ " .withColumn(\" timestamp\" , col(\" timestamp\" ).cast(\" timestamp\" )) \\\n " ,
424+ " .withWatermark(\" timestamp\" , \" 10 minutes\" ) \\\n " ,
409425 " .groupBy(\n " ,
410426 " window(\n " ,
411427 " col(\" timestamp\" ),\n " ,
421437 " \n " ,
422438 " sentiment_windowed.writeStream \\\n " ,
423439 " .format(\" delta\" ) \\\n " ,
424- " .outputMode(\" update \" ) \\\n " ,
440+ " .outputMode(\" append \" ) \\\n " ,
425441 " .option(\" checkpointLocation\" , f\" {GOLD_PATH}/_checkpoints/sentiment_windowed\" ) \\\n " ,
426442 " .start(f\" {GOLD_PATH}/sentiment_real_time\" )"
427443 ]
430446 "cell_type" : " markdown" ,
431447 "metadata" : {},
432448 "source" : [
433- " ## 7a. Sentiment en temps réel"
449+ " ### 7a. Sentiment en temps réel"
434450 ]
435451 },
436452 {
445461 "execution_count" : null ,
446462 "metadata" : {},
447463 "outputs" : [],
448- "source" : " comments_with_kw = keywords_model_fitted.transform(comments_stream)\n keywords_exploded_stream = comments_with_kw \\\n .select(\n col(\" id\" ),\n col(\" timestamp\" ),\n explode(col(\" keywords\" )).alias(\" keyword\" )\n ) \\\n .groupBy(\n window(\n col(\" timestamp\" ),\n \" 10 minutes\" ,\n \" 2 minutes\"\n ),\n col(\" keyword\" )\n ) \\\n .count()\n\n keywords_exploded_stream.writeStream \\\n .format(\" delta\" ) \\\n .outputMode(\" complete\" ) \\\n .option(\" checkpointLocation\" , f\" {GOLD_PATH}/_checkpoints/keywords_windowed\" ) \\\n .start(f\" {GOLD_PATH}/keywords_real_time\" )"
464+ "source" : [
465+ " comments_with_kw = keywords_model_fitted.transform(comments_stream)\n " ,
466+ " keywords_exploded_stream = comments_with_kw \\\n " ,
467+ " .select(\n " ,
468+ " col(\" id\" ),\n " ,
469+ " col(\" timestamp\" ),\n " ,
470+ " explode(col(\" keywords\" )).alias(\" keyword\" )\n " ,
471+ " ) \\\n " ,
472+ " .groupBy(\n " ,
473+ " window(\n " ,
474+ " col(\" timestamp\" ),\n " ,
475+ " \" 10 minutes\" ,\n " ,
476+ " \" 2 minutes\"\n " ,
477+ " ),\n " ,
478+ " col(\" keyword\" )\n " ,
479+ " ) \\\n " ,
480+ " .count()\n " ,
481+ " \n " ,
482+ " keywords_exploded_stream.writeStream \\\n " ,
483+ " .format(\" delta\" ) \\\n " ,
484+ " .outputMode(\" complete\" ) \\\n " ,
485+ " .option(\" checkpointLocation\" , f\" {GOLD_PATH}/_checkpoints/keywords_windowed\" ) \\\n " ,
486+ " .start(f\" {GOLD_PATH}/keywords_real_time\" )"
487+ ]
449488 },
450489 {
451490 "cell_type" : " markdown" ,
452491 "metadata" : {},
453492 "source" : [
454- " ## 7 . Visualisation Pandas + Seaborn"
493+ " ## 8 . Visualisation Pandas + Seaborn"
455494 ]
456495 },
457496 {
520559 "cell_type" : " markdown" ,
521560 "metadata" : {},
522561 "source" : [
523- " ## 8 . Écriture Gold"
562+ " ## 9 . Écriture Gold"
524563 ]
525564 },
526565 {
592631 "source" : [
593632 " spark.stop()"
594633 ]
634+ },
635+ {
636+ "cell_type" : " code" ,
637+ "execution_count" : null ,
638+ "metadata" : {},
639+ "outputs" : [],
640+ "source" : []
595641 }
596642 ],
597643 "metadata" : {
601647 "name" : " python3"
602648 },
603649 "language_info" : {
650+ "codemirror_mode" : {
651+ "name" : " ipython" ,
652+ "version" : 3
653+ },
654+ "file_extension" : " .py" ,
655+ "mimetype" : " text/x-python" ,
604656 "name" : " python" ,
657+ "nbconvert_exporter" : " python" ,
658+ "pygments_lexer" : " ipython3" ,
605659 "version" : " 3.12.11"
606660 }
607661 },
608662 "nbformat" : 4 ,
609663 "nbformat_minor" : 4
610- }
664+ }
0 commit comments