Skip to content

Commit 6d94ad6

Browse files
committed
fix: Update Garage credentials, improve code formatting in the notebook and fix streaming request (sentiment_windowed)
1 parent 9a940d1 commit 6d94ad6

1 file changed

Lines changed: 64 additions & 10 deletions

File tree

notebooks/03_silver_to_gold.ipynb

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
"\n",
2727
"# TODO: A remplacer par vos propres identifiants Garage\n",
2828
"GARAGE_ENDPOINT = \"http://garage:3900\"\n",
29-
"GARAGE_ACCESS_KEY = \"GK907b22f51dc0d0c5164474f2\"\n",
30-
"GARAGE_SECRET_KEY = \"6cf587853042d92d2cf6bb85b7c46a6a2400a47822e9baae32f9be0b7c5c9663\"\n",
29+
"GARAGE_ACCESS_KEY = \"GK2ae23cad2bbbf648143b1b8c\"\n",
30+
"GARAGE_SECRET_KEY = \"997e31832cbc9c78a2d919897f1cc9d63ad2c628464a7fba3a55f972c31790ee\"\n",
3131
"\n",
3232
"SILVER_PATH = \"s3a://silver/hackernews\"\n",
3333
"GOLD_PATH = \"s3a://gold/hackernews\"\n",
@@ -64,7 +64,21 @@
6464
"execution_count": null,
6565
"metadata": {},
6666
"outputs": [],
67-
"source": "import sparknlp\nfrom sparknlp.base import DocumentAssembler, Finisher\nfrom sparknlp.annotator import (\n Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n SentimentDLModel, NerDLModel, NerConverter,\n SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n)\nfrom pyspark.ml import Pipeline\nfrom pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\nfrom pyspark.sql.functions import sum as sum_\nfrom pyspark.sql.window import Window\n\nprint(f\"Spark NLP version: {sparknlp.version()}\")"
67+
"source": [
68+
"import sparknlp\n",
69+
"from sparknlp.base import DocumentAssembler, Finisher\n",
70+
"from sparknlp.annotator import (\n",
71+
" Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n",
72+
" SentimentDLModel, NerDLModel, NerConverter,\n",
73+
" SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n",
74+
")\n",
75+
"from pyspark.ml import Pipeline\n",
76+
"from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\n",
77+
"from pyspark.sql.functions import sum as sum_\n",
78+
"from pyspark.sql.window import Window\n",
79+
"\n",
80+
"print(f\"Spark NLP version: {sparknlp.version()}\")"
81+
]
6882
},
6983
{
7084
"cell_type": "markdown",
@@ -342,7 +356,7 @@
342356
"cell_type": "markdown",
343357
"metadata": {},
344358
"source": [
345-
"## 6b. Window Function - Classement des auteurs\n",
359+
"### 6b. Window Function - Classement des auteurs\n",
346360
"\n",
347361
"Utilisation de RANK() pour classer les auteurs par leur ratio de commentaires positifs."
348362
]
@@ -406,6 +420,8 @@
406420
"source": [
407421
"sentiment_windowed = comments_with_sentiment \\\n",
408422
" .withColumn(\"sentiment_result\", explode(col(\"sentiment.result\"))) \\\n",
423+
" .withColumn(\"timestamp\", col(\"timestamp\").cast(\"timestamp\")) \\\n",
424+
" .withWatermark(\"timestamp\", \"10 minutes\") \\\n",
409425
" .groupBy(\n",
410426
" window(\n",
411427
" col(\"timestamp\"),\n",
@@ -421,7 +437,7 @@
421437
"\n",
422438
"sentiment_windowed.writeStream \\\n",
423439
" .format(\"delta\") \\\n",
424-
" .outputMode(\"update\") \\\n",
440+
" .outputMode(\"append\") \\\n",
425441
" .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/sentiment_windowed\") \\\n",
426442
" .start(f\"{GOLD_PATH}/sentiment_real_time\")"
427443
]
@@ -430,7 +446,7 @@
430446
"cell_type": "markdown",
431447
"metadata": {},
432448
"source": [
433-
"## 7a. Sentiment en temps réel"
449+
"### 7a. Sentiment en temps réel"
434450
]
435451
},
436452
{
@@ -445,13 +461,36 @@
445461
"execution_count": null,
446462
"metadata": {},
447463
"outputs": [],
448-
"source": "comments_with_kw = keywords_model_fitted.transform(comments_stream)\nkeywords_exploded_stream = comments_with_kw \\\n .select(\n col(\"id\"),\n col(\"timestamp\"),\n explode(col(\"keywords\")).alias(\"keyword\")\n ) \\\n .groupBy(\n window(\n col(\"timestamp\"),\n \"10 minutes\",\n \"2 minutes\"\n ),\n col(\"keyword\")\n ) \\\n .count()\n\nkeywords_exploded_stream.writeStream \\\n .format(\"delta\") \\\n .outputMode(\"complete\") \\\n .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/keywords_windowed\") \\\n .start(f\"{GOLD_PATH}/keywords_real_time\")"
464+
"source": [
465+
"comments_with_kw = keywords_model_fitted.transform(comments_stream)\n",
466+
"keywords_exploded_stream = comments_with_kw \\\n",
467+
" .select(\n",
468+
" col(\"id\"),\n",
469+
" col(\"timestamp\"),\n",
470+
" explode(col(\"keywords\")).alias(\"keyword\")\n",
471+
" ) \\\n",
472+
" .groupBy(\n",
473+
" window(\n",
474+
" col(\"timestamp\"),\n",
475+
" \"10 minutes\",\n",
476+
" \"2 minutes\"\n",
477+
" ),\n",
478+
" col(\"keyword\")\n",
479+
" ) \\\n",
480+
" .count()\n",
481+
"\n",
482+
"keywords_exploded_stream.writeStream \\\n",
483+
" .format(\"delta\") \\\n",
484+
" .outputMode(\"complete\") \\\n",
485+
" .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/keywords_windowed\") \\\n",
486+
" .start(f\"{GOLD_PATH}/keywords_real_time\")"
487+
]
449488
},
450489
{
451490
"cell_type": "markdown",
452491
"metadata": {},
453492
"source": [
454-
"## 7. Visualisation Pandas + Seaborn"
493+
"## 8. Visualisation Pandas + Seaborn"
455494
]
456495
},
457496
{
@@ -520,7 +559,7 @@
520559
"cell_type": "markdown",
521560
"metadata": {},
522561
"source": [
523-
"## 8. Écriture Gold"
562+
"## 9. Écriture Gold"
524563
]
525564
},
526565
{
@@ -592,6 +631,13 @@
592631
"source": [
593632
"spark.stop()"
594633
]
634+
},
635+
{
636+
"cell_type": "code",
637+
"execution_count": null,
638+
"metadata": {},
639+
"outputs": [],
640+
"source": []
595641
}
596642
],
597643
"metadata": {
@@ -601,10 +647,18 @@
601647
"name": "python3"
602648
},
603649
"language_info": {
650+
"codemirror_mode": {
651+
"name": "ipython",
652+
"version": 3
653+
},
654+
"file_extension": ".py",
655+
"mimetype": "text/x-python",
604656
"name": "python",
657+
"nbconvert_exporter": "python",
658+
"pygments_lexer": "ipython3",
605659
"version": "3.12.11"
606660
}
607661
},
608662
"nbformat": 4,
609663
"nbformat_minor": 4
610-
}
664+
}

0 commit comments

Comments
 (0)