fix: update Garage credentials, reformat notebook cell sources, and fix the sentiment_windowed streaming query (add watermark, use append output mode)

Angry-Jay · Angry-Jay · commit 6d94ad679927 · 2026-01-23T20:51:20.000+01:00
diff --git a/notebooks/03_silver_to_gold.ipynb b/notebooks/03_silver_to_gold.ipynb
@@ -26,8 +26,8 @@
     "\n",
     "# TODO: A remplacer par vos propres identifiants Garage\n",
     "GARAGE_ENDPOINT = \"http://garage:3900\"\n",
-    "GARAGE_ACCESS_KEY = \"GK907b22f51dc0d0c5164474f2\"\n",
-    "GARAGE_SECRET_KEY = \"6cf587853042d92d2cf6bb85b7c46a6a2400a47822e9baae32f9be0b7c5c9663\"\n",
+    "GARAGE_ACCESS_KEY = \"GK2ae23cad2bbbf648143b1b8c\"\n",
+    "GARAGE_SECRET_KEY = \"997e31832cbc9c78a2d919897f1cc9d63ad2c628464a7fba3a55f972c31790ee\"\n",
     "\n",
     "SILVER_PATH = \"s3a://silver/hackernews\"\n",
     "GOLD_PATH = \"s3a://gold/hackernews\"\n",
@@ -64,7 +64,21 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "import sparknlp\nfrom sparknlp.base import DocumentAssembler, Finisher\nfrom sparknlp.annotator import (\n    Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n    SentimentDLModel, NerDLModel, NerConverter,\n    SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n)\nfrom pyspark.ml import Pipeline\nfrom pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\nfrom pyspark.sql.functions import sum as sum_\nfrom pyspark.sql.window import Window\n\nprint(f\"Spark NLP version: {sparknlp.version()}\")"
+   "source": [
+    "import sparknlp\n",
+    "from sparknlp.base import DocumentAssembler, Finisher\n",
+    "from sparknlp.annotator import (\n",
+    "    Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n",
+    "    SentimentDLModel, NerDLModel, NerConverter,\n",
+    "    SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n",
+    ")\n",
+    "from pyspark.ml import Pipeline\n",
+    "from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\n",
+    "from pyspark.sql.functions import sum as sum_\n",
+    "from pyspark.sql.window import Window\n",
+    "\n",
+    "print(f\"Spark NLP version: {sparknlp.version()}\")"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -342,7 +356,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 6b. Window Function - Classement des auteurs\n",
+    "### 6b. Window Function - Classement des auteurs\n",
     "\n",
     "Utilisation de RANK() pour classer les auteurs par leur ratio de commentaires positifs."
    ]
@@ -406,6 +420,8 @@
    "source": [
     "sentiment_windowed = comments_with_sentiment \\\n",
     "    .withColumn(\"sentiment_result\", explode(col(\"sentiment.result\"))) \\\n",
+    "    .withColumn(\"timestamp\", col(\"timestamp\").cast(\"timestamp\")) \\\n",
+    "    .withWatermark(\"timestamp\", \"10 minutes\") \\\n",
     "    .groupBy(\n",
     "        window(\n",
     "            col(\"timestamp\"),\n",
@@ -421,7 +437,7 @@
     "\n",
     "sentiment_windowed.writeStream \\\n",
     "    .format(\"delta\") \\\n",
-    "    .outputMode(\"update\") \\\n",
+    "    .outputMode(\"append\") \\\n",
     "    .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/sentiment_windowed\") \\\n",
     "    .start(f\"{GOLD_PATH}/sentiment_real_time\")"
    ]
@@ -430,7 +446,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 7a. Sentiment en temps réel"
+    "### 7a. Sentiment en temps réel"
    ]
   },
   {
@@ -445,13 +461,36 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "comments_with_kw = keywords_model_fitted.transform(comments_stream)\nkeywords_exploded_stream = comments_with_kw \\\n    .select(\n        col(\"id\"),\n        col(\"timestamp\"),\n        explode(col(\"keywords\")).alias(\"keyword\")\n    ) \\\n    .groupBy(\n        window(\n            col(\"timestamp\"),\n            \"10 minutes\",\n            \"2 minutes\"\n        ),\n        col(\"keyword\")\n    ) \\\n    .count()\n\nkeywords_exploded_stream.writeStream \\\n    .format(\"delta\") \\\n    .outputMode(\"complete\") \\\n    .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/keywords_windowed\") \\\n    .start(f\"{GOLD_PATH}/keywords_real_time\")"
+   "source": [
+    "comments_with_kw = keywords_model_fitted.transform(comments_stream)\n",
+    "keywords_exploded_stream = comments_with_kw \\\n",
+    "    .select(\n",
+    "        col(\"id\"),\n",
+    "        col(\"timestamp\"),\n",
+    "        explode(col(\"keywords\")).alias(\"keyword\")\n",
+    "    ) \\\n",
+    "    .groupBy(\n",
+    "        window(\n",
+    "            col(\"timestamp\"),\n",
+    "            \"10 minutes\",\n",
+    "            \"2 minutes\"\n",
+    "        ),\n",
+    "        col(\"keyword\")\n",
+    "    ) \\\n",
+    "    .count()\n",
+    "\n",
+    "keywords_exploded_stream.writeStream \\\n",
+    "    .format(\"delta\") \\\n",
+    "    .outputMode(\"complete\") \\\n",
+    "    .option(\"checkpointLocation\", f\"{GOLD_PATH}/_checkpoints/keywords_windowed\") \\\n",
+    "    .start(f\"{GOLD_PATH}/keywords_real_time\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 7. Visualisation Pandas + Seaborn"
+    "## 8. Visualisation Pandas + Seaborn"
    ]
   },
   {
@@ -520,7 +559,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 8. Écriture Gold"
+    "## 9. Écriture Gold"
    ]
   },
   {
@@ -592,6 +631,13 @@
    "source": [
     "spark.stop()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -601,10 +647,18 @@
    "name": "python3"
   },
   "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
    "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
    "version": "3.12.11"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}