33 {
44 "cell_type" : " markdown" ,
55 "metadata" : {},
6- "source" : " # Bronze → Silver Layer"
6+ "source" : [
7+ " # Bronze → Silver Layer"
8+ ]
79 },
810 {
911 "cell_type" : " markdown" ,
1012 "metadata" : {},
11- "source" : " ## 1. Configuration Spark"
13+ "source" : [
14+ " ## 1. Configuration Spark"
15+ ]
1216 },
1317 {
1418 "cell_type" : " code" ,
1519 "execution_count" : null ,
1620 "metadata" : {},
1721 "outputs" : [],
18- "source": "from pyspark.sql import SparkSession\n\nGARAGE_ENDPOINT = \"http://garage:3900\"\nGARAGE_ACCESS_KEY = \"GKa25124b4fd82613c063217f3\"\nGARAGE_SECRET_KEY = \"008126399688f9b1efc3a3093079b066e4c6471fa256b52788da0c927194147e\"\n\nBRONZE_PATH = \"s3a://bronze/hackernews\"\nSILVER_PATH = \"s3a://silver/hackernews\"\n\nspark = SparkSession.builder \\\n .appName(\"BronzeToSilver\") \\\n .master(\"spark://spark:7077\") \\\n .config(\"spark.jars.packages\", \n \"org.apache.hadoop:hadoop-aws:3.3.4,\"\n \"com.amazonaws:aws-java-sdk-bundle:1.12.262,\"\n \"io.delta:delta-spark_2.12:3.3.0\") \\\n .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n .config(\"spark.hadoop.fs.s3a.multiobjectdelete.enable\", \"false\") \\\n .config(\"spark.sql.shuffle.partitions\", \"10\") \\\n .getOrCreate()\n\nhadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()\nhadoop_conf.set(\"fs.s3a.endpoint\", GARAGE_ENDPOINT)\nhadoop_conf.set(\"fs.s3a.access.key\", GARAGE_ACCESS_KEY)\nhadoop_conf.set(\"fs.s3a.secret.key\", GARAGE_SECRET_KEY)\nhadoop_conf.set(\"fs.s3a.endpoint.region\", \"garage\")\nhadoop_conf.set(\"fs.s3a.path.style.access\", \"true\")\nhadoop_conf.set(\"fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\nhadoop_conf.set(\"fs.s3a.connection.ssl.enabled\", \"false\")"
22+ "source" : [
23+ " from pyspark.sql import SparkSession\n " ,
24+ " \n " ,
25+ " # TODO : À remplacer par vos propres identifiants Garage (ne jamais committer de vrais secrets ; les charger depuis des variables d'environnement)\n " ,
26+ " GARAGE_ENDPOINT = \" http://garage:3900\"\n " ,
27+ " GARAGE_ACCESS_KEY = \" GK907b22f51dc0d0c5164474f2\"\n " ,
28+ " GARAGE_SECRET_KEY = \" 6cf587853042d92d2cf6bb85b7c46a6a2400a47822e9baae32f9be0b7c5c9663\"\n " ,
29+ " \n " ,
30+ " BRONZE_PATH = \" s3a://bronze/hackernews\"\n " ,
31+ " SILVER_PATH = \" s3a://silver/hackernews\"\n " ,
32+ " \n " ,
33+ " spark = SparkSession.builder \\\n " ,
34+ " .appName(\" BronzeToSilver\" ) \\\n " ,
35+ " .master(\" spark://spark:7077\" ) \\\n " ,
36+ " .config(\" spark.jars.packages\" , \n " ,
37+ " \" org.apache.hadoop:hadoop-aws:3.3.4,\"\n " ,
38+ " \" com.amazonaws:aws-java-sdk-bundle:1.12.262,\"\n " ,
39+ " \" io.delta:delta-spark_2.12:3.3.0\" ) \\\n " ,
40+ " .config(\" spark.sql.extensions\" , \" io.delta.sql.DeltaSparkSessionExtension\" ) \\\n " ,
41+ " .config(\" spark.sql.catalog.spark_catalog\" , \" org.apache.spark.sql.delta.catalog.DeltaCatalog\" ) \\\n " ,
42+ " .config(\" spark.hadoop.fs.s3a.multiobjectdelete.enable\" , \" false\" ) \\\n " ,
43+ " .config(\" spark.sql.shuffle.partitions\" , \" 10\" ) \\\n " ,
44+ " .getOrCreate()\n " ,
45+ " \n " ,
46+ " hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()\n " ,
47+ " hadoop_conf.set(\" fs.s3a.endpoint\" , GARAGE_ENDPOINT)\n " ,
48+ " hadoop_conf.set(\" fs.s3a.access.key\" , GARAGE_ACCESS_KEY)\n " ,
49+ " hadoop_conf.set(\" fs.s3a.secret.key\" , GARAGE_SECRET_KEY)\n " ,
50+ " hadoop_conf.set(\" fs.s3a.endpoint.region\" , \" garage\" )\n " ,
51+ " hadoop_conf.set(\" fs.s3a.path.style.access\" , \" true\" )\n " ,
52+ " hadoop_conf.set(\" fs.s3a.impl\" , \" org.apache.hadoop.fs.s3a.S3AFileSystem\" )\n " ,
53+ " hadoop_conf.set(\" fs.s3a.connection.ssl.enabled\" , \" false\" )"
54+ ]
1955 },
2056 {
2157 "cell_type" : " markdown" ,
2258 "metadata" : {},
23- "source" : " ## 2. Création bucket Silver"
59+ "source" : [
60+ " ## 2. Création bucket Silver"
61+ ]
2462 },
2563 {
2664 "cell_type" : " code" ,
2765 "execution_count" : null ,
2866 "metadata" : {},
2967 "outputs" : [],
30- "source" : " # Bucket \" silver\" à créer manuellement via Garage CLI/WebUI si nécessaire"
68+ "source" : [
69+ " # Bucket \" silver\" à créer manuellement via Garage CLI/WebUI si nécessaire"
70+ ]
3171 },
3272 {
3373 "cell_type" : " markdown" ,
3474 "metadata" : {},
35- "source" : " ## 3. Lecture Bronze"
36- },
37- {
38- "cell_type" : " code" ,
39- "execution_count" : null ,
40- "metadata" : {},
41- "outputs" : [],
42- "source" : " stories_bronze = spark.read.format(\" delta\" ).load(f\" {BRONZE_PATH}/stories\" )\n comments_bronze = spark.read.format(\" delta\" ).load(f\" {BRONZE_PATH}/comments\" )\n\n print(f\" Stories: {stories_bronze.count()}, Comments: {comments_bronze.count()}\" )"
75+ "source" : [
76+ " ## 3. Lecture Bronze"
77+ ]
4378 },
4479 {
4580 "cell_type" : " code" ,
4681 "execution_count" : null ,
4782 "metadata" : {},
4883 "outputs" : [],
49- "source" : " stories_bronze.printSchema() "
50- } ,
51- {
52- "cell_type" : " markdown " ,
53- "metadata" : {},
54- "source" : " ## 4. Fonctions de nettoyage "
84+ "source" : [
85+ " stories_bronze = spark.read.format( \" delta \" ).load(f \" {BRONZE_PATH}/stories \" ) \n " ,
86+ " comments_bronze = spark.read.format( \" delta \" ).load(f \" {BRONZE_PATH}/comments \" ) \n " ,
87+ " \n " ,
88+ " print(f \" Stories : {stories_bronze.count()}, Comments: {comments_bronze.count()} \" ) "
89+ ]
5590 },
5691 {
5792 "cell_type" : " code" ,
5893 "execution_count" : null ,
5994 "metadata" : {},
6095 "outputs" : [],
61- "source" : " from pyspark.sql.functions import col, when, regexp_replace, regexp_extract, length, trim, coalesce, lit\n\n def clean_html(column):\n c = col(column)\n c = regexp_replace(c, r\" <[^>]+>\" , \" \" )\n c = regexp_replace(c, r\"\\ s+\" , \" \" )\n\n html_entities = {\n r\" &#x27;\" : \" '\" ,\n r\" &#x2F;\" : \" /\" ,\n r\" &quot;\" : '\" ',\n r\" &amp;\" : \" &\" ,\n r\" &lt;\" : \" <\" ,\n r\" &gt;\" : \" >\"\n }\n for k, v in html_entities.items():\n c = regexp_replace(c, k, v)\n\n return when(col(column).isNull(), lit(\"\" )).otherwise(trim(c))\n\n def extract_domain(column):\n return regexp_extract(col(column), r\" https?://(?:www\\ .)?([^/]+)\" , 1)"
96+ "source" : [
97+ " stories_bronze.printSchema()"
98+ ]
6299 },
63100 {
64101 "cell_type" : " markdown" ,
65102 "metadata" : {},
66- "source" : " ## 5. Nettoyage Stories"
103+ "source" : [
104+ " ## 4. Fonctions de nettoyage"
105+ ]
67106 },
68107 {
69108 "cell_type" : " code" ,
70109 "execution_count" : null ,
71110 "metadata" : {},
72111 "outputs" : [],
73- "source" : " stories_silver = stories_bronze \\\n .filter(col(\" id\" ).isNotNull()) \\\n .dropDuplicates([\" id\" ]) \\\n .withColumn(\" text_clean\" , clean_html(\" text\" )) \\\n .withColumn(\" domain\" , extract_domain(\" url\" )) \\\n .select(\" id\" , \" by\" , \" title\" , \" url\" , \" domain\" , \" score\" , \" descendants\" , \n \" text_clean\" , \" timestamp\" , \" _ingested_at\" )\n\n stories_silver.show(3, truncate=40)"
112+ "source" : [
113+ " from pyspark.sql.functions import col, when, regexp_replace, regexp_extract, length, trim, coalesce, lit\n " ,
114+ " \n " ,
115+ " def clean_html(column):\n " ,
116+ " c = col(column)\n " ,
117+ " c = regexp_replace(c, r\" <[^>]+>\" , \" \" )\n " ,
118+ " c = regexp_replace(c, r\"\\ s+\" , \" \" )\n " ,
119+ " \n " ,
120+ " html_entities = {\n " ,
121+ " r\" &#x27;\" : \" '\" ,\n " ,
122+ " r\" &#x2F;\" : \" /\" ,\n " ,
123+ " r\" &quot;\" : '\" ',\n " ,
124+ " r\" &amp;\" : \" &\" ,\n " ,
125+ " r\" &lt;\" : \" <\" ,\n " ,
126+ " r\" &gt;\" : \" >\"\n " ,
127+ " }\n " ,
128+ " for k, v in html_entities.items():\n " ,
129+ " c = regexp_replace(c, k, v)\n " ,
130+ " \n " ,
131+ " return when(col(column).isNull(), lit(\"\" )).otherwise(trim(c))\n " ,
132+ " \n " ,
133+ " def extract_domain(column):\n " ,
134+ " return regexp_extract(col(column), r\" https?://(?:www\\ .)?([^/]+)\" , 1)"
135+ ]
74136 },
75137 {
76138 "cell_type" : " markdown" ,
77139 "metadata" : {},
78- "source" : " ## 6. Nettoyage Comments"
140+ "source" : [
141+ " ## 5. Nettoyage Stories"
142+ ]
79143 },
80144 {
81145 "cell_type" : " code" ,
82146 "execution_count" : null ,
83147 "metadata" : {},
84148 "outputs" : [],
85- "source" : " comments_silver = comments_bronze \\\n .filter(col(\" id\" ).isNotNull()) \\\n .filter(coalesce(col(\" deleted\" ), lit(False)) == False) \\\n .filter(coalesce(col(\" dead\" ), lit(False)) == False) \\\n .dropDuplicates([\" id\" ]) \\\n .withColumn(\" text_clean\" , clean_html(\" text\" )) \\\n .filter(length(col(\" text_clean\" )) > 0) \\\n .select(\" id\" , \" by\" , \" parent\" , \" text_clean\" , \" timestamp\" , \" _ingested_at\" )\n\n comments_silver.show(3, truncate=40)"
149+ "source" : [
150+ " stories_silver = stories_bronze \\\n " ,
151+ " .filter(col(\" id\" ).isNotNull()) \\\n " ,
152+ " .dropDuplicates([\" id\" ]) \\\n " ,
153+ " .withColumn(\" text_clean\" , clean_html(\" text\" )) \\\n " ,
154+ " .withColumn(\" domain\" , extract_domain(\" url\" )) \\\n " ,
155+ " .select(\" id\" , \" by\" , \" title\" , \" url\" , \" domain\" , \" score\" , \" descendants\" , \n " ,
156+ " \" text_clean\" , \" timestamp\" , \" _ingested_at\" )\n " ,
157+ " \n " ,
158+ " stories_silver.show(3, truncate=40)"
159+ ]
86160 },
87161 {
88162 "cell_type" : " markdown" ,
89163 "metadata" : {},
90- "source" : " ## 7. Jointure Comments + Stories"
164+ "source" : [
165+ " ## 6. Nettoyage Comments"
166+ ]
91167 },
92168 {
93169 "cell_type" : " code" ,
94170 "execution_count" : null ,
95171 "metadata" : {},
96172 "outputs" : [],
97- "source" : " stories_for_join = stories_silver.select(\n col(\" id\" ).alias(\" story_id\" ),\n col(\" title\" ).alias(\" story_title\" ),\n col(\" score\" ).alias(\" story_score\" ),\n col(\" domain\" ).alias(\" story_domain\" )\n )\n\n comments_enriched = comments_silver.join(\n stories_for_join,\n comments_silver[\" parent\" ] == stories_for_join[\" story_id\" ],\n \" left\"\n )\n\n comments_enriched.show(3, truncate=30)"
173+ "source" : [
174+ " comments_silver = comments_bronze \\\n " ,
175+ " .filter(col(\" id\" ).isNotNull()) \\\n " ,
176+ " .filter(coalesce(col(\" deleted\" ), lit(False)) == False) \\\n " ,
177+ " .filter(coalesce(col(\" dead\" ), lit(False)) == False) \\\n " ,
178+ " .dropDuplicates([\" id\" ]) \\\n " ,
179+ " .withColumn(\" text_clean\" , clean_html(\" text\" )) \\\n " ,
180+ " .filter(length(col(\" text_clean\" )) > 0) \\\n " ,
181+ " .select(\" id\" , \" by\" , \" parent\" , \" text_clean\" , \" timestamp\" , \" _ingested_at\" )\n " ,
182+ " \n " ,
183+ " comments_silver.show(3, truncate=40)"
184+ ]
98185 },
99186 {
100187 "cell_type" : " markdown" ,
101188 "metadata" : {},
102- "source" : " ## 8. Écriture Silver"
103- },
104- {
105- "cell_type" : " code" ,
106- "execution_count" : null ,
107- "metadata" : {},
108- "outputs" : [],
109- "source" : " stories_silver.write.format(\" delta\" ).mode(\" overwrite\" ).save(f\" {SILVER_PATH}/stories\" )"
189+ "source" : [
190+ " ## 7. Écriture Silver"
191+ ]
110192 },
111193 {
112194 "cell_type" : " code" ,
113195 "execution_count" : null ,
114196 "metadata" : {},
115197 "outputs" : [],
116- "source" : " comments_silver.write.format(\" delta\" ).mode(\" overwrite\" ).save(f\" {SILVER_PATH}/comments\" )"
198+ "source" : [
199+ " stories_silver.write.format(\" delta\" ).mode(\" overwrite\" ).save(f\" {SILVER_PATH}/stories\" )"
200+ ]
117201 },
118202 {
119203 "cell_type" : " code" ,
120204 "execution_count" : null ,
121205 "metadata" : {},
122206 "outputs" : [],
123- "source" : " comments_enriched.write.format(\" delta\" ).mode(\" overwrite\" ).save(f\" {SILVER_PATH}/comments_enriched\" )"
207+ "source" : [
208+ " comments_silver.write.format(\" delta\" ).mode(\" overwrite\" ).save(f\" {SILVER_PATH}/comments\" )"
209+ ]
124210 },
125211 {
126212 "cell_type" : " markdown" ,
127213 "metadata" : {},
128- "source" : " ## 9. Vérification"
214+ "source" : [
215+ " ## 8. Vérification"
216+ ]
129217 },
130218 {
131219 "cell_type" : " code" ,
132220 "execution_count" : null ,
133221 "metadata" : {},
134222 "outputs" : [],
135- "source" : " spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/stories\" ).show(3, truncate=30)\n spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/comments\" ).show(3, truncate=30)\n spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/comments_enriched\" ).show(3, truncate=30)"
223+ "source" : [
224+ " spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/stories\" ).show(3, truncate=30)\n " ,
225+ " spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/comments\" ).show(3, truncate=30)"
226+ ]
136227 },
137228 {
138229 "cell_type" : " code" ,
139230 "execution_count" : null ,
140231 "metadata" : {},
141232 "outputs" : [],
142- "source" : " spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/stories\" ) \\\n .filter(col(\" domain\" ) != \"\" ) \\\n .groupBy(\" domain\" ).count() \\\n .orderBy(col(\" count\" ).desc()) \\\n .show(5)"
233+ "source" : [
234+ " spark.read.format(\" delta\" ).load(f\" {SILVER_PATH}/stories\" ) \\\n " ,
235+ " .filter(col(\" domain\" ) != \"\" ) \\\n " ,
236+ " .groupBy(\" domain\" ).count() \\\n " ,
237+ " .orderBy(col(\" count\" ).desc()) \\\n " ,
238+ " .show(5)"
239+ ]
143240 },
144241 {
145242 "cell_type" : " code" ,
158255 "name" : " python3"
159256 },
160257 "language_info" : {
258+ "codemirror_mode" : {
259+ "name" : " ipython" ,
260+ "version" : 3
261+ },
262+ "file_extension" : " .py" ,
263+ "mimetype" : " text/x-python" ,
161264 "name" : " python" ,
162- "version" : " 3.11.0"
265+ "nbconvert_exporter" : " python" ,
266+ "pygments_lexer" : " ipython3" ,
267+ "version" : " 3.12.11"
163268 }
164269 },
165270 "nbformat" : 4 ,
166271 "nbformat_minor" : 4
167- }
272+ }
0 commit comments