6464 "execution_count" : null ,
6565 "metadata" : {},
6666 "outputs" : [],
67- "source" : [
68- " import sparknlp\n " ,
69- " from sparknlp.base import DocumentAssembler, Finisher\n " ,
70- " from sparknlp.annotator import (\n " ,
71- " Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n " ,
72- " SentimentDLModel, NerDLModel, NerConverter,\n " ,
73- " SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n " ,
74- " )\n " ,
75- " from pyspark.ml import Pipeline\n " ,
76- " from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_\n " ,
77- " from pyspark.sql.functions import sum as sum_\n " ,
78- " from pyspark.sql.window import Window\n " ,
79- " \n " ,
80- " print(f\" Spark NLP version: {sparknlp.version()}\" )"
81- ]
67+ "source" : " import sparknlp\n from sparknlp.base import DocumentAssembler, Finisher\n from sparknlp.annotator import (\n Tokenizer, Normalizer, StopWordsCleaner, LemmatizerModel,\n SentimentDLModel, NerDLModel, NerConverter,\n SentenceDetector, WordEmbeddingsModel, UniversalSentenceEncoder\n )\n from pyspark.ml import Pipeline\n from pyspark.sql.functions import col, explode, explode_outer, length, desc, count, when, rank, round as round_, window\n from pyspark.sql.functions import sum as sum_\n from pyspark.sql.window import Window\n\n print(f\" Spark NLP version: {sparknlp.version()}\" )"
8268 },
8369 {
8470 "cell_type" : " markdown" ,
459445 "execution_count" : null ,
460446 "metadata" : {},
461447 "outputs" : [],
462- "source" : [
463- " comments_with_kw = keywords_model_fitted.transform(comments_stream)\n " ,
464- " keywords_exploded_stream = comments_with_kw \\\n " ,
465- " .select(\n " ,
466- " col(\" id\" ),\n " ,
467- " explode(col(\" keywords\" )).alias(\" keyword\" )\n " ,
468- " ) \\\n " ,
469- " .groupBy(\n " ,
470- " window(\n " ,
471- " col(\" timestamp\" ),\n " ,
472- " \" 10 minutes\" ,\n " ,
473- " \" 2 minutes\"\n " ,
474- " ),\n " ,
475- " col(\" keyword\" )\n " ,
476- " ) \\\n " ,
477- " .count() \\\n " ,
478- " .orderBy(desc(\" count\" ))\n " ,
479- " \n " ,
480- " keywords_exploded_stream.writeStream \\\n " ,
481- " .format(\" delta\" ) \\\n " ,
482- " .outputMode(\" complete\" ) \\\n " ,
483- " .option(\" checkpointLocation\" , f\" {GOLD_PATH}/_checkpoints/keywords_windowed\" ) \\\n " ,
484- " .start(f\" {GOLD_PATH}/keywords_real_time\" )"
485- ]
448+ "source" : " comments_with_kw = keywords_model_fitted.transform(comments_stream)\n keywords_exploded_stream = comments_with_kw \\\n .select(\n col(\" id\" ),\n col(\" timestamp\" ),\n explode(col(\" keywords\" )).alias(\" keyword\" )\n ) \\\n .groupBy(\n window(\n col(\" timestamp\" ),\n \" 10 minutes\" ,\n \" 2 minutes\"\n ),\n col(\" keyword\" )\n ) \\\n .count()\n\n keywords_exploded_stream.writeStream \\\n .format(\" delta\" ) \\\n .outputMode(\" complete\" ) \\\n .option(\" checkpointLocation\" , f\" {GOLD_PATH}/_checkpoints/keywords_windowed\" ) \\\n .start(f\" {GOLD_PATH}/keywords_real_time\" )"
486449 },
487450 {
488451 "cell_type" : " markdown" ,
644607 },
645608 "nbformat" : 4 ,
646609 "nbformat_minor" : 4
647- }
610+ }
0 commit comments