from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# A SparkSession must exist before any DataFrame is built (the app name is arbitrary).
spark = SparkSession.builder.appName("stream-static-wordcount").getOrCreate()
 
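# Batch side: word counts computed once from a static text file.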
StaticLines = spark.read.format("text").load("file:///home/ak/datasets/sample.txt")
StaticWords = StaticLines.select(explode(split(StaticLines.value, " ")).alias("word"))
StaticDF = StaticWords.groupBy("word").count().withColumnRenamed("count", "static_count")
 
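# Streaming side: lines arriving on a local socket; feed it with `nc -lk 9999`.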
StreamLines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
StreamWords = StreamLines.select(explode(split(StreamLines.value, " ")).alias("word"))
StreamWordCounts = StreamWords.groupBy("word").count().withColumnRenamed("count", "stream_count")
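# Stream-static join with the static frame on the left and the stream on the
# right; the right outer join keeps every word seen on the stream, leaving
# static_count null for words that never appear in the file.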
StreamDF = StaticDF.join(StreamWordCounts, "word", "right")
 
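# Complete output mode re-emits the whole running result table on every trigger.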
query = (StreamDF.writeStream
         .outputMode("complete")
         .format("console")
         .option("numRows", 40)
         .option("truncate", "false")
         .start())
query.awaitTermination()  # keep the driver alive until the stream is stopped