Sunday, July 30, 2023

WordCount program to show the top 10 most-searched words in PySpark

from pyspark import SparkContext

# Word-count job: read a text file of search terms, count how often each
# word occurs, and print the 10 most frequent (word, count) pairs.

# Run Spark locally using all available cores; "wordcount" is the app name.
sc = SparkContext("local[*]", "wordcount")
# Only emit log messages at ERROR level so the results are not buried in logs.
sc.setLogLevel("ERROR")

# Named `lines` rather than `input` so the built-in input() is not shadowed.
lines = sc.textFile("C:/Users/pranavwagde/Downloads/DE/BigData/search_data.txt")

# Split every line into words and flatten the result to one word per element.
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so repeated/leading/trailing spaces are not counted
# as a bogus "" word (which split(" ") would produce).
words = lines.flatMap(lambda line: line.split())

# Convert each word to (word, 1) so occurrences can be summed per key.
wordCounts = words.map(lambda word: (word, 1))

# Sum the 1s for each distinct word -> (word, total_occurrences).
finalCount = wordCounts.reduceByKey(lambda left, right: left + right)

# Sort by the number of occurrences, highest first.
sortedOrder = finalCount.sortBy(lambda pair: pair[1], ascending=False)

# Take the top 10 searched words; take() returns a plain Python list
# of (word, count) tuples on the driver.
top10MostSearchedWords = sortedOrder.take(10)

# Print one (word, count) tuple per line.
for word in top10MostSearchedWords:
    print(word)



Courtesy: Frank Kane Udemy Course

No comments:

Post a Comment