from pyspark import SparkContext
sc = SparkContext("local[*]", "wordcount")
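# "local[*]" runs Spark in local mode with one worker thread per CPU core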
sc.setLogLevel("ERROR")
lines = sc.textFile("C:/Users/pranavwagde/Downloads/DE/BigData/search_data.txt")
# Split each line on spaces, flattening the result into a single RDD of words
words = lines.flatMap(lambda x: x.split(" "))
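# For example, flatMap turns ["hello world", "big data"] into
# ["hello", "world", "big", "data"], whereas map would keep the nesting:
# [["hello", "world"], ["big", "data"]]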
# Map each word to a (word, 1) key-value pair so counts can be aggregated by key
wordCounts = words.map(lambda x: (x, 1))
# Sum the 1s per key to count the occurrences of each word
finalCount = wordCounts.reduceByKey(lambda x, y: x + y)
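# e.g. ("spark", 1) and ("spark", 1) are merged into ("spark", 2); reduceByKey
# combines within each partition first, so less data is shuffled than with groupByKey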
# Sort by the number of occurrences in descending order, so the most searched words come first
sortedOrder = finalCount.sortBy(lambda x: x[1], ascending=False)
# take() is an action: it triggers the job and returns the first 10 results to the driver
top10MostSearchedWords = sortedOrder.take(10)
# Print the list
for word in top10MostSearchedWords:
    print(word)
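As an aside, the full sort can be skipped when only the top N results are needed. A minimal sketch using the same finalCount RDD as above (takeOrdered returns the N smallest elements by key, so negating the count yields the N largest):

# Top 10 without sorting the whole RDD; each partition keeps only its best candidates
top10 = finalCount.takeOrdered(10, key=lambda x: -x[1])
for word in top10:
    print(word)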
Courtesy: Frank Kane Udemy Course