Saturday, April 21, 2018

My first Spark program: word count using Scala

package com.sample.wc

import org.apache.spark.sql.SparkSession
import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.WildcardFileFilter
import java.io.File

/**
 * Simple Spark word-count job.
 *
 * Usage: WordCount <input-file> <output-dir>
 *   args(0) — path of the text file to read
 *   args(1) — output directory (deleted first if it already exists)
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    // Fail fast with a usage message instead of an
    // ArrayIndexOutOfBoundsException when paths are missing.
    if (args.length < 2) {
      System.err.println("Usage: WordCount <input-file> <output-dir>")
      sys.exit(1)
    }

    // Entry point for Spark 2.x+. NOTE(review): master("local") is
    // hard-coded, so the job always runs in-process on one thread;
    // remove it to let spark-submit choose the master.
    val spark = SparkSession.builder().master("local").appName("Word Count").getOrCreate()

    // Read the input as a Dataset[String] and drop down to the RDD API.
    val lines = spark.read.textFile(args(0)).rdd

    // Tokenise each line on single spaces. Filter out empty tokens:
    // empty lines and consecutive spaces would otherwise yield "" as a
    // counted "word".
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)

    // Classic word count: map each word to (word, 1), then sum per key.
    val counts = words.map(word => (word, 1)).reduceByKey(_ + _)

    // saveAsTextFile fails if the target exists, so remove any previous run's
    // output first. deleteDirectory is a no-op when the directory is absent.
    FileUtils.deleteDirectory(new File(args(1)))

    // Write the (word, count) pairs as text part files under args(1).
    counts.saveAsTextFile(args(1))

    // Release Spark resources.
    spark.stop()
  }
}
Command to execute the packaged jar (input file text.txt, output directory output): bin/spark-submit --class com.sample.wc.WordCount WordCounts.jar text.txt output