準備工作
#Enter the Spark installation directory
cd /usr/local/spark
#Create a directory for the demo code
mkdir demo_code
cd demo_code
#Create a directory to hold the word-count demo files
mkdir wordcount
cd wordcount
#Create a text file containing some sentences (paste a passage of text), save, and quit.
#The name must be demo_word.txt — this exact path is loaded later by spark-shell and the program.
vim demo_word.txt
spark-shell運行詞頻統計
#Enter the Spark installation directory
cd /usr/local/spark
#Start the interactive Spark shell (Scala REPL with a preconfigured SparkContext `sc`)
./bin/spark-shell
#In Scala, load the text file as an RDD (file:// = local filesystem path)
val wordFile = sc.textFile("file:///usr/local/spark/demo_code/wordcount/demo_word.txt")
#Inspect the first line to confirm the file loaded correctly
wordFile.first()
#Word count: split each line on spaces, map each word to (word, 1), then sum the 1s per word
val wordCount = wordFile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b)
#Collect the (word, count) pairs back to the driver and display them
wordCount.collect()
如圖所示:
Scala獨立程序實現詞頻統計
#Change into the word-count project directory
cd /usr/local/spark/demo_code/wordcount/
#Create the standard sbt source layout in one step
mkdir -p src/main/scala
#Move into the newly created source directory
cd src/main/scala
#Create the Scala program file, write the code, save, and quit
vim demo_word.scala
程序代碼如下所示:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object WordCount {
  // Standalone word-count driver: reads a local text file, counts the
  // occurrences of each space-separated word, and prints every
  // (word, count) pair to stdout.
  def main(args: Array[String]) {
    // Path of the input text file (file:// = local filesystem)
    val sourcePath = "file:///usr/local/spark/demo_code/wordcount/demo_word.txt"
    // Run locally with two worker threads
    val sparkConf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
    val context = new SparkContext(sparkConf)
    val lines = context.textFile(sourcePath)
    // Tokenise on single spaces, pair each word with 1, then sum per word
    val counts = lines
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
    counts.foreach(println)
  }
}
sbt編譯打包
#Enter the word-count project directory
cd /usr/local/spark/demo_code/wordcount/
#Create the sbt build definition used to compile the program, save, and quit
vim demo_word.sbt
編輯內容如下:
// Project name; sbt lowercases it and replaces spaces with dashes,
// producing the jar base name "demoword-project".
name := "demoWord Project"
// Project version; appended to the jar name (…-1.0.jar).
version := "1.0"
// Must match the Scala binary version Spark 2.1.0 was built against (2.11.x).
scalaVersion := "2.11.8"
// %% appends the Scala binary version, resolving the artifact spark-core_2.11.
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0"
#Compile and package the program into a jar (run from the project root)
/usr/local/sbt/sbt package
#Submit the jar to Spark; --class names the main object to run.
#The jar path follows sbt's convention: <name>_<scalaBinaryVersion>-<version>.jar
/usr/local/spark/bin/spark-submit --class "WordCount" /usr/local/spark/demo_code/wordcount/target/scala-2.11/demoword-project_2.11-1.0.jar
Spark詞頻統計Demo就介紹到這裡。