Skip to content

Commit d0c88c3

Browse files
author
James Lee
committed
sort by key for word count problem
1 parent 39c7c6d commit d0c88c3

2 files changed

Lines changed: 58 additions & 0 deletions

File tree

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package com.sparkTutorial.pairRdd.sort.sortbykey;
2+
3+
4+
public class SortedWorldCountProblem {
5+
6+
/* TODO: Create a Spark program to read the an article from in/word_count.text, output the number of occurrence of each word in descending order.
7+
8+
Sample output:
9+
10+
apple : 200
11+
shoes : 193
12+
bag : 176
13+
...
14+
*/
15+
}
16+
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package com.sparkTutorial.pairRdd.sort.sortbykey;
2+
3+
4+
import org.apache.log4j.Level;
5+
import org.apache.log4j.Logger;
6+
import org.apache.spark.SparkConf;
7+
import org.apache.spark.api.java.JavaPairRDD;
8+
import org.apache.spark.api.java.JavaRDD;
9+
import org.apache.spark.api.java.JavaSparkContext;
10+
import org.apache.spark.api.java.function.Function2;
11+
import org.apache.spark.api.java.function.PairFunction;
12+
import scala.Tuple2;
13+
14+
import java.util.Arrays;
15+
16+
public class SortedWorldCountSolution {
17+
18+
public static void main(String[] args) throws Exception {
19+
20+
Logger.getLogger("org").setLevel(Level.ERROR);
21+
SparkConf conf = new SparkConf().setAppName("wordCounts").setMaster("local[3]");
22+
JavaSparkContext sc = new JavaSparkContext(conf);
23+
24+
JavaRDD<String> lines = sc.textFile("in/word_count.text");
25+
JavaRDD<String> wordRdd = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
26+
27+
JavaPairRDD<String, Integer> wordPairRdd = wordRdd.mapToPair((PairFunction<String, String, Integer>) word -> new Tuple2<>(word, 1));
28+
29+
JavaPairRDD<String, Integer> wordToCountPairs = wordPairRdd.reduceByKey((Function2<Integer, Integer, Integer>) (x, y) -> x + y);
30+
31+
JavaPairRDD<Integer, String> countToWordParis = wordToCountPairs.mapToPair((PairFunction<Tuple2<String, Integer>, Integer, String>) wordToCount -> new Tuple2<>(wordToCount._2(), wordToCount._1()));
32+
33+
JavaPairRDD<Integer, String> sortedCountToWordParis = countToWordParis.sortByKey(false);
34+
35+
JavaPairRDD<String, Integer> sortedWordToCountPairs = sortedCountToWordParis.mapToPair((PairFunction<Tuple2<Integer, String>, String, Integer>) countToWord -> new Tuple2<>(countToWord._2(), countToWord._1()));
36+
37+
for (Tuple2<String, Integer> wordToCount : sortedWordToCountPairs.collect()) {
38+
System.out.println(wordToCount._1() + " : " + wordToCount._2());
39+
40+
}
41+
}
42+
}

0 commit comments

Comments
 (0)