Skip to content
This repository was archived by the owner on Oct 8, 2020. It is now read-only.

Commit 6aca7a1

Browse files
Additional CLI options for how to save the result to disk.
1 parent 1bfd712 commit 6aca7a1

6 files changed

Lines changed: 65 additions & 24 deletions

File tree

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/RDFGraphMaterializer.scala

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,19 @@ object RDFGraphMaterializer {
2323
def main(args: Array[String]) {
2424
parser.parse(args, Config()) match {
2525
case Some(config) =>
26-
run(config.in, config.out, config.profile)
26+
run(config.in, config.out, config.profile, config.writeToSingleFile, config.sortedOutput)
2727
case None =>
2828
println(parser.usage)
2929
}
3030
}
3131

32-
def run(input: File, output: File, profile: ReasoningProfile): Unit = {
32+
def run(input: File, output: File, profile: ReasoningProfile, writeToSingleFile: Boolean, sortedOutput: Boolean): Unit = {
3333
val conf = new SparkConf()
3434
conf.registerKryoClasses(Array(classOf[RDFTriple]))
3535

3636
// the SPARK config
3737
val session = SparkSession.builder
38-
.appName("SPARK Reasoning")
38+
.appName(s"SPARK $profile Reasoning")
3939
.master("local[4]")
4040
.config("spark.eventLog.enabled", "true")
4141
.config("spark.hadoop.validateOutputSpecs", "false") //override output files
@@ -58,13 +58,18 @@ object RDFGraphMaterializer {
5858
print(inferredGraph.size())
5959

6060
// write triples to disk
61-
RDFGraphWriter.writeToFile(inferredGraph, output.getAbsolutePath)
61+
RDFGraphWriter.writeGraphToFile(inferredGraph, output.getAbsolutePath, writeToSingleFile, sortedOutput)
6262

6363
session.stop()
6464
}
6565

6666
// the config object
67-
case class Config(in: File = new File("."), out: File = new File("."), profile: ReasoningProfile = ReasoningProfile.RDFS)
67+
case class Config(
68+
in: File = new File("."),
69+
out: File = new File("."),
70+
profile: ReasoningProfile = ReasoningProfile.RDFS,
71+
writeToSingleFile: Boolean = false,
72+
sortedOutput: Boolean = false)
6873

6974
// read ReasoningProfile enum
7075
implicit val profilesRead: scopt.Read[ReasoningProfile.Value] =
@@ -82,6 +87,12 @@ object RDFGraphMaterializer {
8287
action((x, c) => c.copy(out = x)).
8388
text("the output directory")
8489

90+
opt[Unit]("single-file").optional().action( (_, c) =>
91+
c.copy(writeToSingleFile = true)).text("write the output to a single file in the output directory")
92+
93+
opt[Unit]("sorted").optional().action( (_, c) =>
94+
c.copy(sortedOutput = true)).text("sorted output of the triples per file")
95+
8596
opt[ReasoningProfile]('p', "profile").required().valueName("{rdfs | owl-horst | owl-el | owl-rl}").
8697
action((x, c) => c.copy(profile = x)).
8798
text("the reasoning profile")

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/RDFGraphWriter.scala

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,56 @@ object RDFGraphWriter {
2020

2121
private val logger = com.typesafe.scalalogging.Logger(LoggerFactory.getLogger(this.getClass.getName))
2222

23-
def writeToFile(graph: RDFGraph, path: String): Unit = {
23+
/**
24+
* Write the graph to disk in N-Triple format.
25+
*
26+
* @param graph the RDF graph
27+
* @param path the output directory
28+
* @param singleFile whether to put all data into a single file
29+
* @param sorted whether to sort the triples by subject, predicate, object
30+
*/
31+
def writeGraphToFile(graph: RDFGraph, path: String, singleFile: Boolean = false, sorted: Boolean = false): Unit = {
32+
writeTriplesToFile(graph.triples, path, singleFile, sorted)
33+
}
34+
35+
/**
36+
* Write the triples to disk in N-Triple format.
37+
*
38+
* @param triples the triples
39+
* @param path the output directory
40+
* @param singleFile whether to put all data into a single file
41+
* @param sorted whether to sort the triples by subject, predicate, object
42+
*/
43+
def writeTriplesToFile(triples: RDD[RDFTriple], path: String, singleFile: Boolean = false, sorted: Boolean = false): Unit = {
2444
logger.info("writing triples to disk...")
2545
val startTime = System.currentTimeMillis()
2646

2747
implicit val ordering = RDFTripleOrdering
2848

29-
graph.triples.map(t=>(t,t)).sortByKey().map(_._1)
30-
.map(t => "<" + t.subject + "> <" + t.predicate + "> <" + t.`object` + "> .") // to N-TRIPLES string
31-
.coalesce(1)
32-
.saveAsTextFile(path)
49+
// sort triples if enabled
50+
val tmp = if(sorted) {
51+
triples.map(t => (t,t)).sortByKey().map(_._1)
52+
} else {
53+
triples
54+
}
3355

34-
logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.")
35-
}
56+
// convert to N-Triple format
57+
var triplesNTFormat = tmp.map(t => "<" + t.subject + "> <" + t.predicate + "> <" + t.`object` + "> .")
3658

37-
def writeToFile(triples: RDD[RDFTriple], path: String): Unit = {
38-
writeToFile(RDFGraph(triples), path)
59+
// convert to single file, i.e. move all to one partition
60+
// (might be very expensive and contradicts the Big Data paradigm on Hadoop in general)
61+
if(singleFile) {
62+
triplesNTFormat = triplesNTFormat.coalesce(1, shuffle = true)
63+
}
64+
65+
// finally, write to disk
66+
triplesNTFormat.saveAsTextFile(path)
67+
68+
logger.info("finished writing triples to disk in " + (System.currentTimeMillis()-startTime) + "ms.")
3969
}
4070

41-
def writeToFile(dataFrame: DataFrame, path: String): Unit = {
42-
writeToFile(dataFrame.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))), path)
71+
def writeDataframeToFile(dataFrame: DataFrame, path: String, singleFile: Boolean = false, sorted: Boolean = false): Unit = {
72+
writeTriplesToFile(dataFrame.rdd.map(row => RDFTriple(row.getString(0), row.getString(1), row.getString(2))), path, singleFile, sorted)
4373
}
4474

4575
def convertToModel(graph: RDFGraph) : Model = {

sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/GenericVsNativeExperiments.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ object GenericVsNativeExperiments {
4848
val targetDir = args(1)
4949

5050
// write triples to disk
51-
RDFGraphWriter.writeToFile(infGraphNative.toDataFrame(), targetDir + "/native")
52-
RDFGraphWriter.writeToFile(infGraphGeneric.toDataFrame(), targetDir + "/generic")
51+
RDFGraphWriter.writeDataframeToFile(infGraphNative.toDataFrame(), targetDir + "/native")
52+
RDFGraphWriter.writeDataframeToFile(infGraphGeneric.toDataFrame(), targetDir + "/generic")
5353

5454
session.stop()
5555

sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/RDFGraphMaterializerTest.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ object RDFGraphMaterializerTest {
4646
val inferredGraph = reasoner.apply(graph)
4747

4848
// write triples to disk
49-
RDFGraphWriter.writeToFile(inferredGraph, args(0))
49+
RDFGraphWriter.writeGraphToFile(inferredGraph, args(0))
5050

5151
sc.stop()
5252

sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/SetOfRulesTest.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,13 @@ object SetOfRulesTest {
118118
def runNaive(graph: RDFGraphNative, rules: Seq[Rule]) = {
119119
val reasoner = new ForwardRuleReasonerNaive(sc, rules.toSet)
120120
val res = reasoner.apply(graph)
121-
RDFGraphWriter.writeToFile(res.toRDD(), "/tmp/spark-tests/naive")
121+
RDFGraphWriter.writeTriplesToFile(res.toRDD(), "/tmp/spark-tests/naive")
122122
}
123123

124124
def runNative(graph: RDFGraphNative, rules: Seq[Rule]) = {
125125
val reasoner = new ForwardRuleReasonerOptimizedNative(sparkSession, rules.toSet)
126126
val res = reasoner.apply(graph)
127-
RDFGraphWriter.writeToFile(res.toRDD(), "/tmp/spark-tests/optimized-native")
127+
RDFGraphWriter.writeTriplesToFile(res.toRDD(), "/tmp/spark-tests/optimized-native")
128128
}
129129

130130
def runSQL(graph: RDFGraphNative, rules: Seq[Rule]) = {
@@ -133,7 +133,7 @@ object SetOfRulesTest {
133133

134134
val reasoner = new ForwardRuleReasonerOptimizedSQL(sparkSession, rules.toSet)
135135
val res = reasoner.apply(graphDataframe)
136-
RDFGraphWriter.writeToFile(res.toDataFrame(), "/tmp/spark-tests/optimized-sql")
136+
RDFGraphWriter.writeDataframeToFile(res.toDataFrame(), "/tmp/spark-tests/optimized-sql")
137137
reasoner.showExecutionStats()
138138
}
139139
}

sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/TransitivityRuleTest.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ object TransitivityRuleTest {
8282

8383
val planExecutor1 = new PlanExecutorNative(sc)
8484
val res2 = planExecutor1.execute(plan, graph)
85-
RDFGraphWriter.writeToFile(res2.toRDD(), "/tmp/spark-tests/native")
85+
RDFGraphWriter.writeTriplesToFile(res2.toRDD(), "/tmp/spark-tests/native")
8686

8787

8888
// 3. the SQL based rule executor
@@ -94,7 +94,7 @@ object TransitivityRuleTest {
9494
val df = new RDFGraphDataFrame(graph.toDataFrame(sparkSession))
9595
val planExecutor2 = new PlanExecutorSQL(sparkSession)
9696
val res3 = planExecutor2.execute(plan, df)
97-
RDFGraphWriter.writeToFile(res3.toRDD(), "/tmp/spark-tests/sql")
97+
RDFGraphWriter.writeTriplesToFile(res3.toRDD(), "/tmp/spark-tests/sql")
9898

9999
sc.stop()
100100
}

0 commit comments

Comments
 (0)