 package net.sansa_stack.inference.spark.data
 
-import java.io.File
 import java.net.URI
 
-import org.apache.jena.rdf.model.Resource
+import scala.language.implicitConversions
+
 import org.apache.spark.SparkContext
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{Dataset, SparkSession}
 import org.slf4j.LoggerFactory
 
 import net.sansa_stack.inference.data.RDFTriple
 import net.sansa_stack.inference.utils.NTriplesStringToRDFTriple
 
 /**
-  * Load an RDF graph from disk.
+  * Provides methods to load an RDF graph from disk.
   *
   * @author Lorenz Buehmann
   *
@@ -21,65 +21,66 @@ object RDFGraphLoader {
 
   private val logger = com.typesafe.scalalogging.Logger(LoggerFactory.getLogger(this.getClass.getName))
 
-  implicit def filesConverter(files: Seq[File]): String = files.map(p => p.getAbsolutePath).mkString(",")
+  private implicit def pathURIsConverter(uris: Seq[URI]): String = uris.map(p => p.toString).mkString(",")
 
   /**
     * Load an RDF graph from a file or directory. The path can also contain multiple paths
     * and even wildcards, e.g.
-    * "/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"
+    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
     *
     * @param path the absolute path of the file
     * @param session the Spark session
     * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
     * @return an RDF graph
     */
-  def loadFromFile(path: String, session: SparkSession, minPartitions: Int = 2): RDFGraph = {
+  def loadFromDisk(session: SparkSession, path: String, minPartitions: Int = 2): RDFGraph = {
     logger.info("loading triples from disk...")
     val startTime = System.currentTimeMillis()
 
     val triples = session.sparkContext
       .textFile(path, minPartitions) // read the text file
       .map(new NTriplesStringToRDFTriple()) // convert to triple object
+      // .repartition(minPartitions)
 
 //    logger.info("finished loading " + triples.count() + " triples in " + (System.currentTimeMillis()-startTime) + "ms.")
     new RDFGraph(triples)
   }
 
   /**
-    * Load an RDF graph from multiple files.
+    * Load an RDF graph from multiple files or directories.
     *
-    * @param paths the files
+    * @param paths the files or directories
     * @param session the Spark session
     * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
     * @return an RDF graph
     */
-  def loadFromDisk(paths: Seq[URI], session: SparkSession, minPartitions: Int = 2): RDFGraph = {
-    logger.info("loading triples from disk...")
-    val startTime = System.currentTimeMillis()
-    println("Input Paths: " + paths.mkString(","))
-    val pathsConcat = paths.mkString(",") // make concatenated string of paths
-
-    val triples = session.sparkContext
-      .textFile(pathsConcat, minPartitions) // read the text files
-      .map(new NTriplesStringToRDFTriple()) // convert to triple object
-      // .repartition(minPartitions)
+  def loadFromDisk(session: SparkSession, paths: Seq[URI], minPartitions: Int): RDFGraph = {
+    loadFromDisk(session, paths.mkString(","), minPartitions)
+  }
 
-//    logger.info("finished loading " + triples.count() + " triples in " +
-//      (System.currentTimeMillis()-startTime) + "ms.")
-    new RDFGraph(triples)
+  /**
+    * Load an RDF graph from a single file or directory.
+    *
+    * @param path the path to the file or directory
+    * @param session the Spark session
+    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
+    * @return an RDF graph
+    */
+  def loadFromDisk(session: SparkSession, path: URI, minPartitions: Int): RDFGraph = {
+    loadFromDisk(session, Seq(path), minPartitions)
   }
 
   /**
-    * Load an RDF graph from a file or directory. The path can also contain multiple paths
-    * and even wildcards, e.g.
+    * Load an RDF graph from a file or directory with a Spark RDD as the underlying data structure.
+    * The path can also contain multiple paths and even wildcards, e.g.
     * "/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"
     *
-    * @param path the files
+    * @param path the path to the file(s) or directory
     * @param session the Spark session
     * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
     * @return an RDF graph
     */
-  def loadGraphFromFile(path: String, session: SparkSession, minPartitions: Int = 2): RDFGraphNative = {
+  def loadFromDiskAsRDD(session: SparkSession, path: String, minPartitions: Int): RDFGraphNative = {
     logger.info("loading triples from disk...")
     val startTime = System.currentTimeMillis()
 
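All loader entry points now take the SparkSession as the first parameter and share the loadFromDisk name; the URI-based overloads simply delegate to the String variant. A minimal usage sketch of the refactored API (the app name, master, and input paths are made up for illustration):

import java.net.URI

import org.apache.spark.sql.SparkSession

import net.sansa_stack.inference.spark.data.RDFGraphLoader

object LoaderExample {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("RDFGraphLoader example") // hypothetical app name
      .master("local[4]")                // assumes a local test run
      .getOrCreate()

    // a single path string; comma-separated entries and wildcards are allowed
    val g1 = RDFGraphLoader.loadFromDisk(session, "/my/dir1,/my/paths/part-00[0-5]*", 4)

    // the Seq[URI] overload delegates to the String variant via mkString(",")
    val g2 = RDFGraphLoader.loadFromDisk(session, Seq(URI.create("/my/dir1"), URI.create("/my/dir2")), 4)

    session.stop()
  }
}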
@@ -92,9 +93,26 @@ println("Input Paths: " + paths.mkString(","))
     new RDFGraphNative(triples)
   }
 
-  case class RDFTriple2(s: String, p: String, o: String)
+  private case class RDFTriple2(s: String, p: String, o: String) extends Product3[String, String, String] {
+    override def _1: String = s
+    override def _2: String = p
+    override def _3: String = o
+
+    def subject: String = s
 
-  def loadGraphFromDiskAsDataset(implicit session: SparkSession, paths: scala.Seq[File]): RDFGraphDataset = {
+    override def toString: String = s + " " + p + " " + o
+  }
+
+  /**
+    * Load an RDF graph from multiple files or directories with a Spark Dataset as the underlying data structure.
+    * The path can also contain multiple paths and even wildcards, e.g.
+    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
+    *
+    * @param path the absolute path of the file or directory
+    * @param session the Spark session
+    * @return an RDF graph based on a [[Dataset]]
+    */
+  def loadFromDiskAsDataset(session: SparkSession, path: String): RDFGraphDataset = {
     logger.info("loading triples from disk...")
     val startTime = System.currentTimeMillis()
 
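RDFTriple2 mixes in Product3 so that Spark can treat it as a three-field product and derive a column-aware encoder for it, rather than falling back to an opaque kryo-serialized blob. A small sketch of that mechanism, using a hypothetical stand-in case class instead of the private RDFTriple2:

import org.apache.spark.sql.SparkSession

// stand-in for RDFTriple2; any case class is a Product and gets a derived encoder
case class Triple(s: String, p: String, o: String)

object EncoderSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local[2]").getOrCreate()
    import session.implicits._ // brings the product encoder into scope

    val ds = session.createDataset(Seq(Triple("<a>", "<knows>", "<b>")))
    ds.select("s", "o").show() // product fields are addressable as columns
    session.stop()
  }
}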
@@ -104,25 +122,43 @@ println("Input Paths: " + paths.mkString(","))
         Array(splitted(0), splitted(1), splitted(2))
       })
 
-    implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[RDFTriple]
+    // val rdfTripleEncoder = org.apache.spark.sql.Encoders.kryo[RDFTriple]
     val spark = session.sqlContext
     import spark.implicits._
 
+
+
     val triples = session.read
-      .textFile(paths) // read the text file
-      .map(s => {
-        val tokens = s.split(" ")
-        RDFTriple(tokens(0), tokens(1), tokens(2))
-      })
-    session.read
-      .textFile(paths) // read the text file
-      .map(s => {
-        val tokens = s.split(" ")
-        RDFTriple(tokens(0), tokens(1), tokens(2))
-      }).toDF().show(10)
-    //  .select(splitter($"value") as "tokens")
-    //  .select($"tokens"(0) as "s", $"tokens"(1) as "p", $"tokens"(2) as "o")
-    //  .as[RDFTriple]
+      .textFile(path) // read the text file
+      .map(new NTriplesStringToRDFTriple())
+      .as[RDFTriple]
+      .as("triples")
+    //  (rdfTripleEncoder)
+    // val rowRDD = session.sparkContext
+    //   .textFile(paths) // read the text file
+    //   .map(s => {
+    //     val tokens = s.split(" ")
+    //     Row(tokens(0), tokens(1), tokens(2))
+    //     // RDFTriple(tokens(0), tokens(1), tokens(2))
+    //   })
+    //
+    // val encoder = Encoders.product[RDFTriple]
+    // val schema =
+    //   StructType(Array(
+    //     StructField("s", StringType, true),
+    //     StructField("p", StringType, true),
+    //     StructField("o", StringType, true)))
+    // val triplesDF = spark.createDataFrame(rowRDD, schema)
+    // val triples = triplesDF.as[RDFTriple](encoder)
+    // session.read
+    //   .textFile(paths) // read the text file
+    //   .map(s => {
+    //     val tokens = s.split(" ")
+    //     RDFTriple2(tokens(0), tokens(1), tokens(2))
+    //   }).as[RDFTriple2].show(10)
+    //  .select(splitter($"value") as "tokens")
+    //  .select($"tokens"(0) as "s", $"tokens"(1) as "p", $"tokens"(2) as "o")
+    //  .as[RDFTriple]
 
 
     // convert to triple object
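The commented-out block above preserves an alternative that avoids implicit encoders entirely: parse each line into a Row, declare the three-column schema explicitly, and attach a product encoder when converting to a Dataset. A runnable reconstruction of that idea; the input path is hypothetical, the whitespace split is naive (real parsing is what NTriplesStringToRDFTriple does), and it assumes RDFTriple is a case class with fields s, p, o, as the Encoders.product line implies:

import org.apache.spark.sql.{Encoders, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import net.sansa_stack.inference.data.RDFTriple

object ExplicitSchemaSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local[2]").getOrCreate()

    val rowRDD = session.sparkContext
      .textFile("/data/triples.nt") // hypothetical path
      .map { line =>
        val tokens = line.split(" ")
        Row(tokens(0), tokens(1), tokens(2))
      }

    val schema = StructType(Array(
      StructField("s", StringType, nullable = false),
      StructField("p", StringType, nullable = false),
      StructField("o", StringType, nullable = false)))

    // explicit encoder instead of import spark.implicits._
    val triples = session.createDataFrame(rowRDD, schema)
      .as[RDFTriple](Encoders.product[RDFTriple])
    triples.show(10)
    session.stop()
  }
}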
@@ -132,7 +168,30 @@ println("Input Paths: " + paths.mkString(","))
     new RDFGraphDataset(triples)
   }
 
-  def loadGraphDataFrameFromFile(path: String, session: SparkSession, minPartitions: Int = 2): RDFGraphDataFrame = {
-    new RDFGraphDataFrame(loadGraphFromFile(path, session, minPartitions).toDataFrame(session))
+  /**
+    * Load an RDF graph from multiple files or directories with a Spark Dataset as the underlying data structure.
+    * The paths can also contain wildcards, e.g.
+    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
+    *
+    * @param paths the paths to the files or directories
+    * @param session the Spark session
+    * @return an RDF graph based on a [[Dataset]]
+    */
+  def loadFromDiskAsDataset(session: SparkSession, paths: scala.Seq[URI]): RDFGraphDataset = {
+    loadFromDiskAsDataset(session, paths.mkString(","))
+  }
+
+  /**
+    * Load an RDF graph from a file or directory with a Spark DataFrame as the underlying data structure.
+    * The path can also contain multiple paths and even wildcards, e.g.
+    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
+    *
+    * @param path the absolute path of the file
+    * @param session the Spark session
+    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
+    * @return an RDF graph based on a [[org.apache.spark.sql.DataFrame]]
+    */
+  def loadFromDiskAsDataFrame(session: SparkSession, path: String, minPartitions: Int): RDFGraphDataFrame = {
+    new RDFGraphDataFrame(loadFromDiskAsRDD(session, path, minPartitions).toDataFrame(session))
   }
 }
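After the rename, the object exposes one loader per Spark abstraction: loadFromDisk (plain RDFGraph), loadFromDiskAsRDD (RDFGraphNative), loadFromDiskAsDataset (RDFGraphDataset), and loadFromDiskAsDataFrame (RDFGraphDataFrame). A sketch of how they line up, reusing the session from the earlier example (the path is illustrative):

val rddGraph = RDFGraphLoader.loadFromDiskAsRDD(session, "/data/triples.nt", 4)       // backed by an RDD
val dsGraph  = RDFGraphLoader.loadFromDiskAsDataset(session, "/data/triples.nt")      // backed by a Dataset
val dfGraph  = RDFGraphLoader.loadFromDiskAsDataFrame(session, "/data/triples.nt", 4) // backed by a DataFrame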