This repository was archived by the owner on Oct 8, 2020. It is now read-only.

Commit ae1674f

Separate RDFS schema extractor
1 parent 95a560e commit ae1674f

3 files changed, 10 additions & 8 deletions


sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/ForwardRuleReasonerRDFS.scala

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr
 
     // as an optimization, we can extract all schema triples first which avoids to run on the whole dataset
     // for each schema triple later
-    val schemaTriples = if (extractSchemaTriplesInAdvance) new RDFSSchemaExtractor(sc).extract(triplesRDD)
+    val schemaTriples = if (extractSchemaTriplesInAdvance) new RDFSSchemaExtractor().extract(triplesRDD)
     else triplesRDD
 
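The only change here is the call site: the extractor no longer takes a SparkContext, since extracting schema triples is just a filter over the input RDD. A minimal sketch of the updated usage, assuming an RDD[RDFTriple] input as in the reasoner above (the session setup and empty sample data are illustrative, not part of the commit):

import org.apache.spark.sql.SparkSession
import net.sansa_stack.inference.data.RDFTriple // assumed import path for the triple type
import net.sansa_stack.inference.spark.utils.RDFSSchemaExtractor

val session = SparkSession.builder().master("local[*]").appName("schema-extraction").getOrCreate()

// hypothetical input; in the reasoner this is the full dataset
val triplesRDD = session.sparkContext.parallelize(Seq.empty[RDFTriple])

// parameterless constructor after this commit
val schemaTriples = new RDFSSchemaExtractor().extract(triplesRDD)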

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/ForwardRuleReasonerRDFSDataframe.scala

Lines changed: 7 additions & 5 deletions
@@ -34,7 +34,7 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int =
 
     val sqlSchema = graph.schema
 
-    val extractor = new RDFSSchemaExtractor(session.sparkContext)
+    val extractor = new RDFSSchemaExtractor()
 
     var index = extractor.extractWithIndex(graph)
 
@@ -145,7 +145,7 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int =
 
     // get rdf:type tuples here as intermediate result
     val typeTuples = triples
-      .where(s"${sqlSchema.predicateCol} = '${RDF.`type`.getURI} '")
+      .where(s"${sqlSchema.predicateCol} = '${RDF.`type`.getURI}'")
       .select(sqlSchema.subjectCol, sqlSchema.objectCol)
       .union(tuples23)
       .alias("TYPES")
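The fix here is subtle: the interpolated filter string previously contained a stray blank between the rdf:type IRI and the closing quote, so the equality test compared against 'http://...#type ' and matched nothing. A tiny illustration of the difference (the column name is a stand-in for sqlSchema.predicateCol):

val predicateCol = "predicate" // assumed column name, for demonstration only
val typeURI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

val buggy = s"$predicateCol = '$typeURI '" // stray blank inside the literal
val fixed = s"$predicateCol = '$typeURI'"

// buggy: predicate = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type '
// fixed: predicate = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'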
@@ -156,7 +156,7 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int =
        rdfs9   xxx rdfs:subClassOf yyy .
                zzz rdf:type xxx .        zzz rdf:type yyy .
      */
-    val tripleRDFS9 = typeTuples
+    val tuplesRDFS9 = typeTuples
       .join(subClassOfTriplesTrans, $"TYPES.${sqlSchema.objectCol}" === $"SC.${sqlSchema.subjectCol}", "inner")
       .select($"TYPES.${sqlSchema.subjectCol}", $"SC.${sqlSchema.objectCol}") // (zzz, yyy)
 
@@ -184,22 +184,24 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int =
 //      .withColumn("const", lit(RDF.`type`.getURI))
 //      .select("DATA.subject", "const", "SC.object")
 //      .select($"TYPES.subject", $"SC.object") // (zzz, yyy)
+//    println("existing types:" + existingTypes.count())
 //    println("SC:" + subClassOfTriplesTrans.count())
 //    println("SP:" + subPropertyOfTriplesTrans.count())
 //    println("TYPES:" + typeTuples.count())
 //    println("R7:" + triplesRDFS7.count())
 //    println("R2:" + triplesRDFS2.count())
 //    println("R3:" + triplesRDFS3.count())
-//    println("R9:" + triplesRDFS9.count())
+//    println("R9:" + tuplesRDFS9.count())
 
     // 5. merge triples and remove duplicates
     val allTriples =
-      tuples23.union(tripleRDFS9)
+      typeTuples.union(tuples23).union(tuplesRDFS9)
         .withColumn("const", lit(RDF.`type`.getURI))
         .select(sqlSchema.subjectCol, "const", sqlSchema.objectCol)
         .union(subClassOfTriplesTrans)
         .union(subPropertyOfTriplesTrans)
         .union(triplesRDFS7)
+        .union(triples)
         .distinct()
 //      .selectExpr("subject", "'" + RDF.`type`.getURI + "' as predicate", "object")
 //    allTriples.explain()
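Besides the rename, the merge step now also folds the raw typeTuples and the original input triples into the final union, so the result contains the input graph plus all derived triples rather than only the rule outputs. A condensed sketch of the merge pattern with hypothetical sample data (the subclass/subproperty unions are elided for brevity):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

val spark = SparkSession.builder().master("local[*]").appName("merge-sketch").getOrCreate()
import spark.implicits._

val typeURI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

// (subject, object) pairs produced by the type-related rules; sample data only
val typeTuples = Seq(("s1", "C1")).toDF("subject", "object")
val tuples23 = Seq(("s2", "C2")).toDF("subject", "object")
val tuplesRDFS9 = Seq(("s1", "C0")).toDF("subject", "object")

// the original input triples, now kept in the result as well
val triples = Seq(("s1", typeURI, "C1")).toDF("subject", "predicate", "object")

val allTriples = typeTuples.union(tuples23).union(tuplesRDFS9)
  .withColumn("const", lit(typeURI)) // materialize rdf:type as the predicate
  .select($"subject", $"const", $"object")
  .union(triples) // positional union: (subject, const, object) lines up with (s, p, o)
  .distinct()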

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/RDFSSchemaExtractor.scala

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ import net.sansa_stack.inference.utils.{CollectionUtils, Logging}
  *
  * @author Lorenz Buehmann
  */
-class RDFSSchemaExtractor(sc : SparkContext) extends Logging{
+class RDFSSchemaExtractor() extends Logging with Serializable {
 
   val properties = Set(RDFS.subClassOf, RDFS.subPropertyOf, RDFS.domain, RDFS.range).map(p => p.getURI)
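Dropping the SparkContext field is what makes the `with Serializable` meaningful: SparkContext itself is not serializable, so any class holding one cannot be shipped inside task closures. A simplified before/after sketch of the pattern (hypothetical class names, not the actual SANSA code):

import org.apache.spark.SparkContext

// before: the sc field makes instances unserializable; referencing one
// inside an RDD closure fails with "Task not serializable"
class SchemaFilterBefore(sc: SparkContext) {
  def isSchemaProperty(p: String): Boolean = p.endsWith("subClassOf")
}

// after: no driver-side state, so instances can be captured by closures
// that run on the executors
class SchemaFilterAfter() extends Serializable {
  def isSchemaProperty(p: String): Boolean = p.endsWith("subClassOf")
}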

@@ -121,7 +121,7 @@ class RDFSSchemaExtractor(sc : SparkContext) extends Logging{
    * @return a mapping from the corresponding schema property to the broadcast variable that wraps the multimap
    *         with s-o pairs
    */
-  def extractWithIndexAndDistribute(graph: RDFGraphNative): Map[String, Broadcast[Map[String, Set[String]]]] = {
+  def extractWithIndexAndDistribute(sc : SparkContext, graph: RDFGraphNative): Map[String, Broadcast[Map[String, Set[String]]]] = {
     val schema = extractWithIndex(graph)
 
     log.info("Started schema distribution...")
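Since the extractor no longer owns a SparkContext, the one needed for broadcasting the indexed schema is passed in explicitly. A sketch of the updated call site under stated assumptions (the RDFGraphNative import path and the surrounding method are guesses; only the constructor and method signature come from the diff):

import org.apache.spark.SparkContext
import org.apache.jena.vocabulary.RDFS
import net.sansa_stack.inference.spark.data.RDFGraphNative // assumed import path
import net.sansa_stack.inference.spark.utils.RDFSSchemaExtractor

// hypothetical helper; sc and graph come from the surrounding reasoner
def distributeSchema(sc: SparkContext, graph: RDFGraphNative): Unit = {
  // the SparkContext is now an explicit argument instead of constructor state
  val index = new RDFSSchemaExtractor().extractWithIndexAndDistribute(sc, graph)

  // each entry maps a schema property IRI to a broadcast multimap of s-o pairs
  index.get(RDFS.subClassOf.getURI).foreach(bc => println(bc.value.size))
}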

0 commit comments