From 2c17cce9c53ca036570c56194e27bb9f89e06654 Mon Sep 17 00:00:00 2001
From: George
Date: Sat, 20 Feb 2021 23:36:06 +0200
Subject: [PATCH 01/25] Composite Schemes
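Pairs are now prioritized by a composite weighting scheme: a main weight
orders the candidate pairs and an optional secondary weight acts as a
tie-breaker. WeightedPair.compareTo (added below) encodes this as a
descending order, and WeightedPairsPQ keeps only the budget-many best
pairs by evicting the minimum (pollLast) whenever the queue overflows.
A minimal sketch of the intended ordering -- illustrative only, not part
of the patch, using the WeightedPair introduced here:

    val a = WeightedPair(1, 1, mainWeight = 0.8f, secondaryWeight = 0.2f)
    val b = WeightedPair(1, 2, mainWeight = 0.8f, secondaryWeight = 0.5f)
    val c = WeightedPair(2, 1, mainWeight = 0.9f, secondaryWeight = 0.1f)
    // c wins on the main weight; a and b tie on it, so b wins on the
    // secondary weight
    Seq(a, b, c).sorted  // => Seq(c, b, a)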
---
 config/LINEARWATER-AREAWATER.yaml             |  17 +++
 config/configurationTemplate.yaml             |   3 +-
 src/main/scala/dataModel/WeightedPair.scala   |  37 +++++
 .../scala/dataModel/WeightedPairsPQ.scala     |  43 ++++++
 .../scala/experiments/EvaluationExp.scala     |  12 +-
 .../scala/experiments/ProgressiveExp.scala    |  13 +-
 .../scala/experiments/WellBalancedExp.scala   |  12 +-
 .../progressive/GeometryCentric.scala         | 136 +++++++++---------
 .../ProgressiveAlgorithmsFactory.scala        |  15 +-
 .../progressive/ProgressiveGIAnt.scala        |  21 +--
 .../ProgressiveGeospatialInterlinkingT.scala  |  63 ++++----
 .../progressive/RandomScheduling.scala        |  31 ++--
 .../progressive/ReciprocalTopK.scala          |  45 +++---
 .../progressive/TopKPairs.scala               |  87 +++++------
 .../scala/utils/ConfigurationParser.scala     |  11 +-
 src/main/scala/utils/Constants.scala          |   3 +-
 16 files changed, 327 insertions(+), 222 deletions(-)
 create mode 100644 config/LINEARWATER-AREAWATER.yaml
 create mode 100644 src/main/scala/dataModel/WeightedPair.scala
 create mode 100644 src/main/scala/dataModel/WeightedPairsPQ.scala

diff --git a/config/LINEARWATER-AREAWATER.yaml b/config/LINEARWATER-AREAWATER.yaml
new file mode 100644
index 00000000..b176c04c
--- /dev/null
+++ b/config/LINEARWATER-AREAWATER.yaml
@@ -0,0 +1,17 @@
+
+source:
+  path: "/home/gmandi/Documents/Extreme-Earth/Datasets/SPATIAL-HADOOP/LINEARWATER_100K.tsv"
+  realIdField: "id"
+  geometryField: "WKT"
+
+target:
+  path: "/home/gmandi/Documents/Extreme-Earth/Datasets/SPATIAL-HADOOP/AREAWATER_100K.tsv"
+  realIdField: "id"
+  geometryField: "WKT"
+
+relation: "DE9IM"
+
+configurations:
+  thetaGranularity: "avg"
+  secondaryWS: "MBR_INTERSECTION"
+  mainWS: "JS"
\ No newline at end of file
diff --git a/config/configurationTemplate.yaml b/config/configurationTemplate.yaml
index c77f6536..773f55c1 100644
--- a/config/configurationTemplate.yaml
+++ b/config/configurationTemplate.yaml
@@ -20,6 +20,7 @@ configurations:
   partitions: "number of partitions"
   thetaGranularity: "avg" # define the extent of dynamic tiling based on the geometries of source - Experiments have shown that "avg" is the best option
   gridType: "spatial partitioner grid type algorithm" # allowed values: KDBTREE, QUADTREE
-  weightingScheme: "WS" # specify weighting scheme - allowed values: CF, JS, MBR_INTERSECTION, PEARSON_X2, POINTS
+  mainWS: "WS" # specify weighting scheme - allowed values: CF, JS, MBR_INTERSECTION, PEARSON_X2, POINTS
+  secondaryWS: "WS"
   progressiveAlgorithm : "PA" # specify progressive algorithm - allowed values: PROGRESSIVE_GIANT, TOPK, RECIPROCAL_TOPK, GEOMETRY_CENTRIC, RANDOM
   budget: "BU" # the budget of progressive algorithms
\ No newline at end of file
diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala
new file mode 100644
index 00000000..05bfa7a4
--- /dev/null
+++ b/src/main/scala/dataModel/WeightedPair.scala
@@ -0,0 +1,37 @@
+package dataModel
+
+case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{
+
+  var relatedMatches: Int = 0
+
+  override def compareTo(o: WeightedPair): Int = {
+    // descending order
+    if (o.entityId1 == entityId1 && o.entityId2 == entityId2) return 0
+
+    val test1 = o.mainWeight - mainWeight
+    if (0 < test1) return 1
+
+    if (test1 < 0) return -1
+
+    val test2 = o.secondaryWeight - secondaryWeight
+    if (0 < test2) return 1
+
+    if (test2 < 0) return -1
+
+    o.entityId1 - entityId1
+  }
+
+  /**
+   * Returns the weight between two geometries. Higher weights correspond to
+   * stronger likelihood of related entities.
+   *
+   * @return
+   */
+  def getMainWeight: Float = mainWeight * (1 + relatedMatches)
+
+  def getSecondaryWeight: Float = secondaryWeight * (1 + relatedMatches)
+
+  def incrementRelatedMatches(): Unit = relatedMatches += 1
+
+  override def toString: String = "E1 : " + entityId1 + ", E2 : " + entityId2 + ", main weight : " + getMainWeight + ", secondary weight : " + getSecondaryWeight
+}
diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/dataModel/WeightedPairsPQ.scala
new file mode 100644
index 00000000..b0add368
--- /dev/null
+++ b/src/main/scala/dataModel/WeightedPairsPQ.scala
@@ -0,0 +1,43 @@
+package dataModel
+
+import org.spark_project.guava.collect.MinMaxPriorityQueue
+import scala.collection.JavaConverters._
+
+case class WeightedPairsPQ(maxSize: Int){
+
+  lazy val pq: MinMaxPriorityQueue[WeightedPair] = MinMaxPriorityQueue.maximumSize(maxSize+1).create()
+
+
+  def enqueue(wp: WeightedPair): Unit ={
+    pq.add(wp)
+    if (pq.size > maxSize)
+      pq.pollLast()
+  }
+
+  def enqueueAll(items: Iterator[WeightedPair]): Unit = items.foreach(wp => enqueue(wp))
+
+  def take(n: Option[Int]): Iterator[WeightedPair] =
+    n match {
+      case Some(n) => Iterator.continually{ pq.pollFirst() }.take(n)
+      case None => Iterator.continually{ pq.pollFirst() }.takeWhile(_ => !pq.isEmpty)
+    }
+
+  def take(n: Int): Iterator[WeightedPair] = take(Option(n))
+
+  def dequeueAll: Iterator[WeightedPair] = take(None)
+
+  def clear(): Unit = pq.clear()
+
+  def isEmpty: Boolean = pq.isEmpty
+
+  def size(): Int = pq.size()
+
+  def dequeueHead(): WeightedPair = pq.pollFirst()
+
+  def dequeue(): WeightedPair = pq.pollLast()
+
+  def iterator(): Iterator[WeightedPair] = pq.iterator().asScala
+}
+
+
+
diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala
index 55390310..b450a1d0 100644
--- a/src/main/scala/experiments/EvaluationExp.scala
+++ b/src/main/scala/experiments/EvaluationExp.scala
@@ -89,21 +89,21 @@ object EvaluationExp {

     val (_, _, _, _, _, _, _, _, _, totalVerifications, totalRelatedPairs) = GIAnt(sourceRDD, targetRDD, partitioner).countAllRelations

     log.info("DS-JEDAI: Total Verifications: " + totalVerifications)
-    log.info("DS-JEDAI: Total Interlinked Geometries: " + totalRelatedPairs)
+    log.info("DS-JEDAI: Total Qualifying Pairs: " + totalRelatedPairs)
     log.info("\n")

-    printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, WeightingScheme.CF)
-    val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK, ProgressiveAlgorithm.GEOMETRY_CENTRIC)
-    val weightingSchemes = Seq(WeightingScheme.MBR_INTERSECTION, WeightingScheme.POINTS)
+    printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, WeightingScheme.CF))
+    val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK)
+    val weightingSchemes = Seq((WeightingScheme.JS, WeightingScheme.MBR_INTERSECTION), (WeightingScheme.PEARSON_X2, WeightingScheme.POINTS))

     for (a <- algorithms ; ws <- weightingSchemes)
       printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws)
   }

   def printResults(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], partitioner: Partitioner, totalRelations: Int,
-                   ma: ProgressiveAlgorithm, ws: WeightingScheme, n: Int = 10): Unit = {
+                   ma: ProgressiveAlgorithm, ws: (WeightingScheme, WeightingScheme), n: Int = 10): Unit = {

-    val pma = ProgressiveAlgorithmsFactory.get(ma, source, target, partitioner, budget, ws)
+    val pma = ProgressiveAlgorithmsFactory.get(ma, source, target, partitioner, budget, ws._1, ws._2)
     val results = pma.evaluate(relation, n, totalRelations, takeBudget)

     results.zip(takeBudget).foreach { case ((pgr, qp, verifications, (verificationSteps, qualifiedPairsSteps)), b) =>
diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala
index 554f9ddb..f7cfb697 100644
--- a/src/main/scala/experiments/ProgressiveExp.scala
+++ b/src/main/scala/experiments/ProgressiveExp.scala
@@ -39,8 +39,10 @@ object ProgressiveExp {
         nextOption(map ++ Map("conf" -> value), tail)
       case ("-b" | "-budget") :: value :: tail =>
         nextOption(map ++ Map("budget" -> value), tail)
-      case "-ws" :: value :: tail =>
-        nextOption(map ++ Map("ws" -> value), tail)
+      case "-mws" :: value :: tail =>
+        nextOption(map ++ Map("mws" -> value), tail)
+      case "-sws" :: value :: tail =>
+        nextOption(map ++ Map("sws" -> value), tail)
       case "-pa" :: value :: tail =>
         nextOption(map ++ Map("pa" -> value), tail)
       case "-gt" :: value :: tail =>
@@ -64,13 +66,14 @@ object ProgressiveExp {
     val conf = ConfigurationParser.parse(confPath)
     val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions
     val budget: Int = if (options.contains("budget")) options("budget").toInt else conf.getBudget
-    val ws: WeightingScheme = if (options.contains("ws")) WeightingScheme.withName(options("ws")) else conf.getWeightingScheme
+    val mainWS: WeightingScheme = if (options.contains("mws")) WeightingScheme.withName(options("mws")) else conf.getMainWS
+    val secondaryWS: WeightingScheme = if (options.contains("sws")) WeightingScheme.withName(options("sws")) else conf.getSecondaryWS
     val pa: ProgressiveAlgorithm = if (options.contains("pa")) ProgressiveAlgorithm.withName(options("pa")) else conf.getProgressiveAlgorithm
     val gridType: GridType.GridType = if (options.contains("gt")) GridType.withName(options("gt").toString) else conf.getGridType
     val relation = conf.getRelation

     log.info("DS-JEDAI: Input Budget: " + budget)
-    log.info("DS-JEDAI: Weighting Scheme: " + ws.toString)
+    log.info("DS-JEDAI: Weighting Scheme: " + mainWS.toString)
     log.info("DS-JEDAI: Progressive Algorithm: " + pa.toString)

     val startTime = Calendar.getInstance().getTimeInMillis
@@ -85,7 +88,7 @@ object ProgressiveExp {
     val partitioner = reader.partitioner

     val matchingStartTime = Calendar.getInstance().getTimeInMillis
-    val method = ProgressiveAlgorithmsFactory.get(pa, sourceRDD, targetRDD, partitioner, budget, ws)
+    val method = ProgressiveAlgorithmsFactory.get(pa, sourceRDD, targetRDD, partitioner, budget, mainWS, secondaryWS)
     if (relation.equals(Relation.DE9IM)) {
       val (totalContains, totalCoveredBy, totalCovers, totalCrosses, totalEquals, totalIntersects,
       totalOverlaps, totalTouches, totalWithin, verifications, qp) = method.countAllRelations
diff --git a/src/main/scala/experiments/WellBalancedExp.scala b/src/main/scala/experiments/WellBalancedExp.scala
index eec5bca0..0ab389b7 100644
--- a/src/main/scala/experiments/WellBalancedExp.scala
+++ b/src/main/scala/experiments/WellBalancedExp.scala
@@ -2,9 +2,7 @@ package experiments

 import java.util.Calendar

-import geospatialInterlinking.IndexBasedMatching
-import geospatialInterlinking.IndexBasedMatching
-import geospatialInterlinking.progressive.ProgressiveAlgorithmsFactory
+import geospatialInterlinking.{GIAnt, IndexBasedMatching}
 import org.apache.log4j.{Level, LogManager, Logger}
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.sql.SparkSession
@@ -81,7 +79,7 @@ object WellBalancedExp {
     val conf = ConfigurationParser.parse(confPath)
     val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions
     val budget: Int = if (options.contains("budget")) options("budget").toInt else conf.getBudget
-    val ws: WeightingScheme = if (options.contains("ws")) WeightingScheme.withName(options("ws")) else conf.getWeightingScheme
+    val ws: WeightingScheme = if (options.contains("ws")) WeightingScheme.withName(options("ws")) else conf.getMainWS
     val ma: ProgressiveAlgorithm = if (options.contains("ma")) ProgressiveAlgorithm.withName(options("ma")) else conf.getProgressiveAlgorithm
     val gridType: GridType.GridType = if (options.contains("gt")) GridType.withName(options("gt").toString) else conf.getGridType
     val relation = conf.getRelation
@@ -108,12 +106,12 @@ object WellBalancedExp {

     val matchingStartTime = Calendar.getInstance().getTimeInMillis
-    val pm = ProgressiveAlgorithmsFactory.get(ma, sourceRDD, targetRDD, partitioner, budget, ws)
+    val giant = GIAnt(sourceRDD, targetRDD, partitioner)
     val ibm = IndexBasedMatching(overloadedSource.map(_._2), overloadedTarget.map(_._2), Utils.getTheta)

     if (relation.equals(Relation.DE9IM)) {
       val (totalContains, totalCoveredBy, totalCovers, totalCrosses, totalEquals, totalIntersects,
-      totalOverlaps, totalTouches, totalWithin, intersectingPairs, interlinkedGeometries) = pm.countAllRelations + ibm.countAllRelations
+      totalOverlaps, totalTouches, totalWithin, intersectingPairs, interlinkedGeometries) = giant.countAllRelations + ibm.countAllRelations

       val totalRelations = totalContains + totalCoveredBy + totalCovers + totalCrosses + totalEquals +
         totalIntersects + totalOverlaps + totalTouches + totalWithin
@@ -132,7 +130,7 @@ object WellBalancedExp {
       log.info("DS-JEDAI: Total Relations Discovered: " + totalRelations)
     }
     else{
-      val totalMatches = pm.countRelation(relation) + ibm.countRelation(relation)
+      val totalMatches = giant.countRelation(relation) + ibm.countRelation(relation)
       log.info("DS-JEDAI: " + relation.toString +": " + totalMatches)
     }
     val matchingEndTime = Calendar.getInstance().getTimeInMillis
diff --git a/src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala b/src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala
index 7f7b5d1f..b0f5cd3a 100644
--- a/src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala
@@ -1,68 +1,68 @@
-package geospatialInterlinking.progressive
-
-import dataModel.{ComparisonPQ, Entity, MBR}
-import org.apache.spark.Partitioner
-import org.apache.spark.rdd.RDD
-import utils.Constants.Relation.Relation
-import utils.Constants.WeightingScheme.WeightingScheme
-import utils.Utils
-
-
-case class GeometryCentric(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                           thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long)
-  extends ProgressiveGeospatialInterlinkingT {
-
-
-  /**
-   * For each target entity we keep only the top K comparisons, according to a weighting scheme.
-   * Then we assign the top K comparisons a common weight, which is their avg
-   * Based on this weight we prioritize their execution.
-   *
-   * @return an RDD of Intersection Matrices
-   */
-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)] = {
-    val sourceIndex = index(source)
-    val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)
-    val k = (math.ceil(budget / target.length).toInt + 1) * 2 // +1 to avoid k=0
-    val targetPQ: ComparisonPQ[Int] = ComparisonPQ[Int](k)
-    val partitionPQ: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
-
-    target
-      .indices
-      .foreach { j =>
-        var wSum = 0f
-        val e2 = target(j)
-        e2.index(thetaXY, filterIndices)
-          .foreach { block =>
-            sourceIndex.get(block)
-              .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
-              .foreach { i =>
-                val e1 = source(i)
-                val w = getWeight(e1, e2)
-                wSum += w
-                targetPQ.enqueue(w, i)
-              }
-          }
-        if (! targetPQ.isEmpty) {
-          val pqSize = targetPQ.size()
-          val topK = targetPQ.dequeueAll.map(_._2)
-          val weight = wSum / pqSize
-          partitionPQ.enqueueAll(topK.map(i => ((i, j), weight)))
-          targetPQ.clear()
-        }
-      }
-    partitionPQ
-  }
-}
-
-
-object GeometryCentric{
-
-  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner)
-  : GeometryCentric ={
-    val thetaXY = Utils.getTheta
-    val sourceCount = Utils.getSourceCount
-    val joinedRDD = source.cogroup(target, partitioner)
-    GeometryCentric(joinedRDD, thetaXY, ws, budget, sourceCount)
-  }
-}
\ No newline at end of file
+//package geospatialInterlinking.progressive
+//
+//import dataModel.{ComparisonPQ, Entity, MBR, WeightedPairsPQ}
+//import org.apache.spark.Partitioner
+//import org.apache.spark.rdd.RDD
+//import utils.Constants.Relation.Relation
+//import utils.Constants.WeightingScheme.WeightingScheme
+//import utils.Utils
+//
+//
+//case class GeometryCentric(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
+//                           thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long)
+//  extends ProgressiveGeospatialInterlinkingT {
+//
+//
+//  /**
+//   * For each target entity we keep only the top K comparisons, according to a weighting scheme.
+//   * Then we assign the top K comparisons a common weight, which is their avg
+//   * Based on this weight we prioritize their execution.
+//   *
+//   * @return an RDD of Intersection Matrices
+//   */
+//  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = {
+//    val sourceIndex = index(source)
+//    val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)
+//    val k = (math.ceil(budget / target.length).toInt + 1) * 2 // +1 to avoid k=0
+//    val targetPQ: ComparisonPQ[Int] = ComparisonPQ[Int](k)
+//    val partitionPQ: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
+//
+//    target
+//      .indices
+//      .foreach { j =>
+//        var wSum = 0f
+//        val e2 = target(j)
+//        e2.index(thetaXY, filterIndices)
+//          .foreach { block =>
+//            sourceIndex.get(block)
+//              .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
+//              .foreach { i =>
+//                val e1 = source(i)
+//                val w = getWeight(e1, e2)
+//                wSum += w
+//                targetPQ.enqueue(w, i)
+//              }
+//          }
+//        if (! targetPQ.isEmpty) {
+//          val pqSize = targetPQ.size()
+//          val topK = targetPQ.dequeueAll.map(_._2)
+//          val weight = wSum / pqSize
+//          partitionPQ.enqueueAll(topK.map(i => ((i, j), weight)))
+//          targetPQ.clear()
+//        }
+//      }
+//    partitionPQ
+//  }
+//}
+//
+//
+//object GeometryCentric{
+//
+//  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner)
+//  : GeometryCentric ={
+//    val thetaXY = Utils.getTheta
+//    val sourceCount = Utils.getSourceCount
+//    val joinedRDD = source.cogroup(target, partitioner)
+//    GeometryCentric(joinedRDD, thetaXY, ws, budget, sourceCount)
+//  }
+//}
\ No newline at end of file
diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
index ffe9033b..03beed7e 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
@@ -12,19 +12,20 @@ object ProgressiveAlgorithmsFactory {

   def get(matchingAlgorithm: ProgressiveAlgorithm, source: RDD[(Int, Entity)], target: RDD[(Int, Entity)],
-          partitioner: Partitioner, budget: Int = 0, ws: WeightingScheme = WeightingScheme.JS): ProgressiveGeospatialInterlinkingT ={
+          partitioner: Partitioner, budget: Int = 0, mainWS: WeightingScheme, secondaryWS: WeightingScheme):
+  ProgressiveGeospatialInterlinkingT ={

     matchingAlgorithm match {
       case ProgressiveAlgorithm.RANDOM =>
-        RandomScheduling(source, target, ws, budget, partitioner)
-      case ProgressiveAlgorithm.GEOMETRY_CENTRIC =>
-        GeometryCentric(source, target, ws, budget, partitioner)
+        RandomScheduling(source, target, mainWS, Option(secondaryWS), budget, partitioner)
+//      case ProgressiveAlgorithm.GEOMETRY_CENTRIC =>
+//        GeometryCentric(source, target, ws, budget, partitioner)
       case ProgressiveAlgorithm.TOPK =>
-        TopKPairs(source, target, ws, budget, partitioner)
+        TopKPairs(source, target, mainWS, Option(secondaryWS), budget, partitioner)
       case ProgressiveAlgorithm.RECIPROCAL_TOPK =>
-        ReciprocalTopK(source, target, ws, budget, partitioner)
+        ReciprocalTopK(source, target, mainWS, Option(secondaryWS), budget, partitioner)
       case ProgressiveAlgorithm.PROGRESSIVE_GIANT | _ =>
-        ProgressiveGIAnt(source, target, ws, budget, partitioner)
+        ProgressiveGIAnt(source, target, mainWS, Option(secondaryWS), budget, partitioner)
     }
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
index 962aedea..717daa09 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
@@ -1,6 +1,6 @@
 package geospatialInterlinking.progressive

-import dataModel.{ComparisonPQ, Entity, MBR}
+import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ}
 import org.apache.spark.Partitioner
 import org.apache.spark.rdd.RDD
 import utils.Constants.Relation.Relation
@@ -9,7 +9,9 @@ import utils.Utils


 case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                            thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long) extends ProgressiveGeospatialInterlinkingT {
+                            thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
+                            budget: Int, sourceCount: Long)
+  extends ProgressiveGeospatialInterlinkingT {


   /**
@@ -21,10 +23,10 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent
    * @param target target
    * @return a PQ with the top comparisons
    */
-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)] ={
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={
     val sourceIndex = index(source)
     val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)
-    val pq: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
+    val pq: WeightedPairsPQ = WeightedPairsPQ(budget)

     // weight and put the comparisons in a PQ
     target
@@ -37,8 +39,10 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent
           .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
           .foreach { i =>
             val e1 = source(i)
-            val w = getWeight(e1, e2)
-            pq.enqueue(w, (i,j))
+            val w = getMainWeight(e1, e2)
+            val secW = getSecondaryWeight(e1, e2)
+            val wp = WeightedPair(i, j, w, secW)
+            pq.enqueue(wp)
           }
         }
       }
@@ -53,11 +57,12 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent
  */
 object ProgressiveGIAnt {

-  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner): ProgressiveGIAnt ={
+  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
+            budget: Int, partitioner: Partitioner): ProgressiveGIAnt ={
     val thetaXY = Utils.getTheta
     val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    ProgressiveGIAnt(joinedRDD, thetaXY, ws, budget, sourceCount)
+    ProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
index a9e2420a..ddd7fe36 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
@@ -1,6 +1,6 @@
 package geospatialInterlinking.progressive

-import dataModel.{ComparisonPQ, Entity, IM, MBR}
+import dataModel.{Entity, IM, MBR, WeightedPair, WeightedPairsPQ}
 import geospatialInterlinking.GeospatialInterlinkingT
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
@@ -14,13 +14,18 @@ import scala.collection.mutable.ListBuffer

 trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{
   val budget: Int
-  val ws: WeightingScheme
+  val mainWS: WeightingScheme
+  val secondaryWS: Option[WeightingScheme]

-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)]
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ

+  def getMainWeight(e1: Entity, e2: Entity): Float = Utils.getWeight(e1, e2, mainWS)

-  def getWeight(e1: Entity, e2: Entity): Float = Utils.getWeight(e1, e2, ws)
-
+  def getSecondaryWeight(e1: Entity, e2: Entity): Float =
+    secondaryWS match {
+      case Some(ws) => Utils.getWeight(e1, e2, ws)
+      case None => 0f
+    }

   /**
    * Get the DE-9IM of the top most related entities based
@@ -30,20 +35,20 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{
   def getDE9IM: RDD[IM] ={
     joinedRDD.filter(j => j._2._1.nonEmpty && j._2._2.nonEmpty)
       .flatMap{ p =>
-        val pid = p._1
-        val partition = partitionsZones(pid)
-        val source = p._2._1.toArray
-        val target = p._2._2.toArray
+          val pid = p._1
+          val partition = partitionsZones(pid)
+          val source = p._2._1.toArray
+          val target = p._2._2.toArray

-        val pq = prioritize(source, target, partition, Relation.DE9IM)
-        if (!pq.isEmpty)
-          pq.dequeueAll.map{ case (_, (i, j)) =>
-            val e1 = source(i)
-            val e2 = target(j)
-            IM(e1, e2)
-          }.takeWhile(_ => !pq.isEmpty)
-        else Iterator()
-      }
+          val pq = prioritize(source, target, partition, Relation.DE9IM)
+          if (!pq.isEmpty)
+            pq.dequeueAll.map{ wp =>
+              val e1 = source(wp.entityId1)
+              val e2 = target(wp.entityId2)
+              IM(e1, e2)
+            }.takeWhile(_ => !pq.isEmpty)
+          else Iterator()
+        }
   }

@@ -63,9 +68,9 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{

         val pq = prioritize(source, target, partition, relation)
         if (!pq.isEmpty)
-          pq.dequeueAll.map{ case (_, (i, j)) =>
-            val e1 = source(i)
-            val e2 = target(j)
+          pq.dequeueAll.map{ wp =>
+            val e1 = source(wp.entityId1)
+            val e2 = target(wp.entityId2)
             (e1.relate(e2, relation), (e1.originalID, e2.originalID))
           }.filter(_._1).map(_._2)
         else Iterator()
@@ -81,8 +86,8 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{
    * @return (PGR, total interlinked Geometries (TP), total comparisons)
    */
   def evaluate(relation: Relation, n: Int = 10, totalQualifiedPairs: Double, takeBudget: Seq[Int]): Seq[(Double, Long, Long, (List[Int], List[Int]))] ={
-    // computes weighted the weighted comparisons
-    val matches: RDD[(Float, Boolean)] = joinedRDD
+    // compute the weighted comparisons
+    val matches: RDD[(WeightedPair, Boolean)] = joinedRDD
       .filter(p => p._2._1.nonEmpty && p._2._2.nonEmpty)
       .flatMap { p =>
         val pid = p._1
@@ -92,12 +97,12 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{

         val pq = prioritize(source, target, partition, relation)
         if (!pq.isEmpty)
-          pq.dequeueAll.map{ case (w, (i, j)) =>
-            val e1 = source(i)
-            val e2 = target(j)
+          pq.dequeueAll.map{ wp =>
+            val e1 = source(wp.entityId1)
+            val e2 = target(wp.entityId2)
             relation match {
-              case Relation.DE9IM => (w, IM(e1, e2).relate)
-              case _ => (w, e1.relate(e2, relation))
+              case Relation.DE9IM => (wp, IM(e1, e2).relate)
+              case _ => (wp, e1.relate(e2, relation))
             }
           }.takeWhile(_ => !pq.isEmpty)
         else Iterator()
@@ -106,7 +111,7 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{
     var results = mutable.ListBuffer[(Double, Long, Long, (List[Int], List[Int]))]()
     for(b <- takeBudget){
       // compute AUC prioritizing the comparisons based on their weight
-      val sorted = matches.takeOrdered(b)(Ordering.by[(Float, Boolean), Float](_._1).reverse)
+      val sorted = matches.takeOrdered(b)
       val verifications = sorted.length
       val step = math.ceil(verifications/n)
diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
index dd94569b..2b45349e 100644
--- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
@@ -1,6 +1,6 @@
 package geospatialInterlinking.progressive

-import dataModel.{ComparisonPQ, Entity, MBR}
+import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ}
 import org.apache.spark.Partitioner
 import org.apache.spark.rdd.RDD
 import utils.Constants.Relation.Relation
@@ -8,32 +8,39 @@ import utils.Constants.WeightingScheme.WeightingScheme
 import utils.Utils

 case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                            thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long) extends ProgressiveGeospatialInterlinkingT {
+                            thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
+                            budget: Int, sourceCount: Long)
+  extends ProgressiveGeospatialInterlinkingT {

   /**
    * First index source and then for each entity of target, find its comparisons using source's index.
    *
    * @param partition the MBR of the partition
-   * @param source source
-   * @param target target
+   * @param source    source
+   * @param target    target
    * @return a PQ with the top comparisons
    */
-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)] ={
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = {
     val sourceIndex = index(source)
     val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)
-    val pq: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
-
+    val pq: WeightedPairsPQ = WeightedPairsPQ(budget)
+    val rnd = new scala.util.Random
     // weight and put the comparisons in a PQ
     target
       .indices
-      .foreach {j =>
+      .foreach { j =>
         val e2 = target(j)
         e2.index(thetaXY, filterIndices)
           .foreach { block =>
             sourceIndex.get(block)
               .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
-              .foreach { i => pq.enqueue(1f, (i,j)) }
+              .foreach { i =>
+                val w = rnd.nextFloat()
+                val secW = rnd.nextFloat()
+                val wp = WeightedPair(i, j, w, secW)
+                pq.enqueue(wp)
+              }
           }
       }
     pq
@@ -41,17 +48,17 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent
 }

-
 /**
  * auxiliary constructor
  */
 object RandomScheduling {

-  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner): RandomScheduling ={
+  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
+            budget: Int, partitioner: Partitioner): RandomScheduling ={
     val thetaXY = Utils.getTheta
     val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    RandomScheduling(joinedRDD, thetaXY, ws, budget, sourceCount)
+    RandomScheduling(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
index 81612156..305ae72b 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
@@ -1,6 +1,6 @@
 package geospatialInterlinking.progressive

-import dataModel.{ComparisonPQ, Entity, MBR}
+import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ}
 import org.apache.spark.Partitioner
 import org.apache.spark.rdd.RDD
 import utils.Constants.Relation.Relation
@@ -10,7 +10,9 @@ import utils.Utils


 case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                          thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long) extends ProgressiveGeospatialInterlinkingT {
+                          thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
+                          budget: Int, sourceCount: Long)
+  extends ProgressiveGeospatialInterlinkingT {

   /**
    * Find the top-K comparisons of target and source and keep only the comparison (i, j) that belongs to both
@@ -22,16 +24,16 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit
    * @param relation examining relation
    * @return prioritized comparisons as a PQ
    */
-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)] = {
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = {
     val sourceIndex = index(source)
     val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)

     val sourceK = (math.ceil(budget / source.length).toInt + 1) * 2 // +1 to avoid k=0
     val targetK = (math.ceil(budget / target.length).toInt + 1) * 2 // +1 to avoid k=0

-    val sourcePQ: Array[ComparisonPQ[Int]] = new Array(source.length)
-    val targetPQ: ComparisonPQ[Int] = ComparisonPQ[Int](targetK)
-    val partitionPQ: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
+    val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length)
+    val targetPQ: WeightedPairsPQ = WeightedPairsPQ(targetK)
+    val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget)

     val targetSet: Array[Set[Int]] = new Array(target.length)
     target.indices
@@ -43,33 +45,31 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit
           .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
           .foreach { i =>
             val e1 = source(i)
-            val w = getWeight(e1, e2)
+            val w = getMainWeight(e1, e2)
+            val secW = getSecondaryWeight(e1, e2)
+            val wp = WeightedPair(i, j, w, secW)

             // set top-K PQ for the examining target entity
-            targetPQ.enqueue(w, i)
+            targetPQ.enqueue(wp)

             // update source entities' top-K
             if (sourcePQ(i) == null)
-              sourcePQ(i) = ComparisonPQ[Int](sourceK)
-            sourcePQ(i).enqueue(w, j)
+              sourcePQ(i) = WeightedPairsPQ(sourceK)
+            sourcePQ(i).enqueue(wp)
           }
       }

       // add comparisons into corresponding HashSet
-      targetSet(j) = targetPQ.iterator().map(_._2).toSet
+      targetSet(j) = targetPQ.iterator().map(_.entityId1).toSet
       targetPQ.clear()
     }

     // add comparison into PQ only if it is contained by both top-K PQs
     sourcePQ
-      .zipWithIndex
-      .filter(_._1 != null)
-      .foreach { case (pq, i) =>
-        val w = Double.MaxValue
-        while (pq.size > 0 && w > partitionPQ.minW) {
-          val (w, j) = pq.dequeueHead()
-          if (targetSet(j).contains(i))
-            partitionPQ.enqueue(w, (i, j))
-        }
+      .filter(_ != null)
+      .foreach { pq =>
+        pq.iterator()
+          .filter(wp => targetSet(wp.entityId2).contains(wp.entityId1))
+          .foreach(wp => partitionPQ.enqueue(wp))
       }
     partitionPQ
   }
@@ -77,10 +77,11 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit

 object ReciprocalTopK{

-  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner): ReciprocalTopK ={
+  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
+            budget: Int, partitioner: Partitioner): ReciprocalTopK ={
     val thetaXY = Utils.getTheta
     val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    ReciprocalTopK(joinedRDD, thetaXY, ws, budget, sourceCount)
+    ReciprocalTopK(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
index f6278af2..0ff43f40 100644
--- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
@@ -1,17 +1,16 @@
 package geospatialInterlinking.progressive

-import dataModel.{ComparisonPQ, Entity, MBR}
+import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ}
 import org.apache.spark.Partitioner
 import org.apache.spark.rdd.RDD
 import utils.Constants.Relation.Relation
 import utils.Constants.WeightingScheme.WeightingScheme
 import utils.Utils
-import scala.collection.mutable
-
 case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                     thetaXY: (Double, Double), ws: WeightingScheme, budget: Int, sourceCount: Long) extends ProgressiveGeospatialInterlinkingT {
-
+                     thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
+                     budget: Int, sourceCount: Long)
+  extends ProgressiveGeospatialInterlinkingT {

   /**
    * First we find the top-k comparisons of each geometry in source and target,
@@ -24,43 +23,44 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))]
    * @param relation examining relation
    * @return prioritized comparisons in a PQ
    */
-  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ[(Int, Int)] = {
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = {
     val sourceIndex = index(source)
     val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)

     // the budget is divided based on the number of entities
     val k = (math.ceil(budget / (source.length + target.length)).toInt + 1) * 2 // +1 to avoid k=0

-    val sourcePQ: Array[ComparisonPQ[Int]] = new Array(source.length)
-    val targetPQ: ComparisonPQ[Int] = ComparisonPQ[Int](k)
-    val partitionPQ: ComparisonPQ[(Int, Int)] = ComparisonPQ[(Int, Int)](budget)
+    val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length)
+    val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k)
+    val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget)

     target.indices
-      .foreach{ j =>
-        val e2 = target(j)
-        e2.index(thetaXY, filterIndices)
-          .foreach{ block =>
-            sourceIndex.get(block)
-              .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
-              .foreach { i =>
-                val e1 = source(i)
-                val w = getWeight(e1, e2)
+      .foreach{ j =>
+        val e2 = target(j)
+        e2.index(thetaXY, filterIndices)
+          .foreach{ block =>
+            sourceIndex.get(block)
+              .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
+              .foreach { i =>
+                val e1 = source(i)
+                val w = getMainWeight(e1, e2)
+                val secW = getSecondaryWeight(e1, e2)
+                val wp = WeightedPair(i, j, w, secW)

-                // set top-K PQ for the examining target entity
-                targetPQ.enqueue(w, i)
+                // set top-K PQ for the examining target entity
+                targetPQ.enqueue(wp)

-                // update source entities' top-K
-                if (sourcePQ(i) == null)
-                  sourcePQ(i) = ComparisonPQ[Int](k)
-                sourcePQ(i).enqueue(w, j)
+                // update source entities' top-K
+                if (sourcePQ(i) == null)
+                  sourcePQ(i) = WeightedPairsPQ(k)
+                sourcePQ(i).enqueue(wp)
               }
-          }
+          }

         // add target's pairs in partition's PQ
         if (!targetPQ.isEmpty) {
-          val w = Double.MaxValue
-          while (targetPQ.size > 0 && w > partitionPQ.minW) {
-            val (w, i) = targetPQ.dequeueHead()
-            partitionPQ.enqueue(w, (i, j))
+          while (targetPQ.size > 0) {
+            val wp = targetPQ.dequeueHead()
+            partitionPQ.enqueue(wp)
           }
         }
         targetPQ.clear()
@@ -68,41 +68,28 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))]

     // putting target comparisons in a HashMap. Source entities will also be added in the HashMap
     // to update weights and avoid duplicate comparisons
-    val partitionPairs: mutable.HashMap[(Int, Int), Float] = mutable.HashMap()
-    partitionPQ.iterator().foreach{ case(w:Float, pair:(Int, Int)) => partitionPairs += (pair -> w) }
-
+    val existingPairs = partitionPQ.iterator().toSet
     // adding source entities' top-K in hashMap
     sourcePQ
-      .zipWithIndex
-      .filter(_._1 != null)
-      .foreach { case (pq, i) =>
-        val w = Double.MaxValue
-        while (pq.size > 0 && w > partitionPQ.minW) {
-          val (w, j) = pq.dequeueHead()
-          if (partitionPQ.minW < w) {
-            partitionPairs.get(i, j) match {
-              case Some(weight) if weight < w => partitionPairs.update((i, j), w) //if exist with smaller weight -> update
-              case None => partitionPairs += ((i, j) -> w)
-              case _ =>
-            }
-          }
-        }
+      .filter(_ != null)
+      .foreach { pq =>
+        pq.dequeueAll.foreach(wp => partitionPQ.enqueue(wp))
         pq.clear()
       }
-
     // keep partition's top comparisons
     partitionPQ.clear()
-    partitionPQ.enqueueAll(partitionPairs.toIterator)
+    partitionPQ.enqueueAll(existingPairs.iterator)
     partitionPQ
   }
 }

 object TopKPairs{

-  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, budget: Int, partitioner: Partitioner): TopKPairs ={
+  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
+            budget: Int, partitioner: Partitioner): TopKPairs ={
     val thetaXY = Utils.getTheta
     val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    TopKPairs(joinedRDD, thetaXY, ws, budget, sourceCount)
+    TopKPairs(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
   }
 }
diff --git a/src/main/scala/utils/ConfigurationParser.scala b/src/main/scala/utils/ConfigurationParser.scala
index 1f937a2d..59952b0c 100644
--- a/src/main/scala/utils/ConfigurationParser.scala
+++ b/src/main/scala/utils/ConfigurationParser.scala
@@ -14,12 +14,9 @@ import utils.Constants.FileTypes.FileTypes
 import utils.Constants._

-
 /**
  * @author George Mandilaras < gmandi@di.uoa.gr > (National and Kapodistrian University of Athens)
  */
-
-
 case class DatasetConfigurations(path: String, geometryField: String, realIdField: Option[String] = None,
                                  dateField: Option[String] = None, datePattern: Option[String] = None){
@@ -86,7 +83,9 @@ case class Configuration(source: DatasetConfigurations, target:DatasetConfigurat

     def getTheta: ThetaOption = ThetaOption.withName(configurations.getOrElse(YamlConfiguration.CONF_THETA_GRANULARITY, "avg"))

-    def getWeightingScheme: WeightingScheme = WeightingScheme.withName(configurations.getOrElse(YamlConfiguration.CONF_WEIGHTING_SCHM, "JS"))
+    def getMainWS: WeightingScheme = WeightingScheme.withName(configurations.getOrElse(YamlConfiguration.CONF_MAIN_WS, "JS"))
+
+    def getSecondaryWS: WeightingScheme = WeightingScheme.withName(configurations.getOrElse(YamlConfiguration.CONF_SECONDARY_WS, "MBR_INTERSECTION"))

     def getGridType: GridType = GridType.withName(configurations.getOrElse(YamlConfiguration.CONF_GRIDTYPE, "QUADTREE"))

@@ -155,9 +154,9 @@ object ConfigurationParser {
             log.error(s"DS-JEDAI: Prioritization Algorithm \'$value\' is not supported")
             false
           }
-        case YamlConfiguration.CONF_WEIGHTING_SCHM =>
+        case YamlConfiguration.CONF_MAIN_WS | YamlConfiguration.CONF_SECONDARY_WS=>
           if (! WeightingScheme.exists(value)) {
-            log.error(s"DS-JEDAI: Weighting algorithm \'$value\' is not supported")
+            log.error(s"DS-JEDAI: Weighting Scheme \'$value\' is not supported")
             false
           }
         case YamlConfiguration.CONF_GRIDTYPE=>
diff --git a/src/main/scala/utils/Constants.scala b/src/main/scala/utils/Constants.scala
index 26c1e586..d5f4a06e 100644
--- a/src/main/scala/utils/Constants.scala
+++ b/src/main/scala/utils/Constants.scala
@@ -87,7 +87,8 @@ object Constants {
     val CONF_PARTITIONS = "partitions"
     val CONF_THETA_GRANULARITY = "thetaGranularity"
     val CONF_PROGRESSIVE_ALG = "progressiveAlgorithm"
-    val CONF_WEIGHTING_SCHM = "weightingScheme"
+    val CONF_MAIN_WS = "mainWS"
+    val CONF_SECONDARY_WS = "secondaryWS"
     val CONF_BUDGET = "budget"
     val CONF_GRIDTYPE = "gridType"
 }

From dedc8176f3b78f4d5c7658ce24f062e3a7d1e229 Mon Sep 17 00:00:00 2001
From: George
Date: Sun, 21 Feb 2021 20:08:08 +0200
Subject: [PATCH 02/25] cleaning
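Drops the now-unused sourceCount constructor parameter from the
progressive algorithms and their companion constructors, and fixes the
indentation of SpatialReader's flatMap chain. No behavioral change
intended.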
---
 .../progressive/ProgressiveGIAnt.scala                 | 8 +++-----
 .../progressive/RandomScheduling.scala                 | 8 +++-----
 .../progressive/ReciprocalTopK.scala                   | 8 +++-----
 .../geospatialInterlinking/progressive/TopKPairs.scala | 8 +++-----
 src/main/scala/utils/SpatialReader.scala               | 2 +-
 5 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
index 717daa09..876f7f48 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala
@@ -8,9 +8,8 @@ import utils.Constants.WeightingScheme.WeightingScheme
 import utils.Utils

-case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                            thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
-                            budget: Int, sourceCount: Long)
+case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double),
+                            mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int)
   extends ProgressiveGeospatialInterlinkingT {

@@ -60,9 +59,8 @@ object ProgressiveGIAnt {
   def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
             budget: Int, partitioner: Partitioner): ProgressiveGIAnt ={
     val thetaXY = Utils.getTheta
-    val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    ProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
+    ProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
index 2b45349e..24c42131 100644
--- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala
@@ -7,9 +7,8 @@ import utils.Constants.Relation.Relation
 import utils.Constants.WeightingScheme.WeightingScheme
 import utils.Utils

-case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                            thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
-                            budget: Int, sourceCount: Long)
+case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double),
+                            mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int)
   extends ProgressiveGeospatialInterlinkingT {

@@ -56,9 +55,8 @@ object RandomScheduling {
   def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
             budget: Int, partitioner: Partitioner): RandomScheduling ={
     val thetaXY = Utils.getTheta
-    val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    RandomScheduling(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
+    RandomScheduling(joinedRDD, thetaXY, ws, sws, budget)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
index 305ae72b..56acdc3e 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala
@@ -9,9 +9,8 @@ import utils.Utils


-case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                          thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
-                          budget: Int, sourceCount: Long)
+case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double),
+                          mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int)
   extends ProgressiveGeospatialInterlinkingT {

 /**
@@ -80,8 +79,7 @@ object ReciprocalTopK{
   def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
             budget: Int, partitioner: Partitioner): ReciprocalTopK ={
     val thetaXY = Utils.getTheta
-    val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    ReciprocalTopK(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
+    ReciprocalTopK(joinedRDD, thetaXY, ws, sws, budget)
   }
 }
diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
index 0ff43f40..9f93608e 100644
--- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala
@@ -7,9 +7,8 @@ import utils.Constants.Relation.Relation
 import utils.Constants.WeightingScheme.WeightingScheme
 import utils.Utils

-case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))],
-                     thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme],
-                     budget: Int, sourceCount: Long)
+case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double),
+                     mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int)
   extends ProgressiveGeospatialInterlinkingT {

 /**
@@ -88,8 +87,7 @@ object TopKPairs{
   def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
             budget: Int, partitioner: Partitioner): TopKPairs ={
     val thetaXY = Utils.getTheta
-    val sourceCount = Utils.getSourceCount
     val joinedRDD = source.cogroup(target, partitioner)
-    TopKPairs(joinedRDD, thetaXY, ws, sws, budget, sourceCount)
+    TopKPairs(joinedRDD, thetaXY, ws, sws, budget)
   }
 }
diff --git a/src/main/scala/utils/SpatialReader.scala b/src/main/scala/utils/SpatialReader.scala
index 44039163..2535b6fd 100644
--- a/src/main/scala/utils/SpatialReader.scala
+++ b/src/main/scala/utils/SpatialReader.scala
@@ -306,7 +306,7 @@ case class SpatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: C
     }
     // redistribute based on spatial partitioner
     entitiesRDD
-        .flatMap(se => sp.value.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se)))
+      .flatMap(se => sp.value.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se)))
       .partitionBy(partitioner)
   }

From b65b0117562e7a7bd56110c56f14a9e7d00d5235 Mon Sep 17 00:00:00 2001
From: George
Date: Sun, 21 Feb 2021 20:08:20 +0200
Subject: [PATCH 03/25] stats option

---
 src/main/scala/experiments/GiantExp.scala    |  13 ++-
 src/main/scala/utils/SpaceStatsCounter.scala | 106 -------------------
 2 files changed, 12 insertions(+), 107 deletions(-)
 delete mode 100644 src/main/scala/utils/SpaceStatsCounter.scala

diff --git a/src/main/scala/experiments/GiantExp.scala b/src/main/scala/experiments/GiantExp.scala
index 72103bc1..017bd5fa 100644
--- a/src/main/scala/experiments/GiantExp.scala
+++ b/src/main/scala/experiments/GiantExp.scala
@@ -40,6 +40,8 @@ object GiantExp {
         nextOption(map ++ Map("partitions" -> value), tail)
       case "-gt" :: value :: tail =>
         nextOption(map ++ Map("gt" -> value), tail)
+      case "-s" :: tail =>
+        nextOption(map ++ Map("stats" -> "true"), tail)
       case _ :: tail =>
         log.warn("DS-JEDAI: Unrecognized argument")
         nextOption(map, tail)
@@ -60,6 +62,7 @@ object GiantExp {
     val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions
     val gridType: GridType.GridType = if (options.contains("gt")) GridType.withName(options("gt").toString) else conf.getGridType
     val relation = conf.getRelation
+    val printCount = options.getOrElse("stats", "false").toBoolean

     val startTime = Calendar.getInstance().getTimeInMillis
@@ -72,6 +75,14 @@ object GiantExp {
     val targetRDD = reader.load(conf.target)
     val partitioner = reader.partitioner

+    if(printCount){
+      val sourceCount = sourceRDD.map(_._2.originalID).distinct().count()
+      val targetCount = targetRDD.map(_._2.originalID).distinct().count()
+      log.info("DS-JEDAI: Source valid geometries: " + sourceCount)
+      log.info("DS-JEDAI: Target valid geometries: " + targetCount)
+      log.info("DS-JEDAI: Cartesian: " + sourceCount*targetCount)
+    }
+
     val matchingStartTime = Calendar.getInstance().getTimeInMillis
     val giant = GIAnt(sourceRDD, targetRDD, partitioner)
     if (relation.equals(Relation.DE9IM)) {
@@ -92,7 +103,7 @@ object GiantExp {
       log.info("DS-JEDAI: OVERLAPS: " + totalOverlaps)
       log.info("DS-JEDAI: TOUCHES: " + totalTouches)
       log.info("DS-JEDAI: WITHIN: " + totalWithin)
-      log.info("DS-JEDAI: Total Relations Discovered: " + totalRelations)
+      log.info("DS-JEDAI: Total Discovered Relations: " + totalRelations)
     }
     else{
       val totalMatches = giant.countRelation(relation)
diff --git a/src/main/scala/utils/SpaceStatsCounter.scala b/src/main/scala/utils/SpaceStatsCounter.scala
deleted file mode 100644
index a2be58c3..00000000
--- a/src/main/scala/utils/SpaceStatsCounter.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-package utils
-
-import dataModel.{Entity, IM, MBR, SpatialIndex}
-import org.apache.log4j.{Level, LogManager}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.{Partitioner, TaskContext}
-import utils.Constants.Relation
-
-import scala.collection.mutable.ListBuffer
-
-
-case class SpaceStatsCounter(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double)){
-
-  val partitionsZones: Array[MBR] = Utils.getZones
-
-  def printSpaceInfo(): Unit ={
-    val log = LogManager.getRootLogger
-    log.setLevel(Level.INFO)
-
-    val source: RDD[Entity] = joinedRDD.flatMap(_._2._1.map(se => (se.originalID, se))).distinct().map(_._2).setName("Source").cache()
-    val target: RDD[Entity] = joinedRDD.flatMap(_._2._2.map(se => (se.originalID, se))).distinct().map(_._2).setName("target").cache()
-
-    val sourceTiles: RDD[(Int, Int)] = source.flatMap(se => se.index(thetaXY)).setName("SourceTiles").cache()
-    val targetTiles: RDD[(Int, Int)] = target.flatMap(se => se.index(thetaXY)).setName("TargetTiles").cache()
-
-    val ssePerTile: RDD[((Int, Int), Int)] = sourceTiles.map((_,1)).reduceByKey(_ + _)
-    val tsePerTile: RDD[((Int, Int), Int)] = targetTiles.map((_,1)).reduceByKey(_ + _)
-
-    val commonTiles = ssePerTile.join(tsePerTile).setName("CommonTiles").cache()
-    val tiles = commonTiles.map{ case(_, (n1, n2)) => n2}.sum()
-    log.info("Tiles: " + tiles)
-
-    val pairTiles = commonTiles.map{ case(c, (n1, n2)) => n1*n2}.sum()
-    log.info("Pairs Tiles: " + pairTiles)
-
-    sourceTiles.unpersist()
-    targetTiles.unpersist()
-    commonTiles.unpersist()
-
-    val tilesSE = source.flatMap(se => se.index(thetaXY).map(c => (c, ListBuffer(se.originalID)))).reduceByKey(_ ++ _)
-    val tilesTE = target.flatMap(se => se.index(thetaXY).map(c => (c, ListBuffer(se.originalID)))).reduceByKey(_ ++ _)
-    val joinedTiles = tilesSE.rightOuterJoin(tilesTE).filter(_._2._1.isDefined).setName("Joined").cache()
-    val uniquePairs = joinedTiles.flatMap{case(_, (sse, tse)) => tse.map(se => (se, sse.get))}.reduceByKey(_ ++ _).map(_._2.distinct.size).sum
-    log.info("Unique Tiles: " + uniquePairs)
-
-    joinedTiles.unpersist()
-    source.unpersist()
-    target.unpersist()
-
-    val comparisonsRDD: RDD[(Entity, Entity)] = joinedRDD
-      .filter(p => p._2._1.nonEmpty && p._2._2.nonEmpty)
-      .flatMap { p =>
-        val source: Array[Entity] = p._2._1.toArray
-        val target: Iterator[Entity] = p._2._2.toIterator
-        val sourceIndex = index(source)
-        val filteringFunction = (b: (Int, Int)) => sourceIndex.contains(b)
-        val pid = p._1
-        val partition = partitionsZones(pid)
-
-        target.flatMap { targetSE =>
-          targetSE
-            .index(thetaXY, filteringFunction)
-            .flatMap(c => sourceIndex.get(c).map(j => (c, source(j))))
-            .filter{case(c, se) => se.referencePointFiltering(targetSE, c, thetaXY, Some(partition))}
-            .map {case(_, se) => (se, targetSE)}
-        }
-      }.setName("ComparisonsRDD")
-
-    val intersectingTiles = comparisonsRDD.filter{ case (sSE, tSE) => sSE.testMBR(tSE, Relation.INTERSECTS, Relation.TOUCHES)}
-    val truePairs = comparisonsRDD.filter{ case (sSE, tSE) => sSE.testMBR(tSE, Relation.INTERSECTS, Relation.TOUCHES)}.filter{case (sSE, tSE) => IM(sSE, tSE).relate}
-
-    log.info("Intersecting Pairs: " + intersectingTiles.count())
-    log.info("True Pairs: " + truePairs.count())
-    log.info("")
-  }
-
-  /**
-   * index a list of spatial entities
-   *
-   * @param entities list of spatial entities
-   * @return a SpatialIndex
-   */
-  def index(entities: Array[Entity]): SpatialIndex = {
-    val spatialIndex = new SpatialIndex()
-    entities.zipWithIndex.foreach { case (se, index) =>
-      val indices: Seq[(Int, Int)] = se.index(thetaXY)
-      indices.foreach(i => spatialIndex.insert(i, index))
-    }
-    spatialIndex
-  }
-
-
-}
-object SpaceStatsCounter{
-
-  def apply(source:RDD[Entity], target:RDD[Entity], partitioner: Partitioner): SpaceStatsCounter ={
-    val thetaXY = Utils.getTheta
-    val sourcePartitions = source.map(se => (TaskContext.getPartitionId(), se))
-    val targetPartitions = target.map(se => (TaskContext.getPartitionId(), se))
-
-    val joinedRDD = sourcePartitions.cogroup(targetPartitions, partitioner)
-
-    SpaceStatsCounter(joinedRDD, thetaXY)
-  }
-}
-

From 2fb9062c9aa0dc09e6321b647683c34d4e8bf98e Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Feb 2021 11:20:21 +0200
Subject: [PATCH 04/25] fixing no-class-found
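The no-class-found failures appear to come from RDD closures referencing
the Utils singleton: calling Utils.getWeight inside a task forces Utils
to be initialized on every executor, and if that initialization only
succeeds on the driver, later accesses die with NoClassDefFoundError.
The weighting logic therefore moves into the (serializable) trait whose
instance is shipped with the task anyway, and Utils itself is marked
Serializable. A minimal sketch of the pattern -- hypothetical names, for
illustration only:

    object DriverSideUtils {                           // one copy per JVM
      val spark = SparkSession.builder().getOrCreate() // driver-only state
      def weight(x: Double): Double = x / 2
    }
    // rdd.map(DriverSideUtils.weight)   // may fail on executors

    trait Weighting extends Serializable {
      def weight(x: Double): Double = x / 2  // travels with the task instance
    }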
---
 .../ProgressiveGeospatialInterlinkingT.scala | 46 +++++++++++++++++--
 src/main/scala/utils/Utils.scala             | 45 ++----------------
 2 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
index ddd7fe36..33299d31 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala
@@ -2,15 +2,17 @@ package geospatialInterlinking.progressive

 import dataModel.{Entity, IM, MBR, WeightedPair, WeightedPairsPQ}
 import geospatialInterlinking.GeospatialInterlinkingT
+import org.apache.commons.math3.stat.inference.ChiSquareTest
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import utils.Constants.Relation.Relation
 import utils.Constants.WeightingScheme.WeightingScheme
-import utils.Constants.Relation
-import utils.Utils
+import utils.Constants.{Relation, WeightingScheme}
+import utils.Utils.totalBlocks

 import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
+import scala.math.{ceil, floor, max, min}

 trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{
   val budget: Int
@@ -21,13 +23,49 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{

   def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ

-  def getMainWeight(e1: Entity, e2: Entity): Float = Utils.getWeight(e1, e2, mainWS)
+  def getMainWeight(e1: Entity, e2: Entity): Float = getWeight(e1, e2, mainWS)

   def getSecondaryWeight(e1: Entity, e2: Entity): Float =
     secondaryWS match {
-      case Some(ws) => Utils.getWeight(e1, e2, ws)
+      case Some(ws) => getWeight(e1, e2, ws)
       case None => 0f
     }

+  /**
+   * Weight a comparison
+   * TODO: ensure that float does not produce issues
+   *
+   * @param e1 Spatial entity
+   * @param e2 Spatial entity
+   * @return weight
+   */
+  def getWeight(e1: Entity, e2: Entity, ws: WeightingScheme): Float = {
+    val e1Blocks = (ceil(e1.mbr.maxX/thetaXY._1).toInt - floor(e1.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e1.mbr.maxY/thetaXY._2).toInt - floor(e1.mbr.minY/thetaXY._2).toInt + 1)
+    val e2Blocks = (ceil(e2.mbr.maxX/thetaXY._1).toInt - floor(e2.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e2.mbr.maxY/thetaXY._2).toInt - floor(e2.mbr.minY/thetaXY._2).toInt + 1)
+    val cb = (min(ceil(e1.mbr.maxX/thetaXY._1), ceil(e2.mbr.maxX/thetaXY._1)).toInt - max(floor(e1.mbr.minX/thetaXY._1), floor(e2.mbr.minX/thetaXY._1)).toInt + 1) *
+      (min(ceil(e1.mbr.maxY/thetaXY._2), ceil(e2.mbr.maxY/thetaXY._2)).toInt - max(floor(e1.mbr.minY/thetaXY._2), floor(e2.mbr.minY/thetaXY._2)).toInt + 1)
+
+    ws match {
+      case WeightingScheme.MBR_INTERSECTION =>
+        val intersectionArea = e1.mbr.getIntersectingMBR(e2.mbr).getArea
+        intersectionArea / (e1.mbr.getArea + e2.mbr.getArea - intersectionArea)
+
+      case WeightingScheme.POINTS =>
+        1f / (e1.geometry.getNumPoints + e2.geometry.getNumPoints);
+
+      case WeightingScheme.JS =>
+        cb / (e1Blocks + e2Blocks - cb)
+
+      case WeightingScheme.PEARSON_X2 =>
+        val v1: Array[Long] = Array[Long](cb, (e2Blocks - cb).toLong)
+        val v2: Array[Long] = Array[Long]((e1Blocks - cb).toLong, (totalBlocks - (v1(0) + v1(1) + (e1Blocks - cb))).toLong)
+        val chiTest = new ChiSquareTest()
+        chiTest.chiSquare(Array(v1, v2)).toFloat
+
+      case WeightingScheme.CF | _ =>
+        cb.toFloat
+    }
+  }
+
   /**
    * Get the DE-9IM of the top most related entities based
diff --git a/src/main/scala/utils/Utils.scala b/src/main/scala/utils/Utils.scala
index e70e7204..5f7a8045 100644
--- a/src/main/scala/utils/Utils.scala
+++ b/src/main/scala/utils/Utils.scala
@@ -3,31 +3,28 @@ package utils

 import dataModel.{Entity, MBR}
 import com.vividsolutions.jts.geom.Geometry
-import org.apache.commons.math3.stat.inference.ChiSquareTest
 import org.apache.log4j.{LogManager, Logger}
 import org.apache.spark.TaskContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
 import org.apache.spark.sql.{Encoder, Encoders, Row, SparkSession}
-import utils.Constants.{ThetaOption, WeightingScheme}
+import utils.Constants.ThetaOption
 import utils.Constants.ThetaOption.ThetaOption
-import utils.Constants.WeightingScheme.WeightingScheme

 import scala.collection.mutable
-import scala.math.{ceil, floor, max, min}
 import scala.reflect.ClassTag

 /**
  * @author George Mandilaras < gmandi@di.uoa.gr > (National and Kapodistrian University of Athens)
  */
-object Utils {
+object Utils extends Serializable {

   val spark: SparkSession = SparkSession.builder().getOrCreate()

   var thetaOption: ThetaOption = _
   var source: RDD[MBR] = spark.sparkContext.emptyRDD
-  var partitionsZones: Array[MBR] = _
+  var partitionsZones: Array[MBR] = Array()
   lazy val sourceCount: Long = source.count()
   lazy val thetaXY: (Double, Double) = initTheta()

@@ -87,42 +84,6 @@ object Utils extends Serializable {
     (tx, ty)
   }

-  /**
-   * Weight a comparison
-   * TODO: ensure that float does not produce issues
-   *
-   * @param e1 Spatial entity
-   * @param e2 Spatial entity
-   * @return weight
-   */
-  def getWeight(e1: Entity, e2: Entity, ws: WeightingScheme): Float = {
-    val e1Blocks = (ceil(e1.mbr.maxX/thetaXY._1).toInt - floor(e1.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e1.mbr.maxY/thetaXY._2).toInt - floor(e1.mbr.minY/thetaXY._2).toInt + 1)
-    val e2Blocks = (ceil(e2.mbr.maxX/thetaXY._1).toInt - floor(e2.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e2.mbr.maxY/thetaXY._2).toInt - floor(e2.mbr.minY/thetaXY._2).toInt + 1)
-    val cb = (min(ceil(e1.mbr.maxX/thetaXY._1), ceil(e2.mbr.maxX/thetaXY._1)).toInt - max(floor(e1.mbr.minX/thetaXY._1), floor(e2.mbr.minX/thetaXY._1)).toInt + 1) *
-      (min(ceil(e1.mbr.maxY/thetaXY._2), ceil(e2.mbr.maxY/thetaXY._2)).toInt - max(floor(e1.mbr.minY/thetaXY._2), floor(e2.mbr.minY/thetaXY._2)).toInt + 1)
-
-    ws match {
-      case WeightingScheme.MBR_INTERSECTION =>
-        val intersectionArea = e1.mbr.getIntersectingMBR(e2.mbr).getArea
-        intersectionArea / (e1.mbr.getArea + e2.mbr.getArea - intersectionArea)
-
-      case WeightingScheme.POINTS =>
-        1f / (e1.geometry.getNumPoints + e2.geometry.getNumPoints);
-
-      case WeightingScheme.JS =>
-        cb / (e1Blocks + e2Blocks - cb)
-
-      case WeightingScheme.PEARSON_X2 =>
-        val v1: Array[Long] = Array[Long](cb, (e2Blocks - cb).toLong)
-        val v2: Array[Long] = Array[Long]((e1Blocks - cb).toLong, (totalBlocks - (v1(0) + v1(1) + (e1Blocks - cb))).toLong)
-        val chiTest = new ChiSquareTest()
-        chiTest.chiSquare(Array(v1, v2)).toFloat
-
-      case WeightingScheme.CF | _ =>
-        cb.toFloat
-    }
-  }
-
   def getZones: Array[MBR] ={

     val (thetaX, thetaY) = thetaXY

From 09528bb13f45ede4f5a7fbb0b668b1285e64341b Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 22 Feb 2021 13:46:15 +0200
Subject: [PATCH 05/25] fixing no-class-found
src/main/scala/utils/Utils.scala | 2 -- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala index 33299d31..c6476058 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala @@ -8,7 +8,6 @@ import org.apache.spark.storage.StorageLevel import utils.Constants.Relation.Relation import utils.Constants.WeightingScheme.WeightingScheme import utils.Constants.{Relation, WeightingScheme} -import utils.Utils.totalBlocks import scala.collection.mutable import scala.collection.mutable.ListBuffer @@ -19,6 +18,15 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ val mainWS: WeightingScheme val secondaryWS: Option[WeightingScheme] + lazy val totalBlocks: Double = { + val globalMinX: Double = partitionsZones.map(p => p.minX / thetaXY._1).min + val globalMaxX: Double = partitionsZones.map(p => p.maxX / thetaXY._1).max + val globalMinY: Double = partitionsZones.map(p => p.minY / thetaXY._2).min + val globalMaxY: Double = partitionsZones.map(p => p.maxY / thetaXY._2).max + + (globalMaxX - globalMinX + 1) * (globalMaxY - globalMinY + 1) + } + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ def getMainWeight(e1: Entity, e2: Entity): Float = getWeight(e1, e2, mainWS) diff --git a/src/main/scala/utils/Utils.scala b/src/main/scala/utils/Utils.scala index 5f7a8045..6642738b 100644 --- a/src/main/scala/utils/Utils.scala +++ b/src/main/scala/utils/Utils.scala @@ -48,8 +48,6 @@ object Utils extends Serializable { lazy val globalMinY: Double = partitionsZones.map(p => p.minY / thetaXY._2).min lazy val globalMaxY: Double = partitionsZones.map(p => p.maxY / thetaXY._2).max - lazy val totalBlocks: Double = (globalMaxX - globalMinX + 1) * (globalMaxY - globalMinY + 1) - /** * initialize theta based on theta granularity From 010efebb4a5e6991242300725b6965efdf193031 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 22 Feb 2021 15:03:10 +0200 Subject: [PATCH 06/25] optional secondary weight --- src/main/scala/experiments/EvaluationExp.scala | 6 +++--- src/main/scala/experiments/ProgressiveExp.scala | 2 +- .../progressive/ProgressiveAlgorithmsFactory.scala | 11 +++++------ src/main/scala/utils/ConfigurationParser.scala | 5 ++++- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index b450a1d0..6afe02ed 100644 --- a/src/main/scala/experiments/EvaluationExp.scala +++ b/src/main/scala/experiments/EvaluationExp.scala @@ -92,16 +92,16 @@ object EvaluationExp { log.info("DS-JEDAI: Total Qualifying Pairs: " + totalRelatedPairs) log.info("\n") - printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, WeightingScheme.CF)) + printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, None)) val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK) - val weightingSchemes = Seq((WeightingScheme.JS, WeightingScheme.MBR_INTERSECTION), (WeightingScheme.PEARSON_X2, WeightingScheme.POINTS)) + val weightingSchemes = 
Seq((WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), (WeightingScheme.PEARSON_X2, Option(WeightingScheme.POINTS))) for (a <- algorithms ; ws <- weightingSchemes) printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws) } def printResults(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], partitioner: Partitioner, totalRelations: Int, - ma: ProgressiveAlgorithm, ws: (WeightingScheme, WeightingScheme), n: Int = 10): Unit = { + ma: ProgressiveAlgorithm, ws: (WeightingScheme, Option[WeightingScheme]), n: Int = 10): Unit = { val pma = ProgressiveAlgorithmsFactory.get(ma, source, target, partitioner, budget, ws._1, ws._2) val results = pma.evaluate(relation, n, totalRelations, takeBudget) diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala index f7cfb697..2f91629d 100644 --- a/src/main/scala/experiments/ProgressiveExp.scala +++ b/src/main/scala/experiments/ProgressiveExp.scala @@ -67,7 +67,7 @@ object ProgressiveExp { val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions val budget: Int = if (options.contains("budget")) options("budget").toInt else conf.getBudget val mainWS: WeightingScheme = if (options.contains("mws")) WeightingScheme.withName(options("mws")) else conf.getMainWS - val secondaryWS: WeightingScheme = if (options.contains("sws")) WeightingScheme.withName(options("sws")) else conf.getSecondaryWS + val secondaryWS: Option[WeightingScheme] = if (options.contains("sws")) Option(WeightingScheme.withName(options("sws"))) else conf.getSecondaryWS val pa: ProgressiveAlgorithm = if (options.contains("pa")) ProgressiveAlgorithm.withName(options("pa")) else conf.getProgressiveAlgorithm val gridType: GridType.GridType = if (options.contains("gt")) GridType.withName(options("gt").toString) else conf.getGridType val relation = conf.getRelation diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala index 03beed7e..0da0571c 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala @@ -1,31 +1,30 @@ package geospatialInterlinking.progressive import dataModel.Entity -import org.apache.log4j.{LogManager, Logger} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.ProgressiveAlgorithm.ProgressiveAlgorithm import utils.Constants.WeightingScheme.WeightingScheme -import utils.Constants.{ProgressiveAlgorithm, WeightingScheme} +import utils.Constants.ProgressiveAlgorithm object ProgressiveAlgorithmsFactory { def get(matchingAlgorithm: ProgressiveAlgorithm, source: RDD[(Int, Entity)], target: RDD[(Int, Entity)], - partitioner: Partitioner, budget: Int = 0, mainWS: WeightingScheme, secondaryWS: WeightingScheme): + partitioner: Partitioner, budget: Int = 0, mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme]): ProgressiveGeospatialInterlinkingT ={ matchingAlgorithm match { case ProgressiveAlgorithm.RANDOM => - RandomScheduling(source, target, mainWS, Option(secondaryWS), budget, partitioner) + RandomScheduling(source, target, mainWS, secondaryWS, budget, partitioner) // case ProgressiveAlgorithm.GEOMETRY_CENTRIC => // GeometryCentric(source, target, ws, budget, partitioner) case ProgressiveAlgorithm.TOPK => - TopKPairs(source, target, mainWS, 
Option(secondaryWS), budget, partitioner) + TopKPairs(source, target, mainWS, secondaryWS, budget, partitioner) case ProgressiveAlgorithm.RECIPROCAL_TOPK => ReciprocalTopK(source, target, mainWS, Option(secondaryWS), budget, partitioner) case ProgressiveAlgorithm.PROGRESSIVE_GIANT | _ => - ProgressiveGIAnt(source, target, mainWS, Option(secondaryWS), budget, partitioner) + ProgressiveGIAnt(source, target, mainWS, secondaryWS, budget, partitioner) } } } diff --git a/src/main/scala/utils/ConfigurationParser.scala b/src/main/scala/utils/ConfigurationParser.scala index 59952b0c..05c452ff 100644 --- a/src/main/scala/utils/ConfigurationParser.scala +++ b/src/main/scala/utils/ConfigurationParser.scala @@ -85,7 +85,10 @@ case class Configuration(source: DatasetConfigurations, target:DatasetConfigurat def getMainWS: WeightingScheme = WeightingScheme.withName(configurations.getOrElse(YamlConfiguration.CONF_MAIN_WS, "JS")) - def getSecondaryWS: WeightingScheme = WeightingScheme.withName(configurations.getOrElse(YamlConfiguration.CONF_SECONDARY_WS, "MBR_INTERSECTION")) + def getSecondaryWS: Option[WeightingScheme] = configurations.get(YamlConfiguration.CONF_SECONDARY_WS) match { + case Some(ws) => Option(WeightingScheme.withName(ws)) + case None => None + } def getGridType: GridType = GridType.withName(configurations.getOrElse(YamlConfiguration.CONF_GRIDTYPE, "QUADTREE")) From e085a0e7902acc1ece4b015a1557e397e640a33e Mon Sep 17 00:00:00 2001 From: George Date: Tue, 23 Feb 2021 18:04:47 +0200 Subject: [PATCH 07/25] dynamic progressive giant --- src/main/scala/dataModel/WeightedPair.scala | 4 +- .../scala/dataModel/WeightedPairsPQ.scala | 8 + .../scala/experiments/EvaluationExp.scala | 11 +- .../progressive/DynamicProgressiveGIAnt.scala | 210 ++++++++++++++++++ .../ProgressiveAlgorithmsFactory.scala | 4 +- src/main/scala/utils/Constants.scala | 1 + src/main/scala/utils/SpatialReader.scala | 7 +- 7 files changed, 234 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala index 05bfa7a4..b7713b25 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/dataModel/WeightedPair.scala @@ -8,12 +8,12 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon // descendant order if (o.entityId1 == entityId1 && o.entityId2 == entityId2) return 0 - val test1 = o.mainWeight - mainWeight + val test1 = o.getMainWeight - getMainWeight if (0 < test1) return 1 if (test1 < 0) return -1 - val test2 = o.secondaryWeight - secondaryWeight + val test2 = o.getSecondaryWeight - getSecondaryWeight if (0 < test2) return 1 if (test2 < 0) return -1 diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/dataModel/WeightedPairsPQ.scala index b0add368..83d1f43b 100644 --- a/src/main/scala/dataModel/WeightedPairsPQ.scala +++ b/src/main/scala/dataModel/WeightedPairsPQ.scala @@ -22,6 +22,14 @@ case class WeightedPairsPQ(maxSize: Int){ case None => Iterator.continually{ pq.pollFirst() }.takeWhile(_ => !pq.isEmpty) } + def dynamicUpdate(wp: WeightedPair): Unit ={ + val exists = pq.remove(wp) + if (exists){ + wp.incrementRelatedMatches() + enqueue(wp) + } + } + def take(n: Int): Iterator[WeightedPair] = take(Option(n)) def dequeueAll: Iterator[WeightedPair] = take(None) diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index 
6afe02ed..5aa2bbfb 100644
--- a/src/main/scala/experiments/EvaluationExp.scala
+++ b/src/main/scala/experiments/EvaluationExp.scala
@@ -92,13 +92,16 @@ object EvaluationExp {
     log.info("DS-JEDAI: Total Qualifying Pairs: " + totalRelatedPairs)
     log.info("\n")
 
-    printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, None))
-    val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK)
-    val weightingSchemes = Seq((WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), (WeightingScheme.PEARSON_X2, Option(WeightingScheme.POINTS)))
+    //printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, None))
+    val algorithms = Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT)
+    val weightingSchemes = Seq(
+        (WeightingScheme.CF, None), (WeightingScheme.JS, None), (WeightingScheme.PEARSON_X2, None),
+        (WeightingScheme.MBR_INTERSECTION, None), (WeightingScheme.POINTS, None),
+        (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), (WeightingScheme.PEARSON_X2, Option(WeightingScheme.POINTS)))
     for (a <- algorithms ; ws <- weightingSchemes)
       printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws)
   }
-
+n
   def printResults(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], partitioner: Partitioner, totalRelations: Int,
                    ma: ProgressiveAlgorithm, ws: (WeightingScheme, Option[WeightingScheme]), n: Int = 10): Unit = {
 
diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala
new file mode 100644
index 00000000..ce09311e
--- /dev/null
+++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala
@@ -0,0 +1,210 @@
+package geospatialInterlinking.progressive
+
+import dataModel.{Entity, IM, MBR, WeightedPair, WeightedPairsPQ}
+import org.apache.spark.Partitioner
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+import utils.Constants.Relation
+import utils.Constants.Relation.Relation
+import utils.Constants.WeightingScheme.WeightingScheme
+import utils.Utils
+
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+
+case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double),
+                                   mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int)
+  extends ProgressiveGeospatialInterlinkingT {
+
+
+  /**
+   * First index source and then for each entity of target, find its comparisons using source's index.
+   * Weight the comparisons according to the input weighting scheme and sort them using a PQ.
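+   * For intuition, a minimal sketch of the resulting order (the weights below are
+   * hypothetical, chosen only for illustration): pairs are polled by descending main
+   * weight, and ties fall through to the secondary weight:
+   * {{{
+   *   val a = WeightedPair(1, 2, 0.8f, 0.3f)
+   *   val b = WeightedPair(3, 4, 0.8f, 0.5f)
+   *   a.compareTo(b) > 0   // true: b is polled before a, thanks to its secondary weight
+   * }}}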
+   *
+   * @param partition the MBR of the partition
+   * @param source source
+   * @param target target
+   * @return a PQ with the top comparisons
+   */
+  def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={
+    val sourceIndex = index(source)
+    val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b)
+    val pq: WeightedPairsPQ = WeightedPairsPQ(budget)
+
+    // weight and put the comparisons in a PQ
+    target
+      .indices
+      .foreach {j =>
+        val e2 = target(j)
+        e2.index(thetaXY, filterIndices)
+          .foreach { block =>
+            sourceIndex.get(block)
+              .filter(i => source(i).filter(e2, relation, block, thetaXY, Some(partition)))
+              .foreach { i =>
+                val e1 = source(i)
+                val w = getMainWeight(e1, e2)
+                val secW = getSecondaryWeight(e1, e2)
+                val wp = WeightedPair(i, j, w, secW)
+                pq.enqueue(wp)
+              }
+          }
+      }
+    pq
+  }
+
+  /**
+   * Get the DE-9IM of the top-most related entities based
+   * on the input budget and the Weighting Scheme
+   * @return an RDD of IM
+   */
+  override def getDE9IM: RDD[IM] ={
+    joinedRDD.filter(j => j._2._1.nonEmpty && j._2._2.nonEmpty)
+      .flatMap{ p =>
+        val pid = p._1
+        val partition = partitionsZones(pid)
+        val source = p._2._1.toArray
+        val target = p._2._2.toArray
+
+        val pq = prioritize(source, target, partition, Relation.DE9IM)
+        val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+        val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+
+        if (!pq.isEmpty)
+          Iterator.continually{
+            val wp = pq.dequeue()
+            val e1 = source(wp.entityId1)
+            val e2 = target(wp.entityId2)
+            val im = IM(e1, e2)
+            val isRelated = im.relate
+            if (isRelated){
+              sourceCandidates.getOrElse(wp.entityId1, List()).foreach(wp => pq.dynamicUpdate(wp))
+              targetCandidates.getOrElse(wp.entityId2, List()).foreach(wp => pq.dynamicUpdate(wp))
+            }
+            im
+          }.takeWhile(_ => !pq.isEmpty)
+        else Iterator()
+      }
+  }
+
+
+  /**
+   * Examine the Relation of the top-most related entities based
+   * on the input budget and the Weighting Scheme
+   * @param relation the relation to examine
+   * @return an RDD of pair of IDs
+   */
+  override def relate(relation: Relation): RDD[(String, String)] = {
+    joinedRDD.filter(j => j._2._1.nonEmpty && j._2._2.nonEmpty)
+      .flatMap{ p =>
+        val pid = p._1
+        val partition = partitionsZones(pid)
+        val source = p._2._1.toArray
+        val target = p._2._2.toArray
+
+        val pq = prioritize(source, target, partition, relation)
+        val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+        val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+        if (!pq.isEmpty)
+          Iterator.continually{
+            val wp = pq.dequeue()
+            val e1 = source(wp.entityId1)
+            val e2 = target(wp.entityId2)
+            val isRelated = e1.relate(e2, relation)
+            if (isRelated){
+              sourceCandidates.getOrElse(wp.entityId1, List()).foreach(wp => pq.dynamicUpdate(wp))
+              targetCandidates.getOrElse(wp.entityId2, List()).foreach(wp => pq.dynamicUpdate(wp))
+            }
+            (isRelated, (e1.originalID, e2.originalID))
+          }.filter(_._1).map(_._2)
+        else Iterator()
+      }
+  }
+
+
+  /**
+   * Compute PGR - first weight and perform the comparisons in each partition,
+   * then collect them in descending order and compute the progressive True Positives.
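+   * For instance, with a hypothetical sequence of four verifications whose outcomes
+   * are [true, false, true, false] and two qualifying pairs overall:
+   * {{{
+   *   // qp after each verification: 1, 1, 2, 2  =>  progressiveQP = 1 + 1 + 2 + 2 = 6
+   *   // PGR = (progressiveQP / qualifiedPairsWithinBudget) / verifications
+   *   //     = (6 / 2) / 4 = 0.75
+   * }}}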
+   *
+   * @param relation the examined relation
+   * @return (PGR, total interlinked Geometries (TP), total comparisons)
+   */
+  override def evaluate(relation: Relation, n: Int = 10, totalQualifiedPairs: Double, takeBudget: Seq[Int]): Seq[(Double, Long, Long, (List[Int], List[Int]))] ={
+    // compute the weighted comparisons
+    val matches: RDD[(WeightedPair, Boolean)] = joinedRDD
+      .filter(p => p._2._1.nonEmpty && p._2._2.nonEmpty)
+      .flatMap { p =>
+        val pid = p._1
+        val partition = partitionsZones(pid)
+        val source = p._2._1.toArray
+        val target = p._2._2.toArray
+
+        val pq = prioritize(source, target, partition, relation)
+        val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+        val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2))
+        if (!pq.isEmpty)
+          Iterator.continually{
+            val wp = pq.dequeue()
+            val e1 = source(wp.entityId1)
+            val e2 = target(wp.entityId2)
+            val isRelatedAndPair = relation match {
+              case Relation.DE9IM => (wp, IM(e1, e2).relate)
+              case _ => (wp, e1.relate(e2, relation))
+            }
+            if (isRelatedAndPair._2){
+              sourceCandidates.getOrElse(wp.entityId1, List()).foreach(wp => pq.dynamicUpdate(wp))
+              targetCandidates.getOrElse(wp.entityId2, List()).foreach(wp => pq.dynamicUpdate(wp))
+            }
+            isRelatedAndPair
+          }.takeWhile(_ => !pq.isEmpty)
+        else Iterator()
+      }.persist(StorageLevel.MEMORY_AND_DISK)
+
+    var results = mutable.ListBuffer[(Double, Long, Long, (List[Int], List[Int]))]()
+    for(b <- takeBudget){
+      // compute AUC prioritizing the comparisons based on their weight
+      val sorted = matches.takeOrdered(b)
+      val verifications = sorted.length
+      val step = math.ceil(verifications/n)
+
+      var progressiveQP: Double = 0
+      var qp = 0
+      val verificationSteps = ListBuffer[Int]()
+      val qualifiedPairsSteps = ListBuffer[Int]()
+
+      sorted
+        .map(_._2)
+        .zipWithIndex
+        .foreach{
+          case (r, i) =>
+            if (r) qp += 1
+            progressiveQP += qp
+            if (i % step == 0){
+              qualifiedPairsSteps += qp
+              verificationSteps += i
+            }
+        }
+      qualifiedPairsSteps += qp
+      verificationSteps += verifications
+      val qualifiedPairsWithinBudget = if (totalQualifiedPairs < verifications) totalQualifiedPairs else verifications
+      val pgr = (progressiveQP/qualifiedPairsWithinBudget)/verifications.toDouble
+      results += ((pgr, qp, verifications, (verificationSteps.toList, qualifiedPairsSteps.toList)))
+    }
+    matches.unpersist()
+    results
+  }
+}
+
+
+/**
+ * auxiliary constructor
+ */
+object DynamicProgressiveGIAnt {
+
+  def apply(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], ws: WeightingScheme, sws: Option[WeightingScheme] = None,
+            budget: Int, partitioner: Partitioner): DynamicProgressiveGIAnt ={
+    val thetaXY = Utils.getTheta
+    val joinedRDD = source.cogroup(target, partitioner)
+    DynamicProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget)
+  }
+
+}
\ No newline at end of file
diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
index 0da0571c..e8a28877 100644
--- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
+++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala
@@ -22,7 +22,9 @@ object ProgressiveAlgorithmsFactory {
       case ProgressiveAlgorithm.TOPK =>
         TopKPairs(source, target, mainWS, secondaryWS, budget, partitioner)
       case
ProgressiveAlgorithm.RECIPROCAL_TOPK => - ReciprocalTopK(source, target, mainWS, Option(secondaryWS), budget, partitioner) + ReciprocalTopK(source, target, mainWS, secondaryWS, budget, partitioner) + case ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT => + DynamicProgressiveGIAnt(source, target, mainWS, secondaryWS, budget, partitioner) case ProgressiveAlgorithm.PROGRESSIVE_GIANT | _ => ProgressiveGIAnt(source, target, mainWS, secondaryWS, budget, partitioner) } diff --git a/src/main/scala/utils/Constants.scala b/src/main/scala/utils/Constants.scala index d5f4a06e..407037e7 100644 --- a/src/main/scala/utils/Constants.scala +++ b/src/main/scala/utils/Constants.scala @@ -108,6 +108,7 @@ object Constants { object ProgressiveAlgorithm extends Enumeration { type ProgressiveAlgorithm = Value val PROGRESSIVE_GIANT: Constants.ProgressiveAlgorithm.Value = Value("PROGRESSIVE_GIANT") + val DYNAMIC_PROGRESSIVE_GIANT: Constants.ProgressiveAlgorithm.Value = Value("DYNAMIC_PROGRESSIVE_GIANT") val GEOMETRY_CENTRIC: Constants.ProgressiveAlgorithm.Value = Value("GEOMETRY_CENTRIC") val TOPK: Constants.ProgressiveAlgorithm.Value = Value("TOPK") val RECIPROCAL_TOPK: Constants.ProgressiveAlgorithm.Value = Value("RECIPROCAL_TOPK") diff --git a/src/main/scala/utils/SpatialReader.scala b/src/main/scala/utils/SpatialReader.scala index 2535b6fd..ca0fd631 100644 --- a/src/main/scala/utils/SpatialReader.scala +++ b/src/main/scala/utils/SpatialReader.scala @@ -227,9 +227,8 @@ case class SpatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: C GeoSparkSQLRegistrator.registerAll(spark) ARQ.init() - val asWKT = "http://www.opengis.net/ont/geosparql#asWKT" - val allowedPredicates: mutable.Set[String] = mutable.Set(asWKT) - var sparqlQuery = s"SELECT ?Subject ?WKT WHERE { ?Subject $geometryPredicate ?g. ?g <$asWKT> ?WKT.}" + val allowedPredicates: mutable.Set[String] = mutable.Set() + var sparqlQuery = s"SELECT ?Subject ?WKT WHERE { ?Subject $geometryPredicate ?WKT.}" var query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject FROM GEOMETRIES".stripMargin val cleanGeomPredicate: String = @@ -245,7 +244,7 @@ case class SpatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: C datePredicateValue.substring(1, datePredicateValue.length-1) else datePredicateValue allowedPredicates.add(cleanDatePredicate) - sparqlQuery = s"SELECT ?Subject ?WKT ?Date WHERE { ?Subject ${datePredicate.get} ?Date. ?Subject $geometryPredicate ?g. ?g <$asWKT> ?WKT.}" + sparqlQuery = s"SELECT ?Subject ?WKT ?Date WHERE { ?Subject ${datePredicate.get} ?Date. 
?Subject $geometryPredicate ?WKT.}" query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject, GEOMETRIES.Date FROM GEOMETRIES".stripMargin } From 34572b4ecc571ee2ec4bad774eb903c0917095ff Mon Sep 17 00:00:00 2001 From: George Date: Wed, 24 Feb 2021 14:30:30 +0200 Subject: [PATCH 08/25] solving comparisons' violation --- src/main/scala/dataModel/WeightedPair.scala | 5 +++-- src/main/scala/dataModel/WeightedPairsPQ.scala | 8 ++++---- src/main/scala/experiments/EvaluationExp.scala | 5 ++--- .../progressive/DynamicProgressiveGIAnt.scala | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala index b7713b25..4e82ef30 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/dataModel/WeightedPair.scala @@ -17,8 +17,9 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon if (0 < test2) return 1 if (test2 < 0) return -1 - - o.entityId1 - entityId1 + // Note: Returning just the id leads to comparison method violation + // as may lead to cases that A > B, B > C and C > A + 0 } /** diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/dataModel/WeightedPairsPQ.scala index 83d1f43b..77bd7e6f 100644 --- a/src/main/scala/dataModel/WeightedPairsPQ.scala +++ b/src/main/scala/dataModel/WeightedPairsPQ.scala @@ -11,15 +11,15 @@ case class WeightedPairsPQ(maxSize: Int){ def enqueue(wp: WeightedPair): Unit ={ pq.add(wp) if (pq.size > maxSize) - pq.pollLast() + dequeueLast() } def enqueueAll(items: Iterator[WeightedPair]): Unit = items.foreach(wp => enqueue(wp)) def take(n: Option[Int]): Iterator[WeightedPair] = n match { - case Some(n) => Iterator.continually{ pq.pollFirst() }.take(n) - case None => Iterator.continually{ pq.pollFirst() }.takeWhile(_ => !pq.isEmpty) + case Some(n) => Iterator.continually{ dequeueHead() }.take(n) + case None => Iterator.continually{ dequeueHead() }.takeWhile(_ => !pq.isEmpty) } def dynamicUpdate(wp: WeightedPair): Unit ={ @@ -42,7 +42,7 @@ case class WeightedPairsPQ(maxSize: Int){ def dequeueHead(): WeightedPair = pq.pollFirst() - def dequeue(): WeightedPair = pq.pollLast() + def dequeueLast(): WeightedPair = pq.pollLast() def iterator(): Iterator[WeightedPair] = pq.iterator().asScala } diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index 5aa2bbfb..88d80381 100644 --- a/src/main/scala/experiments/EvaluationExp.scala +++ b/src/main/scala/experiments/EvaluationExp.scala @@ -93,15 +93,14 @@ object EvaluationExp { log.info("\n") //printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, None)) - val algorithms = Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT) + val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK) val weightingSchemes = Seq( - (WeightingScheme.CF, None), (WeightingScheme.JS, None), (WeightingScheme.PEARSON_X2, None), (WeightingScheme.MBR_INTERSECTION, None), (WeightingScheme.POINTS, None), (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), (WeightingScheme.PEARSON_X2, Option(WeightingScheme.POINTS))) for (a <- algorithms ; ws <- weightingSchemes) printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws) } -n + def printResults(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], partitioner: Partitioner, totalRelations: Int, ma: ProgressiveAlgorithm, 
ws: (WeightingScheme, Option[WeightingScheme]), n: Int = 10): Unit = { diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index ce09311e..0b90d327 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -71,7 +71,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera if (!pq.isEmpty) Iterator.continually{ - val wp = pq.dequeue() + val wp = pq.dequeueHead() val e1 = source(wp.entityId1) val e2 = target(wp.entityId2) val im = IM(e1, e2) @@ -106,7 +106,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) if (!pq.isEmpty) Iterator.continually{ - val wp = pq.dequeue() + val wp = pq.dequeueHead() val e1 = source(wp.entityId1) val e2 = target(wp.entityId2) val isRelated = e1.relate(e2, relation) @@ -143,7 +143,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) if (!pq.isEmpty) Iterator.continually{ - val wp = pq.dequeue() + val wp = pq.dequeueHead() val e1 = source(wp.entityId1) val e2 = target(wp.entityId2) val isRelatedAndPair = relation match { From ed5cd598d554f148ca5314e3249c81e14cd60ce7 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 24 Feb 2021 22:56:12 +0200 Subject: [PATCH 09/25] wp with id --- src/main/scala/dataModel/WeightedPair.scala | 8 ++++---- .../progressive/DynamicProgressiveGIAnt.scala | 5 +++-- .../progressive/ProgressiveGIAnt.scala | 5 +++-- .../progressive/RandomScheduling.scala | 4 +++- .../progressive/ReciprocalTopK.scala | 4 +++- .../geospatialInterlinking/progressive/TopKPairs.scala | 4 +++- 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala index 4e82ef30..5510c6f1 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/dataModel/WeightedPair.scala @@ -1,12 +1,12 @@ package dataModel -case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ +case class WeightedPair(id: Int, entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ var relatedMatches: Int = 0 override def compareTo(o: WeightedPair): Int = { // descendant order - if (o.entityId1 == entityId1 && o.entityId2 == entityId2) return 0 + if (o.id == id) return 0 val test1 = o.getMainWeight - getMainWeight if (0 < test1) return 1 @@ -19,7 +19,7 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon if (test2 < 0) return -1 // Note: Returning just the id leads to comparison method violation // as may lead to cases that A > B, B > C and C > A - 0 + id - o.id } /** @@ -34,5 +34,5 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon def incrementRelatedMatches(): Unit = relatedMatches += 1 - override def toString: String = "E1 : " + entityId1 + ", E2 : " + entityId2 + ", main weight : " + getMainWeight + ", secondary weight : " + getSecondaryWeight + 
override def toString: String = s"ID: $id E1 : $entityId1 E2 : $entityId2 main weight : $getMainWeight secondary weight : $getSecondaryWeight" } diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index 0b90d327..e1bcb0e0 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -30,7 +30,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) - + var counter: Int = 0 // weight and put the comparisons in a PQ target .indices @@ -44,8 +44,9 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala index 876f7f48..bae5ac39 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala @@ -26,7 +26,7 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) - + var counter: Int = 0 // weight and put the comparisons in a PQ target .indices @@ -40,8 +40,9 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala index 24c42131..002308c7 100644 --- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala +++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala @@ -25,6 +25,7 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) val rnd = new scala.util.Random + var counter: Int = 0 // weight and put the comparisons in a PQ target .indices @@ -37,8 +38,9 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent .foreach { i => val w = rnd.nextFloat() val secW = rnd.nextFloat() - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala index 56acdc3e..686e5a67 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala @@ -35,6 +35,7 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit 
val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) val targetSet: Array[Set[Int]] = new Array(target.length) + var counter: Int = 0 target.indices .foreach{j => val e2 = target(j) @@ -46,7 +47,8 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) + counter += 1 // set top-K PQ for the examining target entity targetPQ.enqueue(wp) diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala index 9f93608e..c32a7e5f 100644 --- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala +++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala @@ -31,6 +31,7 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k) val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) + var counter: Int = 0 target.indices .foreach{ j => @@ -43,7 +44,8 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) + counter += 1 // set top-K PQ for the examining target entity targetPQ.enqueue(wp) From e648e03e968aa02b075bccaa2bf9a766a8b6cef1 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 25 Feb 2021 10:21:48 +0200 Subject: [PATCH 10/25] wp without ids --- src/main/scala/dataModel/WeightedPair.scala | 22 +++++++++++++------ .../progressive/DynamicProgressiveGIAnt.scala | 4 +--- .../progressive/ProgressiveGIAnt.scala | 4 +--- .../progressive/RandomScheduling.scala | 4 +--- .../progressive/ReciprocalTopK.scala | 4 +--- .../progressive/TopKPairs.scala | 4 +--- 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala index 5510c6f1..3db8dece 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/dataModel/WeightedPair.scala @@ -1,12 +1,21 @@ package dataModel -case class WeightedPair(id: Int, entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ +case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ var relatedMatches: Int = 0 + /** + * Note: ID based comparison leads to violation of comparable contract + * as may lead to cases that A > B, B > C and C > A. This is because the ids + * indicate the index of each entity in the partitions array, if they are collected + * it may lead to violations. + * + * CompareTo will sort elements in a descendant order + * + * @param o a weighted pair + * @return 1 if o is greater, 0 if they are equal, -1 if o is lesser. 
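+ * Caveat (illustrative, with hypothetical pairs): returning 0 for two distinct
+ * pairs that merely tie on both weights is safe for a priority queue that admits
+ * duplicates, but an ordered-set structure conflates such pairs:
+ * {{{
+ *   val a = WeightedPair(1, 2, 0.5f, 0.5f)
+ *   val b = WeightedPair(3, 4, 0.5f, 0.5f)
+ *   a.compareTo(b) == 0                      // true under this ordering
+ *   val ts = new java.util.TreeSet[WeightedPair]()
+ *   ts.add(a); ts.add(b)
+ *   ts.size                                  // 1: b is rejected as a duplicate
+ * }}}
+ * This matters once the PQ implementation switches to a TreeSet later in this series.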
+ */ override def compareTo(o: WeightedPair): Int = { - // descendant order - if (o.id == id) return 0 val test1 = o.getMainWeight - getMainWeight if (0 < test1) return 1 @@ -17,9 +26,8 @@ case class WeightedPair(id: Int, entityId1: Int, entityId2: Int, mainWeight: Flo if (0 < test2) return 1 if (test2 < 0) return -1 - // Note: Returning just the id leads to comparison method violation - // as may lead to cases that A > B, B > C and C > A - id - o.id + + 0 } /** @@ -34,5 +42,5 @@ case class WeightedPair(id: Int, entityId1: Int, entityId2: Int, mainWeight: Flo def incrementRelatedMatches(): Unit = relatedMatches += 1 - override def toString: String = s"ID: $id E1 : $entityId1 E2 : $entityId2 main weight : $getMainWeight secondary weight : $getSecondaryWeight" + override def toString: String = s"E1 : $entityId1 E2 : $entityId2 main weight : $getMainWeight secondary weight : $getSecondaryWeight" } diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index e1bcb0e0..d58cc9db 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -30,7 +30,6 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) - var counter: Int = 0 // weight and put the comparisons in a PQ target .indices @@ -44,9 +43,8 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(counter, i, j, w, secW) + val wp = WeightedPair(i, j, w, secW) pq.enqueue(wp) - counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala index bae5ac39..a6e03549 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala @@ -26,7 +26,6 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) - var counter: Int = 0 // weight and put the comparisons in a PQ target .indices @@ -40,9 +39,8 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(counter, i, j, w, secW) + val wp = WeightedPair(i, j, w, secW) pq.enqueue(wp) - counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala index 002308c7..24c42131 100644 --- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala +++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala @@ -25,7 +25,6 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) val rnd = new scala.util.Random - var counter: Int = 0 // weight and put the 
comparisons in a PQ target .indices @@ -38,9 +37,8 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent .foreach { i => val w = rnd.nextFloat() val secW = rnd.nextFloat() - val wp = WeightedPair(counter, i, j, w, secW) + val wp = WeightedPair(i, j, w, secW) pq.enqueue(wp) - counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala index 686e5a67..cab45026 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala @@ -35,7 +35,6 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) val targetSet: Array[Set[Int]] = new Array(target.length) - var counter: Int = 0 target.indices .foreach{j => val e2 = target(j) @@ -47,8 +46,7 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(counter, i, j, w, secW) - counter += 1 + val wp = WeightedPair( i, j, w, secW) // set top-K PQ for the examining target entity targetPQ.enqueue(wp) diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala index c32a7e5f..9f93608e 100644 --- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala +++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala @@ -31,7 +31,6 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k) val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) - var counter: Int = 0 target.indices .foreach{ j => @@ -44,8 +43,7 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(counter, i, j, w, secW) - counter += 1 + val wp = WeightedPair(i, j, w, secW) // set top-K PQ for the examining target entity targetPQ.enqueue(wp) From 387158b7485ef5ff64aeefc980193944a4fb3b97 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 1 Mar 2021 11:09:15 +0200 Subject: [PATCH 11/25] fixing comparator issue --- .../progressive/ProgressiveGeospatialInterlinkingT.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala index c6476058..84c94844 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala @@ -39,7 +39,6 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ /** * Weight a comparison - * TODO: ensure that float does not produce issues * * @param e1 Spatial entity * @param e2 Spatial entity @@ -54,7 +53,8 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ ws match { case WeightingScheme.MBR_INTERSECTION => val intersectionArea = e1.mbr.getIntersectingMBR(e2.mbr).getArea - intersectionArea / (e1.mbr.getArea + e2.mbr.getArea - intersectionArea) + val w = 
intersectionArea / (e1.mbr.getArea + e2.mbr.getArea - intersectionArea) + if (!w.isNaN) w else 0f case WeightingScheme.POINTS => 1f / (e1.geometry.getNumPoints + e2.geometry.getNumPoints); From 77ccef2b380e6aa91c216b60d2d31cb0b380d465 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 1 Mar 2021 12:03:14 +0200 Subject: [PATCH 12/25] improving experiments --- .../scala/experiments/EvaluationExp.scala | 33 ++++++++++++++++--- .../progressive/DynamicProgressiveGIAnt.scala | 10 +++--- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index 88d80381..b88cdc45 100644 --- a/src/main/scala/experiments/EvaluationExp.scala +++ b/src/main/scala/experiments/EvaluationExp.scala @@ -52,6 +52,12 @@ object EvaluationExp { nextOption(map ++ Map("budget" -> value), tail) case "-gt" :: value :: tail => nextOption(map ++ Map("gt" -> value), tail) + case "-tv" :: value :: tail => + nextOption(map ++ Map("tv" -> value), tail) + case "-qp" :: value :: tail => + nextOption(map ++ Map("qp" -> value), tail) + case "-pa" :: value :: tail => + nextOption(map ++ Map("pa" -> value), tail) case _ :: tail => log.warn("DS-JEDAI: Unrecognized argument") nextOption(map, tail) @@ -86,17 +92,34 @@ object EvaluationExp { val targetRDD = reader.load(conf.target) val partitioner = reader.partitioner - val (_, _, _, _, _, _, _, _, _, totalVerifications, totalRelatedPairs) = GIAnt(sourceRDD, targetRDD, partitioner).countAllRelations + val (totalVerifications, totalRelatedPairs) = + if (options.contains("tv") && options.contains("qp")) + (options("tv").toInt, options("qp").toInt) + else { + val g = GIAnt(sourceRDD, targetRDD, partitioner).countAllRelations + (g._10, g._11) + } log.info("DS-JEDAI: Total Verifications: " + totalVerifications) log.info("DS-JEDAI: Total Qualifying Pairs: " + totalRelatedPairs) log.info("\n") //printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, ProgressiveAlgorithm.RANDOM, (WeightingScheme.CF, None)) - val algorithms = Seq(ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK) - val weightingSchemes = Seq( - (WeightingScheme.MBR_INTERSECTION, None), (WeightingScheme.POINTS, None), - (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), (WeightingScheme.PEARSON_X2, Option(WeightingScheme.POINTS))) + + val algorithms: Seq[ProgressiveAlgorithm] = + if (options.contains("pa")) + options("pa").split(",").filter(ProgressiveAlgorithm.exists).map(ProgressiveAlgorithm.withName).toSeq + else + Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT) + + val weightingSchemes = Seq((WeightingScheme.CF, None), + (WeightingScheme.JS, None), + (WeightingScheme.PEARSON_X2,None), + (WeightingScheme.MBR_INTERSECTION, None), + (WeightingScheme.POINTS, None), + (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), + (WeightingScheme.PEARSON_X2, Option(WeightingScheme.MBR_INTERSECTION))) + for (a <- algorithms ; ws <- weightingSchemes) printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws) } diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index d58cc9db..2593190a 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -145,15 +145,15 @@ case class 
DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val wp = pq.dequeueHead() val e1 = source(wp.entityId1) val e2 = target(wp.entityId2) - val isRelatedAndPair = relation match { - case Relation.DE9IM => (wp, IM(e1, e2).relate) - case _ => (wp, e1.relate(e2, relation)) + val isRelated = relation match { + case Relation.DE9IM => IM(e1, e2).relate + case _ => e1.relate(e2, relation) } - if (isRelatedAndPair._2){ + if (isRelated){ sourceCandidates.getOrElse(wp.entityId1, List()).foreach(wp => pq.dynamicUpdate(wp)) targetCandidates.getOrElse(wp.entityId2, List()).foreach(wp => pq.dynamicUpdate(wp)) } - isRelatedAndPair + (wp, isRelated) }.takeWhile(_ => !pq.isEmpty) else Iterator() }.persist(StorageLevel.MEMORY_AND_DISK) From 3f500ee9d4e8714fe2d0507eac00b970a81721a4 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 2 Mar 2021 12:02:45 +0200 Subject: [PATCH 13/25] treeset instead of minmax --- src/main/scala/dataModel/WeightedPairsPQ.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/dataModel/WeightedPairsPQ.scala index 77bd7e6f..bb220d00 100644 --- a/src/main/scala/dataModel/WeightedPairsPQ.scala +++ b/src/main/scala/dataModel/WeightedPairsPQ.scala @@ -1,12 +1,11 @@ package dataModel -import org.spark_project.guava.collect.MinMaxPriorityQueue +import java.util import scala.collection.JavaConverters._ case class WeightedPairsPQ(maxSize: Int){ - lazy val pq: MinMaxPriorityQueue[WeightedPair] = MinMaxPriorityQueue.maximumSize(maxSize+1).create() - + val pq: util.TreeSet[WeightedPair] = new util.TreeSet[WeightedPair]() def enqueue(wp: WeightedPair): Unit ={ pq.add(wp) From 6964ea3ea444276aa1511f9203972c22257d3e14 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 3 Mar 2021 11:12:50 +0200 Subject: [PATCH 14/25] wp counter --- src/main/scala/dataModel/WeightedPair.scala | 6 ++++-- .../progressive/DynamicProgressiveGIAnt.scala | 4 +++- .../progressive/ProgressiveGIAnt.scala | 4 +++- .../progressive/RandomScheduling.scala | 4 +++- .../geospatialInterlinking/progressive/ReciprocalTopK.scala | 4 +++- .../geospatialInterlinking/progressive/TopKPairs.scala | 4 +++- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/dataModel/WeightedPair.scala index 3db8dece..00db6dc9 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/dataModel/WeightedPair.scala @@ -1,6 +1,6 @@ package dataModel -case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ +case class WeightedPair(counter: Int, entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ var relatedMatches: Int = 0 @@ -17,6 +17,8 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon */ override def compareTo(o: WeightedPair): Int = { + if (entityId1 == o.entityId1 && entityId2 == o.entityId2) return 0 + val test1 = o.getMainWeight - getMainWeight if (0 < test1) return 1 @@ -27,7 +29,7 @@ case class WeightedPair(entityId1: Int, entityId2: Int, mainWeight: Float, secon if (test2 < 0) return -1 - 0 + o.counter - counter } /** diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index 2593190a..a3a5491b 100644 --- 
a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -30,6 +30,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) + var counter = 0 // weight and put the comparisons in a PQ target .indices @@ -43,8 +44,9 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala index a6e03549..6cbb6bea 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala @@ -26,6 +26,7 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) + var counter = 0 // weight and put the comparisons in a PQ target .indices @@ -39,8 +40,9 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala index 24c42131..03bb7838 100644 --- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala +++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala @@ -25,6 +25,7 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) val pq: WeightedPairsPQ = WeightedPairsPQ(budget) val rnd = new scala.util.Random + var counter = 0 // weight and put the comparisons in a PQ target .indices @@ -37,8 +38,9 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent .foreach { i => val w = rnd.nextFloat() val secW = rnd.nextFloat() - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) pq.enqueue(wp) + counter += 1 } } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala index cab45026..33d1dff9 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala @@ -33,6 +33,7 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(targetK) val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) + var counter = 0 val targetSet: Array[Set[Int]] = new Array(target.length) target.indices @@ -46,7 +47,8 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val 
e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair( i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) + counter += 1 // set top-K PQ for the examining target entity targetPQ.enqueue(wp) diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala index 9f93608e..c26b851d 100644 --- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala +++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala @@ -31,6 +31,7 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k) val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) + var counter = 0 target.indices .foreach{ j => @@ -43,7 +44,8 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] val e1 = source(i) val w = getMainWeight(e1, e2) val secW = getSecondaryWeight(e1, e2) - val wp = WeightedPair(i, j, w, secW) + val wp = WeightedPair(counter, i, j, w, secW) + counter += 1 // set top-K PQ for the examining target entity targetPQ.enqueue(wp) From 5375c9385cc4af03ec5b0b724078c0436f083328 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 3 Mar 2021 11:13:07 +0200 Subject: [PATCH 15/25] counting verifications --- src/main/scala/geospatialInterlinking/GIAnt.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/geospatialInterlinking/GIAnt.scala b/src/main/scala/geospatialInterlinking/GIAnt.scala index f7f426f9..83642e82 100644 --- a/src/main/scala/geospatialInterlinking/GIAnt.scala +++ b/src/main/scala/geospatialInterlinking/GIAnt.scala @@ -63,7 +63,7 @@ case class GIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], th .filter { case (block, i) => source(i).filter(e2, Relation.DE9IM, block, thetaXY, Some(partition)) } .map(_._2) .map(i => IM(source(i), e2)) - .filter(_.relate) + //.filter(_.relate) .force } } From dbb30ebe8e768f8ae2dab4371a3418ec92f7e92b Mon Sep 17 00:00:00 2001 From: George Date: Wed, 3 Mar 2021 13:39:51 +0200 Subject: [PATCH 16/25] local budget --- src/main/scala/dataModel/WeightedPairsPQ.scala | 2 +- .../progressive/DynamicProgressiveGIAnt.scala | 8 +++++--- .../progressive/ProgressiveGIAnt.scala | 8 +++++--- .../progressive/RandomScheduling.scala | 9 ++++++--- .../progressive/ReciprocalTopK.scala | 12 +++++++----- .../progressive/TopKPairs.scala | 10 ++++++---- 6 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/dataModel/WeightedPairsPQ.scala index bb220d00..f3d16c91 100644 --- a/src/main/scala/dataModel/WeightedPairsPQ.scala +++ b/src/main/scala/dataModel/WeightedPairsPQ.scala @@ -3,7 +3,7 @@ package dataModel import java.util import scala.collection.JavaConverters._ -case class WeightedPairsPQ(maxSize: Int){ +case class WeightedPairsPQ(maxSize: Long){ val pq: util.TreeSet[WeightedPair] = new util.TreeSet[WeightedPair]() diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala index a3a5491b..954ee20b 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala @@ -13,7 +13,7 @@ import scala.collection.mutable import 
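The queue itself stays budget-bounded after the TreeSet switch: enqueue adds the pair and, once the size exceeds maxSize (widened to Long here, since the local budgets introduced below are computed as Long), pollLast() evicts the weakest element. A small usage sketch with hypothetical pairs:

    val pq = WeightedPairsPQ(maxSize = 2)
    pq.enqueue(WeightedPair(0, 1, 10, 0.9f, 0.1f))
    pq.enqueue(WeightedPair(1, 2, 11, 0.4f, 0.1f))
    pq.enqueue(WeightedPair(2, 3, 12, 0.7f, 0.1f)) // evicts the 0.4f pair via pollLast
    assert(pq.size() == 2 && pq.dequeueHead().getMainWeight == 0.9f)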
scala.collection.mutable.ListBuffer case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), - mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int) + mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) extends ProgressiveGeospatialInterlinkingT { @@ -27,9 +27,10 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera * @return a PQ with the top comparisons */ def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={ + val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(budget) + val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) var counter = 0 // weight and put the comparisons in a PQ target @@ -205,7 +206,8 @@ object DynamicProgressiveGIAnt { budget: Int, partitioner: Partitioner): DynamicProgressiveGIAnt ={ val thetaXY = Utils.getTheta val joinedRDD = source.cogroup(target, partitioner) - DynamicProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget) + val sourceEntities = Utils.sourceCount + DynamicProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget, sourceEntities.toInt) } } \ No newline at end of file diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala index 6cbb6bea..0bf2549e 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala @@ -9,7 +9,7 @@ import utils.Utils case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), - mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int) + mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) extends ProgressiveGeospatialInterlinkingT { @@ -23,9 +23,10 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent * @return a PQ with the top comparisons */ def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={ + val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(budget) + val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) var counter = 0 // weight and put the comparisons in a PQ target @@ -61,7 +62,8 @@ object ProgressiveGIAnt { budget: Int, partitioner: Partitioner): ProgressiveGIAnt ={ val thetaXY = Utils.getTheta val joinedRDD = source.cogroup(target, partitioner) - ProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget) + val sourceEntities = Utils.sourceCount + ProgressiveGIAnt(joinedRDD, thetaXY, ws, sws, budget, sourceEntities.toInt) } } diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala index 03bb7838..9619fc05 100644 --- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala +++ b/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala @@ -8,7 +8,7 @@ import 
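The per-partition rule is identical in every prioritizer this patch touches: a partition receives a slice of the global budget proportional to its share of the source entities, doubled as a slack factor. Worked through with hypothetical numbers:

    val budget = 10000          // global budget (BU)
    val sourceEntities = 100000 // source entities across all partitions
    val partitionSource = 2500  // source entities in this partition
    val localBudget = (math.ceil(budget * partitionSource.toDouble / sourceEntities.toDouble) * 2).toLong
    // localBudget == 500L, i.e. 2.5% of the global budget times the slack factor 2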
utils.Constants.WeightingScheme.WeightingScheme import utils.Utils case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), - mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int) + mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) extends ProgressiveGeospatialInterlinkingT { @@ -21,9 +21,11 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent * @return a PQ with the top comparisons */ def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toInt + val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(budget) + val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) val rnd = new scala.util.Random var counter = 0 // weight and put the comparisons in a PQ @@ -58,7 +60,8 @@ object RandomScheduling { budget: Int, partitioner: Partitioner): RandomScheduling ={ val thetaXY = Utils.getTheta val joinedRDD = source.cogroup(target, partitioner) - RandomScheduling(joinedRDD, thetaXY, ws, sws, budget) + val sourceEntities = Utils.sourceCount + RandomScheduling(joinedRDD, thetaXY, ws, sws, budget, sourceEntities.toInt) } } diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala index 33d1dff9..65abdcdd 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala @@ -10,7 +10,7 @@ import utils.Utils case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), - mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int) + mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) extends ProgressiveGeospatialInterlinkingT { /** @@ -24,15 +24,16 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit * @return prioritized comparisons as a PQ */ def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val sourceK = (math.ceil(budget / source.length).toInt + 1) * 2 // +1 to avoid k=0 - val targetK = (math.ceil(budget / target.length).toInt + 1) * 2 // +1 to avoid k=0 + val sourceK = (math.ceil(localBudget / source.length).toInt + 1) * 2 // +1 to avoid k=0 + val targetK = (math.ceil(localBudget / target.length).toInt + 1) * 2 // +1 to avoid k=0 val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(targetK) - val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) + val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(localBudget) var counter = 0 val targetSet: Array[Set[Int]] = new Array(target.length) @@ -82,6 +83,7 @@ object ReciprocalTopK{ budget: Int, partitioner: Partitioner): ReciprocalTopK ={ val thetaXY = Utils.getTheta val joinedRDD = source.cogroup(target, partitioner) - ReciprocalTopK(joinedRDD, thetaXY, ws, sws, budget) + val sourceEntities = Utils.sourceCount + 
ReciprocalTopK(joinedRDD, thetaXY, ws, sws, budget, sourceEntities.toInt) } } diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala index c26b851d..c9262c1f 100644 --- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala +++ b/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala @@ -8,7 +8,7 @@ import utils.Constants.WeightingScheme.WeightingScheme import utils.Utils case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), - mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int) + mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) extends ProgressiveGeospatialInterlinkingT { /** @@ -23,14 +23,15 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] * @return prioritized comparisons in a PQ */ def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) // the budget is divided based on the number of entities - val k = (math.ceil(budget / (source.length + target.length)).toInt + 1) * 2 // +1 to avoid k=0 + val k = (math.ceil(localBudget / (source.length + target.length)).toInt + 1) * 2 // +1 to avoid k=0 val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k) - val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(budget) + val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(localBudget) var counter = 0 target.indices @@ -90,6 +91,7 @@ object TopKPairs{ budget: Int, partitioner: Partitioner): TopKPairs ={ val thetaXY = Utils.getTheta val joinedRDD = source.cogroup(target, partitioner) - TopKPairs(joinedRDD, thetaXY, ws, sws, budget) + val sourceEntities = Utils.sourceCount + TopKPairs(joinedRDD, thetaXY, ws, sws, budget, sourceEntities.toInt) } } From 26dcaa900e98e0c385cea82567d42e9c91ab8126 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 9 Mar 2021 10:38:55 +0200 Subject: [PATCH 17/25] IndexedJoin - a well balancing technique --- ...llBalancedExp.scala => BalancingExp.scala} | 79 ++++++++++--------- .../IndexBasedMatching.scala | 70 ---------------- .../IndexedJoinInterlinking.scala | 65 +++++++++++++++ 3 files changed, 107 insertions(+), 107 deletions(-) rename src/main/scala/experiments/{WellBalancedExp.scala => BalancingExp.scala} (61%) delete mode 100644 src/main/scala/geospatialInterlinking/IndexBasedMatching.scala create mode 100644 src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala diff --git a/src/main/scala/experiments/WellBalancedExp.scala b/src/main/scala/experiments/BalancingExp.scala similarity index 61% rename from src/main/scala/experiments/WellBalancedExp.scala rename to src/main/scala/experiments/BalancingExp.scala index 0ab389b7..fbe53fa5 100644 --- a/src/main/scala/experiments/WellBalancedExp.scala +++ b/src/main/scala/experiments/BalancingExp.scala @@ -2,22 +2,22 @@ package experiments import java.util.Calendar -import geospatialInterlinking.{GIAnt, IndexBasedMatching} +import dataModel.Entity +import geospatialInterlinking.{GIAnt, IndexedJoinInterlinking} import org.apache.log4j.{Level, LogManager, Logger} +import org.apache.spark.rdd.RDD import org.apache.spark.serializer.KryoSerializer 
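In TopKPairs above, the local budget is further split evenly over all entities of the partition before the partition-wide queue is filled; the +1 guards against k == 0 for small budgets. With hypothetical partition sizes:

    val localBudget = 500L
    val k = (math.ceil(localBudget / (120 + 80)).toInt + 1) * 2
    // 500L / 200 == 2 by integer division, so k == 6 candidates are kept per entity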
import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext, TaskContext} import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator -import utils.Constants.ProgressiveAlgorithm.ProgressiveAlgorithm -import utils.Constants.{GridType, ProgressiveAlgorithm, Relation, WeightingScheme} -import utils.Constants.WeightingScheme.WeightingScheme +import utils.Constants.{GridType, Relation} import utils.{ConfigurationParser, SpatialReader, Utils} -object WellBalancedExp { +object BalancingExp { - implicit class TuppleAdd(t: (Int, Int, Int, Int, Int, Int, Int, Int, Int, Int, Int)) { + implicit class TupleAdd(t: (Int, Int, Int, Int, Int, Int, Int, Int, Int, Int, Int)) { def +(p: (Int, Int, Int, Int, Int, Int, Int, Int, Int, Int, Int)) : (Int, Int, Int, Int, Int, Int, Int, Int, Int, Int, Int) = (p._1 + t._1, p._2 + t._2, p._3 +t._3, p._4+t._4, p._5+t._5, p._6+t._6, p._7+t._7, p._8+t._8, p._9+t._9, p._10+t._10, p._11+t._11) @@ -44,20 +44,10 @@ object WellBalancedExp { case Nil => map case ("-c" | "-conf") :: value :: tail => nextOption(map ++ Map("conf" -> value), tail) - case ("-f" | "-fraction") :: value :: tail => - nextOption(map ++ Map("fraction" -> value), tail) - case ("-s" | "-stats") :: tail => - nextOption(map ++ Map("stats" -> "true"), tail) case "-auc" :: tail => nextOption(map ++ Map("auc" -> "true"), tail) case ("-p" | "-partitions") :: value :: tail => nextOption(map ++ Map("partitions" -> value), tail) - case ("-b" | "-budget") :: value :: tail => - nextOption(map ++ Map("budget" -> value), tail) - case "-ws" :: value :: tail => - nextOption(map ++ Map("ws" -> value), tail) - case "-ma" :: value :: tail => - nextOption(map ++ Map("ma" -> value), tail) case "-gt" :: value :: tail => nextOption(map ++ Map("gt" -> value), tail) case _ :: tail => @@ -78,41 +68,58 @@ object WellBalancedExp { val confPath = options("conf") val conf = ConfigurationParser.parse(confPath) val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions - val budget: Int = if (options.contains("budget")) options("budget").toInt else conf.getBudget - val ws: WeightingScheme = if (options.contains("ws")) WeightingScheme.withName(options("ws")) else conf.getMainWS - val ma: ProgressiveAlgorithm = if (options.contains("ma")) ProgressiveAlgorithm.withName(options("ma")) else conf.getProgressiveAlgorithm val gridType: GridType.GridType = if (options.contains("gt")) GridType.withName(options("gt").toString) else conf.getGridType val relation = conf.getRelation - log.info("DS-JEDAI: Input Budget: " + budget) - log.info("DS-JEDAI: Weighting Scheme: " + ws.toString) - val startTime = Calendar.getInstance().getTimeInMillis val reader = SpatialReader(conf.source, partitions, gridType) val sourceRDD = reader.load() sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) + val sourcePartitions: RDD[(Int, Iterator[Entity])] = sourceRDD.mapPartitions(si => Iterator((TaskContext.getPartitionId(), si.map(_._2)))) val targetRDD = reader.load(conf.target) + targetRDD.persist(StorageLevel.MEMORY_AND_DISK) val partitioner = reader.partitioner - val partitionEntitiesAVG = sourceRDD.mapPartitions(si => Iterator(si.toArray.length)).sum()/sourceRDD.getNumPartitions - val balancedSource = sourceRDD.mapPartitions(si => Iterator(si.toArray)).filter(_.length < partitionEntitiesAVG*3).flatMap(_.toIterator) - val overloadedSource = sourceRDD.mapPartitions(si => 
Iterator(si.toArray)).filter(_.length >= partitionEntitiesAVG*3).flatMap(_.toIterator)
-    val overloadedPartitionIds = overloadedSource.map(_ => TaskContext.getPartitionId()).collect().toSet
-    val balancedTarget = targetRDD.mapPartitions(ti => Iterator((TaskContext.getPartitionId(), ti))).filter{ case (pid, _) => !overloadedPartitionIds.contains(pid) }.flatMap(_._2)
-    val overloadedTarget = targetRDD.mapPartitions(ti => Iterator((TaskContext.getPartitionId(), ti))).filter{ case (pid, _) => overloadedPartitionIds.contains(pid) }.flatMap(_._2)
-    log.info("DS-JEDAI: Overloaded partitions: " + overloadedPartitionIds.size)
+    val entitiesPerPartitions: Seq[(Int, Int)] = sourcePartitions.map{ case (pid, si) => (pid, si.size)}.collect()
+
+    // find outlier partitions
+    val mean = entitiesPerPartitions.map(_._2).sum/sourceRDD.getNumPartitions
+    val variance = entitiesPerPartitions.map(_._2.toDouble).map(x => math.pow(x - mean, 2)).sum / entitiesPerPartitions.length
+    val std = Math.sqrt(variance)
+    val zScore: (Int, Int) => (Int, Double) = (p: Int, x: Int) => (p, (x - mean).toDouble/std)
+
+    val outliers = entitiesPerPartitions.map{case (p, x) => zScore(p, x)}.filter(_._2 > 1.8)
+    val outlierPartitions = outliers.map(_._1).toSet
     log.info("DS-JEDAI: Overloaded partitions: " + outlierPartitions.size)

-    val matchingStartTime = Calendar.getInstance().getTimeInMillis
+    val goodSourceRDD = sourceRDD.filter(s => !outlierPartitions.contains(s._1))
+    val badSourceRDD = sourceRDD.filter(s => outlierPartitions.contains(s._1))

-    val giant = GIAnt(sourceRDD, targetRDD, partitioner)
-    val ibm = IndexBasedMatching(overloadedSource.map(_._2), overloadedTarget.map(_._2), Utils.getTheta)
+    val goodTargetRDD = targetRDD.filter(t => !outlierPartitions.contains(t._1))
+    val badTargetRDD = targetRDD.filter(t => outlierPartitions.contains(t._1))
+
+    val giant = GIAnt(goodSourceRDD, goodTargetRDD, partitioner)
+    val iji = IndexedJoinInterlinking(badSourceRDD, badTargetRDD, Utils.getTheta)

     if (relation.equals(Relation.DE9IM)) {
-      val (totalContains, totalCoveredBy, totalCovers, totalCrosses, totalEquals, totalIntersects,
-        totalOverlaps, totalTouches, totalWithin, intersectingPairs, interlinkedGeometries) = giant.countAllRelations + ibm.countAllRelations
+      val giantStartTime = Calendar.getInstance().getTimeInMillis
+      val giantResults = giant.countAllRelations
+      val giantEndTime = Calendar.getInstance().getTimeInMillis
+      log.info("DS-JEDAI: GIA.nt Time: " + (giantEndTime - giantStartTime) / 1000.0)
+      log.info("DS-JEDAI: GIA.nt Interlinked Geometries: " + giantResults._11)
+      log.info("-----------------------------------------------------------\n")
+
+      val indexedJoinStartTime = Calendar.getInstance().getTimeInMillis
+      val indexedJoinResults = iji.countAllRelations
+      val indexedJoinEndTime = Calendar.getInstance().getTimeInMillis
+      log.info("DS-JEDAI: INDEXED-JOIN Time: " + (indexedJoinEndTime - indexedJoinStartTime) / 1000.0)
+      log.info("DS-JEDAI: INDEXED-JOIN Interlinked Geometries: " + indexedJoinResults._11)
+      log.info("-----------------------------------------------------------\n")
+
+      val (totalContains, totalCoveredBy, totalCovers, totalCrosses, totalEquals, totalIntersects,
+        totalOverlaps, totalTouches, totalWithin, intersectingPairs, interlinkedGeometries) = giantResults + indexedJoinResults

       val totalRelations = totalContains + totalCoveredBy + totalCovers + totalCrosses + totalEquals + totalIntersects +
         totalOverlaps + totalTouches + totalWithin
       log.info("DS-JEDAI: Total Intersecting Pairs: " + intersectingPairs)
@@
-130,11 +137,9 @@ object WellBalancedExp { log.info("DS-JEDAI: Total Relations Discovered: " + totalRelations) } else{ - val totalMatches = giant.countRelation(relation) + ibm.countRelation(relation) + val totalMatches = giant.countRelation(relation) + iji.countRelation(relation) log.info("DS-JEDAI: " + relation.toString +": " + totalMatches) } - val matchingEndTime = Calendar.getInstance().getTimeInMillis - log.info("DS-JEDAI: Interlinking Time: " + (matchingEndTime - matchingStartTime) / 1000.0) val endTime = Calendar.getInstance() log.info("DS-JEDAI: Total Execution Time: " + (endTime.getTimeInMillis - startTime) / 1000.0) diff --git a/src/main/scala/geospatialInterlinking/IndexBasedMatching.scala b/src/main/scala/geospatialInterlinking/IndexBasedMatching.scala deleted file mode 100644 index d54f2cda..00000000 --- a/src/main/scala/geospatialInterlinking/IndexBasedMatching.scala +++ /dev/null @@ -1,70 +0,0 @@ -package geospatialInterlinking - -import dataModel.{Entity, IM} -import org.apache.spark.TaskContext -import org.apache.spark.rdd.RDD -import utils.Constants.Relation -import utils.Constants.Relation.Relation -import utils.Constants.WeightingScheme.WeightingScheme - -import scala.collection.mutable.ListBuffer - -case class IndexBasedMatching(source:RDD[Entity], target:RDD[Entity], thetaXY: (Double, Double)) extends GeospatialInterlinkingT { - - val joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] = null - val ws: WeightingScheme = null - - val filteringFunction: ((Entity, Int), (Entity, Int), (Int, Int), Relation) => Boolean = - (e1: (Entity, Int), e2: (Entity, Int), c: (Int, Int), r: Relation) => e1._2 == e2._2 && e1._1.filter(e2._1, r, c, thetaXY) - - /** - * First index the Source and then use the index to find the comparisons with target's entities. 
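A standalone sketch of the z-score test that flags the overloaded partitions above, with hypothetical per-partition counts (1.8 is the threshold used at this stage):

    val counts = Seq(100, 110, 90, 105, 2000) // source entities per partition
    val mean = counts.sum / counts.length
    val variance = counts.map(_.toDouble).map(x => math.pow(x - mean, 2)).sum / counts.length
    val std = math.sqrt(variance)
    val outlierPartitions = counts.zipWithIndex.collect { case (x, p) if (x - mean) / std > 1.8 => p }.toSet
    // only the 2000-entity partition exceeds 1.8 sigma; it is routed to the
    // indexed join while the rest stay with GIA.nt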
- * Filter the redundant comparisons using testMBR and RF - * - * @param relation the examining relation - * @return an RDD containing the matching pairs - */ - def relate(relation: Relation): RDD[(String, String)] = { - - val indexedSource = source - .map(se => (se.index(thetaXY), (se, TaskContext.getPartitionId()))) - .flatMap{case (indices, (se, pid)) => indices.map(i => (i, ListBuffer((se, pid))))} - .reduceByKey(_ ++ _) - val partitioner = indexedSource.partitioner.get - - val indexedTarget = target - .map(se => (se.index(thetaXY), (se, TaskContext.getPartitionId()))) - .flatMap{case (indices, (se, pid)) => indices.map(i => (i, ListBuffer((se, pid))))} - .reduceByKey(partitioner, _ ++ _) - - indexedSource.leftOuterJoin(indexedTarget, partitioner) - .filter(_._2._2.isDefined) - .flatMap { case (c: (Int, Int), (s: ListBuffer[(Entity, Int)], optT: Option[ListBuffer[(Entity, Int)]])) => - for (e1 <- s; e2 <- optT.get; if filteringFunction(e1, e2, c, relation)) - yield (e1._1.originalID, e2._1.originalID) - } - } - - - - def getDE9IM: RDD[IM] = { - val indexedSource = source - .map(se => (se.index(thetaXY), (se, TaskContext.getPartitionId()))) - .flatMap{case (indices, (se, pid)) => indices.map(i => (i, ListBuffer((se, pid))))} - .reduceByKey(_ ++ _) - val partitioner = indexedSource.partitioner.get - - val indexedTarget = target - .map(se => (se.index(thetaXY), (se, TaskContext.getPartitionId()))) - .flatMap{case (indices, (se, pid)) => indices.map(i => (i, ListBuffer((se, pid))))} - .reduceByKey(partitioner, _ ++ _) - - indexedSource.leftOuterJoin(indexedTarget, partitioner) - .filter(_._2._2.isDefined) - .flatMap { case (c: (Int, Int), (s: ListBuffer[(Entity, Int)], optT: Option[ListBuffer[(Entity, Int)]])) => - for (e1 <- s; e2 <- optT.get; if filteringFunction(e1, e2, c, Relation.DE9IM)) yield IM(e1._1, e2._1) - } - } - - -} diff --git a/src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala b/src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala new file mode 100644 index 00000000..3c411861 --- /dev/null +++ b/src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala @@ -0,0 +1,65 @@ +package geospatialInterlinking + +import dataModel.{Entity, IM} +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD +import utils.Constants.Relation +import utils.Constants.Relation.Relation +import utils.Constants.WeightingScheme.WeightingScheme + + + + +case class IndexedJoinInterlinking(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], thetaXY: (Double, Double)) extends GeospatialInterlinkingT { + + val joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] = null + val ws: WeightingScheme = null + val partitioner = new HashPartitioner(source.getNumPartitions) + + val filteringFunction: ((Int, Entity), (Int, Entity), (Int, Int), Relation) => Boolean = + (e1: (Int, Entity), e2: (Int, Entity), c: (Int, Int), r: Relation) => + e1._1 == e2._1 && e1._2.filter(e2._2, r, c, thetaXY, Some(partitionsZones(e1._1))) + + + def indexedJoin(): RDD[((Int, Int), (Iterable[(Int, Entity)], Iterable[(Int, Entity)]))] = { + val indexedSource: RDD[((Int, Int), Iterable[(Int, Entity)])] = source + .map(se => (se._2.index(thetaXY), se)) + .flatMap{ case (indices, (pid, se)) => indices.map(i => (i, (pid, se)))} + .groupByKey(partitioner) + + val indexedTarget: RDD[((Int, Int), Iterable[(Int, Entity)])] = target + .map(se => (se._2.index(thetaXY), se)) + .flatMap{ case (indices, (pid, se)) => indices.map(i => (i, (pid, se)))} + .groupByKey(partitioner) + + 
indexedSource.leftOuterJoin(indexedTarget, partitioner) + .filter(_._2._2.isDefined) + .map(p => (p._1, (p._2._1, p._2._2.get))) + } + + /** + * First index Source and then use index to find the comparisons with the entities of Target. + * Filter the redundant comparisons using the spatial Filters + * + * @param relation the examining relation + * @return an RDD containing the matching pairs + */ + def relate(relation: Relation): RDD[(String, String)] = + indexedJoin() + .flatMap { case (c: (Int, Int), ( source: Iterable[(Int, Entity)], target: Iterable[(Int, Entity)])) => + for (e1 <- source; e2 <- target; if filteringFunction(e1, e2, c, relation) && e1._2.relate(e2._2, relation)) + yield (e1._2.originalID, e2._2.originalID) + } + + + def getDE9IM: RDD[IM] = { + val indexedSeq = indexedJoin() + indexedSeq + .flatMap { case (c: (Int, Int), (source: Iterable[(Int, Entity)], target: Iterable[(Int, Entity)])) => + for (e1 <- source; e2 <- target; if filteringFunction(e1, e2, c, Relation.DE9IM)) + yield IM(e1._2, e2._2) + } + } + + +} From 34ecdc0cb90db3f17f23b7ddac189ae790127a75 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Mar 2021 17:30:11 +0200 Subject: [PATCH 18/25] minor changes --- src/main/scala/experiments/ProgressiveExp.scala | 2 ++ src/main/scala/geospatialInterlinking/GIAnt.scala | 2 +- .../scala/geospatialInterlinking/GeospatialInterlinkingT.scala | 2 -- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala index 2f91629d..6e799499 100644 --- a/src/main/scala/experiments/ProgressiveExp.scala +++ b/src/main/scala/experiments/ProgressiveExp.scala @@ -47,6 +47,8 @@ object ProgressiveExp { nextOption(map ++ Map("pa" -> value), tail) case "-gt" :: value :: tail => nextOption(map ++ Map("gt" -> value), tail) + case ("-p" | "-partitions") :: value :: tail => + nextOption(map ++ Map("partitions" -> value), tail) case _ :: tail => log.warn("DS-JEDAI: Unrecognized argument") nextOption(map, tail) diff --git a/src/main/scala/geospatialInterlinking/GIAnt.scala b/src/main/scala/geospatialInterlinking/GIAnt.scala index 83642e82..f7f426f9 100644 --- a/src/main/scala/geospatialInterlinking/GIAnt.scala +++ b/src/main/scala/geospatialInterlinking/GIAnt.scala @@ -63,7 +63,7 @@ case class GIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], th .filter { case (block, i) => source(i).filter(e2, Relation.DE9IM, block, thetaXY, Some(partition)) } .map(_._2) .map(i => IM(source(i), e2)) - //.filter(_.relate) + .filter(_.relate) .force } } diff --git a/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala index e39783a4..58e7a67f 100644 --- a/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala +++ b/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala @@ -15,7 +15,6 @@ trait GeospatialInterlinkingT { val partitionsZones: Array[MBR] = SparkContext.getOrCreate().broadcast(Utils.getZones).value - /** * index a list of spatial entities * @@ -65,7 +64,6 @@ trait GeospatialInterlinkingT { } (totalContains, totalCoveredBy, totalCovers, totalCrosses, totalEquals, totalIntersects, totalOverlaps, totalTouches, totalWithin, verifications, qualifiedPairs) - } def countAllRelations: (Int, Int, Int, Int, Int, Int, Int, Int, Int, Int, Int) = From f47d4b8618a9457ac16d97bbbcf3f6e62cec7db2 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 12 Mar 2021 20:13:40 +0200 Subject: [PATCH 19/25] 
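IndexedJoinInterlinking above replaces GIA.nt's partition-local verification with a tile-keyed join, so the contents of the overloaded partitions get redistributed across the cluster. A minimal sketch of the same join pattern, with plain tuples standing in for Entity and a hypothetical tilesOf function in place of Entity.index:

    import org.apache.spark.HashPartitioner
    import org.apache.spark.rdd.RDD

    def tileJoin(source: RDD[(Int, String)], target: RDD[(Int, String)],
                 tilesOf: String => Seq[(Int, Int)])
    : RDD[((Int, Int), (Iterable[(Int, String)], Iterable[(Int, String)]))] = {
      val partitioner = new HashPartitioner(source.getNumPartitions)
      // key every (pid, entity) record by each tile it spans
      val indexedSource = source
        .flatMap { case (pid, e) => tilesOf(e).map(tile => (tile, (pid, e))) }
        .groupByKey(partitioner)
      val indexedTarget = target
        .flatMap { case (pid, e) => tilesOf(e).map(tile => (tile, (pid, e))) }
        .groupByKey(partitioner)
      indexedSource.leftOuterJoin(indexedTarget, partitioner)
        .filter(_._2._2.isDefined) // keep only tiles with candidates on both sides
        .map { case (tile, (s, t)) => (tile, (s, t.get)) }
    }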
minor + export --- src/main/scala/experiments/BalancingExp.scala | 4 +--- src/main/scala/experiments/ProgressiveExp.scala | 3 ++- src/main/scala/utils/Utils.scala | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/main/scala/experiments/BalancingExp.scala b/src/main/scala/experiments/BalancingExp.scala index fbe53fa5..6204a664 100644 --- a/src/main/scala/experiments/BalancingExp.scala +++ b/src/main/scala/experiments/BalancingExp.scala @@ -44,8 +44,6 @@ object BalancingExp { case Nil => map case ("-c" | "-conf") :: value :: tail => nextOption(map ++ Map("conf" -> value), tail) - case "-auc" :: tail => - nextOption(map ++ Map("auc" -> "true"), tail) case ("-p" | "-partitions") :: value :: tail => nextOption(map ++ Map("partitions" -> value), tail) case "-gt" :: value :: tail => @@ -90,7 +88,7 @@ object BalancingExp { val std = Math.sqrt(variance) val zScore: (Int, Int) => (Int, Double) = (p: Int, x: Int) => (p, (x - mean).toDouble/std) - val outliers = entitiesPerPartitions.map{case (p, x) => zScore(p, x)}.filter(_._2 > 1.8) + val outliers = entitiesPerPartitions.map{case (p, x) => zScore(p, x)}.filter(_._2 > 2.5) val outlierPartitions = outliers.map(_._1).toSet log.info("DS-JEDAI: Overloaded partitions: " + outlierPartitions.size) diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala index 6e799499..ee36ac1e 100644 --- a/src/main/scala/experiments/ProgressiveExp.scala +++ b/src/main/scala/experiments/ProgressiveExp.scala @@ -75,7 +75,8 @@ object ProgressiveExp { val relation = conf.getRelation log.info("DS-JEDAI: Input Budget: " + budget) - log.info("DS-JEDAI: Weighting Scheme: " + mainWS.toString) + log.info("DS-JEDAI: Main Weighting Scheme: " + mainWS.toString) + if (secondaryWS.isDefined) log.info("DS-JEDAI: Secondary Weighting Scheme: " + secondaryWS.get.toString) log.info("DS-JEDAI: Progressive Algorithm: " + pa.toString) val startTime = Calendar.getInstance().getTimeInMillis diff --git a/src/main/scala/utils/Utils.scala b/src/main/scala/utils/Utils.scala index 6642738b..fae02edd 100644 --- a/src/main/scala/utils/Utils.scala +++ b/src/main/scala/utils/Utils.scala @@ -125,12 +125,12 @@ object Utils extends Serializable { log.info("Unique blocks: " + pSet.size) } - def export(rdd: RDD[Entity], path:String): Unit ={ + def export(rdd: RDD[(String, String)], path:String): Unit ={ val schema = StructType( - StructField("id", IntegerType, nullable = true) :: - StructField("wkt", StringType, nullable = true) :: Nil + StructField("id1", StringType, nullable = true) :: + StructField("id2", StringType, nullable = true) :: Nil ) - val rowRDD: RDD[Row] = rdd.map(s => new GenericRowWithSchema(Array(TaskContext.getPartitionId(), s.geometry.toText), schema)) + val rowRDD: RDD[Row] = rdd.map(s => new GenericRowWithSchema(Array(s._1, s._2), schema)) val df = spark.createDataFrame(rowRDD, schema) df.write.option("header", "true").csv(path) } From 9f5f68f51718ffad6022afac22ed70aa387695a4 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Mar 2021 16:12:45 +0200 Subject: [PATCH 20/25] readers and textual reader for RDF --- .../scala/experiments/EvaluationExp.scala | 30 +- src/main/scala/experiments/GiantExp.scala | 9 +- .../scala/experiments/ProgressiveExp.scala | 9 +- .../ProgressiveGeospatialInterlinkingT.scala | 2 +- src/main/scala/utils/SpatialReader.scala | 312 ------------------ src/main/scala/utils/readers/CSVReader.scala | 75 +++++ .../utils/readers/GeospatialReader.scala | 97 ++++++ 
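Utils.export now persists discovered links rather than geometries; a usage sketch (the links handle and output path are hypothetical):

    val links: RDD[(String, String)] = iji.relate(relation) // (originalID, originalID) pairs
    Utils.export(links, "/tmp/links") // writes id1,id2 CSV rows with a header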
.../scala/utils/readers/RDFGraphReader.scala | 113 +++++++ src/main/scala/utils/readers/Reader.scala | 105 ++++++ .../scala/utils/readers/ReaderFactory.scala | 5 + 10 files changed, 422 insertions(+), 335 deletions(-) delete mode 100644 src/main/scala/utils/SpatialReader.scala create mode 100644 src/main/scala/utils/readers/CSVReader.scala create mode 100644 src/main/scala/utils/readers/GeospatialReader.scala create mode 100644 src/main/scala/utils/readers/RDFGraphReader.scala create mode 100644 src/main/scala/utils/readers/Reader.scala create mode 100644 src/main/scala/utils/readers/ReaderFactory.scala diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index b88cdc45..f2425be5 100644 --- a/src/main/scala/experiments/EvaluationExp.scala +++ b/src/main/scala/experiments/EvaluationExp.scala @@ -15,7 +15,8 @@ import utils.Constants.ProgressiveAlgorithm.ProgressiveAlgorithm import utils.Constants.Relation.Relation import utils.Constants.WeightingScheme.WeightingScheme import utils.Constants.{GridType, ProgressiveAlgorithm, Relation, WeightingScheme} -import utils.{ConfigurationParser, SpatialReader, Utils} +import utils.readers.Reader +import utils.{ConfigurationParser, Utils} object EvaluationExp { @@ -24,7 +25,7 @@ object EvaluationExp { log.setLevel(Level.INFO) var budget: Int = 10000 - var takeBudget: Seq[Int] = Seq(5000000, 10000000) + var takeBudget: Seq[Int] = Seq(500000, 1000000) var relation: Relation = Relation.DE9IM def main(args: Array[String]): Unit = { @@ -83,13 +84,13 @@ object EvaluationExp { log.info("DS-JEDAI: Input Budget: " + budget) - val reader = SpatialReader(conf.source, partitions, gridType) - val sourceRDD = reader.load() + val reader = Reader(conf.source, partitions, gridType) + val sourceRDD = reader.spatialLoad() sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions") - val targetRDD = reader.load(conf.target) + val targetRDD = reader.spatialLoad(conf.target) val partitioner = reader.partitioner val (totalVerifications, totalRelatedPairs) = @@ -110,15 +111,16 @@ object EvaluationExp { if (options.contains("pa")) options("pa").split(",").filter(ProgressiveAlgorithm.exists).map(ProgressiveAlgorithm.withName).toSeq else - Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT) - - val weightingSchemes = Seq((WeightingScheme.CF, None), - (WeightingScheme.JS, None), - (WeightingScheme.PEARSON_X2,None), - (WeightingScheme.MBR_INTERSECTION, None), - (WeightingScheme.POINTS, None), - (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), - (WeightingScheme.PEARSON_X2, Option(WeightingScheme.MBR_INTERSECTION))) + Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT, ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK) + + val weightingSchemes = Seq((WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION))) +// (WeightingScheme.CF, None), +// (WeightingScheme.JS, None), +// (WeightingScheme.PEARSON_X2,None), +// (WeightingScheme.MBR_INTERSECTION, None), +// (WeightingScheme.POINTS, None), +// (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), +// (WeightingScheme.PEARSON_X2, Option(WeightingScheme.MBR_INTERSECTION))) for (a <- algorithms ; ws <- weightingSchemes) printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws) diff --git a/src/main/scala/experiments/GiantExp.scala 
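All experiments now share the same reader front end; assembled from the diffs above, the load flow is:

    val reader = Reader(conf.source, partitions, gridType)
    val sourceRDD = reader.spatialLoad()            // source fixes the spatial partitioner
    Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
    val targetRDD = reader.spatialLoad(conf.target) // target reuses the same partitioner
    val partitioner = reader.partitioner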
b/src/main/scala/experiments/GiantExp.scala index 017bd5fa..a2e216c8 100644 --- a/src/main/scala/experiments/GiantExp.scala +++ b/src/main/scala/experiments/GiantExp.scala @@ -11,7 +11,8 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import utils.Constants.{GridType, Relation} -import utils.{ConfigurationParser, SpatialReader, Utils} +import utils.readers.Reader +import utils.{ConfigurationParser, Utils} object GiantExp { @@ -66,13 +67,13 @@ object GiantExp { val startTime = Calendar.getInstance().getTimeInMillis - val reader = SpatialReader(conf.source, partitions, gridType) - val sourceRDD = reader.load() + val reader = Reader(conf.source, partitions, gridType) + val sourceRDD = reader.spatialLoad() sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions") - val targetRDD = reader.load(conf.target) + val targetRDD = reader.spatialLoad(conf.target) val partitioner = reader.partitioner if(printCount){ diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala index ee36ac1e..c108fda2 100644 --- a/src/main/scala/experiments/ProgressiveExp.scala +++ b/src/main/scala/experiments/ProgressiveExp.scala @@ -12,7 +12,8 @@ import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import utils.Constants.ProgressiveAlgorithm.ProgressiveAlgorithm import utils.Constants.{GridType, ProgressiveAlgorithm, Relation, WeightingScheme} import utils.Constants.WeightingScheme.WeightingScheme -import utils.{ConfigurationParser, SpatialReader, Utils} +import utils.readers.Reader +import utils.{ConfigurationParser, Utils} object ProgressiveExp { @@ -81,13 +82,13 @@ object ProgressiveExp { val startTime = Calendar.getInstance().getTimeInMillis - val reader = SpatialReader(conf.source, partitions, gridType) - val sourceRDD = reader.load() + val reader = Reader(conf.source, partitions, gridType) + val sourceRDD = reader.spatialLoad() sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions") - val targetRDD = reader.load(conf.target) + val targetRDD = reader.spatialLoad(conf.target) val partitioner = reader.partitioner val matchingStartTime = Calendar.getInstance().getTimeInMillis diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala index 84c94844..105a45e9 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala +++ b/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala @@ -47,7 +47,7 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ def getWeight(e1: Entity, e2: Entity, ws: WeightingScheme): Float = { val e1Blocks = (ceil(e1.mbr.maxX/thetaXY._1).toInt - floor(e1.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e1.mbr.maxY/thetaXY._2).toInt - floor(e1.mbr.minY/thetaXY._2).toInt + 1) val e2Blocks = (ceil(e2.mbr.maxX/thetaXY._1).toInt - floor(e2.mbr.minX/thetaXY._1).toInt + 1) * (ceil(e2.mbr.maxY/thetaXY._2).toInt - floor(e2.mbr.minY/thetaXY._2).toInt + 1) - val cb = (min(ceil(e1.mbr.maxX/thetaXY._1), 
ceil(e2.mbr.maxX/thetaXY._1)).toInt - max(floor(e1.mbr.minX/thetaXY._1), floor(e2.mbr.minX/thetaXY._1)).toInt + 1) * + lazy val cb = (min(ceil(e1.mbr.maxX/thetaXY._1), ceil(e2.mbr.maxX/thetaXY._1)).toInt - max(floor(e1.mbr.minX/thetaXY._1), floor(e2.mbr.minX/thetaXY._1)).toInt + 1) * (min(ceil(e1.mbr.maxY/thetaXY._2), ceil(e2.mbr.maxY/thetaXY._2)).toInt - max(floor(e1.mbr.minY/thetaXY._2), floor(e2.mbr.minY/thetaXY._2)).toInt + 1) ws match { diff --git a/src/main/scala/utils/SpatialReader.scala b/src/main/scala/utils/SpatialReader.scala deleted file mode 100644 index ca0fd631..00000000 --- a/src/main/scala/utils/SpatialReader.scala +++ /dev/null @@ -1,312 +0,0 @@ -package utils - -import dataModel.{Entity, MBR, SpatialEntity, SpatioTemporalEntity} -import com.vividsolutions.jts.geom.Geometry -import org.apache.jena.query.ARQ -import net.sansa_stack.rdf.spark.io._ -import org.apache.jena.riot.Lang -import org.apache.spark.rdd.RDD -import org.apache.spark.serializer.KryoSerializer -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.StringType -import org.apache.spark.sql.functions.udf -import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} -import org.datasyslab.geospark.enums.GridType -import org.datasyslab.geospark.formatMapper.GeoJsonReader -import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader -import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator -import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner -import org.datasyslab.geospark.spatialRDD.SpatialRDD -import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator} -import org.joda.time.DateTime -import org.joda.time.format.DateTimeFormat -import utils.Constants.FileTypes - -import scala.collection.JavaConverters._ -import scala.collection.mutable - -/** - * Spatial Reader loads input dataset into RDD[Entity] - * - * It's initialized based on the source dataset, which sets the partitioner, - * and the next datasets will be loaded using the same partitioner - * @param sourceDc source Dataset configuration - * @param partitions num of partitions - * @param gt grid algorithm of spatial partitioner - */ -case class SpatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType = Constants.GridType.QUADTREE) { - - lazy val gridType: GridType = gt match { - case Constants.GridType.KDBTREE => GridType.KDBTREE - case _ => GridType.QUADTREE - } - - // spatial RDD of source - lazy val spatialRDD: SpatialRDD[Geometry] = loadSource(sourceDc) - - // spatial partitioner defined by the source spatial RDD - lazy val spatialPartitioner: SpatialPartitioner = { - spatialRDD.analyze() - if (partitions > 0) spatialRDD.spatialPartitioning(gridType, partitions) else spatialRDD.spatialPartitioning(gridType) - spatialRDD.getPartitioner - } - - // the final partitioner - because the transformation of SRDD into RDD does not preserve partitioning - // we partitioning using HashPartitioning with the spatial indexes as keys - lazy val partitioner = new HashPartitioner(spatialPartitioner.numPartitions) - - lazy val partitionsZones: Array[MBR] = - spatialPartitioner.getGrids.asScala.map(e => MBR(e.getMaxX, e.getMinX, e.getMaxY, e.getMinY)).toArray - - /** - * Employ the appropriate reader based the FileType - * @param dc dataset configuration - * @return a spatial RDD - */ - def loadSource(dc: DatasetConfigurations): SpatialRDD[Geometry] ={ - val extension = dc.getExtension - extension match { - case 
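The lazy val cb change above defers the co-occurring-blocks computation to the weighting schemes that actually read it, so schemes that presumably do not (e.g. MBR_INTERSECTION or POINTS) skip it entirely. The idiom in isolation, with a hypothetical cost:

    def weight(scheme: String): Double = {
      lazy val commonBlocks = { println("computed"); 42.0 } // evaluated at most once, on first read
      scheme match {
        case "JS" => commonBlocks / 100.0 // forces the lazy val
        case _    => 1.0                  // commonBlocks is never computed
      }
    }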
FileTypes.CSV => - loadCSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true ) - case FileTypes.TSV => - loadTSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true ) - case FileTypes.SHP => - loadSHP(dc.path, dc.realIdField.getOrElse("id"), dc.dateField) - case FileTypes.NTRIPLES => - loadRDF(dc.path, dc.geometryField, dc.dateField, Lang.NTRIPLES) - case FileTypes.TURTLE => - loadRDF(dc.path, dc.geometryField, dc.dateField, Lang.TURTLE) - case FileTypes.RDFXML => - loadRDF(dc.path, dc.geometryField, dc.dateField, Lang.RDFXML) - case FileTypes.RDFJSON => - loadRDF(dc.path, dc.geometryField, dc.dateField, Lang.RDFJSON) - case _ => - null - } - } - - def loadCSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean):SpatialRDD[Geometry] = - loadDelimitedFile(filepath, realIdField, geometryField, dateField, ",", header) - - def loadTSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean): SpatialRDD[Geometry] = - loadDelimitedFile(filepath, realIdField, geometryField, dateField, "\t", header) - - /** - * Loads a delimited file - * @param filepath path to the delimited text file - * @param realIdField instances' unique id - * @param geometryField geometry field - * @param dateField date field if exists - * @param delimiter delimiter - * @param header if first row contains the headers - * @return a spatial RDD - */ - def loadDelimitedFile(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], delimiter: String, header: Boolean): SpatialRDD[Geometry] ={ - val conf = new SparkConf() - conf.set("spark.serializer", classOf[KryoSerializer].getName) - conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) - val sc = SparkContext.getOrCreate(conf) - val spark = SparkSession.getActiveSession.get - - GeoSparkSQLRegistrator.registerAll(spark) - - var inputDF = spark.read.format("csv") - .option("delimiter", delimiter) - .option("quote", "\"") - .option("header", header) - .load(filepath) - .filter(col(realIdField).isNotNull) - .filter(col(geometryField).isNotNull) - .filter(! 
col(geometryField).contains("EMPTY")) - - var query = s"SELECT ST_GeomFromWKT(GEOMETRIES.$geometryField) AS WKT, GEOMETRIES.$realIdField AS REAL_ID FROM GEOMETRIES".stripMargin - - if (dateField.isDefined) { - inputDF = inputDF.filter(col(dateField.get).isNotNull) - query = s"SELECT ST_GeomFromWKT(GEOMETRIES.$geometryField) AS WKT, GEOMETRIES.$realIdField AS REAL_ID, GEOMETRIES.${dateField.get} AS DATE FROM GEOMETRIES".stripMargin - } - - inputDF.createOrReplaceTempView("GEOMETRIES") - - val spatialDF = spark.sql(query) - val srdd = new SpatialRDD[Geometry] - srdd.rawSpatialRDD = Adapter.toRdd(spatialDF) - srdd - } - - /** - * Loads an ESRI Shapefile - * @param filepath path to the SHP file - * @param realIdField instances' unique id - * @param dateField date field if exists - * @return a spatial RDD - */ - def loadSHP(filepath: String, realIdField: String, dateField: Option[String]): SpatialRDD[Geometry] ={ - val conf = new SparkConf() - conf.set("spark.serializer", classOf[KryoSerializer].getName) - conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) - val sc = SparkContext.getOrCreate(conf) - - val parentFolder = filepath.substring(0, filepath.lastIndexOf("/")) - val srdd = ShapefileReader.readToGeometryRDD(sc, parentFolder) - adjustUserData(srdd, realIdField, dateField) - } - - - /** - * Loads a GeoJSON file - * @param filepath path to the SHP file - * @param realIdField instances' unique id - * @param dateField date field if exists - * @return a spatial RDD - */ - def loadGeoJSON(filepath: String, realIdField: String, dateField: Option[String]): SpatialRDD[Geometry] ={ - val conf = new SparkConf() - conf.set("spark.serializer", classOf[KryoSerializer].getName) - conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) - val sc = SparkContext.getOrCreate(conf) - - val srdd = GeoJsonReader.readToGeometryRDD(sc, filepath) - adjustUserData(srdd, realIdField, dateField) - } - - /** - * Adjust users' data. - * Discard all properties except the id and the date if it's requested. - * @param srdd the input rdd - * @param realIdField the field of id - * @param dateField the field of data if it's given - * @return geometries with only the necessary user data - */ - def adjustUserData(srdd: SpatialRDD[Geometry], realIdField: String, dateField: Option[String]): SpatialRDD[Geometry]={ - val idIndex = srdd.fieldNames.indexOf(realIdField) - val rddWithUserData: RDD[Geometry] = dateField match { - case Some(dateField) => - val dateIndex = srdd.fieldNames.indexOf(dateField) - srdd.rawSpatialRDD.rdd.map { g => - val userData = g.getUserData.toString.split("\t") - val id = userData(idIndex) - val date = userData(dateIndex) - g.setUserData(id + '\t' + date) - g - } - case _ => - srdd.rawSpatialRDD.rdd.map{ g => - val userData = g.getUserData.toString.split("\t") - val id = userData(idIndex) - g.setUserData(id) - g - } - } - srdd.setRawSpatialRDD(rddWithUserData) - - // filter records with valid geometries and ids - srdd.setRawSpatialRDD(srdd.rawSpatialRDD.rdd.filter(g => ! (g.isEmpty || g == null || g.getUserData.toString == ""))) - srdd - } - - - /** - * Loads RDF dataset into Spatial RDD - First loads the dataset into - * RDD[Triples] and then using a SPARQL Select query, extract the necessary - * fields. - * - * @param filepath path to the RDF file - * @param geometryPredicate the predicate of the geometry - * @param datePredicate date predicate if exists - * @param lang the RDF format (i.e. NTRIPLES, TURTLE, etc.) 
- * @return a spatial RDD - */ - def loadRDF(filepath: String, geometryPredicate: String, datePredicate: Option[String], lang: Lang) : SpatialRDD[Geometry] ={ - val conf = new SparkConf() - conf.set("spark.serializer", classOf[KryoSerializer].getName) - conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) - val sc = SparkContext.getOrCreate(conf) - val spark = SparkSession.getActiveSession.get - GeoSparkSQLRegistrator.registerAll(spark) - ARQ.init() - - val allowedPredicates: mutable.Set[String] = mutable.Set() - var sparqlQuery = s"SELECT ?Subject ?WKT WHERE { ?Subject $geometryPredicate ?WKT.}" - var query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject FROM GEOMETRIES".stripMargin - - val cleanGeomPredicate: String = - if (geometryPredicate.head == '<' && geometryPredicate.last == '>') - geometryPredicate.substring(1, geometryPredicate.length-1) - else geometryPredicate - - allowedPredicates.add(cleanGeomPredicate) - - if(datePredicate.isDefined){ - val datePredicateValue = datePredicate.get - val cleanDatePredicate: String = if (datePredicateValue.head == '<' && datePredicateValue.last == '>') - datePredicateValue.substring(1, datePredicateValue.length-1) - else datePredicateValue - allowedPredicates.add(cleanDatePredicate) - sparqlQuery = s"SELECT ?Subject ?WKT ?Date WHERE { ?Subject ${datePredicate.get} ?Date. ?Subject $geometryPredicate ?WKT.}" - query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject, GEOMETRIES.Date FROM GEOMETRIES".stripMargin - } - - val triplesRDD = spark.rdf(lang)(filepath).filter(t => allowedPredicates.contains(t.getPredicate.getURI)) - var df = SparqlExecutor.query(spark, triplesRDD, sparqlQuery) - - val cleanWKT = udf( (wkt: String) => wkt.replaceAll("<\\S+>\\s?", ""), StringType) - df = df.withColumn("WKT", cleanWKT(df.col("WKT"))) - .filter(col("WKT").isNotNull) - .filter(col("WKT").isNotNull) - .filter(! col("WKT").contains("EMPTY")) - - df.createOrReplaceTempView("GEOMETRIES") - - val spatialDF = spark.sql(query) - val srdd = new SpatialRDD[Geometry] - srdd.rawSpatialRDD = Adapter.toRdd(spatialDF) - srdd - } - - /** - * Loads a dataset into Spatial Partitioned RDD. The partitioner - * is defined by the first dataset (i.e. 
the source dataset) - * - * @param dc dataset configuration - * @return a spatial partitioned rdd - */ - def load(dc: DatasetConfigurations = sourceDc): RDD[(Int, Entity)] = { - val srdd = if (dc == sourceDc) spatialRDD else loadSource(dc) - val sp = SparkContext.getOrCreate().broadcast(spatialPartitioner) - - val withTemporal = dc.dateField.isDefined - - // remove empty, invalid geometries and geometry collections - val filteredGeometriesRDD = srdd.rawSpatialRDD.rdd - .map{ geom => - val userdata = geom.getUserData.asInstanceOf[String].split("\t") - (geom, userdata) - } - .filter{case (g, _) => !g.isEmpty && g.isValid && g.getGeometryType != "GeometryCollection"} - - // create Spatial or SpatioTemporal entities - val entitiesRDD: RDD[Entity] = - if(!withTemporal) - filteredGeometriesRDD.map{ case (geom, userdata) => SpatialEntity(userdata(0), geom)} - else - filteredGeometriesRDD.mapPartitions{ geomIterator => - val pattern = dc.datePattern.get - val formatter = DateTimeFormat.forPattern(pattern) - geomIterator.map{ - case (geom, userdata) => - val realID = userdata(0) - val dateStr = userdata(1) - val date: DateTime = formatter.parseDateTime(dateStr) - val dateStr_ = date.toString(Constants.defaultDatePattern) - SpatioTemporalEntity(realID, geom, dateStr_) - } - } - // redistribute based on spatial partitioner - entitiesRDD - .flatMap(se => sp.value.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se))) - .partitionBy(partitioner) - } - -} diff --git a/src/main/scala/utils/readers/CSVReader.scala b/src/main/scala/utils/readers/CSVReader.scala new file mode 100644 index 00000000..50c9b6e9 --- /dev/null +++ b/src/main/scala/utils/readers/CSVReader.scala @@ -0,0 +1,75 @@ +package utils.readers + +import com.vividsolutions.jts.geom.Geometry +import org.apache.spark.serializer.KryoSerializer +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.col +import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator +import org.datasyslab.geospark.spatialRDD.SpatialRDD +import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator} +import utils.Constants.FileTypes +import utils.{Constants, DatasetConfigurations} + +case class CSVReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType = Constants.GridType.QUADTREE) extends Reader { + + def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = { + val extension = dc.getExtension + extension match { + case FileTypes.CSV => + loadCSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true) + case FileTypes.TSV => + loadTSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true) + } + } + + + def loadCSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean):SpatialRDD[Geometry] = + loadDelimitedFile(filepath, realIdField, geometryField, dateField, ",", header) + + def loadTSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean): SpatialRDD[Geometry] = + loadDelimitedFile(filepath, realIdField, geometryField, dateField, "\t", header) + + /** + * Loads a delimited file + * @param filepath path to the delimited text file + * @param realIdField instances' unique id + * @param geometryField geometry field + * @param dateField date field if exists + * @param delimiter delimiter + * @param header if first row contains the headers + * @return a spatial RDD + */ + def 
loadDelimitedFile(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], delimiter: String, header: Boolean): SpatialRDD[Geometry] ={ + val conf = new SparkConf() + conf.set("spark.serializer", classOf[KryoSerializer].getName) + conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) + val sc = SparkContext.getOrCreate(conf) + val spark = SparkSession.getActiveSession.get + + GeoSparkSQLRegistrator.registerAll(spark) + + var inputDF = spark.read.format("csv") + .option("delimiter", delimiter) + .option("quote", "\"") + .option("header", header) + .load(filepath) + .filter(col(realIdField).isNotNull) + .filter(col(geometryField).isNotNull) + .filter(! col(geometryField).contains("EMPTY")) + + var query = s"SELECT ST_GeomFromWKT(GEOMETRIES.$geometryField) AS WKT, GEOMETRIES.$realIdField AS REAL_ID FROM GEOMETRIES".stripMargin + + if (dateField.isDefined) { + inputDF = inputDF.filter(col(dateField.get).isNotNull) + query = s"SELECT ST_GeomFromWKT(GEOMETRIES.$geometryField) AS WKT, GEOMETRIES.$realIdField AS REAL_ID, GEOMETRIES.${dateField.get} AS DATE FROM GEOMETRIES".stripMargin + } + + inputDF.createOrReplaceTempView("GEOMETRIES") + + val spatialDF = spark.sql(query) + val srdd = new SpatialRDD[Geometry] + srdd.rawSpatialRDD = Adapter.toRdd(spatialDF) + srdd + } +} diff --git a/src/main/scala/utils/readers/GeospatialReader.scala b/src/main/scala/utils/readers/GeospatialReader.scala new file mode 100644 index 00000000..15f730ac --- /dev/null +++ b/src/main/scala/utils/readers/GeospatialReader.scala @@ -0,0 +1,97 @@ +package utils.readers + +import com.vividsolutions.jts.geom.Geometry +import org.apache.spark.rdd.RDD +import org.apache.spark.serializer.KryoSerializer +import org.apache.spark.{SparkConf, SparkContext} +import org.datasyslab.geospark.formatMapper.GeoJsonReader +import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader +import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator +import org.datasyslab.geospark.spatialRDD.SpatialRDD +import utils.Constants.FileTypes +import utils.{Constants, DatasetConfigurations} + +case class GeospatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType) extends Reader { + + def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = { + val extension = dc.getExtension + extension match { + case FileTypes.GEOJSON => + loadGeoJSON(dc.path, dc.realIdField.getOrElse("id"), dc.dateField) + case FileTypes.SHP => + loadSHP(dc.path, dc.realIdField.getOrElse("id"), dc.dateField) + } + } + + /** + * Loads an ESRI Shapefile + * @param filepath path to the SHP file + * @param realIdField instances' unique id + * @param dateField date field if exists + * @return a spatial RDD + */ + def loadSHP(filepath: String, realIdField: String, dateField: Option[String]): SpatialRDD[Geometry] ={ + val conf = new SparkConf() + conf.set("spark.serializer", classOf[KryoSerializer].getName) + conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) + val sc = SparkContext.getOrCreate(conf) + + val parentFolder = filepath.substring(0, filepath.lastIndexOf("/")) + val srdd = ShapefileReader.readToGeometryRDD(sc, parentFolder) + adjustUserData(srdd, realIdField, dateField) + } + + + /** + * Loads a GeoJSON file + * @param filepath path to the SHP file + * @param realIdField instances' unique id + * @param dateField date field if exists + * @return a spatial RDD + */ + def loadGeoJSON(filepath: String, realIdField: String, dateField: 
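For reference, with realIdField "id", geometryField "WKT" and a date column "date", the delimited reader above registers the DataFrame as GEOMETRIES and submits a query equivalent to:

    val query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT) AS WKT, " +
      "GEOMETRIES.id AS REAL_ID, GEOMETRIES.date AS DATE FROM GEOMETRIES"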
diff --git a/src/main/scala/utils/readers/GeospatialReader.scala b/src/main/scala/utils/readers/GeospatialReader.scala
new file mode 100644
index 00000000..15f730ac
--- /dev/null
+++ b/src/main/scala/utils/readers/GeospatialReader.scala
@@ -0,0 +1,97 @@
+package utils.readers
+
+import com.vividsolutions.jts.geom.Geometry
+import org.apache.spark.rdd.RDD
+import org.apache.spark.serializer.KryoSerializer
+import org.apache.spark.{SparkConf, SparkContext}
+import org.datasyslab.geospark.formatMapper.GeoJsonReader
+import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
+import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator
+import org.datasyslab.geospark.spatialRDD.SpatialRDD
+import utils.Constants.FileTypes
+import utils.{Constants, DatasetConfigurations}
+
+case class GeospatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType) extends Reader {
+
+  def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = {
+    val extension = dc.getExtension
+    extension match {
+      case FileTypes.GEOJSON =>
+        loadGeoJSON(dc.path, dc.realIdField.getOrElse("id"), dc.dateField)
+      case FileTypes.SHP =>
+        loadSHP(dc.path, dc.realIdField.getOrElse("id"), dc.dateField)
+    }
+  }
+
+  /**
+   * Loads an ESRI Shapefile
+   * @param filepath path to the SHP file
+   * @param realIdField instances' unique id
+   * @param dateField date field if exists
+   * @return a spatial RDD
+   */
+  def loadSHP(filepath: String, realIdField: String, dateField: Option[String]): SpatialRDD[Geometry] ={
+    val conf = new SparkConf()
+    conf.set("spark.serializer", classOf[KryoSerializer].getName)
+    conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName)
+    val sc = SparkContext.getOrCreate(conf)
+
+    val parentFolder = filepath.substring(0, filepath.lastIndexOf("/"))
+    val srdd = ShapefileReader.readToGeometryRDD(sc, parentFolder)
+    adjustUserData(srdd, realIdField, dateField)
+  }
+
+
+  /**
+   * Loads a GeoJSON file
+   * @param filepath path to the GeoJSON file
+   * @param realIdField instances' unique id
+   * @param dateField date field if exists
+   * @return a spatial RDD
+   */
+  def loadGeoJSON(filepath: String, realIdField: String, dateField: Option[String]): SpatialRDD[Geometry] ={
+    val conf = new SparkConf()
+    conf.set("spark.serializer", classOf[KryoSerializer].getName)
+    conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName)
+    val sc = SparkContext.getOrCreate(conf)
+
+    val srdd = GeoJsonReader.readToGeometryRDD(sc, filepath)
+    adjustUserData(srdd, realIdField, dateField)
+  }
+
+  /**
+   * Adjusts user data.
+   * Discards all properties except the id and, if requested, the date.
+   * @param srdd the input rdd
+   * @param realIdField the field of the id
+   * @param dateField the field of the date, if given
+   * @return geometries with only the necessary user data
+   */
+  def adjustUserData(srdd: SpatialRDD[Geometry], realIdField: String, dateField: Option[String]): SpatialRDD[Geometry]={
+    val idIndex = srdd.fieldNames.indexOf(realIdField)
+    val rddWithUserData: RDD[Geometry] = dateField match {
+      case Some(dateField) =>
+        val dateIndex = srdd.fieldNames.indexOf(dateField)
+        srdd.rawSpatialRDD.rdd.map { g =>
+          val userData = g.getUserData.toString.split("\t")
+          val id = userData(idIndex)
+          val date = userData(dateIndex)
+          g.setUserData(id + '\t' + date)
+          g
+        }
+      case _ =>
+        srdd.rawSpatialRDD.rdd.map{ g =>
+          val userData = g.getUserData.toString.split("\t")
+          val id = userData(idIndex)
+          g.setUserData(id)
+          g
+        }
+    }
+    srdd.setRawSpatialRDD(rddWithUserData)
+
+    // filter records with valid geometries and ids (the null check must come first)
+    srdd.setRawSpatialRDD(srdd.rawSpatialRDD.rdd.filter(g => g != null && !g.isEmpty && g.getUserData.toString != ""))
+    srdd
+  }
+
+}
diff --git a/src/main/scala/utils/readers/RDFGraphReader.scala b/src/main/scala/utils/readers/RDFGraphReader.scala
new file mode 100644
index 00000000..3e70eedc
--- /dev/null
+++ b/src/main/scala/utils/readers/RDFGraphReader.scala
@@ -0,0 +1,113 @@
+package utils.readers
+
+import com.vividsolutions.jts.geom.Geometry
+import com.vividsolutions.jts.io.WKTReader
+import org.apache.jena.query.ARQ
+import org.apache.jena.riot.Lang
+import org.apache.spark.serializer.KryoSerializer
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator
+import org.datasyslab.geospark.spatialRDD.SpatialRDD
+import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
+import utils.{Constants, DatasetConfigurations, SparqlExecutor}
+import net.sansa_stack.rdf.spark.io._
+import org.apache.spark.rdd.RDD
+import utils.Constants.FileTypes
+
+import scala.collection.mutable
+
+case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType) extends Reader {
+
+  def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = {
+    val extension = dc.getExtension
+    val lang: Lang = extension match {
+      case FileTypes.NTRIPLES => Lang.NTRIPLES
+      case FileTypes.TURTLE => Lang.TURTLE
+      case FileTypes.RDFXML => Lang.RDFXML
+      case FileTypes.RDFJSON => Lang.RDFJSON
+      case _ => Lang.NTRIPLES
+    }
+    loadRdfAsTextual(dc.path, dc.geometryField)
+  }
+
+  def loadRdfAsTextual(filepath: String, geometryPredicate: String): SpatialRDD[Geometry] ={
+    val conf = new SparkConf()
+    conf.set("spark.serializer", classOf[KryoSerializer].getName)
+    conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName)
+    val sc = SparkContext.getOrCreate(conf)
+    val spark = SparkSession.getActiveSession.get
+    
GeoSparkSQLRegistrator.registerAll(spark) + + val cleanWKT = (wkt: String) => wkt.replaceAll("<\\S+>\\s?", "").replaceAll("\"", "") + val rowRDD: RDD[Row] = spark.read.textFile(filepath) + .rdd.map(s => s.split(" ", 3)) + .filter(s => s(1) == geometryPredicate) + .map(s => (s(0), cleanWKT(s(2)))) + .filter(s => s._1 != null && s._2 != null) + .filter(s => !s._2.contains("EMPTY")) + .map(s => Row(s._1, s._2)) + + val schema = new StructType() + .add(StructField("Subject", StringType, nullable = true)) + .add(StructField("WKT", StringType, nullable = true)) + + val df = spark.createDataFrame(rowRDD, schema) + df.createOrReplaceTempView("GEOMETRIES") + val query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject FROM GEOMETRIES".stripMargin + + val spatialDF = spark.sql(query) + val srdd = new SpatialRDD[Geometry] + srdd.rawSpatialRDD = Adapter.toRdd(spatialDF) + srdd + } + + def loadRDF(filepath: String, geometryPredicate: String, datePredicate: Option[String], lang: Lang): SpatialRDD[Geometry] ={ + val conf = new SparkConf() + conf.set("spark.serializer", classOf[KryoSerializer].getName) + conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) + val sc = SparkContext.getOrCreate(conf) + val spark = SparkSession.getActiveSession.get + GeoSparkSQLRegistrator.registerAll(spark) + ARQ.init() + + val allowedPredicates: mutable.Set[String] = mutable.Set() + var sparqlQuery = s"SELECT ?Subject ?WKT WHERE { ?Subject $geometryPredicate ?WKT.}" + var query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject FROM GEOMETRIES".stripMargin + + val cleanGeomPredicate: String = + if (geometryPredicate.head == '<' && geometryPredicate.last == '>') + geometryPredicate.substring(1, geometryPredicate.length-1) + else geometryPredicate + + allowedPredicates.add(cleanGeomPredicate) + + if(datePredicate.isDefined){ + val datePredicateValue = datePredicate.get + val cleanDatePredicate: String = if (datePredicateValue.head == '<' && datePredicateValue.last == '>') + datePredicateValue.substring(1, datePredicateValue.length-1) + else datePredicateValue + allowedPredicates.add(cleanDatePredicate) + sparqlQuery = s"SELECT ?Subject ?WKT ?Date WHERE { ?Subject ${datePredicate.get} ?Date. ?Subject $geometryPredicate ?WKT.}" + query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject, GEOMETRIES.Date FROM GEOMETRIES".stripMargin + } + + val triplesRDD = spark.rdf(lang)(filepath).filter(t => allowedPredicates.contains(t.getPredicate.getURI)) + var df = SparqlExecutor.query(spark, triplesRDD, sparqlQuery) + + val cleanWKT = udf( (wkt: String) => wkt.replaceAll("<\\S+>\\s?", ""), StringType) + df = df.withColumn("WKT", cleanWKT(df.col("WKT"))) + .filter(col("WKT").isNotNull) + .filter(! 
col("WKT").contains("EMPTY"))
+
+    df.createOrReplaceTempView("GEOMETRIES")
+
+    val spatialDF = spark.sql(query)
+    val srdd = new SpatialRDD[Geometry]
+    srdd.rawSpatialRDD = Adapter.toRdd(spatialDF)
+    srdd
+  }
+
+}
diff --git a/src/main/scala/utils/readers/Reader.scala b/src/main/scala/utils/readers/Reader.scala
new file mode 100644
index 00000000..70f72fbe
--- /dev/null
+++ b/src/main/scala/utils/readers/Reader.scala
@@ -0,0 +1,105 @@
+package utils.readers
+
+import com.vividsolutions.jts.geom.Geometry
+import dataModel.{Entity, MBR, SpatialEntity, SpatioTemporalEntity}
+import org.apache.spark.{HashPartitioner, SparkContext}
+import org.apache.spark.rdd.RDD
+import org.datasyslab.geospark.enums.GridType
+import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner
+import org.datasyslab.geospark.spatialRDD.SpatialRDD
+import org.joda.time.DateTime
+import org.joda.time.format.DateTimeFormat
+import utils.Constants.FileTypes
+import utils.{Constants, DatasetConfigurations}
+
+import scala.collection.JavaConverters._
+
+trait Reader {
+
+  val sourceDc: DatasetConfigurations
+  val partitions: Int
+  val gt: Constants.GridType.GridType
+
+  lazy val gridType: GridType = gt match {
+    case Constants.GridType.KDBTREE => GridType.KDBTREE
+    case _ => GridType.QUADTREE
+  }
+
+  // spatial RDD of source
+  lazy val spatialRDD: SpatialRDD[Geometry] = load(sourceDc)
+
+  // spatial partitioner defined by the source spatial RDD
+  lazy val spatialPartitioner: SpatialPartitioner = {
+    spatialRDD.analyze()
+    if (partitions > 0) spatialRDD.spatialPartitioning(gridType, partitions) else spatialRDD.spatialPartitioning(gridType)
+    spatialRDD.getPartitioner
+  }
+
+  // the final partitioner - because the transformation of an SRDD into an RDD does not preserve the partitioning,
+  // we repartition using a HashPartitioner with the spatial indices as keys
+  lazy val partitioner = new HashPartitioner(spatialPartitioner.numPartitions)
+
+  lazy val partitionsZones: Array[MBR] =
+    spatialPartitioner.getGrids.asScala.map(e => MBR(e.getMaxX, e.getMinX, e.getMaxY, e.getMinY)).toArray
+
+
+  def load(dc: DatasetConfigurations) : SpatialRDD[Geometry]
+
+
+  /**
+   * Loads a dataset into a spatially partitioned RDD. The partitioner
+   * is defined by the first dataset (i.e. 
the source dataset) + * + * @param dc dataset configuration + * @return a spatial partitioned rdd + */ + def spatialLoad(dc: DatasetConfigurations = sourceDc): RDD[(Int, Entity)] = { + val srdd = if (dc == sourceDc) spatialRDD else load(dc) + val sp = SparkContext.getOrCreate().broadcast(spatialPartitioner) + + val withTemporal = dc.dateField.isDefined + + // remove empty, invalid geometries and geometry collections + val filteredGeometriesRDD = srdd.rawSpatialRDD.rdd + .map{ geom => + val userdata = geom.getUserData.asInstanceOf[String].split("\t") + (geom, userdata) + } + .filter{case (g, _) => !g.isEmpty && g.isValid && g.getGeometryType != "GeometryCollection"} + + // create Spatial or SpatioTemporal entities + val entitiesRDD: RDD[Entity] = + if(!withTemporal) + filteredGeometriesRDD.map{ case (geom, userdata) => SpatialEntity(userdata(0), geom)} + else + filteredGeometriesRDD.mapPartitions{ geomIterator => + val pattern = dc.datePattern.get + val formatter = DateTimeFormat.forPattern(pattern) + geomIterator.map{ + case (geom, userdata) => + val realID = userdata(0) + val dateStr = userdata(1) + val date: DateTime = formatter.parseDateTime(dateStr) + val dateStr_ = date.toString(Constants.defaultDatePattern) + SpatioTemporalEntity(realID, geom, dateStr_) + } + } + // redistribute based on spatial partitioner + entitiesRDD + .flatMap(se => sp.value.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se))) + .partitionBy(partitioner) + } +} + + +object Reader { + def apply(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType = Constants.GridType.QUADTREE) : Reader = { + val extension = sourceDc.getExtension + extension match { + case FileTypes.CSV | FileTypes.TSV => CSVReader(sourceDc, partitions, gt) + case FileTypes.SHP | FileTypes.GEOJSON => GeospatialReader(sourceDc, partitions, gt) + case FileTypes.NTRIPLES | FileTypes.TURTLE | FileTypes.RDFXML | FileTypes.RDFJSON => RDFGraphReader(sourceDc, partitions, gt) + case _ => null + } + } +} diff --git a/src/main/scala/utils/readers/ReaderFactory.scala b/src/main/scala/utils/readers/ReaderFactory.scala new file mode 100644 index 00000000..9f3ea4b1 --- /dev/null +++ b/src/main/scala/utils/readers/ReaderFactory.scala @@ -0,0 +1,5 @@ +package utils.readers + +object ReaderFactory { + +} From 24658c6bbf85afabf80f631271dcab47804a9c41 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Mar 2021 16:26:54 +0200 Subject: [PATCH 21/25] IndexedJoin - a well balancing technique --- src/main/scala/experiments/BalancingExp.scala | 15 +++++++++------ src/main/scala/utils/readers/CSVReader.scala | 11 ++--------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/main/scala/experiments/BalancingExp.scala b/src/main/scala/experiments/BalancingExp.scala index 6204a664..f6ff31c1 100644 --- a/src/main/scala/experiments/BalancingExp.scala +++ b/src/main/scala/experiments/BalancingExp.scala @@ -12,7 +12,8 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext, TaskContext} import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import utils.Constants.{GridType, Relation} -import utils.{ConfigurationParser, SpatialReader, Utils} +import utils.readers.Reader +import utils.{ConfigurationParser, Utils} object BalancingExp { @@ -68,16 +69,18 @@ object BalancingExp { val partitions: Int = if (options.contains("partitions")) options("partitions").toInt else conf.getPartitions val gridType: GridType.GridType = if (options.contains("gt")) 
GridType.withName(options("gt").toString) else conf.getGridType val relation = conf.getRelation - val startTime = Calendar.getInstance().getTimeInMillis - val reader = SpatialReader(conf.source, partitions, gridType) - val sourceRDD = reader.load() + + + val reader = Reader(conf.source, partitions, gridType) + val sourceRDD = reader.spatialLoad() sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) + log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions") val sourcePartitions: RDD[(Int, Iterator[Entity])] = sourceRDD.mapPartitions(si => Iterator((TaskContext.getPartitionId(), si.map(_._2)))) - val targetRDD = reader.load(conf.target) - targetRDD.persist(StorageLevel.MEMORY_AND_DISK) + + val targetRDD = reader.spatialLoad(conf.target) val partitioner = reader.partitioner val entitiesPerPartitions: Seq[(Int, Int)] = sourcePartitions.map{ case (pid, si) => (pid, si.size)}.collect() diff --git a/src/main/scala/utils/readers/CSVReader.scala b/src/main/scala/utils/readers/CSVReader.scala index 50c9b6e9..46f6b5e9 100644 --- a/src/main/scala/utils/readers/CSVReader.scala +++ b/src/main/scala/utils/readers/CSVReader.scala @@ -17,19 +17,12 @@ case class CSVReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Const val extension = dc.getExtension extension match { case FileTypes.CSV => - loadCSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true) + loadDelimitedFile(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, ",", header = true) case FileTypes.TSV => - loadTSV(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, header = true) + loadDelimitedFile(dc.path, dc.realIdField.getOrElse("id"), dc.geometryField, dc.dateField, "\t", header = true) } } - - def loadCSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean):SpatialRDD[Geometry] = - loadDelimitedFile(filepath, realIdField, geometryField, dateField, ",", header) - - def loadTSV(filepath: String, realIdField: String, geometryField: String, dateField: Option[String], header: Boolean): SpatialRDD[Geometry] = - loadDelimitedFile(filepath, realIdField, geometryField, dateField, "\t", header) - /** * Loads a delimited file * @param filepath path to the delimited text file From 0ec3361715ae66baf0533e96fa5ecfc15c00b989 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 23 Mar 2021 16:36:58 +0200 Subject: [PATCH 22/25] minor changes in names in PQ --- src/main/scala/dataModel/ComparisonPQ.scala | 63 ------------------- .../GIAnt.scala | 6 +- .../IndexedJoinInterlinking.scala | 6 +- .../InterlinkerT.scala} | 6 +- .../progressive/DynamicProgressiveGIAnt.scala | 16 ++--- .../progressive/GeometryCentric.scala | 0 .../ProgressiveAlgorithmsFactory.scala | 6 +- .../progressive/ProgressiveGIAnt.scala | 10 +-- .../ProgressiveInterlinkerT.scala} | 10 +-- .../progressive/RandomScheduling.scala | 10 +-- .../progressive/ReciprocalTopK.scala | 16 ++--- .../progressive/TopKPairs.scala | 16 ++--- .../ComparisonPQ.scala} | 55 +++++++++++----- .../scala/{dataModel => model}/Entity.scala | 2 +- src/main/scala/{dataModel => model}/IM.scala | 2 +- src/main/scala/{dataModel => model}/MBR.scala | 24 +++---- .../{dataModel => model}/SpatialIndex.scala | 2 +- .../SpatioTemporalEntity.scala | 2 +- .../{dataModel => model}/WeightedPair.scala | 2 +- 19 files changed, 108 insertions(+), 146 deletions(-) delete mode 100644 
src/main/scala/dataModel/ComparisonPQ.scala rename src/main/scala/{geospatialInterlinking => interlinkers}/GIAnt.scala (95%) rename src/main/scala/{geospatialInterlinking => interlinkers}/IndexedJoinInterlinking.scala (94%) rename src/main/scala/{geospatialInterlinking/GeospatialInterlinkingT.scala => interlinkers/InterlinkerT.scala} (96%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/DynamicProgressiveGIAnt.scala (93%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/GeometryCentric.scala (100%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/ProgressiveAlgorithmsFactory.scala (92%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/ProgressiveGIAnt.scala (90%) rename src/main/scala/{geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala => interlinkers/progressive/ProgressiveInterlinkerT.scala} (96%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/RandomScheduling.scala (89%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/ReciprocalTopK.scala (87%) rename src/main/scala/{geospatialInterlinking => interlinkers}/progressive/TopKPairs.scala (88%) rename src/main/scala/{dataModel/WeightedPairsPQ.scala => model/ComparisonPQ.scala} (62%) rename src/main/scala/{dataModel => model}/Entity.scala (99%) rename src/main/scala/{dataModel => model}/IM.scala (97%) rename src/main/scala/{dataModel => model}/MBR.scala (95%) rename src/main/scala/{dataModel => model}/SpatialIndex.scala (98%) rename src/main/scala/{dataModel => model}/SpatioTemporalEntity.scala (98%) rename src/main/scala/{dataModel => model}/WeightedPair.scala (98%) diff --git a/src/main/scala/dataModel/ComparisonPQ.scala b/src/main/scala/dataModel/ComparisonPQ.scala deleted file mode 100644 index 835b6c61..00000000 --- a/src/main/scala/dataModel/ComparisonPQ.scala +++ /dev/null @@ -1,63 +0,0 @@ -package dataModel - -import org.spark_project.guava.collect.MinMaxPriorityQueue -import scala.collection.JavaConverters._ - -/** - * a wrapper of guava min-max PQ. - * - * @param maxSize max size of PQ - * @tparam T the type of input items - */ -case class ComparisonPQ[T](maxSize: Int){ - - var minW: Float = 0f - val ordering: Ordering[(Float, T)] = Ordering.by[(Float, T), Float](_._1).reverse - lazy val pq: MinMaxPriorityQueue[(Float, T)] = MinMaxPriorityQueue.orderedBy(ordering).maximumSize(maxSize+1).create() - - /** - * if w is smaller than minW then omit it. 
- * Otherwise, insert it into PQ and if PQ exceed max size, - * remove item with the smallest weight and update minW - * - * @param w the weight of the item - * @param item item to insert - */ - def enqueue(w: Float, item: T): Unit ={ - if (minW < w) { - pq.add((w, item)) - if (pq.size > maxSize) - minW = pq.pollLast()._1 - } - } - - def enqueueAll(items: Iterator[(T, Float)]): Unit = items.foreach{ case(item, w) => enqueue(w, item)} - - def take(n: Option[Int]): Iterator[(Float, T)] = - n match { - case Some(n) => Iterator.continually{ pq.pollFirst() }.take(n) - case None => Iterator.continually{ pq.pollFirst() }.takeWhile(_ => !pq.isEmpty) - } - - def take(n: Int): Iterator[(Float, T)] = take(Option(n)) - - def dequeueAll: Iterator[(Float, T)] = take(None) - - def clear(): Unit = { - pq.clear() - minW = 0f - } - - def isEmpty: Boolean = pq.isEmpty - - def size(): Int = pq.size() - - def dequeueHead(): (Float, T) = pq.pollFirst() - - def dequeue(): (Float, T) = pq.pollLast() - - def iterator(): Iterator[(Float, T)] = pq.iterator().asScala -} - - - diff --git a/src/main/scala/geospatialInterlinking/GIAnt.scala b/src/main/scala/interlinkers/GIAnt.scala similarity index 95% rename from src/main/scala/geospatialInterlinking/GIAnt.scala rename to src/main/scala/interlinkers/GIAnt.scala index f7f426f9..cf665686 100644 --- a/src/main/scala/geospatialInterlinking/GIAnt.scala +++ b/src/main/scala/interlinkers/GIAnt.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking +package interlinkers -import dataModel.{Entity, IM} +import model.{Entity, IM} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.Relation @@ -8,7 +8,7 @@ import utils.Constants.Relation.Relation import utils.Utils -case class GIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double)) extends GeospatialInterlinkingT { +case class GIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double)) extends InterlinkerT { /** * First index the Source and then use the index to find the comparisons with target's entities. 
diff --git a/src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala b/src/main/scala/interlinkers/IndexedJoinInterlinking.scala similarity index 94% rename from src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala rename to src/main/scala/interlinkers/IndexedJoinInterlinking.scala index 3c411861..3012b951 100644 --- a/src/main/scala/geospatialInterlinking/IndexedJoinInterlinking.scala +++ b/src/main/scala/interlinkers/IndexedJoinInterlinking.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking +package interlinkers -import dataModel.{Entity, IM} +import model.{Entity, IM} import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import utils.Constants.Relation @@ -10,7 +10,7 @@ import utils.Constants.WeightingScheme.WeightingScheme -case class IndexedJoinInterlinking(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], thetaXY: (Double, Double)) extends GeospatialInterlinkingT { +case class IndexedJoinInterlinking(source:RDD[(Int, Entity)], target:RDD[(Int, Entity)], thetaXY: (Double, Double)) extends InterlinkerT { val joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] = null val ws: WeightingScheme = null diff --git a/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala b/src/main/scala/interlinkers/InterlinkerT.scala similarity index 96% rename from src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala rename to src/main/scala/interlinkers/InterlinkerT.scala index 58e7a67f..6db51c22 100644 --- a/src/main/scala/geospatialInterlinking/GeospatialInterlinkingT.scala +++ b/src/main/scala/interlinkers/InterlinkerT.scala @@ -1,12 +1,12 @@ -package geospatialInterlinking +package interlinkers -import dataModel.{Entity, IM, MBR, SpatialIndex} +import model.{Entity, IM, MBR, SpatialIndex} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import utils.Constants.Relation.Relation import utils.Utils -trait GeospatialInterlinkingT { +trait InterlinkerT { val orderByWeight: Ordering[(Double, (Entity, Entity))] = Ordering.by[(Double, (Entity, Entity)), Double](_._1).reverse diff --git a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala b/src/main/scala/interlinkers/progressive/DynamicProgressiveGIAnt.scala similarity index 93% rename from src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala rename to src/main/scala/interlinkers/progressive/DynamicProgressiveGIAnt.scala index 954ee20b..81471e27 100644 --- a/src/main/scala/geospatialInterlinking/progressive/DynamicProgressiveGIAnt.scala +++ b/src/main/scala/interlinkers/progressive/DynamicProgressiveGIAnt.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, IM, MBR, WeightedPair, WeightedPairsPQ} +import model.{ComparisonPQ, DynamicComparisonPQ, Entity, IM, MBR, WeightedPair} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -14,7 +14,7 @@ import scala.collection.mutable.ListBuffer case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) - extends ProgressiveGeospatialInterlinkingT { + extends ProgressiveInterlinkerT { /** @@ -26,11 +26,11 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera * @param target target * @return a PQ with the top comparisons */ - def 
prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={ + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ ={ val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) + val pq: DynamicComparisonPQ = DynamicComparisonPQ(localBudget) var counter = 0 // weight and put the comparisons in a PQ target @@ -67,7 +67,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val source = p._2._1.toArray val target = p._2._2.toArray - val pq = prioritize(source, target, partition, Relation.DE9IM) + val pq: DynamicComparisonPQ = prioritize(source, target, partition, Relation.DE9IM).asInstanceOf[DynamicComparisonPQ] val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) @@ -103,7 +103,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val source = p._2._1.toArray val target = p._2._2.toArray - val pq = prioritize(source, target, partition, relation) + val pq: DynamicComparisonPQ = prioritize(source, target, partition, relation).asInstanceOf[DynamicComparisonPQ] val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) if (!pq.isEmpty) @@ -140,7 +140,7 @@ case class DynamicProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Itera val source = p._2._1.toArray val target = p._2._2.toArray - val pq = prioritize(source, target, partition, relation) + val pq: DynamicComparisonPQ = prioritize(source, target, partition, relation).asInstanceOf[DynamicComparisonPQ] val sourceCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId1, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) val targetCandidates: Map[Int, List[WeightedPair]] = pq.iterator().map(wp => (wp.entityId2, wp)).toList.groupBy(_._1).mapValues(_.map(_._2)) if (!pq.isEmpty) diff --git a/src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala b/src/main/scala/interlinkers/progressive/GeometryCentric.scala similarity index 100% rename from src/main/scala/geospatialInterlinking/progressive/GeometryCentric.scala rename to src/main/scala/interlinkers/progressive/GeometryCentric.scala diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala b/src/main/scala/interlinkers/progressive/ProgressiveAlgorithmsFactory.scala similarity index 92% rename from src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala rename to src/main/scala/interlinkers/progressive/ProgressiveAlgorithmsFactory.scala index e8a28877..8e4d9441 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveAlgorithmsFactory.scala +++ b/src/main/scala/interlinkers/progressive/ProgressiveAlgorithmsFactory.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.Entity +import model.Entity import 
org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.ProgressiveAlgorithm.ProgressiveAlgorithm @@ -12,7 +12,7 @@ object ProgressiveAlgorithmsFactory { def get(matchingAlgorithm: ProgressiveAlgorithm, source: RDD[(Int, Entity)], target: RDD[(Int, Entity)], partitioner: Partitioner, budget: Int = 0, mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme]): - ProgressiveGeospatialInterlinkingT ={ + ProgressiveInterlinkerT ={ matchingAlgorithm match { case ProgressiveAlgorithm.RANDOM => diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala b/src/main/scala/interlinkers/progressive/ProgressiveGIAnt.scala similarity index 90% rename from src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala rename to src/main/scala/interlinkers/progressive/ProgressiveGIAnt.scala index 0bf2549e..9ffae1fa 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGIAnt.scala +++ b/src/main/scala/interlinkers/progressive/ProgressiveGIAnt.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ} +import model.{Entity, MBR, WeightedPair, StaticComparisonPQ} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.Relation.Relation @@ -10,7 +10,7 @@ import utils.Utils case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) - extends ProgressiveGeospatialInterlinkingT { + extends ProgressiveInterlinkerT { /** @@ -22,11 +22,11 @@ case class ProgressiveGIAnt(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent * @param target target * @return a PQ with the top comparisons */ - def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ ={ + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): StaticComparisonPQ ={ val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) + val pq: StaticComparisonPQ = StaticComparisonPQ(localBudget) var counter = 0 // weight and put the comparisons in a PQ target diff --git a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala b/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala similarity index 96% rename from src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala rename to src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala index 105a45e9..3206ec5d 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ProgressiveGeospatialInterlinkingT.scala +++ b/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala @@ -1,7 +1,7 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, IM, MBR, WeightedPair, WeightedPairsPQ} -import geospatialInterlinking.GeospatialInterlinkingT +import model.{ComparisonPQ, Entity, IM, MBR, WeightedPair} +import interlinkers.InterlinkerT import org.apache.commons.math3.stat.inference.ChiSquareTest import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -13,7 +13,7 @@ import 
scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.math.{ceil, floor, max, min} -trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ +trait ProgressiveInterlinkerT extends InterlinkerT{ val budget: Int val mainWS: WeightingScheme val secondaryWS: Option[WeightingScheme] @@ -27,7 +27,7 @@ trait ProgressiveGeospatialInterlinkingT extends GeospatialInterlinkingT{ (globalMaxX - globalMinX + 1) * (globalMaxY - globalMinY + 1) } - def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): ComparisonPQ def getMainWeight(e1: Entity, e2: Entity): Float = getWeight(e1, e2, mainWS) diff --git a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala b/src/main/scala/interlinkers/progressive/RandomScheduling.scala similarity index 89% rename from src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala rename to src/main/scala/interlinkers/progressive/RandomScheduling.scala index 9619fc05..9aa843ce 100644 --- a/src/main/scala/geospatialInterlinking/progressive/RandomScheduling.scala +++ b/src/main/scala/interlinkers/progressive/RandomScheduling.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ} +import model.{Entity, MBR, WeightedPair, StaticComparisonPQ} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.Relation.Relation @@ -9,7 +9,7 @@ import utils.Utils case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) - extends ProgressiveGeospatialInterlinkingT { + extends ProgressiveInterlinkerT { /** @@ -20,12 +20,12 @@ case class RandomScheduling(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Ent * @param target target * @return a PQ with the top comparisons */ - def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): StaticComparisonPQ = { val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toInt val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) - val pq: WeightedPairsPQ = WeightedPairsPQ(localBudget) + val pq: StaticComparisonPQ = StaticComparisonPQ(localBudget) val rnd = new scala.util.Random var counter = 0 // weight and put the comparisons in a PQ diff --git a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala b/src/main/scala/interlinkers/progressive/ReciprocalTopK.scala similarity index 87% rename from src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala rename to src/main/scala/interlinkers/progressive/ReciprocalTopK.scala index 65abdcdd..f6e1a973 100644 --- a/src/main/scala/geospatialInterlinking/progressive/ReciprocalTopK.scala +++ b/src/main/scala/interlinkers/progressive/ReciprocalTopK.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ} +import model.{Entity, MBR, WeightedPair, StaticComparisonPQ} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD 
import utils.Constants.Relation.Relation @@ -11,7 +11,7 @@ import utils.Utils case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) - extends ProgressiveGeospatialInterlinkingT { + extends ProgressiveInterlinkerT { /** * Find the top-K comparisons of target and source and keep only the comparison (i, j) that belongs to both @@ -23,7 +23,7 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit * @param relation examining relation * @return prioritized comparisons as a PQ */ - def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): StaticComparisonPQ = { val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) @@ -31,9 +31,9 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit val sourceK = (math.ceil(localBudget / source.length).toInt + 1) * 2 // +1 to avoid k=0 val targetK = (math.ceil(localBudget / target.length).toInt + 1) * 2 // +1 to avoid k=0 - val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) - val targetPQ: WeightedPairsPQ = WeightedPairsPQ(targetK) - val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(localBudget) + val sourcePQ: Array[StaticComparisonPQ] = new Array(source.length) + val targetPQ: StaticComparisonPQ = StaticComparisonPQ(targetK) + val partitionPQ: StaticComparisonPQ = StaticComparisonPQ(localBudget) var counter = 0 val targetSet: Array[Set[Int]] = new Array(target.length) @@ -56,7 +56,7 @@ case class ReciprocalTopK(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entit // update source entities' top-K if (sourcePQ(i) == null) - sourcePQ(i) = WeightedPairsPQ(sourceK) + sourcePQ(i) = StaticComparisonPQ(sourceK) sourcePQ(i).enqueue(wp) } } diff --git a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala b/src/main/scala/interlinkers/progressive/TopKPairs.scala similarity index 88% rename from src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala rename to src/main/scala/interlinkers/progressive/TopKPairs.scala index c9262c1f..458f307b 100644 --- a/src/main/scala/geospatialInterlinking/progressive/TopKPairs.scala +++ b/src/main/scala/interlinkers/progressive/TopKPairs.scala @@ -1,6 +1,6 @@ -package geospatialInterlinking.progressive +package interlinkers.progressive -import dataModel.{Entity, MBR, WeightedPair, WeightedPairsPQ} +import model.{Entity, MBR, WeightedPair, StaticComparisonPQ} import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import utils.Constants.Relation.Relation @@ -9,7 +9,7 @@ import utils.Utils case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))], thetaXY: (Double, Double), mainWS: WeightingScheme, secondaryWS: Option[WeightingScheme], budget: Int, sourceEntities: Int) - extends ProgressiveGeospatialInterlinkingT { + extends ProgressiveInterlinkerT { /** * First we find the top-k comparisons of each geometry in source and target, @@ -22,16 +22,16 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] * @param relation examining relation * @return prioritized comparisons in a PQ */ - def prioritize(source: Array[Entity], target: 
Array[Entity], partition: MBR, relation: Relation): WeightedPairsPQ = { + def prioritize(source: Array[Entity], target: Array[Entity], partition: MBR, relation: Relation): StaticComparisonPQ = { val localBudget = (math.ceil(budget*source.length.toDouble/sourceEntities.toDouble)*2).toLong val sourceIndex = index(source) val filterIndices = (b: (Int, Int)) => sourceIndex.contains(b) // the budget is divided based on the number of entities val k = (math.ceil(localBudget / (source.length + target.length)).toInt + 1) * 2 // +1 to avoid k=0 - val sourcePQ: Array[WeightedPairsPQ] = new Array(source.length) - val targetPQ: WeightedPairsPQ = WeightedPairsPQ(k) - val partitionPQ: WeightedPairsPQ = WeightedPairsPQ(localBudget) + val sourcePQ: Array[StaticComparisonPQ] = new Array(source.length) + val targetPQ: StaticComparisonPQ = StaticComparisonPQ(k) + val partitionPQ: StaticComparisonPQ = StaticComparisonPQ(localBudget) var counter = 0 target.indices @@ -53,7 +53,7 @@ case class TopKPairs(joinedRDD: RDD[(Int, (Iterable[Entity], Iterable[Entity]))] // update source entities' top-K if (sourcePQ(i) == null) - sourcePQ(i) = WeightedPairsPQ(k) + sourcePQ(i) = StaticComparisonPQ(k) sourcePQ(i).enqueue(wp) } } diff --git a/src/main/scala/dataModel/WeightedPairsPQ.scala b/src/main/scala/model/ComparisonPQ.scala similarity index 62% rename from src/main/scala/dataModel/WeightedPairsPQ.scala rename to src/main/scala/model/ComparisonPQ.scala index f3d16c91..742a2e2e 100644 --- a/src/main/scala/dataModel/WeightedPairsPQ.scala +++ b/src/main/scala/model/ComparisonPQ.scala @@ -1,16 +1,18 @@ -package dataModel +package model import java.util +import org.spark_project.guava.collect.MinMaxPriorityQueue import scala.collection.JavaConverters._ -case class WeightedPairsPQ(maxSize: Long){ - val pq: util.TreeSet[WeightedPair] = new util.TreeSet[WeightedPair]() +sealed trait ComparisonPQ { + val pq: util.AbstractCollection[WeightedPair] + val maxSize: Long def enqueue(wp: WeightedPair): Unit ={ - pq.add(wp) - if (pq.size > maxSize) - dequeueLast() + pq.add(wp) + if (pq.size > maxSize) + dequeueLast() } def enqueueAll(items: Iterator[WeightedPair]): Unit = items.foreach(wp => enqueue(wp)) @@ -21,14 +23,6 @@ case class WeightedPairsPQ(maxSize: Long){ case None => Iterator.continually{ dequeueHead() }.takeWhile(_ => !pq.isEmpty) } - def dynamicUpdate(wp: WeightedPair): Unit ={ - val exists = pq.remove(wp) - if (exists){ - wp.incrementRelatedMatches() - enqueue(wp) - } - } - def take(n: Int): Iterator[WeightedPair] = take(Option(n)) def dequeueAll: Iterator[WeightedPair] = take(None) @@ -39,12 +33,43 @@ case class WeightedPairsPQ(maxSize: Long){ def size(): Int = pq.size() + def dequeueHead(): WeightedPair + + def dequeueLast(): WeightedPair + + def iterator(): Iterator[WeightedPair] = pq.iterator().asScala + +} + + +case class StaticComparisonPQ(maxSize: Long) extends ComparisonPQ{ + + val pq: MinMaxPriorityQueue[WeightedPair] = MinMaxPriorityQueue.maximumSize(maxSize.toInt+1).create() + def dequeueHead(): WeightedPair = pq.pollFirst() def dequeueLast(): WeightedPair = pq.pollLast() - def iterator(): Iterator[WeightedPair] = pq.iterator().asScala } +case class DynamicComparisonPQ(maxSize: Long) extends ComparisonPQ{ + + val pq: util.TreeSet[WeightedPair] = new util.TreeSet[WeightedPair]() + + def dequeueHead(): WeightedPair = pq.pollFirst() + + def dequeueLast(): WeightedPair = pq.pollLast() + + def dynamicUpdate(wp: WeightedPair): Unit ={ + val exists = pq.remove(wp) + if (exists){ + wp.incrementRelatedMatches() + 
enqueue(wp) + } + } +} + + + diff --git a/src/main/scala/dataModel/Entity.scala b/src/main/scala/model/Entity.scala similarity index 99% rename from src/main/scala/dataModel/Entity.scala rename to src/main/scala/model/Entity.scala index 8ac56235..fe22ba9c 100644 --- a/src/main/scala/dataModel/Entity.scala +++ b/src/main/scala/model/Entity.scala @@ -1,4 +1,4 @@ -package dataModel +package model import com.vividsolutions.jts.geom.{Geometry, IntersectionMatrix} import com.vividsolutions.jts.io.WKTReader diff --git a/src/main/scala/dataModel/IM.scala b/src/main/scala/model/IM.scala similarity index 97% rename from src/main/scala/dataModel/IM.scala rename to src/main/scala/model/IM.scala index f3d252a0..73f667da 100644 --- a/src/main/scala/dataModel/IM.scala +++ b/src/main/scala/model/IM.scala @@ -1,4 +1,4 @@ -package dataModel +package model case class IM(idPair: (String, String), isContains: Boolean, isCoveredBy: Boolean, isCovers: Boolean, isCrosses: Boolean, isEquals: Boolean, isIntersects: Boolean, isOverlaps: Boolean, isTouches: Boolean, isWithin: Boolean){ diff --git a/src/main/scala/dataModel/MBR.scala b/src/main/scala/model/MBR.scala similarity index 95% rename from src/main/scala/dataModel/MBR.scala rename to src/main/scala/model/MBR.scala index bbde614d..589a9392 100644 --- a/src/main/scala/dataModel/MBR.scala +++ b/src/main/scala/model/MBR.scala @@ -1,4 +1,4 @@ -package dataModel +package model import com.vividsolutions.jts.geom.{Coordinate, Envelope, Geometry, GeometryFactory} import utils.Constants.Relation @@ -30,7 +30,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param thetaXY blocks' granularity * @return true if the reference point is in the block */ - private[dataModel] + private[model] def referencePointFiltering(mbr:MBR, b:(Int, Int), thetaXY: (Double, Double)): Boolean ={ val (thetaX, thetaY) = thetaXY @@ -53,7 +53,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param partition the examining partition * @return true if the reference point is in the block and in partition */ - private[dataModel] + private[model] def referencePointFiltering(mbr:MBR, b:(Int, Int), thetaXY: (Double, Double), partition: MBR): Boolean ={ val (thetaX, thetaY) = thetaXY @@ -75,7 +75,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param relations requested relations * @return whether the relation is true */ - private[dataModel] + private[model] def testMBR(mbr:MBR, relations: Seq[Relation]): Boolean = relations.map { case Relation.CONTAINS | Relation.COVERS => @@ -98,7 +98,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def equals(mbr:MBR): Boolean = minX == mbr.minX && maxX == mbr.maxX && minY == mbr.minY && maxY == mbr.maxY @@ -107,14 +107,14 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def contains(mbr:MBR): Boolean = minX <= mbr.minX && maxX >= mbr.maxX && minY <= mbr.minY && maxY >= mbr.maxY - private[dataModel] + private[model] def contains(minX: Double, maxX: Double, minY: Double, maxY: Double): Boolean = minX <= minX && maxX >= maxX && minY <= minY && maxY >= maxY - private[dataModel] + private[model] def contains(c: (Double, Double)): Boolean = minX <= c._1 && maxX >= c._1 && minY <= c._2 && maxY >= c._2 @@ -123,7 +123,7 @@ case class MBR(maxX:Double, minX:Double, 
maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def within(mbr: MBR):Boolean = mbr.contains(this) @@ -132,7 +132,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def touches(mbr: MBR): Boolean = maxX == mbr.maxX || minX == mbr.minX || maxY == mbr.maxY || minY == mbr.minY @@ -141,7 +141,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def intersects(mbr:MBR): Boolean = ! disjoint(mbr) @@ -150,7 +150,7 @@ case class MBR(maxX:Double, minX:Double, maxY:Double, minY:Double){ * @param mbr given mbr * @return whether it's true */ - private[dataModel] + private[model] def disjoint(mbr:MBR): Boolean = minX > mbr.maxX || maxX < mbr.minX || minY > mbr.maxY || maxY < mbr.minY diff --git a/src/main/scala/dataModel/SpatialIndex.scala b/src/main/scala/model/SpatialIndex.scala similarity index 98% rename from src/main/scala/dataModel/SpatialIndex.scala rename to src/main/scala/model/SpatialIndex.scala index b5d88612..cebf0e8c 100644 --- a/src/main/scala/dataModel/SpatialIndex.scala +++ b/src/main/scala/model/SpatialIndex.scala @@ -1,4 +1,4 @@ -package dataModel +package model import scala.collection.{Set, mutable} import scala.collection.mutable.ListBuffer diff --git a/src/main/scala/dataModel/SpatioTemporalEntity.scala b/src/main/scala/model/SpatioTemporalEntity.scala similarity index 98% rename from src/main/scala/dataModel/SpatioTemporalEntity.scala rename to src/main/scala/model/SpatioTemporalEntity.scala index 5b7c3252..7f37760e 100644 --- a/src/main/scala/dataModel/SpatioTemporalEntity.scala +++ b/src/main/scala/model/SpatioTemporalEntity.scala @@ -1,4 +1,4 @@ -package dataModel +package model import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.io.WKTReader diff --git a/src/main/scala/dataModel/WeightedPair.scala b/src/main/scala/model/WeightedPair.scala similarity index 98% rename from src/main/scala/dataModel/WeightedPair.scala rename to src/main/scala/model/WeightedPair.scala index 00db6dc9..2eeedd85 100644 --- a/src/main/scala/dataModel/WeightedPair.scala +++ b/src/main/scala/model/WeightedPair.scala @@ -1,4 +1,4 @@ -package dataModel +package model case class WeightedPair(counter: Int, entityId1: Int, entityId2: Int, mainWeight: Float, secondaryWeight: Float) extends Serializable with Comparable[WeightedPair]{ From 52a54aa320e405516744c522dc0424e36dbefe9e Mon Sep 17 00:00:00 2001 From: George Date: Tue, 23 Mar 2021 16:38:01 +0200 Subject: [PATCH 23/25] readers and fixing RDF reader --- src/main/scala/experiments/BalancingExp.scala | 21 ++-- .../scala/experiments/EvaluationExp.scala | 24 ++-- src/main/scala/experiments/GiantExp.scala | 28 +++-- .../scala/experiments/ProgressiveExp.scala | 22 +++- src/main/scala/utils/SparqlExecutor.scala | 28 +++-- src/main/scala/utils/Utils.scala | 8 +- src/main/scala/utils/readers/CSVReader.scala | 6 +- .../utils/readers/GeospatialReader.scala | 6 +- .../scala/utils/readers/RDFGraphReader.scala | 29 +++-- src/main/scala/utils/readers/Reader.scala | 104 ++++++++++-------- .../scala/utils/readers/ReaderFactory.scala | 5 - 11 files changed, 164 insertions(+), 117 deletions(-) delete mode 100644 src/main/scala/utils/readers/ReaderFactory.scala diff --git a/src/main/scala/experiments/BalancingExp.scala 
b/src/main/scala/experiments/BalancingExp.scala
index f6ff31c1..7d4dd457 100644
--- a/src/main/scala/experiments/BalancingExp.scala
+++ b/src/main/scala/experiments/BalancingExp.scala
@@ -2,8 +2,8 @@ package experiments
 
 import java.util.Calendar
 
-import dataModel.Entity
-import geospatialInterlinking.{GIAnt, IndexedJoinInterlinking}
+import model.Entity
+import interlinkers.{GIAnt, IndexedJoinInterlinking}
 import org.apache.log4j.{Level, LogManager, Logger}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.KryoSerializer
@@ -72,15 +72,22 @@ object BalancingExp {
 
         val startTime = Calendar.getInstance().getTimeInMillis
 
-        val reader = Reader(conf.source, partitions, gridType)
-        val sourceRDD = reader.spatialLoad()
+        // reading source dataset
+        val reader = Reader(partitions, gridType)
+        val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(conf.source)
         sourceRDD.persist(StorageLevel.MEMORY_AND_DISK)
         Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
         log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
         val sourcePartitions: RDD[(Int, Iterator[Entity])] = sourceRDD.mapPartitions(si => Iterator((TaskContext.getPartitionId(), si.map(_._2))))
 
-        val targetRDD = reader.spatialLoad(conf.target)
+        // reading target dataset
+        val targetRDD: RDD[(Int, Entity)] = reader.load(conf.target) match {
+            case Left(e) =>
+                log.error("Partitioner is not initialized; call `loadSource` first.")
+                e.printStackTrace()
+                System.exit(1)
+                null
+            case Right(rdd) => rdd
+        }
         val partitioner = reader.partitioner
 
         val entitiesPerPartitions: Seq[(Int, Int)] = sourcePartitions.map{ case (pid, si) => (pid, si.size)}.collect()
diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala
index f2425be5..65f7471f 100644
--- a/src/main/scala/experiments/EvaluationExp.scala
+++ b/src/main/scala/experiments/EvaluationExp.scala
@@ -1,9 +1,9 @@
 package experiments
 
-import dataModel.Entity
-import geospatialInterlinking.GIAnt
-import geospatialInterlinking.progressive.ProgressiveAlgorithmsFactory
+import model.Entity
+import interlinkers.GIAnt
+import interlinkers.progressive.ProgressiveAlgorithmsFactory
 import org.apache.log4j.{Level, LogManager, Logger}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.{Partitioner, SparkConf, SparkContext}
@@ -84,15 +84,23 @@ object EvaluationExp {
 
         log.info("DS-JEDAI: Input Budget: " + budget)
 
-        val reader = Reader(conf.source, partitions, gridType)
-        val sourceRDD = reader.spatialLoad()
+        val reader = Reader(partitions, gridType)
+        val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(conf.source)
         sourceRDD.persist(StorageLevel.MEMORY_AND_DISK)
-        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
-        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
 
-        val targetRDD = reader.spatialLoad(conf.target)
+        val targetRDD: RDD[(Int, Entity)] = reader.load(conf.target) match {
+            case Left(e) =>
+                log.error("Partitioner is not initialized; call `loadSource` first.")
+                e.printStackTrace()
+                System.exit(1)
+                null
+            case Right(rdd) => rdd
+        }
         val partitioner = reader.partitioner
 
+        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
+        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
+
         val (totalVerifications, totalRelatedPairs) =
             if (options.contains("tv") && options.contains("qp"))
                 (options("tv").toInt, options("qp").toInt)
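[Editorial note, not part of the patch] The Left/Right unwrapping above is now repeated verbatim in BalancingExp, EvaluationExp, GiantExp and ProgressiveExp; a small helper could centralise it. A minimal sketch, assuming only what the call sites show (loadSource initialises the partitioner, load returns an Either whose Left carries the failure); `ReaderOps` and `loadOrExit` are hypothetical names:

import model.Entity
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import utils.DatasetConfigurations
import utils.readers.Reader

object ReaderOps {

  private val log = LogManager.getRootLogger

  // Fail fast with a clear message when load is called before loadSource.
  def loadOrExit(reader: Reader, dc: DatasetConfigurations): RDD[(Int, Entity)] =
    reader.load(dc) match {
      case Right(rdd) => rdd
      case Left(e) =>
        log.error("Partitioner is not initialized; call `loadSource` first.")
        e.printStackTrace()
        sys.exit(1)
    }
}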
diff --git a/src/main/scala/experiments/GiantExp.scala b/src/main/scala/experiments/GiantExp.scala
index a2e216c8..c341765b 100644
--- a/src/main/scala/experiments/GiantExp.scala
+++ b/src/main/scala/experiments/GiantExp.scala
@@ -3,8 +3,10 @@ package experiments
 
 import java.util.Calendar
 
-import geospatialInterlinking.GIAnt
+import interlinkers.GIAnt
+import model.Entity
 import org.apache.log4j.{Level, LogManager, Logger}
+import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.storage.StorageLevel
@@ -17,8 +19,8 @@ object GiantExp {
 
     def main(args: Array[String]): Unit = {
 
-        Logger.getLogger("org").setLevel(Level.ERROR)
-        Logger.getLogger("akka").setLevel(Level.ERROR)
+        Logger.getLogger("org").setLevel(Level.INFO)
+        Logger.getLogger("akka").setLevel(Level.INFO)
 
         val log = LogManager.getRootLogger
         log.setLevel(Level.INFO)
@@ -67,15 +69,25 @@ object GiantExp {
 
         val startTime = Calendar.getInstance().getTimeInMillis
 
-        val reader = Reader(conf.source, partitions, gridType)
-        val sourceRDD = reader.spatialLoad()
+        // reading source dataset
+        val reader = Reader(partitions, gridType)
+        val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(conf.source)
         sourceRDD.persist(StorageLevel.MEMORY_AND_DISK)
-        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
-        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
 
-        val targetRDD = reader.spatialLoad(conf.target)
+        // reading target dataset
+        val targetRDD: RDD[(Int, Entity)] = reader.load(conf.target) match {
+            case Left(e) =>
+                log.error("Partitioner is not initialized; call `loadSource` first.")
+                e.printStackTrace()
+                System.exit(1)
+                null
+            case Right(rdd) => rdd
+        }
         val partitioner = reader.partitioner
 
+        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
+        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
+
         if(printCount){
             val sourceCount = sourceRDD.map(_._2.originalID).distinct().count()
             val targetCount = targetRDD.map(_._2.originalID).distinct().count()
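[Editorial note, not part of the patch] After this refactoring every experiment driver follows the same shape: build the reader, load the source (which also fixes the partitioner), load the target, then hand everything to the factory. A condensed sketch with illustrative algorithm, budget and weighting-scheme values; the factory signature is the one from the interlinkers/progressive diff above, and `ProgressiveWiring` is a hypothetical name:

import interlinkers.progressive.ProgressiveAlgorithmsFactory
import model.Entity
import org.apache.spark.rdd.RDD
import utils.Constants.{GridType, ProgressiveAlgorithm, WeightingScheme}
import utils.DatasetConfigurations
import utils.readers.Reader

object ProgressiveWiring {
  def run(sourceDc: DatasetConfigurations, targetDc: DatasetConfigurations,
          partitions: Int, gridType: GridType.GridType, budget: Int): Unit = {
    val reader = Reader(partitions, gridType)
    val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(sourceDc)   // initialises the partitioner
    val targetRDD: RDD[(Int, Entity)] = reader.load(targetDc) match { // safe only after loadSource
      case Right(rdd) => rdd
      case Left(e)    => throw e
    }
    val method = ProgressiveAlgorithmsFactory.get(
      ProgressiveAlgorithm.TOPK,              // illustrative choice of algorithm
      sourceRDD, targetRDD, reader.partitioner,
      budget = budget,
      mainWS = WeightingScheme.JS,            // illustrative weighting schemes
      secondaryWS = Some(WeightingScheme.MBR_INTERSECTION)
    )
    // `method` is then queried for verifications/relations, as in the drivers above.
  }
}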
diff --git a/src/main/scala/experiments/ProgressiveExp.scala b/src/main/scala/experiments/ProgressiveExp.scala
index c108fda2..b84af188 100644
--- a/src/main/scala/experiments/ProgressiveExp.scala
+++ b/src/main/scala/experiments/ProgressiveExp.scala
@@ -2,8 +2,10 @@ package experiments
 
 import java.util.Calendar
 
-import geospatialInterlinking.progressive.ProgressiveAlgorithmsFactory
+import interlinkers.progressive.ProgressiveAlgorithmsFactory
+import model.Entity
 import org.apache.log4j.{Level, LogManager, Logger}
+import org.apache.spark.rdd.RDD
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.sql.SparkSession
@@ -82,15 +84,23 @@ object ProgressiveExp {
 
         val startTime = Calendar.getInstance().getTimeInMillis
 
-        val reader = Reader(conf.source, partitions, gridType)
-        val sourceRDD = reader.spatialLoad()
+        val reader = Reader(partitions, gridType)
+        val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(conf.source)
         sourceRDD.persist(StorageLevel.MEMORY_AND_DISK)
-        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
-        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
 
-        val targetRDD = reader.spatialLoad(conf.target)
+        val targetRDD: RDD[(Int, Entity)] = reader.load(conf.target) match {
+            case Left(e) =>
+                log.error("Partitioner is not initialized; call `loadSource` first.")
+                e.printStackTrace()
+                System.exit(1)
+                null
+            case Right(rdd) => rdd
+        }
         val partitioner = reader.partitioner
 
+        Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones)
+        log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions")
+
         val matchingStartTime = Calendar.getInstance().getTimeInMillis
         val method = ProgressiveAlgorithmsFactory.get(pa, sourceRDD, targetRDD, partitioner, budget, mainWS, secondaryWS)
         if (relation.equals(Relation.DE9IM)) {
diff --git a/src/main/scala/utils/SparqlExecutor.scala b/src/main/scala/utils/SparqlExecutor.scala
index 3771ec91..819f6afb 100644
--- a/src/main/scala/utils/SparqlExecutor.scala
+++ b/src/main/scala/utils/SparqlExecutor.scala
@@ -1,38 +1,42 @@
 package utils
 
 import net.sansa_stack.query.spark.sparqlify.{QueryExecutionSpark, SparqlifyUtils3}
-import org.apache.jena.graph.Triple
-import org.apache.jena.query.QueryFactory
+import net.sansa_stack.rdf.common.partition.core.{RdfPartition, RdfPartitioner, RdfPartitionerDefault}
+import net.sansa_stack.rdf.spark.partition.core.RdfPartitionUtilsSpark
+import org.apache.jena.query.{ARQ, QueryFactory}
 import org.apache.jena.sparql.core.Var
-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.functions._
+import scala.reflect.ClassTag
+import org.apache.jena.graph.Triple
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.Row
+
 import scala.collection.JavaConversions._
 import scala.collection.JavaConversions.asScalaSet
-import net.sansa_stack.rdf.spark.partition.core.RdfPartitionUtilsSpark
-
 import scala.collection.mutable
 
+
 object SparqlExecutor {
 
   //Renames columns to their expected names
-  def renameDfCols(mappings: Array[(String, mutable.Set[Var])], df:DataFrame) : DataFrame = {
-    mappings.foldLeft(df) { (memoDf:DataFrame, colMapping:(String, mutable.Set[Var])) => concatColumns(colMapping, memoDf)}
+  def renameDfCols(mappings: Array[(String, mutable.Set[Var])], df: DataFrame): DataFrame = {
+    mappings.foldLeft(df) { (memoDf: DataFrame, colMapping: (String, mutable.Set[Var])) => concatColumns(colMapping, memoDf) }
   }
 
   //Merges associated columns
-  def concatColumns(mapping:(String, mutable.Set[Var]), df: DataFrame) : DataFrame = {
+  def concatColumns(mapping: (String, mutable.Set[Var]), df: DataFrame): DataFrame = {
     val colArray = for {colName <- mapping._2} yield col(colName.getName)
     val colNameArray = for {colName <- mapping._2} yield colName.getName
     val colSeq = colArray.toSeq
    val colNameSeq = colNameArray.toSeq
-    df.withColumn(mapping._1, concat(colSeq:_*))
-      .drop(colNameSeq:_*)
+    df.withColumn(mapping._1, concat(colSeq: _*))
+      .drop(colNameSeq: _*)
   }
 
-  def query(spark: SparkSession, triples: RDD[Triple], sparqlQuery:String): DataFrame = {
+  def query(spark: SparkSession, triples: RDD[Triple], sparqlQuery: String): DataFrame = {
     val partitions = RdfPartitionUtilsSpark.partitionGraph(triples)
     val rewriter = SparqlifyUtils3.createSparqlSqlRewriter(spark, partitions)
@@ -52,4 +56,4 @@ object SparqlExecutor {
     renamedDf
   }
-}
+}
\ No newline at end of file
diff --git a/src/main/scala/utils/Utils.scala b/src/main/scala/utils/Utils.scala
index fae02edd..d83eb012 100644
--- a/src/main/scala/utils/Utils.scala
+++ b/src/main/scala/utils/Utils.scala
@@ -1,7 +1,7 @@
 package utils
 
-import dataModel.{Entity, MBR}
+import model.{Entity, MBR}
 import com.vividsolutions.jts.geom.Geometry
 import org.apache.log4j.{LogManager, Logger}
 import org.apache.spark.TaskContext
@@ 
-28,13 +28,14 @@ object Utils extends Serializable { lazy val sourceCount: Long = source.count() lazy val thetaXY: (Double, Double) = initTheta() + def apply(sourceRDD: RDD[MBR], thetaOpt: ThetaOption = Constants.ThetaOption.AVG, pz: Array[MBR]=Array()): Unit ={ source = sourceRDD - source.cache() thetaOption = thetaOpt partitionsZones = pz } + def getTheta: (Double, Double) = thetaXY def getSourceCount: Long = sourceCount @@ -43,6 +44,7 @@ object Utils extends Serializable { implicit def singleInt[A](implicit c: ClassTag[Int]): Encoder[Int] = Encoders.scalaInt implicit def tuple[String, Int](implicit e1: Encoder[String], e2: Encoder[Int]): Encoder[(String,Int)] = Encoders.tuple[String,Int](e1, e2) + lazy val globalMinX: Double = partitionsZones.map(p => p.minX / thetaXY._1).min lazy val globalMaxX: Double = partitionsZones.map(p => p.maxX / thetaXY._1).max lazy val globalMinY: Double = partitionsZones.map(p => p.minY / thetaXY._2).min @@ -78,7 +80,6 @@ object Utils extends Serializable { case _ => (1d, 1d) } - source.unpersist() (tx, ty) } @@ -134,5 +135,4 @@ object Utils extends Serializable { val df = spark.createDataFrame(rowRDD, schema) df.write.option("header", "true").csv(path) } - } \ No newline at end of file diff --git a/src/main/scala/utils/readers/CSVReader.scala b/src/main/scala/utils/readers/CSVReader.scala index 46f6b5e9..b12b4922 100644 --- a/src/main/scala/utils/readers/CSVReader.scala +++ b/src/main/scala/utils/readers/CSVReader.scala @@ -9,11 +9,11 @@ import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import org.datasyslab.geospark.spatialRDD.SpatialRDD import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator} import utils.Constants.FileTypes -import utils.{Constants, DatasetConfigurations} +import utils.DatasetConfigurations -case class CSVReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType = Constants.GridType.QUADTREE) extends Reader { +object CSVReader { - def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = { + def extract(dc: DatasetConfigurations): SpatialRDD[Geometry] = { val extension = dc.getExtension extension match { case FileTypes.CSV => diff --git a/src/main/scala/utils/readers/GeospatialReader.scala b/src/main/scala/utils/readers/GeospatialReader.scala index 15f730ac..096f3dce 100644 --- a/src/main/scala/utils/readers/GeospatialReader.scala +++ b/src/main/scala/utils/readers/GeospatialReader.scala @@ -9,11 +9,11 @@ import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import org.datasyslab.geospark.spatialRDD.SpatialRDD import utils.Constants.FileTypes -import utils.{Constants, DatasetConfigurations} +import utils.DatasetConfigurations -case class GeospatialReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType) extends Reader { +object GeospatialReader { - def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = { + def extract(dc: DatasetConfigurations): SpatialRDD[Geometry] = { val extension = dc.getExtension extension match { case FileTypes.GEOJSON => diff --git a/src/main/scala/utils/readers/RDFGraphReader.scala b/src/main/scala/utils/readers/RDFGraphReader.scala index 3e70eedc..09e8fe6e 100644 --- a/src/main/scala/utils/readers/RDFGraphReader.scala +++ b/src/main/scala/utils/readers/RDFGraphReader.scala @@ -1,7 +1,6 @@ package utils.readers import com.vividsolutions.jts.geom.Geometry -import com.vividsolutions.jts.io.WKTReader import org.apache.jena.query.ARQ 
import org.apache.jena.riot.Lang import org.apache.spark.serializer.KryoSerializer @@ -12,16 +11,16 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator import org.datasyslab.geospark.spatialRDD.SpatialRDD import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator} -import utils.{Constants, DatasetConfigurations, SparqlExecutor} +import utils.{DatasetConfigurations, SparqlExecutor} import net.sansa_stack.rdf.spark.io._ import org.apache.spark.rdd.RDD import utils.Constants.FileTypes import scala.collection.mutable -case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType) extends Reader { +object RDFGraphReader { - def load(dc: DatasetConfigurations): SpatialRDD[Geometry] = { + def extract(dc: DatasetConfigurations): SpatialRDD[Geometry] = { val extension = dc.getExtension val lang: Lang = extension match { case FileTypes.NTRIPLES => Lang.NTRIPLES @@ -30,10 +29,11 @@ case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: case FileTypes.RDFJSON => Lang.RDFJSON case _ => Lang.NTRIPLES } +// loadRDF(dc.path, dc.geometryField, dc.dateField, lang) loadRdfAsTextual(dc.path, dc.geometryField) } - def loadRdfAsTextual(filepath: String, geometryPredicate: String): SpatialRDD[Geometry] ={ + def loadRdfAsTextual(filepath: String, geometryPredicate: String): SpatialRDD[Geometry] = { val conf = new SparkConf() conf.set("spark.serializer", classOf[KryoSerializer].getName) conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) @@ -42,11 +42,11 @@ case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: GeoSparkSQLRegistrator.registerAll(spark) val cleanWKT = (wkt: String) => wkt.replaceAll("<\\S+>\\s?", "").replaceAll("\"", "") - val rowRDD: RDD[Row] = spark.read.textFile(filepath) + val rowRDD: RDD[Row] = spark.read.textFile(filepath) .rdd.map(s => s.split(" ", 3)) .filter(s => s(1) == geometryPredicate) .map(s => (s(0), cleanWKT(s(2)))) - .filter(s => s._1 != null && s._2 != null) + .filter(s => s._1 != null && s._2 != null && !s._2.isEmpty) .filter(s => !s._2.contains("EMPTY")) .map(s => Row(s._1, s._2)) @@ -64,14 +64,13 @@ case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: srdd } - def loadRDF(filepath: String, geometryPredicate: String, datePredicate: Option[String], lang: Lang): SpatialRDD[Geometry] ={ + def loadRDF(filepath: String, geometryPredicate: String, datePredicate: Option[String], lang: Lang): SpatialRDD[Geometry] = { val conf = new SparkConf() conf.set("spark.serializer", classOf[KryoSerializer].getName) conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) val sc = SparkContext.getOrCreate(conf) val spark = SparkSession.getActiveSession.get GeoSparkSQLRegistrator.registerAll(spark) - ARQ.init() val allowedPredicates: mutable.Set[String] = mutable.Set() var sparqlQuery = s"SELECT ?Subject ?WKT WHERE { ?Subject $geometryPredicate ?WKT.}" @@ -79,28 +78,29 @@ case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: val cleanGeomPredicate: String = if (geometryPredicate.head == '<' && geometryPredicate.last == '>') - geometryPredicate.substring(1, geometryPredicate.length-1) + geometryPredicate.substring(1, geometryPredicate.length - 1) else geometryPredicate allowedPredicates.add(cleanGeomPredicate) - if(datePredicate.isDefined){ + if (datePredicate.isDefined) { val datePredicateValue = 
datePredicate.get val cleanDatePredicate: String = if (datePredicateValue.head == '<' && datePredicateValue.last == '>') - datePredicateValue.substring(1, datePredicateValue.length-1) + datePredicateValue.substring(1, datePredicateValue.length - 1) else datePredicateValue allowedPredicates.add(cleanDatePredicate) sparqlQuery = s"SELECT ?Subject ?WKT ?Date WHERE { ?Subject ${datePredicate.get} ?Date. ?Subject $geometryPredicate ?WKT.}" query = "SELECT ST_GeomFromWKT(GEOMETRIES.WKT), GEOMETRIES.Subject, GEOMETRIES.Date FROM GEOMETRIES".stripMargin } + ARQ.init() val triplesRDD = spark.rdf(lang)(filepath).filter(t => allowedPredicates.contains(t.getPredicate.getURI)) var df = SparqlExecutor.query(spark, triplesRDD, sparqlQuery) - val cleanWKT = udf( (wkt: String) => wkt.replaceAll("<\\S+>\\s?", ""), StringType) + val cleanWKT = udf((wkt: String) => wkt.replaceAll("<\\S+>\\s?", ""), StringType) df = df.withColumn("WKT", cleanWKT(df.col("WKT"))) .filter(col("WKT").isNotNull) - .filter(! col("WKT").contains("EMPTY")) + .filter(!col("WKT").contains("EMPTY")) df.createOrReplaceTempView("GEOMETRIES") @@ -109,5 +109,4 @@ case class RDFGraphReader(sourceDc: DatasetConfigurations, partitions: Int, gt: srdd.rawSpatialRDD = Adapter.toRdd(spatialDF) srdd } - } diff --git a/src/main/scala/utils/readers/Reader.scala b/src/main/scala/utils/readers/Reader.scala index 70f72fbe..f229bae9 100644 --- a/src/main/scala/utils/readers/Reader.scala +++ b/src/main/scala/utils/readers/Reader.scala @@ -1,8 +1,8 @@ package utils.readers import com.vividsolutions.jts.geom.Geometry -import dataModel.{Entity, MBR, SpatialEntity, SpatioTemporalEntity} -import org.apache.spark.{HashPartitioner, SparkContext} +import model.{Entity, MBR, SpatialEntity, SpatioTemporalEntity} +import org.apache.spark.HashPartitioner import org.apache.spark.rdd.RDD import org.datasyslab.geospark.enums.GridType import org.datasyslab.geospark.spatialPartitioning.SpatialPartitioner @@ -14,51 +14,76 @@ import utils.{Constants, DatasetConfigurations} import scala.collection.JavaConverters._ -trait Reader { +case class Reader(partitions: Int, gt: Constants.GridType.GridType) { - val sourceDc: DatasetConfigurations - val partitions: Int - val gt: Constants.GridType.GridType + /** + * The transformation of an SRDD into RDD does not preserve partitioning. 
+ * Hence, we use a spatial partitioner to spatially index the geometries and then + * we partition using a HashPartitioner with the spatial indexes as the partition keys. + */ + var spatialPartitioner: SpatialPartitioner = _ + var partitioner: HashPartitioner = _ + lazy val partitionsZones: Array[MBR] = spatialPartitioner.getGrids.asScala.map(e => MBR(e.getMaxX, e.getMinX, e.getMaxY, e.getMinY)).toArray - lazy val gridType: GridType = gt match { + val gridType: GridType = gt match { case Constants.GridType.KDBTREE => GridType.KDBTREE case _ => GridType.QUADTREE } - // spatial RDD of source - lazy val spatialRDD: SpatialRDD[Geometry] = load(sourceDc) - - // spatial partitioner defined by the source spatial RDD - lazy val spatialPartitioner: SpatialPartitioner = { - spatialRDD.analyze() - if (partitions > 0) spatialRDD.spatialPartitioning(gridType, partitions) else spatialRDD.spatialPartitioning(gridType) - spatialRDD.getPartitioner + /** + * Extracts the geometries from the input configuration. + * @param dc dataset configuration + * @return the geometries as a SpatialRDD + */ + def extract(dc: DatasetConfigurations) : SpatialRDD[Geometry] = { + val extension = dc.getExtension + extension match { + case FileTypes.CSV | FileTypes.TSV => CSVReader.extract(dc) + case FileTypes.SHP | FileTypes.GEOJSON => GeospatialReader.extract(dc) + case FileTypes.NTRIPLES | FileTypes.TURTLE | FileTypes.RDFXML | FileTypes.RDFJSON => RDFGraphReader.extract(dc) + } } - // the final partitioner - because the transformation of SRDD into RDD does not preserve partitioning - // we partitioning using HashPartitioning with the spatial indexes as keys - lazy val partitioner = new HashPartitioner(spatialPartitioner.numPartitions) - - lazy val partitionsZones: Array[MBR] = - spatialPartitioner.getGrids.asScala.map(e => MBR(e.getMaxX, e.getMinX, e.getMaxY, e.getMinY)).toArray - - - def load(dc: DatasetConfigurations) : SpatialRDD[Geometry] + /** + * Loads the source dataset, i.e. the dataset that initializes the partitioners. + * @param dc dataset configuration + * @return an RDD of pairs of partition index and entities + */ + def loadSource(dc: DatasetConfigurations): RDD[(Int, Entity)] = { + val sourceRDD = extract(dc) + sourceRDD.analyze() + if (partitions > 0) + sourceRDD.spatialPartitioning(gridType, partitions) + else + sourceRDD.spatialPartitioning(gridType) + spatialPartitioner = sourceRDD.getPartitioner + partitioner = new HashPartitioner(spatialPartitioner.numPartitions) + distribute(sourceRDD, dc) + } + /** + * Loads the input dataset. If `loadSource` has not been called first, this will result in + * a NullPointerException. + * @param dc dataset configuration + * @return an RDD of pairs of partition index and entities + */ + def load(dc: DatasetConfigurations): Either[java.lang.Throwable, RDD[(Int, Entity)]] = { + val rdd = extract(dc) + try { + Right(distribute(rdd, dc)) + } catch { + case ex: Throwable => Left(ex) + } + } /** * Distributes a dataset into a spatially partitioned RDD. The partitioner * is defined by the first dataset (i.e. the source dataset). - * * @param dc dataset configuration * @return a spatially partitioned RDD */ - def spatialLoad(dc: DatasetConfigurations = sourceDc): RDD[(Int, Entity)] = { - val srdd = if (dc == sourceDc) spatialRDD else load(dc) - val sp = SparkContext.getOrCreate().broadcast(spatialPartitioner) - + def distribute(srdd: SpatialRDD[Geometry], dc: DatasetConfigurations): RDD[(Int, Entity)] = { val withTemporal = dc.dateField.isDefined - // remove empty, invalid geometries and geometry collections val filteredGeometriesRDD = srdd.rawSpatialRDD.rdd .map{ geom => @@ -84,22 +109,9 @@ trait Reader { SpatioTemporalEntity(realID, geom, dateStr_) } } - // redistribute based on spatial partitioner + // redistribute based on spatial index entitiesRDD - .flatMap(se => sp.value.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se))) + .flatMap(se => spatialPartitioner.placeObject(se.geometry).asScala.map(i => (i._1.toInt, se))) .partitionBy(partitioner) } -} - - -object Reader { - def apply(sourceDc: DatasetConfigurations, partitions: Int, gt: Constants.GridType.GridType = Constants.GridType.QUADTREE) : Reader = { - val extension = sourceDc.getExtension - extension match { - case FileTypes.CSV | FileTypes.TSV => CSVReader(sourceDc, partitions, gt) - case FileTypes.SHP | FileTypes.GEOJSON => GeospatialReader(sourceDc, partitions, gt) - case FileTypes.NTRIPLES | FileTypes.TURTLE | FileTypes.RDFXML | FileTypes.RDFJSON => RDFGraphReader(sourceDc, partitions, gt) - case _ => null - } - } -} +} \ No newline at end of file diff --git a/src/main/scala/utils/readers/ReaderFactory.scala b/src/main/scala/utils/readers/ReaderFactory.scala deleted file mode 100644 index 9f3ea4b1..00000000 --- a/src/main/scala/utils/readers/ReaderFactory.scala +++ /dev/null @@ -1,5 +0,0 @@ -package utils.readers - -object ReaderFactory { - -} From 7b790e56e79f4b58fe08201c831c3c031bd39ef5 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 23 Mar 2021 17:35:49 +0200 Subject: [PATCH 24/25] count entities --- src/main/scala/experiments/GiantExp.scala | 16 ++++++++-------- src/main/scala/utils/readers/Reader.scala | 9 +++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/scala/experiments/GiantExp.scala b/src/main/scala/experiments/GiantExp.scala index c341765b..8b6e1409 100644 --- a/src/main/scala/experiments/GiantExp.scala +++ b/src/main/scala/experiments/GiantExp.scala @@ -19,8 +19,8 @@ import utils.{ConfigurationParser, Utils} object GiantExp { def main(args: Array[String]): Unit = { - Logger.getLogger("org").setLevel(Level.INFO) - Logger.getLogger("akka").setLevel(Level.INFO) + Logger.getLogger("org").setLevel(Level.ERROR) + Logger.getLogger("akka").setLevel(Level.ERROR) val log = LogManager.getRootLogger log.setLevel(Level.INFO) @@ -70,9 +70,10 @@ object GiantExp { val startTime = Calendar.getInstance().getTimeInMillis // reading source dataset - val reader = Reader(partitions, gridType) + val reader = Reader(partitions, gridType, printCount) val sourceRDD: RDD[(Int, Entity)] = reader.loadSource(conf.source) sourceRDD.persist(StorageLevel.MEMORY_AND_DISK) + val sourceCount = reader.counter // reading target dataset val targetRDD: RDD[(Int, Entity)] = reader.load(conf.target) match { @@ -83,17 +84,16 @@ object GiantExp { null case Right(rdd) => rdd } + val targetCount = reader.counter val partitioner = reader.partitioner Utils(sourceRDD.map(_._2.mbr), conf.getTheta, reader.partitionsZones) log.info(s"DS-JEDAI: Source was loaded into ${sourceRDD.getNumPartitions} partitions") if(printCount){
- val sourceCount = sourceRDD.map(_._2.originalID).distinct().count() - val targetCount = targetRDD.map(_._2.originalID).distinct().count() - log.info("DS-JEDAI: Source valid geometries: " + sourceCount) - log.info("DS-JEDAI: Target valid geometries: " + targetCount) - log.info("DS-JEDAI: Cartesian: " + sourceCount*targetCount) + log.info(s"DS-JEDAI: Source geometries: $sourceCount") + log.info(s"DS-JEDAI: Target geometries: $targetCount") + log.info(s"DS-JEDAI: Cartesian: ${sourceCount*targetCount}") } val matchingStartTime = Calendar.getInstance().getTimeInMillis diff --git a/src/main/scala/utils/readers/Reader.scala b/src/main/scala/utils/readers/Reader.scala index f229bae9..388fee24 100644 --- a/src/main/scala/utils/readers/Reader.scala +++ b/src/main/scala/utils/readers/Reader.scala @@ -14,7 +14,9 @@ import utils.{Constants, DatasetConfigurations} import scala.collection.JavaConverters._ -case class Reader(partitions: Int, gt: Constants.GridType.GridType) { +case class Reader(partitions: Int, gt: Constants.GridType.GridType, printStats: Boolean = false) { + + var counter: Long = 0 /** * The transformation of an SRDD into RDD does not preserve partitioning. @@ -32,16 +34,19 @@ case class Reader(partitions: Int, gt: Constants.GridType.GridType) { /** * Extracts the geometries from the input configuration. + * As a side effect, it also counts the geometries, if requested. * @param dc dataset configuration * @return the geometries as a SpatialRDD */ def extract(dc: DatasetConfigurations) : SpatialRDD[Geometry] = { val extension = dc.getExtension - extension match { + val rdd = extension match { case FileTypes.CSV | FileTypes.TSV => CSVReader.extract(dc) case FileTypes.SHP | FileTypes.GEOJSON => GeospatialReader.extract(dc) case FileTypes.NTRIPLES | FileTypes.TURTLE | FileTypes.RDFXML | FileTypes.RDFJSON => RDFGraphReader.extract(dc) } + if (printStats) counter = rdd.rawSpatialRDD.count() + rdd } /** From 52e0e12d9fba31031d3a236dfb8971614023058a Mon Sep 17 00:00:00 2001 From: George Date: Wed, 24 Mar 2021 11:53:05 +0200 Subject: [PATCH 25/25] updated --- README.md | 28 +++++++++++++------ config/LINEARWATER-AREAWATER.yaml | 2 +- config/configurationTemplate.yaml | 2 +- .../scala/experiments/EvaluationExp.scala | 16 +++++------ .../progressive/ProgressiveInterlinkerT.scala | 4 +-- src/main/scala/utils/Constants.scala | 13 +++++++-- 6 files changed, 41 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 5317d5ad..93822702 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # DS-JedAI DS-JedAI (Distributed Spatial JedAI) is a system for Holistic Geospatial Interlinking for big geospatial data. -In Holistic Geospatial Interlinking we aim to discover all the topological relations between two geospatial datasets, +In Holistic Geospatial Interlinking, we aim to discover all the topological relations between the geometries of two geospatial datasets, using the [DE-9IM](https://en.wikipedia.org/wiki/DE-9IM) topological model. DS-JedAI offers a novel batch algorithm for geospatial interlinking and several algorithms for progressive Geospatial Interlinking. All the algorithms have been parallelized based on the MapReduce Framework. @@ -12,7 +12,7 @@ These algorithms take as input a budget (*BU*) that indicates the total number of verifications to be performed, and a weighting scheme *W* that quantifies how likely two geometries are to relate. Furthermore, DS-JedAI allows temporal filtering in order to detect pairs that coincide not only spatially but also temporally.
-DS-JedAI in implemented on top of Apache Spark and can run in any distributed or standalone environment that +DS-JedAI is implemented on top of **Apache Spark** and can run in any distributed or standalone environment that supports the execution of Apache Spark jobs. Currently, it supports most of the RDF formats (i.e., N-Triples, Turtle, RDF/JSON and RDF/XML), as well as CSV, TSV, GeoJSON and ESRI shapefiles. @@ -43,10 +43,11 @@ to relate. The idea is that not all the verifications will be performed, but only a subset of them. Hence, the progressive algorithms prioritize the *BU* most prominent verifications. The implemented algorithms are the following: -- **Progressive Giant**: Implements GIA.nt but prioritizes the BU most promising geometry pairs. +- **Progressive GIA.nt**: Implements GIA.nt but prioritizes the BU most promising geometry pairs. +- **Dynamic Progressive GIA.nt**: Extends Progressive GIA.nt by boosting the weight of the pairs that are associated with qualifying pairs, so the priority queue +that stores the geometry pairs changes dynamically during the verifications. - **Geometry Top-k**: For each geometry, finds and verifies its top-k pairs. - **Geometry Reciprocal Top-k**: Verifies only the pairs *(s, t)* such that *s* belongs to the top-k of *t* and *t* belongs to the top-k of *s*. -- **Geometry-Centric**: The prioritization of *(s, t)* is based on the mean weight of all the pairs that *t* is part of. - **RandomScheduling**: Implements random prioritization. Currently, the supported weighting schemes are: @@ -54,8 +55,11 @@ Currently, the supported weighting schemes are: - Co-occurrence Frequencies (CF) - Jaccard Similarity (JS) - Pearson's chi-squared test (PEARSON_X2) -- MBR INTERSECTION -- GEOMETRY POINTS +- Minimum Bounding Rectangle Overlap (MBRO) +- Inverse Sum of Points (ISP) + +The algorithms also support composite schemes, which combine two weighting schemes so that the secondary weighting +scheme resolves the ties of the main one. The progressive algorithms, the weighting schemes, and the budget *BU* are specified in the configuration file. Consult the configuration template in `config/configurationTemplate.yaml` to see how you can specify them. To execute, run: @@ -66,8 +70,14 @@ Some additional options are the following: - **-p N**: specify the number of partitions - **-gt type**: specify the grid type for the spatial partitioning. Accepted values are KDBTREE and QUADTREE. -- **ws WS**: specify weighting scheme - allowed values: *CF, JS, MBR_INTERSECTION, PEARSON_X2, POINTS*. -- **progressiveAlgorithm PA**: specify progressive algorithm - allowed values: *PROGRESSIVE_GIANT, TOPK, RECIPROCAL_TOPK, GEOMETRY_CENTRIC, RANDOM* +- **mws WS**: specify the main weighting scheme - allowed values: *CF, JS, MBRO, PEARSON_X2, ISP*. +- **sws WS**: specify the secondary weighting scheme (optional) - allowed values: *CF, JS, MBRO, PEARSON_X2, ISP*; MBRO is preferred. +- **progressiveAlgorithm PA**: specify the progressive algorithm - allowed values: *PROGRESSIVE_GIANT, DYNAMIC_PROGRESSIVE_GIANT, TOPK, RECIPROCAL_TOPK, RANDOM* - **budget** BU: the input budget. -The command line options will overwrite the corresponding options of the configuration file. \ No newline at end of file +The command-line options override the corresponding options of the configuration file. + +--- +## Publication + +*Progressive, Holistic Geospatial Interlinking. George Papadakis, Georgios Mandilaras, Nikos Mamoulis, Manolis Koubarakis.
In Proceedings of The Web Conference 2021.* \ No newline at end of file diff --git a/config/LINEARWATER-AREAWATER.yaml b/config/LINEARWATER-AREAWATER.yaml index b176c04c..66a9104b 100644 --- a/config/LINEARWATER-AREAWATER.yaml +++ b/config/LINEARWATER-AREAWATER.yaml @@ -13,5 +13,5 @@ relation: "DE9IM" configurations: thetaGranularity: "avg" - secondaryWS: "MBR_INTERSECTION" + secondaryWS: "MBRO" mainWS: "JS" \ No newline at end of file diff --git a/config/configurationTemplate.yaml b/config/configurationTemplate.yaml index 773f55c1..c56dadf5 100644 --- a/config/configurationTemplate.yaml +++ b/config/configurationTemplate.yaml @@ -20,7 +20,7 @@ configurations: partitions: "number of partitions" thetaGranularity: "avg" # define the extent of dynamic tiling based on the geometries of source - Experiments have shown that "avg" is the best option gridType: "spatial partitioner grid type algorithm" # allowed values: KDBTREE, QUADTREE - mainWS: "WS" # specify weighting scheme - allowed values: CF, JS, MBR_INTERSECTION, PEARSON_X2, POINTS + mainWS: "WS" # specify weighting scheme - allowed values: CF, JS, MBRO, PEARSON_X2, ISP secondaryWS: "WS" progressiveAlgorithm : "PA" # specify progressive algorithm - allowed values: PROGRESSIVE_GIANT, TOPK, RECIPROCAL_TOPK, GEOMETRY_CENTRIC, RANDOM budget: "BU" # the budget of progressive algorithms \ No newline at end of file diff --git a/src/main/scala/experiments/EvaluationExp.scala b/src/main/scala/experiments/EvaluationExp.scala index 65f7471f..d61ee49d 100644 --- a/src/main/scala/experiments/EvaluationExp.scala +++ b/src/main/scala/experiments/EvaluationExp.scala @@ -121,14 +121,14 @@ object EvaluationExp { else Seq(ProgressiveAlgorithm.DYNAMIC_PROGRESSIVE_GIANT, ProgressiveAlgorithm.PROGRESSIVE_GIANT, ProgressiveAlgorithm.TOPK, ProgressiveAlgorithm.RECIPROCAL_TOPK) - val weightingSchemes = Seq((WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION))) -// (WeightingScheme.CF, None), -// (WeightingScheme.JS, None), -// (WeightingScheme.PEARSON_X2,None), -// (WeightingScheme.MBR_INTERSECTION, None), -// (WeightingScheme.POINTS, None), -// (WeightingScheme.JS, Option(WeightingScheme.MBR_INTERSECTION)), -// (WeightingScheme.PEARSON_X2, Option(WeightingScheme.MBR_INTERSECTION))) + val weightingSchemes = Seq((WeightingScheme.CF, None), + (WeightingScheme.JS, None), + (WeightingScheme.PEARSON_X2, None), + (WeightingScheme.MBRO, None), + (WeightingScheme.ISP, None), + (WeightingScheme.JS, Option(WeightingScheme.MBRO)), + (WeightingScheme.PEARSON_X2, Option(WeightingScheme.MBRO))) for (a <- algorithms ; ws <- weightingSchemes) printResults(sourceRDD, targetRDD, partitioner, totalRelatedPairs, a, ws) diff --git a/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala b/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala index 3206ec5d..332893af 100644 --- a/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala +++ b/src/main/scala/interlinkers/progressive/ProgressiveInterlinkerT.scala @@ -51,12 +51,12 @@ trait ProgressiveInterlinkerT extends InterlinkerT{ (min(ceil(e1.mbr.maxY/thetaXY._2), ceil(e2.mbr.maxY/thetaXY._2)).toInt - max(floor(e1.mbr.minY/thetaXY._2), floor(e2.mbr.minY/thetaXY._2)).toInt + 1) ws match { - case WeightingScheme.MBR_INTERSECTION => + case WeightingScheme.MBRO => val intersectionArea = e1.mbr.getIntersectingMBR(e2.mbr).getArea val w = intersectionArea / (e1.mbr.getArea + e2.mbr.getArea - intersectionArea) if (!w.isNaN) w else 0f - case WeightingScheme.POINTS => + case WeightingScheme.ISP => 1f / (e1.geometry.getNumPoints + e2.geometry.getNumPoints) case WeightingScheme.JS => diff --git a/src/main/scala/utils/Constants.scala b/src/main/scala/utils/Constants.scala index 407037e7..f65a2b4c 100644 --- a/src/main/scala/utils/Constants.scala +++ b/src/main/scala/utils/Constants.scala @@ -69,12 +69,19 @@ object Constants { */ object WeightingScheme extends Enumeration { type WeightingScheme = Value - + // co-occurrence frequency val CF: Constants.WeightingScheme.Value = Value("CF") + // Jaccard similarity val JS: Constants.WeightingScheme.Value = Value("JS") + + // Pearson's chi-squared test val PEARSON_X2: Constants.WeightingScheme.Value = Value("PEARSON_X2") - val MBR_INTERSECTION: Constants.WeightingScheme.Value = Value("MBR_INTERSECTION") - val POINTS: Constants.WeightingScheme.Value = Value("POINTS") + + // minimum bounding rectangle overlap + val MBRO: Constants.WeightingScheme.Value = Value("MBRO") + + // inverse sum of points + val ISP: Constants.WeightingScheme.Value = Value("ISP") def exists(s: String): Boolean = values.exists(_.toString == s) }
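For reference, the two renamed weights above reduce to two small formulas: MBRO is the Jaccard-style overlap of the two MBRs, intersection / (area1 + area2 - intersection), and ISP is the inverse of the total number of points of the two geometries, so geometries with fewer vertices are verified earlier. A minimal, self-contained sketch of both follows; SimpleMBR and WeightSketch are illustrative names, not the project's own classes:

import com.vividsolutions.jts.geom.Geometry

// Illustrative MBR with only the fields needed here; not the project's MBR class.
case class SimpleMBR(minX: Double, maxX: Double, minY: Double, maxY: Double) {
  // Negative extents (disjoint rectangles) are clamped to zero area.
  def area: Double = math.max(0d, maxX - minX) * math.max(0d, maxY - minY)
  def intersection(o: SimpleMBR): SimpleMBR =
    SimpleMBR(math.max(minX, o.minX), math.min(maxX, o.maxX),
              math.max(minY, o.minY), math.min(maxY, o.maxY))
}

object WeightSketch {
  // MBRO: Jaccard-style overlap of the two MBRs,
  // i.e. intersection / (area1 + area2 - intersection); NaN (0/0) maps to 0.
  def mbro(a: SimpleMBR, b: SimpleMBR): Float = {
    val i = a.intersection(b).area
    val w = i / (a.area + b.area - i)
    if (w.isNaN) 0f else w.toFloat
  }

  // ISP: inverse sum of points -- pairs of simpler geometries,
  // which are cheaper to verify, receive a higher weight.
  def isp(g1: Geometry, g2: Geometry): Float =
    1f / (g1.getNumPoints + g2.getNumPoints)
}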
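Likewise, the composite schemes described in the README order candidate pairs by the main weight and fall back to the secondary weight only to break ties. A minimal sketch of that ordering, under the illustrative names Pair and CompositeOrdering (a stand-in, not the patch's own data model):

// Illustrative pair of source/target entity ids with the two weights.
case class Pair(s: Int, t: Int, mainWeight: Float, secondaryWeight: Float)

// Compare on the main weight; the secondary weight only resolves ties.
object CompositeOrdering extends Ordering[Pair] {
  def compare(a: Pair, b: Pair): Int = {
    val byMain = java.lang.Float.compare(a.mainWeight, b.mainWeight)
    if (byMain != 0) byMain
    else java.lang.Float.compare(a.secondaryWeight, b.secondaryWeight)
  }
}

// Usage: keep the BU highest-weighted candidate pairs, e.g.
//   val top = pairs.sorted(CompositeOrdering.reverse).take(budget)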