Skip to content

Commit 989e9fe

Browse files
authored
Merge pull request #51 from zouzias/hotfix/fix_ordering_bug_on_topk_monoid
Hotfix: Fix ordering bug on topk monoids for LuceneRDD
2 parents 3d46963 + 5d9509c commit 989e9fe

File tree

7 files changed

+39
-58
lines changed

7 files changed

+39
-58
lines changed

scripts/loadCities.scala

Lines changed: 4 additions & 4 deletions
```diff
@@ -29,13 +29,13 @@ luceneRDD.cache
 luceneRDD.count

 println("=" * 20)
-luceneRDD.termQuery("_1", "toronto")
+luceneRDD.termQuery("_1", "toronto").take(10)

 println("=" * 20)
-luceneRDD.termQuery("_1", "athens")
+luceneRDD.termQuery("_1", "athens").take(10)

 println("=" * 20)
-luceneRDD.termQuery("_1", "bern")
+luceneRDD.termQuery("_1", "bern").take(10)

 println("=" * 20)
-luceneRDD.termQuery("_1", "madrid")
+luceneRDD.termQuery("_1", "madrid").take(10)
```

src/main/resources/reference.conf

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ lucenerdd {
 // Otherwise the index will be stored in memory
 index.store.mode = "memory"

-query.topk.maxvalue = 1000
+query.topk.maxvalue = 100
 query.topk.default = 10
 query.facets.number.default = 10
```
1414

src/main/scala/org/zouzias/spark/lucenerdd/LuceneRDD.scala

Lines changed: 3 additions & 3 deletions
```diff
@@ -84,7 +84,7 @@ class LuceneRDD[T: ClassTag](protected val partitionsRDD: RDD[AbstractLuceneRDDP
    */
   protected def partitionMapper(f: AbstractLuceneRDDPartition[T] => LuceneRDDResponsePartition,
                                 k: Int): LuceneRDDResponse = {
-    new LuceneRDDResponse(partitionsRDD.map(f))
+    new LuceneRDDResponse(partitionsRDD.map(f), SparkScoreDoc.descending)
   }


@@ -149,7 +149,7 @@ class LuceneRDD[T: ClassTag](protected val partitionsRDD: RDD[AbstractLuceneRDDP
   def link[T1: ClassTag](other: RDD[T1], searchQueryGen: T1 => String, topK: Int = DefaultTopK)
   : RDD[(T1, List[SparkScoreDoc])] = {
     logInfo("Linkage requested")
-    val monoid = new TopKMonoid[SparkScoreDoc](topK)
+    val monoid = new TopKMonoid[SparkScoreDoc](topK)(SparkScoreDoc.descending)
     val queries = other.map(searchQueryGen).collect()
     val queriesB = partitionsRDD.context.broadcast(queries)

@@ -165,7 +165,7 @@ class LuceneRDD[T: ClassTag](protected val partitionsRDD: RDD[AbstractLuceneRDDP

     val results = resultsByPart.reduceByKey(monoid.plus)
     other.zipWithIndex.map(_.swap).join(results)
-      .map{ case (_, joined) => (joined._1, joined._2.items.reverse.take(topK))}
+      .map{ case (_, joined) => (joined._1, joined._2.items.take(topK))}
   }

   /**
```

src/main/scala/org/zouzias/spark/lucenerdd/aggregate/SparkScoreDocAggregatable.scala

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/main/scala/org/zouzias/spark/lucenerdd/models/SparkScoreDoc.scala

Lines changed: 11 additions & 1 deletion
```diff
@@ -40,13 +40,23 @@ object SparkScoreDoc extends Serializable {
       SparkDoc(indexSearcher.doc(scoreDoc.doc)))
   }

+  /**
+   * Ordering by score (descending)
+   */
+  def descending: Ordering[SparkScoreDoc] = new Ordering[SparkScoreDoc]{
+    override def compare(x: SparkScoreDoc, y: SparkScoreDoc): Int = {
+      if ( x.score > y.score) -1 else if (x.score == y.score) 0 else 1
+    }
+  }
+
   /**
    * Ordering by score (ascending)
    */
-  implicit val ScoreAscendingOrdered = new Ordering[SparkScoreDoc]{
+  def ascending: Ordering[SparkScoreDoc] = new Ordering[SparkScoreDoc]{
     override def compare(x: SparkScoreDoc, y: SparkScoreDoc): Int = {
       if ( x.score < y.score) -1 else if (x.score == y.score) 0 else 1
     }
   }
 }

+
```

src/main/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponse.scala

Lines changed: 4 additions & 3 deletions
```diff
@@ -26,7 +26,8 @@ import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
 /**
  * LuceneRDD response
  */
-class LuceneRDDResponse(protected val partitionsRDD: RDD[LuceneRDDResponsePartition])
+class LuceneRDDResponse(protected val partitionsRDD: RDD[LuceneRDDResponsePartition],
+                        protected val ordering: Ordering[SparkScoreDoc])
   extends RDD[SparkScoreDoc](partitionsRDD.context,
     List(new OneToOneDependency(partitionsRDD))) {

@@ -63,14 +64,14 @@ class LuceneRDDResponse(protected val partitionsRDD: RDD[LuceneRDDResponsePartit
    * @return
    */
   override def take(num: Int): Array[SparkScoreDoc] = {
-    val monoid = new TopKMonoid[SparkScoreDoc](num)
+    val monoid = new TopKMonoid[SparkScoreDoc](num)(ordering)
     partitionsRDD.map(monoid.build(_))
       .reduce(monoid.plus).items.toArray
   }

   override def collect(): Array[SparkScoreDoc] = {
     val sz = partitionsRDD.map(_.size).sum().toInt
-    val monoid = new TopKMonoid[SparkScoreDoc](sz)
+    val monoid = new TopKMonoid[SparkScoreDoc](sz)(ordering)
     partitionsRDD.map(monoid.build(_))
       .reduce(monoid.plus).items.toArray
```

src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDD.scala

Lines changed: 16 additions & 15 deletions
```diff
@@ -17,14 +17,14 @@
 package org.zouzias.spark.lucenerdd.spatial.shape

 import com.spatial4j.core.shape.Shape
-import com.twitter.algebird.TopK
+import com.twitter.algebird.{TopK, TopKMonoid}
 import org.apache.lucene.document.Document
 import org.apache.lucene.spatial.query.SpatialOperation
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark._
 import org.apache.spark.sql.{DataFrame, Row}
-import org.zouzias.spark.lucenerdd.aggregate.SparkScoreDocAggregatable
+import org.zouzias.spark.lucenerdd.config.LuceneRDDConfigurable
 import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
 import org.zouzias.spark.lucenerdd.query.LuceneQueryHelpers
 import org.zouzias.spark.lucenerdd.response.{LuceneRDDResponse, LuceneRDDResponsePartition}
@@ -43,7 +43,7 @@ import scala.reflect.ClassTag
 class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
   (private val partitionsRDD: RDD[AbstractShapeLuceneRDDPartition[K, V]])
   extends RDD[(K, V)](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD)))
-  with SparkScoreDocAggregatable
+  with LuceneRDDConfigurable
   with Logging {

   logInfo("Instance is created...")
@@ -87,16 +87,17 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
    * @param f
    * @return
    */
-  private def docResultsMapper(f: AbstractShapeLuceneRDDPartition[K, V] =>
+  private def partitionMapper(f: AbstractShapeLuceneRDDPartition[K, V] =>
     LuceneRDDResponsePartition): LuceneRDDResponse = {
-    new LuceneRDDResponse(partitionsRDD.map(f(_)))
+    new LuceneRDDResponse(partitionsRDD.map(f(_)), SparkScoreDoc.ascending)
   }

   private def linker[T: ClassTag](that: RDD[T], pointFunctor: T => PointType,
                                   mapper: ( PointType, AbstractShapeLuceneRDDPartition[K, V]) =>
                                   Iterable[SparkScoreDoc]): RDD[(T, List[SparkScoreDoc])] = {
     logDebug("Linker requested")

+    val topKMonoid = new TopKMonoid[SparkScoreDoc](MaxDefaultTopKValue)(SparkScoreDoc.ascending)
     logDebug("Collecting query points to driver")
     val queries = that.map(pointFunctor).collect()
     logDebug("Query points collected to driver successfully")
@@ -107,16 +108,16 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
     logDebug("Compute topK linkage per partition")
     val resultsByPart: RDD[(Long, TopK[SparkScoreDoc])] = partitionsRDD.flatMap {
       case partition => queriesB.value.zipWithIndex.map { case (queryPoint, index) =>
-        val results = mapper(queryPoint, partition).map(x => SparkDocAscendingTopKMonoid.build(x))
-          .reduceOption(SparkDocAscendingTopKMonoid.plus)
-          .getOrElse(SparkDocAscendingTopKMonoid.zero)
+        val results = mapper(queryPoint, partition).map(x => topKMonoid.build(x))
+          .reduceOption(topKMonoid.plus)
+          .getOrElse(topKMonoid.zero)

         (index.toLong, results)
       }
     }

     logDebug("Merge topK linkage results")
-    val results = resultsByPart.reduceByKey(SparkDocAscendingTopKMonoid.plus)
+    val results = resultsByPart.reduceByKey(topKMonoid.plus)
     that.zipWithIndex.map(_.swap).join(results)
       .map{ case (_, joined) => (joined._1, joined._2.items)}
   }
@@ -215,7 +216,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
                 searchString: String = LuceneQueryHelpers.MatchAllDocsString)
   : LuceneRDDResponse = {
     logInfo(s"Knn search with query ${queryPoint} and search string ${searchString}")
-    docResultsMapper(_.knnSearch(queryPoint, k, searchString))
+    partitionMapper(_.knnSearch(queryPoint, k, searchString))
   }

   /**
@@ -230,7 +231,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
   : LuceneRDDResponse = {
     logInfo(s"Circle search with center ${center} and radius ${radius}")
     // Points can only intersect
-    docResultsMapper(_.circleSearch(center, radius, k,
+    partitionMapper(_.circleSearch(center, radius, k,
       SpatialOperation.Intersects.getName))
   }

@@ -246,7 +247,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
                     operationName: String = SpatialOperation.Intersects.getName)
   : LuceneRDDResponse = {
     logInfo(s"Spatial search with shape ${shapeWKT} and operation ${operationName}")
-    docResultsMapper(_.spatialSearch(shapeWKT, k, operationName))
+    partitionMapper(_.spatialSearch(shapeWKT, k, operationName))
   }

   /**
@@ -261,7 +262,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
                     operationName: String)
   : LuceneRDDResponse = {
     logInfo(s"Spatial search with point ${point} and operation ${operationName}")
-    docResultsMapper(_.spatialSearch(point, k, operationName))
+    partitionMapper(_.spatialSearch(point, k, operationName))
   }

   /**
@@ -277,7 +278,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
                  operationName: String = SpatialOperation.Intersects.getName)
   : LuceneRDDResponse = {
     logInfo(s"Bounding box with center ${center}, radius ${radius}, k = ${k}")
-    docResultsMapper(_.bboxSearch(center, radius, k, operationName))
+    partitionMapper(_.bboxSearch(center, radius, k, operationName))
   }

   /**
@@ -292,7 +293,7 @@ class ShapeLuceneRDD[K: ClassTag, V: ClassTag]
                  operationName: String)
   : LuceneRDDResponse = {
     logInfo(s"Bounding box with lower left ${lowerLeft}, upper right ${upperRight} and k = ${k}")
-    docResultsMapper(_.bboxSearch(lowerLeft, upperRight, k, operationName))
+    partitionMapper(_.bboxSearch(lowerLeft, upperRight, k, operationName))
   }

   override def count(): Long = {
```

0 commit comments

Comments (0)