Skip to content

Commit db938ee

Browse files
authored
Merge pull request #80 from zouzias/feature/moreLikeThis
Feature/more like this
2 parents a6b5c57 + e06201f commit db938ee

File tree

8 files changed

+3888
-1
lines changed

8 files changed

+3888
-1
lines changed

project/build.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
sbt.version=0.13.12
1+
sbt.version=0.13.13

scripts/loadAlice.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
import scala.io.Source
import org.zouzias.spark.lucenerdd._
import org.zouzias.spark.lucenerdd.LuceneRDD

// Example script: index the lines of alice.txt in a LuceneRDD and run a
// "more like this" query against it.
// NOTE(review): `sc` is not defined here; presumably the script is loaded
// into a spark-shell session where the SparkContext is already in scope.

// Read all lines eagerly and always release the file handle.
// `toVector` (rather than `toSeq`) forces the iterator before `close()`:
// on Scala 2.11/2.12 `Iterator.toSeq` is lazy, so closing the source first
// would break later reads.
val words = {
  val source = Source.fromFile("src/test/resources/alice.txt")
  try source.getLines().map(_.toLowerCase).toVector
  finally source.close()
}
val rdd = sc.parallelize(words)
val luceneRDD = LuceneRDD(rdd)
luceneRDD.cache
// Run an action on the freshly built RDD before issuing the query.
luceneRDD.count


// Arguments: fieldName, query, minTermFreq, minDocFreq, topK.
// NOTE(review): "_1" is presumably the field name under which the plain
// strings are indexed; confirm against the LuceneRDD field naming.
luceneRDD.moreLikeThis("_1", "alice adventures wonderland", 1, 1, 20).take(20).foreach(println)

src/main/scala/org/zouzias/spark/lucenerdd/LuceneRDD.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,22 @@ class LuceneRDD[T: ClassTag](protected val partitionsRDD: RDD[AbstractLuceneRDDP
264264
partitionsRDD.map(_.size).reduce(_ + _)
265265
}
266266

267+
/**
 * Lucene's More Like This (MLT) functionality.
 *
 * Logs the request, then asks every partition for its `moreLikeThis`
 * results through `partitionMapper`.
 *
 * @param fieldName Field name
 * @param query Query text
 * @param minTermFreq Minimum term frequency
 * @param minDocFreq Minimum document frequency
 * @param topK Number of returned documents (defaults to `DefaultTopK`)
 * @return the [[LuceneRDDResponse]] produced by `partitionMapper` for the
 *         per-partition MLT results
 */
def moreLikeThis(fieldName: String, query: String,
                 minTermFreq: Int, minDocFreq: Int, topK: Int = DefaultTopK)
: LuceneRDDResponse = {
  logInfo(s"MoreLikeThis field: ${fieldName}, query: ${query}")
  partitionMapper(
    partition => partition.moreLikeThis(fieldName, query, minTermFreq, minDocFreq, topK),
    topK)
}
282+
267283
/** RDD compute method. */
268284
override def compute(part: Partition, context: TaskContext): Iterator[T] = {
269285
firstParent[AbstractLuceneRDDPartition[T]].iterator(part, context).next.iterator

src/main/scala/org/zouzias/spark/lucenerdd/partition/AbstractLuceneRDDPartition.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,20 @@ private[lucenerdd] abstract class AbstractLuceneRDDPartition[T] extends Serializ
118118
*/
119119
def phraseQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition
120120

121+
122+
/**
 * Lucene's More Like This (MLT) functionality.
 *
 * Abstract: concrete partitions supply the implementation.
 *
 * @param fieldName Field name
 * @param query Query text
 * @param minTermFreq Minimum term frequency
 * @param minDocFreq Minimum document frequency
 * @param topK Number of returned documents
 * @return the MLT results for this partition
 */
def moreLikeThis(
  fieldName: String,
  query: String,
  minTermFreq: Int,
  minDocFreq: Int,
  topK: Int): LuceneRDDResponsePartition
134+
121135
/**
122136
* Restricts the entries to those satisfying a predicate
123137
* @param pred Predicate to filter on

src/main/scala/org/zouzias/spark/lucenerdd/partition/LuceneRDDPartition.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,14 @@ private[lucenerdd] class LuceneRDDPartition[T]
145145
facetField + FacetedLuceneRDD.FacetTextFieldSuffix,
146146
topK)(Analyzer)
147147
}
148+
149+
/**
 * Runs a More Like This (MLT) query against this partition's index.
 *
 * Delegates to `LuceneQueryHelpers.moreLikeThis` using this partition's
 * `indexSearcher` and `Analyzer`, and wraps the hits in a
 * `LuceneRDDResponsePartition`.
 */
override def moreLikeThis(
  fieldName: String,
  query: String,
  minTermFreq: Int,
  minDocFreq: Int,
  topK: Int): LuceneRDDResponsePartition = {
  // The analyzer is passed explicitly in the helper's implicit parameter list.
  val similarDocs = LuceneQueryHelpers.moreLikeThis(
    indexSearcher, fieldName, query, minTermFreq, minDocFreq, topK)(Analyzer)
  LuceneRDDResponsePartition(similarDocs)
}
148156
}
149157

150158
object LuceneRDDPartition {

src/main/scala/org/zouzias/spark/lucenerdd/query/LuceneQueryHelpers.scala

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import org.apache.lucene.facet.{FacetsCollector, FacetsConfig}
2525
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts
2626
import org.apache.lucene.facet.taxonomy.{FastTaxonomyFacetCounts, TaxonomyReader}
2727
import org.apache.lucene.index.Term
28+
import org.apache.lucene.queries.mlt.MoreLikeThis
2829
import org.apache.lucene.queryparser.classic.QueryParser
2930
import org.apache.lucene.search._
3031
import org.zouzias.spark.lucenerdd.aggregate.SparkFacetResultMonoid
@@ -280,4 +281,29 @@ object LuceneQueryHelpers extends Serializable {
280281

281282
searchTopK(indexSearcher, builder.build(), topK)
282283
}
284+
285+
/**
 * Lucene's More Like This (MLT) functionality.
 *
 * Builds a `MoreLikeThis` instance over the searcher's index reader,
 * derives a query from the supplied text and returns the top hits.
 *
 * @param indexSearcher Searcher whose index reader backs the MLT instance
 *                      and which executes the resulting query
 * @param fieldName Field used both as the MLT field name and as the field
 *                  the query text is analyzed for
 * @param query Query text
 * @param minTermFreq Minimum term frequency
 * @param minDocFreq Minimum document frequency
 * @param topK Number of hits requested from `searchTopK`
 * @param analyzer Analyzer applied to the query text
 * @return iterator over the hits returned by `searchTopK`
 */
def moreLikeThis(
  indexSearcher: IndexSearcher,
  fieldName: String,
  query: String,
  minTermFreq: Int,
  minDocFreq: Int,
  topK: Int)(implicit analyzer: Analyzer): Iterator[SparkScoreDoc] = {
  val similarity = new MoreLikeThis(indexSearcher.getIndexReader)
  similarity.setAnalyzer(analyzer)
  // FIXME: Is this necessary?
  // NOTE(review): whether `like(fieldName, ...)` also consults the configured
  // field names appears to depend on the Lucene version; confirm before
  // removing this call.
  similarity.setFieldNames(Array(fieldName))
  similarity.setMinTermFreq(minTermFreq)
  similarity.setMinDocFreq(minDocFreq)

  val similarityQuery = similarity.like(fieldName, new StringReader(query))
  searchTopK(indexSearcher, similarityQuery, topK).toIterator
}
283309
}

0 commit comments

Comments
 (0)