Commit 02cf427
Prepare for release v0.3.6 (#155)
* [sbt] version updates
* [sbt] disable build for scala 2.12
* [conf] allow not_analyzed string fields (#145)
* [not-analyzed-fields] do not analyze fields ending with _notanalyzed
* Revert "Revert "Setting version to 0.3.5-SNAPSHOT"" (reverts commit a6da0af)
* [build] update Lucene to 7.7.0
* Hotfix: issue 150 (#151)
* Remove unused code (#141)
* Revert "Setting version to 0.3.4-SNAPSHOT" (reverts commit 2f1d7be)
* README: update to 0.3.3
* README: fix javadoc badge
* remove unused param
* [hotfix] fixes issue 150
* [tests] issue 150
* fix typo
* [blockEntityLinkage] drop queryPartColumns
* [scripts] fix shell
* Block linkage: allow a block linker with Row to Query (#154)
* [linkage] block linker is Row => Query
* remove Query analyzer on methods
* [sbt] set version to 0.3.6-SNAPSHOT
1 parent 40385db commit 02cf427

File tree

9 files changed: +84 additions, -32 deletions

README.md

Lines changed: 6 additions & 4 deletions

````diff
@@ -48,7 +48,7 @@ You can link against this library (for Spark 1.4+) in your program at the follow
 Using SBT:
 
 ```
-libraryDependencies += "org.zouzias" %% "spark-lucenerdd" % "0.3.4"
+libraryDependencies += "org.zouzias" %% "spark-lucenerdd" % "0.3.6"
 ```
 
 Using Maven:
@@ -57,15 +57,15 @@ Using Maven:
 <dependency>
   <groupId>org.zouzias</groupId>
   <artifactId>spark-lucenerdd_2.11</artifactId>
-  <version>0.3.4</version>
+  <version>0.3.6</version>
 </dependency>
 ```
 
 This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
 For example, to include it when starting the spark shell:
 
 ```
-$ bin/spark-shell --packages org.zouzias:spark-lucenerdd_2.11:0.3.4
+$ bin/spark-shell --packages org.zouzias:spark-lucenerdd_2.11:0.3.6
 ```
 
 Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
@@ -76,7 +76,9 @@ The project has the following compatibility with Apache Spark:
 
 Artifact | Release Date | Spark compatibility | Notes | Status
 ------------------------- | --------------- | -------------------------- | ----- | ----
-0.3.5-SNAPSHOT | | >= 2.3.1, JVM 8 | [develop](https://github.com/zouzias/spark-lucenerdd/tree/develop) | Under Development
+0.3.7-SNAPSHOT | | >= 2.4.0, JVM 8 | [develop](https://github.com/zouzias/spark-lucenerdd/tree/develop) | Under Development
+0.3.6 | 2019-03-11 | >= 2.4.0, JVM 8 | [tag v0.3.6](https://github.com/zouzias/spark-lucenerdd/tree/v0.3.6) | Released
+0.3.5 | 2019-02-07 | >= 2.4.0, JVM 8 | [tag v0.3.5](https://github.com/zouzias/spark-lucenerdd/tree/v0.3.5) | Released
 0.3.4 | 2018-11-27 | >= 2.4.0, JVM 8 | [tag v0.3.4](https://github.com/zouzias/spark-lucenerdd/tree/v0.3.4) | Released
 0.2.8 | 2017-05-30 | 2.1.x, JVM 7 | [tag v0.2.8](https://github.com/zouzias/spark-lucenerdd/tree/v0.2.8) | Released
 0.1.0 | 2016-09-26 | 1.4.x, 1.5.x, 1.6.x | [tag v0.1.0](https://github.com/zouzias/spark-lucenerdd/tree/v0.1.0) | Cross-released with 2.10/2.11
````

build.sbt

Lines changed: 4 additions & 4 deletions

```diff
@@ -77,7 +77,7 @@ pomExtra := <scm>
   </developer>
 </developers>
 
-val luceneV = "7.6.0"
+val luceneV = "7.7.1"
 
 spName := "zouzias/spark-lucenerdd"
 sparkVersion := "2.4.0"
@@ -94,8 +94,8 @@ testSparkVersion := sys.props.get("spark.testVersion").getOrElse(sparkVersion.va
 
 
 // scalastyle:off
-val scalactic = "org.scalactic" %% "scalactic" % "3.0.5"
-val scalatest = "org.scalatest" %% "scalatest" % "3.0.5" % "test"
+val scalactic = "org.scalactic" %% "scalactic" % "3.0.6"
+val scalatest = "org.scalatest" %% "scalatest" % "3.0.6" % "test"
 
 val joda_time = "joda-time" % "joda-time" % "2.10.1"
 val algebird = "com.twitter" %% "algebird-core" % "0.13.5"
@@ -111,7 +111,7 @@ val lucene_expressions = "org.apache.lucene" % "lucene-expre
 val lucene_spatial        = "org.apache.lucene" % "lucene-spatial" % luceneV
 val lucene_spatial_extras = "org.apache.lucene" % "lucene-spatial-extras" % luceneV
 
-val jts = "org.locationtech.jts" % "jts-core" % "1.16.0"
+val jts = "org.locationtech.jts" % "jts-core" % "1.16.1"
 // scalastyle:on
```
spark-shell.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,7 +6,7 @@ CURRENT_DIR=`pwd`
 SPARK_LUCENERDD_VERSION=`cat version.sbt | awk '{print $5}' | xargs`
 
 # You should have downloaded this spark version under your ${HOME}
-SPARK_VERSION="2.3.1"
+SPARK_VERSION="2.4.0"
 
 echo "==============================================="
 echo "Loading LuceneRDD with version ${SPARK_LUCENERDD_VERSION}"
```

src/main/scala/org/zouzias/spark/lucenerdd/LuceneRDD.scala

Lines changed: 13 additions & 12 deletions

```diff
@@ -469,7 +469,7 @@ object LuceneRDD extends Versionable
    *
    * @param queries Queries / entities to be linked with @corpus
    * @param entities DataFrame of entities to be linked with queries parameter
-   * @param rowToQueryString Converts each [[Row]] to a 'Lucene Query Syntax'
+   * @param rowToQuery Function[Row, Query] that converts [[Row]] to a Lucene [[Query]]
    * @param queryPartColumns List of query columns for [[HashPartitioner]]
    * @param entityPartColumns List of entity columns for [[HashPartitioner]]
    * @param topK Number of linked results
@@ -481,7 +481,7 @@ object LuceneRDD extends Versionable
    */
   def blockEntityLinkage(queries: DataFrame,
                          entities: DataFrame,
-                         rowToQueryString: Row => String,
+                         rowToQuery: Row => Query,
                          queryPartColumns: Array[String],
                          entityPartColumns: Array[String],
                          topK : Int = 3,
@@ -496,17 +496,18 @@ object LuceneRDD extends Versionable
       "Query Partition columns must be non-empty for block linkage")
 
 
-    val partColumn = "__PARTITION_COLUMN__"
+    val partColumnLeft = "__PARTITION_COLUMN_LEFT__"
+    val partColumnRight = "__PARTITION_COLUMN_RIGHT__"
 
     // Prepare input DataFrames for cogroup operation.
     // Keyed them on queryPartColumns and entityPartColumns
     // I.e., Query/Entity DataFrame are now of type (String, Row)
-    val blocked = entities.withColumn(partColumn,
-      concat(entityPartColumns.map(entities.col): _*))
-      .rdd.keyBy(x => x.getString(x.fieldIndex(partColumn)))
-    val blockedQueries = queries.withColumn(partColumn,
-      concat(entityPartColumns.map(entities.col): _*))
-      .rdd.keyBy(x => x.getString(x.fieldIndex(partColumn)))
+    val blocked = entities.withColumn(partColumnLeft,
+      concat(entityPartColumns.map(entities.col): _*))
+      .rdd.keyBy(x => x.getString(x.fieldIndex(partColumnLeft)))
+    val blockedQueries = queries.withColumn(partColumnRight,
+      concat(queryPartColumns.map(queries.col): _*)).drop(queryPartColumns: _*)
+      .rdd.keyBy(x => x.getString(x.fieldIndex(partColumnRight)))
 
     // Cogroup queries and entities. Map over each
     // CoGrouped partition and instantiate Lucene index on partitioned
@@ -520,7 +521,7 @@ object LuceneRDD extends Versionable
         queryAnalyzer, similarity)
 
       // Multi-query lucene index
-      qs.map(q => (q, lucenePart.query(rowToQueryString(q), topK).results.toArray))
+      qs.map(q => (q, lucenePart.query(rowToQuery(q), topK).results.toArray))
     }
   }
 }
@@ -529,7 +530,7 @@ object LuceneRDD extends Versionable
    * Deduplication via blocking
    *
    * @param entities Entities [[DataFrame]] to deduplicate
-   * @param rowToQueryString Function that maps [[Row]] to Lucene Query String
+   * @param rowToQuery Function that maps [[Row]] to Lucene [[Query]]
    * @param blockingColumns Columns on which exact match is required
    * @param topK Number of top-K query results
    * @param indexAnalyzer Lucene analyzer at index time
@@ -540,7 +541,7 @@ object LuceneRDD extends Versionable
    * @return
    */
   def blockDedup(entities: DataFrame,
-                 rowToQueryString: Row => String,
+                 rowToQuery: Row => Query,
                  blockingColumns: Array[String],
                  topK : Int = 3,
                  indexAnalyzer: String = getOrElseEn(IndexAnalyzerConfigName),
@@ -574,7 +575,7 @@ object LuceneRDD extends Versionable
         queryAnalyzer, similarity)
 
       // Multi-query lucene index
-      iterQueries.map(q => (q, lucenePart.query(rowToQueryString(q), topK).results.toArray))
+      iterQueries.map(q => (q, lucenePart.query(rowToQuery(q), topK).results.toArray))
     }
   }
 }
```
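The signature change above replaces the `Row => String` linker with a `Row => Query` one, so callers now build Lucene queries programmatically instead of formatting query-syntax strings. A minimal sketch of such a linker (the `"name"` column is illustrative, not part of this commit):

```scala
import org.apache.lucene.index.Term
import org.apache.lucene.search.{Query, TermQuery}
import org.apache.spark.sql.Row

// Exact-match linker: look up each row's "name" value as a TermQuery.
// No query-string escaping is needed, unlike the old Row => String variant.
val linker: Row => Query = { row =>
  val name = row.getString(row.fieldIndex("name"))
  new TermQuery(new Term("name", name))
}
```

Passing a `Query` object sidesteps QueryParser escaping of special characters (e.g. `:`, `+`, `-`) that could break string-based linkers.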

src/main/scala/org/zouzias/spark/lucenerdd/partition/AbstractLuceneRDDPartition.scala

Lines changed: 11 additions & 1 deletion

```diff
@@ -16,7 +16,7 @@
  */
 package org.zouzias.spark.lucenerdd.partition
 
-import org.apache.lucene.search.BooleanClause
+import org.apache.lucene.search.{BooleanClause, Query}
 import org.zouzias.spark.lucenerdd.models.indexstats.IndexStatistics
 import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, TermVectorEntry}
 import org.zouzias.spark.lucenerdd.response.LuceneRDDResponsePartition
@@ -62,6 +62,16 @@ private[lucenerdd] abstract class AbstractLuceneRDDPartition[T] extends Serializ
    */
   def query(searchString: String, topK: Int): LuceneRDDResponsePartition
 
+
+  /**
+   * Lucene search using Lucene [[Query]]
+   * @param query Lucene query, i.e., [[org.apache.lucene.search.BooleanQuery]] or
+   *              [[org.apache.lucene.search.PhraseQuery]]
+   * @param topK Number of documents to return
+   * @return
+   */
+  def query(query: Query, topK: Int): LuceneRDDResponsePartition
+
   /**
    * Multiple generic Lucene Queries using QueryParser
    * @param searchString Lucene query string
```
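The new abstract `query(query: Query, topK: Int)` accepts any Lucene `Query` subclass, including the `BooleanQuery` and `PhraseQuery` examples named in its doc comment. A sketch of a composite query a caller might pass (field names are illustrative only):

```scala
import org.apache.lucene.index.Term
import org.apache.lucene.search.{BooleanClause, BooleanQuery, TermQuery}

// Composite query: "name" must match; a matching "email" only boosts the score.
val composite: BooleanQuery = new BooleanQuery.Builder()
  .add(new TermQuery(new Term("name", "water")), BooleanClause.Occur.MUST)
  .add(new TermQuery(new Term("email", "yes@gmail.com")), BooleanClause.Occur.SHOULD)
  .build()

// partition.query(composite, 10) would then return the top-10 matching documents.
```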

src/main/scala/org/zouzias/spark/lucenerdd/partition/LuceneRDDPartition.scala

Lines changed: 7 additions & 0 deletions

```diff
@@ -137,6 +137,13 @@ private[lucenerdd] class LuceneRDDPartition[T]
     LuceneRDDResponsePartition(results.toIterator)
   }
 
+  override def query(query: Query,
+                     topK: Int): LuceneRDDResponsePartition = {
+    val results = LuceneQueryHelpers.searchQuery(indexSearcher, query, topK)
+
+    LuceneRDDResponsePartition(results.toIterator)
+  }
+
   override def queries(searchStrings: Iterable[String],
                       topK: Int): Iterable[(String, LuceneRDDResponsePartition)] = {
     searchStrings.map( searchString =>
```

src/main/scala/org/zouzias/spark/lucenerdd/query/LuceneQueryHelpers.scala

Lines changed: 19 additions & 2 deletions

```diff
@@ -95,18 +95,35 @@ object LuceneQueryHelpers extends Serializable {
    *
    * @param indexSearcher Index searcher
    * @param searchString Lucene search query string
-   * @param topK Number of returned documents
+   * @param topK Number of documents to return
    * @param analyzer Lucene Analyzer
    * @return
    */
   def searchParser(indexSearcher: IndexSearcher,
                    searchString: String,
-                   topK: Int, analyzer: Analyzer)
+                   topK: Int,
+                   analyzer: Analyzer)
   : Seq[SparkScoreDoc] = {
     val q = parseQueryString(searchString, analyzer)
     indexSearcher.search(q, topK).scoreDocs.map(SparkScoreDoc(indexSearcher, _))
   }
 
+  /**
+   * Lucene search using a Lucene [[Query]]
+   *
+   * Important: Query analysis is done during the definition of query
+   * @param indexSearcher Lucene index searcher
+   * @param query Lucene query
+   * @param topK Number of documents to return
+   * @return
+   */
+  def searchQuery(indexSearcher: IndexSearcher,
+                  query: Query,
+                  topK: Int)
+  : Seq[SparkScoreDoc] = {
+    indexSearcher.search(query, topK).scoreDocs.map(SparkScoreDoc(indexSearcher, _))
+  }
+
   /**
    * Faceted search using [[SortedSetDocValuesFacetCounts]]
    *
```
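As the new doc comment notes, `searchQuery` applies no analyzer: any analysis must happen when the `Query` itself is built. A self-contained sketch against an in-memory index, assuming the lucene-core artifact from this build (`RAMDirectory` is still available in Lucene 7.x):

```scala
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.{Document, Field, StringField}
import org.apache.lucene.index.{DirectoryReader, IndexWriter, IndexWriterConfig, Term}
import org.apache.lucene.search.{IndexSearcher, TermQuery}
import org.apache.lucene.store.RAMDirectory
import org.zouzias.spark.lucenerdd.query.LuceneQueryHelpers

// Index one document in memory.
val dir = new RAMDirectory()
val writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))
val doc = new Document()
doc.add(new StringField("name", "water", Field.Store.YES)) // StringField: indexed verbatim, not analyzed
writer.addDocument(doc)
writer.close()

// TermQuery matches the un-analyzed term exactly; searchQuery performs no further analysis.
val searcher = new IndexSearcher(DirectoryReader.open(dir))
val hits = LuceneQueryHelpers.searchQuery(searcher, new TermQuery(new Term("name", "water")), topK = 5)
```

Because the un-analyzed `StringField` stores the term verbatim, the `TermQuery` above would match it; a `TextField` analyzed at index time would instead require building the query from analyzed tokens.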

src/test/scala/org/zouzias/spark/lucenerdd/BlockingDedupSpec.scala

Lines changed: 5 additions & 2 deletions

```diff
@@ -17,6 +17,8 @@
 package org.zouzias.spark.lucenerdd
 
 import com.holdenkarau.spark.testing.SharedSparkContext
+import org.apache.lucene.index.Term
+import org.apache.lucene.search.{Query, TermQuery}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Row, SparkSession}
 import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}
@@ -44,10 +46,11 @@ class BlockingDedupSpec extends FlatSpec
     }
     val df = sc.parallelize(people).repartition(2).toDF()
 
-    val linker: Row => String = { row =>
+    val linker: Row => Query = { row =>
       val name = row.getString(row.fieldIndex("name"))
+      val term = new Term("name", name)
 
-      s"name:$name"
+      new TermQuery(term)
     }
 
```

src/test/scala/org/zouzias/spark/lucenerdd/BlockingLinkageSpec.scala

Lines changed: 18 additions & 6 deletions

```diff
@@ -17,6 +17,8 @@
 package org.zouzias.spark.lucenerdd
 
 import com.holdenkarau.spark.testing.SharedSparkContext
+import org.apache.lucene.index.Term
+import org.apache.lucene.search.{Query, TermQuery}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Row, SparkSession}
 import org.scalatest.{BeforeAndAfterEach, FlatSpec, Matchers}
@@ -37,24 +39,34 @@ class BlockingLinkageSpec extends FlatSpec
     val spark = SparkSession.builder().getOrCreate()
     import spark.implicits._
 
-    val people: Array[Person] = Array("fear", "death", "water", "fire", "house")
+    val peopleLeft: Array[Person] = Array("fear", "death", "water", "fire", "house")
       .zipWithIndex.map { case (str, index) =>
         val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com"
         Person(str, index, email)
       }
-    val df = sc.parallelize(people).repartition(2).toDF()
 
-    val linker: Row => String = { row =>
+    val peopleRight: Array[Person] = Array("fear", "death", "water", "fire", "house")
+      .zipWithIndex.map { case (str, index) =>
+        val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com"
+        Person(str, index, email)
+      }
+
+    val leftDF = sc.parallelize(peopleLeft).repartition(2).toDF()
+    val rightDF = sc.parallelize(peopleRight).repartition(3).toDF()
+
+    // Define a Lucene Term linker
+    val linker: Row => Query = { row =>
       val name = row.getString(row.fieldIndex("name"))
+      val term = new Term("name", name)
 
-      s"name:$name"
+      new TermQuery(term)
     }
 
-    val linked = LuceneRDD.blockEntityLinkage(df, df, linker,
+    val linked = LuceneRDD.blockEntityLinkage(leftDF, rightDF, linker,
       Array("email"), Array("email"))
 
-    val linkedCount, dfCount = (linked.count, df.count())
+    val linkedCount, dfCount = (linked.count, leftDF.count())
 
     linkedCount should equal(dfCount)
```
