Open
Description
Describe the bug
I tried to launch a minimal example (Titanic) from JupyterHub with Spark 2.4.4 and got the following exception for string features:
Name: java.lang.ClassCastException
Message: [Lcom.salesforce.op.stages.impl.feature.TextStats; cannot be cast to [Lcom.salesforce.op.stages.impl.feature.TextStats;
The unit test in my local repo seems to work well, with the following dependencies:
// Dependencies tagged "provided" are excluded from the jar built by sbt-assembly
val sparkVersion = "2.4.4"
val scalaTestVersion = "3.0.8"

// The three Spark modules share the same organization, version and "provided" scope
val sparkModules = Seq("spark-core", "spark-mllib", "spark-sql").map { module =>
  "org.apache.spark" %% module % sparkVersion % "provided"
}

// Same artifacts, in the same order: scalatest, Spark modules, TransmogrifAI
libraryDependencies ++=
  Seq("org.scalatest" %% "scalatest" % scalaTestVersion) ++
    sparkModules ++
    Seq("com.salesforce.transmogrifai" %% "transmogrifai-core" % "0.7.0")
To Reproduce
/**
 * Minimal TransmogrifAI launcher used to reproduce the issue: it derives features from a
 * DataFrame, sanity-checks them, fits a logistic-regression model selector and prints the
 * resulting model summary.
 */
object SimpleLauncher {

  /**
   * Trains a binary classification workflow on `inputDf` and prints its summary.
   *
   * @param inputDf   data frame holding both the response column and the predictor columns
   * @param targetCol name of the response column in `inputDf`
   */
  def run(inputDf: DataFrame, targetCol: String): Unit = {
    // The session is implicit so that the calls below which need one can pick it up
    implicit val spark: SparkSession = getSparkSession(false, "Transmogifai Simple Launcher")
    println(s"Yarn application id: ${spark.sparkContext.getConf.getAppId}")
    import spark.implicits._

    // Automated feature engineering: split the response from the predictors, then vectorize
    val (label, predictors) = FeatureBuilder.fromDataFrame[RealNN](inputDf, response = targetCol)
    val rawVector: FeatureLike[OPVector] = predictors.transmogrify()

    // Automated feature selection on the full sample, dropping features flagged as bad
    val vettedVector: FeatureLike[OPVector] =
      label.sanityCheck(rawVector, checkSample = 1.0, removeBadFeatures = true)

    // Model selection restricted to a simple logistic regression
    val selector = BinaryClassificationModelSelector.withTrainValidationSplit(
      modelTypesToUse = Seq(OpLogisticRegression)
    )
    val prediction: FeatureLike[Prediction] = selector.setInput(label, vettedVector).getOutput()

    // Assemble the workflow, train it and report the fitted model
    val workflow = new OpWorkflow().setInputDataset(inputDf).setResultFeatures(prediction)
    val model: OpWorkflowModel = workflow.train()
    println(s"Model summary:\n${model.summaryPretty()}")
  }
}
This works locally:
// Local unit test: loads the Titanic CSV through the TransmogrifAI reader and runs the launcher
test("Titanic simple") {
  import spark.implicits._

  // Read the Titanic data as a DataFrame, keyed by passenger id
  val titanicCsv: String = "src/test/resources/data/PassengerDataAll.csv"
  val passengerReader =
    DataReaders.Simple.csvCase[Passenger](path = Option(titanicCsv), key = _.id.toString)
  val allColumns: DataFrame = passengerReader.readDataset().toDF()

  // Keep only the columns used by the reproduction and print them for inspection
  val selected = allColumns.select("name", "age", "survived")
  selected.show()
  selected.printSchema()

  SimpleLauncher.run(selected, "survived")
}
However, the same launcher fails when run from JupyterHub:
// Read the CSV directly with Spark, using an explicit schema and treating the first row as a header
val passengers = spark.read.schema(schema)
  .option("header", "true")
  .csv("path_to_csv") // closing quote restored: the original string literal was unterminated
SimpleLauncher.run(passengers, "survived")
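Expected behavior
The workflow should train successfully, as it does in the local unit test. Instead, the following exception is thrown: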
Name: java.lang.ClassCastException
Message: [Lcom.salesforce.op.stages.impl.feature.TextStats; cannot be cast to [Lcom.salesforce.op.stages.impl.feature.TextStats;
StackTrace: at com.salesforce.op.stages.impl.feature.SmartTextVectorizer.fitFn(SmartTextVectorizer.scala:91)
at com.salesforce.op.stages.base.sequence.SequenceEstimator.fit(SequenceEstimator.scala:99)
at com.salesforce.op.stages.base.sequence.SequenceEstimator.fit(SequenceEstimator.scala:57)
at com.salesforce.op.utils.stages.FitStagesUtil$$anonfun$20.apply(FitStagesUtil.scala:264)
at com.salesforce.op.utils.stages.FitStagesUtil$$anonfun$20.apply(FitStagesUtil.scala:263)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at com.salesforce.op.utils.stages.FitStagesUtil$.com$salesforce$op$utils$stages$FitStagesUtil$$fitAndTransformLayer(FitStagesUtil.scala:263)
at com.salesforce.op.utils.stages.FitStagesUtil$$anonfun$17.apply(FitStagesUtil.scala:226)
at com.salesforce.op.utils.stages.FitStagesUtil$$anonfun$17.apply(FitStagesUtil.scala:224)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
at com.salesforce.op.utils.stages.FitStagesUtil$.fitAndTransformDAG(FitStagesUtil.scala:224)
at com.salesforce.op.OpWorkflow.fitStages(OpWorkflow.scala:407)
at com.salesforce.op.OpWorkflow.train(OpWorkflow.scala:354)
at launchers.SimpleLauncher$.run(SimpleLauncher.scala:35)
Logs or screenshots
If applicable, add logs or screenshots to help explain your problem.
Additional context
Add any other context about the problem here.
Metadata
Metadata
Assignees
Labels
No labels