From 1732ed10680c546c1f51a772e7fba2b08cfb4a87 Mon Sep 17 00:00:00 2001 From: Arek Burdach Date: Mon, 10 Mar 2025 17:42:37 +0100 Subject: [PATCH 1/9] Flink MiniCluster: IO.pure instead of IO.delay + attached StreamExecutionEnvironment --- .../flink/minicluster/FlinkMiniClusterFactory.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala index 3fa929c1e9e..57ea13e15d4 100644 --- a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala +++ b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala @@ -109,11 +109,19 @@ class FlinkMiniClusterWithServices( ) extends AutoCloseable { def withDetachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { - createDetachedStreamExecutionEnvironment.use(env => IO(action(env))).unsafeRunSync() + createDetachedStreamExecutionEnvironment.use(env => IO.pure(action(env))).unsafeRunSync() } def createDetachedStreamExecutionEnvironment: Resource[IO, StreamExecutionEnvironment] = { - Resource.fromAutoCloseable(IO(streamExecutionEnvironmentFactory(false))) + Resource.fromAutoCloseable(IO.pure(streamExecutionEnvironmentFactory(false))) + } + + def withAttachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { + createAttachedStreamExecutionEnvironment.use(env => IO.pure(action(env))).unsafeRunSync() + } + + def createAttachedStreamExecutionEnvironment: Resource[IO, StreamExecutionEnvironment] = { + Resource.fromAutoCloseable(IO.pure(streamExecutionEnvironmentFactory(true))) } override def close(): Unit = miniCluster.close() From 18d7c888e60fb5fe1d08425d1635d567d38e6964 Mon Sep 17 00:00:00 2001 From: Arek Burdach Date: Mon, 10 Mar 2025 22:56:59 +0100 Subject: [PATCH 2/9] SyncIO --- .../minicluster/FlinkMiniClusterFactory.scala | 17 +++++++++-------- .../FlinkMiniClusterScenarioStateVerifier.scala | 3 ++- .../FlinkMiniClusterScenarioTestRunner.scala | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala index 57ea13e15d4..6abf628d1f7 100644 --- a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala +++ b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala @@ -1,8 +1,8 @@ package pl.touk.nussknacker.engine.flink.minicluster -import cats.effect.IO +import cats.Applicative +import cats.effect.{Sync, SyncIO} import cats.effect.kernel.Resource -import cats.effect.unsafe.implicits.global import com.typesafe.scalalogging.LazyLogging import org.apache.flink.configuration._ import org.apache.flink.core.fs.FileSystem @@ -16,6 +16,7 @@ import pl.touk.nussknacker.engine.util.ThreadUtils import pl.touk.nussknacker.engine.util.loader.ModelClassLoader import java.net.URLClassLoader +import scala.language.higherKinds object FlinkMiniClusterFactory extends LazyLogging { @@ -109,19 +110,19 @@ class FlinkMiniClusterWithServices( ) extends AutoCloseable { def 
withDetachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { - createDetachedStreamExecutionEnvironment.use(env => IO.pure(action(env))).unsafeRunSync() + createDetachedStreamExecutionEnvironment[SyncIO].use(env => SyncIO.pure(action(env))).unsafeRunSync() } - def createDetachedStreamExecutionEnvironment: Resource[IO, StreamExecutionEnvironment] = { - Resource.fromAutoCloseable(IO.pure(streamExecutionEnvironmentFactory(false))) + def createDetachedStreamExecutionEnvironment[F[_]: Sync]: Resource[F, StreamExecutionEnvironment] = { + Resource.fromAutoCloseable(Applicative[F].pure(streamExecutionEnvironmentFactory(false))) } def withAttachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { - createAttachedStreamExecutionEnvironment.use(env => IO.pure(action(env))).unsafeRunSync() + createAttachedStreamExecutionEnvironment[SyncIO].use(env => SyncIO.pure(action(env))).unsafeRunSync() } - def createAttachedStreamExecutionEnvironment: Resource[IO, StreamExecutionEnvironment] = { - Resource.fromAutoCloseable(IO.pure(streamExecutionEnvironmentFactory(true))) + def createAttachedStreamExecutionEnvironment[F[_]: Sync]: Resource[F, StreamExecutionEnvironment] = { + Resource.fromAutoCloseable(Applicative[F].pure(streamExecutionEnvironmentFactory(true))) } override def close(): Unit = miniCluster.close() diff --git a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioStateVerifier.scala b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioStateVerifier.scala index 9b56a1f0981..618e960485e 100644 --- a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioStateVerifier.scala +++ b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioStateVerifier.scala @@ -57,7 +57,8 @@ class FlinkMiniClusterScenarioStateVerifier( env ) val scenarioName = processVersion.processName - miniClusterWithServices.createDetachedStreamExecutionEnvironment + miniClusterWithServices + .createDetachedStreamExecutionEnvironment[IO] .use { env => logger.info(s"Starting to verify $scenarioName") (for { diff --git a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioTestRunner.scala b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioTestRunner.scala index 6598c2500f7..20d5ca0bde6 100644 --- a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioTestRunner.scala +++ b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/scenariotesting/FlinkMiniClusterScenarioTestRunner.scala @@ -60,7 +60,7 @@ class FlinkMiniClusterScenarioTestRunner( ) } (for { - env <- miniClusterWithServices.createDetachedStreamExecutionEnvironment + env <- miniClusterWithServices.createDetachedStreamExecutionEnvironment[IO] collectingListener <- ResultsCollectingListenerHolder.registerTestEngineListener } yield (env, collectingListener)) .use { case (env, collectingListener) => From 960ff62f91fbbcad0e3cdc76c393241e83055ce1 Mon Sep 17 00:00:00 2001 From: Arek Burdach Date: Tue, 11 Mar 2025 17:37:02 +0100 Subject: [PATCH 3/9] [NU-2045] FlinkMiniClusterWithServices: attached 
variants of methods creating StreamExecutionEnvironment + SyncIO to fix problem with ThreadLocal + clean up --- .../minicluster/FlinkMiniClusterFactory.scala | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala index 6abf628d1f7..72106632cd2 100644 --- a/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala +++ b/engine/flink/minicluster/src/main/scala/pl/touk/nussknacker/engine/flink/minicluster/FlinkMiniClusterFactory.scala @@ -79,15 +79,7 @@ object FlinkMiniClusterFactory extends LazyLogging { miniCluster.start() } - def createStreamExecutionEnv(attached: Boolean): StreamExecutionEnvironment = { - FlinkMiniClusterStreamExecutionEnvironmentFactory.createStreamExecutionEnvironment( - miniCluster, - modelClassLoader, - attached - ) - } - - new FlinkMiniClusterWithServices(miniCluster, createStreamExecutionEnv) + new FlinkMiniClusterWithServices(miniCluster, modelClassLoader) } private def createMiniCluster(configuration: Configuration) = { @@ -106,10 +98,11 @@ object FlinkMiniClusterFactory extends LazyLogging { class FlinkMiniClusterWithServices( val miniCluster: MiniCluster, - streamExecutionEnvironmentFactory: Boolean => StreamExecutionEnvironment + modelClassLoader: URLClassLoader ) extends AutoCloseable { def withDetachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { + // We use SyncIO, because passed actions sometimes uses ThreadLocal and we don't want to change the Thread which run this action createDetachedStreamExecutionEnvironment[SyncIO].use(env => SyncIO.pure(action(env))).unsafeRunSync() } @@ -117,14 +110,28 @@ class FlinkMiniClusterWithServices( Resource.fromAutoCloseable(Applicative[F].pure(streamExecutionEnvironmentFactory(false))) } + // This method is used only by external project. It should be used with caution because action on StreamExecutionEnvironment + // can be blocking and it can cause thread pool starvation or deadlock. As an alternative, we recommend to use withDetachedStreamExecutionEnvironment + // combined with MiniClusterJobStatusCheckingOps def withAttachedStreamExecutionEnvironment[T](action: StreamExecutionEnvironment => T): T = { + // We use SyncIO, because passed actions sometimes uses ThreadLocal and we don't want to change the Thread which run this action createAttachedStreamExecutionEnvironment[SyncIO].use(env => SyncIO.pure(action(env))).unsafeRunSync() } + // This method is used only by external project. It should be used with caution because action on StreamExecutionEnvironment + // can be blocking and it can cause thread pool starvation or deadlock. 
As an alternative, we recommend to use createDetachedStreamExecutionEnvironment + // combined with MiniClusterJobStatusCheckingOps def createAttachedStreamExecutionEnvironment[F[_]: Sync]: Resource[F, StreamExecutionEnvironment] = { Resource.fromAutoCloseable(Applicative[F].pure(streamExecutionEnvironmentFactory(true))) } + private def streamExecutionEnvironmentFactory(attached: Boolean): StreamExecutionEnvironment = + FlinkMiniClusterStreamExecutionEnvironmentFactory.createStreamExecutionEnvironment( + miniCluster, + modelClassLoader, + attached + ) + override def close(): Unit = miniCluster.close() } From 9b1dbd1010ca3f041771d4e64a2e4fa7454a5725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82abek?= Date: Mon, 17 Mar 2025 21:50:26 +0100 Subject: [PATCH 4/9] remove flink classloader wrapper --- .../process/registrar/FlinkProcessRegistrar.scala | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/engine/flink/executor/src/main/scala/pl/touk/nussknacker/engine/process/registrar/FlinkProcessRegistrar.scala b/engine/flink/executor/src/main/scala/pl/touk/nussknacker/engine/process/registrar/FlinkProcessRegistrar.scala index f2cf4fcbf21..72a448c2655 100644 --- a/engine/flink/executor/src/main/scala/pl/touk/nussknacker/engine/process/registrar/FlinkProcessRegistrar.scala +++ b/engine/flink/executor/src/main/scala/pl/touk/nussknacker/engine/process/registrar/FlinkProcessRegistrar.scala @@ -98,17 +98,9 @@ class FlinkProcessRegistrar( protected def isRemoteEnv(env: StreamExecutionEnvironment): Boolean = env.isInstanceOf[RemoteStreamEnvironment] - // In remote env we assume FlinkProcessRegistrar is loaded via userClassloader protected def usingRightClassloader(env: StreamExecutionEnvironment)(action: ClassLoader => Unit): Unit = { - if (!isRemoteEnv(env)) { - val flinkLoaderSimulation = streamExecutionEnvPreparer.flinkClassLoaderSimulation - ThreadUtils.withThisAsContextClassLoader[Unit](flinkLoaderSimulation) { - action(flinkLoaderSimulation) - } - } else { - val userLoader = getClass.getClassLoader - action(userLoader) - } + val userLoader = getClass.getClassLoader + action(userLoader) } protected def createInterpreter( From 786cdda6333872b7c38e0759d24f6de160bc0132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82abek?= Date: Fri, 21 Mar 2025 15:54:57 +0100 Subject: [PATCH 5/9] add flink sql validations using env --- build.sbt | 4 +- .../flink/table/FlinkSqlTableTestCases.scala | 8 +- .../table/definition/FlinkDdlParserTest.scala | 62 ++-- .../TablesDefinitionDiscoveryTest.scala | 286 +++++++++++++++--- .../TablesDefinitionValidationTest.scala | 161 ++++++++++ .../flink/table/sink/TableFileSinkTest.scala | 34 ++- .../table/sink/TableSinkParametersTest.scala | 10 +- .../TableSourceDataGenerationTest.scala | 44 ++- .../flink/table/source/TableSourceTest.scala | 4 +- .../ModelClassLoaderSimulationSuite.scala | 22 ++ ...linkTableDataSourceComponentProvider.scala | 38 ++- .../definition/FlinkDataDefinition.scala | 89 +++--- .../FlinkDataDefinitionErrors.scala | 115 +++++++ .../table/definition/FlinkDdlParser.scala | 60 ++-- .../TablesDefinitionDiscovery.scala | 171 ++++++++--- .../TablesDefinitionValidation.scala | 88 ++++++ .../flink/table/sink/TableSinkFactory.scala | 26 +- .../FlinkMiniClusterTableOperations.scala | 61 ++-- .../flink/table/source/TableSource.scala | 42 ++- .../table/source/TableSourceFactory.scala | 26 +- .../flink/table/utils/SchemaExtensions.scala | 7 + 21 files changed, 1064 insertions(+), 294 deletions(-) create mode 100644 
engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala create mode 100644 engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala create mode 100644 engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinitionErrors.scala create mode 100644 engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala diff --git a/build.sbt b/build.sbt index 0da597a6c03..b365040a965 100644 --- a/build.sbt +++ b/build.sbt @@ -294,6 +294,7 @@ lazy val commonSettings = // You can find versions provided by Flink in it's lib/flink-dist-*.jar/META-INF/DEPENDENCIES file. val flinkV = "1.19.2" val flinkConnectorKafkaV = "3.2.0-1.19" +val jdbcFlinkConnectorV = "3.2.0-1.19" val flinkCommonsCompressV = "1.26.0" val flinkCommonsLang3V = "3.12.0" val flinkCommonsTextV = "1.10.0" @@ -1861,7 +1862,8 @@ lazy val flinkTableApiComponents = (project in flink("components/table")) "org.apache.flink" % "flink-sql-parser" % flinkV, "org.apache.flink" % "flink-connector-files" % flinkV, // needed for testing data generation "org.apache.flink" % "flink-json" % flinkV, // needed for testing data generation - "org.apache.flink" % "flink-csv" % flinkV % Test, + "org.apache.flink" % "flink-csv" % flinkV % Test, + "org.apache.flink" % "flink-connector-jdbc" % jdbcFlinkConnectorV % Test, ) } ) diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/FlinkSqlTableTestCases.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/FlinkSqlTableTestCases.scala index dce1dd54978..4f5640bfaa8 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/FlinkSqlTableTestCases.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/FlinkSqlTableTestCases.scala @@ -2,6 +2,7 @@ package pl.touk.nussknacker.engine.flink.table import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement.{ CatalogName, + CatalogType, CreateCatalog, SqlOption } @@ -17,7 +18,9 @@ object FlinkSqlTableTestCases { | someIntComputed AS someInt * 2, | `file.name` STRING NOT NULL METADATA |) WITH ( - | 'connector' = 'filesystem' + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'csv' |);""".stripMargin val unboundedDatagenTable: String = @@ -83,7 +86,8 @@ object FlinkSqlTableTestCases { SqlOption("username", "username"), SqlOption("password", "password"), SqlOption("base-url", "jdbc:postgresql://localhost:5432"), - ) + ), + CatalogType("jdbc") ) } diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParserTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParserTest.scala index 64df94aa436..3b2a76930a6 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParserTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParserTest.scala @@ -6,35 +6,18 @@ import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import pl.touk.nussknacker.engine.flink.table.FlinkSqlTableTestCases import 
pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement.{ + Connector, CreateTable, SqlString } -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDdlParseError.{ParseError, UnallowedStatement} +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError._ import pl.touk.nussknacker.engine.flink.table.definition.FlinkDdlParserTest.syntacticallyInvalidSqlStatements class FlinkDdlParserTest extends AnyFunSuite with Matchers { - test("return error for syntactically invalid statements") { - syntacticallyInvalidSqlStatements.foreach { s => - FlinkDdlParser.parse(s) should matchPattern { case Invalid(NonEmptyList(ParseError(_), _)) => } - } - } - - test("return multiple errors for multiple unallowed statements") { - val sqlStatements = FlinkDdlParserTest.unallowedSqlStatements.mkString(";\n") - FlinkDdlParser.parse(sqlStatements) should matchPattern { - case Invalid(NonEmptyList(UnallowedStatement(_), List(UnallowedStatement(_)))) => - } - } - - test("parses semantically invalid but parseable statements") { - val sqlStatements = FlinkDdlParserTest.semanticallyIllegalButParseableStatements.mkString(";\n") - FlinkDdlParser.parse(sqlStatements) should matchPattern { case Valid(_) => } - } - test("parse valid create table statements") { FlinkDdlParser.parse(FlinkSqlTableTestCases.unboundedKafkaTable) shouldBe Valid( - List(CreateTable(SqlString(FlinkSqlTableTestCases.unboundedKafkaTableFormatted))) + List(CreateTable(SqlString(FlinkSqlTableTestCases.unboundedKafkaTableFormatted), Connector("kafka"))) ) } @@ -51,13 +34,45 @@ class FlinkDdlParserTest extends AnyFunSuite with Matchers { s"${FlinkSqlTableTestCases.PostgresCatalog.postgresCatalog}" ) shouldBe Valid( List( - CreateTable(SqlString(FlinkSqlTableTestCases.unboundedKafkaTableFormatted)), - CreateTable(SqlString(FlinkSqlTableTestCases.unboundedDatagenTableFormatted)), + CreateTable(SqlString(FlinkSqlTableTestCases.unboundedKafkaTableFormatted), Connector("kafka")), + CreateTable(SqlString(FlinkSqlTableTestCases.unboundedDatagenTableFormatted), Connector("datagen")), FlinkSqlTableTestCases.PostgresCatalog.postgresCatalogParsed ) ) } + test("parses semantically invalid but parseable statements") { + val sqlStatements = FlinkDdlParserTest.semanticallyIllegalButParseableStatements.mkString(";\n") + FlinkDdlParser.parse(sqlStatements) should matchPattern { case Valid(_) => } + } + + test("return error for syntactically invalid statements") { + syntacticallyInvalidSqlStatements.foreach { s => + FlinkDdlParser.parse(s) should matchPattern { case Invalid(NonEmptyList(ParseError(_), Nil)) => } + } + } + + test("return multiple errors for multiple unallowed statements") { + val sqlStatements = FlinkDdlParserTest.unallowedSqlStatements.mkString(";\n") + FlinkDdlParser.parse(sqlStatements) should matchPattern { + case Invalid(NonEmptyList(UnallowedStatement(_), List(UnallowedStatement(_)))) => + } + } + + test("return error for missing connector option") { + val sqlStatement = "CREATE TABLE testTable (str STRING) WITH ()" + FlinkDdlParser.parse(sqlStatement) should matchPattern { + case Invalid(NonEmptyList(MissingConnectorOption(_), Nil)) => + } + } + + test("return error for missing catalog type") { + val sqlStatement = "CREATE CATALOG testCatalog WITH ()" + FlinkDdlParser.parse(sqlStatement) should matchPattern { + case Invalid(NonEmptyList(MissingCatalogTypeOption(_, _), Nil)) => + } + } + } object FlinkDdlParserTest { @@ -86,9 +101,6 @@ object 
FlinkDdlParserTest { |) WITH ( | 'connector' = 'datagen' |)""".stripMargin, - s"""|CREATE TABLE testTableWithoutOptions ( - | col STRING - |)""".stripMargin, s"""|CREATE TABLE testTableWithEmptyConnector ( | col STRING |) WITH ( diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala index 3979461dbda..44fed7c3fdd 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala @@ -1,17 +1,26 @@ package pl.touk.nussknacker.engine.flink.table.definition -import cats.implicits.catsSyntaxValidatedId +import cats.implicits.{catsSyntaxValidatedId, toTraverseOps} import org.apache.flink.configuration.Configuration import org.apache.flink.table.api.DataTypes import org.apache.flink.table.catalog._ +import org.scalatest.Inside.inside import org.scalatest.LoneElement import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import org.scalatest.prop.TableDrivenPropertyChecks -import pl.touk.nussknacker.engine.flink.table.FlinkSqlTableTestCases -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.EmptyDataDefinitionConfiguration -import pl.touk.nussknacker.engine.flink.table.definition.TablesDefinitionDiscoveryTest.invalidSqlStatements +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.EmptyDataDefinitionConfiguration +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ + ConnectorDiscoveryProblem, + TableEnvironmentRuntimeValidationError +} +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionRegistrationError.{ + CatalogRegistrationError, + SqlStatementExecutionError +} import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ +import pl.touk.nussknacker.engine.flink.table.utils.ModelClassLoaderSimulationSuite import pl.touk.nussknacker.test.{PatientScalaFutures, ValidatedValuesDetailedMessage} import scala.jdk.CollectionConverters._ @@ -22,18 +31,36 @@ class TablesDefinitionDiscoveryTest with LoneElement with ValidatedValuesDetailedMessage with TableDrivenPropertyChecks - with PatientScalaFutures { + with PatientScalaFutures + with ModelClassLoaderSimulationSuite { - test("return error for empty flink data definition") { - FlinkDataDefinition.apply(None, None) shouldBe EmptyDataDefinitionConfiguration.invalidNel + private val minicluster = FlinkMiniClusterFactory.createUnitTestsMiniClusterWithServices() + + private def discoverTables(sql: String) = { + val parsedSql = FlinkDdlParser.parseUnsafe(sql) + val dataDefinition = FlinkDataDefinition.applyUnsafe(parsedSql, None) + TablesDefinitionDiscovery + .prepareDiscovery(dataDefinition, minicluster, simulatedModelClassloader) + .andThen(_.listTables.sequence) } - test("extracts configuration from valid sql statement") { - val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(Some(FlinkSqlTableTestCases.allColumnTypesTable), None) - val discovery = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).validValue - val tablesDefinitions = 
discovery.listTables - val tableDefinition = tablesDefinitions.loneElement - val sourceRowType = tableDefinition.sourceRowDataType.toLogicalRowTypeUnsafe + test("extracts table definition with correct source and sink data type") { + val sql = + s"""|CREATE TABLE testTable + |( + | someString STRING, + | someVarChar VARCHAR(150), + | someInt INT, + | someIntComputed AS someInt * 2, + | `file.name` STRING NOT NULL METADATA + |) WITH ( + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'csv' + |);""".stripMargin + val tableDefinition = discoverTables(sql).validValue.loneElement + + val sourceRowType = tableDefinition.sourceRowDataType.toLogicalRowTypeUnsafe sourceRowType.getFieldNames.asScala shouldBe List( "someString", "someVarChar", @@ -55,22 +82,12 @@ class TablesDefinitionDiscoveryTest ) } - test("returns errors for statements that cannot be executed") { - invalidSqlStatements.foreach { invalidStatement => - val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(Some(invalidStatement), None) - val sqlStatementExecutionErrors = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).invalidValue - - sqlStatementExecutionErrors.size shouldBe 1 - } - } - test("use catalog configuration in data definition") { val catalogConfiguration = Configuration.fromMap(Map("type" -> StubbedCatalogFactory.catalogName).asJava) - val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(None, Some(catalogConfiguration)) - - val discovery = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).validValue - - val tableDefinition = discovery.listTables.loneElement + val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(List.empty, Some(catalogConfiguration)) + val discovery = + TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition, minicluster, simulatedModelClassloader).validValue + val tableDefinition = discovery.listTables.sequence.validValue.loneElement tableDefinition.tableId.toString shouldBe s"`_nu_catalog`." + s"`${StubbedCatalogFactory.sampleBoundedTablePath.getDatabaseName}`." + @@ -80,28 +97,199 @@ class TablesDefinitionDiscoveryTest ) } -} + test("returns no error for persistable metadata column table") { + val sql = + s"""|CREATE TABLE testTable ( + | `file.name` STRING NOT NULL METADATA + |) WITH ( + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'csv' + |);""".stripMargin + discoverTables(sql) shouldBe Symbol("valid") + } + + test("return error for empty flink data definition") { + FlinkDataDefinition.apply(List.empty, None) shouldBe EmptyDataDefinitionConfiguration.invalidNel + } + + test("returns error for table under non-default database") { + val sql = + """|CREATE TABLE somedb.testTable + |( + | someString STRING + |) WITH ( + | 'connector' = 'datagen' + |);""".stripMargin + val error = discoverTables(sql).invalidValue.toList.loneElement + + inside(error) { case e: SqlStatementExecutionError => + e.getMessage should include("Cause: Could not execute CreateTable in path `default_catalog`.`somedb`.`testTable`") + } + } + + test("should return error if cannot connect to catalog at discovery preparation") { + val sql = + """|CREATE CATALOG my_catalog WITH ( + | 'type' = 'jdbc', + | 'default-database' = 'default-db', + | 'username' = 'username', + | 'password' = 'password', + | 'base-url' = 'jdbc:postgresql://localhost:5432' + |)""".stripMargin + val error = discoverTables(sql).invalidValue.toList.loneElement + inside(error) { case e: CatalogRegistrationError => + e.getMessage shouldBe + """Could not create catalog. 
+ |Cause: Failed connecting to jdbc:postgresql://localhost:5432/default-db via JDBC.""".stripMargin + } + + } -object TablesDefinitionDiscoveryTest { - - private val invalidSqlStatements: List[String] = List( - """|CREATE TABLE testTable - |( - | someString STRING - |) - |;""".stripMargin, // no WITH clause - """|CREATE TABLE testTable - |( - | someString STRING - |) WITH ( - | 'connector' = '' - |);""".stripMargin, // empty string connector - does not reach the dedicated error because fails earlier - """|CREATE TABLE somedb.testTable - |( - | someString STRING - |) WITH ( - | 'connector' = 'datagen' - |);""".stripMargin, // trying to create a table under non-existing database - ) + test("should return error for table with connector not on classpath") { + val sql = + s"""|CREATE TABLE `test_table` ( + | `someString` STRING + |) WITH ( + | 'connector' = 'not-on-classpath-connector' + |)""".stripMargin + val error = discoverTables(sql).invalidValue.toList.loneElement + inside(error) { case e: ConnectorDiscoveryProblem => + e.getMessage shouldBe "Could not find matching connector: [not-on-classpath-connector]" + } + } + + test("should return error for table with format not on classpath") { + val sql = + s"""|CREATE TABLE `test_table` ( + | `someString` STRING + |) WITH ( + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'not-on-classpath-format' + |)""".stripMargin + val errors = discoverTables(sql).invalidValue.toList + inside(errors) { case err1 :: err2 :: Nil => + err1.getMessage shouldBe "Could not find any format factory for identifier 'not-on-classpath-format' in the classpath." + err2.getMessage shouldBe "Could not find any format factory for identifier 'not-on-classpath-format' in the classpath." + } + } + + test("should return error for source only table with redundant options") { + val sql = + s"""|CREATE TABLE `datagen_table` ( + | `someString` STRING + |) WITH ( + | 'connector' = 'datagen', + | 'redundant' = '123' + |)""".stripMargin + val error = discoverTables(sql).invalidValue.toList.loneElement + inside(error) { case e: TableEnvironmentRuntimeValidationError => + e.getMessage shouldBe + """|Unsupported options found for 'datagen'. + | + |Unsupported options: + | + |redundant + | + |Supported options: + | + |connector + |fields.someString.kind + |fields.someString.length + |fields.someString.null-rate + |fields.someString.var-len + |number-of-rows + |rows-per-second + |scan.parallelism""".stripMargin + } + } + + test("should return error for sink only table with redundant options") { + val sql = + s"""|CREATE TABLE `datagen_table` ( + | `someString` STRING + |) WITH ( + | 'connector' = 'blackhole', + | 'redundant' = '123' + |)""".stripMargin + val error = discoverTables(sql).invalidValue.toList.loneElement + inside(error) { case e: TableEnvironmentRuntimeValidationError => + e.getMessage shouldBe + """|Unsupported options found for 'blackhole'. 
+ | + |Unsupported options: + | + |redundant + | + |Supported options: + | + |connector + |property-version + |scan.watermark.alignment.group + |scan.watermark.alignment.max-drift + |scan.watermark.alignment.update-interval + |scan.watermark.emit.strategy + |scan.watermark.idle-timeout""".stripMargin + } + } + + test("should return duplicated error for source and sink table with redundant options") { + val sql = + s"""|CREATE TABLE testTable ( + | `file.name` STRING + |) WITH ( + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'csv', + | 'redundant' = '123' + |);""".stripMargin + val error = discoverTables(sql).invalidValue.toList + inside(error) { case List(e1: TableEnvironmentRuntimeValidationError, e2) => + val expectedMessage = + """|Unsupported options found for 'filesystem'. + | + |Unsupported options: + | + |redundant + | + |Supported options: + | + |auto-compaction + |compaction.file-size + |connector + |format + |partition.default-name + |partition.time-extractor.class + |partition.time-extractor.kind + |partition.time-extractor.timestamp-formatter + |partition.time-extractor.timestamp-pattern + |path + |property-version + |scan.watermark.alignment.group + |scan.watermark.alignment.max-drift + |scan.watermark.alignment.update-interval + |scan.watermark.emit.strategy + |scan.watermark.idle-timeout + |sink.parallelism + |sink.partition-commit.delay + |sink.partition-commit.policy.class + |sink.partition-commit.policy.class.parameters + |sink.partition-commit.policy.kind + |sink.partition-commit.success-file.name + |sink.partition-commit.trigger + |sink.partition-commit.watermark-time-zone + |sink.rolling-policy.check-interval + |sink.rolling-policy.file-size + |sink.rolling-policy.inactivity-interval + |sink.rolling-policy.rollover-interval + |sink.shuffle-by-partition.enable + |source.monitor-interval + |source.path.regex-pattern + |source.report-statistics""".stripMargin + e1.getMessage shouldBe expectedMessage + e2.getMessage shouldBe expectedMessage + } + } } diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala new file mode 100644 index 00000000000..9a5d4eecf37 --- /dev/null +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala @@ -0,0 +1,161 @@ +package pl.touk.nussknacker.engine.flink.table.definition + +import org.scalatest.{Inside, LoneElement} +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError.ParseError +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ + CatalogDiscoveryProblem, + CatalogNonTransientValidationError +} +import pl.touk.nussknacker.engine.flink.table.utils.ModelClassLoaderSimulationSuite +import pl.touk.nussknacker.test.ValidatedValuesDetailedMessage.convertValidatedToValuable + +class TablesDefinitionValidationTest + extends AnyFunSuite + with Matchers + with LoneElement + with ModelClassLoaderSimulationSuite + with Inside { + + private val minicluster = FlinkMiniClusterFactory.createUnitTestsMiniClusterWithServices() + private def validate(sql: String) = + 
TablesDefinitionValidation.validateWithoutExternalConnections(sql, minicluster, simulatedModelClassloader) + + test("should not return external calls reliant validation errors") { + val sql = + """|CREATE CATALOG test_catalog WITH ( + | 'type' = 'jdbc', + | 'default-database' = 'default-db', + | 'username' = 'username', + | 'password' = 'password', + | 'base-url' = 'jdbc:postgresql://localhost:5432' + |)""".stripMargin + validate(sql) shouldBe Symbol("valid") + } + + test("should return error if catalog cannot be found on classpath") { + val sql = + """|CREATE CATALOG test_catalog WITH ( + | 'type' = 'non-existant-catalog' + |)""".stripMargin + val error = validate(sql).invalidValue.toList.loneElement + inside(error) { case e: CatalogDiscoveryProblem => + e.getMessage should startWith("Could not find matching catalog: [non-existant-catalog]") + } + } + + test("should return error for missing required catalog option") { + val sql = + """|CREATE CATALOG test_catalog WITH ( + | 'type' = 'jdbc', + | 'username' = 'username', + | 'password' = 'password', + | 'base-url' = 'jdbc:postgresql://localhost:5432' + |)""".stripMargin + val error = validate(sql).invalidValue.toList.loneElement + inside(error) { case e: CatalogNonTransientValidationError => + e.getMessage shouldBe + """|One or more required options are missing. + | + |Missing required options are: + | + |default-database""".stripMargin + } + } + + test("return error for empty flink data definition") { + val error = validate("").invalidValue.toList.loneElement + inside(error) { case e: ParseError => + e.getMessage should startWith("""Could not parse SQL statements: Encountered "" at line 0, column 0.""") + } + } + + test("should return Flink SQL parsing error") { + val sql = + s"""|CREATE TABLE `test_table` ( + | `someString` STRING + |) WITH ( + | 'connector' = 'not-on-classpath-connector', + |)""".stripMargin + val error = validate(sql).invalidValue.toList.loneElement + inside(error) { case e: FlinkDataDefinitionCreationError => + e.getMessage should startWith("""Could not parse SQL statements: Encountered ")" at line 5, column 1""") + } + } + + test("returns error from data definition registration (statement execution)") { + val sql = + """|CREATE TABLE somedb.testTable + |( + | someString STRING + |) WITH ( + | 'connector' = 'datagen' + |);""".stripMargin + val error = validate(sql).invalidValue.toList.loneElement + + inside(error) { case e: FlinkDataDefinitionRegistrationError => + e.getMessage should include("Cause: Could not execute CreateTable in path `default_catalog`.`somedb`.`testTable`") + } + } + + test("should return table environment runtime validation error") { + val sql = + s"""|CREATE TABLE testTable ( + | `file.name` STRING + |) WITH ( + | 'connector' = 'filesystem', + | 'path' = '.', + | 'format' = 'csv', + | 'redundant' = '123' + |);""".stripMargin + val error = validate(sql).invalidValue.toList + inside(error) { case List(e1: FlinkDataDefinitionDiscoveryError, e2) => + val expectedMessage = + """|Unsupported options found for 'filesystem'. 
+ | + |Unsupported options: + | + |redundant + | + |Supported options: + | + |auto-compaction + |compaction.file-size + |connector + |format + |partition.default-name + |partition.time-extractor.class + |partition.time-extractor.kind + |partition.time-extractor.timestamp-formatter + |partition.time-extractor.timestamp-pattern + |path + |property-version + |scan.watermark.alignment.group + |scan.watermark.alignment.max-drift + |scan.watermark.alignment.update-interval + |scan.watermark.emit.strategy + |scan.watermark.idle-timeout + |sink.parallelism + |sink.partition-commit.delay + |sink.partition-commit.policy.class + |sink.partition-commit.policy.class.parameters + |sink.partition-commit.policy.kind + |sink.partition-commit.success-file.name + |sink.partition-commit.trigger + |sink.partition-commit.watermark-time-zone + |sink.rolling-policy.check-interval + |sink.rolling-policy.file-size + |sink.rolling-policy.inactivity-interval + |sink.rolling-policy.rollover-interval + |sink.shuffle-by-partition.enable + |source.monitor-interval + |source.path.regex-pattern + |source.report-statistics""".stripMargin + e1.getMessage shouldBe expectedMessage + e2.getMessage shouldBe expectedMessage + } + } + +} diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableFileSinkTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableFileSinkTest.scala index 04ade709a96..e5787e39840 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableFileSinkTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableFileSinkTest.scala @@ -16,7 +16,10 @@ import pl.touk.nussknacker.engine.build.ScenarioBuilder import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.FlinkTableDataSourceComponentProvider import pl.touk.nussknacker.engine.flink.table.SpelValues._ -import pl.touk.nussknacker.engine.flink.table.utils.NotConvertibleResultOfAlignmentException +import pl.touk.nussknacker.engine.flink.table.utils.{ + ModelClassLoaderSimulationSuite, + NotConvertibleResultOfAlignmentException +} import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner import pl.touk.nussknacker.engine.graph.expression.Expression import pl.touk.nussknacker.engine.process.FlinkJobConfig.ExecutionMode @@ -34,28 +37,29 @@ class TableFileSinkTest with PatientScalaFutures with LoneElement with ValidatedValuesDetailedMessage - with BeforeAndAfterAll { + with BeforeAndAfterAll + with ModelClassLoaderSimulationSuite { import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner._ import pl.touk.nussknacker.engine.spel.SpelExtension._ - private val basicPingPongInputTableName = "basic-ping-pong-input" - private val basicPingPongOutputTableName = "basic-ping-pong-output" - private val basicExpressionOutputTableName = "basic-expression-output" + private val basicPingPongInputTableName = "basic_ping_pong_input" + private val basicPingPongOutputTableName = "basic_ping_pong_output" + private val basicExpressionOutputTableName = "basic_expression_output" - private val advancedPingPongInputTableName = "advanced-ping-pong-input" - private val advancedPingPongOutputTableName = "advanced-ping-pong-output" - private val advancedExpressionOutputTableName = "advanced-expression-output" + private val advancedPingPongInputTableName = "advanced_ping_pong_input" 
+ private val advancedPingPongOutputTableName = "advanced_ping_pong_output" + private val advancedExpressionOutputTableName = "advanced_expression_output" - private val datetimePingPongInputTableName = "datetime-ping-pong-input" - private val datetimePingPongOutputTableName = "datetime-ping-pong-output" - private val datetimeExpressionOutputTableName = "datetime-expression-output" + private val datetimePingPongInputTableName = "datetime_ping_pong_input" + private val datetimePingPongOutputTableName = "datetime_ping_pong_output" + private val datetimeExpressionOutputTableName = "datetime_expression_output" - private val virtualColumnInputTableName = "virtual-column-input" - private val virtualColumnOutputTableName = "virtual-column-output" + private val virtualColumnInputTableName = "virtual_column_input" + private val virtualColumnOutputTableName = "virtual_column_output" - private val oneColumnOutputTableName = "one-column-output" - private val genericsOutputTableName = "generics-output" + private val oneColumnOutputTableName = "one_column_output" + private val genericsOutputTableName = "generics_output" private lazy val basicPingPongInputDirectory = Files.createTempDirectory(s"nusssknacker-${getClass.getSimpleName}-$basicPingPongInputTableName") diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkParametersTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkParametersTest.scala index 60742745a7f..0f374577bfe 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkParametersTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkParametersTest.scala @@ -17,6 +17,7 @@ import pl.touk.nussknacker.engine.api.process.ProcessObjectDependencies import pl.touk.nussknacker.engine.build.ScenarioBuilder import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.FlinkTableDataSourceComponentProvider +import pl.touk.nussknacker.engine.flink.table.utils.ModelClassLoaderSimulationSuite import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner import pl.touk.nussknacker.engine.process.FlinkJobConfig.ExecutionMode import pl.touk.nussknacker.engine.util.test.TestScenarioRunner @@ -26,14 +27,19 @@ import java.nio.charset.StandardCharsets import java.nio.file.{Files, Path} import scala.jdk.CollectionConverters._ -class TableSinkParametersTest extends AnyFunSuite with Matchers with PatientScalaFutures with BeforeAndAfterAll { +class TableSinkParametersTest + extends AnyFunSuite + with Matchers + with PatientScalaFutures + with BeforeAndAfterAll + with ModelClassLoaderSimulationSuite { import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner._ import pl.touk.nussknacker.engine.spel.SpelExtension._ private val inputTableName = "input" private val outputTableName = "output" - private val virtualColumnOutputTableName = "virtual-column-output" + private val virtualColumnOutputTableName = "virtual_column_output" private val outputTableNameWithInvalidCols = "output_invalid_column_names" private lazy val outputDirectory = diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala 
b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala index ced5dce6564..2c35871811a 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala @@ -1,11 +1,19 @@ package pl.touk.nussknacker.engine.flink.table.source +import cats.implicits.toTraverseOps +import org.apache.flink.configuration.Configuration import org.scalatest.LoneElement import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers -import pl.touk.nussknacker.engine.flink.table.FlinkSqlTableTestCases +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory +import pl.touk.nussknacker.engine.flink.table.FlinkSqlTableTestCases.allColumnTypesTable import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode -import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} +import pl.touk.nussknacker.engine.flink.table.definition.{ + FlinkDataDefinition, + FlinkDdlParser, + TablesDefinitionDiscovery +} +import pl.touk.nussknacker.engine.flink.table.utils.ModelClassLoaderSimulationSuite import pl.touk.nussknacker.test.ValidatedValuesDetailedMessage import scala.jdk.CollectionConverters._ @@ -14,18 +22,28 @@ class TableSourceDataGenerationTest extends AnyFunSuite with Matchers with LoneElement - with ValidatedValuesDetailedMessage { - - private val flinkDataDefinition = - FlinkDataDefinition.applyUnsafe(Some(FlinkSqlTableTestCases.allColumnTypesTable), None) - - private val discovery = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).validValue + with ValidatedValuesDetailedMessage + with ModelClassLoaderSimulationSuite { - private val tableSource = new TableSource( - tableDefinition = discovery.listTables.loneElement, - flinkDataDefinition = flinkDataDefinition, - testDataGenerationMode = TestDataGenerationMode.Random - ) + private val tableSource = { + val flinkDataDefinition = + FlinkDataDefinition.applyUnsafe(FlinkDdlParser.parseUnsafe(allColumnTypesTable), None) + val minicluster = + FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, new Configuration()) + val discovery = TablesDefinitionDiscovery + .prepareDiscovery( + flinkDataDefinition, + FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, new Configuration()), + simulatedModelClassloader + ) + .validValue + new TableSource( + tableDefinition = discovery.listTables.sequence.validValue.loneElement, + flinkDataDefinition = flinkDataDefinition, + testDataGenerationMode = TestDataGenerationMode.Random, + minicluster + ) + } /* Note: Testing features like data generation or scenario testing (like ad hoc test) requires a full e2e test where diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceTest.scala index 56377a7680c..8926a051f69 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceTest.scala @@ -13,6 +13,7 @@ 
import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.FlinkTableDataSourceComponentProvider import pl.touk.nussknacker.engine.flink.table.definition.StubbedCatalogFactory import pl.touk.nussknacker.engine.flink.table.source.TableSource.SQL_EXPRESSION_PARAMETER_NAME +import pl.touk.nussknacker.engine.flink.table.utils.ModelClassLoaderSimulationSuite import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner import pl.touk.nussknacker.engine.process.FlinkJobConfig.ExecutionMode import pl.touk.nussknacker.engine.util.test.TestScenarioRunner @@ -24,7 +25,8 @@ class TableSourceTest with PatientScalaFutures with LoneElement with ValidatedValuesDetailedMessage - with BeforeAndAfterAll { + with BeforeAndAfterAll + with ModelClassLoaderSimulationSuite { import pl.touk.nussknacker.engine.flink.util.test.FlinkTestScenarioRunner._ import pl.touk.nussknacker.engine.spel.SpelExtension._ diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala new file mode 100644 index 00000000000..60926785c37 --- /dev/null +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala @@ -0,0 +1,22 @@ +package pl.touk.nussknacker.engine.flink.table.utils + +import org.scalatest.{BeforeAndAfterAll, Suite} +import pl.touk.nussknacker.engine.util.loader.ModelClassLoader + +// This is only for purpose of using an empty URLClassLoader as contextClassLoader in tests ran from Intellij Idea +trait ModelClassLoaderSimulationSuite extends BeforeAndAfterAll { this: Suite => + + private val originalContextClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader + protected val simulatedModelClassloader: ModelClassLoader = ModelClassLoader.flinkWorkAroundEmptyClassloader + + override protected def beforeAll(): Unit = { + Thread.currentThread().setContextClassLoader(simulatedModelClassloader) + super.beforeAll() + } + + override protected def afterAll(): Unit = { + Thread.currentThread().setContextClassLoader(originalContextClassLoader) + super.afterAll() + } + +} diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala index 6f83c4d1838..5a032106cb2 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala @@ -3,14 +3,17 @@ package pl.touk.nussknacker.engine.flink.table import com.typesafe.config.Config import com.typesafe.scalalogging.LazyLogging import org.apache.flink.configuration.Configuration +import org.apache.flink.util.FlinkUserCodeClassLoaders.SafetyNetWrapperClassLoader import pl.touk.nussknacker.engine.api.component.{ComponentDefinition, ComponentProvider, NussknackerVersion} import pl.touk.nussknacker.engine.api.process.ProcessObjectDependencies +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import 
pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode.TestDataGenerationMode -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition +import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, FlinkDdlParser} import pl.touk.nussknacker.engine.flink.table.sink.TableSinkFactory import pl.touk.nussknacker.engine.flink.table.source.TableSourceFactory +import java.net.URLClassLoader import scala.jdk.CollectionConverters._ class FlinkTableDataSourceComponentProvider extends ComponentProvider with LazyLogging { @@ -25,19 +28,25 @@ class FlinkTableDataSourceComponentProvider extends ComponentProvider with LazyL val testDataGenerationModeOrDefault = parsedConfig.testDataGenerationMode.getOrElse(TestDataGenerationMode.default) val sqlStatements = parsedConfig.tableDefinition val catalogConfigurationOpt = parsedConfig.catalogConfiguration.map(_.asJava).map(Configuration.fromMap) - val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(sqlStatements, catalogConfigurationOpt) + val parsedSqlStatements = sqlStatements.map(FlinkDdlParser.parseUnsafe).toList.flatten + val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(parsedSqlStatements, catalogConfigurationOpt) + + val modelClassLoader = castModelClassloader() + val miniCluster = FlinkMiniClusterFactory.createMiniClusterWithServices(modelClassLoader, new Configuration()) List( ComponentDefinition( tableComponentName, new TableSourceFactory( flinkDataDefinition, - testDataGenerationModeOrDefault + testDataGenerationModeOrDefault, + miniCluster, + modelClassLoader ) ), ComponentDefinition( tableComponentName, - new TableSinkFactory(flinkDataDefinition) + new TableSinkFactory(flinkDataDefinition, miniCluster, modelClassLoader) ) ) } @@ -46,6 +55,27 @@ class FlinkTableDataSourceComponentProvider extends ComponentProvider with LazyL override def isAutoLoaded: Boolean = false + // TODO: Pass ModelClassLoader through API + private def castModelClassloader() = { + val modelClassLoader = Thread.currentThread().getContextClassLoader match { + // When executing tests in Designer, a SafetyNetWrapperClassLoader is used with the ModelClassloader as parent + case wrapperClassLoader: SafetyNetWrapperClassLoader => + wrapperClassLoader.getParent match { + case cl: URLClassLoader => cl + case _ => + throw new IllegalStateException( + "FlinkTableDataSourceComponentProvider should be loaded with ModelClassLoader as context ClassLoader" + ) + } + case cl: URLClassLoader => cl + case _ => + throw new IllegalStateException( + "FlinkTableDataSourceComponentProvider should be loaded with ModelClassLoader as context ClassLoader" + ) + } + modelClassLoader + } + } final case class TableComponentProviderConfig( diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinition.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinition.scala index 1cedde3050d..7dc574eee69 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinition.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinition.scala @@ -1,12 +1,15 @@ package pl.touk.nussknacker.engine.flink.table.definition -import cats.data.{Validated, ValidatedNel} +import 
cats.data.{NonEmptyList, Validated, ValidatedNel} import cats.implicits._ import org.apache.flink.configuration.Configuration import org.apache.flink.table.api.TableEnvironment import org.apache.flink.table.catalog.CatalogDescriptor -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition._ import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement._ +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.EmptyDataDefinitionConfiguration +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError.MissingCatalogTypeOption +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionRegistrationError._ import scala.jdk.CollectionConverters._ import scala.util.Try @@ -15,14 +18,14 @@ class FlinkDataDefinition( sqlDdl: List[FlinkSqlDdlStatement] ) extends Serializable { - def registerIn(tableEnvironment: TableEnvironment): ValidatedNel[DataDefinitionRegistrationError, Unit] = { + def registerIn(tableEnvironment: TableEnvironment): ValidatedNel[FlinkDataDefinitionRegistrationError, Unit] = { val registrationResults = sqlDdl.map { - case CreateTable(SqlString(sql)) => + case CreateTable(SqlString(sql), _) => Validated .fromTry(Try(tableEnvironment.executeSql(sql))) - .leftMap(SqlStatementExecutionError(sql, _): DataDefinitionRegistrationError) + .leftMap(SqlStatementExecutionError(sql, _): FlinkDataDefinitionRegistrationError) .toValidatedNel - case CreateCatalog(CatalogName(name), options) => + case CreateCatalog(CatalogName(name), options, _) => val catalogConf = Configuration.fromMap( options .map { case SqlOption(key, value) => @@ -33,7 +36,7 @@ class FlinkDataDefinition( ) Validated .fromTry(Try { tableEnvironment.createCatalog(name, CatalogDescriptor.of(name, catalogConf)) }) - .leftMap(CatalogRegistrationError(catalogConf, _): DataDefinitionRegistrationError) + .leftMap(CatalogRegistrationError(catalogConf, _): FlinkDataDefinitionRegistrationError) .toValidatedNel } registrationResults.sequence.void @@ -43,33 +46,25 @@ class FlinkDataDefinition( object FlinkDataDefinition { - trait FlinkDataDefinitionCreationError extends IllegalArgumentException - - case object EmptyDataDefinitionConfiguration extends FlinkDataDefinitionCreationError { - override def getMessage: String = - "Empty data definition configuration. 
At least one of either tableDefinitionFilePath or catalogConfiguration should be configured" - } - def apply( - sql: Option[String], + sql: List[FlinkSqlDdlStatement], additionalCatalogConfiguration: Option[Configuration] ): ValidatedNel[FlinkDataDefinitionCreationError, FlinkDataDefinition] = { if (sql.isEmpty && additionalCatalogConfiguration.isEmpty) { EmptyDataDefinitionConfiguration.invalidNel } else { - val parsedDdls = sql.map(FlinkDdlParser.parse).getOrElse(List.empty.validNel) val additionalCatalog = additionalCatalogConfiguration .map(CreateCatalog.buildAdditionalCatalogFromConfig) - .toList - .validNel - - (parsedDdls, additionalCatalog).mapN { (ddls, catalogs) => - new FlinkDataDefinition(ddls ++ catalogs) - } + .map(_.map(List(_))) + .getOrElse(List.empty[CreateCatalog].validNel) + additionalCatalog.map(catalogs => new FlinkDataDefinition(sql ++ catalogs)) } } - def applyUnsafe(sql: Option[String], additionalCatalogConfiguration: Option[Configuration]): FlinkDataDefinition = + def applyUnsafe( + sql: List[FlinkSqlDdlStatement], + additionalCatalogConfiguration: Option[Configuration] + ): FlinkDataDefinition = apply(sql, additionalCatalogConfiguration).fold(errs => throw errs.head, identity) sealed trait FlinkSqlDdlStatement @@ -78,14 +73,18 @@ object FlinkDataDefinition { final case class SqlOption(key: String, value: String) final case class SqlString(value: String) final case class CatalogName(value: String) + final case class Connector(value: String) + final case class CatalogType(value: String) final case class CreateTable( - sql: SqlString + sql: SqlString, + connector: Connector ) extends FlinkSqlDdlStatement final case class CreateCatalog( name: CatalogName, - options: List[SqlOption] + options: List[SqlOption], + catalogType: CatalogType ) extends FlinkSqlDdlStatement object CreateCatalog { @@ -93,11 +92,19 @@ object FlinkDataDefinition { // to split object paths private val internalCatalogName = "_nu_catalog" - def buildAdditionalCatalogFromConfig(configuration: Configuration): CreateCatalog = { - val conf = configuration.toMap.asScala.map { case (k, v) => + def buildAdditionalCatalogFromConfig( + configuration: Configuration + ): ValidatedNel[MissingCatalogTypeOption, CreateCatalog] = { + val options = configuration.toMap.asScala.map { case (k, v) => SqlOption(k, v) }.toList - CreateCatalog(CatalogName(internalCatalogName), conf) + options + .find(_.key == "type") + .toValidNel(MissingCatalogTypeOption(internalCatalogName, options)) + .map(_.value) + .map { catalogType => + CreateCatalog(CatalogName(internalCatalogName), options, CatalogType(catalogType)) + } } } @@ -105,14 +112,14 @@ object FlinkDataDefinition { } implicit class DataDefinitionRegistrationResultExtension[T]( - result: ValidatedNel[DataDefinitionRegistrationError, T] + result: ValidatedNel[FlinkDataDefinitionRegistrationError, T] ) { def orFail: T = { result.valueOr { errors => throw new IllegalStateException( errors.toList - .map(_.message) + .map(_.getMessage) .mkString("Errors occurred when data definition registration in TableEnvironment: ", ", ", "") ) } @@ -121,25 +128,3 @@ object FlinkDataDefinition { } } - -sealed trait DataDefinitionRegistrationError { - def message: String -} - -final case class SqlStatementExecutionError(statement: String, exception: Throwable) - extends DataDefinitionRegistrationError { - - override def message: String = - s"""Could not execute sql statement. The statement may be malformed. 
- |Sql statement: $statement - |Caused by: $exception""".stripMargin - -} - -final case class CatalogRegistrationError(catalogConfiguration: Configuration, exception: Throwable) - extends DataDefinitionRegistrationError { - - override def message: String = - s"Could not created catalog with configuration: $catalogConfiguration. Caused by: $exception" - -} diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinitionErrors.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinitionErrors.scala new file mode 100644 index 00000000000..d0f702bc716 --- /dev/null +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDataDefinitionErrors.scala @@ -0,0 +1,115 @@ +package pl.touk.nussknacker.engine.flink.table.definition + +import org.apache.flink.configuration.Configuration +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement.{SqlOption, SqlString} + +sealed trait FlinkDataDefinitionError extends RuntimeException +sealed trait FlinkDataDefinitionCreationError extends FlinkDataDefinitionError + +object FlinkDataDefinitionCreationError { + + case object EmptyDataDefinitionConfiguration extends FlinkDataDefinitionCreationError { + override def getMessage: String = + "Empty data definition configuration. At least one of either tableDefinition or catalogConfiguration should be configured" + } + + sealed trait FlinkDdlParseError extends FlinkDataDefinitionCreationError + + object FlinkDdlParseError { + + final case class ParseError(exception: Throwable) extends FlinkDdlParseError { + override def getCause: Throwable = exception + override def getMessage: String = s"Could not parse SQL statements: ${exception.getMessage}" + } + + final case class UnallowedStatement(sql: SqlString) extends FlinkDdlParseError { + override def getMessage: String = + "Found invalid SQL statement. Only `CREATE TABLE` and `CREATE CATALOG` statements are allowed" + } + + final case class MissingConnectorOption(sql: SqlString) extends FlinkDdlParseError { + override def getMessage: String = + s"Connector not specified in `CREATE TABLE` statement [${sql.value}]" + } + + final case class MissingCatalogTypeOption(catalogName: String, options: List[SqlOption]) + extends FlinkDdlParseError { + override def getMessage: String = + s"Catalog type not specified in catalog: [$catalogName] definition. Given options were: ${options}" + } + + } + +} + +sealed trait FlinkDataDefinitionRegistrationError extends FlinkDataDefinitionError + +object FlinkDataDefinitionRegistrationError { + + final case class CatalogRegistrationError(catalogConfiguration: Configuration, exception: Throwable) + extends FlinkDataDefinitionRegistrationError { + + override def getMessage: String = + s"""Could not create catalog. + |Cause: ${getCause.getMessage}""".stripMargin + + override def getCause: Throwable = exception + } + + final case class SqlStatementExecutionError(statement: String, cause: Throwable) + extends FlinkDataDefinitionRegistrationError { + + override def getMessage: String = + s"""Could not execute sql statement. The statement may be malformed. 
+ |Sql statement: [$statement] + |Cause: ${getCause.getMessage}""".stripMargin + + override def getCause: Throwable = cause + } + +} + +sealed trait FlinkDataDefinitionDiscoveryError extends FlinkDataDefinitionError + +object FlinkDataDefinitionDiscoveryError { + + final case class ConnectorDiscoveryProblem(connector: String, exception: Throwable) + extends FlinkDataDefinitionDiscoveryError { + override def getMessage: String = s"Could not find matching connector: [$connector]" + override def getCause: Throwable = exception + } + + final case class CatalogDiscoveryProblem(catalogType: String, exception: Throwable) + extends FlinkDataDefinitionDiscoveryError { + override def getMessage: String = s"Could not find matching catalog: [$catalogType]" + override def getCause: Throwable = exception + } + + final case class NoSinkOrSourceImplementationsFoundForConnector(connector: String) + extends FlinkDataDefinitionDiscoveryError { + override def getMessage: String = + s"Could not find Source or Sink factory for connector [${connector}]. The connector is invalid." + } + + final case class TableEnvironmentRuntimeValidationError(exception: Throwable) + extends FlinkDataDefinitionDiscoveryError { + + // Usually message of second exception in the stack trace is most descriptive + override def getMessage: String = Option(exception.getCause) + .flatMap(c => Option(c.getMessage)) + .getOrElse(exception.getMessage) + + override def getCause: Throwable = exception + } + + final case class CatalogNonTransientValidationError(exception: Throwable) extends FlinkDataDefinitionDiscoveryError { + + // Usually message of second exception in the stack trace is most descriptive + override def getMessage: String = Option(exception.getCause) + .flatMap(c => Option(c.getMessage)) + .getOrElse(exception.getMessage) + + override def getCause: Throwable = exception + } + +} diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParser.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParser.scala index 9e91a9a7782..e0d1e45f33d 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParser.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/FlinkDdlParser.scala @@ -8,12 +8,10 @@ import org.apache.calcite.sql.parser.{SqlParser => CalciteSqlParser} import org.apache.flink.sql.parser.ddl.{SqlCreateCatalog, SqlCreateTable, SqlCreateTableAs, SqlTableOption} import org.apache.flink.sql.parser.impl.FlinkSqlParserImpl import org.apache.flink.sql.parser.validate.FlinkSqlConformance -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.{ - FlinkDataDefinitionCreationError, - FlinkSqlDdlStatement -} +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement._ -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDdlParseError._ +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError._ import scala.jdk.CollectionConverters._ import scala.util.Try @@ -23,6 +21,7 @@ import scala.util.Try It does the following: - parses the string as a statement list, assuming a semicolon 
separator - returns errors for basic syntax errors like unescaped keywords, trailing commas, missing closing quotes + - returns errors for missing `connector` option for CREATE TABLE and `type` option for CREATE CATALOG statements - allows only 3 types of statements: - CREATE CATALOG https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/dev/table/sql/create/#create-catalog - CREATE TABLE https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/dev/table/sql/create/#create-table @@ -43,6 +42,9 @@ object FlinkDdlParser { def parse(sqlStatements: String): ValidatedNel[FlinkDdlParseError, List[FlinkSqlDdlStatement]] = parseSql(sqlStatements).andThen(extractDefinitions) + def parseUnsafe(sqlStatements: String): List[FlinkSqlDdlStatement] = + parse(sqlStatements).fold(e => throw e.head, identity) + private def parseSql(sql: String): ValidatedNel[FlinkDdlParseError, List[SqlNode]] = { val parser = CalciteSqlParser.create(sql, sqlParserConfig) Try(parser.parseStmtList()).toEither.left @@ -53,18 +55,36 @@ object FlinkDdlParser { private def extractDefinitions(sql: List[SqlNode]): ValidatedNel[FlinkDdlParseError, List[FlinkSqlDdlStatement]] = { sql.traverse { - case catalog: SqlCreateCatalog => - CreateCatalog( - name = CatalogName(catalog.catalogName()), - options = extractOptions(catalog.getPropertyList) - ).validNel + case catalog: SqlCreateCatalog => { + val options = extractOptions(catalog.getPropertyList) + options + .find(_.key == "type") + .toValidNel(MissingCatalogTypeOption(catalog.catalogName(), options)) + .map(_.value) + .map { catalogType => + CreateCatalog( + name = CatalogName(catalog.catalogName()), + options = extractOptions(catalog.getPropertyList), + catalogType = CatalogType(catalogType) + ) + } + } // Create Table As (CTAS) performs an insert from another table. // Doc: https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sql/create/#as-select_statement // Since these definitions are meant only for data source declarations, inserts are not allowed. // To achieve similar functionality, users should run a separate insert statement. case ctas: SqlCreateTableAs => UnallowedStatement(SqlString(ctas.toString)).invalidNel - case table: SqlCreateTable => CreateTable(SqlString(table.toString)).validNel - case other => UnallowedStatement(SqlString(other.toString)).invalidNel + case table: SqlCreateTable => { + val sqlString = SqlString(table.toString) + extractOptions(table.getPropertyList) + .find(_.key == "connector") + .toValidNel(MissingConnectorOption(sqlString)) + .map(_.value) + .map { connector => + CreateTable(sqlString, Connector(connector)) + } + } + case other => UnallowedStatement(SqlString(other.toString)).invalidNel } } @@ -83,19 +103,3 @@ object FlinkDdlParser { .withIdentifierMaxLength(256); } - -sealed trait FlinkDdlParseError extends FlinkDataDefinitionCreationError - -object FlinkDdlParseError { - - final case class ParseError(cause: Throwable) extends FlinkDdlParseError { - override def getCause: Throwable = cause - override def getMessage: String = "Could not parse SQL statements" - } - - final case class UnallowedStatement(sql: SqlString) extends FlinkDdlParseError { - override def getMessage: String = - "Found invalid SQL statement. 
Only `CREATE TABLE` and `CREATE CATALOG` statements are allowed" - } - -} diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala index c8f9824c508..7bcf3c19c41 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala @@ -1,57 +1,160 @@ package pl.touk.nussknacker.engine.flink.table.definition -import cats.data.ValidatedNel +import cats.data.{NonEmptyList, ValidatedNel} +import cats.implicits.{catsSyntaxValidatedId, toFunctorOps} +import cats.syntax.either._ import com.typesafe.scalalogging.LazyLogging -import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment} -import org.apache.flink.table.catalog.ObjectIdentifier +import org.apache.flink.configuration.Configuration +import org.apache.flink.table.api.{EnvironmentSettings, Schema, Table, TableDescriptor} +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment +import org.apache.flink.table.catalog.{ObjectIdentifier, ObjectPath} +import org.apache.flink.table.catalog.Catalog +import org.apache.flink.table.factories.{ + DynamicTableFactory, + DynamicTableSinkFactory, + DynamicTableSourceFactory, + FactoryUtil +} +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableDefinition +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ + ConnectorDiscoveryProblem, + NoSinkOrSourceImplementationsFoundForConnector, + TableEnvironmentRuntimeValidationError +} +import pl.touk.nussknacker.engine.flink.table.utils.SchemaExtensions.ResolvedSchemaExtensions +import pl.touk.nussknacker.engine.util.ThreadUtils +import java.net.URLClassLoader import scala.jdk.OptionConverters.RichOptional import scala.util.Try -// TODO: Make this extractor more memory/cpu efficient and ensure closing of resources. 
For more details see -// https://github.com/TouK/nussknacker/pull/5627#discussion_r1512881038 -class TablesDefinitionDiscovery(tableEnv: TableEnvironment) extends LazyLogging { +class TablesDefinitionDiscovery(env: StreamTableEnvironment) extends LazyLogging { import scala.jdk.CollectionConverters._ - def listTables: List[TableDefinition] = { - for { - catalogName <- tableEnv.listCatalogs().toList - catalog <- tableEnv.getCatalog(catalogName).toScala.toList - databaseName <- catalog.listDatabases.asScala.toList - tableName <- tableEnv.listTables(catalogName, databaseName).toList - tableId = ObjectIdentifier.of(catalogName, databaseName, tableName) - } yield extractTableDefinition(tableId) + def listTables: List[ValidatedNel[FlinkDataDefinitionDiscoveryError, TableDefinition]] = for { + catalogName <- env.listCatalogs().toList + catalog <- env.getCatalog(catalogName).toScala.toList + databaseName <- catalog.listDatabases.asScala.toList + tableName <- env.listTables(catalogName, databaseName).toList + } yield validateAndExtractTable(catalog, catalogName, databaseName, tableName) + + private def validateAndExtractTable( + catalog: Catalog, + catalogName: String, + databaseName: String, + tableName: String + ): ValidatedNel[FlinkDataDefinitionDiscoveryError, TableDefinition] = { + val tableId = ObjectIdentifier.of(catalogName, databaseName, tableName) + val baseTable = catalog.getTable(new ObjectPath(databaseName, tableName)) + val table = extractTable(tableId) + val connectorOpt = baseTable.getOptions.asScala.get("connector") + + val validation = connectorOpt match { + case Some(connector) => validateTable(connector, table, tableName, env) + case None => ().validNel // No connector found - may be a catalog-managed table + } + + validation.map(_ => TableDefinition(tableId, table.getResolvedSchema)) } - private def extractTableDefinition(tableId: ObjectIdentifier) = { - val table = Try(tableEnv.from(tableId.toString)).fold( - ex => throw new IllegalStateException(s"Table extractor could not locate a created table with id: $tableId", ex), - identity - ) - TableDefinition(tableId, table.getResolvedSchema) + private def extractTable(tableId: ObjectIdentifier): Table = Try(env.from(tableId.toString)).fold( + ex => throw new IllegalStateException(s"Table extractor could not locate a created table with id: $tableId", ex), + identity + ) + + private def validateTable( + connector: String, + table: Table, + tableName: String, + env: StreamTableEnvironment + ): ValidatedNel[FlinkDataDefinitionDiscoveryError, Unit] = { + val tableFactory = Try { + FactoryUtil.discoverFactory(getClass.getClassLoader, classOf[DynamicTableFactory], connector) + }.fold(ex => ConnectorDiscoveryProblem(connector, ex).invalidNel, factory => factory.validNel) + + tableFactory.andThen { factory => + val sourceValidationResult = factory match { + case _: DynamicTableSourceFactory => Some(validateSource(table)) + case _ => None + } + + val sinkValidationResult = factory match { + case _: DynamicTableSinkFactory if table.getResolvedSchema.containsPersistableMetadataColumns() => { + logger.warn(s"Ommitted table [$tableName] as a Sink since persistable metadata columns are not supported") + None + } + case _: DynamicTableSinkFactory => Some(validateSink(table, tableName, env)) + case _ => None + } + + (sourceValidationResult, sinkValidationResult) match { + case (None, None) => NonEmptyList.one(NoSinkOrSourceImplementationsFoundForConnector(connector)).invalid + case (Some(sourceResult), Some(sinkResult)) => 
sourceResult.combine(sinkResult) + case (Some(sourceResult), None) => sourceResult + case (None, Some(sinkResult)) => sinkResult + } + } + } + + private def validateSource(table: Table) = { + Try { + table + .insertInto( + TableDescriptor + .forConnector("blackhole") + .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toSourceRowDataType).build) + .build() + ) + .compilePlan() + }.toEither + .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) + .toValidated + .void + } + + private def validateSink(table: Table, tableName: String, env: StreamTableEnvironment) = { + Try { + env + .from( + TableDescriptor + .forConnector("datagen") + .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toPhysicalRowDataType).build) + .build() + ) + .insertInto(tableName) + .compilePlan() + }.toEither + .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) + .toValidated + .void } } -object TablesDefinitionDiscovery { +object TablesDefinitionDiscovery extends LazyLogging { def prepareDiscovery( - flinkDataDefinition: FlinkDataDefinition - ): ValidatedNel[DataDefinitionRegistrationError, TablesDefinitionDiscovery] = { - val environmentSettings = EnvironmentSettings - .newInstance() - .build() - prepareDiscovery(flinkDataDefinition, environmentSettings) - } - - private[definition] def prepareDiscovery( flinkDataDefinition: FlinkDataDefinition, - environmentSettings: EnvironmentSettings - ): ValidatedNel[DataDefinitionRegistrationError, TablesDefinitionDiscovery] = { - val tableEnv = TableEnvironment.create(environmentSettings) - flinkDataDefinition.registerIn(tableEnv).map(_ => new TablesDefinitionDiscovery(tableEnv)) + miniCluster: FlinkMiniClusterWithServices, + classLoader: URLClassLoader + ): ValidatedNel[FlinkDataDefinitionRegistrationError, TablesDefinitionDiscovery] = { + ThreadUtils.withThisAsContextClassLoader(classLoader) { + miniCluster.withDetachedStreamExecutionEnvironment { env => + val streamTableEnv = StreamTableEnvironment.create( + env, + EnvironmentSettings + .newInstance() + .withClassLoader(classLoader) + .withConfiguration(Configuration.fromMap(env.getConfiguration.toMap)) + .build() + ) + flinkDataDefinition + .registerIn(streamTableEnv) + .map(_ => new TablesDefinitionDiscovery(streamTableEnv)) + } + } } } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala new file mode 100644 index 00000000000..4d28472db32 --- /dev/null +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala @@ -0,0 +1,88 @@ +package pl.touk.nussknacker.engine.flink.table.definition + +import cats.data.{NonEmptyList, ValidatedNel} +import cats.implicits._ +import org.apache.flink.configuration.Configuration +import org.apache.flink.table.factories.{CatalogFactory, FactoryUtil} +import org.apache.flink.table.factories.FactoryUtil.DefaultCatalogContext +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement.{ + CreateCatalog, + CreateTable +} +import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ + CatalogDiscoveryProblem, + CatalogNonTransientValidationError +} + +import 
java.net.URLClassLoader +import scala.jdk.CollectionConverters._ +import scala.util.Try + +// This is used by external project +object TablesDefinitionValidation { + + def validateWithoutExternalConnections( + sqlDdlStatements: String, + minicluster: FlinkMiniClusterWithServices, + classLoader: URLClassLoader + ): ValidatedNel[FlinkDataDefinitionError, Unit] = { + FlinkDdlParser.parse(sqlDdlStatements).andThen { statements => + val createTableStatements = statements.collect { case ct: CreateTable => ct } + val createTableValidationResult = if (createTableStatements.isEmpty) { + ().validNel + } else { + FlinkDataDefinition + .apply(createTableStatements, None) + .andThen { definition => + TablesDefinitionDiscovery + .prepareDiscovery(definition, minicluster, classLoader) + .andThen(_.listTables.sequence) + } + .void + } + + val createCatalogStatements = statements.collect { case ct: CreateCatalog => ct } + val createCatalogValidationResult = if (createCatalogStatements.isEmpty) { + ().validNel + } else { + createCatalogStatements.map(cc => validateCatalogWithoutExternalCalls(cc, classLoader)).sequence.void + } + + createTableValidationResult.combine(createCatalogValidationResult) + } + } + + private def validateCatalogWithoutExternalCalls( + createCatalog: CreateCatalog, + classLoader: URLClassLoader + ): ValidatedNel[FlinkDataDefinitionDiscoveryError, Unit] = { + val catalogFactory = Try { + FactoryUtil.discoverFactory( + classLoader, + classOf[CatalogFactory], + createCatalog.catalogType.value + ) + }.fold(ex => CatalogDiscoveryProblem(createCatalog.catalogType.value, ex).invalidNel, factory => factory.validNel) + + catalogFactory.andThen(factory => { + val simulatedContext = new DefaultCatalogContext( + createCatalog.name.value, + createCatalog.options + .filter(_.key != "type") + .map(option => option.key -> option.value) + .toMap + .asJava, + new Configuration(), + classLoader + ) + Try { + factory.createCatalog(simulatedContext) + }.toEither + .leftMap(e => NonEmptyList.one(CatalogNonTransientValidationError(e))) + .toValidated + .void + }) + } + +} diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala index eccc9806dcc..c82bdff3208 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala @@ -3,6 +3,7 @@ package pl.touk.nussknacker.engine.flink.table.sink import cats.data.{NonEmptyList, Validated, ValidatedNel} import cats.data.Validated.{invalid, valid} import cats.implicits._ +import com.typesafe.scalalogging.LazyLogging import pl.touk.nussknacker.engine.api.{NodeId, Params} import pl.touk.nussknacker.engine.api.component.{Component, ProcessingMode} import pl.touk.nussknacker.engine.api.component.Component.AllowedProcessingModes.SetOf @@ -16,6 +17,7 @@ import pl.touk.nussknacker.engine.api.context.transformation.{ import pl.touk.nussknacker.engine.api.definition.{BoolParameterEditor, NodeDependency, Parameter, ParameterDeclaration} import pl.touk.nussknacker.engine.api.parameter.ParameterName import pl.touk.nussknacker.engine.api.process.{Sink, SinkFactory} +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableDefinition import 
pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition._ @@ -31,18 +33,30 @@ import pl.touk.nussknacker.engine.util.parameters.{ } import pl.touk.nussknacker.engine.util.sinkvalue.SinkValue +import java.net.URLClassLoader +import scala.collection.compat.toTraversableLikeExtensionMethods import scala.collection.immutable.ListMap import scala.jdk.CollectionConverters._ -class TableSinkFactory(flinkDataDefinition: FlinkDataDefinition) - extends SingleInputDynamicComponent[Sink] - with SinkFactory { +class TableSinkFactory( + flinkDataDefinition: FlinkDataDefinition, + miniCluster: FlinkMiniClusterWithServices, + classLoader: URLClassLoader +) extends SingleInputDynamicComponent[Sink] + with SinkFactory + with LazyLogging { override def allowedProcessingModes: Component.AllowedProcessingModes = SetOf(ProcessingMode.UnboundedStream, ProcessingMode.BoundedStream) @transient - private lazy val tablesDiscovery = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).orFail + private lazy val tablesDiscovery = TablesDefinitionDiscovery + .prepareDiscovery( + flinkDataDefinition, + miniCluster, + classLoader + ) + .orFail override type State = TableSinkFactoryState @@ -59,7 +73,9 @@ class TableSinkFactory(flinkDataDefinition: FlinkDataDefinition) override def nodeDependencies: List[NodeDependency] = List.empty private lazy val prepareInitialParameters: ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val tableDefinitions = tablesDiscovery.listTables + val (errors, tableDefinitions) = tablesDiscovery.listTables.map(_.toEither).partitionMap(identity) + errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) + val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) NextParameters( parameters = tableNameParamDeclaration.createParameter() :: rawModeParameterDeclaration.createParameter() :: Nil, diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/FlinkMiniClusterTableOperations.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/FlinkMiniClusterTableOperations.scala index 8ef13120612..049250500ae 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/FlinkMiniClusterTableOperations.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/FlinkMiniClusterTableOperations.scala @@ -22,11 +22,10 @@ import java.util.UUID import scala.jdk.CollectionConverters._ import scala.util.{Failure, Success, Try, Using} -object FlinkMiniClusterTableOperations extends LazyLogging { +class FlinkMiniClusterTableOperations(env: StreamTableEnvironment) extends LazyLogging { def parseTestRecords(records: List[TestRecord], schema: Schema): List[Row] = { - implicit val env: StreamTableEnvironment = MiniClusterEnvBuilder.buildStreamTableEnv - val (inputTablePath, inputTableName) = createTempFileTable(schema) + val (inputTablePath, inputTableName) = createTempFileTable(schema) val parsedRecords = Try { writeRecordsToFile(inputTablePath, records) val inputTable = env.from(s"`$inputTableName`") @@ -44,25 +43,22 @@ object FlinkMiniClusterTableOperations extends LazyLogging { ): TestData = generateTestData( limit = limit, schema = schema, - buildSourceTable = createLiveDataGeneratorTable(flinkDataDefinition, tableId, 
schema) + sourceTable = createLiveDataGeneratorTable(flinkDataDefinition, tableId, schema) ) def generateRandomTestData(amount: Int, schema: Schema): TestData = generateTestData( limit = amount, schema = schema, - buildSourceTable = createRandomDataGeneratorTable(amount, schema) + sourceTable = createRandomDataGeneratorTable(amount, schema) ) private type TableName = String - // TODO: check the minicluster releases memory properly and if not, refactor to reuse one minicluster per all usages private def generateTestData( limit: Int, schema: Schema, - buildSourceTable: TableEnvironment => Table + sourceTable: Table ): TestData = { - implicit val env: TableEnvironment = MiniClusterEnvBuilder.buildTableEnv - val sourceTable = buildSourceTable(env) val (outputFilePath, outputTableName) = createTempFileTable(schema) val generatedRows = Try { insertDataAndAwait(sourceTable, outputTableName, limit) @@ -103,7 +99,7 @@ object FlinkMiniClusterTableOperations extends LazyLogging { private def createRandomDataGeneratorTable( amountOfRecordsToGenerate: Int, flinkTableSchema: Schema, - )(env: TableEnvironment): Table = { + ): Table = { val tableName = generateTableName env.createTemporaryTable( tableName, @@ -120,12 +116,12 @@ object FlinkMiniClusterTableOperations extends LazyLogging { flinkDataDefinition: FlinkDataDefinition, tableId: ObjectIdentifier, schema: Schema - )(env: TableEnvironment): Table = { + ): Table = { flinkDataDefinition.registerIn(env).orFail env.from(tableId.toString).select(schema.getColumns.asScala.map(_.getName).map($).toList: _*) } - private def createTempFileTable(flinkTableSchema: Schema)(implicit env: TableEnvironment): (Path, TableName) = { + private def createTempFileTable(flinkTableSchema: Schema): (Path, TableName) = { val tempTestDataOutputFilePrefix = "tableSourceDataDump-" val tempDir = Files.createTempDirectory(tempTestDataOutputFilePrefix) logger.debug(s"Created temporary directory for dumping test data at: '${tempDir.toUri.toURL}'") @@ -159,38 +155,19 @@ object FlinkMiniClusterTableOperations extends LazyLogging { private def generateTableName: TableName = s"testDataInputTable_${UUID.randomUUID().toString.replaceAll("-", "")}" - private object MiniClusterEnvBuilder { - - private lazy val streamEnvConfig = { - val conf = new Configuration() - - // parent-first - otherwise linkage error (loader constraint violation, a different class with the same name was - // previously loaded by 'app') for class 'org.apache.commons.math3.random.RandomDataGenerator' - conf.set(CoreOptions.CLASSLOADER_RESOLVE_ORDER, "parent-first") - - // Here is a hidden assumption that getClass.getClassLoader is the model classloader and another hidden assumuption that model classloader has all necessary jars (including connectors) - // TODO: we should explicitly pass model classloader + we should split model classloader into libs that are only for - // testing mechanism purpose (in the real deployment, they are already available in Flink), for example table connectors - Thread.currentThread().getContextClassLoader match { - case url: URLClassLoader => - conf.set(PipelineOptions.CLASSPATHS, url.getURLs.toList.map(_.toString).asJava) - case _ => - logger.warn( - "Context classloader is not a URLClassLoader. Probably data generation invocation wasn't wrapped with ModelData.withThisAsContextClassLoader. MiniCluster classpath set up will be skipped." 
- ) - } - conf.set(CoreOptions.DEFAULT_PARALLELISM, Int.box(1)) - } - - private lazy val tableEnvConfig = EnvironmentSettings.newInstance().withConfiguration(streamEnvConfig).build() +} - def buildTableEnv: TableEnvironment = TableEnvironment.create(tableEnvConfig) +object MiniClusterEnvBuilder extends LazyLogging { - def buildStreamTableEnv: StreamTableEnvironment = StreamTableEnvironment.create( - StreamExecutionEnvironment.createLocalEnvironment(streamEnvConfig), - tableEnvConfig + def buildStreamTableEnv( + baseStreamExecutionEnv: StreamExecutionEnvironment + ): StreamTableEnvironment = + StreamTableEnvironment.create( + baseStreamExecutionEnv, + EnvironmentSettings + .newInstance() + .withConfiguration(Configuration.fromMap(baseStreamExecutionEnv.getConfiguration.toMap)) + .build() ) - } - } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala index 3f54cdbb6dc..38be731e537 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala @@ -1,5 +1,6 @@ package pl.touk.nussknacker.engine.flink.table.source +import com.typesafe.scalalogging.LazyLogging import org.apache.flink.streaming.api.datastream.DataStream import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.apache.flink.table.api.{DataTypes, Schema} @@ -21,6 +22,7 @@ import pl.touk.nussknacker.engine.flink.api.process.{ StandardFlinkSource } import pl.touk.nussknacker.engine.flink.api.timestampwatermark.TimestampWatermarkHandler +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableDefinition @@ -38,11 +40,13 @@ import scala.jdk.CollectionConverters._ class TableSource( tableDefinition: TableDefinition, flinkDataDefinition: FlinkDataDefinition, - testDataGenerationMode: TestDataGenerationMode + testDataGenerationMode: TestDataGenerationMode, + miniCluster: FlinkMiniClusterWithServices ) extends StandardFlinkSource[Row] with TestWithParametersSupport[Row] with FlinkSourceTestSupport[Row] - with TestDataGenerator { + with TestDataGenerator + with LazyLogging { override def sourceStream( env: StreamExecutionEnvironment, @@ -98,7 +102,10 @@ class TableSource( .build() } (testRecords: List[TestRecord]) => - FlinkMiniClusterTableOperations.parseTestRecords(testRecords, tableDataParserSchema) + miniCluster.withDetachedStreamExecutionEnvironment { env => + new FlinkMiniClusterTableOperations(MiniClusterEnvBuilder.buildStreamTableEnv(env)) + .parseTestRecords(testRecords, tableDataParserSchema) + } } override def generateTestData(size: Int): TestData = { @@ -106,19 +113,22 @@ class TableSource( val dataType = DataTypes.ROW(fieldsWithoutComputedColumns: _*) Schema.newBuilder().fromRowDataType(dataType).build() } - testDataGenerationMode match { - case TestDataGenerationMode.Random => - FlinkMiniClusterTableOperations.generateRandomTestData( - amount = size, - schema = generateDataSchema - ) - case TestDataGenerationMode.Live => - 
FlinkMiniClusterTableOperations.generateLiveTestData( - limit = size, - schema = generateDataSchema, - flinkDataDefinition = flinkDataDefinition, - tableId = tableDefinition.tableId - ) + miniCluster.withDetachedStreamExecutionEnvironment { env => + val tableOps = new FlinkMiniClusterTableOperations(MiniClusterEnvBuilder.buildStreamTableEnv(env)) + testDataGenerationMode match { + case TestDataGenerationMode.Random => + tableOps.generateRandomTestData( + amount = size, + schema = generateDataSchema + ) + case TestDataGenerationMode.Live => + tableOps.generateLiveTestData( + limit = size, + schema = generateDataSchema, + flinkDataDefinition = flinkDataDefinition, + tableId = tableDefinition.tableId + ) + } } } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala index da214b000e7..81a2dbb8097 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala @@ -1,5 +1,6 @@ package pl.touk.nussknacker.engine.flink.table.source +import com.typesafe.scalalogging.LazyLogging import pl.touk.nussknacker.engine.api.{NodeId, Params} import pl.touk.nussknacker.engine.api.component.{Component, ProcessingMode} import pl.touk.nussknacker.engine.api.component.Component.AllowedProcessingModes.SetOf @@ -11,6 +12,7 @@ import pl.touk.nussknacker.engine.api.context.transformation.{ } import pl.touk.nussknacker.engine.api.definition._ import pl.touk.nussknacker.engine.api.process.{BasicContextInitializer, Source, SourceFactory} +import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableDefinition import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} @@ -24,17 +26,29 @@ import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory._ +import java.net.URLClassLoader +import scala.collection.compat.toTraversableLikeExtensionMethods + class TableSourceFactory( flinkDataDefinition: FlinkDataDefinition, - testDataGenerationMode: TestDataGenerationMode + testDataGenerationMode: TestDataGenerationMode, + miniCluster: FlinkMiniClusterWithServices, + classLoader: URLClassLoader ) extends SingleInputDynamicComponent[Source] - with SourceFactory { + with SourceFactory + with LazyLogging { override def allowedProcessingModes: Component.AllowedProcessingModes = SetOf(ProcessingMode.UnboundedStream, ProcessingMode.BoundedStream) @transient - private lazy val tablesDiscovery = TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition).orFail + private lazy val tablesDiscovery = TablesDefinitionDiscovery + .prepareDiscovery( + flinkDataDefinition, + miniCluster, + classLoader + ) + .orFail override type State = TableSourceFactoryState @@ -42,7 +56,9 @@ class TableSourceFactory( implicit nodeId: NodeId ): this.ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val tableDefinitions = tablesDiscovery.listTables + val (errors, 
tableDefinitions) = tablesDiscovery.listTables.map(_.toEither).partitionMap(identity) + errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) + val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) NextParameters( parameters = tableNameParamDeclaration.createParameter() :: Nil, @@ -72,7 +88,7 @@ class TableSourceFactory( s"Unexpected final state determined during parameters validation: $finalStateOpt" ) } - new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode) + new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode, miniCluster) } override def nodeDependencies: List[NodeDependency] = List.empty diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/utils/SchemaExtensions.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/utils/SchemaExtensions.scala index 60e4659845f..0bcdae4ae65 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/utils/SchemaExtensions.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/utils/SchemaExtensions.scala @@ -27,6 +27,13 @@ object SchemaExtensions { ) } + def containsPersistableMetadataColumns(): Boolean = { + resolvedSchema.getColumns.asScala.toList.exists { + case col: Column.MetadataColumn => col.isPersisted + case _ => false + } + } + // This is a copy-paste of toRowDataType is public and returns fields instead of DataType // The original method is used in toSourceRowDataType and toSinkRowDataType, but is private def toRowDataTypeFields(columnPredicate: Column => Boolean): List[DataTypes.Field] = From 3a9ace5a04827ce2f0bd188a436c84875a22edd4 Mon Sep 17 00:00:00 2001 From: Arek Burdach Date: Mon, 10 Mar 2025 22:56:59 +0100 Subject: [PATCH 6/9] table components changes to work with minicluster --- build.sbt | 1 + .../TablesDefinitionDiscoveryTest.scala | 4 +- .../TableSourceDataGenerationTest.scala | 3 +- .../ModelClassLoaderSimulationSuite.scala | 2 +- ...linkTableDataSourceComponentProvider.scala | 9 +-- .../TablesDefinitionDiscovery.scala | 31 +++++----- .../TablesDefinitionValidation.scala | 2 +- .../flink/table/sink/TableSinkFactory.scala | 57 +++++++++++-------- .../table/source/TableSourceFactory.scala | 51 +++++++++-------- 9 files changed, 82 insertions(+), 78 deletions(-) diff --git a/build.sbt b/build.sbt index b365040a965..e305dbb2b2f 100644 --- a/build.sbt +++ b/build.sbt @@ -1875,6 +1875,7 @@ lazy val flinkTableApiComponents = (project in flink("components/table")) flinkComponentsUtils % Provided, jsonUtils % Provided, flinkMiniCluster % Provided, + extensionsApi % Provided, testUtils % Test, flinkComponentsTestkit % Test, ) diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala index 44fed7c3fdd..bd9921b3df9 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala @@ -40,7 +40,7 @@ class TablesDefinitionDiscoveryTest val parsedSql = FlinkDdlParser.parseUnsafe(sql) val dataDefinition = 
FlinkDataDefinition.applyUnsafe(parsedSql, None) TablesDefinitionDiscovery - .prepareDiscovery(dataDefinition, minicluster, simulatedModelClassloader) + .prepareDiscovery(dataDefinition, minicluster) .andThen(_.listTables.sequence) } @@ -86,7 +86,7 @@ class TablesDefinitionDiscoveryTest val catalogConfiguration = Configuration.fromMap(Map("type" -> StubbedCatalogFactory.catalogName).asJava) val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(List.empty, Some(catalogConfiguration)) val discovery = - TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition, minicluster, simulatedModelClassloader).validValue + TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition, minicluster).validValue val tableDefinition = discovery.listTables.sequence.validValue.loneElement tableDefinition.tableId.toString shouldBe s"`_nu_catalog`." + diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala index 2c35871811a..b54380d8754 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala @@ -33,8 +33,7 @@ class TableSourceDataGenerationTest val discovery = TablesDefinitionDiscovery .prepareDiscovery( flinkDataDefinition, - FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, new Configuration()), - simulatedModelClassloader + FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, new Configuration()) ) .validValue new TableSource( diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala index 60926785c37..c3ad72303ee 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala @@ -1,7 +1,7 @@ package pl.touk.nussknacker.engine.flink.table.utils import org.scalatest.{BeforeAndAfterAll, Suite} -import pl.touk.nussknacker.engine.util.loader.ModelClassLoader +import pl.touk.nussknacker.engine.classloader.ModelClassLoader // This is only for purpose of using an empty URLClassLoader as contextClassLoader in tests ran from Intellij Idea trait ModelClassLoaderSimulationSuite extends BeforeAndAfterAll { this: Suite => diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala index 5a032106cb2..e6f3d245c59 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/FlinkTableDataSourceComponentProvider.scala @@ -31,22 +31,17 @@ class FlinkTableDataSourceComponentProvider extends ComponentProvider 
with LazyL val parsedSqlStatements = sqlStatements.map(FlinkDdlParser.parseUnsafe).toList.flatten val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(parsedSqlStatements, catalogConfigurationOpt) - val modelClassLoader = castModelClassloader() - val miniCluster = FlinkMiniClusterFactory.createMiniClusterWithServices(modelClassLoader, new Configuration()) - List( ComponentDefinition( tableComponentName, new TableSourceFactory( flinkDataDefinition, - testDataGenerationModeOrDefault, - miniCluster, - modelClassLoader + testDataGenerationModeOrDefault ) ), ComponentDefinition( tableComponentName, - new TableSinkFactory(flinkDataDefinition, miniCluster, modelClassLoader) + new TableSinkFactory(flinkDataDefinition) ) ) } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala index 7bcf3c19c41..63aa35c4880 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala @@ -137,23 +137,22 @@ object TablesDefinitionDiscovery extends LazyLogging { def prepareDiscovery( flinkDataDefinition: FlinkDataDefinition, - miniCluster: FlinkMiniClusterWithServices, - classLoader: URLClassLoader + miniCluster: FlinkMiniClusterWithServices ): ValidatedNel[FlinkDataDefinitionRegistrationError, TablesDefinitionDiscovery] = { - ThreadUtils.withThisAsContextClassLoader(classLoader) { - miniCluster.withDetachedStreamExecutionEnvironment { env => - val streamTableEnv = StreamTableEnvironment.create( - env, - EnvironmentSettings - .newInstance() - .withClassLoader(classLoader) - .withConfiguration(Configuration.fromMap(env.getConfiguration.toMap)) - .build() - ) - flinkDataDefinition - .registerIn(streamTableEnv) - .map(_ => new TablesDefinitionDiscovery(streamTableEnv)) - } + // TODO: Check if this works without: + // 1. Setting ModelClassLoader as context classloader + // 2. 
Setting ModelClassLoader on EnvironmnentSettings for StreamTableEnv (this may be redundant anyways) + miniCluster.withDetachedStreamExecutionEnvironment { env => + val streamTableEnv = StreamTableEnvironment.create( + env, + EnvironmentSettings + .newInstance() + .withConfiguration(Configuration.fromMap(env.getConfiguration.toMap)) + .build() + ) + flinkDataDefinition + .registerIn(streamTableEnv) + .map(_ => new TablesDefinitionDiscovery(streamTableEnv)) } } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala index 4d28472db32..bbc103ec6da 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala @@ -36,7 +36,7 @@ object TablesDefinitionValidation { .apply(createTableStatements, None) .andThen { definition => TablesDefinitionDiscovery - .prepareDiscovery(definition, minicluster, classLoader) + .prepareDiscovery(definition, minicluster) .andThen(_.listTables.sequence) } .void diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala index c82bdff3208..028144f9f8b 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala @@ -14,7 +14,13 @@ import pl.touk.nussknacker.engine.api.context.transformation.{ NodeDependencyValue, SingleInputDynamicComponent } -import pl.touk.nussknacker.engine.api.definition.{BoolParameterEditor, NodeDependency, Parameter, ParameterDeclaration} +import pl.touk.nussknacker.engine.api.definition.{ + BoolParameterEditor, + NodeDependency, + Parameter, + ParameterDeclaration, + TypedNodeDependency +} import pl.touk.nussknacker.engine.api.parameter.ParameterName import pl.touk.nussknacker.engine.api.process.{Sink, SinkFactory} import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices @@ -39,9 +45,7 @@ import scala.collection.immutable.ListMap import scala.jdk.CollectionConverters._ class TableSinkFactory( - flinkDataDefinition: FlinkDataDefinition, - miniCluster: FlinkMiniClusterWithServices, - classLoader: URLClassLoader + flinkDataDefinition: FlinkDataDefinition ) extends SingleInputDynamicComponent[Sink] with SinkFactory with LazyLogging { @@ -49,39 +53,42 @@ class TableSinkFactory( override def allowedProcessingModes: Component.AllowedProcessingModes = SetOf(ProcessingMode.UnboundedStream, ProcessingMode.BoundedStream) - @transient - private lazy val tablesDiscovery = TablesDefinitionDiscovery - .prepareDiscovery( - flinkDataDefinition, - miniCluster, - classLoader - ) - .orFail - override type State = TableSinkFactoryState override def contextTransformation(context: ValidationContext, dependencies: List[NodeDependencyValue])( implicit nodeId: NodeId ): this.ContextTransformationDefinition = { - prepareInitialParameters orElse + prepareInitialParameters(dependencies) orElse rawModePrepareValueParameter orElse rawModeFinalStep(context) orElse 
nonRawModePrepareValueParameters(context) orElse nonRawModeValidateValueParametersFinalStep(context) } - override def nodeDependencies: List[NodeDependency] = List.empty - - private lazy val prepareInitialParameters: ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val (errors, tableDefinitions) = tablesDiscovery.listTables.map(_.toEither).partitionMap(identity) - errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) + private val miniclusterDependency = TypedNodeDependency[FlinkMiniClusterWithServices] + override def nodeDependencies: List[NodeDependency] = List(miniclusterDependency) - val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) - NextParameters( - parameters = tableNameParamDeclaration.createParameter() :: rawModeParameterDeclaration.createParameter() :: Nil, - errors = List.empty, - state = Some(AvailableTables(tableDefinitions)) - ) + private def prepareInitialParameters(dependencies: List[NodeDependencyValue]): ContextTransformationDefinition = { + case TransformationStep(Nil, _) => + val minicluster = miniclusterDependency.extract(dependencies) + val (errors, tableDefinitions) = TablesDefinitionDiscovery + .prepareDiscovery( + flinkDataDefinition, + minicluster + ) + .orFail + .listTables + .map(_.toEither) + .partitionMap(identity) + errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) + + val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) + NextParameters( + parameters = + tableNameParamDeclaration.createParameter() :: rawModeParameterDeclaration.createParameter() :: Nil, + errors = List.empty, + state = Some(AvailableTables(tableDefinitions)) + ) } private lazy val rawModePrepareValueParameter: ContextTransformationDefinition = { diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala index 81a2dbb8097..f1ebdc591a0 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala @@ -26,14 +26,11 @@ import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory._ -import java.net.URLClassLoader import scala.collection.compat.toTraversableLikeExtensionMethods class TableSourceFactory( flinkDataDefinition: FlinkDataDefinition, - testDataGenerationMode: TestDataGenerationMode, - miniCluster: FlinkMiniClusterWithServices, - classLoader: URLClassLoader + testDataGenerationMode: TestDataGenerationMode ) extends SingleInputDynamicComponent[Source] with SourceFactory with LazyLogging { @@ -41,39 +38,45 @@ class TableSourceFactory( override def allowedProcessingModes: Component.AllowedProcessingModes = SetOf(ProcessingMode.UnboundedStream, ProcessingMode.BoundedStream) - @transient - private lazy val tablesDiscovery = TablesDefinitionDiscovery - .prepareDiscovery( - flinkDataDefinition, - miniCluster, - classLoader - ) - .orFail - override type State = TableSourceFactoryState + private val miniclusterDependency = TypedNodeDependency[FlinkMiniClusterWithServices] + 
override def nodeDependencies: List[NodeDependency] = List(miniclusterDependency) + override def contextTransformation(context: ValidationContext, dependencies: List[NodeDependencyValue])( implicit nodeId: NodeId ): this.ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val (errors, tableDefinitions) = tablesDiscovery.listTables.map(_.toEither).partitionMap(identity) + val minicluster = miniclusterDependency.extract(dependencies) + + val (errors, tableDefinitions) = TablesDefinitionDiscovery + .prepareDiscovery( + flinkDataDefinition, + minicluster + ) + .orFail + .listTables + .map(_.toEither) + .partitionMap(identity) errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) NextParameters( parameters = tableNameParamDeclaration.createParameter() :: Nil, errors = List.empty, - state = Some(AvailableTables(tableDefinitions)) + state = Some(AvailableTables(tableDefinitions, minicluster)) ) case TransformationStep( (`tableNameParamName`, DefinedEagerParameter(tableName: String, _)) :: Nil, - Some(AvailableTables(tableDefinitions)) + Some(AvailableTables(tableDefinitions, discovery)) ) => val selectedTable = getSelectedTableUnsafe(tableName, tableDefinitions) val initializer = new BasicContextInitializer( selectedTable.schema.toSourceRowDataType.getLogicalType.toTypingResult ) - FinalResults.forValidation(context, Nil, Some(SelectedTable(selectedTable)))(initializer.validationContext) + FinalResults.forValidation(context, Nil, Some(SelectedTable(selectedTable, discovery)))( + initializer.validationContext + ) } override def implementation( @@ -81,26 +84,26 @@ class TableSourceFactory( dependencies: List[NodeDependencyValue], finalStateOpt: Option[State] ): Source = { - val selectedTable = finalStateOpt match { - case Some(SelectedTable(table)) => table + val (selectedTable, minicluster) = finalStateOpt match { + case Some(SelectedTable(table, minicluster)) => table -> minicluster case _ => throw new IllegalStateException( s"Unexpected final state determined during parameters validation: $finalStateOpt" ) } - new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode, miniCluster) + new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode, minicluster) } - override def nodeDependencies: List[NodeDependency] = List.empty - } object TableSourceFactory { sealed trait TableSourceFactoryState - private case class AvailableTables(tableDefinitions: List[TableDefinition]) extends TableSourceFactoryState + private case class AvailableTables(tableDefinitions: List[TableDefinition], minicluster: FlinkMiniClusterWithServices) + extends TableSourceFactoryState - private case class SelectedTable(tableDefinition: TableDefinition) extends TableSourceFactoryState + private case class SelectedTable(tableDefinition: TableDefinition, minicluster: FlinkMiniClusterWithServices) + extends TableSourceFactoryState } From d2985200688bb991305239ccfbc5f82120916349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82abek?= Date: Mon, 24 Mar 2025 12:52:07 +0100 Subject: [PATCH 7/9] table components changes to work with stream env --- .../TablesDefinitionDiscoveryTest.scala | 31 ++- .../TablesDefinitionValidationTest.scala | 7 +- .../TableSourceDataGenerationTest.scala | 20 +- .../ModelClassLoaderSimulationSuite.scala | 1 + .../TablesDefinitionDiscovery.scala | 246 +++++++++--------- .../TablesDefinitionValidation.scala | 9 +- 
.../flink/table/sink/TableSinkFactory.scala | 24 +- .../flink/table/source/TableSource.scala | 40 ++- .../table/source/TableSourceFactory.scala | 39 ++- 9 files changed, 220 insertions(+), 197 deletions(-) diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala index bd9921b3df9..18f3275195f 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscoveryTest.scala @@ -2,14 +2,15 @@ package pl.touk.nussknacker.engine.flink.table.definition import cats.implicits.{catsSyntaxValidatedId, toTraverseOps} import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment import org.apache.flink.table.api.DataTypes +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import org.apache.flink.table.catalog._ import org.scalatest.Inside.inside import org.scalatest.LoneElement import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers import org.scalatest.prop.TableDrivenPropertyChecks -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.EmptyDataDefinitionConfiguration import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ ConnectorDiscoveryProblem, @@ -34,14 +35,13 @@ class TablesDefinitionDiscoveryTest with PatientScalaFutures with ModelClassLoaderSimulationSuite { - private val minicluster = FlinkMiniClusterFactory.createUnitTestsMiniClusterWithServices() + private val env = StreamTableEnvironment.create(StreamExecutionEnvironment.getExecutionEnvironment) private def discoverTables(sql: String) = { val parsedSql = FlinkDdlParser.parseUnsafe(sql) val dataDefinition = FlinkDataDefinition.applyUnsafe(parsedSql, None) TablesDefinitionDiscovery - .prepareDiscovery(dataDefinition, minicluster) - .andThen(_.listTables.sequence) + .discoverTables(dataDefinition, env) } test("extracts table definition with correct source and sink data type") { @@ -58,7 +58,7 @@ class TablesDefinitionDiscoveryTest | 'path' = '.', | 'format' = 'csv' |);""".stripMargin - val tableDefinition = discoverTables(sql).validValue.loneElement + val tableDefinition = discoverTables(sql).loneElement.validValue val sourceRowType = tableDefinition.sourceRowDataType.toLogicalRowTypeUnsafe sourceRowType.getFieldNames.asScala shouldBe List( @@ -85,9 +85,8 @@ class TablesDefinitionDiscoveryTest test("use catalog configuration in data definition") { val catalogConfiguration = Configuration.fromMap(Map("type" -> StubbedCatalogFactory.catalogName).asJava) val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(List.empty, Some(catalogConfiguration)) - val discovery = - TablesDefinitionDiscovery.prepareDiscovery(flinkDataDefinition, minicluster).validValue - val tableDefinition = discovery.listTables.sequence.validValue.loneElement + val tableDefinition = + TablesDefinitionDiscovery.discoverTables(flinkDataDefinition, env).loneElement.validValue tableDefinition.tableId.toString shouldBe s"`_nu_catalog`." 
+ s"`${StubbedCatalogFactory.sampleBoundedTablePath.getDatabaseName}`." + @@ -106,7 +105,7 @@ class TablesDefinitionDiscoveryTest | 'path' = '.', | 'format' = 'csv' |);""".stripMargin - discoverTables(sql) shouldBe Symbol("valid") + discoverTables(sql).loneElement shouldBe Symbol("valid") } test("return error for empty flink data definition") { @@ -121,7 +120,7 @@ class TablesDefinitionDiscoveryTest |) WITH ( | 'connector' = 'datagen' |);""".stripMargin - val error = discoverTables(sql).invalidValue.toList.loneElement + val error = discoverTables(sql).loneElement.invalidValue.toList.loneElement inside(error) { case e: SqlStatementExecutionError => e.getMessage should include("Cause: Could not execute CreateTable in path `default_catalog`.`somedb`.`testTable`") @@ -137,7 +136,7 @@ class TablesDefinitionDiscoveryTest | 'password' = 'password', | 'base-url' = 'jdbc:postgresql://localhost:5432' |)""".stripMargin - val error = discoverTables(sql).invalidValue.toList.loneElement + val error = discoverTables(sql).loneElement.invalidValue.toList.loneElement inside(error) { case e: CatalogRegistrationError => e.getMessage shouldBe """Could not create catalog. @@ -153,7 +152,7 @@ class TablesDefinitionDiscoveryTest |) WITH ( | 'connector' = 'not-on-classpath-connector' |)""".stripMargin - val error = discoverTables(sql).invalidValue.toList.loneElement + val error = discoverTables(sql).loneElement.invalidValue.toList.loneElement inside(error) { case e: ConnectorDiscoveryProblem => e.getMessage shouldBe "Could not find matching connector: [not-on-classpath-connector]" } @@ -168,7 +167,7 @@ class TablesDefinitionDiscoveryTest | 'path' = '.', | 'format' = 'not-on-classpath-format' |)""".stripMargin - val errors = discoverTables(sql).invalidValue.toList + val errors = discoverTables(sql).loneElement.invalidValue.toList inside(errors) { case err1 :: err2 :: Nil => err1.getMessage shouldBe "Could not find any format factory for identifier 'not-on-classpath-format' in the classpath." err2.getMessage shouldBe "Could not find any format factory for identifier 'not-on-classpath-format' in the classpath." @@ -183,7 +182,7 @@ class TablesDefinitionDiscoveryTest | 'connector' = 'datagen', | 'redundant' = '123' |)""".stripMargin - val error = discoverTables(sql).invalidValue.toList.loneElement + val error = discoverTables(sql).loneElement.invalidValue.toList.loneElement inside(error) { case e: TableEnvironmentRuntimeValidationError => e.getMessage shouldBe """|Unsupported options found for 'datagen'. @@ -213,7 +212,7 @@ class TablesDefinitionDiscoveryTest | 'connector' = 'blackhole', | 'redundant' = '123' |)""".stripMargin - val error = discoverTables(sql).invalidValue.toList.loneElement + val error = discoverTables(sql).loneElement.invalidValue.toList.loneElement inside(error) { case e: TableEnvironmentRuntimeValidationError => e.getMessage shouldBe """|Unsupported options found for 'blackhole'. @@ -244,7 +243,7 @@ class TablesDefinitionDiscoveryTest | 'format' = 'csv', | 'redundant' = '123' |);""".stripMargin - val error = discoverTables(sql).invalidValue.toList + val error = discoverTables(sql).loneElement.invalidValue.toList inside(error) { case List(e1: TableEnvironmentRuntimeValidationError, e2) => val expectedMessage = """|Unsupported options found for 'filesystem'. 
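Taken together, the test changes above replace the MiniCluster-backed prepareDiscovery/listTables pair with a single discoverTables(dataDefinition, env) call on a caller-provided StreamTableEnvironment. The following is a minimal sketch of that call pattern, assembled from the signatures visible in this series; the DDL text, the object name, and the project-internal import paths are assumptions for illustration, not part of the patch:

    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
    import org.apache.flink.table.api.bridge.java.StreamTableEnvironment
    // Project-internal paths below are assumed from the surrounding test sources.
    import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, FlinkDdlParser, TablesDefinitionDiscovery}

    object DiscoverTablesSketch extends App {
      // The tests now build the table environment directly from the current execution
      // environment instead of borrowing one from a Flink MiniCluster.
      val env = StreamTableEnvironment.create(StreamExecutionEnvironment.getExecutionEnvironment)

      val ddl =
        """CREATE TABLE testTable (
          |  someString STRING
          |) WITH (
          |  'connector' = 'datagen'
          |);""".stripMargin

      val dataDefinition = FlinkDataDefinition.applyUnsafe(FlinkDdlParser.parseUnsafe(ddl), None)

      // One ValidatedNel per discovered table: Invalid carries discovery/validation errors,
      // Valid carries the TableDefinition (object identifier plus resolved schema).
      val perTableResults = TablesDefinitionDiscovery.discoverTables(dataDefinition, env)
      perTableResults.foreach(println)
    }

The discoverTables implementation further down in this patch also drops the registered tables in a finally block, which is intended to let repeated discovery on the same environment start from a clean state (the TODO there still questions whether that cleanup is sufficient).
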
diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala index 9a5d4eecf37..7dbda8d8ecb 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidationTest.scala @@ -1,9 +1,10 @@ package pl.touk.nussknacker.engine.flink.table.definition +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import org.scalatest.{Inside, LoneElement} import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionCreationError.FlinkDdlParseError.ParseError import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ CatalogDiscoveryProblem, @@ -19,9 +20,9 @@ class TablesDefinitionValidationTest with ModelClassLoaderSimulationSuite with Inside { - private val minicluster = FlinkMiniClusterFactory.createUnitTestsMiniClusterWithServices() + private val env = StreamTableEnvironment.create(StreamExecutionEnvironment.getExecutionEnvironment) private def validate(sql: String) = - TablesDefinitionValidation.validateWithoutExternalConnections(sql, minicluster, simulatedModelClassloader) + TablesDefinitionValidation.validateWithoutExternalConnections(sql, env, simulatedModelClassloader) test("should not return external calls reliant validation errors") { val sql = diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala index b54380d8754..35ea6f800be 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceDataGenerationTest.scala @@ -1,11 +1,11 @@ package pl.touk.nussknacker.engine.flink.table.source import cats.implicits.toTraverseOps -import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import org.scalatest.LoneElement import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterFactory import pl.touk.nussknacker.engine.flink.table.FlinkSqlTableTestCases.allColumnTypesTable import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.definition.{ @@ -28,19 +28,17 @@ class TableSourceDataGenerationTest private val tableSource = { val flinkDataDefinition = FlinkDataDefinition.applyUnsafe(FlinkDdlParser.parseUnsafe(allColumnTypesTable), None) - val minicluster = - FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, 
new Configuration()) + val env = + StreamTableEnvironment.create(StreamExecutionEnvironment.getExecutionEnvironment) val discovery = TablesDefinitionDiscovery - .prepareDiscovery( - flinkDataDefinition, - FlinkMiniClusterFactory.createMiniClusterWithServices(simulatedModelClassloader, new Configuration()) - ) - .validValue + .discoverTables(flinkDataDefinition, env) + .sequence + new TableSource( - tableDefinition = discovery.listTables.sequence.validValue.loneElement, + tableDefinition = discovery.validValue.loneElement, flinkDataDefinition = flinkDataDefinition, testDataGenerationMode = TestDataGenerationMode.Random, - minicluster + env ) } diff --git a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala index c3ad72303ee..fd19fb4ae31 100644 --- a/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala +++ b/engine/flink/components/base-tests/src/test/scala/pl/touk/nussknacker/engine/flink/table/utils/ModelClassLoaderSimulationSuite.scala @@ -4,6 +4,7 @@ import org.scalatest.{BeforeAndAfterAll, Suite} import pl.touk.nussknacker.engine.classloader.ModelClassLoader // This is only for purpose of using an empty URLClassLoader as contextClassLoader in tests ran from Intellij Idea +// TODO: check if this will still be necessary after changes in NodeDependency trait ModelClassLoaderSimulationSuite extends BeforeAndAfterAll { this: Suite => private val originalContextClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala index 63aa35c4880..f1d34dce7bb 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionDiscovery.scala @@ -1,21 +1,18 @@ package pl.touk.nussknacker.engine.flink.table.definition -import cats.data.{NonEmptyList, ValidatedNel} +import cats.data.{NonEmptyList, Validated, ValidatedNel} import cats.implicits.{catsSyntaxValidatedId, toFunctorOps} import cats.syntax.either._ import com.typesafe.scalalogging.LazyLogging -import org.apache.flink.configuration.Configuration -import org.apache.flink.table.api.{EnvironmentSettings, Schema, Table, TableDescriptor} +import org.apache.flink.table.api.{Schema, Table, TableDescriptor} import org.apache.flink.table.api.bridge.java.StreamTableEnvironment -import org.apache.flink.table.catalog.{ObjectIdentifier, ObjectPath} -import org.apache.flink.table.catalog.Catalog +import org.apache.flink.table.catalog.{Catalog, ObjectIdentifier, ObjectPath} import org.apache.flink.table.factories.{ DynamicTableFactory, DynamicTableSinkFactory, DynamicTableSourceFactory, FactoryUtil } -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableDefinition import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDiscoveryError.{ ConnectorDiscoveryProblem, @@ -23,137 +20,146 @@ import 
pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinitionDisc TableEnvironmentRuntimeValidationError } import pl.touk.nussknacker.engine.flink.table.utils.SchemaExtensions.ResolvedSchemaExtensions -import pl.touk.nussknacker.engine.util.ThreadUtils -import java.net.URLClassLoader +import scala.jdk.CollectionConverters._ import scala.jdk.OptionConverters.RichOptional import scala.util.Try -class TablesDefinitionDiscovery(env: StreamTableEnvironment) extends LazyLogging { - - import scala.jdk.CollectionConverters._ - - def listTables: List[ValidatedNel[FlinkDataDefinitionDiscoveryError, TableDefinition]] = for { - catalogName <- env.listCatalogs().toList - catalog <- env.getCatalog(catalogName).toScala.toList - databaseName <- catalog.listDatabases.asScala.toList - tableName <- env.listTables(catalogName, databaseName).toList - } yield validateAndExtractTable(catalog, catalogName, databaseName, tableName) - - private def validateAndExtractTable( - catalog: Catalog, - catalogName: String, - databaseName: String, - tableName: String - ): ValidatedNel[FlinkDataDefinitionDiscoveryError, TableDefinition] = { - val tableId = ObjectIdentifier.of(catalogName, databaseName, tableName) - val baseTable = catalog.getTable(new ObjectPath(databaseName, tableName)) - val table = extractTable(tableId) - val connectorOpt = baseTable.getOptions.asScala.get("connector") - - val validation = connectorOpt match { - case Some(connector) => validateTable(connector, table, tableName, env) - case None => ().validNel // No connector found - may be a catalog-managed table - } - - validation.map(_ => TableDefinition(tableId, table.getResolvedSchema)) - } - - private def extractTable(tableId: ObjectIdentifier): Table = Try(env.from(tableId.toString)).fold( - ex => throw new IllegalStateException(s"Table extractor could not locate a created table with id: $tableId", ex), - identity - ) +object TablesDefinitionDiscovery extends LazyLogging { - private def validateTable( - connector: String, - table: Table, - tableName: String, + // TODO: Check if this works without: + // 1. Setting ModelClassLoader as context classloader + // 2. 
Setting ModelClassLoader on EnvironmnentSettings for StreamTableEnv (this may be redundant anyways) + def discoverTables( + flinkDataDefinition: FlinkDataDefinition, env: StreamTableEnvironment - ): ValidatedNel[FlinkDataDefinitionDiscoveryError, Unit] = { - val tableFactory = Try { - FactoryUtil.discoverFactory(getClass.getClassLoader, classOf[DynamicTableFactory], connector) - }.fold(ex => ConnectorDiscoveryProblem(connector, ex).invalidNel, factory => factory.validNel) - - tableFactory.andThen { factory => - val sourceValidationResult = factory match { - case _: DynamicTableSourceFactory => Some(validateSource(table)) - case _ => None + ): List[ValidatedNel[FlinkDataDefinitionError, TableDefinition]] = { + try { + flinkDataDefinition + .registerIn(env) + .fold( + e => List(e.invalid), + _ => new TablesDefinitionDiscoveryInternal(env).listTables + ) + } finally { + // TODO: check if its enough + for { + catalogName <- env.listCatalogs().toList + catalog <- env.getCatalog(catalogName).toScala.toList + databaseName <- catalog.listDatabases.asScala.toList + tableName <- env.listTables(catalogName, databaseName).toList + } yield { + env.useCatalog(catalogName) + env.useDatabase(databaseName) + env.executeSql(s"DROP TABLE `$tableName`") } + env.useCatalog("default_catalog") + env.useDatabase("default_database") + } + } - val sinkValidationResult = factory match { - case _: DynamicTableSinkFactory if table.getResolvedSchema.containsPersistableMetadataColumns() => { - logger.warn(s"Ommitted table [$tableName] as a Sink since persistable metadata columns are not supported") - None - } - case _: DynamicTableSinkFactory => Some(validateSink(table, tableName, env)) - case _ => None + private class TablesDefinitionDiscoveryInternal(env: StreamTableEnvironment) extends LazyLogging { + + import scala.jdk.CollectionConverters._ + + def listTables: List[ValidatedNel[FlinkDataDefinitionError, TableDefinition]] = for { + catalogName <- env.listCatalogs().toList + catalog <- env.getCatalog(catalogName).toScala.toList + databaseName <- catalog.listDatabases.asScala.toList + tableName <- env.listTables(catalogName, databaseName).toList + } yield validateAndExtractTable(catalog, catalogName, databaseName, tableName) + + private def validateAndExtractTable( + catalog: Catalog, + catalogName: String, + databaseName: String, + tableName: String + ): ValidatedNel[FlinkDataDefinitionDiscoveryError, TableDefinition] = { + val tableId = ObjectIdentifier.of(catalogName, databaseName, tableName) + val baseTable = catalog.getTable(new ObjectPath(databaseName, tableName)) + val table = extractTable(tableId) + val connectorOpt = baseTable.getOptions.asScala.get("connector") + + val validation = connectorOpt match { + case Some(connector) => validateTable(connector, table, tableName, env) + case None => ().validNel // No connector found - may be a catalog-managed table } - (sourceValidationResult, sinkValidationResult) match { - case (None, None) => NonEmptyList.one(NoSinkOrSourceImplementationsFoundForConnector(connector)).invalid - case (Some(sourceResult), Some(sinkResult)) => sourceResult.combine(sinkResult) - case (Some(sourceResult), None) => sourceResult - case (None, Some(sinkResult)) => sinkResult - } + validation.map(_ => TableDefinition(tableId, table.getResolvedSchema)) } - } - private def validateSource(table: Table) = { - Try { - table - .insertInto( - TableDescriptor - .forConnector("blackhole") - .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toSourceRowDataType).build) - .build() - ) 
- .compilePlan() - }.toEither - .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) - .toValidated - .void - } + private def extractTable(tableId: ObjectIdentifier): Table = Try(env.from(tableId.toString)).fold( + ex => throw new IllegalStateException(s"Table extractor could not locate a created table with id: $tableId", ex), + identity + ) + + private def validateTable( + connector: String, + table: Table, + tableName: String, + env: StreamTableEnvironment + ): ValidatedNel[FlinkDataDefinitionDiscoveryError, Unit] = { + val tableFactory = Try { + FactoryUtil.discoverFactory(getClass.getClassLoader, classOf[DynamicTableFactory], connector) + }.fold(ex => ConnectorDiscoveryProblem(connector, ex).invalidNel, factory => factory.validNel) + + tableFactory.andThen { factory => + val sourceValidationResult = factory match { + case _: DynamicTableSourceFactory => Some(validateSource(table)) + case _ => None + } - private def validateSink(table: Table, tableName: String, env: StreamTableEnvironment) = { - Try { - env - .from( - TableDescriptor - .forConnector("datagen") - .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toPhysicalRowDataType).build) - .build() - ) - .insertInto(tableName) - .compilePlan() - }.toEither - .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) - .toValidated - .void - } + val sinkValidationResult = factory match { + case _: DynamicTableSinkFactory if table.getResolvedSchema.containsPersistableMetadataColumns() => { + logger.warn(s"Ommitted table [$tableName] as a Sink since persistable metadata columns are not supported") + None + } + case _: DynamicTableSinkFactory => Some(validateSink(table, tableName, env)) + case _ => None + } -} + (sourceValidationResult, sinkValidationResult) match { + case (None, None) => NonEmptyList.one(NoSinkOrSourceImplementationsFoundForConnector(connector)).invalid + case (Some(sourceResult), Some(sinkResult)) => sourceResult.combine(sinkResult) + case (Some(sourceResult), None) => sourceResult + case (None, Some(sinkResult)) => sinkResult + } + } + } -object TablesDefinitionDiscovery extends LazyLogging { + private def validateSource(table: Table) = { + Try { + table + .insertInto( + TableDescriptor + .forConnector("blackhole") + .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toSourceRowDataType).build) + .build() + ) + .compilePlan() + }.toEither + .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) + .toValidated + .void + } - def prepareDiscovery( - flinkDataDefinition: FlinkDataDefinition, - miniCluster: FlinkMiniClusterWithServices - ): ValidatedNel[FlinkDataDefinitionRegistrationError, TablesDefinitionDiscovery] = { - // TODO: Check if this works without: - // 1. Setting ModelClassLoader as context classloader - // 2. 
Setting ModelClassLoader on EnvironmnentSettings for StreamTableEnv (this may be redundant anyways) - miniCluster.withDetachedStreamExecutionEnvironment { env => - val streamTableEnv = StreamTableEnvironment.create( - env, - EnvironmentSettings - .newInstance() - .withConfiguration(Configuration.fromMap(env.getConfiguration.toMap)) - .build() - ) - flinkDataDefinition - .registerIn(streamTableEnv) - .map(_ => new TablesDefinitionDiscovery(streamTableEnv)) + private def validateSink(table: Table, tableName: String, env: StreamTableEnvironment) = { + Try { + env + .from( + TableDescriptor + .forConnector("datagen") + .schema(Schema.newBuilder().fromRowDataType(table.getResolvedSchema.toPhysicalRowDataType).build) + .build() + ) + .insertInto(tableName) + .compilePlan() + }.toEither + .leftMap(e => NonEmptyList.one(TableEnvironmentRuntimeValidationError(e))) + .toValidated + .void } + } } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala index bbc103ec6da..662b2907c95 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/definition/TablesDefinitionValidation.scala @@ -3,9 +3,10 @@ package pl.touk.nussknacker.engine.flink.table.definition import cats.data.{NonEmptyList, ValidatedNel} import cats.implicits._ import org.apache.flink.configuration.Configuration +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import org.apache.flink.table.factories.{CatalogFactory, FactoryUtil} import org.apache.flink.table.factories.FactoryUtil.DefaultCatalogContext -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices +import pl.touk.nussknacker.engine.flink.table.TableDefinition import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition.FlinkSqlDdlStatement.{ CreateCatalog, CreateTable @@ -24,7 +25,7 @@ object TablesDefinitionValidation { def validateWithoutExternalConnections( sqlDdlStatements: String, - minicluster: FlinkMiniClusterWithServices, + env: StreamTableEnvironment, classLoader: URLClassLoader ): ValidatedNel[FlinkDataDefinitionError, Unit] = { FlinkDdlParser.parse(sqlDdlStatements).andThen { statements => @@ -35,9 +36,7 @@ object TablesDefinitionValidation { FlinkDataDefinition .apply(createTableStatements, None) .andThen { definition => - TablesDefinitionDiscovery - .prepareDiscovery(definition, minicluster) - .andThen(_.listTables.sequence) + TablesDefinitionDiscovery.discoverTables(definition, env).sequence } .void } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala index 028144f9f8b..547d54637dd 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala @@ -4,6 +4,10 @@ import cats.data.{NonEmptyList, Validated, ValidatedNel} import cats.data.Validated.{invalid, valid} import cats.implicits._ import com.typesafe.scalalogging.LazyLogging +import 
org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.table.api.EnvironmentSettings +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import pl.touk.nussknacker.engine.api.{NodeId, Params} import pl.touk.nussknacker.engine.api.component.{Component, ProcessingMode} import pl.touk.nussknacker.engine.api.component.Component.AllowedProcessingModes.SetOf @@ -65,19 +69,25 @@ class TableSinkFactory( nonRawModeValidateValueParametersFinalStep(context) } - private val miniclusterDependency = TypedNodeDependency[FlinkMiniClusterWithServices] - override def nodeDependencies: List[NodeDependency] = List(miniclusterDependency) + private val streamEnvDependency = TypedNodeDependency[StreamExecutionEnvironment] + override def nodeDependencies: List[NodeDependency] = List(streamEnvDependency) private def prepareInitialParameters(dependencies: List[NodeDependencyValue]): ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val minicluster = miniclusterDependency.extract(dependencies) + val streamEnv = streamEnvDependency.extract(dependencies) + val streamTableEnv = StreamTableEnvironment.create( + streamEnv, + EnvironmentSettings + .newInstance() + .withConfiguration(Configuration.fromMap(streamEnv.getConfiguration.toMap)) + .build() + ) + val (errors, tableDefinitions) = TablesDefinitionDiscovery - .prepareDiscovery( + .discoverTables( flinkDataDefinition, - minicluster + streamTableEnv ) - .orFail - .listTables .map(_.toEither) .partitionMap(identity) errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala index 38be731e537..92fcf6fcf7f 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSource.scala @@ -41,7 +41,7 @@ class TableSource( tableDefinition: TableDefinition, flinkDataDefinition: FlinkDataDefinition, testDataGenerationMode: TestDataGenerationMode, - miniCluster: FlinkMiniClusterWithServices + environmentForTestingPurposes: StreamTableEnvironment ) extends StandardFlinkSource[Row] with TestWithParametersSupport[Row] with FlinkSourceTestSupport[Row] @@ -102,10 +102,9 @@ class TableSource( .build() } (testRecords: List[TestRecord]) => - miniCluster.withDetachedStreamExecutionEnvironment { env => - new FlinkMiniClusterTableOperations(MiniClusterEnvBuilder.buildStreamTableEnv(env)) - .parseTestRecords(testRecords, tableDataParserSchema) - } + // TODO: clean up env if necessary + new FlinkMiniClusterTableOperations(environmentForTestingPurposes) + .parseTestRecords(testRecords, tableDataParserSchema) } override def generateTestData(size: Int): TestData = { @@ -113,22 +112,21 @@ class TableSource( val dataType = DataTypes.ROW(fieldsWithoutComputedColumns: _*) Schema.newBuilder().fromRowDataType(dataType).build() } - miniCluster.withDetachedStreamExecutionEnvironment { env => - val tableOps = new FlinkMiniClusterTableOperations(MiniClusterEnvBuilder.buildStreamTableEnv(env)) - testDataGenerationMode match { - case TestDataGenerationMode.Random => - tableOps.generateRandomTestData( - amount = size, - schema = generateDataSchema - ) 
- case TestDataGenerationMode.Live => - tableOps.generateLiveTestData( - limit = size, - schema = generateDataSchema, - flinkDataDefinition = flinkDataDefinition, - tableId = tableDefinition.tableId - ) - } + // TODO: clean up env if necessary + val tableOps = new FlinkMiniClusterTableOperations(environmentForTestingPurposes) + testDataGenerationMode match { + case TestDataGenerationMode.Random => + tableOps.generateRandomTestData( + amount = size, + schema = generateDataSchema + ) + case TestDataGenerationMode.Live => + tableOps.generateLiveTestData( + limit = size, + schema = generateDataSchema, + flinkDataDefinition = flinkDataDefinition, + tableId = tableDefinition.tableId + ) } } diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala index f1ebdc591a0..43d547ef10e 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala @@ -1,6 +1,10 @@ package pl.touk.nussknacker.engine.flink.table.source import com.typesafe.scalalogging.LazyLogging +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment +import org.apache.flink.table.api.EnvironmentSettings +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment import pl.touk.nussknacker.engine.api.{NodeId, Params} import pl.touk.nussknacker.engine.api.component.{Component, ProcessingMode} import pl.touk.nussknacker.engine.api.component.Component.AllowedProcessingModes.SetOf @@ -12,7 +16,6 @@ import pl.touk.nussknacker.engine.api.context.transformation.{ } import pl.touk.nussknacker.engine.api.definition._ import pl.touk.nussknacker.engine.api.process.{BasicContextInitializer, Source, SourceFactory} -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableDefinition import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} @@ -40,22 +43,30 @@ class TableSourceFactory( override type State = TableSourceFactoryState - private val miniclusterDependency = TypedNodeDependency[FlinkMiniClusterWithServices] - override def nodeDependencies: List[NodeDependency] = List(miniclusterDependency) + // TODO: StreamExecutionEnvironment or StreamTableEnvironment? 
StreamTableEnvironment would be better because we + // convert it into it in almost every case and it should be replaceable + // TODO: we need some wrapper to clean the env but its not so simple + private val streamEnvDependency = TypedNodeDependency[StreamExecutionEnvironment] + override def nodeDependencies: List[NodeDependency] = List(streamEnvDependency) override def contextTransformation(context: ValidationContext, dependencies: List[NodeDependencyValue])( implicit nodeId: NodeId ): this.ContextTransformationDefinition = { case TransformationStep(Nil, _) => - val minicluster = miniclusterDependency.extract(dependencies) + val streamEnv = streamEnvDependency.extract(dependencies) + val streamTableEnv = StreamTableEnvironment.create( + streamEnv, + EnvironmentSettings + .newInstance() + .withConfiguration(Configuration.fromMap(streamEnv.getConfiguration.toMap)) + .build() + ) val (errors, tableDefinitions) = TablesDefinitionDiscovery - .prepareDiscovery( + .discoverTables( flinkDataDefinition, - minicluster + streamTableEnv ) - .orFail - .listTables .map(_.toEither) .partitionMap(identity) errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) @@ -64,7 +75,7 @@ class TableSourceFactory( NextParameters( parameters = tableNameParamDeclaration.createParameter() :: Nil, errors = List.empty, - state = Some(AvailableTables(tableDefinitions, minicluster)) + state = Some(AvailableTables(tableDefinitions, streamTableEnv)) ) case TransformationStep( (`tableNameParamName`, DefinedEagerParameter(tableName: String, _)) :: Nil, @@ -84,14 +95,14 @@ class TableSourceFactory( dependencies: List[NodeDependencyValue], finalStateOpt: Option[State] ): Source = { - val (selectedTable, minicluster) = finalStateOpt match { - case Some(SelectedTable(table, minicluster)) => table -> minicluster + val (selectedTable, env) = finalStateOpt match { + case Some(SelectedTable(table, env)) => table -> env case _ => throw new IllegalStateException( s"Unexpected final state determined during parameters validation: $finalStateOpt" ) } - new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode, minicluster) + new TableSource(selectedTable, flinkDataDefinition, testDataGenerationMode, env) } } @@ -100,10 +111,10 @@ object TableSourceFactory { sealed trait TableSourceFactoryState - private case class AvailableTables(tableDefinitions: List[TableDefinition], minicluster: FlinkMiniClusterWithServices) + private case class AvailableTables(tableDefinitions: List[TableDefinition], env: StreamTableEnvironment) extends TableSourceFactoryState - private case class SelectedTable(tableDefinition: TableDefinition, minicluster: FlinkMiniClusterWithServices) + private case class SelectedTable(tableDefinition: TableDefinition, env: StreamTableEnvironment) extends TableSourceFactoryState } From 1c40a1369573ecb658a11cb2e9dfcddf02700531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82abek?= Date: Mon, 24 Mar 2025 13:09:59 +0100 Subject: [PATCH 8/9] fix build --- .../flink/table/sink/TableSinkFactory.scala | 15 ++++++++++++--- .../flink/table/source/TableSourceFactory.scala | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala index 547d54637dd..2ee7b24187a 100644 --- 
a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala @@ -29,7 +29,11 @@ import pl.touk.nussknacker.engine.api.parameter.ParameterName import pl.touk.nussknacker.engine.api.process.{Sink, SinkFactory} import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableDefinition -import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} +import pl.touk.nussknacker.engine.flink.table.definition.{ + FlinkDataDefinition, + FlinkDataDefinitionError, + TablesDefinitionDiscovery +} import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition._ import pl.touk.nussknacker.engine.flink.table.sink.TableSinkFactory._ import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ @@ -88,8 +92,13 @@ class TableSinkFactory( flinkDataDefinition, streamTableEnv ) - .map(_.toEither) - .partitionMap(identity) + .foldLeft((List.empty[FlinkDataDefinitionError], List.empty[TableDefinition])) { + case ((errs, tables), Validated.Invalid(errsNel)) => + (errs ++ errsNel.toList, tables) + case ((errs, tables), Validated.Valid(table)) => + (errs, table :: tables) + } + errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala index 43d547ef10e..c66a1b0aecf 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala @@ -1,5 +1,6 @@ package pl.touk.nussknacker.engine.flink.table.source +import cats.data.Validated import com.typesafe.scalalogging.LazyLogging import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment @@ -18,8 +19,11 @@ import pl.touk.nussknacker.engine.api.definition._ import pl.touk.nussknacker.engine.api.process.{BasicContextInitializer, Source, SourceFactory} import pl.touk.nussknacker.engine.flink.table.TableComponentProviderConfig.TestDataGenerationMode.TestDataGenerationMode import pl.touk.nussknacker.engine.flink.table.TableDefinition -import pl.touk.nussknacker.engine.flink.table.definition.{FlinkDataDefinition, TablesDefinitionDiscovery} -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition._ +import pl.touk.nussknacker.engine.flink.table.definition.{ + FlinkDataDefinition, + FlinkDataDefinitionError, + TablesDefinitionDiscovery +} import pl.touk.nussknacker.engine.flink.table.source.TableSourceFactory.{ AvailableTables, SelectedTable, @@ -67,8 +71,12 @@ class TableSourceFactory( flinkDataDefinition, streamTableEnv ) - .map(_.toEither) - .partitionMap(identity) + .foldLeft((List.empty[FlinkDataDefinitionError], List.empty[TableDefinition])) { + case ((errs, tables), Validated.Invalid(errsNel)) => + (errs ++ errsNel.toList, tables) + case ((errs, tables), Validated.Valid(table)) => + (errs, table :: tables) + } 
errors.foreach(logger.warn("A validation error occured when trying to use configured tables", _)) val tableNameParamDeclaration = TableComponentFactory.buildTableNameParam(tableDefinitions) From d452742dd41434f1e530ffac32865dbabe0c7043 Mon Sep 17 00:00:00 2001 From: Arek Burdach Date: Tue, 25 Mar 2025 16:02:05 +0100 Subject: [PATCH 9/9] tests fixes --- .../engine/flink/table/sink/TableSinkFactory.scala | 12 +----------- .../flink/table/source/TableSourceFactory.scala | 2 -- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala index 2ee7b24187a..a455adcf9d7 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/sink/TableSinkFactory.scala @@ -18,23 +18,15 @@ import pl.touk.nussknacker.engine.api.context.transformation.{ NodeDependencyValue, SingleInputDynamicComponent } -import pl.touk.nussknacker.engine.api.definition.{ - BoolParameterEditor, - NodeDependency, - Parameter, - ParameterDeclaration, - TypedNodeDependency -} +import pl.touk.nussknacker.engine.api.definition._ import pl.touk.nussknacker.engine.api.parameter.ParameterName import pl.touk.nussknacker.engine.api.process.{Sink, SinkFactory} -import pl.touk.nussknacker.engine.flink.minicluster.FlinkMiniClusterWithServices import pl.touk.nussknacker.engine.flink.table.TableDefinition import pl.touk.nussknacker.engine.flink.table.definition.{ FlinkDataDefinition, FlinkDataDefinitionError, TablesDefinitionDiscovery } -import pl.touk.nussknacker.engine.flink.table.definition.FlinkDataDefinition._ import pl.touk.nussknacker.engine.flink.table.sink.TableSinkFactory._ import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory @@ -47,8 +39,6 @@ import pl.touk.nussknacker.engine.util.parameters.{ } import pl.touk.nussknacker.engine.util.sinkvalue.SinkValue -import java.net.URLClassLoader -import scala.collection.compat.toTraversableLikeExtensionMethods import scala.collection.immutable.ListMap import scala.jdk.CollectionConverters._ diff --git a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala index c66a1b0aecf..c93bd936864 100644 --- a/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala +++ b/engine/flink/components/table/src/main/scala/pl/touk/nussknacker/engine/flink/table/source/TableSourceFactory.scala @@ -33,8 +33,6 @@ import pl.touk.nussknacker.engine.flink.table.utils.DataTypesExtensions._ import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory import pl.touk.nussknacker.engine.flink.table.utils.TableComponentFactory._ -import scala.collection.compat.toTraversableLikeExtensionMethods - class TableSourceFactory( flinkDataDefinition: FlinkDataDefinition, testDataGenerationMode: TestDataGenerationMode
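
The foldLeft that [PATCH 8/9] introduces in TableSinkFactory and TableSourceFactory (replacing the partitionMap call, whose scala-collection-compat extension import is then dropped in [PATCH 9/9]) can be read as a small standalone helper. The sketch below is that reading only; the object and method names are illustrative and do not appear in the patch:

    import cats.data.{Validated, ValidatedNel}

    object DiscoveryResultSplitting {

      // Splits per-table discovery results into accumulated errors and table definitions,
      // mirroring the fold used in the factories above.
      def splitDiscoveryResults[E, A](results: List[ValidatedNel[E, A]]): (List[E], List[A]) =
        results.foldLeft((List.empty[E], List.empty[A])) {
          case ((errs, oks), Validated.Invalid(errsNel)) =>
            (errs ++ errsNel.toList, oks)
          case ((errs, oks), Validated.Valid(ok)) =>
            // Prepending with :: keeps the fold cheap but yields valid results in
            // reverse input order, unlike the partitionMap call it replaced.
            (errs, ok :: oks)
        }

    }

In the factories themselves the error half is only logged, while the valid table definitions still feed the table name parameter, as the hunks above show.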