From 779172cb1435a6654c1b2e93f6d1139da7ba43d3 Mon Sep 17 00:00:00 2001 From: Emmanuel Nau Date: Fri, 16 Jan 2026 13:44:55 -0500 Subject: [PATCH 1/4] feat: CLIN-5494 Safe parse OMIM phenotypes attributes --- .../publictables/normalized/omim/OmimPhenotype.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala index bc769d14..9f256918 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala @@ -51,12 +51,14 @@ object OmimPhenotype { val parse_pheno: UserDefinedFunction = udf { raw: String => raw match { case pheno_regexp(name, omim_id, inheritance) => + val omimIdValue: Option[String] = Option(omim_id) + val inheritanceValue: Option[String] = Option(inheritance) Some( OmimPhenotype( name.replace("{", "").replace("}", "").trim, - omim_id.trim, - mapInheritance(inheritance), - mapInheritanceCode(inheritance) + omimIdValue.map(_.trim).getOrElse(""), + mapInheritance(inheritanceValue.map(_.trim).orNull), + mapInheritanceCode(inheritanceValue.map(_.trim).orNull) ) ) case _ => None From bf2032c1fd8e702b93a8493857071f199f7bc391 Mon Sep 17 00:00:00 2001 From: Emmanuel Nau Date: Wed, 21 Jan 2026 08:47:18 -0500 Subject: [PATCH 2/4] feat: CLIN-5494 Add doc for local publish --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3abee8f0..80ba1e40 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,11 @@ The following table lists the versions supported of the main dependencies | datalake-spark3 | `3.2.2` | `1.2.0` | `1.2.1` | `2.12` `2.13` | `1.0.6` | ## release - -``` +```shell sbt "publishSigned; sonatypeRelease" +``` + +## local release +```shell + sbt VERSION=14.14.2-SNAPSHOT sbt publishLocal ``` \ No newline at end of file From 1066a5b13f3ab41cffbf027a8c18b213431bbc1d Mon Sep 17 00:00:00 2001 From: Emmanuel Nau Date: Wed, 21 Jan 2026 15:11:07 -0500 Subject: [PATCH 3/4] feat: CLIN-5494 Revert regex changes to use a dedicated regex + test --- .../normalized/omim/OmimPhenotype.scala | 22 ++++++++---- .../normalized/OmimGeneSetSpec.scala | 34 ++++++++++++------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala index 9f256918..22632a8e 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala @@ -6,13 +6,14 @@ import org.apache.spark.sql.functions.udf import scala.util.matching.Regex case class OmimPhenotype(name: String, - omim_id: String, + omim_id: Option[String], inheritance: Option[Seq[String]], inheritance_code: Option[Seq[String]]) object OmimPhenotype { - val pheno_regexp: Regex = "(.*)(?:,\\s(\\d*))?\\s\\([1234]\\)(?:,\\s(.*))?".r + val pheno_regexp: Regex = "(.*),\\s(\\d*)\\s\\([1234]\\)(?:,\\s(.*))?".r + val pheno_regexp_no_omim_id: Regex = "(.*)\\s\\([1234]\\)".r def mapInheritance(inheritance: String): Option[Seq[String]] = { if (inheritance == null) None @@ -51,14 +52,21 @@ object OmimPhenotype { val parse_pheno: UserDefinedFunction = udf { raw: String => raw match { case pheno_regexp(name, omim_id, inheritance) => - val omimIdValue: Option[String] = Option(omim_id) - val inheritanceValue: Option[String] = Option(inheritance) Some( OmimPhenotype( name.replace("{", "").replace("}", "").trim, - omimIdValue.map(_.trim).getOrElse(""), - mapInheritance(inheritanceValue.map(_.trim).orNull), - mapInheritanceCode(inheritanceValue.map(_.trim).orNull) + Option(omim_id).map(_.trim).orElse(None), + mapInheritance(inheritance), + mapInheritanceCode(inheritance) + ) + ) + case pheno_regexp_no_omim_id(name) => + Some( + OmimPhenotype( + name.replace("{", "").replace("}", "").trim, + None, + None, + None ) ) case _ => None diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala index 656e50e2..e8d224df 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala @@ -1,28 +1,38 @@ package bio.ferlab.datalake.spark3.publictables.normalized import bio.ferlab.datalake.commons.config.DatasetConf +import bio.ferlab.datalake.spark3.publictables.normalized.omim.OmimGeneSet import bio.ferlab.datalake.spark3.testutils.WithTestConfig -import bio.ferlab.datalake.testutils.{CleanUpBeforeAll, CreateDatabasesBeforeAll, SparkSpec} +import bio.ferlab.datalake.testutils.models.normalized.{NormalizedOmimGeneSet, PHENOTYPE} +import bio.ferlab.datalake.testutils.models.raw.RawOmimGeneSet +import bio.ferlab.datalake.testutils.{SparkSpec, TestETLContext} -class OmimGeneSetSpec extends SparkSpec with WithTestConfig with CreateDatabasesBeforeAll with CleanUpBeforeAll { +class OmimGeneSetSpec extends SparkSpec with WithTestConfig { + + import spark.implicits._ val source: DatasetConf = conf.getDataset("raw_omim_gene_set") val destination: DatasetConf = conf.getDataset("normalized_omim_gene_set") - override val dbToCreate: List[String] = List(destination.table.map(_.database).getOrElse("variant")) - override val dsToClean: List[DatasetConf] = List(destination) + "transform" should "transform RawOmimGeneSet to NormalizedOmimGeneSet" in { - /* - //TODO fix this - ANTLR Tool version 4.7 used for code generation does not match the current runtime version 4.8 + val rawOmimGeneSet2PhenotypeName = "Acute myeloid leukemia, somatic" + val rawOmimGeneSet3PhenotypeName = "Hemolytic anemia due to phosphofructokinase deficiency" - "ImportOmimGeneSet" should "transform data into expected format" in { + val inputData = Map(source.id -> Seq(RawOmimGeneSet(), // with phenotypes having omim_id and inheritance + RawOmimGeneSet(_c12 = rawOmimGeneSet2PhenotypeName + ", 601626 (3)"), // with phenotypes having omim_id and no inheritance + RawOmimGeneSet(_c12 = rawOmimGeneSet3PhenotypeName + " (1)") // with phenotypes having no omim_id and no inheritance + ).toDF()) - val inputDf = Map(source.id -> Seq(OmimGeneSetInput()).toDF()) - val outputDf = new OmimGeneSet().transform(inputDf) + val resultDF = new OmimGeneSet(TestETLContext()).transformSingle(inputData) - outputDf.as[OmimGeneSetOutput].collect() should contain theSameElementsAs Seq(OmimGeneSetOutput()) + val expectedResults = Seq(NormalizedOmimGeneSet(), + NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet2PhenotypeName, + omim_id = "601626", null, null)), + NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet3PhenotypeName, + omim_id = null, null, null)) + ) + resultDF.as[NormalizedOmimGeneSet].collect() shouldBe expectedResults } - */ } From d57227968600e2f091f4b81136a15e579511fa48 Mon Sep 17 00:00:00 2001 From: Emmanuel Nau Date: Thu, 22 Jan 2026 09:08:41 -0500 Subject: [PATCH 4/4] feat: CLIN-5494 Update regex, to match genemap2-parser parsing --- .../publictables/normalized/omim/OmimPhenotype.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala index 22632a8e..817d5c4c 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala @@ -12,8 +12,8 @@ case class OmimPhenotype(name: String, object OmimPhenotype { - val pheno_regexp: Regex = "(.*),\\s(\\d*)\\s\\([1234]\\)(?:,\\s(.*))?".r - val pheno_regexp_no_omim_id: Regex = "(.*)\\s\\([1234]\\)".r + val pheno_regexp: Regex = "(.*),\\s(\\d*)\\s\\(\\d\\)(?:,\\s(.*))?".r + val short_pheno_regexp: Regex = "^(.*)\\(\\d\\)(?:|, (.*))$".r def mapInheritance(inheritance: String): Option[Seq[String]] = { if (inheritance == null) None @@ -60,13 +60,13 @@ object OmimPhenotype { mapInheritanceCode(inheritance) ) ) - case pheno_regexp_no_omim_id(name) => + case short_pheno_regexp(name, inheritance) => Some( OmimPhenotype( name.replace("{", "").replace("}", "").trim, None, - None, - None + mapInheritance(inheritance), + mapInheritanceCode(inheritance) ) ) case _ => None