diff --git a/README.md b/README.md index 3abee8f0..80ba1e40 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,11 @@ The following table lists the versions supported of the main dependencies | datalake-spark3 | `3.2.2` | `1.2.0` | `1.2.1` | `2.12` `2.13` | `1.0.6` | ## release - -``` +```shell sbt "publishSigned; sonatypeRelease" +``` + +## local release +```shell + sbt VERSION=14.14.2-SNAPSHOT sbt publishLocal ``` \ No newline at end of file diff --git a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala index bc769d14..817d5c4c 100644 --- a/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala +++ b/datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/omim/OmimPhenotype.scala @@ -6,13 +6,14 @@ import org.apache.spark.sql.functions.udf import scala.util.matching.Regex case class OmimPhenotype(name: String, - omim_id: String, + omim_id: Option[String], inheritance: Option[Seq[String]], inheritance_code: Option[Seq[String]]) object OmimPhenotype { - val pheno_regexp: Regex = "(.*)(?:,\\s(\\d*))?\\s\\([1234]\\)(?:,\\s(.*))?".r + val pheno_regexp: Regex = "(.*),\\s(\\d*)\\s\\(\\d\\)(?:,\\s(.*))?".r + val short_pheno_regexp: Regex = "^(.*)\\(\\d\\)(?:|, (.*))$".r def mapInheritance(inheritance: String): Option[Seq[String]] = { if (inheritance == null) None @@ -54,7 +55,16 @@ object OmimPhenotype { Some( OmimPhenotype( name.replace("{", "").replace("}", "").trim, - omim_id.trim, + Option(omim_id).map(_.trim).orElse(None), + mapInheritance(inheritance), + mapInheritanceCode(inheritance) + ) + ) + case short_pheno_regexp(name, inheritance) => + Some( + OmimPhenotype( + name.replace("{", "").replace("}", "").trim, + None, mapInheritance(inheritance), mapInheritanceCode(inheritance) ) diff --git a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala index 656e50e2..e8d224df 100644 --- a/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala +++ b/datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/publictables/normalized/OmimGeneSetSpec.scala @@ -1,28 +1,38 @@ package bio.ferlab.datalake.spark3.publictables.normalized import bio.ferlab.datalake.commons.config.DatasetConf +import bio.ferlab.datalake.spark3.publictables.normalized.omim.OmimGeneSet import bio.ferlab.datalake.spark3.testutils.WithTestConfig -import bio.ferlab.datalake.testutils.{CleanUpBeforeAll, CreateDatabasesBeforeAll, SparkSpec} +import bio.ferlab.datalake.testutils.models.normalized.{NormalizedOmimGeneSet, PHENOTYPE} +import bio.ferlab.datalake.testutils.models.raw.RawOmimGeneSet +import bio.ferlab.datalake.testutils.{SparkSpec, TestETLContext} -class OmimGeneSetSpec extends SparkSpec with WithTestConfig with CreateDatabasesBeforeAll with CleanUpBeforeAll { +class OmimGeneSetSpec extends SparkSpec with WithTestConfig { + + import spark.implicits._ val source: DatasetConf = conf.getDataset("raw_omim_gene_set") val destination: DatasetConf = conf.getDataset("normalized_omim_gene_set") - override val dbToCreate: List[String] = List(destination.table.map(_.database).getOrElse("variant")) - override val dsToClean: List[DatasetConf] = List(destination) + "transform" should "transform RawOmimGeneSet to NormalizedOmimGeneSet" in { - /* - //TODO fix this - ANTLR Tool version 4.7 used for code generation does not match the current runtime version 4.8 + val rawOmimGeneSet2PhenotypeName = "Acute myeloid leukemia, somatic" + val rawOmimGeneSet3PhenotypeName = "Hemolytic anemia due to phosphofructokinase deficiency" - "ImportOmimGeneSet" should "transform data into expected format" in { + val inputData = Map(source.id -> Seq(RawOmimGeneSet(), // with phenotypes having omim_id and inheritance + RawOmimGeneSet(_c12 = rawOmimGeneSet2PhenotypeName + ", 601626 (3)"), // with phenotypes having omim_id and no inheritance + RawOmimGeneSet(_c12 = rawOmimGeneSet3PhenotypeName + " (1)") // with phenotypes having no omim_id and no inheritance + ).toDF()) - val inputDf = Map(source.id -> Seq(OmimGeneSetInput()).toDF()) - val outputDf = new OmimGeneSet().transform(inputDf) + val resultDF = new OmimGeneSet(TestETLContext()).transformSingle(inputData) - outputDf.as[OmimGeneSetOutput].collect() should contain theSameElementsAs Seq(OmimGeneSetOutput()) + val expectedResults = Seq(NormalizedOmimGeneSet(), + NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet2PhenotypeName, + omim_id = "601626", null, null)), + NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet3PhenotypeName, + omim_id = null, null, null)) + ) + resultDF.as[NormalizedOmimGeneSet].collect() shouldBe expectedResults } - */ }