Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,11 @@ The following table lists the versions supported of the main dependencies
| datalake-spark3 | `3.2.2` | `1.2.0` | `1.2.1` | `2.12` `2.13` | `1.0.6` |

## release

```
```shell
sbt "publishSigned; sonatypeRelease"
```

## local release
```shell
VERSION=14.14.2-SNAPSHOT sbt publishLocal
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

```
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@ import org.apache.spark.sql.functions.udf
import scala.util.matching.Regex

case class OmimPhenotype(name: String,
omim_id: String,
omim_id: Option[String],
inheritance: Option[Seq[String]],
inheritance_code: Option[Seq[String]])

object OmimPhenotype {

val pheno_regexp: Regex = "(.*)(?:,\\s(\\d*))?\\s\\([1234]\\)(?:,\\s(.*))?".r
val pheno_regexp: Regex = "(.*),\\s(\\d*)\\s\\(\\d\\)(?:,\\s(.*))?".r
val short_pheno_regexp: Regex = "^(.*)\\(\\d\\)(?:|, (.*))$".r

def mapInheritance(inheritance: String): Option[Seq[String]] = {
if (inheritance == null) None
Expand Down Expand Up @@ -54,7 +55,16 @@ object OmimPhenotype {
Some(
OmimPhenotype(
name.replace("{", "").replace("}", "").trim,
omim_id.trim,
Option(omim_id).map(_.trim).orElse(None),
mapInheritance(inheritance),
mapInheritanceCode(inheritance)
)
)
case short_pheno_regexp(name, inheritance) =>
Some(
OmimPhenotype(
name.replace("{", "").replace("}", "").trim,
None,
mapInheritance(inheritance),
mapInheritanceCode(inheritance)
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
package bio.ferlab.datalake.spark3.publictables.normalized

import bio.ferlab.datalake.commons.config.DatasetConf
import bio.ferlab.datalake.spark3.publictables.normalized.omim.OmimGeneSet
import bio.ferlab.datalake.spark3.testutils.WithTestConfig
import bio.ferlab.datalake.testutils.{CleanUpBeforeAll, CreateDatabasesBeforeAll, SparkSpec}
import bio.ferlab.datalake.testutils.models.normalized.{NormalizedOmimGeneSet, PHENOTYPE}
import bio.ferlab.datalake.testutils.models.raw.RawOmimGeneSet
import bio.ferlab.datalake.testutils.{SparkSpec, TestETLContext}

class OmimGeneSetSpec extends SparkSpec with WithTestConfig with CreateDatabasesBeforeAll with CleanUpBeforeAll {
class OmimGeneSetSpec extends SparkSpec with WithTestConfig {

import spark.implicits._

val source: DatasetConf = conf.getDataset("raw_omim_gene_set")
val destination: DatasetConf = conf.getDataset("normalized_omim_gene_set")

override val dbToCreate: List[String] = List(destination.table.map(_.database).getOrElse("variant"))
override val dsToClean: List[DatasetConf] = List(destination)
"transform" should "transform RawOmimGeneSet to NormalizedOmimGeneSet" in {

/*
//TODO fix this
ANTLR Tool version 4.7 used for code generation does not match the current runtime version 4.8
val rawOmimGeneSet2PhenotypeName = "Acute myeloid leukemia, somatic"
val rawOmimGeneSet3PhenotypeName = "Hemolytic anemia due to phosphofructokinase deficiency"

"ImportOmimGeneSet" should "transform data into expected format" in {
val inputData = Map(source.id -> Seq(RawOmimGeneSet(), // with phenotypes having omim_id and inheritance
RawOmimGeneSet(_c12 = rawOmimGeneSet2PhenotypeName + ", 601626 (3)"), // with phenotypes having omim_id and no inheritance
RawOmimGeneSet(_c12 = rawOmimGeneSet3PhenotypeName + " (1)") // with phenotypes having no omim_id and no inheritance
).toDF())

val inputDf = Map(source.id -> Seq(OmimGeneSetInput()).toDF())
val outputDf = new OmimGeneSet().transform(inputDf)
val resultDF = new OmimGeneSet(TestETLContext()).transformSingle(inputData)

outputDf.as[OmimGeneSetOutput].collect() should contain theSameElementsAs Seq(OmimGeneSetOutput())
val expectedResults = Seq(NormalizedOmimGeneSet(),
NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet2PhenotypeName,
omim_id = "601626", null, null)),
NormalizedOmimGeneSet(phenotype = PHENOTYPE(name = rawOmimGeneSet3PhenotypeName,
omim_id = null, null, null))
)
resultDF.as[NormalizedOmimGeneSet].collect() shouldBe expectedResults
}
*/
}