Skip to content

Commit e2dc851

Browse files
committed
Add metrics sorter
Also update commons to 1.5.0. Also update htsjdk to 3.0.5.
1 parent a745c1d commit e2dc851

File tree

6 files changed

+336
-53
lines changed

6 files changed

+336
-53
lines changed

build.sbt

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ lazy val root = Project(id="fgbio", base=file("."))
126126
"org.scala-lang" % "scala-reflect" % scalaVersion.value,
127127
"org.scala-lang" % "scala-compiler" % scalaVersion.value,
128128
"org.scala-lang.modules" %% "scala-xml" % "2.1.0",
129-
"com.fulcrumgenomics" %% "commons" % "1.4.0",
129+
"com.fulcrumgenomics" %% "commons" % "1.5.0",
130130
"com.fulcrumgenomics" %% "sopt" % "1.1.0",
131-
"com.github.samtools" % "htsjdk" % "2.24.1-26-ga38c78d-SNAPSHOT" excludeAll(htsjdkExcludes: _*),
131+
"com.github.samtools" % "htsjdk" % "3.0.5",
132132
"org.apache.commons" % "commons-math3" % "3.6.1",
133133
"com.beachape" %% "enumeratum" % "1.7.0",
134134
"com.intel.gkl" % "gkl" % "0.8.10",

src/main/scala/com/fulcrumgenomics/util/Metric.scala

+9-51
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,21 @@
2525

2626
package com.fulcrumgenomics.util
2727

28-
import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
29-
import com.fulcrumgenomics.commons.CommonsDef._
3028
import com.fulcrumgenomics.commons.io.{Writer => CommonsWriter}
3129
import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder}
32-
import com.fulcrumgenomics.commons.util.{DelimitedDataParser, LazyLogging}
30+
import com.fulcrumgenomics.commons.util.DelimitedDataParser
3331
import enumeratum.EnumEntry
3432
import htsjdk.samtools.util.Iso8601Date
3533

36-
import java.io.{PrintWriter, StringWriter, Writer}
34+
import java.io.Writer
3735
import java.nio.file.Path
3836
import java.text.{DecimalFormat, NumberFormat, SimpleDateFormat}
3937
import java.util.Date
4038
import scala.collection.compat._
4139
import scala.collection.concurrent.TrieMap
4240
import scala.reflect.runtime.{universe => ru}
43-
import scala.util.{Failure, Success}
4441

45-
object Metric extends LazyLogging {
42+
object Metric {
4643
val Delimiter: Char = '\t'
4744
val DelimiterAsString: String = s"$Delimiter"
4845

@@ -102,53 +99,14 @@ object Metric extends LazyLogging {
10299
/** Reads metrics from a set of lines. The first line should be the header with the field names. Each subsequent
103100
* line should be a single metric. */
104101
def iterator[T <: Metric](lines: Iterator[String], source: Option[String] = None)(implicit tt: ru.TypeTag[T]): Iterator[T] = {
105-
val clazz: Class[T] = ReflectionUtil.typeTagToClass[T]
106-
107-
def fail(lineNumber: Int,
108-
message: String,
109-
throwable: Option[Throwable] = None): Unit = {
110-
val sourceMessage = source.map("\nIn source: " + _).getOrElse("")
111-
val fullMessage = s"On line #$lineNumber for metric '${clazz.getSimpleName}'$sourceMessage\n$message"
112-
throwable.foreach { thr =>
113-
val stringWriter = new StringWriter
114-
thr.printStackTrace(new PrintWriter(stringWriter))
115-
val banner = "#" * 80
116-
logger.debug(banner)
117-
logger.debug(stringWriter.toString)
118-
logger.debug(banner)
119-
}
120-
throw FailureException(message=Some(fullMessage))
121-
}
122-
123-
if (lines.isEmpty) fail(lineNumber=1, message="No header found")
124-
val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true)
125-
val names = parser.headers.toIndexedSeq
126-
val reflectiveBuilder = new ReflectiveBuilder(clazz)
102+
val builder = new MetricBuilder[T](source=source)(tt)
103+
if (lines.isEmpty) builder.fail(message="No header found", lineNumber=Some(1))
104+
val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true)
105+
val names = parser.headers.toIndexedSeq
127106

128107
parser.zipWithIndex.map { case (row, rowIndex) =>
129-
forloop(from = 0, until = names.length) { i =>
130-
reflectiveBuilder.argumentLookup.forField(names(i)) match {
131-
case Some(arg) =>
132-
val value = {
133-
val tmp = row[String](i)
134-
if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp
135-
}
136-
137-
val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match {
138-
case Success(v) => v
139-
case Failure(thr) =>
140-
fail(lineNumber=rowIndex+2, message=s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'", Some(thr))
141-
}
142-
arg.value = argumentValue
143-
case None =>
144-
fail(lineNumber=rowIndex+2, message=s"Did not have a field with name '${names(i)}'.")
145-
}
146-
}
147-
148-
// build it. NB: if arguments are missing values, then an exception will be thrown here
149-
// Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as
150-
// missing.
151-
reflectiveBuilder.build(reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}")))
108+
val argMap = names.zipWithIndex.map { case (name, i) => name -> row[String](i) }.toMap
109+
builder.fromArgMap(argMap=argMap, lineNumber=Some(rowIndex+2))
152110
}
153111
}
154112

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright (c) 2022 Fulcrum Genomics
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*
24+
*/
25+
26+
package com.fulcrumgenomics.util
27+
28+
import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
29+
import com.fulcrumgenomics.commons.CommonsDef.{forloop, unreachable}
30+
import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder}
31+
import com.fulcrumgenomics.commons.util.LazyLogging
32+
33+
import java.io.{PrintWriter, StringWriter}
34+
import scala.reflect.runtime.{universe => ru}
35+
import scala.util.{Failure, Success}
36+
37+
/** Builds metrics of type [[T]] from string values.
  *
  * The reflective lookups needed to construct a [[T]] are performed once, when the builder is created, and
  * are re-used for every metric built afterwards.
  *
  * This is not thread-safe: a single underlying [[ReflectiveBuilder]] is reset and re-populated on every build.
  *
  * @param source optionally, the source of reading (e.g. file); only used in failure messages
  * @tparam T the metric type
  */
class MetricBuilder[T <: Metric](source: Option[String] = None)(implicit tt: ru.TypeTag[T]) extends LazyLogging {
  // The main reason why a builder is necessary is to cache some expensive reflective calls.
  private val clazz: Class[T]   = ReflectionUtil.typeTagToClass[T]
  private val reflectiveBuilder = new ReflectiveBuilder(clazz)
  private val names             = Metric.names[T]

  /** Builds a metric from a delimited line.
    *
    * @param line the line with delimited values, in the same order as the names defined in the class
    * @param delim the delimiter of the values; NB: it is passed to `String.split` and so is interpreted as a
    *              regular expression
    * @param lineNumber optionally, the line number when building a metric from a line in a file
    * @return a new instance of type [[T]]
    */
  def fromLine(line: String, delim: String = Metric.DelimiterAsString, lineNumber: Option[Int] = None): T = {
    // A negative limit keeps trailing empty strings.  Without it, a line whose last field(s) are empty would
    // yield too few values and be rejected by the field-count check in `fromValues`.
    fromValues(values = line.split(delim, -1), lineNumber = lineNumber)
  }

  /** Builds a metric from values for the complete set of metric fields.
    *
    * @param values the values in the same order as the names defined in the class
    * @param lineNumber optionally, the line number when building a metric from a line in a file
    * @return a new instance of type [[T]]
    */
  def fromValues(values: Iterable[String], lineNumber: Option[Int] = None): T = {
    // Materialize once so that the values are traversed a single time.
    val vals = values.toIndexedSeq
    if (names.length != vals.length) {
      fail(message = s"Failed decoding: expected '${names.length}' fields, found '${vals.length}'.", lineNumber = lineNumber)
    }
    fromArgMap(argMap = names.zip(vals).toMap, lineNumber = lineNumber)
  }

  /** Builds a metric of type [[T]].
    *
    * @param argMap map of field names to values.  All required fields must be given.  Can be in any order.
    * @param lineNumber optionally, the line number when building a metric from a line in a file
    * @return a new instance of type [[T]]
    */
  def fromArgMap(argMap: Map[String, String], lineNumber: Option[Int] = None): T = {
    reflectiveBuilder.reset() // reset the arguments to their initial values

    val fieldNames = argMap.keys.toIndexedSeq
    forloop(from = 0, until = fieldNames.length) { i =>
      val fieldName = fieldNames(i)
      reflectiveBuilder.argumentLookup.forField(fieldName) match {
        case Some(arg) =>
          // An empty string for an Option-typed argument is swapped for the special empty/none token
          val value = {
            val tmp = argMap(fieldName)
            if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp
          }

          val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match {
            case Success(v)   => v
            case Failure(thr) =>
              fail(
                message    = s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'",
                throwable  = Some(thr),
                lineNumber = lineNumber
              )
          }
          arg.value = argumentValue
        case None =>
          fail(
            message    = s"Did not have a field with name '$fieldName'.",
            lineNumber = lineNumber
          )
      }
    }

    // build it.  NB: if arguments are missing values, then an exception will be thrown here
    // Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as
    // missing.
    val params = reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}"))
    reflectiveBuilder.build(params)
  }

  /** Logs the throwable, if given, and throws a [[FailureException]] with information about when reading metrics fails.
    *
    * The stack trace of the throwable is logged at debug level only.  The exception message names the metric
    * class, the source (if one was given to the builder) and the line number (if given).
    *
    * NB: although declared as returning `Unit`, this method always throws and never returns normally.
    *
    * @param message the message to include in the exception thrown
    * @param throwable optionally, a throwable that should be logged
    * @param lineNumber optionally, the line number when building a metric from a line in a file
    */
  def fail(message: String, throwable: Option[Throwable] = None, lineNumber: Option[Int] = None): Unit = {
    throwable.foreach { thr =>
      val stringWriter = new StringWriter
      thr.printStackTrace(new PrintWriter(stringWriter))
      val banner = "#" * 80
      logger.debug(banner)
      logger.debug(stringWriter.toString)
      logger.debug(banner)
    }
    val sourceMessage = source.map("\nIn source: " + _).getOrElse("")
    val prefix        = lineNumber match {
      case None    => "For metric"
      case Some(n) => s"On line #$n for metric"
    }
    val fullMessage = s"$prefix '${clazz.getSimpleName}'$sourceMessage\n$message"

    throw FailureException(message = Some(fullMessage))
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright (c) 2022 Fulcrum Genomics
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*
24+
*/
25+
26+
package com.fulcrumgenomics.util
27+
28+
import com.fulcrumgenomics.commons.CommonsDef.DirPath
29+
30+
import scala.reflect.runtime.{universe => ru}
31+
32+
/** A disk-backed [[Sorter]] for metrics.
  *
  * Metrics are ordered by the key produced by `keyfunc` and are encoded and decoded with a
  * [[MetricSorter.MetricSorterCodec]].
  *
  * @param maxObjectsInRam the maximum number of metrics to keep in memory before spilling to disk
  * @param keyfunc method to convert a metric to an ordered key
  * @param tmpDir the temporary directory in which to spill to disk
  * @param tt the type tag for [[T]]
  * @tparam Key the key to use for sorting metrics
  * @tparam T the metric type
  */
class MetricSorter[Key <: Ordered[Key], T <: Metric](
  maxObjectsInRam: Int = MetricSorter.MaxInMemory,
  keyfunc: T => Key,
  tmpDir: DirPath = Io.tmpDir
)(implicit tt: ru.TypeTag[T])
  extends Sorter[T, Key](
    maxObjectsInRam = maxObjectsInRam,
    codec           = new MetricSorter.MetricSorterCodec[T](),
    keyfunc         = keyfunc,
    tmpDir          = tmpDir
  )
51+
52+
object MetricSorter {
  /** The default maximum # of records to keep and sort in memory. */
  val MaxInMemory: Int = 1e6.toInt

  /** The codec for encoding and decoding a metric.
    *
    * A metric is encoded as its values joined by [[Metric.DelimiterAsString]], as UTF-8 bytes.  The charset is
    * fixed, rather than left to the platform default, so that encoding and decoding do not depend on the JVM's
    * locale settings.
    */
  class MetricSorterCodec[T <: Metric]()(implicit tt: ru.TypeTag[T])
    extends Sorter.Codec[T] {
    import java.nio.charset.StandardCharsets.UTF_8

    private val builder = new MetricBuilder[T]()

    /** Encode the metric into an array of bytes. */
    def encode(metric: T): Array[Byte] = metric.values.mkString(Metric.DelimiterAsString).getBytes(UTF_8)

    /** Decode a metric from an array of bytes.
      *
      * @param bs the array containing the encoded metric
      * @param start the index of the first byte of the encoded metric
      * @param length the number of bytes in the encoded metric
      */
    def decode(bs: Array[Byte], start: Int, length: Int): T = {
      // Decode directly from the requested region instead of copying it into a new array first
      val line = new String(bs, start, length, UTF_8)
      // A negative limit keeps trailing empty strings, so that metrics whose last value(s) were encoded as
      // empty strings still yield one field per value.
      val fields = line.split(Metric.DelimiterAsString, -1)
      builder.fromValues(fields)
    }
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright (c) 2022 Fulcrum Genomics
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*
24+
*/
25+
26+
package com.fulcrumgenomics.util
27+
28+
import com.fulcrumgenomics.testing.UnitSpec
29+
30+
31+
/** Metric used to exercise [[MetricBuilder]]: `name` has no default value, while `count` defaults to 1. */
case class MetricBuilderTestMetric(name: String, count: Long = 1) extends Metric
32+
33+
/** Tests for [[MetricBuilder]], covering building from an argument map, from ordered values and from a
  * delimited line, as well as the failures raised for bad input.
  */
class MetricBuilderTest extends UnitSpec {
  import com.fulcrumgenomics.cmdline.FgBioMain.FailureException

  private val builder = new MetricBuilder[MetricBuilderTestMetric]()

  "MetricBuilder.fromArgMap" should "build a metric from an argmap with all values specified" in {
    builder.fromArgMap(Map("name" -> "foo", "count" -> "2")) shouldBe MetricBuilderTestMetric(name="foo", count=2)
  }

  it should "build a metric from an argmap with only required values specified" in {
    builder.fromArgMap(Map("name" -> "foo")) shouldBe MetricBuilderTestMetric(name="foo")
  }

  it should "fail when the argmap has a field that the metric does not have" in {
    a[FailureException] should be thrownBy builder.fromArgMap(Map("name" -> "foo", "nope" -> "1"))
  }

  it should "fail when a value cannot be converted to the type of its field" in {
    a[FailureException] should be thrownBy builder.fromArgMap(Map("name" -> "foo", "count" -> "abc"))
  }

  "MetricBuilder.fromValues" should "build a metric from values given in field order" in {
    builder.fromValues(Seq("foo", "2")) shouldBe MetricBuilderTestMetric(name="foo", count=2)
  }

  it should "fail when the number of values differs from the number of fields" in {
    a[FailureException] should be thrownBy builder.fromValues(Seq("foo"))
    a[FailureException] should be thrownBy builder.fromValues(Seq("foo", "2", "extra"))
  }

  "MetricBuilder.fromLine" should "build a metric from a delimited line" in {
    builder.fromLine(Seq("foo", "2").mkString(Metric.DelimiterAsString)) shouldBe MetricBuilderTestMetric(name="foo", count=2)
  }
}

0 commit comments

Comments
 (0)