Skip to content

Commit a279b56

Browse files
authored
Add an option to sort by sequence role in CollectAlternateContigNames (#868)
* [feat] Add an option to sort by sequence role in CollectAlternateContigNames
1 parent f092245 commit a279b56

File tree

2 files changed

+58
-4
lines changed

2 files changed

+58
-4
lines changed

src/main/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNames.scala

+17-4
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,9 @@ object SequenceRole extends FgBioEnum[SequenceRole] {
9696
SequenceRole(roleValue)
9797
}
9898

99-
case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" }
99+
// NB: the order in which these are defined is important for when `--sort-by-sequence-role` is used
100100
case object AssembledMolecule extends SequenceRole { val key: String = "assembled-molecule" }
101+
case object AltScaffold extends SequenceRole { val key: String = "alt-scaffold" }
101102
case object FixPatch extends SequenceRole { val key: String = "fix-patch" }
102103
case object NovelPatch extends SequenceRole { val key: String = "novel-patch" }
103104
case object UnlocalizedScaffold extends SequenceRole { val key: String = "unlocalized-scaffold" }
@@ -131,9 +132,11 @@ class CollectAlternateContigNames
131132
@arg(flag='a', doc="The assembly report column(s) for the alternate contig name(s)", minElements=1) val alternates: Seq[AssemblyReportColumn],
132133
@arg(flag='s', doc="Only output sequences with the given sequence roles. If none given, all sequences will be output.", minElements=0)
133134
val sequenceRoles: Seq[SequenceRole] = Seq.empty,
134-
@arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.") val existing: Option[PathToSequenceDictionary] = None,
135+
@arg(flag='d', doc="Update an existing sequence dictionary file. The primary names must match.", mutex=Array("sortBySequencingRole")) val existing: Option[PathToSequenceDictionary] = None,
135136
@arg(flag='x', doc="Allow mismatching sequence lengths when using an existing sequence dictionary file.") val allowMismatchingLengths: Boolean = false,
136-
@arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true
137+
@arg(doc="Skip contigs that have no alternates") val skipMissingAlternates: Boolean = true,
138+
@arg(doc="Sort by the sequencing role (only when not updating an existing sequence dictionary file). Uses the order from `--sequence-roles` if provided.", mutex=Array("existing"))
139+
val sortBySequencingRole: Boolean = false,
137140
) extends FgBioTool with LazyLogging {
138141

139142
import com.fulcrumgenomics.fasta.{AssemblyReportColumn => Column}
@@ -213,7 +216,17 @@ class CollectAlternateContigNames
213216

214217
// Apply to an existing sequence dictionary if necessary
215218
val dict: SequenceDictionary = existing match {
216-
case None => SequenceDictionary(metadatas.toSeq:_*)
219+
case None if !sortBySequencingRole => SequenceDictionary(metadatas.toSeq:_*)
220+
case None =>
221+
// Use the user-provided roles, otherwise use all role
222+
val roles = (if (sequenceRoles.nonEmpty) sequenceRoles.toIndexedSeq else SequenceRole.values)
223+
.zipWithIndex
224+
.map { case (role, index) => role.key -> index }
225+
.toMap
226+
val metas = metadatas.toSeq.sortBy { metadata =>
227+
metadata.get(Column.SequenceRole.tag).flatMap { name => roles.get(name) }.getOrElse(roles.size)
228+
}.zipWithIndex.map { case (metadata, index) => metadata.copy(index=index) }
229+
SequenceDictionary(metas:_*)
217230
case Some(path) =>
218231
val assemblyReportMetadataMap = metadatas.map { m => (m.name, m) }.toMap
219232
val updatedMetadatas = SequenceDictionary(path).map { existingMetadata =>

src/test/scala/com/fulcrumgenomics/fasta/CollectAlternateContigNamesTest.scala

+41
Original file line numberDiff line numberDiff line change
@@ -147,4 +147,45 @@ class CollectAlternateContigNamesTest extends UnitSpec {
147147
dict.last.name shouldBe "NC_012920.1"
148148
dict.last.aliases should contain theSameElementsInOrderAs Seq("J01415.2", "chrM")
149149
}
150+
151+
it should "sort by sequencing roles when they all are defined" in {
152+
val output = makeTempFile("test.", ".dict")
153+
val tool = new CollectAlternateContigNames(
154+
input = reportHg19,
155+
output = output,
156+
primary = Column.RefSeqAccession,
157+
alternates = Seq(Column.UcscName),
158+
sequenceRoles = Seq(UnplacedScaffold, UnlocalizedScaffold, AssembledMolecule),
159+
sortBySequencingRole = true
160+
)
161+
executeFgbioTool(tool)
162+
163+
val dict = SequenceDictionary(output)
164+
dict should have length 84
165+
dict.head.name shouldBe "NT_113961.1"
166+
dict.head.aliases should contain theSameElementsInOrderAs Seq("chrUn_gl000211")
167+
dict.last.name shouldBe "NC_012920.1"
168+
dict.last.aliases should contain theSameElementsInOrderAs Seq("chrM")
169+
dict.map(m => m(Column.SequenceRole.tag)).toIndexedSeq.distinct should contain theSameElementsInOrderAs Seq(UnplacedScaffold, UnlocalizedScaffold, AssembledMolecule).map(_.key)
170+
}
171+
172+
it should "sort by sequencing roles when no roles are defined" in {
173+
val output = makeTempFile("test.", ".dict")
174+
val tool = new CollectAlternateContigNames(
175+
input = reportHg19,
176+
output = output,
177+
primary = Column.RefSeqAccession,
178+
alternates = Seq(Column.UcscName),
179+
sortBySequencingRole = true
180+
)
181+
executeFgbioTool(tool)
182+
183+
val dict = SequenceDictionary(output)
184+
dict should have length 93
185+
dict.head.name shouldBe "NC_000001.10"
186+
dict.head.aliases should contain theSameElementsInOrderAs Seq("chr1")
187+
dict.last.name shouldBe "NT_167243.1"
188+
dict.last.aliases should contain theSameElementsInOrderAs Seq("chrUn_gl000249")
189+
dict.map(m => m(Column.SequenceRole.tag)).toIndexedSeq.distinct should contain theSameElementsInOrderAs Seq(AssembledMolecule, AltScaffold, UnlocalizedScaffold, UnplacedScaffold).map(_.key)
190+
}
150191
}

0 commit comments

Comments
 (0)