Release v0.0.27

willchet · willchet · commit b99a40b0de40 · 2026-04-15T12:55:51.000-04:00
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -36,7 +36,7 @@ jobs:
           echo '<meta http-equiv="refresh" content="0; url=zoe/index.html">' > target/doc/index.html
 
       - name: Upload artifact
-        uses: actions/upload-pages-artifact@v5.0.0
+        uses: actions/upload-pages-artifact@v5
         with:
           path: ./target/doc
       - name: Deploy to GitHub Pages
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,43 +4,44 @@ All notable changes to this project will be documented in this file. The format
 is roughly based on [Keep a Changelog], and this project tries to adhere to
 [Semantic Versioning].
 
-## [0.0.27] - TBD
+## [0.0.27] - 2026-04-15
 
 ### Added
 
+- Added `ByteMap` for defining and representing maps from `u8` values to other `u8` values (e.g., for sanitization/recoding), as well as many pre-defined maps
+- Added `ByteValidator` for defining alphabets or subsets of valid bytes
+- Added `SanitizeBase` extension trait for `u8` to perform DNA validation, recoding, and refinement on a single byte
+- Added `find_start_codon` as well as a modified string search routine supporting lazy transformations of the haystack (`find_mapped_match_simd`)
+- Added `to_aa_iter_exact` and `to_aa_iter_exact_with` for amino acid translation that omits a trailing partial codon (fewer than 3 bases at the end of a sequence)
+- Added `find_next_aa` and `find_next_aa_in_frame` to `RangeSearch`
+- Added `find_byte` to `ByteSubstring` (for searching strings and for use with `RangeSearch`)
 - Added `as_mut_vec` to `AlignmentStates`, `Nucleotides`, and `AminoAcids` to enable custom editing
 - Added `push`, `starts_with`, and `ends_with` to `Nucleotides` and `AminoAcids`
 - Implemented `Extend` for `Nucleotides` and `AminoAcids`
-- Added `to_aa_iter_exact` and `to_aa_iter_exact_with` for amino acid translation that does not include a partial codon for codons with less than 3 bases at the end of a sequence
-- Added `find_next_aa` and `find_next_aa_in_frame` to `RangeSearch`
-- Added `SanitizeBase` extension trait for `u8` to perform DNA validation, recoding, and refinement on a single byte
 - Added `AcgtNoGapsUc` strategy for retaining/recoding DNA
-- Added `find_start_codon` as well as a modified string search routine supporting lazy transformations of the haystack (`find_mapped_match_simd`)
-- Added `ByteMap` for defining and representing maps from `u8` values to other `u8` values (e.g., for sanitization/recoding)
-- Added many pre-defined `ByteMap`s for direct use or as a starting point for defining more custom maps
-- Added `find_byte` to `ByteSubstring` (for searching strings and for use with `RangeSearch`)
-- Added `ByteValidator` for defining alphabets or subsets of valid bytes
 
 ### Changed
 
-- Implemented Copy for several immutable views
-- `RangeSearch` now contains a generic with the original type to allow search methods to restrict the types of data they are used on
-- `StdGeneticCode` now translates codons with mixed `-` and `.` bytes to `.`
-- `is_amino_acid` now behaves the same as `is_known_amino_acid`, and `is_known_amino_acid` is removed
-- Renamed `is_valid_codon` to `is_resolvable_codon` in `CodonExtension`
 - Renamed `from_filename` to `from_path` for *Zoe*'s readers, deprecating the old functions
 - Renamed `with_file_context` to `with_path_context` for errors, deprecating the old functions
+- Renamed `is_valid_codon` to `is_resolvable_codon` in `CodonExtension`
+- `is_amino_acid` now behaves the same as `is_known_amino_acid`, and `is_known_amino_acid` is removed
+- `RangeSearch` now contains a generic with the original type to allow search methods to restrict the types of data they are used on
+- `StdGeneticCode` now translates codons with mixed `-` and `.` bytes to `.`
 - `OrFail` is now only implemented on errors with a `'static` lifetime
+- `retain_by_recoding` now uses `ByteMap`, and `retain_by_validation` now uses `ByteValidator`
+- Implemented `Copy` for several immutable views
 
 ### Removed
 
-- `itoa` is no longer a dependency of Zoe.
 - Removes `open_nonempty_file`, which produces incorrect results for piped inputs
+- `itoa` is no longer a dependency of *Zoe*
 
 ### Fixes
 
 - Fixes a bug where pipes are interpreted as empty files with `FastQReader::from_filename` (and similarly for `FastaReader` and `SAMReader`)
 - Fixes a bug in `p_distance_acgt` where longer sequences could be normalized incorrectly.
+- Fixes a bug in `physiochemical` distance to ensure invalid sequences are not comparable
 
 ## [0.0.26] - 2026-03-06
 
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2024"
 name = "zoe"
-version = "0.0.27-dev"
+version = "0.0.27"
 rust-version = "1.95"
 description = "A nightly library for viral genomics"
 license = "Apache-2.0"
diff --git a/src/alignment/types/state.rs b/src/alignment/types/state.rs
@@ -89,7 +89,7 @@ impl AlignmentStates {
         self.0.as_mut_slice()
     }
 
-    /// Returns the [`Ciglet`] elements as a mutable reference to a vector.
+    /// Returns a mutable reference to the vector of [`Ciglet`] elements.
     ///
     /// ## Validity
     ///
diff --git a/src/data/constants/mappings/gc.rs b/src/data/constants/mappings/gc.rs
@@ -12,7 +12,7 @@ macro_rules! fill_std_gc {
 /// If a codon involving ambiguous IUPAC letters translates to the same amino
 /// acid in all cases, then it is also included in the hashmap. Stop codons are
 /// translated to `*`, and the codons `...`, `---`, and `NNN` are translated to
-/// `.`, `-`, and `X` respectively.
+/// `.`, `-`, and `X` respectively. A mix of `.` and `-` is translated to `.`.
 #[derive(Debug)]
 pub struct StdGeneticCode;
 
diff --git a/src/data/extension/byte_types.rs b/src/data/extension/byte_types.rs
@@ -52,7 +52,7 @@ impl IsBase for u8 {
 /// [`RetainNucleotides`]: crate::data::types::nucleotides::RetainNucleotides
 pub trait SanitizeBase: Sized + Copy {
     /// Checks whether a single byte is valid under the given validation
-    /// strategy
+    /// strategy.
     #[must_use]
     fn is_valid(self, strategy: IsValidDNA) -> bool;
 
diff --git a/src/data/records/sam/merge_pairs.rs b/src/data/records/sam/merge_pairs.rs
@@ -134,6 +134,15 @@ impl SamAligned {
 }
 
 impl From<&SamData> for SamAligned {
+    /// Builds a [`SamAligned`] from [`SamData`].
+    ///
+    /// ## Panics
+    ///
+    /// Currently panics on invalid cigar states. To be removed in the future
+    /// for either a `Result` or type-state validated CIGAR strings.
+    ///
+    /// This also has a chance of panicking in debug mode if an insertion
+    /// appears at the start of the alignment.
     #[inline]
     fn from(row: &SamData) -> Self {
         row.get_aligned()
@@ -265,7 +274,10 @@ impl SamData {
     /// ## Panics
     ///
     /// Currently panics on invalid cigar states. To be removed in the future
-    /// for either a `Result` or type-state validated Cigars.
+    /// for either a `Result` or type-state validated CIGAR strings.
+    ///
+    /// This also has a chance of panicking in debug mode if an insertion
+    /// appears at the start of the alignment.
     ///
     /// [`Cigar`]: crate::data::types::cigar::Cigar
     #[must_use]
@@ -346,12 +358,20 @@ impl SamData {
     }
 
     /// Merges SAM read pairs using the reference alignment to parsimoniously
-    /// detect and correct errors. Based on the work by **Shepard et al. 2016** for
-    /// IRMA.
+    /// detect and correct errors. Based on the work by **Shepard et al. 2016**
+    /// for IRMA.
     ///
     /// ## Notes
     ///
     /// This algorithm is designed for local alignment.
+    ///
+    /// ## Panics
+    ///
+    /// Currently panics on invalid cigar states. To be removed in the future
+    /// for either a `Result` or type-state validated CIGAR strings.
+    ///
+    /// This also has a chance of panicking in debug mode if an insertion
+    /// appears at the start of the alignment.
     #[allow(clippy::too_many_lines)]
     #[must_use]
     pub fn merge_pair_using_reference(
diff --git a/src/data/types/cigar/mod.rs b/src/data/types/cigar/mod.rs
@@ -258,9 +258,13 @@ impl TryFrom<&str> for Cigar {
     }
 }
 
+/// An extension trait for `Vec<u8>` allowing a ciglet to be formatted into the
+/// buffer.
 pub(crate) trait FormatCigletForCigarVec {
+    /// Extends the buffer with the formatted `ciglet`.
     fn push_formatted_ciglet(&mut self, ciglet: Ciglet);
 }
+
 impl FormatCigletForCigarVec for Vec<u8> {
     #[inline]
     fn push_formatted_ciglet(&mut self, ciglet: Ciglet) {
@@ -348,13 +352,19 @@ impl ExpandedCigar {
             if previous == op {
                 count += 1;
             } else {
-                condensed.push_formatted_ciglet(Ciglet { inc: count, op: previous });
+                condensed.push_formatted_ciglet(Ciglet {
+                    inc: count,
+                    op:  previous,
+                });
                 previous = op;
                 count = 1;
             }
         }
 
-        condensed.push_formatted_ciglet(Ciglet { inc: count, op: previous });
+        condensed.push_formatted_ciglet(Ciglet {
+            inc: count,
+            op:  previous,
+        });
 
         Cigar(condensed)
     }
diff --git a/src/data/types/nucleotides/sanitize.rs b/src/data/types/nucleotides/sanitize.rs
@@ -167,7 +167,7 @@ pub enum RefineDNAStrat {
     /// Retains and recodes to uppercase ACGTN bases with standard gaps (`-`)
     AcgtnStdGapsUc,
 
-    // Retains and recodes to uppercase ACGT bases without gaps
+    /// Retains and recodes to uppercase ACGT bases without gaps
     AcgtNoGapsUc,
 }
 
diff --git a/src/data/types/nucleotides/translation.rs b/src/data/types/nucleotides/translation.rs
@@ -65,18 +65,12 @@ pub trait Translate: NucleotidesReadable + Sealed {
         self.to_aa_iter().position(|aa| aa == needle).map(|x| x * 3)
     }
 
-    /// Returns the starting index of the first start codon (`ATG` or `AUG`) or
-    /// `None` otherwise.
+    /// Returns the starting index of the first start codon (`ATG` or `AUG`,
+    /// matched case-insensitively), or `None` if no start codon is found.
     ///
-    /// If the sequence is known to not contain `U`, then [`find_substring`] may
-    /// be faster. This function uses the same logic as [`find_substring`] but
-    /// is specialized to handle the two possibilities for the middle of the
-    /// codon.
-    ///
-    /// ## Limitations
-    ///
-    /// The input sequence must be in uppercase (i.e., the search is
-    /// case-sensitive).
+    /// If the sequence is known not to contain `U` and to be entirely in one
+    /// known case, then [`find_substring`] may be faster. This function uses
+    /// [`find_mapped_match_simd`] in its implementation.
     ///
     /// [`find_substring`]: crate::search::ByteSubstring::find_substring
     #[inline]
diff --git a/src/distance/aa.rs b/src/distance/aa.rs
@@ -79,9 +79,9 @@ impl AminoAcidsDistance for AminoAcids {}
 impl AminoAcidsDistance for AminoAcidsView<'_> {}
 impl AminoAcidsDistance for AminoAcidsViewMut<'_> {}
 
-/// Calculates a "physiochemical" distance measure using the euclidean distances
-/// over physiochemical factors. Only valid amino acids are permitted in the
-/// denominator.
+/// Calculates a "physiochemical" distance measure using Euclidean distances
+/// over physiochemical factors, normalized by sequence length. Only valid amino
+/// acids are counted in the denominator.
 ///
 /// ## Example
 ///
@@ -104,8 +104,8 @@ impl AminoAcidsDistance for AminoAcidsViewMut<'_> {}
 ///
 /// ## Errors
 ///
-/// If either argument is empty or the sequence characters are invalid, an error
-/// is thrown. See [`DistanceError`].
+/// If either argument is empty or the sequence characters are all invalid, a
+/// [`DistanceError`] is returned.
 //
 // TODO: Could make a generic function that depends on distance matrix.
 #[allow(clippy::cast_precision_loss)]
diff --git a/src/distance/dna/mod.rs b/src/distance/dna/mod.rs
@@ -19,26 +19,21 @@ pub(crate) use tabulation::{hamming_dist_from_sub_matrix, total_and_frequencies}
 /// Calculates the p-distance (proportion of differing sites) between two
 /// nucleotide sequences.
 ///
-/// The p-distance considers only valid, canonical nucleotide pairs (A, C, G,
-/// T/U). Casing is ignored and only shared sequence length--the smaller of the
-/// two--are compared. The algorithm uses SIMD operations for improved
-/// performance.
+/// The p-distance considers only valid, canonical nucleotide pairs (`A`, `C`,
+/// `G`, `T`/`U`). Casing is ignored and only shared sequence length--the
+/// smaller of the two--is compared. The algorithm uses SIMD operations for
+/// improved performance.
 ///
-/// ## Returns
-///
-/// - `Some(f64)` - The p-distance between sequences if valid positions are
-///   found.
-/// - `None` - If no valid positions are found to compare.
+/// `None` is returned if no valid positions are found to compare.
 ///
 /// ## Type Parameters
 ///
-/// - `N` - SIMD lane count, must be a supported lane count
+/// `N` - SIMD lane count, must be a supported lane count
 ///
 /// ## Example
 ///
 /// ```
 /// # use zoe::distance::dna::p_distance_acgt;
-///
 /// let seq1 = b"ATGCATGC";
 /// let seq2 = b"atgtttnc";
 /// let distance = p_distance_acgt::<16>(seq1, seq2);
diff --git a/src/search/mod.rs b/src/search/mod.rs
@@ -55,8 +55,8 @@
 //! is when a needle must be found in at most the first `n` elements of the
 //! haystack, or at most the last `n` elements of the haystack. Ordinarily, one
 //! would have to ensure the range is in bounds before slicing or using
-//! `search_in`. To simplify this, *Zoe* provides `search_in_first` and
-//! `search_in_last`, as shown below. Once again, the second occurrence of the
+//! [`search_in`]. To simplify this, *Zoe* provides [`search_in_first`] and
+//! [`search_in_last`], as shown below. Once again, the second occurrence of the
 //! needle is returned.
 //!
 //! ```
@@ -76,6 +76,9 @@
 //! [`find_substring`]: crate::search::ByteSubstring::find_substring
 //! [`find_fuzzy_substring`]: crate::search::ByteSubstring::find_fuzzy_substring
 //! [`RangeSearch`]: crate::search::RangeSearch
+//! [`search_in`]: crate::search::ToRangeSearch::search_in
+//! [`search_in_first`]: crate::search::ToRangeSearch::search_in_first
+//! [`search_in_last`]: crate::search::ToRangeSearch::search_in_last
 
 /// Search and/or replace bytes.
 mod bytes;

Original file line number	Diff line number	Diff line change
`@@ -89,7 +89,7 @@ impl AlignmentStates {`
`89`	`89`	`self.0.as_mut_slice()`
`90`	`90`	`}`
`91`	`91`
`92`		- /// Returns the [`Ciglet`] elements as a mutable reference to a vector.
	`92`	+ /// Returns a mutable reference to the vector of [`Ciglet`] elements.
`93`	`93`	`///`
`94`	`94`	`/// ## Validity`
`95`	`95`	`///`
Original file line number	Diff line number	Diff line change
`@@ -258,9 +258,13 @@ impl TryFrom<&str> for Cigar {`
`258`	`258`	`}`
`259`	`259`	`}`
`260`	`260`
	`261`	+/// An extension trait for `Vec<u8>` allowing a ciglet to be formatted into the
	`262`	`+/// buffer.`
`261`	`263`	`pub(crate) trait FormatCigletForCigarVec {`
	`264`	+ /// Extends the buffer with the formatted `ciglet`.
`262`	`265`	`fn push_formatted_ciglet(&mut self, ciglet: Ciglet);`
`263`	`266`	`}`
	`267`	`+`
`264`	`268`	`impl FormatCigletForCigarVec for Vec<u8> {`
`265`	`269`	`#[inline]`
`266`	`270`	`fn push_formatted_ciglet(&mut self, ciglet: Ciglet) {`
`@@ -348,13 +352,19 @@ impl ExpandedCigar {`
`348`	`352`	`if previous == op {`
`349`	`353`	`count += 1;`
`350`	`354`	`} else {`
`351`		`- condensed.push_formatted_ciglet(Ciglet { inc: count, op: previous });`
	`355`	`+ condensed.push_formatted_ciglet(Ciglet {`
	`356`	`+ inc: count,`
	`357`	`+ op: previous,`
	`358`	`+ });`
`352`	`359`	`previous = op;`
`353`	`360`	`count = 1;`
`354`	`361`	`}`
`355`	`362`	`}`
`356`	`363`
`357`		`- condensed.push_formatted_ciglet(Ciglet { inc: count, op: previous });`
	`364`	`+ condensed.push_formatted_ciglet(Ciglet {`
	`365`	`+ inc: count,`
	`366`	`+ op: previous,`
	`367`	`+ });`
`358`	`368`
`359`	`369`	`Cigar(condensed)`
`360`	`370`	`}`
Original file line number	Diff line number	Diff line change
`@@ -167,7 +167,7 @@ pub enum RefineDNAStrat {`
`167`	`167`	/// Retains and recodes to uppercase ACGTN bases with standard gaps (`-`)
`168`	`168`	`AcgtnStdGapsUc,`
`169`	`169`
`170`		`- // Retains and recodes to uppercase ACGT bases without gaps`
	`170`	`+ /// Retains and recodes to uppercase ACGT bases without gaps`
`171`	`171`	`AcgtNoGapsUc,`
`172`	`172`	`}`
`173`	`173`