
Release v0.2.7 #120

Merged: 17 commits, May 21, 2025
Changes from all commits
2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-py"
-version = "0.2.6"
+version = "0.2.7"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
19 changes: 6 additions & 13 deletions bindings/python/py_src/gtars/tokenizers/__init__.pyi
@@ -171,19 +171,12 @@ class Universe:
             str: A string describing the Universe.
         """
 
-def create_instances(
-    sequences: Union[List[int], List[List[int]]],
-    window_size: int,
-    algorithm: str,
-) -> List[Dict[str, Union[int, List[int]]]]:
+def tokenize_fragment_file(file: str, tokenizer: Tokenizer) -> Dict[str, List[int]]:
     """
-    Creates training instances for a given sequence or list of sequences.
-
+    Tokenizes a fragment file using the specified tokenizer.
     Args:
-        sequences (Union[List[int], List[List[int]]]): A sequence or list of sequences of token IDs.
-        window_size (int): The size of the context window.
-        algorithm (str): The algorithm to use ('cbow' or 'sg').
-
+        file (str): The path to the fragment file.
+        tokenizer (Tokenizer): The tokenizer to use for tokenization.
     Returns:
-        List[Dict[str, Union[int, List[int]]]]: A list of dictionaries representing the training instances.
-    """
+        Dict[str, List[int]]: A dictionary mapping cell barcodes to lists of token IDs.
+    """
5 changes: 3 additions & 2 deletions bindings/python/src/models/region_set.rs
@@ -164,7 +164,8 @@ impl PyRegionSet {
         Ok(())
     }
 
-    fn mean_region_width(&self) -> PyResult<u32> {
-        Ok(self.regionset.mean_region_width())
+    fn mean_region_width(&self) -> f64 {
+        let mean_width = self.regionset.mean_region_width();
+        mean_width
     }
 }
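
On the Python side this means `RegionSet.mean_region_width()` now returns a float rounded to two decimals rather than a truncated integer. A small sketch, with a placeholder file name:

```python
from gtars.models import RegionSet

rs = RegionSet("regions.bed")    # placeholder path; any BED/narrowPeak file works
width = rs.mean_region_width()   # float, e.g. 4.22 for the bundled dummy.narrowPeak
print(width)
```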
4 changes: 2 additions & 2 deletions bindings/python/src/tokenizers/mod.rs
@@ -6,13 +6,13 @@ mod utils;
 use pyo3::prelude::*;
 
 use crate::tokenizers::py_tokenizers::PyTokenizer;
-use crate::tokenizers::utils::py_create_instances;
+use crate::tokenizers::utils::py_tokenize_fragment_file;
 // use crate::tokenizers::universe::PyUniverse;
 // use crate::tokenizers::encoding::{PyBatchEncoding, PyEncoding};
 
 #[pymodule]
 pub fn tokenizers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyTokenizer>()?;
-    m.add_wrapped(wrap_pyfunction!(py_create_instances))?;
+    m.add_wrapped(wrap_pyfunction!(py_tokenize_fragment_file))?;
     Ok(())
 }
6 changes: 6 additions & 0 deletions bindings/python/src/tokenizers/py_tokenizers/mod.rs
@@ -292,3 +292,9 @@ impl PyTokenizer {
         })
     }
 }
+
+impl PyTokenizer {
+    pub fn inner(&self) -> &Tokenizer {
+        &self.tokenizer
+    }
+}
95 changes: 14 additions & 81 deletions bindings/python/src/tokenizers/utils.rs
@@ -1,85 +1,18 @@
+use pyo3::exceptions::PyRuntimeError;
 use pyo3::prelude::*;
-use pyo3::types::PyDict;
+use pyo3::types::{IntoPyDict, PyDict};
 
-use rayon::prelude::*;
+use super::PyTokenizer;
+use gtars::tokenizers::utils::fragments::tokenize_fragment_file;
 
-use gtars::tokenizers::utils::r2v::{create_instances, Algorithm, Instance};
-
-#[pyfunction(name = "create_instances")]
-pub fn py_create_instances(
-    sequences: &Bound<'_, PyAny>,
-    window_size: usize,
-    algorithm: &str,
-) -> PyResult<Vec<Py<PyDict>>> {
-    Python::with_gil(|py| {
-        let algorithm = match algorithm {
-            "cbow" => Algorithm::Cbow,
-            "sg" => Algorithm::Sg,
-            _ => return Err(pyo3::exceptions::PyValueError::new_err("Invalid algorithm")),
-        };
-
-        if let Ok(sequence) = sequences.extract::<Vec<u32>>() {
-            let result = create_instances(&sequence, window_size, algorithm);
-            let mapped_dicts = result
-                .into_iter()
-                .map(|instance| {
-                    let dict = PyDict::new_bound(py);
-                    match instance {
-                        Instance::Cbow {
-                            context_ids,
-                            target_id,
-                        } => {
-                            dict.set_item("context_ids", context_ids).unwrap();
-                            dict.set_item("target_id", target_id).unwrap();
-                        }
-                        Instance::Sg {
-                            center_id,
-                            context_ids,
-                        } => {
-                            dict.set_item("center_id", center_id).unwrap();
-                            dict.set_item("context_ids", context_ids).unwrap();
-                        }
-                    }
-                    dict.into()
-                })
-                .collect::<Vec<Py<PyDict>>>();
-            Ok(mapped_dicts)
-        } else if let Ok(sequences) = sequences.extract::<Vec<Vec<u32>>>() {
-            let result: Vec<Vec<Instance>> = sequences
-                .par_iter()
-                .map(|sequence| create_instances(sequence, window_size, algorithm))
-                .collect();
-
-            let mapped_dicts = result
-                .into_iter()
-                .flat_map(|instances| {
-                    instances.into_iter().map(|instance| {
-                        let dict = PyDict::new_bound(py);
-                        match instance {
-                            Instance::Cbow {
-                                context_ids,
-                                target_id,
-                            } => {
-                                dict.set_item("context_ids", context_ids).unwrap();
-                                dict.set_item("target_id", target_id).unwrap();
-                            }
-                            Instance::Sg {
-                                center_id,
-                                context_ids,
-                            } => {
-                                dict.set_item("center_id", center_id).unwrap();
-                                dict.set_item("context_ids", context_ids).unwrap();
-                            }
-                        }
-                        dict.into()
-                    })
-                })
-                .collect::<Vec<Py<PyDict>>>();
-            return Ok(mapped_dicts);
-        } else {
-            return Err(pyo3::exceptions::PyValueError::new_err(
-                "Invalid input type. Must be a sequence or list of sequences.",
-            ));
-        }
-    })
+#[pyfunction(name = "tokenize_fragment_file")]
+pub fn py_tokenize_fragment_file(file: String, tokenizer: &PyTokenizer) -> PyResult<Py<PyDict>> {
+    let res = tokenize_fragment_file(&file, tokenizer.inner());
+    match res {
+        Ok(res) => Python::with_gil(|py| {
+            let py_dict = res.into_py_dict_bound(py);
+            Ok(py_dict.into())
+        }),
+        Err(res) => Err(PyRuntimeError::new_err(res.to_string())),
+    }
 }
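
For the error path, a sketch with hypothetical file names: any error from the Rust core is surfaced to Python as a RuntimeError by the wrapper above.

```python
from gtars.tokenizers import Tokenizer, tokenize_fragment_file

tokenizer = Tokenizer("peaks.bed")  # placeholder universe/config BED file
try:
    tokenize_fragment_file("missing_or_malformed.tsv.gz", tokenizer)
except RuntimeError as err:         # raised via PyRuntimeError in the wrapper
    print(f"tokenization failed: {err}")
```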
20 changes: 20 additions & 0 deletions bindings/python/tests/test_regionset.py
@@ -0,0 +1,20 @@
+import os
+from pathlib import Path
+
+import pytest
+
+from gtars.models import RegionSet
+
+class TestRegionSet:
+
+    @pytest.mark.parametrize(
+        "bed_file",
+        [
+            "https://raw.githubusercontent.com/databio/gtars/refs/heads/master/gtars/tests/data/regionset/dummy.narrowPeak",
+        ],
+    )
+    def test_mean_region_width(self, bed_file):
+
+        rs = RegionSet(bed_file)
+
+        assert rs.mean_region_width() == 4.22
1 change: 1 addition & 0 deletions bindings/python/tests/test_tokenizers.py
@@ -213,6 +213,7 @@ def test_decode_tokens():
     assert decoded == ["chr9:3526071-3526165"]
 
 
+@pytest.mark.skip(reason="Needs to be fixed")
 def test_special_tokens_mask():
     cfg_path = os.path.join(TEST_DATA_DIR, "tokenizers", "peaks.scored.bed")
     tokenizer = Tokenizer(cfg_path)
2 changes: 1 addition & 1 deletion bindings/r/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: gtars
 Title: Performance critical genomic interval analysis using Rust, in R
-Version: 0.2.5
+Version: 0.2.7
 Authors@R:
     person("Nathan", "LeRoy", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-7354-7213"))
2 changes: 1 addition & 1 deletion bindings/r/src/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = 'gtars-r'
-version = '0.2.6'
+version = '0.2.7'
 edition = '2021'
 
 [lib]
Empty file.
2 changes: 1 addition & 1 deletion gtars/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars"
-version = "0.2.6"
+version = "0.2.7"
 edition = "2021"
 description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
 license = "MIT"
5 changes: 5 additions & 0 deletions gtars/docs/changelog.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.2.7]
+- added utility function to tokenize fragment files
+- fixed [#119](https://github.com/databio/gtars/issues/119)
+- fixed [#121](https://github.com/databio/gtars/issues/121)
+
 ## [0.2.6]
 - Fixed Iterator bug in RegionSet Python bindings [#116](https://github.com/databio/gtars/issues/116)
 - Added caching of identifier in RegionSet in Python bindings
75 changes: 60 additions & 15 deletions gtars/src/common/models/region_set.rs
@@ -56,28 +56,62 @@ impl TryFrom<&Path> for RegionSet {
 
         let mut header: String = String::new();
 
+        let mut first_line: bool = true;
+
         for line in reader.lines() {
             let string_line = line?;
 
             let parts: Vec<String> = string_line.split('\t').map(|s| s.to_string()).collect();
 
-            if parts.len() < 3 {
-                if string_line.starts_with("browser")
-                    | string_line.starts_with("track")
-                    | string_line.starts_with("#")
-                {
-                    header.push_str(&string_line);
-                }
+            if string_line.starts_with("browser")
+                | string_line.starts_with("track")
+                | string_line.starts_with("#")
+            {
+                header.push_str(&string_line);
+                first_line = false;
                 continue;
             }
-
+            // Handling column headers like `chr start end etc` without #
+            if first_line {
+                if parts.len() >= 3 {
+                    let is_header: bool = match parts[1].parse::<u32>() {
+                        Ok(_num) => false,
+                        Err(_) => true,
+                    };
+                    if is_header {
+                        header.push_str(&string_line);
+                        first_line = false;
+                        continue;
+                    }
+                }
+                first_line = false;
+            }
+
             new_regions.push(Region {
                 chr: parts[0].to_owned(),
 
                 // To ensure that lines are regions, and we can parse it, we are using Result matching
                 // And it helps to skip lines that are headers.
-                start: parts[1].parse()?,
-                end: parts[2].parse()?,
+                start: match parts[1].parse() {
+                    Ok(start) => start,
+                    Err(_err) => {
+                        return Err(Error::new(
+                            ErrorKind::Other,
+                            format!("Error in parsing start position: {:?}", parts),
+                        )
+                        .into())
+                    }
+                },
+                end: match parts[2].parse() {
+                    Ok(end) => end,
+                    Err(_err) => {
+                        return Err(Error::new(
+                            ErrorKind::Other,
+                            format!("Error in parsing end position: {:?}", parts),
+                        )
+                        .into())
+                    }
+                },
                 rest: Some(parts[3..].join("\t")).filter(|s| !s.is_empty()),
             });
         }
@@ -391,18 +425,16 @@ impl RegionSet {
         false
     }
 
-    pub fn mean_region_width(&self) -> u32 {
-        if self.is_empty() {
-            return 0;
-        }
+    pub fn mean_region_width(&self) -> f64 {
         let sum: u32 = self
            .regions
            .iter()
            .map(|region| region.end - region.start)
            .sum();
         let count: u32 = self.regions.len() as u32;
 
-        sum / count
+        // must be f64 because python doesn't understand f32
+        ((sum as f64 / count as f64) * 100.0).round() / 100.0
     }
 
     ///
@@ -542,4 +574,17 @@ mod tests {
         assert_eq!(region_set.file_digest(), "6224c4d40832b3e0889250f061e01120");
         assert_eq!(region_set.identifier(), "f0b2cf73383b53bd97ff525a0380f200")
     }
+
+    #[test]
+    fn test_mean_region_width() {
+        let file_path = get_test_path("dummy.narrowPeak").unwrap();
+        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
+
+        assert_eq!(region_set.mean_region_width(), 4.22)
+    }
+    #[test]
+    fn test_open_file_with_incorrect_headers() {
+        let file_path = get_test_path("dummy_incorrect_headers.bed").unwrap();
+        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
+    }
 }
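
To illustrate the header handling added above, a sketch from the Python side using a hypothetical file: a first line like `chr start end` without a leading `#` is now detected as a column header and skipped instead of aborting the parse.

```python
from gtars.models import RegionSet

# Write a tiny BED-like file whose first line is a bare column header.
with open("with_header.bed", "w") as f:
    f.write("chr\tstart\tend\n")   # header: start/end are not integers
    f.write("chr1\t10\t20\n")
    f.write("chr1\t30\t44\n")

rs = RegionSet("with_header.bed")  # header line is skipped, two regions load
print(rs.mean_region_width())      # (10 + 14) / 2 = 12.0
```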