Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ babel_outputs/
.snakemake/
.env
.idea
**/target/
3 changes: 3 additions & 0 deletions babel_io/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/target
.idea
/Cargo.lock
32 changes: 32 additions & 0 deletions babel_io/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[package]
name = "babel_io"
version = "0.1.0"
edition = "2024"

[lib]
name = "babel_io"

[dependencies]
async_once = "^0.2"
csv = "^1.3"
clap = { version = "^4.5", features = ["derive"] }
env_logger = "^0.11"
humantime = "^2.2"
itertools = "^0.14"
lazy_static = "^1.3"
log = { version = "^0.4", features = ["std"] }
polars = { version = "^0.45", features = ["default", "cloud", "concat_str", "string_pad", "dtype-array", "strings", "regex", "json", "cross_join", "lazy", "coalesce", "polars-lazy", "parquet", "find_many", "csv", "decompress", "list_eval", "is_in"] }
oxigraph = "^0.4"
rand = "^0.9"
rayon = "^1.10"
regex = "^1.11"
reqwest = { version = "^0.12", features = ["default", "json"] }
roxmltree = "^0.20"
serde = { version = "^1.0", features = ["derive", "serde_derive"] }
serde_derive = "^1.0"
serde_json = "^1.0"
serde_with = { version = "^3.12", features = ["std", "macros", "json"] }
tokio = { version = "^1.45", features = ["rt", "rt-multi-thread", "macros"] }
uuid = { version = "^1.1", features = ["v4"] }
quick-xml = "^0.38"
zip = "^4.2"
4 changes: 4 additions & 0 deletions babel_io/rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
max_width = 160
newline_style = "Unix"
use_field_init_shorthand = true
use_try_shorthand = true
65 changes: 65 additions & 0 deletions babel_io/src/bin/build_compendia.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#[macro_use]
extern crate log;

use clap::Parser;
use humantime::format_duration;
use itertools::Itertools;
use oxigraph::io::RdfFormat;
use oxigraph::sparql::QueryResults;
use oxigraph::store::Store;
use std::collections::HashSet;
use std::error::Error;
use std::fs::read_to_string;
use std::io::{BufReader, BufWriter};
use std::time::Instant;
use std::{fs, path};

// Command-line options for the compendia builder.
// (Plain `//` comments on purpose: `///` doc comments would be picked up by
// the clap derive and change the generated --help text.)
#[derive(Parser, PartialEq, Debug)]
#[clap(author, version, about, long_about = None)]
struct Options {
    // Concordance files (repeatable) — accepted but not yet consumed by main.
    #[clap(short, long, required = true)]
    concordances: Vec<path::PathBuf>,

    // Identifier files (repeatable); per the Python sketch in main, each line
    // is "<id>" optionally followed by "\t<type>".
    #[clap(short, long, required = true)]
    identifiers: Vec<path::PathBuf>,

    // Information-content RDF file; explicit short flag 'z' because the
    // derived short 'i' would collide with --identifiers.
    #[clap(short = 'z', long, required = true)]
    ic_rdf: path::PathBuf,
}
// Entry point: reads identifier files into memory. Concordance merging
// (the Python `glom`) has not been ported yet — see the TODO below.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let start = Instant::now();
    env_logger::init();

    let options = Options::parse();
    debug!("{:?}", options);

    // Identifiers gathered from every --identifiers file.
    let mut identifiers: HashSet<String> = HashSet::new();
    // id -> type; populated only for lines that carry a second column
    // (matches the Python sketch, where `types` is a dict, not a set).
    let mut types: std::collections::HashMap<String, String> = std::collections::HashMap::new();

    for ifile in options.identifiers {
        // Port of the Python `read_identifier_file`: each line is
        // "<id>[\t<type>]" — first column is the identifier, the optional
        // second column its type.
        let contents = read_to_string(&ifile)?;
        for line in contents.lines() {
            let mut columns = line.trim().split('\t');
            let Some(id) = columns.next().filter(|c| !c.is_empty()) else {
                continue; // blank line
            };
            identifiers.insert(id.to_string());
            if let Some(id_type) = columns.next() {
                types.insert(id.to_string(), id_type.to_string());
            }
        }
        // TODO: port `glom(dicts, new_identifiers, unique_prefixes=[UBERON, GO])`
        // from the Python implementation.
    }
    debug!("read {} identifiers ({} typed)", identifiers.len(), types.len());

    info!("Duration: {}", format_duration(start.elapsed()).to_string());
    Ok(())
}
122 changes: 122 additions & 0 deletions babel_io/src/bin/create_chembl_labels_and_smiles.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#[macro_use]
extern crate log;

use clap::Parser;
use humantime::format_duration;
use itertools::Itertools;
use oxigraph::io::RdfFormat;
use oxigraph::sparql::QueryResults;
use oxigraph::store::Store;
use std::error::Error;
use std::fs;
use std::io::{BufReader, BufWriter, Write};
use std::path;
use std::time::Instant;

// NOTE: rust runs in 13s, python runs in 21s
// Command-line options for extracting CHEMBL labels and SMILES.
// (Plain `//` comments on purpose: `///` doc comments would be picked up by
// the clap derive and change the generated --help text.)
#[derive(Parser, PartialEq, Debug)]
#[clap(author, version, about, long_about = None)]
struct Options {
    // ChEMBL molecule RDF dump (Turtle).
    #[clap(short, long, required = true)]
    input: path::PathBuf,

    // ChEMBL core ontology (cco) file (Turtle).
    #[clap(short, long, required = true)]
    cco: path::PathBuf,

    // Output TSV of "CHEMBL.COMPOUND:<id>\t<label>" rows.
    #[clap(short, long, required = true)]
    labels_output: path::PathBuf,

    // Output TSV of "CHEMBL.COMPOUND:<id>\t<smiles>" rows.
    #[clap(short, long, required = true)]
    smiles_output: path::PathBuf,
}

// Loads the ChEMBL molecule RDF plus the cco ontology into an in-memory
// oxigraph store, then writes two TSVs:
//   labels: "CHEMBL.COMPOUND:<id>\t<label>"
//   smiles: "CHEMBL.COMPOUND:<id>\t<smiles>"
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let start = Instant::now();
    env_logger::init();

    let options = Options::parse();
    debug!("{:?}", options);

    let store = Store::new()?;
    let start_load = Instant::now();

    // this file is small...no need for bulk loader
    let cco_br = BufReader::new(fs::File::open(options.cco)?);
    store.load_from_reader(RdfFormat::Turtle, cco_br).expect("Could not load input");

    // The molecule dump is large; use the bulk loader with a bounded memory
    // budget and a fixed thread count.
    let input_br = BufReader::new(fs::File::open(options.input)?);
    store
        .bulk_loader()
        .with_max_memory_size_in_megabytes(4 * 2048)
        .with_num_threads(4)
        .load_from_reader(RdfFormat::Turtle, input_br)
        .expect("Could not load input");

    info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string());

    let mut labels_bw = BufWriter::new(fs::File::create(&options.labels_output)?);

    // Every molecule that is (transitively) a cco:Substance, with its label.
    let query_statement = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#>
SELECT ?molecule ?label
WHERE {
?molecule a ?type .
?type rdfs:subClassOf* cco:Substance .
?molecule rdfs:label ?label .
}";

    if let QueryResults::Solutions(solutions) = store.query(query_statement)? {
        for qs in solutions.filter_map(Result::ok) {
            let iterm = babel_io::trim_gt_and_lt(qs.get("molecule").expect("molecule was None").to_string());
            // The CHEMBL id is the last path segment of the molecule IRI.
            let id = iterm.rsplit('/').next().unwrap();

            let label = babel_io::trim_quotes(qs.get("label").expect("label was None").to_string());

            // Skip molecules whose "label" is just the id repeated.
            if id == label {
                continue;
            }
            writeln!(labels_bw, "CHEMBL.COMPOUND:{}\t{}", id, label).expect("Could not write triple");
        }
    }
    // Flush explicitly: Drop would swallow any buffered-write error.
    labels_bw.flush()?;

    let mut smiles_bw = BufWriter::new(fs::File::create(&options.smiles_output)?);

    // SMILES strings attached to each molecule via the cheminf vocabulary.
    let query_statement = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#>
PREFIX cheminf: <http://semanticscience.org/resource/>
SELECT ?molecule ?smiles
WHERE {
?molecule cheminf:SIO_000008 ?smile_entity .
?smile_entity a cheminf:CHEMINF_000018 ;
cheminf:SIO_000300 ?smiles .
}";

    if let QueryResults::Solutions(solutions) = store.query(query_statement)? {
        for qs in solutions.filter_map(Result::ok) {
            let iterm = babel_io::trim_gt_and_lt(qs.get("molecule").expect("molecule was None").to_string());
            let id = iterm.rsplit('/').next().unwrap();

            let smiles = babel_io::trim_quotes(qs.get("smiles").expect("smiles was None").to_string());

            writeln!(smiles_bw, "CHEMBL.COMPOUND:{}\t{}", id, smiles).expect("Could not write triple");
        }
    }
    smiles_bw.flush()?;

    info!("Duration: {}", format_duration(start.elapsed()).to_string());
    Ok(())
}
118 changes: 118 additions & 0 deletions babel_io/src/bin/create_complexportal_labels_and_synonyms.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#[macro_use]
extern crate log;

use clap::Parser;
use humantime::format_duration;
use itertools::Itertools;
use polars::prelude::*;
use std::error::Error;
use std::fs;
use std::path;
use std::time::Instant;

// Command-line options: the ComplexPortal TSV export in, two TSV files out.
// (Plain `//` comments on purpose: `///` doc comments would be picked up by
// the clap derive and change the generated --help text.)
#[derive(Parser, PartialEq, Debug)]
#[clap(author, version, about, long_about = None)]
struct Options {
    // ComplexPortal tab-separated export (with header row).
    #[clap(short, long, required = true)]
    input: path::PathBuf,

    // Output TSV of "ComplexPortal:<id>\t<recommended name>" rows.
    #[clap(short, long, required = true)]
    labels_output: path::PathBuf,

    // Output TSV of "ComplexPortal:<id>\t<alias>" rows.
    #[clap(short, long, required = true)]
    synonyms_output: path::PathBuf,
}

// Reads the ComplexPortal TSV export and emits two TSV files:
//   labels:   "ComplexPortal:<id>\t<recommended name>"
//   synonyms: "ComplexPortal:<id>\t<alias>", where the alias column is split
//             on '|', "-" means "no aliases", and each alias is written at
//             most once globally.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let start = Instant::now();
    env_logger::init();

    let options = Options::parse();
    debug!("{:?}", options);

    // NOTE: this base implementation runs in 4ms, python version of this runs in 4s
    // let br = BufReader::new(fs::File::open(options.input).unwrap());
    //
    // let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap());
    // let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap());
    //
    // let mut used_synonyms = HashSet::new();
    //
    // for line in br.lines().skip(1) {
    //     let line = line.unwrap();
    //     let line_split = line.split("\t").collect_vec();
    //     let id = line_split.get(0).unwrap();
    //     let label = line_split.get(1).unwrap();
    //     write!(labels_bw, "ComplexPortal:{}\t{}\n", id, label).unwrap();
    //     let synonyms = line_split.get(2).unwrap();
    //     if !synonyms.to_string().eq("-") {
    //         let synonyms_split = synonyms.split("|").collect_vec();
    //         for synonym in synonyms_split.into_iter().map(|a| a.to_string()) {
    //             if !used_synonyms.contains(&synonym) {
    //                 write!(synonyms_bw, "ComplexPortal:{}\t{}\n", id, synonym).unwrap();
    //                 used_synonyms.insert(synonym);
    //             }
    //         }
    //     }
    // }

    // NOTE: this polars implementation runs in 16ms
    // Only these three columns of the export are used.
    let usable_columns = vec!["#Complex ac", "Recommended name", "Aliases for complex"];

    // infer_schema_length of 0 makes polars treat every column as a string;
    // ignore_errors/truncate_ragged_lines tolerate malformed rows in the export.
    let df = polars::lazy::frame::LazyCsvReader::new(options.input.clone())
        .with_separator(b'\t')
        .with_infer_schema_length(Some(0))
        .with_ignore_errors(true)
        .with_truncate_ragged_lines(true)
        .with_has_header(true)
        .finish()
        .unwrap()
        .select(usable_columns.into_iter().map(|a| col(a)).collect_vec())
        .collect()
        .unwrap();

    // println!("{}", df.head(None));

    // Labels frame: prefix the accession with "ComplexPortal:" and keep the
    // recommended name alongside it.
    let mut labels_df = df
        .clone()
        .lazy()
        .select([
            concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"),
            col("Recommended name"),
        ])
        .collect()
        .unwrap();

    // Headerless, tab-separated output.
    let mut file = fs::File::create(options.labels_output).expect("could not create file");
    CsvWriter::new(&mut file)
        .include_header(false)
        .with_separator(b'\t')
        .finish(&mut labels_df)
        .unwrap();

    // Synonyms frame: drop "no aliases" rows ("-"), split the alias list on
    // '|', one row per alias (explode), and keep only the first occurrence of
    // each alias — mirroring the `used_synonyms` set in the commented-out
    // hand-rolled implementation above.
    let mut synonyms_df = df
        .clone()
        .lazy()
        .filter(col("Aliases for complex").neq(lit("-")))
        .select([
            concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"),
            col("Aliases for complex").str().split(lit("|")).alias("Aliases for complex"),
        ])
        .explode([col("Aliases for complex")])
        .unique(Some(vec!["Aliases for complex".to_string()]), UniqueKeepStrategy::First)
        .collect()
        .unwrap();

    // println!("{}", synonyms_df.head(None));

    let mut file = fs::File::create(options.synonyms_output).expect("could not create file");
    CsvWriter::new(&mut file)
        .include_header(false)
        .with_separator(b'\t')
        .finish(&mut synonyms_df)
        .unwrap();

    info!("Duration: {}", format_duration(start.elapsed()).to_string());
    Ok(())
}
Loading