diff --git a/modules/Bio/EnsEMBL/Compara/PipeConfig/ENV.pm b/modules/Bio/EnsEMBL/Compara/PipeConfig/ENV.pm index a92fdd9ccc..da7dbe069b 100644 --- a/modules/Bio/EnsEMBL/Compara/PipeConfig/ENV.pm +++ b/modules/Bio/EnsEMBL/Compara/PipeConfig/ENV.pm @@ -123,6 +123,7 @@ sub executable_locations { 'BuildSynteny_exe' => $self->check_file_in_ensembl('ensembl-compara/scripts/synteny/BuildSynteny.jar'), 'check_ncbi_taxa_exe' => $self->check_exe_in_ensembl('ensembl-compara/scripts/taxonomy/check_ncbi_taxa_consistency.py'), 'compare_beds_exe' => $self->check_exe_in_ensembl('ensembl-compara/scripts/pipeline/compare_beds.pl'), + 'compare_enredo_regions_exe' => $self->check_exe_in_ensembl('ensembl-compara/scripts/production/compare_enredo_regions.py'), 'count_genes_in_tree_exe' => $self->check_exe_in_ensembl('ensembl-compara/scripts/pipeline/count_genes_in_tree.pl'), 'create_pair_aligner_page_exe' => $self->check_exe_in_ensembl('ensembl-compara/scripts/report/create_pair_aligner_page.pl'), 'dump_aln_program' => $self->check_exe_in_ensembl('ensembl-compara/scripts/dumps/DumpMultiAlign.pl'), diff --git a/modules/Bio/EnsEMBL/Compara/PipeConfig/Legacy/AnchorAlignCheck_conf.pm b/modules/Bio/EnsEMBL/Compara/PipeConfig/Legacy/AnchorAlignCheck_conf.pm new file mode 100644 index 0000000000..34ece9347d --- /dev/null +++ b/modules/Bio/EnsEMBL/Compara/PipeConfig/Legacy/AnchorAlignCheck_conf.pm @@ -0,0 +1,278 @@ +=head1 LICENSE + +See the NOTICE file distributed with this work for additional information +regarding copyright ownership. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 NAME + +Bio::EnsEMBL::Compara::PipeConfig::Legacy::AnchorAlignCheck_conf + +=cut + +package Bio::EnsEMBL::Compara::PipeConfig::Legacy::AnchorAlignCheck_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Compara::PipeConfig::ComparaGeneric_conf'); + + +sub default_options { + my ($self) = @_; + + return { + %{$self->SUPER::default_options}, + + 'epo_db' => undef, + 'collection' => undef, + 'division' => undef, + + 'pipeline_name' => $self->o('collection') . '_' . $self->o('division') . '_anchor_align_check_' . $self->o('rel_with_suffix'), + + 'work_dir' => $self->o('pipeline_dir'), + + 'enredo_params' => ' --min-score 0 --max-gap-length 200000 --max-path-dissimilarity 4 --min-length 10000 --min-regions 2 --min-anchors 3 --max-ratio 3 --simplify-graph 7 --bridges -o ', + + 'trim_anchor_align_batch_size' => 20, + 'trim_anchor_align_capacity' => 500, + }; +} + + +sub pipeline_create_commands { + my ($self) = @_; + return [ + @{$self->SUPER::pipeline_create_commands}, + $self->pipeline_create_commands_rm_mkdir(['work_dir']), + ]; +} + + +sub pipeline_wide_parameters { + my $self = shift @_; + return { + %{$self->SUPER::pipeline_wide_parameters}, + + 'epo_db' => $self->o('epo_db'), + 'compara_db' => $self->pipeline_url(), + 'collection' => $self->o('collection'), + 'division' => $self->o('division'), + + 'work_dir' => $self->o('work_dir'), + + 'genome_dumps_dir' => $self->o('genome_dumps_dir'), + }; +} + + +sub pipeline_checks_pre_init { + my ($self) = @_; + + if (!defined $self->o('collection')) { + die("A collection must be specified with the 'collection' pipeline parameter"); + } + + if (!defined $self->o('epo_db')) { + die("An EPO database must be specified with the 'epo_db' pipeline parameter"); + } +} + + +sub core_pipeline_analyses { + my ($self) = @_; + + return [ + + { -logic_name => 'copy_table_factory', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', + -input_ids => [ {} ], + -parameters => { + 'column_names' => [ 'table' ], + 'db_conn' => '#epo_db#', + 'inputlist' => [ 'dnafrag', 'genome_db' ], + }, + -flow_into => { + '2->A' => { 'copy_table' => { 'src_db_conn' => '#db_conn#', 'table' => '#table#' } }, + 'A->1' => 'copy_table_funnel_check', + }, + }, + + { -logic_name => 'copy_table', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::MySQLTransfer', + -parameters => { + 'filter_cmd' => 'sed "s/ENGINE=MyISAM/ENGINE=InnoDB/"', + }, + }, + + { -logic_name => 'copy_table_funnel_check', + -module => 'Bio::EnsEMBL::Compara::RunnableDB::FunnelCheck', + -flow_into => 'copy_untrimmed_anchor_aligns', + }, + + { -logic_name => 'copy_untrimmed_anchor_aligns', + -module => 'Bio::EnsEMBL::Compara::RunnableDB::CopyDataWithJoin', + -parameters => { + 'db_conn' => '#epo_db#', + 'table' => 'anchor_align', + 'inputquery' => q/ + SELECT * FROM anchor_align + WHERE untrimmed_anchor_align_id IS NULL + AND is_overlapping = 0 + /, + }, + -flow_into => 'check_consistent_mlss', + }, + + { -logic_name => 'check_consistent_mlss', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SqlHealthcheck', + -parameters => { + 'query' => 'SELECT DISTINCT method_link_species_set_id FROM anchor_align', + 'expected_size' => 1, + }, + -flow_into => 'fire_anchor_align_trimming', + }, + + { -logic_name => 'fire_anchor_align_trimming', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy', + -flow_into => { + '1->A' => [ 'trim_anchor_align_factory' ], + 'A->1' => [ 'trim_anchor_align_funnel_check' ], + }, + }, + + { -logic_name => 'trim_anchor_align_factory', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', + -parameters => { + 'inputquery' => q/ + SELECT DISTINCT method_link_species_set_id, anchor_id + FROM anchor_align + WHERE untrimmed_anchor_align_id IS NULL + AND is_overlapping = 0 + /, + }, + -flow_into => { 2 => 'trim_anchor_align' }, + -rc_name => '4Gb_24_hour_job', + }, + + { -logic_name => 'trim_anchor_align', + -module => 'Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign', + -parameters => { + 'ortheus_c_exe' => $self->o('ortheus_c_exe'), + }, + -flow_into => { -1 => 'trim_anchor_align_himem' }, + -hive_capacity => $self->o('trim_anchor_align_capacity'), + -batch_size => $self->o('trim_anchor_align_batch_size'), + -rc_name => '2Gb_job', + }, + + { -logic_name => 'trim_anchor_align_himem', + -module => 'Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign', + -parameters => { + 'ortheus_c_exe' => $self->o('ortheus_c_exe'), + }, + -flow_into => { -1 => 'ignore_huge_trim_anchor_align' }, + -hive_capacity => $self->o('trim_anchor_align_capacity'), + -batch_size => $self->o('trim_anchor_align_batch_size'), + -rc_name => '8Gb_job', + }, + + { -logic_name => 'ignore_huge_trim_anchor_align', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy', + -meadow_type=> 'LOCAL', + }, + + { -logic_name => 'trim_anchor_align_funnel_check', + -module => 'Bio::EnsEMBL::Compara::RunnableDB::FunnelCheck', + -flow_into => 'fire_enredo', + }, + + { -logic_name => 'fire_enredo', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', + -parameters => { + 'column_names' => [ 'db_label', 'db_conn' ], + 'inputlist' => [ ['epo_db', '#epo_db#'], ['compara_db', '#compara_db#'] ], + }, + -flow_into => { + '2->A' => [ 'dump_mappings_to_file' ], + 'A->1' => [ 'enredo_funnel_check' ], + }, + }, + + { -logic_name => 'dump_mappings_to_file', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::DbCmd', + -parameters => { + 'append' => [ '-N', '-B', '-q' ], + 'enredo_mapping_file' => '#work_dir#/enredo_input.#db_label#.txt', + 'output_file' => '#enredo_mapping_file#', + 'input_query' => q/ + SELECT + aa.anchor_id, + gdb.name, + df.name, + aa.dnafrag_start, + aa.dnafrag_end, + CASE aa.dnafrag_strand WHEN 1 THEN "+" ELSE "-" END, + aa.num_of_organisms, + aa.score + FROM + anchor_align aa + INNER JOIN + dnafrag df ON aa.dnafrag_id = df.dnafrag_id + INNER JOIN + genome_db gdb ON gdb.genome_db_id = df.genome_db_id + WHERE + untrimmed_anchor_align_id IS NOT NULL + ORDER BY + gdb.name, df.name, aa.dnafrag_start + /, + }, + -flow_into => { + 1 => { + 'run_enredo' => { 'enredo_mapping_file' => '#enredo_mapping_file#', 'db_label' => '#db_label#' }, + }, + }, + }, + + { -logic_name => 'run_enredo', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', + -rc_name => '8Gb_job', + -parameters => { + 'cmd' => '#enredo_exe# #enredo_params# #enredo_output_file# #enredo_mapping_file#', + 'enredo_exe' => $self->o('enredo_exe'), + 'enredo_output_file' => '#work_dir#/enredo_output.#db_label#.txt', + 'enredo_params' => $self->o('enredo_params'), + }, + }, + + { -logic_name => 'enredo_funnel_check', + -module => 'Bio::EnsEMBL::Compara::RunnableDB::FunnelCheck', + -flow_into => 'compare_enredo_regions', + }, + + { -logic_name => 'compare_enredo_regions', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', + -parameters => { + 'cmd' => '#compare_enredo_regions_exe# -a #epo_enredo_output_file# -b #compara_enredo_output_file# -o #comparison_output_file#', + 'compare_enredo_regions_exe' => $self->o('compare_enredo_regions_exe'), + 'comparison_output_file' => '#work_dir#/enredo_output.jaccard.tsv', + 'epo_enredo_output_file' => '#work_dir#/enredo_output.epo_db.txt', + 'compara_enredo_output_file' => '#work_dir#/enredo_output.compara_db.txt', + }, + }, + ]; +} + + +1; diff --git a/pyproject.toml b/pyproject.toml index 6626a3c425..9c147e4bcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "ete3>=3.1.1", "lxml>=4.9.2", "pandas>=0.24.2", + "pybedtools>=0.9.0", "sqlalchemy>=1.4.0", "xmlschema>=2.5.1", ] diff --git a/scripts/production/compare_enredo_regions.py b/scripts/production/compare_enredo_regions.py new file mode 100755 index 0000000000..115e92a377 --- /dev/null +++ b/scripts/production/compare_enredo_regions.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compare regions in two Enredo output files.""" + +from argparse import ArgumentParser +from collections import defaultdict +import csv +from pathlib import Path +import re +from tempfile import TemporaryDirectory + +from pybedtools import BedTool + + +def extract_enredo_regions(enredo_out_file_path: Path, out_dir_path: Path) -> dict[str, Path]: + """Extract regions from an Enredo output file.""" + enredo_region_re = re.compile( + "(?P_?[a-z0-9]+_[a-z0-9_]+)" + ":(?P.+?)" + ":(?P[0-9]+)" + ":(?P[0-9]+)" + "\\s+" + ) + + regions_by_genome = defaultdict(list) + with open(enredo_out_file_path, encoding="ascii") as in_file_obj: + for line in in_file_obj: + if line.startswith(("#", "block")): + continue + if match := enredo_region_re.match(line): + prod_name = match["prod_name"] + chrom_name = match["dnafrag_name"] + chrom_start = int(match["dnafrag_start"]) - 1 + chrom_end = int(match["dnafrag_end"]) + regions_by_genome[prod_name].append((chrom_name, chrom_start, chrom_end)) + + bed_file_map = {} + for prod_name, regions in regions_by_genome.items(): + bed_file_path = out_dir_path / f"{prod_name}.bed" + with open(bed_file_path, mode="w", encoding="utf-8", newline="") as out_file_obj: + writer = csv.writer(out_file_obj, delimiter="\t", lineterminator="\n") + for region in sorted(regions): + writer.writerow(region) + bed_file_map[prod_name] = bed_file_path + + return bed_file_map + + +def main() -> None: + """Main function of script.""" + + parser = ArgumentParser(description=__doc__) + parser.add_argument("-a", dest="file1", required=True, help="First Enredo output file to compare.") + parser.add_argument("-b", dest="file2", required=True, help="Second Enredo output file to compare.") + parser.add_argument("-o", "--output-file", required=True, help="Output TSV file of comparison results.") + + args = parser.parse_args() + + enredo_out_file_paths = [Path(x) for x in (args.file1, args.file2)] + + recs = [] + with TemporaryDirectory() as tmp_dir: + tmp_dir_path = Path(tmp_dir) + + bed_file_maps = [] + for dataset_idx, enredo_out_file_path in enumerate(enredo_out_file_paths): + bed_dir_path = tmp_dir_path / str(dataset_idx) + bed_dir_path.mkdir() + dataset_bed_file_map = extract_enredo_regions(enredo_out_file_path, bed_dir_path) + bed_file_maps.append(dataset_bed_file_map) + + bed_file_map1, bed_file_map2 = bed_file_maps # pylint: disable=unbalanced-tuple-unpacking + prod_names = sorted(bed_file_map1.keys() | bed_file_map2.keys()) + for prod_name in prod_names: + bed_file_path1 = bed_file_map1[prod_name] + bed_file_path2 = bed_file_map2[prod_name] + bedtool = BedTool(bed_file_path1) + rec = bedtool.jaccard(str(bed_file_path2)) # pylint: disable=too-many-function-args + rec["genome_name"] = prod_name + recs.append(rec) + + out_col_names = [ + "genome_name", + "n_intersections", + "intersection", + "union", + "jaccard", + ] + + with open(args.output_file, mode="w", encoding="utf-8", newline="") as out_file_obj: + writer = csv.DictWriter(out_file_obj, out_col_names, delimiter="\t", lineterminator="\n") + writer.writeheader() + for rec in recs: + writer.writerow(rec) + + +if __name__ == "__main__": + main()