diff --git a/src/bio2bel_kegg/constants.py b/src/bio2bel_kegg/constants.py index 96ed3b7..be6a2da 100644 --- a/src/bio2bel_kegg/constants.py +++ b/src/bio2bel_kegg/constants.py @@ -16,10 +16,15 @@ os.makedirs(PROTEIN_ENTRY_DIR, exist_ok=True) # returns the list of human pathways -KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway/hsa' +KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway' +KEGG_HUMAN_PATHWAYS_URL = f'{KEGG_PATHWAYS_URL}/hsa' + +# returns the list of organism pathways +KEGG_ORGANISM_URL = 'http://rest.kegg.jp/list/organism' # human genes linked from each of the KEGG pathways -PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/hsa' +PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/' +PROTEIN_PATHWAY_HUMAN_URL = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/hsa' # KEGG stats KEGG_STATISTICS_URL = 'http://rest.kegg.jp/info/kegg' diff --git a/src/bio2bel_kegg/manager.py b/src/bio2bel_kegg/manager.py index 362b425..ef0973e 100644 --- a/src/bio2bel_kegg/manager.py +++ b/src/bio2bel_kegg/manager.py @@ -6,7 +6,7 @@ import logging import os from multiprocessing.pool import ThreadPool -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Union import requests from tqdm import tqdm @@ -19,10 +19,13 @@ from pybel.constants import BIOPROCESS, FUNCTION, NAME, NAMESPACE, PROTEIN from pybel.manager.models import Namespace, NamespaceEntry from pybel.struct.graph import BELGraph -from .constants import API_KEGG_GET, KEGG, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR +from .constants import ( + API_KEGG_GET, KEGG, KEGG_PATHWAYS_URL, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR, PROTEIN_PATHWAY_URL, +) from .models import Base, Pathway, Protein, protein_pathway from .parsers import ( - get_entity_pathway_df, get_pathway_names_df, parse_entity_pathway, parse_pathways, process_protein_info_to_model, + get_entity_pathway_df, get_pathway_names_df, get_pathway_species_df, parse_entity_pathway, parse_pathways, + 
def _populate_pathways_single_species(
    self,
    species_id: Optional[str] = None,
    metadata_existing=None,
    pathways_species_dict=None,
):
    """Populate the pathways and the pathway-protein links of a single species.

    :param species_id: KEGG organism code (e.g. ``hsa``) or species name as listed in the KEGG organism table
    :param metadata_existing: forwarded unchanged to ``_pathway_entity``
    :param pathways_species_dict: optional, already parsed organism table (organism code -> list of names as
     returned by ``parse_species``); when omitted the table is downloaded and parsed here
    :raises ValueError: if no species is given
    :raises Warning: if the species is neither a known organism code nor a known species name
    """
    if not species_id:
        raise ValueError('A KEGG organism code or species name is required.')

    if pathways_species_dict is None:
        pathways_species_dict = parse_species(get_pathway_species_df())

    if species_id in pathways_species_dict:
        organism_code = species_id
    else:
        # Fall back to a lookup by name: the first entry of every value is the name that is matched.
        # The first organism carrying a given name wins.
        name_to_code = {}
        for code, names in pathways_species_dict.items():
            name_to_code.setdefault(names[0], code)

        if species_id not in name_to_code:
            # NOTE(review): ``Warning`` is kept as the exception type so that existing callers catching it
            # keep working.
            raise Warning(f'Organism id {species_id} not found in KEGG.')

        # The REST URLs are built from the organism code, never from the species name.
        organism_code = name_to_code[species_id]

    # URLs always use forward slashes; os.path.join would use the separator of the operating system.
    url_pathways = f'{KEGG_PATHWAYS_URL.rstrip("/")}/{organism_code}'
    url_proteins = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/{organism_code}'

    self._populate_pathways(url_pathways)
    self._pathway_entity(url=url_proteins, metadata_existing=metadata_existing)


def _populate_pathways_multiple_species(
    self,
    species_ids: Optional[Union[str, List]] = None,
    metadata_existing=None,
):
    """Populate the pathways of one species, several species or every species in KEGG.

    :param species_ids: a single organism code / species name, an iterable of them, or ``None`` to populate
     the pathways of every organism listed by KEGG
    :param metadata_existing: forwarded unchanged to ``_populate_pathways_single_species``
    """
    # Download and parse the organism table once instead of once per requested species.
    pathways_species_dict = parse_species(get_pathway_species_df())

    if species_ids is None:
        # NOTE(review): only the pathways are loaded in this branch, the pathway-protein links are not
        # (unlike the single-species path) - confirm whether this is intended.
        for organism_code in pathways_species_dict:
            self._populate_pathways(f'{KEGG_PATHWAYS_URL.rstrip("/")}/{organism_code}')
        return

    if isinstance(species_ids, str):
        species_ids = [species_ids]

    # Any iterable (list, tuple, set, ...) is accepted, not only lists.
    for species_id in species_ids:
        self._populate_pathways_single_species(species_id, metadata_existing, pathways_species_dict)


def populate(self, pathways_url=None, protein_pathway_url=None, species=None, metadata_existing=False):
    """Populate all tables.

    :param pathways_url: forwarded to ``_populate_pathways`` when no species is given
    :param protein_pathway_url: forwarded to ``_pathway_entity`` when no species is given
    :param species: organism code / species name, or a collection of them; when given (and not empty) the
     two URL arguments are ignored and the species-specific population is used instead
    :param metadata_existing: forwarded to the protein population step
    """
    if species:
        self._populate_pathways_multiple_species(species, metadata_existing)
        return

    self._populate_pathways(url=pathways_url)
    self._pathway_entity(url=protein_pathway_url, metadata_existing=metadata_existing)
def get_pathway_species_df(url=None):
    """Load the KEGG organism table into a pandas DataFrame.

    :param Optional[str] url: location of a tab separated organism table; defaults to ``KEGG_ORGANISM_URL``
    :return: dataframe of the table; no header row is read, so the columns carry integer labels
    :rtype: pandas.DataFrame
    """
    return pd.read_csv(
        url or KEGG_ORGANISM_URL,
        sep='\t',
        header=None,
    )


def parse_species(org_dataframe):
    """Map every organism code of the KEGG organism table to its names.

    The name column is split on ``' ('`` after all closing parentheses have been removed, so
    ``'Homo sapiens (human)'`` becomes ``['Homo sapiens', 'human']`` and a name without parentheses
    becomes a one-element list.

    :param pandas.DataFrame org_dataframe: organism table with the organism code in the second column and
     the organism name in the third column
    :rtype: dict
    :return: dictionary mapping organism code to the list of name parts
    """
    # Positional access avoids building one Series per row (iterrows) and unpacking unused columns.
    organism_codes = org_dataframe.iloc[:, 1]
    organism_names = org_dataframe.iloc[:, 2]

    return {
        org_id: org_name.replace(')', '').split(' (')
        for org_id, org_name in zip(organism_codes, organism_names)
    }