Skip to content

Add multiple species support #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/bio2bel_kegg/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@
os.makedirs(PROTEIN_ENTRY_DIR, exist_ok=True)

# returns the list of human pathways
KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway/hsa'
KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway'
KEGG_HUMAN_PATHWAYS_URL = f'{KEGG_PATHWAYS_URL}/hsa'

# returns the list of organisms available in KEGG
KEGG_ORGANISM_URL = 'http://rest.kegg.jp/list/organism'

# human genes linked from each of the KEGG pathways
PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/hsa'
PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/'
PROTEIN_PATHWAY_HUMAN_URL = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/hsa'

# KEGG stats
KEGG_STATISTICS_URL = 'http://rest.kegg.jp/info/kegg'
Expand Down
68 changes: 62 additions & 6 deletions src/bio2bel_kegg/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import logging
import os
from multiprocessing.pool import ThreadPool
from typing import List, Mapping, Optional
from typing import List, Mapping, Optional, Union

import requests
from tqdm import tqdm
Expand All @@ -19,10 +19,13 @@
from pybel.constants import BIOPROCESS, FUNCTION, NAME, NAMESPACE, PROTEIN
from pybel.manager.models import Namespace, NamespaceEntry
from pybel.struct.graph import BELGraph
from .constants import API_KEGG_GET, KEGG, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR
from .constants import (
API_KEGG_GET, KEGG, KEGG_PATHWAYS_URL, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR, PROTEIN_PATHWAY_URL,
)
from .models import Base, Pathway, Protein, protein_pathway
from .parsers import (
get_entity_pathway_df, get_pathway_names_df, parse_entity_pathway, parse_pathways, process_protein_info_to_model,
get_entity_pathway_df, get_pathway_names_df, get_pathway_species_df, parse_entity_pathway, parse_pathways,
parse_species, process_protein_info_to_model,
)

__all__ = [
Expand Down Expand Up @@ -111,6 +114,56 @@ def _populate_pathways(self, url: Optional[str] = None):

self.session.commit()

def _populate_pathways_single_species(
    self,
    species_id: Optional[str] = None,
    metadata_existing=None,
):
    """Populate the pathways and the pathway-protein links of a single species.

    The KEGG organism table is downloaded and ``species_id`` is matched first
    against the organism codes (the keys returned by ``parse_species``) and
    then against the first element of each parsed organism name. When a name
    matches, it is resolved to its organism code, because the KEGG REST
    endpoints are addressed by code and not by name.

    :param species_id: KEGG organism code (e.g. ``hsa``) or organism name of the species to populate
    :param metadata_existing: forwarded unchanged to ``_pathway_entity``
    :raises Warning: if ``species_id`` is empty or matches no organism in KEGG
    """
    # Fail early, before any download: an empty identifier can never match.
    if not species_id:
        raise Warning(f'Organism id {species_id} not found in KEGG.')

    organisms = parse_species(get_pathway_species_df())

    if species_id in organisms:
        organism_code = species_id
    else:
        # Fall back to a lookup by name and translate it to the organism code.
        organism_code = next(
            (code for code, names in organisms.items() if names[0] == species_id),
            None,
        )
        if organism_code is None:
            # NOTE(review): ``Warning`` is kept as the raised type so existing callers keep working.
            raise Warning(f'Organism id {species_id} not found in KEGG.')

    # URLs are joined with '/' explicitly; os.path.join would use the platform
    # path separator (a backslash on Windows).
    url_pathways = f'{KEGG_PATHWAYS_URL.rstrip("/")}/{organism_code}'
    url_proteins = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/{organism_code}'

    self._populate_pathways(url_pathways)
    self._pathway_entity(url=url_proteins, metadata_existing=metadata_existing)

def _populate_pathways_multiple_species(
    self,
    species_ids: Optional[Union[str, List]] = None,
    metadata_existing=None,
):
    """Populate pathways for one species, several species, or every species in KEGG.

    * A string is handled as one species.
    * A list, tuple, set or frozenset is handled as a collection of species,
      each populated through ``_populate_pathways_single_species``.
    * Anything else (including ``None``) populates the pathways of every
      organism listed by KEGG. Only ``_populate_pathways`` is called in this
      case; pathway-protein links are not populated.

    :param species_ids: KEGG organism code or name, or a collection of them
    :param metadata_existing: forwarded unchanged to ``_populate_pathways_single_species``
    """
    if isinstance(species_ids, str):
        self._populate_pathways_single_species(species_ids, metadata_existing)
        return

    # Tuples and sets are accepted too, so they no longer fall through to the
    # expensive "all species" branch below.
    if isinstance(species_ids, (list, tuple, set, frozenset)):
        for species_id in species_ids:
            self._populate_pathways_single_species(species_id, metadata_existing)
        return

    # If none specified, populate ALL species in KEGG
    organisms = parse_species(get_pathway_species_df())

    # Join with '/' explicitly; os.path.join would use the platform path
    # separator (a backslash on Windows).
    base_url = KEGG_PATHWAYS_URL.rstrip('/')
    for organism_code in organisms:
        self._populate_pathways(f'{base_url}/{organism_code}')

def _pathway_entity(self, url=None, metadata_existing=None, thead_pool_size=1):
"""Populate proteins.

Expand Down Expand Up @@ -207,10 +260,13 @@ def _postprocess_pid(self, pid_attributes):
if hgnc_id is not None:
pid_attributes[kegg_protein_id]['hgnc_symbol'] = hgnc_id_to_symbol.get(hgnc_id)

def populate(self, pathways_url=None, protein_pathway_url=None, species=None, metadata_existing=False):
    """Populate all tables.

    When ``species`` is given it takes precedence: population is delegated to
    ``_populate_pathways_multiple_species`` and ``pathways_url`` /
    ``protein_pathway_url`` are not used. Otherwise the pathways and the
    pathway-protein links are populated from the given URLs (``None`` is
    passed through to the underlying methods).

    :param pathways_url: optional URL passed to ``_populate_pathways``
    :param protein_pathway_url: optional URL passed to ``_pathway_entity``
    :param species: optional species identifier, or collection of identifiers, to populate
    :param metadata_existing: forwarded to the population methods
    """
    if species:
        self._populate_pathways_multiple_species(species, metadata_existing)
    else:
        self._populate_pathways(url=pathways_url)
        self._pathway_entity(url=protein_pathway_url, metadata_existing=metadata_existing)

def count_pathways(self) -> int:
"""Count the pathways in the database."""
Expand Down
2 changes: 1 addition & 1 deletion src/bio2bel_kegg/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

from .description import process_protein_info_to_model # noqa: F401
from .entities import get_entity_pathway_df, parse_entity_pathway # noqa: F401
from .pathways import get_pathway_names_df, parse_pathways # noqa: F401
from .pathways import get_pathway_names_df, parse_pathways, get_pathway_species_df, parse_species # noqa: F401
4 changes: 2 additions & 2 deletions src/bio2bel_kegg/parsers/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

from bio2bel_kegg.constants import PROTEIN_PATHWAY_URL
from bio2bel_kegg.constants import PROTEIN_PATHWAY_HUMAN_URL

__all__ = [
'get_entity_pathway_df',
Expand All @@ -20,7 +20,7 @@ def get_entity_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
:param url: An optional url from a KEGG TSV file
"""
return pd.read_csv(
url or PROTEIN_PATHWAY_URL,
url or PROTEIN_PATHWAY_HUMAN_URL,
sep='\t',
header=None
)
Expand Down
29 changes: 28 additions & 1 deletion src/bio2bel_kegg/parsers/pathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pandas as pd

from bio2bel_kegg.constants import KEGG_PATHWAYS_URL
from bio2bel_kegg.constants import KEGG_PATHWAYS_URL, KEGG_ORGANISM_URL

__all__ = [
'get_pathway_names_df',
Expand Down Expand Up @@ -40,3 +40,30 @@ def parse_pathways(pathway_dataframe):
kegg_id: name
for line, (kegg_id, name) in pathway_dataframe.iterrows()
}


def get_pathway_species_df(url=None):
    """Load a tab-separated KEGG organism listing into a dataframe.

    The file is read without a header row, so the columns are numbered.

    :param Optional[str] url: location of the tab-separated file; falls back to ``KEGG_ORGANISM_URL`` when empty
    :return: dataframe with the content of the file
    :rtype: pandas.DataFrame
    """
    source = url if url else KEGG_ORGANISM_URL
    organism_table = pd.read_csv(source, sep='\t', header=None)
    return organism_table


def parse_species(org_dataframe):
    """Build a lookup of organisms from the organism table dataframe.

    Every row must hold exactly four values; the second is used as the key and
    the third is the organism name. Closing parentheses are removed from the
    name, which is then split on ``' ('``, so ``'Homo sapiens (human)'``
    becomes ``['Homo sapiens', 'human']``.

    :param pandas.DataFrame org_dataframe: organism table as dataframe
    :rtype: dict
    :return: dictionary mapping each organism id to the list of its name parts
    """
    species = {}
    for _, row in org_dataframe.iterrows():
        _, organism_code, organism_label, _ = row
        species[organism_code] = organism_label.replace(')', '').split(' (')
    return species