bio2bel · cthoyt · Nov 18, 2020
diff --git a/src/bio2bel_kegg/constants.py b/src/bio2bel_kegg/constants.py
@@ -16,10 +16,15 @@
 os.makedirs(PROTEIN_ENTRY_DIR, exist_ok=True)
 
 # returns the list of human pathways
-KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway/hsa'
+KEGG_PATHWAYS_URL = 'http://rest.kegg.jp/list/pathway'  # base URL; an organism code is appended per species
+KEGG_HUMAN_PATHWAYS_URL = f'{KEGG_PATHWAYS_URL}/hsa'  # the human ('hsa') pathway list
+
+# returns the list of organisms known to KEGG
+KEGG_ORGANISM_URL = 'http://rest.kegg.jp/list/organism'
 
 #  human genes linked from each of the KEGG pathways
-PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/hsa'
+PROTEIN_PATHWAY_URL = 'http://rest.kegg.jp/link/pathway/'  # base URL (note the trailing slash)
+PROTEIN_PATHWAY_HUMAN_URL = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/hsa'  # rstrip prevents a '//' before 'hsa'
 
 # KEGG stats
 KEGG_STATISTICS_URL = 'http://rest.kegg.jp/info/kegg'

diff --git a/src/bio2bel_kegg/manager.py b/src/bio2bel_kegg/manager.py
@@ -6,7 +6,7 @@
 import logging
 import os
 from multiprocessing.pool import ThreadPool
-from typing import List, Mapping, Optional
+from typing import List, Mapping, Optional, Union
 
 import requests
 from tqdm import tqdm
@@ -19,10 +19,13 @@
 from pybel.constants import BIOPROCESS, FUNCTION, NAME, NAMESPACE, PROTEIN
 from pybel.manager.models import Namespace, NamespaceEntry
 from pybel.struct.graph import BELGraph
-from .constants import API_KEGG_GET, KEGG, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR
+from .constants import (
+    API_KEGG_GET, KEGG, KEGG_PATHWAYS_URL, METADATA_FILE_PATH, MODULE_NAME, PROTEIN_ENTRY_DIR, PROTEIN_PATHWAY_URL,
+)
 from .models import Base, Pathway, Protein, protein_pathway
 from .parsers import (
-    get_entity_pathway_df, get_pathway_names_df, parse_entity_pathway, parse_pathways, process_protein_info_to_model,
+    get_entity_pathway_df, get_pathway_names_df, get_pathway_species_df, parse_entity_pathway, parse_pathways,
+    parse_species, process_protein_info_to_model,
 )
 
 __all__ = [
@@ -111,6 +114,56 @@ def _populate_pathways(self, url: Optional[str] = None):
 
         self.session.commit()
 
+    def _populate_pathways_single_species(
+        self,
+        species_id: Optional[str] = None,
+        metadata_existing=None,
+    ):
+        """Populate pathways and their proteins for a single species.
+
+        :param species_id: KEGG organism code (e.g. ``hsa``) or the organism name as listed by KEGG
+        :param metadata_existing: forwarded unchanged to :meth:`_pathway_entity`
+        :raises Warning: if ``species_id`` matches neither an organism code nor a name
+        """
+        organisms = parse_species(get_pathway_species_df())
+        if species_id in organisms:
+            org_code = species_id
+        else:
+            # Names must be mapped back to their code: the KEGG REST URLs take the code, not the name.
+            org_code = next((code for code, names in organisms.items() if names[0] == species_id), None)
+        if org_code is None:
+            raise Warning(f'Organism id {species_id} not found in KEGG.')
+
+        # Join with '/' explicitly; os.path.join would emit backslashes on Windows.
+        url_pathways = f'{KEGG_PATHWAYS_URL.rstrip("/")}/{org_code}'
+        url_proteins = f'{PROTEIN_PATHWAY_URL.rstrip("/")}/{org_code}'
+        self._populate_pathways(url_pathways)
+        self._pathway_entity(url=url_proteins, metadata_existing=metadata_existing)
+
+    def _populate_pathways_multiple_species(
+        self,
+        species_ids: Optional[Union[str, List]] = None,
+        metadata_existing=None,
+    ):
+        """Populate pathways for one species, several species, or every KEGG organism.
+
+        :param species_ids: one organism name/id, a collection of them, or None for all organisms
+        :param metadata_existing: forwarded unchanged to :meth:`_populate_pathways_single_species`
+        """
+        if species_ids is None:
+            # Nothing specified: load the pathway list (pathways only, no proteins) of every organism.
+            for org_code in parse_species(get_pathway_species_df()):
+                # Join with '/' explicitly; os.path.join would emit backslashes on Windows.
+                self._populate_pathways(f'{KEGG_PATHWAYS_URL.rstrip("/")}/{org_code}')
+            return
+
+        if isinstance(species_ids, str):
+            # A bare string is one species, not an iterable of characters.
+            species_ids = [species_ids]
+        # Any other iterable (list, tuple, set, ...) is handled item by item.
+        for species_id in species_ids:
+            self._populate_pathways_single_species(species_id, metadata_existing)
+
     def _pathway_entity(self, url=None, metadata_existing=None, thead_pool_size=1):
         """Populate proteins.
 
@@ -207,10 +260,13 @@ def _postprocess_pid(self, pid_attributes):
             if hgnc_id is not None:
                 pid_attributes[kegg_protein_id]['hgnc_symbol'] = hgnc_id_to_symbol.get(hgnc_id)
 
-    def populate(self, pathways_url=None, protein_pathway_url=None, metadata_existing=False):
+    def populate(self, pathways_url=None, protein_pathway_url=None, species=None, metadata_existing=False):
         """Populate all tables."""
-        self._populate_pathways(url=pathways_url)
-        self._pathway_entity(url=protein_pathway_url, metadata_existing=metadata_existing)
+        if not species:  # no species requested: populate from the explicit URL arguments
+            self._populate_pathways(url=pathways_url)
+            self._pathway_entity(url=protein_pathway_url, metadata_existing=metadata_existing)
+        else:  # NOTE(review): a truthy third positional argument (formerly metadata_existing) now lands here
+            self._populate_pathways_multiple_species(species, metadata_existing)
 
     def count_pathways(self) -> int:
         """Count the pathways in the database."""

diff --git a/src/bio2bel_kegg/parsers/__init__.py b/src/bio2bel_kegg/parsers/__init__.py
@@ -4,4 +4,4 @@
 
 from .description import process_protein_info_to_model  # noqa: F401
 from .entities import get_entity_pathway_df, parse_entity_pathway  # noqa: F401
-from .pathways import get_pathway_names_df, parse_pathways  # noqa: F401
+from .pathways import get_pathway_names_df, parse_pathways, get_pathway_species_df, parse_species  # noqa: F401
diff --git a/src/bio2bel_kegg/parsers/entities.py b/src/bio2bel_kegg/parsers/entities.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from bio2bel_kegg.constants import PROTEIN_PATHWAY_URL
+from bio2bel_kegg.constants import PROTEIN_PATHWAY_HUMAN_URL
 
 __all__ = [
     'get_entity_pathway_df',
@@ -20,7 +20,7 @@ def get_entity_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
     :param url: An optional url from a KEGG TSV file
     """
     return pd.read_csv(
-        url or PROTEIN_PATHWAY_URL,
+        url or PROTEIN_PATHWAY_HUMAN_URL,
         sep='\t',
         header=None
     )

diff --git a/src/bio2bel_kegg/parsers/pathways.py b/src/bio2bel_kegg/parsers/pathways.py
@@ -7,7 +7,7 @@
 
 import pandas as pd
 
-from bio2bel_kegg.constants import KEGG_PATHWAYS_URL
+from bio2bel_kegg.constants import KEGG_PATHWAYS_URL, KEGG_ORGANISM_URL
 
 __all__ = [
     'get_pathway_names_df',
@@ -40,3 +40,30 @@ def parse_pathways(pathway_dataframe):
         kegg_id: name
         for line, (kegg_id, name) in pathway_dataframe.iterrows()
     }
+
+
+def get_pathway_species_df(url=None):
+    """Load the KEGG organism listing into a data frame.
+
+    :param Optional[str] url: location of a tab-separated organism file; falls back to KEGG_ORGANISM_URL
+    :return: the file's rows, read without a header row
+    :rtype: pandas.DataFrame
+    """
+    source = url if url else KEGG_ORGANISM_URL
+    # header=None: treat the first row as data, so pandas assigns integer column labels instead.
+    read_options = {'sep': '\t', 'header': None}
+    organism_df = pd.read_csv(source, **read_options)
+    return organism_df
+
+
+def parse_species(org_dataframe):
+    """Map each KEGG organism code to the parts of its organism name.
+
+    :param pandas.DataFrame org_dataframe: organism table; column 1 is the code, column 2 the name
+    :rtype: dict
+    :return: ``{code: [name, ...]}``, e.g. ``'Homo sapiens (human)'`` becomes ``['Homo sapiens', 'human']``
+    """
+    # Read the two needed columns by position; avoids building a Series per row as iterrows does.
+    org_codes, org_names = org_dataframe.iloc[:, 1], org_dataframe.iloc[:, 2]
+    # Drop closing parentheses, then split on ' (' so the parenthesised alias becomes its own item.
+    return {code: name.replace(')', '').split(' (') for code, name in zip(org_codes, org_names)}