Steinbeck-Lab
diff --git a/‎app/modules/pubchem_retrieve.py‎
Lines changed: 199 additions & 40 deletions b/‎app/modules/pubchem_retrieve.py‎
Lines changed: 199 additions & 40 deletions
diff --git a/‎app/routers/chem.py‎
Lines changed: 16 additions & 36 deletions b/‎app/routers/chem.py‎
Lines changed: 16 additions & 36 deletions
diff --git a/‎app/schemas/pubchem_schema.py‎
Lines changed: 3 additions & 1 deletion b/‎app/schemas/pubchem_schema.py‎
Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,7 @@
 import re
 import logging
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, Dict, Any, List
 from urllib.parse import quote
 
 import requests
@@ -55,6 +55,71 @@ def __init__(
         # Create a cached version of the internal _get_smiles method
         self._get_smiles_cached = lru_cache(maxsize=self.cache_size)(self._get_smiles)
 
+    def detect_input_type(self, user_input: str) -> str:
+        """
+        Detect the type of chemical identifier from the input string.
+
+        The checks run in a fixed order and the first match wins:
+        CID, InChI, InChIKey, CAS, SMILES, molecular formula, then name.
+
+        Args:
+            user_input (str): The input string to analyze
+
+        Returns:
+            str: One of "CID", "InChI", "InChIKey", "CAS", "SMILES",
+                "formula" or "name"; "unknown" when the input is not a
+                string or is empty after trimming whitespace.
+        """
+        if not user_input or not isinstance(user_input, str):
+            return "unknown"
+
+        user_input = user_input.strip()
+
+        # Whitespace-only input carries no identifier at all
+        if not user_input:
+            return "unknown"
+
+        # 1. CID: all digits
+        if user_input.isdigit():
+            return "CID"
+
+        # 2. InChI: starts with "InChI="
+        if user_input.startswith("InChI="):
+            return "InChI"
+
+        # 3. InChIKey: three hyphen-separated groups of 14, 10 and 1 characters
+        if re.match(r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$", user_input):
+            return "InChIKey"
+
+        # 4. CAS number: pattern like "50-78-2" or "7732-18-5"
+        if re.match(r"^\d{2,7}-\d{2}-\d$", user_input):
+            return "CAS"
+
+        # 5. SMILES: check before molecular formula since some short formulas look like SMILES
+        # SMILES typically contain organic chemistry characters and structural notation
+        smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
+
+        if (
+            " " not in user_input  # SMILES shouldn't contain spaces
+            and len(user_input) <= 500  # Reasonable maximum length
+            and re.match(smiles_pattern, user_input)
+            # Heuristic: SMILES usually contain structural elements OR basic organic patterns
+            and (
+                any(char in user_input for char in "()=[]#@\\/")  # Structural notation
+                or re.match(
+                    r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$", user_input
+                )  # Simple organic patterns
+                or (
+                    len(user_input) <= 10
+                    and re.match(r"^[CNOSPcnops]+\d*$", user_input)
+                )  # Short organic
+                or (
+                    len(user_input) >= 3
+                    and re.match(r"^[CNOSHPFcnoshpf]+$", user_input)
+                )  # Common atoms
+            )
+        ):
+            return "SMILES"
+
+        # 6. Molecular formula: pattern like "C9H8O4" (check after SMILES)
+        # The pattern is written so that digits between two element symbols can
+        # only be consumed one way; this keeps matching linear on long
+        # near-miss inputs instead of backtracking exponentially.
+        # Note: a lone element symbol followed by a count (e.g. "Cl2") does not
+        # match here and falls through to "name".
+        if re.match(r"^[A-Z][a-z]?(?:(?:\d*[A-Z][a-z]?)+\d*)?$", user_input):
+            return "formula"
+
+        # 7. Default: chemical name
+        return "name"
+
     def _query_by_cid(self, cid: str) -> Optional[str]:
         """
         Retrieve the canonical SMILES for a compound by its PubChem CID.
@@ -80,6 +145,71 @@ def _query_by_cid(self, cid: str) -> Optional[str]:
             logger.error(f"Error querying by CID {cid}: {str(e)}")
             return None
 
+    def _get_cids_by_identifier(
+        self, identifier: str, input_type: str, max_cids: int = 10
+    ) -> Optional[List[str]]:
+        """
+        Get CIDs for a given identifier based on its type.
+
+        A "CID" input is returned as-is without making a request. InChI and
+        SMILES identifiers are sent as form data in a POST request; InChIKey,
+        formula, name and CAS identifiers are URL-encoded into the path of a
+        GET request. Any other input type yields None.
+
+        Args:
+            identifier (str): The chemical identifier
+            input_type (str): The type of identifier
+            max_cids (int): Maximum number of CIDs to return (default: 10)
+
+        Returns:
+            Optional[List[str]]: List of CIDs if found, None otherwise
+                (including request failures and empty responses)
+        """
+        if input_type == "CID":
+            return [identifier] if identifier.isdigit() else None
+
+        # Input types whose identifier travels in the POST body, mapped to the
+        # name used both as URL segment and as form field
+        post_fields = {"InChI": "inchi", "SMILES": "smiles"}
+        # Input types whose identifier is embedded in the URL path, mapped to
+        # the URL segment that precedes it (CAS numbers are looked up as names)
+        get_segments = {
+            "InChIKey": "inchikey",
+            "formula": "fastformula",
+            "name": "name",
+            "CAS": "name",
+        }
+
+        try:
+            if input_type in post_fields:
+                field = post_fields[input_type]
+                response = self.session.post(
+                    f"{self.BASE_URL}/{field}/cids/txt",
+                    data={field: identifier},
+                    timeout=self.timeout,
+                )
+            elif input_type in get_segments:
+                encoded = quote(identifier, safe="")
+                response = self.session.get(
+                    f"{self.BASE_URL}/{get_segments[input_type]}/{encoded}/cids/txt",
+                    timeout=self.timeout,
+                )
+            else:
+                # Unrecognised input type: nothing to query
+                return None
+            response.raise_for_status()
+        except RequestException as e:
+            logger.error(f"Error getting CIDs for {identifier}: {str(e)}")
+            return None
+
+        # One CID per line; drop blank lines so an empty body yields None
+        cids = [line.strip() for line in response.text.splitlines() if line.strip()]
+        return cids[:max_cids] or None
+
     def _query_by_inchi(self, inchi: str) -> Optional[str]:
         """
         Retrieve the canonical SMILES for a compound by its InChI string.
@@ -203,51 +333,21 @@ def _get_smiles(self, user_input: str) -> Optional[str]:
 
         # Trim whitespace
         user_input = user_input.strip()
+        input_type = self.detect_input_type(user_input)
 
-        # 1. If input is a CID (all digits)
-        if user_input.isdigit():
+        # Route to appropriate query method based on detected type
+        if input_type == "CID":
             return self._query_by_cid(user_input)
-
-        # 2. If input is an InChI (starts with "InChI=")
-        if user_input.startswith("InChI="):
+        elif input_type == "InChI":
             return self._query_by_inchi(user_input)
-
-        # 3. If input is an InChIKey
-        if re.match(r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$", user_input):
+        elif input_type == "InChIKey":
             return self._query_by_inchikey(user_input)
-
-        # 4. If input is a CAS number (e.g., "7732-18-5")
-        if re.match(r"^\d{2,7}-\d{2}-\d$", user_input):
-            return self._query_by_name(user_input)  # Handle CAS as name
-
-        # 5. If input is a molecular formula (e.g., "C6H12O6")
-        if re.match(r"^(?:[A-Z][a-z]?\d+)+$", user_input):
+        elif input_type == "formula":
             return self._query_by_formula(user_input)
-
-        # 6. If input is a SMILES string - IMPROVED VERSION
-        # SMILES typically contain organic chemistry characters and structural notation
-        # Common SMILES characters: C, N, O, S, P, F, Cl, Br, I, H, numbers,
-        # parentheses (), equals =, hash #, plus +, minus -, forward slash /,
-        # backslash \, at symbol @, square brackets []
-        smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
-
-        if (
-            " " not in user_input  # SMILES shouldn't contain spaces
-            and len(user_input) >= 1  # Allow very short SMILES
-            and len(user_input) <= 500  # Reasonable maximum length
-            and re.match(smiles_pattern, user_input)
-            # Improved heuristic: SMILES usually contain structural elements OR basic organic patterns
-            and (
-                any(char in user_input for char in "()=[]#@\\/")  # Structural notation
-                or re.match(
-                    r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$", user_input
-                )  # Simple organic patterns
-            )
-        ):
+        elif input_type == "SMILES":
             return self._query_by_smiles(user_input)
-
-        # 7. Default: treat as a chemical name
-        return self._query_by_name(user_input)
+        else:  # name or CAS
+            return self._query_by_name(user_input)
 
     def get_smiles(self, user_input: str) -> Optional[str]:
         """
@@ -278,3 +378,62 @@ def get_smiles(self, user_input: str) -> Optional[str]:
             'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O'
         """
         return self._get_smiles_cached(user_input)
+
+    def get_compound_info(self, user_input: str) -> Dict[str, Any]:
+        """
+        Look up a compound on PubChem and report everything found about it.
+
+        The input is trimmed, its identifier type is detected, matching CIDs
+        are fetched, and the canonical SMILES of the first CID is retrieved.
+
+        Args:
+            user_input (str): The chemical identifier
+
+        Returns:
+            Dict[str, Any]: Dictionary with the keys:
+                - input: the input (trimmed when it is a non-empty string)
+                - input_type: detected input type ("unknown" for invalid input)
+                - canonical_smiles: canonical SMILES of the first CID, or None
+                - cids: list of PubChem CIDs, or None when none were found
+                - pubchem_links: PubChem compound page URLs, one per CID, or None
+                - success: True only when a canonical SMILES was obtained
+        """
+        # Start from the "nothing found" answer and fill it in as we learn more
+        report: Dict[str, Any] = {
+            "input": user_input,
+            "input_type": "unknown",
+            "canonical_smiles": None,
+            "cids": None,
+            "pubchem_links": None,
+            "success": False,
+        }
+
+        # Anything that is not a non-empty string cannot be looked up
+        if not (user_input and isinstance(user_input, str)):
+            return report
+
+        cleaned = user_input.strip()
+        kind = self.detect_input_type(cleaned)
+        report["input"] = cleaned
+        report["input_type"] = kind
+
+        found = self._get_cids_by_identifier(cleaned, kind)
+        if not found:
+            return report
+
+        # The first CID is the one used for the canonical SMILES
+        smiles = self._query_by_cid(found[0])
+
+        report["canonical_smiles"] = smiles
+        report["cids"] = found
+        report["pubchem_links"] = [
+            f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}" for cid in found
+        ]
+        report["success"] = smiles is not None
+        return report
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import io
+import logging
 from typing import Annotated
 from typing import Literal
 from typing import Optional
@@ -89,6 +90,7 @@
 templates = Jinja2Templates(directory="app/templates")
 
 pubchem_client = PubChemClient()
+logger = logging.getLogger(__name__)
 
 
 @router.get("/", include_in_schema=False)
@@ -1440,7 +1442,7 @@ async def get_pubchem_smiles(
         identifier (str): The chemical identifier to look up
 
     Returns:
-        SMILESResponse: Object containing the input, canonical SMILES, detected input type, and success status
+        PubChemResponse: Object containing the input, canonical SMILES, detected input type, CIDs, PubChem links, and success status
 
     Raises:
         HTTPException: If the identifier is invalid or no results are found
@@ -1452,43 +1454,21 @@ async def get_pubchem_smiles(
         )
 
     try:
-        # Get the SMILES from PubChem
-        result = pubchem_client.get_smiles(identifier)
-
-        # Determine input type (simplified for endpoint response)
-        input_type = "name"  # Default
-        if identifier.isdigit():
-            input_type = "CID"
-        elif identifier.startswith("InChI="):
-            input_type = "InChI"
-        elif (
-            len(identifier) == 27
-            and identifier.count("-") == 2
-            and all(c.isupper() or c.isdigit() or c == "-" for c in identifier)
-            and identifier[14] == "-"
-            and identifier[25] == "-"
-        ):
-            input_type = "InChIKey"
-        elif "-" in identifier and sum(c.isdigit() for c in identifier) > 5:
-            input_type = "CAS"
-
-        # Create response
-        if result:
-            return PubChemResponse(
-                input=identifier,
-                canonical_smiles=result,
-                input_type=input_type,
-                success=True,
-            )
-        else:
-            return PubChemResponse(
-                input=identifier,
-                canonical_smiles=None,
-                input_type=input_type,
-                success=False,
-            )
+        # Get comprehensive compound information from PubChem
+        result = pubchem_client.get_compound_info(identifier)
+
+        # Create response with all available information
+        return PubChemResponse(
+            input=result["input"],
+            canonical_smiles=result["canonical_smiles"],
+            input_type=result["input_type"],
+            success=result["success"],
+            cids=result["cids"],
+            pubchem_links=result["pubchem_links"],
+        )
 
     except Exception as e:
+        logger.error(f"Error processing PubChem request for '{identifier}': {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
             detail=f"Error processing request: {str(e)}",
 
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Optional
+from typing import Optional, List
 
 from pydantic import BaseModel
 
@@ -11,3 +11,5 @@ class PubChemResponse(BaseModel):
     canonical_smiles: Optional[str] = None
     input_type: Optional[str] = None
     success: bool
+    cids: Optional[List[str]] = None
+    pubchem_links: Optional[List[str]] = None