Skip to content

Commit 393d4c9

Browse files
authored
Merge pull request #628 from Steinbeck-Lab/development
Development
2 parents 69a01f5 + 31d0a8a commit 393d4c9

File tree

5 files changed

+272
-93
lines changed

5 files changed

+272
-93
lines changed

app/modules/pubchem_retrieve.py

Lines changed: 199 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import re
22
import logging
33
from functools import lru_cache
4-
from typing import Optional
4+
from typing import Optional, Dict, Any, List
55
from urllib.parse import quote
66

77
import requests
@@ -55,6 +55,71 @@ def __init__(
5555
# Create a cached version of the internal _get_smiles method
5656
self._get_smiles_cached = lru_cache(maxsize=self.cache_size)(self._get_smiles)
5757

58+
def detect_input_type(self, user_input: str) -> str:
    """
    Detect the type of chemical identifier from the input string.

    Checks run from the most specific format to the least specific one,
    and the first match wins: CID, InChI, InChIKey, CAS, SMILES,
    molecular formula, and finally chemical name as the fallback.

    Args:
        user_input (str): The input string to analyze

    Returns:
        str: One of "CID", "InChI", "InChIKey", "CAS", "SMILES",
            "formula" or "name"; "unknown" when the input is not a
            string, is empty, or contains only whitespace.
    """
    if not user_input or not isinstance(user_input, str):
        return "unknown"

    user_input = user_input.strip()
    if not user_input:
        # Whitespace-only input carries no identifier at all.
        return "unknown"

    # 1. CID: all digits
    if user_input.isdigit():
        return "CID"

    # 2. InChI: starts with "InChI="
    if user_input.startswith("InChI="):
        return "InChI"

    # 3. InChIKey: 14-10-1 blocks of upper-case letters/digits
    if re.match(r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$", user_input):
        return "InChIKey"

    # 4. CAS number: pattern like "50-78-2" or "7732-18-5"
    if re.match(r"^\d{2,7}-\d{2}-\d$", user_input):
        return "CAS"

    # 5. SMILES: checked before molecular formula since some short
    #    formulas look like SMILES.
    smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
    if (
        " " not in user_input  # SMILES shouldn't contain spaces
        and len(user_input) <= 500  # Reasonable maximum length
        and re.match(smiles_pattern, user_input)
    ):
        # Only the SMILES alphabet is present; now require some evidence
        # that the string really is SMILES rather than a plain word.
        has_structural_notation = any(
            char in user_input for char in "()=[]#@\\/"
        )
        is_simple_organic = re.match(
            r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$", user_input
        )
        is_short_organic = len(user_input) <= 10 and re.match(
            r"^[CNOSPcnops]+\d*$", user_input
        )
        is_common_atoms = len(user_input) >= 3 and re.match(
            r"^[CNOSHPFcnoshpf]+$", user_input
        )
        if (
            has_structural_notation
            or is_simple_organic
            or is_short_organic
            or is_common_atoms
        ):
            return "SMILES"

    # 6. Molecular formula: pattern like "C9H8O4" (checked after SMILES).
    #    Every token is "element symbol + optional count" and must start
    #    with an upper-case letter, so the match is unambiguous: it runs
    #    in linear time even on long non-matching input, and it accepts
    #    single-element formulas such as "H2" or "Cl2".
    if re.match(r"^(?:[A-Z][a-z]?\d*)+$", user_input):
        return "formula"

    # 7. Default: chemical name
    return "name"
122+
58123
def _query_by_cid(self, cid: str) -> Optional[str]:
59124
"""
60125
Retrieve the canonical SMILES for a compound by its PubChem CID.
@@ -80,6 +145,71 @@ def _query_by_cid(self, cid: str) -> Optional[str]:
80145
logger.error(f"Error querying by CID {cid}: {str(e)}")
81146
return None
82147

148+
def _get_cids_by_identifier(
149+
self, identifier: str, input_type: str
150+
) -> Optional[List[str]]:
151+
"""
152+
Get CIDs for a given identifier based on its type.
153+
154+
Args:
155+
identifier (str): The chemical identifier
156+
input_type (str): The type of identifier
157+
158+
Returns:
159+
Optional[List[str]]: List of CIDs if found, None otherwise
160+
"""
161+
try:
162+
if input_type == "CID":
163+
return [identifier] if identifier.isdigit() else None
164+
165+
elif input_type == "InChI":
166+
url = f"{self.BASE_URL}/inchi/cids/txt"
167+
response = self.session.post(
168+
url, data={"inchi": identifier}, timeout=self.timeout
169+
)
170+
response.raise_for_status()
171+
cids = response.text.strip().splitlines()
172+
return cids[:10] # Limit to first 10 CIDs
173+
174+
elif input_type == "InChIKey":
175+
url = f"{self.BASE_URL}/inchikey/{quote(identifier, safe='')}/cids/txt"
176+
response = self.session.get(url, timeout=self.timeout)
177+
response.raise_for_status()
178+
cids = response.text.strip().splitlines()
179+
return cids[:10] # Limit to first 10 CIDs
180+
181+
elif input_type == "formula":
182+
url = (
183+
f"{self.BASE_URL}/fastformula/{quote(identifier, safe='')}/cids/txt"
184+
)
185+
response = self.session.get(url, timeout=self.timeout)
186+
response.raise_for_status()
187+
cids = response.text.strip().splitlines()
188+
return cids[:10] # Limit to first 10 CIDs
189+
190+
elif input_type == "SMILES":
191+
url = f"{self.BASE_URL}/smiles/cids/txt"
192+
response = self.session.post(
193+
url, data={"smiles": identifier}, timeout=self.timeout
194+
)
195+
response.raise_for_status()
196+
cids = response.text.strip().splitlines()
197+
return cids[:10] # Limit to first 10 CIDs
198+
199+
elif input_type in ["name", "CAS"]:
200+
encoded = quote(identifier, safe="")
201+
url = f"{self.BASE_URL}/name/{encoded}/cids/txt"
202+
response = self.session.get(url, timeout=self.timeout)
203+
response.raise_for_status()
204+
cids = response.text.strip().splitlines()
205+
return cids[:10] # Limit to first 10 CIDs
206+
207+
except (RequestException, IndexError) as e:
208+
logger.error(f"Error getting CIDs for {identifier}: {str(e)}")
209+
return None
210+
211+
return None
212+
83213
def _query_by_inchi(self, inchi: str) -> Optional[str]:
84214
"""
85215
Retrieve the canonical SMILES for a compound by its InChI string.
@@ -203,51 +333,21 @@ def _get_smiles(self, user_input: str) -> Optional[str]:
203333

204334
# Trim whitespace
205335
user_input = user_input.strip()
336+
input_type = self.detect_input_type(user_input)
206337

207-
# 1. If input is a CID (all digits)
208-
if user_input.isdigit():
338+
# Route to appropriate query method based on detected type
339+
if input_type == "CID":
209340
return self._query_by_cid(user_input)
210-
211-
# 2. If input is an InChI (starts with "InChI=")
212-
if user_input.startswith("InChI="):
341+
elif input_type == "InChI":
213342
return self._query_by_inchi(user_input)
214-
215-
# 3. If input is an InChIKey
216-
if re.match(r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$", user_input):
343+
elif input_type == "InChIKey":
217344
return self._query_by_inchikey(user_input)
218-
219-
# 4. If input is a CAS number (e.g., "7732-18-5")
220-
if re.match(r"^\d{2,7}-\d{2}-\d$", user_input):
221-
return self._query_by_name(user_input) # Handle CAS as name
222-
223-
# 5. If input is a molecular formula (e.g., "C6H12O6")
224-
if re.match(r"^(?:[A-Z][a-z]?\d+)+$", user_input):
345+
elif input_type == "formula":
225346
return self._query_by_formula(user_input)
226-
227-
# 6. If input is a SMILES string - IMPROVED VERSION
228-
# SMILES typically contain organic chemistry characters and structural notation
229-
# Common SMILES characters: C, N, O, S, P, F, Cl, Br, I, H, numbers,
230-
# parentheses (), equals =, hash #, plus +, minus -, forward slash /,
231-
# backslash \, at symbol @, square brackets []
232-
smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
233-
234-
if (
235-
" " not in user_input # SMILES shouldn't contain spaces
236-
and len(user_input) >= 1 # Allow very short SMILES
237-
and len(user_input) <= 500 # Reasonable maximum length
238-
and re.match(smiles_pattern, user_input)
239-
# Improved heuristic: SMILES usually contain structural elements OR basic organic patterns
240-
and (
241-
any(char in user_input for char in "()=[]#@\\/") # Structural notation
242-
or re.match(
243-
r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$", user_input
244-
) # Simple organic patterns
245-
)
246-
):
347+
elif input_type == "SMILES":
247348
return self._query_by_smiles(user_input)
248-
249-
# 7. Default: treat as a chemical name
250-
return self._query_by_name(user_input)
349+
else: # name or CAS
350+
return self._query_by_name(user_input)
251351

252352
def get_smiles(self, user_input: str) -> Optional[str]:
253353
"""
@@ -278,3 +378,62 @@ def get_smiles(self, user_input: str) -> Optional[str]:
278378
'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O'
279379
"""
280380
return self._get_smiles_cached(user_input)
381+
382+
def get_compound_info(self, user_input: str) -> Dict[str, Any]:
    """
    Retrieve comprehensive compound information from PubChem.

    The input type is detected first, the identifier is resolved to a
    list of CIDs, and the canonical SMILES is then fetched for the first
    CID in that list.

    Args:
        user_input (str): The chemical identifier

    Returns:
        Dict[str, Any]: Dictionary containing:
            - input: the input (stripped when it is a non-empty string)
            - input_type: detected input type
            - canonical_smiles: canonical SMILES if found
            - cids: list of PubChem CIDs
            - pubchem_links: list of PubChem compound page URLs
            - success: boolean indicating if compound was found
    """
    # Start from the "nothing found" shape and fill it in as data arrives.
    info: Dict[str, Any] = {
        "input": user_input,
        "input_type": "unknown",
        "canonical_smiles": None,
        "cids": None,
        "pubchem_links": None,
        "success": False,
    }

    if not (user_input and isinstance(user_input, str)):
        return info

    cleaned = user_input.strip()
    detected_type = self.detect_input_type(cleaned)
    info["input"] = cleaned
    info["input_type"] = detected_type

    found_cids = self._get_cids_by_identifier(cleaned, detected_type)
    if not found_cids:
        return info

    # Only the first CID is used for the SMILES lookup.
    smiles = self._query_by_cid(found_cids[0])

    info["canonical_smiles"] = smiles
    info["cids"] = found_cids
    info["pubchem_links"] = [
        f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}" for cid in found_cids
    ]
    info["success"] = smiles is not None
    return info

app/routers/chem.py

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import io
4+
import logging
45
from typing import Annotated
56
from typing import Literal
67
from typing import Optional
@@ -89,6 +90,7 @@
8990
templates = Jinja2Templates(directory="app/templates")
9091

9192
pubchem_client = PubChemClient()
93+
logger = logging.getLogger(__name__)
9294

9395

9496
@router.get("/", include_in_schema=False)
@@ -1440,7 +1442,7 @@ async def get_pubchem_smiles(
14401442
identifier (str): The chemical identifier to look up
14411443
14421444
Returns:
1443-
SMILESResponse: Object containing the input, canonical SMILES, detected input type, and success status
1445+
PubChemResponse: Object containing the input, canonical SMILES, detected input type, CIDs, PubChem links, and success status
14441446
14451447
Raises:
14461448
HTTPException: If the identifier is invalid or no results are found
@@ -1452,43 +1454,21 @@ async def get_pubchem_smiles(
14521454
)
14531455

14541456
try:
1455-
# Get the SMILES from PubChem
1456-
result = pubchem_client.get_smiles(identifier)
1457-
1458-
# Determine input type (simplified for endpoint response)
1459-
input_type = "name" # Default
1460-
if identifier.isdigit():
1461-
input_type = "CID"
1462-
elif identifier.startswith("InChI="):
1463-
input_type = "InChI"
1464-
elif (
1465-
len(identifier) == 27
1466-
and identifier.count("-") == 2
1467-
and all(c.isupper() or c.isdigit() or c == "-" for c in identifier)
1468-
and identifier[14] == "-"
1469-
and identifier[25] == "-"
1470-
):
1471-
input_type = "InChIKey"
1472-
elif "-" in identifier and sum(c.isdigit() for c in identifier) > 5:
1473-
input_type = "CAS"
1474-
1475-
# Create response
1476-
if result:
1477-
return PubChemResponse(
1478-
input=identifier,
1479-
canonical_smiles=result,
1480-
input_type=input_type,
1481-
success=True,
1482-
)
1483-
else:
1484-
return PubChemResponse(
1485-
input=identifier,
1486-
canonical_smiles=None,
1487-
input_type=input_type,
1488-
success=False,
1489-
)
1457+
# Get comprehensive compound information from PubChem
1458+
result = pubchem_client.get_compound_info(identifier)
1459+
1460+
# Create response with all available information
1461+
return PubChemResponse(
1462+
input=result["input"],
1463+
canonical_smiles=result["canonical_smiles"],
1464+
input_type=result["input_type"],
1465+
success=result["success"],
1466+
cids=result["cids"],
1467+
pubchem_links=result["pubchem_links"],
1468+
)
14901469

14911470
except Exception as e:
1471+
logger.error(f"Error processing PubChem request for '{identifier}': {str(e)}")
14921472
raise HTTPException(
14931473
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
14941474
detail=f"Error processing request: {str(e)}",

app/schemas/pubchem_schema.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
2-
from typing import Optional
2+
from typing import Optional, List
33

44
from pydantic import BaseModel
55

@@ -11,3 +11,5 @@ class PubChemResponse(BaseModel):
1111
canonical_smiles: Optional[str] = None
1212
input_type: Optional[str] = None
1313
success: bool
14+
cids: Optional[List[str]] = None
15+
pubchem_links: Optional[List[str]] = None

0 commit comments

Comments
 (0)