11import re
22import logging
33from functools import lru_cache
4- from typing import Optional
4+ from typing import Optional , Dict , Any , List
55from urllib .parse import quote
66
77import requests
@@ -55,6 +55,71 @@ def __init__(
5555 # Create a cached version of the internal _get_smiles method
5656 self ._get_smiles_cached = lru_cache (maxsize = self .cache_size )(self ._get_smiles )
5757
def detect_input_type(self, user_input: str) -> str:
    """
    Detect the type of chemical identifier from the input string.

    The checks run in a fixed order and the first one that matches wins:
    CID, InChI, InChIKey, CAS, SMILES, molecular formula, and finally
    chemical name as the fallback. SMILES is tested before formula because
    short strings such as "CCO" satisfy both patterns.

    Args:
        user_input (str): The input string to analyze

    Returns:
        str: One of "CID", "InChI", "InChIKey", "CAS", "SMILES",
            "formula" or "name"; "unknown" when the input is empty,
            whitespace-only or not a string.
    """
    if not user_input or not isinstance(user_input, str):
        return "unknown"

    user_input = user_input.strip()
    if not user_input:
        # Whitespace-only input carries no identifier at all.
        return "unknown"

    # 1. CID: all digits
    if user_input.isdigit():
        return "CID"

    # 2. InChI: starts with "InChI="
    if user_input.startswith("InChI="):
        return "InChI"

    # 3. InChIKey: 14-10-1 blocks separated by hyphens
    if re.match(r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$", user_input):
        return "InChIKey"

    # 4. CAS number: pattern like "50-78-2" or "7732-18-5"
    if re.match(r"^\d{2,7}-\d{2}-\d$", user_input):
        return "CAS"

    # 5. SMILES: the whole string must stay inside the SMILES character set
    #    (which contains no whitespace) and be of a reasonable length ...
    smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
    within_smiles_charset = len(user_input) <= 500 and re.match(
        smiles_pattern, user_input
    )
    #    ... and additionally show structural notation or consist only of
    #    common organic atom symbols, so ordinary words are not swallowed.
    if within_smiles_charset and (
        any(char in user_input for char in "()=[]#@\\/")  # Structural notation
        or re.match(
            r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$", user_input
        )  # Simple organic patterns
        or (
            len(user_input) <= 10
            and re.match(r"^[CNOSPcnops]+\d*$", user_input)
        )  # Short organic
        or (
            len(user_input) >= 3
            and re.match(r"^[CNOSHPFcnoshpf]+$", user_input)
        )  # Common atoms
    ):
        return "SMILES"

    # 6. Molecular formula: one or more element symbols, each optionally
    #    followed by a count ("C9H8O4", "NaCl", "H2", "Cl2"). Every repetition
    #    must start with an uppercase letter, so the pattern cannot split a
    #    digit run in more than one way and fails fast on non-formulas.
    if re.match(r"^(?:[A-Z][a-z]?\d*)+$", user_input):
        return "formula"

    # 7. Default: chemical name
    return "name"
122+
58123 def _query_by_cid (self , cid : str ) -> Optional [str ]:
59124 """
60125 Retrieve the canonical SMILES for a compound by its PubChem CID.
@@ -80,6 +145,71 @@ def _query_by_cid(self, cid: str) -> Optional[str]:
80145 logger .error (f"Error querying by CID { cid } : { str (e )} " )
81146 return None
82147
def _get_cids_by_identifier(
    self, identifier: str, input_type: str, max_cids: int = 10
) -> Optional[List[str]]:
    """
    Get CIDs for a given identifier based on its type.

    A "CID" input is returned as-is without any request. "InChI" and
    "SMILES" are sent in a POST body; "InChIKey", "formula", "name" and
    "CAS" are URL-encoded into the request path (CAS numbers are looked up
    as names). Any other input type yields None.

    Args:
        identifier (str): The chemical identifier
        input_type (str): The type of identifier
        max_cids (int): Maximum number of CIDs to return (default 10)

    Returns:
        Optional[List[str]]: List of CIDs if found, None otherwise. None is
            also returned when the request fails (the error is logged) or
            when the response contains no usable CID.
    """
    if input_type == "CID":
        # Nothing to look up: the identifier already is the CID.
        return [identifier] if identifier.isdigit() else None

    # Input types whose identifier travels in the POST body, mapped to the
    # name used both as URL namespace and as form field.
    post_namespaces = {"InChI": "inchi", "SMILES": "smiles"}
    # Input types whose identifier is embedded in the URL path.
    get_namespaces = {
        "InChIKey": "inchikey",
        "formula": "fastformula",
        "name": "name",
        "CAS": "name",
    }

    try:
        if input_type in post_namespaces:
            namespace = post_namespaces[input_type]
            response = self.session.post(
                f"{self.BASE_URL}/{namespace}/cids/txt",
                data={namespace: identifier},
                timeout=self.timeout,
            )
        elif input_type in get_namespaces:
            namespace = get_namespaces[input_type]
            encoded = quote(identifier, safe="")
            response = self.session.get(
                f"{self.BASE_URL}/{namespace}/{encoded}/cids/txt",
                timeout=self.timeout,
            )
        else:
            return None
        response.raise_for_status()
    except RequestException as e:
        logger.error("Error getting CIDs for %s: %s", identifier, e)
        return None

    # One CID per line. Keep only positive integer entries.
    # NOTE(review): a lone "0" is assumed to be PubChem's placeholder for a
    # structure with no record -- confirm against the PUG REST docs.
    stripped = (line.strip() for line in response.text.splitlines())
    cids = [cid for cid in stripped if cid.isdigit() and cid != "0"]

    # An empty result is reported as None, matching the documented contract.
    return cids[:max_cids] or None
212+
83213 def _query_by_inchi (self , inchi : str ) -> Optional [str ]:
84214 """
85215 Retrieve the canonical SMILES for a compound by its InChI string.
@@ -203,51 +333,21 @@ def _get_smiles(self, user_input: str) -> Optional[str]:
203333
204334 # Trim whitespace
205335 user_input = user_input .strip ()
336+ input_type = self .detect_input_type (user_input )
206337
207- # 1. If input is a CID (all digits)
208- if user_input . isdigit () :
338+ # Route to appropriate query method based on detected type
339+ if input_type == "CID" :
209340 return self ._query_by_cid (user_input )
210-
211- # 2. If input is an InChI (starts with "InChI=")
212- if user_input .startswith ("InChI=" ):
341+ elif input_type == "InChI" :
213342 return self ._query_by_inchi (user_input )
214-
215- # 3. If input is an InChIKey
216- if re .match (r"^[A-Z0-9]{14}-[A-Z0-9]{10}-[A-Z0-9]$" , user_input ):
343+ elif input_type == "InChIKey" :
217344 return self ._query_by_inchikey (user_input )
218-
219- # 4. If input is a CAS number (e.g., "7732-18-5")
220- if re .match (r"^\d{2,7}-\d{2}-\d$" , user_input ):
221- return self ._query_by_name (user_input ) # Handle CAS as name
222-
223- # 5. If input is a molecular formula (e.g., "C6H12O6")
224- if re .match (r"^(?:[A-Z][a-z]?\d+)+$" , user_input ):
345+ elif input_type == "formula" :
225346 return self ._query_by_formula (user_input )
226-
227- # 6. If input is a SMILES string - IMPROVED VERSION
228- # SMILES typically contain organic chemistry characters and structural notation
229- # Common SMILES characters: C, N, O, S, P, F, Cl, Br, I, H, numbers,
230- # parentheses (), equals =, hash #, plus +, minus -, forward slash /,
231- # backslash \, at symbol @, square brackets []
232- smiles_pattern = r"^[A-Za-z0-9\(\)\[\]=#+\-\\/\\@\.%:]*$"
233-
234- if (
235- " " not in user_input # SMILES shouldn't contain spaces
236- and len (user_input ) >= 1 # Allow very short SMILES
237- and len (user_input ) <= 500 # Reasonable maximum length
238- and re .match (smiles_pattern , user_input )
239- # Improved heuristic: SMILES usually contain structural elements OR basic organic patterns
240- and (
241- any (char in user_input for char in "()=[]#@\\ /" ) # Structural notation
242- or re .match (
243- r"^[CNOPS]+(Cl|Br|[cnops]|\d)*$" , user_input
244- ) # Simple organic patterns
245- )
246- ):
347+ elif input_type == "SMILES" :
247348 return self ._query_by_smiles (user_input )
248-
249- # 7. Default: treat as a chemical name
250- return self ._query_by_name (user_input )
349+ else : # name or CAS
350+ return self ._query_by_name (user_input )
251351
252352 def get_smiles (self , user_input : str ) -> Optional [str ]:
253353 """
@@ -278,3 +378,62 @@ def get_smiles(self, user_input: str) -> Optional[str]:
278378 'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O'
279379 """
280380 return self ._get_smiles_cached (user_input )
381+
def get_compound_info(self, user_input: str) -> Dict[str, Any]:
    """
    Look up a compound on PubChem and summarize what was found.

    The input is classified with ``detect_input_type``, resolved to CIDs
    with ``_get_cids_by_identifier``, and the canonical SMILES is fetched
    for the first CID only via ``_query_by_cid``.

    Args:
        user_input (str): The chemical identifier

    Returns:
        Dict[str, Any]: Dictionary containing:
            - input: the input as received when it is empty or not a
              string, otherwise the whitespace-stripped input
            - input_type: detected input type ("unknown" for invalid input)
            - canonical_smiles: canonical SMILES of the first CID, or None
            - cids: list of PubChem CIDs, or None when none were found
            - pubchem_links: one compound page URL per CID, or None
            - success: True only when a canonical SMILES was obtained
    """

    def summarize(value, kind, smiles=None, found=None):
        # Single place where the result shape is defined; links exist only
        # when CIDs do.
        links = None
        if found is not None:
            links = [
                f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}" for cid in found
            ]
        return {
            "input": value,
            "input_type": kind,
            "canonical_smiles": smiles,
            "cids": found,
            "pubchem_links": links,
            "success": smiles is not None,
        }

    # Reject empty or non-string input before touching any lookup helper.
    if not (user_input and isinstance(user_input, str)):
        return summarize(user_input, "unknown")

    cleaned = user_input.strip()
    kind = self.detect_input_type(cleaned)

    found = self._get_cids_by_identifier(cleaned, kind)
    if not found:
        # Covers both None and an empty list from the CID lookup.
        return summarize(cleaned, kind)

    # Only the first CID is resolved to a structure; all CIDs get a link.
    return summarize(cleaned, kind, self._query_by_cid(found[0]), found)
0 commit comments