@@ -23,6 +23,12 @@ def inchikey14_from_full(inchikey: str) -> Optional[str]:
2323
2424def encode_sparse_fp (bits : Optional [np .ndarray ], counts : Optional [np .ndarray ]) -> tuple [bytes , bytes ]:
2525 """Store bits as uint32 indices, counts as int32
26+
27+ Parameters
28+ ----------
29+ bits : array-like of uint32 bit indices
30+ counts : array-like of int32 counts
31+
2632 Returns (bits_blob, counts_blob). Accepts None -> empty blobs."""
2733 if bits is None :
2834 b = b""
@@ -41,14 +47,23 @@ def encode_sparse_fp(bits: Optional[np.ndarray], counts: Optional[np.ndarray]) -
4147 return b , c
4248
def decode_sparse_fp(bits_blob: bytes, counts_blob: bytes) -> tuple[np.ndarray, np.ndarray]:
    """Inverse of encode_sparse_fp.

    Parameters
    ----------
    bits_blob : bytes
        BLOB bytes of uint32 bit indices.
    counts_blob : bytes
        BLOB bytes of int32 counts.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(bits_uint32, counts_int32)``. An empty (falsy) blob decodes to an
        empty array of the corresponding dtype.
    """
    # .copy() detaches the result from the blob's buffer; np.frombuffer on its
    # own only returns a view onto the input bytes.
    bits = np.frombuffer(bits_blob, dtype=np.uint32).copy() if bits_blob else np.zeros(0, dtype=np.uint32)
    # Signedness is not recorded in the blob: counts are always read back as int32.
    counts = np.frombuffer(counts_blob, dtype=np.int32).copy() if counts_blob else np.zeros(0, dtype=np.int32)
    return bits, counts
4963
def decode_fp_blob(blob: bytes) -> np.ndarray:
    """Decode fingerprint BLOB back to uint8 array.

    The length is not stored separately; it is inferred from the blob size
    (one array element per byte).

    Parameters
    ----------
    blob : bytes
        Raw fingerprint bytes. An empty (falsy) blob yields an empty array.

    Returns
    -------
    np.ndarray
        1-D uint8 array holding a copy of the blob's bytes.
    """
    if not blob:
        return np.zeros(0, dtype=np.uint8)
    # Copy so the result does not share memory with the input buffer.
    return np.frombuffer(blob, dtype=np.uint8).copy()
@@ -62,8 +77,22 @@ def compute_fingerprints(
6277 progress_bar : bool = True ,
6378 ) -> np .ndarray :
6479 """
65- Placeholder: compute a molecular fingerprint from SMILES or InChI.
66- For now return a dummy vector (replace with RDKit/Morgan etc. later).
80+ Compute a molecular fingerprint from SMILES or InChI.
81+
82+ Parameters
83+ ----------
84+ smiles : list of str or None
85+ SMILES strings to compute the fingerprints from.
86+ inchis : list of str or None
87+ InChI strings to compute the fingerprints from (used if smiles is None).
88+ sparse : bool
89+ If True, compute sparse fingerprint (indices/counts); else dense bit vector.
90+ count : bool
91+ If True, compute count-based fingerprint; else binary fingerprint.
92+ radius : int
93+ Radius for Morgan fingerprint. Default 9.
94+ progress_bar : bool
95+ Whether to show a progress bar during computation. Default True.
6796 """
6897 fpgen = rdFingerprintGenerator .GetMorganGenerator (radius = radius , fpSize = 4096 )
6998
@@ -94,15 +123,29 @@ def compute_fingerprints(
94123# Compound database (compounds table) in SQLite
95124# ==================================================
96125
97- # --- keep your imports/utilities as-is (encode/decode utils etc.) ---
98-
99126@dataclass
100127class CompoundDatabase :
128+ """SQLite-based compound database with sparse fingerprint storage.
129+ Stores compounds identified by inchikey14, with optional metadata and molecular fingerprints.
130+
131+ Attributes
132+ ----------
133+ sqlite_path : str
134+ Path to the SQLite database file.
135+ compound_fields : List[str]
136+ List of metadata fields to store for each compound.
137+ fingerprint_radius : int
138+ Radius for Morgan fingerprint computation (used in backfill).
139+ fingerprint_sparse : bool
140+ Whether to store fingerprints as sparse (True) or dense (False) (used in backfill).
141+ fingerprint_count : bool
142+ Whether to store count-based (True) or binary (False) fingerprints (used in backfill).
143+ """
101144 sqlite_path : str
102145 compound_fields : List [str ] = field (default_factory = lambda : [
103146 "smiles" , "inchi" , "inchikey" , "classyfire_class" , "classyfire_superclass"
104147 ])
105- # Default FP parameters (used by the backfill method if you pass through to your compute_fingerprints)
148+ # Default FP parameters (used by the backfill method for compute_fingerprints)
106149 fingerprint_radius : int = 9
107150 fingerprint_sparse : bool = True
108151 fingerprint_count : bool = True
@@ -142,7 +185,7 @@ def _ensure_schema(self):
142185 cur .execute (f"ALTER TABLE compounds ADD COLUMN { name } { typ } " )
143186 self ._conn .commit ()
144187
145- # ---------- UPSERTS: write metadata only; DO NOT compute fingerprints here ----------
188+ # ---------- UPSERTS ----------
146189
147190 def upsert_compound (
148191 self ,
@@ -420,7 +463,7 @@ def _update_rows(comp_ids: List[str], results: List[Optional[Tuple[np.ndarray, n
420463 comp_ids = [r [0 ] for r in rows ]
421464 reps = [r [1 ] for r in rows ] # list[str] of smiles or inchi
422465
423- # call your project-level function ONCE for the whole batch
466+ # call compute_fingerprints ONCE for the whole batch
424467 results = compute_fingerprints (
425468 smiles = reps if which == "smiles" else None ,
426469 inchis = reps if which == "inchi" else None ,
0 commit comments