33import numpy as np
44import pandas as pd
55from chemap import FingerprintConfig , compute_fingerprints
6- from chemap .fingerprint_conversions import fingerprints_to_csr
6+ from chemap .fingerprint_conversions import (
7+ fingerprints_to_csr ,
8+ fingerprints_to_tfidf ,
9+ idf_normalized ,
10+ )
711from chemap .metrics import (
812 tanimoto_distance_dense ,
913 tanimoto_distance_sparse ,
@@ -53,7 +57,7 @@ def create_chem_space_umap(
5357 fpgen : Optional [Any ] = None ,
5458 fingerprint_config : Optional [FingerprintConfig ] = None ,
5559 show_progress : bool = True ,
56- log_count : bool = False ,
60+ scaling : str = None ,
5761 # UMAP (CPU / umap-learn)
5862 n_neighbors : int = 100 ,
5963 min_dist : float = 0.25 ,
@@ -80,9 +84,9 @@ def create_chem_space_umap(
8084 FingerprintConfig(count=True, folded=False, invalid_policy="raise")
8185 show_progress:
8286 Forwarded to compute_fingerprints.
83- log_count :
84- If True, apply np.log1p to counts (works for sparse CSR and dense arrays) .
85- (For binary fingerprints this is harmless)
87+ scaling :
88+ Define scaling for count fingerprints. Default is None, which means no scaling .
89+ Can be set to "log" for log1p scaling, or to "tfidf" for TF-IDF scaling of bits.
8690 n_neighbors, min_dist, umap_random_state:
8791 Standard UMAP parameters.
8892 n_jobs:
@@ -137,14 +141,20 @@ def create_chem_space_umap(
137141
138142 if not fingerprint_config .folded :
139143 # Convert to CSR matrix
140- fps_csr = fingerprints_to_csr (fingerprints ).X
144+ if scaling == "tfidf" :
145+ fps_csr = fingerprints_to_tfidf (fingerprints ).X
146+ else :
147+ fps_csr = fingerprints_to_csr (fingerprints ).X
141148
142- if log_count :
143- # Works well for count fingerprints ( for binary it's essentially unchanged).
144- fps_csr = _log1p_csr_inplace (fps_csr )
149+ if scaling == "log" :
150+ fps_csr = _log1p_csr_inplace (fps_csr )
145151
146152 coords = reducer .fit_transform (fps_csr )
147153 else :
154+ if scaling == "log" :
155+ fingerprints = np .log1p (fingerprints )
156+ elif scaling == "tfidf" :
157+ fingerprints *= idf_normalized ((fingerprints > 0 ).sum (axis = 0 ), fingerprints .shape [0 ])
148158 coords = reducer .fit_transform (fingerprints )
149159
150160 df [x_col ] = coords [:, 0 ]
@@ -163,13 +173,39 @@ def create_chem_space_umap_gpu(
163173 fpgen : Optional [Any ] = None ,
164174 fingerprint_config : Optional [FingerprintConfig ] = None ,
165175 show_progress : bool = True ,
166- log_count : bool = False ,
176+ scaling : str = None ,
167177 # UMAP (GPU / cuML)
168178 n_neighbors : int = 100 ,
169179 min_dist : float = 0.25 ,
170180) -> pd .DataFrame :
171181 """Compute fingerprints and create 2D UMAP coordinates using cuML (GPU).
172182
183+ Parameters
184+ ----------
185+ data:
186+ Input dataframe containing a SMILES column.
187+ col_smiles:
188+ Name of the SMILES column.
189+ inplace:
190+ If True, write x/y columns into `data` and return it. Else returns a copy.
191+ x_col, y_col:
192+ Output coordinate column names.
193+ fpgen:
194+ RDKit fingerprint generator. Defaults to Morgan radius=9, fpSize=4096.
195+ fingerprint_config:
196+ FingerprintConfig for chemap.compute_fingerprints. Defaults to:
197+ FingerprintConfig(count=True, folded=False, invalid_policy="raise")
198+ show_progress:
199+ Forwarded to compute_fingerprints.
200+ scaling:
201+ Define scaling for count fingerprints. Default is None, which means no scaling.
202+ Can be set to "log" for log1p scaling, or to "tfidf" for TF-IDF scaling of bits.
203+ n_neighbors, min_dist:
204+ Standard UMAP parameters.
205+ n_jobs:
206+ Passed to umap-learn UMAP for parallelism. Ignores random_state when n_jobs != 1.
207+ Default -1 uses all CPUs.
208+
173209 Notes
174210 -----
175211 - cuML UMAP here is fixed to metric="cosine"
@@ -222,12 +258,12 @@ def create_chem_space_umap_gpu(
222258 )
223259
224260 # Reduce memory footprint (works well for count fingerprints)
225- if not log_count :
226- # stays integer-like
227- fps = fingerprints .astype (np .int8 , copy = False )
261+ if scaling == "log" :
262+ fingerprints = np .log1p (fingerprints ).astype (np .float32 , copy = False )
263+ elif scaling == "tfidf" :
264+ fingerprints *= idf_normalized ((fingerprints > 0 ).sum (axis = 0 ), fingerprints .shape [0 ])
228265 else :
229- # log1p returns float
230- fps = np .log1p (fingerprints ).astype (np .float32 , copy = False )
266+ fingerprints = fingerprints .astype (np .int8 , copy = False )
231267
232268 umap_model = cuUMAP (
233269 n_neighbors = int (n_neighbors ),
@@ -238,7 +274,7 @@ def create_chem_space_umap_gpu(
238274 n_components = 2 ,
239275 )
240276
241- coords = umap_model .fit_transform (fps )
277+ coords = umap_model .fit_transform (fingerprints )
242278
243279 # cuML may return cupy/cudf-backed arrays; np.asarray makes it safe for pandas columns.
244280 coords_np = np .asarray (coords )
0 commit comments