|
1 | 1 | import logging |
2 | 2 | import os |
3 | | -from typing import Union |
| 3 | +from typing import Tuple, Union |
4 | 4 |
|
5 | 5 | import pandas as pd |
6 | 6 | import synapseclient |
@@ -114,7 +114,7 @@ class cna(FileTypeFormat): |
114 | 114 |
|
115 | 115 | _process_kwargs = ["newPath"] |
116 | 116 |
|
117 | | - _validation_kwargs = ["nosymbol_check"] |
| 117 | + _validation_kwargs = ["skip_database_checks"] |
118 | 118 |
|
119 | 119 | # VALIDATE FILENAME |
120 | 120 | def _validateFilename(self, filePath): |
@@ -175,7 +175,18 @@ def process_steps(self, cnaDf, newPath): |
175 | 175 | self.syn.store(synapseclient.File(newPath, parent=centerMafSynId)) |
176 | 176 | return newPath |
177 | 177 |
|
178 | | - def _validate(self, cnvDF, nosymbol_check): |
| 178 | + def _validate(self, cnvDF: pd.DataFrame, skip_database_checks: bool) -> Tuple: |
| 179 | + """ |
| 180 | + Validates the values of the input cna file |
| 181 | +
|
| 182 | + Args: |
| 183 | + cnvDF (pd.DataFrame): input CNA file |
| 184 | + skip_database_checks (bool): Whether to skip this validation check |
| 185 | + since it requires access to the internal clinical sample database |
| 186 | +
|
| 187 | + Returns: |
| 188 | + Tuple: complete error and warning messages |
| 189 | + """ |
179 | 190 | total_error = "" |
180 | 191 | warning = "" |
181 | 192 | cnvDF.columns = [col.upper() for col in cnvDF.columns] |
@@ -220,27 +231,49 @@ def _validate(self, cnvDF, nosymbol_check): |
220 | 231 | ) |
221 | 232 | else: |
222 | 233 | cnvDF["HUGO_SYMBOL"] = keepSymbols |
223 | | - if haveColumn and not nosymbol_check: |
224 | | - bedSynId = self.genie_config["bed"] |
225 | | - bed = self.syn.tableQuery( |
226 | | - f"select Hugo_Symbol, ID from {bedSynId} " |
227 | | - f"where CENTER = '{self.center}'" |
228 | | - ) |
229 | | - bedDf = bed.asDataFrame() |
230 | | - cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply( |
231 | | - lambda x: validateSymbol(x, bedDf) |
| 234 | + if haveColumn: |
| 235 | + total_error += self.validate_no_dup_symbols_after_remapping( |
| 236 | + cnvDF=cnvDF, skip_database_checks=skip_database_checks |
232 | 237 | ) |
233 | | - cnvDF = cnvDF[~cnvDF["remapped"].isnull()] |
234 | | - |
235 | | - # Do not allow any duplicated genes after symbols |
236 | | - # have been remapped |
237 | | - if sum(cnvDF["remapped"].duplicated()) > 0: |
238 | | - duplicated = cnvDF["remapped"].duplicated(keep=False) |
239 | | - total_error += ( |
240 | | - "Your CNA file has duplicated Hugo_Symbols " |
241 | | - "(After remapping of genes): {} -> {}.\n".format( |
242 | | - ",".join(cnvDF["HUGO_SYMBOL"][duplicated]), |
243 | | - ",".join(cnvDF["remapped"][duplicated]), |
244 | | - ) |
245 | | - ) |
246 | 238 | return (total_error, warning) |
| 239 | + |
def validate_no_dup_symbols_after_remapping(
    self, cnvDF: pd.DataFrame, skip_database_checks: bool
) -> str:
    """Validate that no Hugo_Symbol is duplicated after remapping.

    Each value of the HUGO_SYMBOL column is remapped with validateSymbol
    against this center's rows of the bed database table (see
    validateSymbol for details on the remapping method). Rows whose
    symbol remaps to a null value are ignored; any remapped symbol that
    then occurs more than once is reported as an error.

    Args:
        cnvDF (pd.DataFrame): input CNA data; must contain a HUGO_SYMBOL
            column. The dataframe is not modified by this check.
        skip_database_checks (bool): Whether to skip this validation check
            since it requires access to the internal bed database

    Returns:
        str: error message, or an empty string when the check is skipped
            or no duplicated symbols are found
    """
    # Guard clause: nothing below can run without database access.
    if skip_database_checks:
        return ""

    bedSynId = self.genie_config["bed"]
    bed = self.syn.tableQuery(
        f"select Hugo_Symbol, ID from {bedSynId} "
        f"where CENTER = '{self.center}'"
    )
    bedDf = bed.asDataFrame()

    # Keep the remapped symbols in a local Series instead of writing a
    # "remapped" column into the caller's dataframe.
    symbols = cnvDF["HUGO_SYMBOL"]
    remapped = symbols.apply(lambda x: validateSymbol(x, bedDf))

    # Symbols that could not be remapped are excluded from the check.
    # Both Series are filtered with the same mask so they stay aligned.
    was_remapped = remapped.notnull()
    symbols = symbols[was_remapped]
    remapped = remapped[was_remapped]

    # Do not allow any duplicated genes after symbols have been remapped.
    # keep=False flags every occurrence so the message lists all of them.
    duplicated = remapped.duplicated(keep=False)
    if not duplicated.any():
        return ""

    return (
        "Your CNA file has duplicated Hugo_Symbols "
        "(After remapping of genes): {} -> {}.\n".format(
            ",".join(symbols[duplicated]),
            ",".join(remapped[duplicated]),
        )
    )
0 commit comments