diff --git a/src/pg2_benchmark/dummy_data.py b/src/pg2_benchmark/dummy_data.py
index edd0e4a0..eb7ba4ad 100644
--- a/src/pg2_benchmark/dummy_data.py
+++ b/src/pg2_benchmark/dummy_data.py
@@ -110,9 +110,21 @@ def charge_mutations(sequence: str, n: int) -> str:
 
 def charge_ladder_dataset(n_rows: int = 200, seq_len: int = 20) -> pd.DataFrame:
+    """Build a dataset of charge variants derived from one parent sequence.
+
+    Up to ``n_rows`` variants are drawn with ``charge_mutations``; duplicates
+    are dropped, so the result may contain fewer than ``n_rows`` rows.
+
+    Returns a DataFrame with a ``sequence`` column and a ``charge`` column
+    holding ``peptide_charge`` of each sequence.
+    """
     parent = _generate_sequence(seq_len)
-    sequences = [
+    # Deduplicate while keeping first-seen order: pandas rejects a set as
+    # column data, and set iteration order is not reproducible across runs.
+    candidates = (
         charge_mutations(parent, random.randint(0, seq_len))
         for _ in range(n_rows)
-    ]
+    )
+    sequences = list(dict.fromkeys(candidates))
+
     charge = [peptide_charge(seq) for seq in sequences]
     return pd.DataFrame({"sequence": sequences, "charge": charge})