Skip to content

Commit 4dd4522

Browse files
s6juncheng authored and copybara-github committed
Correct docstring for a few functions in the alphagenome data module.
PiperOrigin-RevId: 885272239 Change-Id: I57de4e56c4035342b2d90b37a11c288cd05dc191
1 parent 6973cfe commit 4dd4522

6 files changed

Lines changed: 18 additions & 16 deletions

File tree

src/alphagenome/data/fold_intervals.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def get_all_folds() -> list[str]:
7575
def get_fold_names(
7676
model_version: dna_client.ModelVersion, subset: Subset
7777
) -> list[str]:
78-
"""Returns the data folds used for the model version."""
78+
"""Returns the names of the folds for a given model version and subset."""
7979
match subset:
8080
case Subset.VALID:
8181
return [_VALID_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]
@@ -101,7 +101,7 @@ def get_fold_intervals(
101101
subset: Subset,
102102
example_regions_path: str | None = None,
103103
) -> pd.DataFrame:
104-
"""Returns the training intervals for the model version."""
104+
"""Returns the intervals for a given model version and subset."""
105105
if example_regions_path is None:
106106
example_regions_path = _DEFAULT_EXAMPLE_REGIONS[organism]
107107

src/alphagenome/data/gene_annotation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ def extract_tss(gtf: pd.DataFrame, feature: str = 'transcript') -> pd.DataFrame:
8080
feature: Feature in the GTF file to use (either transcript or gene).
8181
8282
Returns:
83-
pd.DataFrame containing transcription start sites (width=0, 0-based).
83+
pd.DataFrame containing transcription start sites as zero-width point
84+
intervals (Start == End, 0-based).
8485
"""
8586
tss = gtf[(gtf.Feature == feature)].copy()
8687

src/alphagenome/data/genome.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,9 @@ def truncate(self, reference_length: int = sys.maxsize) -> Self:
300300
def center(self, use_strand: bool = True) -> int:
301301
"""Computes the center of the interval.
302302
303-
For intervals with an odd width, the center is rounded down to the nearest
304-
integer.
303+
For intervals with an odd width, the center is rounded up for
304+
positive/unstranded intervals and rounded down for negative strand
305+
intervals.
305306
306307
If `use_strand` is True and the interval is on the negative strand, the
307308
center is calculated differently to maintain consistency when stacking

src/alphagenome/data/junction_data.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,12 @@ def num_tracks(self) -> int:
7878

7979
@property
8080
def names(self) -> np.ndarray:
81-
"""Returns a list of track names (not necessarily unique)."""
81+
"""Returns an array of track names (not necessarily unique)."""
8282
return self.metadata['name'].values
8383

8484
@property
8585
def strands(self) -> np.ndarray:
86-
"""Returns a list of track strands."""
86+
"""Returns an array of junction strands."""
8787
return np.array([j.strand for j in self.junctions])
8888

8989
@property
@@ -223,7 +223,7 @@ def get_junctions_to_plot(
223223
) -> list[genome.Junction]:
224224
"""Gets a list of junctions to plot.
225225
226-
Filters the junctions in the `predictions` by ontology term and strand, and
226+
Filters the junctions in the `predictions` by name and strand, and
227227
applies a threshold on the `k` value (read count).
228228
229229
Args:
@@ -237,7 +237,7 @@ def get_junctions_to_plot(
237237
A list of `Junction` objects to plot.
238238
239239
Raises:
240-
ValueError: If more than one track is found for the specified ontology term.
240+
ValueError: If more than one track is found for the specified name.
241241
"""
242242
filtered = predictions.filter_by_name(name)
243243
if filtered.num_tracks > 1:

src/alphagenome/data/track_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class TrackData:
6969
7070
Valid shapes of `TrackData.values` are:
7171
72-
* [positional_bins]
72+
* [num_tracks]
7373
* [positional_bins, num_tracks]
7474
* [positional_bins, positional_bins, num_tracks]
7575
* ...
@@ -352,7 +352,7 @@ def upsample(
352352
resolution: int,
353353
aggregation_type: AggregationType = AggregationType.SUM,
354354
) -> 'TrackData':
355-
"""Upsamples the track data to a higher resolution by repeating existing values.
355+
"""Upsamples the track data to a higher resolution.
356356
357357
Args:
358358
resolution: The desired resolution in base pairs.
@@ -394,7 +394,7 @@ def downsample(
394394
resolution: int,
395395
aggregation_type: AggregationType = AggregationType.SUM,
396396
) -> 'TrackData':
397-
"""Downsamples the track data to a lower resolution using sum pooling.
397+
"""Downsamples the track data to a lower resolution.
398398
399399
Args:
400400
resolution: The desired resolution in base pairs.

src/alphagenome/data/transcript.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class Transcript:
4040
exons: A list of `genome.Interval`s representing exons within transcript.
4141
Each `Transcript` must contain exons.
4242
cds: An optional list of `genome.Interval`s representing coding sequences
43-
(CDS) within a transcript. CDS include start codon and exclude top codon.
43+
(CDS) within a transcript. CDS include start codon and exclude stop codon.
4444
start_codon: An optional list of `genome.Interval`s representing a single
4545
start codon. Start codons can be split by introns, therefore might have
4646
more than one genomic interval. Some coding transcripts are missing start
@@ -50,7 +50,7 @@ class Transcript:
5050
than one genomic interval. Some coding transcripts are missing
5151
stop codons, e.g., ENST00000574051.5.
5252
transcript_id: An optional string representing a transcript id.
53-
gene_id: An optional string representing a protein id.
53+
gene_id: An optional string representing a gene id.
5454
protein_id: An optional string representing a protein id which is encoded by
5555
the transcript.
5656
uniprot_id: An optional UniprotKB-AC id string.
@@ -83,9 +83,9 @@ class Transcript:
8383
transcript or UTRs can be split by introns.
8484
splice_regions: a list of splice regions within a transcript.
8585
splice_donor_sites: a list of splice donor sites. Commonly, the RNA sequence
86-
that is removed ends with AG at its 3′ end.
86+
that is removed begins with the dinucleotide GU at its 5′ end.
8787
splice_acceptor_sites: a list of splice acceptor sites. Commonly, the RNA
88-
sequence that is removed begins with the dinucleotide GU at its 5′ end.
88+
sequence that is removed ends with AG at its 3′ end.
8989
splice_donors: a list of splice donors. The first nucleotide of the intron
9090
(0-based).
9191
splice_acceptors: a list of splice acceptors. The last nucleotide of the

0 commit comments

Comments
 (0)