Correct docstrings for functions and classes in the alphagenome data module.

s6juncheng · copybara-github · commit 4dd45227a004 · 2026-03-17T16:35:09.000-07:00
PiperOrigin-RevId: 885272239
Change-Id: I57de4e56c4035342b2d90b37a11c288cd05dc191
diff --git a/src/alphagenome/data/fold_intervals.py b/src/alphagenome/data/fold_intervals.py
@@ -75,7 +75,7 @@ def get_all_folds() -> list[str]:
 def get_fold_names(
     model_version: dna_client.ModelVersion, subset: Subset
 ) -> list[str]:
-  """Returns the data folds used for the model version."""
+  """Returns the names of the folds for a given model version and subset."""
   match subset:
     case Subset.VALID:
       return [_VALID_FOLD[_MODEL_VERSION_TO_FOLD[model_version]]]
@@ -101,7 +101,7 @@ def get_fold_intervals(
     subset: Subset,
     example_regions_path: str | None = None,
 ) -> pd.DataFrame:
-  """Returns the training intervals for the model version."""
+  """Returns the intervals for a given model version and subset."""
   if example_regions_path is None:
     example_regions_path = _DEFAULT_EXAMPLE_REGIONS[organism]
 
diff --git a/src/alphagenome/data/gene_annotation.py b/src/alphagenome/data/gene_annotation.py
@@ -80,7 +80,8 @@ def extract_tss(gtf: pd.DataFrame, feature: str = 'transcript') -> pd.DataFrame:
     feature: Feature in the GTF file to use (either transcript or gene).
 
   Returns:
-    pd.DataFrame containing transcription start sites (width=0, 0-based).
+    pd.DataFrame containing transcription start sites as zero-width point
+    intervals (Start == End, 0-based).
   """
   tss = gtf[(gtf.Feature == feature)].copy()
 
diff --git a/src/alphagenome/data/genome.py b/src/alphagenome/data/genome.py
@@ -300,8 +300,9 @@ def truncate(self, reference_length: int = sys.maxsize) -> Self:
   def center(self, use_strand: bool = True) -> int:
     """Computes the center of the interval.
 
-    For intervals with an odd width, the center is rounded down to the nearest
-    integer.
+    For intervals with an odd width, the center is rounded up for
+    positive/unstranded intervals and rounded down for negative strand
+    intervals.
 
     If `use_strand` is True and the interval is on the negative strand, the
     center is calculated differently to maintain consistency when stacking
diff --git a/src/alphagenome/data/junction_data.py b/src/alphagenome/data/junction_data.py
@@ -78,12 +78,12 @@ def num_tracks(self) -> int:
 
   @property
   def names(self) -> np.ndarray:
-    """Returns a list of track names (not necessarily unique)."""
+    """Returns an array of track names (not necessarily unique)."""
     return self.metadata['name'].values
 
   @property
   def strands(self) -> np.ndarray:
-    """Returns a list of track strands."""
+    """Returns an array of junction strands."""
     return np.array([j.strand for j in self.junctions])
 
   @property
@@ -223,7 +223,7 @@ def get_junctions_to_plot(
 ) -> list[genome.Junction]:
   """Gets a list of junctions to plot.
 
-  Filters the junctions in the `predictions` by ontology term and strand, and
+  Filters the junctions in the `predictions` by name and strand, and
   applies a threshold on the `k` value (read count).
 
   Args:
@@ -237,7 +237,7 @@ def get_junctions_to_plot(
     A list of `Junction` objects to plot.
 
   Raises:
-    ValueError: If more than one track is found for the specified ontology term.
+    ValueError: If more than one track is found for the specified name.
   """
   filtered = predictions.filter_by_name(name)
   if filtered.num_tracks > 1:
diff --git a/src/alphagenome/data/track_data.py b/src/alphagenome/data/track_data.py
@@ -69,7 +69,7 @@ class TrackData:
 
   Valid shapes of `TrackData.values` are:
 
-    * [positional_bins]
+    * [num_tracks]
     * [positional_bins, num_tracks]
     * [positional_bins, positional_bins, num_tracks]
     * ...
@@ -352,7 +352,7 @@ def upsample(
       resolution: int,
       aggregation_type: AggregationType = AggregationType.SUM,
   ) -> 'TrackData':
-    """Upsamples the track data to a higher resolution by repeating existing values.
+    """Upsamples the track data to a higher resolution.
 
     Args:
       resolution: The desired resolution in base pairs.
@@ -394,7 +394,7 @@ def downsample(
       resolution: int,
       aggregation_type: AggregationType = AggregationType.SUM,
   ) -> 'TrackData':
-    """Downsamples the track data to a lower resolution using sum pooling.
+    """Downsamples the track data to a lower resolution.
 
     Args:
       resolution: The desired resolution in base pairs.
diff --git a/src/alphagenome/data/transcript.py b/src/alphagenome/data/transcript.py
@@ -40,7 +40,7 @@ class Transcript:
     exons: A list of `genome.Interval`s representing exons within transcript.
       Each `Transcript` must contain exons.
     cds: An optional list of `genome.Interval`s representing coding sequences
-      (CDS) within a transcript. CDS include start codon and exclude top codon.
+      (CDS) within a transcript. CDS include start codon and exclude stop codon.
     start_codon: An optional list of `genome.Interval`s representing a single
       start codon. Start codons can be split by introns, therefore might have
       more than one genomic interval. Some coding transcripts are missing start
@@ -50,7 +50,7 @@ class Transcript:
       than one genomic interval. Some transcripts coding transcripts are missing
       stop codons, e.g., ENST00000574051.5.
     transcript_id: An optional string representing a transcript id.
-    gene_id: An optional string representing a protein id.
+    gene_id: An optional string representing a gene id.
     protein_id: An optional string representing a protein id which is encoded by
       the transcript.
     uniprot_id: An optional UniprotKB-AC id string.
@@ -83,9 +83,9 @@ class Transcript:
       transcript or UTRs can be split by introns.
     splice_regions: a list of splice regions within a transcript.
     splice_donor_sites: a list of splice donor sites. Commonly, the RNA sequence
-      that is removed ends with AG at its 3′ end.
+      that is removed begins with the dinucleotide GU at its 5′ end.
     splice_acceptor_sites: a list of splice acceptor sites. Commonly, the RNA
-      sequence that is removed begins with the dinucleotide GU at its 5′ end.
+      sequence that is removed ends with AG at its 3′ end.
     splice_donors: a list of splice donors. The first nucleotide of the intron
       (0-based).
     splice_acceptors: a list of splice acceptors. The last nucleotide of the