flaky-tests-in-ci/src/analysis/utils.py at master · tum-i4/flaky-tests-in-ci · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
from datetime import date, timedelta
from pathlib import Path
from typing import List, Optional, Tuple, Dict, Any
from enum import Enum
import logging
import os

import psutil
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, FloatType, ArrayType
import pyspark.sql.functions as f

from paths import (
    RAW_DATA_PATH, VALIDATION_DATA_PATH, TEB_PATH, TEST_CASE_PATH
)

# Configure logging
logger = logging.getLogger(__name__)

# ==================== Constants ====================

# Study periods, keyed by project identifier: (start date, end date).
STUDY_PERIODS: Dict[str, Tuple[date, date]] = {
    "google-chromium": (
        date(year=2024, month=7, day=1),
        date(year=2024, month=8, day=31),
    ),
    "cqse-teamscale": (
        date(year=2024, month=2, day=1),
        date(year=2024, month=3, day=31),
    ),
    "gitlab-org-gitlab": (
        date(year=2024, month=8, day=1),
        date(year=2024, month=9, day=30),
    ),
    "microsoft-playwright": (
        date(year=2024, month=8, day=1),
        date(year=2024, month=9, day=30),
    ),
}

# Spark configuration: thresholds below which a warning is logged.
MIN_CPUS = 6
# Budget noted by the original author: 8GB driver + 8GB executor + 4GB off-heap.
MIN_MEMORY_GB = 20

# Verdict labels stored in the "verdict" column of test execution batches.
VERDICT_PASSED = "passed"
VERDICT_FLAKY_DETECTED = "flaky (detected)"
VERDICT_FLAKY_UNDETECTED_SURVIVED_IMMEDIATE_RERUN = (
    "flaky (undetected survived immediate rerun)"
)
VERDICT_FLAKY_UNDETECTED_UNCHALLENGED_BY_IMMEDIATE_RERUN = (
    "flaky (undetected unchallenged by immediate rerun)"
)
VERDICT_FAILED_SURVIVED_IMMEDIATE_RERUN = "failed (with immediate reruns)"
VERDICT_FAILED_UNCHALLENGED_BY_IMMEDIATE_RERUN = "failed (no immediate reruns)"
VERDICT_FAILED_UNVALIDATED = "failed (unvalidated)"

# Number of characters taken from a commit hash when joining CI and
# validation data (the standard git short-hash length).
HASH_LENGTH = 7

# ==================== Enums ====================

class Project(Enum):
    """Projects covered by the study; each value is the project's identifier string."""

    TEAMSCALE = "cqse-teamscale"
    CHROMIUM = "google-chromium"
    GITLAB = "gitlab-org-gitlab"
    PLAYWRIGHT = "microsoft-playwright"

    def __str__(self) -> str:
        """Render the member as its identifier string."""
        return self.value

    @property
    def study_period(self) -> Tuple[date, date]:
        """
        Look up this project's (start, end) dates in STUDY_PERIODS.

        Raises:
            ValueError: If STUDY_PERIODS has no entry for this project.
        """
        period = STUDY_PERIODS.get(self.value)
        if period is None:
            raise ValueError(f"No study period defined for project {self.value}")
        return period

# ==================== Resource Detection & Spark Configuration ====================


def _detect_available_cpus() -> int:
    """Detect available CPU cores, respecting container limits."""
    try:
        return len(os.sched_getaffinity(0))  # Respects cgroup limits
    except AttributeError:
        return os.cpu_count() or 1  # Fallback for macOS


def _detect_available_memory_gb() -> int:
    """
    Return 90% of the currently available memory in whole GB (at least 1).

    The figure comes from ``psutil.virtual_memory().available``. If detection
    fails for any reason, a warning is logged and 16 is returned.

    NOTE(review): the original docstring claims container limits are
    respected; nothing in this function checks them explicitly - confirm.
    """
    bytes_per_gb = 1024 ** 3
    try:
        free_gb = psutil.virtual_memory().available / bytes_per_gb
        # Keep a 10% safety margin and never report less than 1GB.
        return max(1, int(free_gb * 0.9))
    except Exception as e:
        logger.warning(f"Could not detect memory: {e}. Using default 16GB")
        return 16

def _build_spark_config() -> Dict[str, str]:
    """
    Derive Spark settings from the CPU and memory detected on this machine.

    Memory split (all values in whole GB):

    - 15% of the detected memory is set aside for the OS and other processes.
    - Off-heap memory is 5% of the detected memory, clamped to 4..30.
    - The driver receives whatever is left, but never less than 8.

    A warning is logged (execution continues) when fewer than MIN_CPUS cores
    or less than MIN_MEMORY_GB of memory are detected. Shuffle partitions are
    set to twice the detected CPU count.

    Returns:
        A dictionary of Spark configuration parameters.
    """
    cpu_count = _detect_available_cpus()
    memory_gb = _detect_available_memory_gb()
    logger.info(f"Detected: {cpu_count} CPUs, {memory_gb}GB memory")

    # Below-minimum resources only produce warnings; they never abort.
    if cpu_count < MIN_CPUS:
        logger.warning(
            f"Insufficient CPU cores: detected {cpu_count}, but minimum {MIN_CPUS} required. "
            "This analysis requires at least 6 CPU cores. Proceeding with potentially "
            "suboptimal performance."
        )
    if memory_gb < MIN_MEMORY_GB:
        logger.warning(
            f"Insufficient memory: detected {memory_gb}GB, but minimum {MIN_MEMORY_GB}GB required. "
            "Proceeding with potentially suboptimal performance and risk of OOM errors."
        )

    # Only driver memory is configured here; the session is created with
    # master local[*], where (per the original author) the driver also executes.
    os_reserve_gb = int(memory_gb * 0.15)
    offheap_gb = int(memory_gb * 0.05)
    offheap_gb = min(30, max(4, offheap_gb))
    heap_gb = max(8, memory_gb - os_reserve_gb - offheap_gb)

    logger.info(
        f"Spark config: {heap_gb}GB driver (local mode), {offheap_gb}GB off-heap"
    )

    config: Dict[str, str] = {}

    # Memory
    config["spark.driver.memory"] = f"{heap_gb}g"
    config["spark.memory.offHeap.enabled"] = "true"
    config["spark.memory.offHeap.size"] = f"{offheap_gb}g"

    # Parquet reader
    config["spark.sql.parquet.columnarReaderBatchSize"] = "1024"
    config["spark.sql.parquet.enableVectorizedReader"] = "true"

    # Two shuffle partitions per detected core
    config["spark.sql.shuffle.partitions"] = str(cpu_count * 2)

    # Arrow-backed conversion between Spark and pandas
    config["spark.sql.execution.arrow.pyspark.enabled"] = "true"

    return config

# Module-level cache holding the session created by get_or_create_spark_session().
_spark_session: Optional[SparkSession] = None


def get_or_create_spark_session() -> SparkSession:
    """
    Return the module-wide SparkSession, building it on first use.

    The session runs with master ``local[*]``, applies every setting from
    ``_build_spark_config()`` and uses ``/var/tmp/spark-local`` as Spark's
    local directory for shuffle/spill files.

    Returns:
        SparkSession: The cached (or newly created) session.
    """
    global _spark_session

    if _spark_session is not None:
        return _spark_session

    builder = SparkSession.builder.appName("Analyzing Parquet Files").master("local[*]")

    settings = dict(_build_spark_config())
    # Applied last, exactly as in the original ordering of config() calls.
    settings["spark.local.dir"] = "/var/tmp/spark-local"
    for option, setting in settings.items():
        builder = builder.config(option, setting)

    _spark_session = builder.getOrCreate()
    logger.info("Created new Spark session")
    return _spark_session


# ==================== Data Loading Functions ====================


def load_test_cases(
    project: Project,
    include_environment: bool = True
) -> DataFrame:
    """
    Read the cached test-case table of a project, creating the cache if absent.

    Args:
        project: The project whose test cases are loaded.
        include_environment: When True, rows are kept per (name, environment).
                           When False, rows are summed up per test case name.

    Returns:
        DataFrame of test cases with an added constant ``project`` column.

    Raises:
        FileNotFoundError: If the cache directory contains no parquet files.
    """
    cache_dir = TEST_CASE_PATH / project.value

    # The cache is only (re)built when its directory does not exist at all.
    if not cache_dir.exists():
        logger.info(f"Cache not found for {project.value}, creating it...")
        _create_test_case_cache(project)

    spark = get_or_create_spark_session()

    parquet_paths = [path.as_posix() for path in cache_dir.glob("**/*.parquet")]
    if not parquet_paths:
        raise FileNotFoundError(f"No parquet files found in {cache_dir}")

    loaded = spark.read.parquet(*parquet_paths)
    if include_environment:
        result = loaded
    else:
        result = _aggregate_test_cases_by_name(loaded)

    return result.withColumn("project", f.lit(project.value))


def load_test_execution_batches(
    project: Project,
    columns: Optional[List[str]] = None
) -> DataFrame:
    """
    Read the cached test execution batches (TEBs) of a project.

    If the project's TEB cache directory does not exist, it is built first.

    Args:
        project: The project whose batches are loaded.
        columns: Column names to keep; all columns are kept when empty or None.

    Returns:
        DataFrame containing the test execution batches.

    Raises:
        FileNotFoundError: If the cache directory contains no parquet files.
    """
    cache_dir = TEB_PATH / project.value

    if not cache_dir.exists():
        logger.info(f"TEB cache not found for {project.value}, creating it...")
        _create_test_execution_batch_cache(project)

    spark = get_or_create_spark_session()

    parquet_paths = [path.as_posix() for path in cache_dir.glob("**/*.parquet")]
    if not parquet_paths:
        raise FileNotFoundError(f"No parquet files found in {cache_dir}")

    batches = spark.read.parquet(*parquet_paths)
    return batches.select(columns) if columns else batches


def load_test_executions(
    project: Project,
    columns: Optional[List[str]] = None,
    data_date: Optional[date] = None,
    validation: bool = False
) -> Optional[DataFrame]:
    """
    Read raw test executions from parquet files.

    File selection:

    - validation data of a non-Chromium project: every available file,
      regardless of ``data_date``;
    - otherwise, with ``data_date`` given: files of that single day;
    - otherwise: files of the project's whole study period.

    Chromium data additionally passes through
    ``_apply_chromium_transformations`` (retry-shard filtering, synthetic
    ``job_id``, environment derived from the file path).

    Args:
        project: The project to load data from.
        columns: Column names to select; None keeps every column.
        data_date: Day to load; None means the entire study period.
        validation: Load validation-run data instead of CI data.

    Returns:
        DataFrame of test executions, or None when no file matched.
    """
    chromium = project == Project.CHROMIUM
    spark = get_or_create_spark_session()

    if validation and not chromium:
        files = _get_files_by_date(project, validation, date.min, date.max)
    elif data_date is not None:
        files = _get_files_by_date(project, validation, data_date, data_date)
    else:
        files = _get_files_for_study(project, validation)

    if not files:
        logger.warning(f"No files found for {project.value} on {data_date}")
        return None

    executions = _load_parquet_with_columns(spark, files, columns, chromium)
    if not chromium:
        return executions
    return _apply_chromium_transformations(executions, columns, validation)

# ==================== Private Helper Functions ====================


def _aggregate_test_cases_by_name(test_cases: DataFrame) -> DataFrame:
    """Collapse per-environment rows into one row per test case name by summing every counter."""
    counter_columns = [
        "teb_count",
        "teb_passed_count",
        "teb_flaky_detected_count",
        "teb_flaky_undetected_survived_immediate_rerun_count",
        "teb_flaky_undetected_unchallenged_by_immediate_rerun_count",
        "teb_failed_with_immediate_reruns_count",
        "teb_failed_no_immediate_reruns_count",
        "teb_failed_unvalidated_count",
        "te_pass_count",
        "te_fail_count",
    ]
    # Each counter keeps its name after being summed.
    summed = [f.sum(name).alias(name) for name in counter_columns]
    return test_cases.groupBy("test_case_name").agg(*summed)


def _create_test_case_cache(project: Project) -> None:
    """
    Build and persist the per-test-case summary for a project.

    Test execution batches are grouped by (test_case_name,
    test_case_environment); for each group the number of batches per verdict
    and the summed pass/fail execution counts are computed. The result is
    written as parquet below ``TEST_CASE_PATH / project.value``.

    No save mode is set on the writer, so Spark's default applies.

    Args:
        project: The project whose cache is created.
    """
    output_path = TEST_CASE_PATH / project.value
    teb = load_test_execution_batches(project=project)

    # Output column name -> verdict label counted in that column.
    verdict_columns = {
        "teb_passed_count": VERDICT_PASSED,
        "teb_flaky_detected_count": VERDICT_FLAKY_DETECTED,
        "teb_flaky_undetected_survived_immediate_rerun_count": VERDICT_FLAKY_UNDETECTED_SURVIVED_IMMEDIATE_RERUN,
        "teb_flaky_undetected_unchallenged_by_immediate_rerun_count": VERDICT_FLAKY_UNDETECTED_UNCHALLENGED_BY_IMMEDIATE_RERUN,
        "teb_failed_with_immediate_reruns_count": VERDICT_FAILED_SURVIVED_IMMEDIATE_RERUN,
        "teb_failed_no_immediate_reruns_count": VERDICT_FAILED_UNCHALLENGED_BY_IMMEDIATE_RERUN,
        "teb_failed_unvalidated_count": VERDICT_FAILED_UNVALIDATED,
    }

    aggregations = [f.count(f.lit(1)).alias("teb_count")]
    # count() skips nulls, and when() without otherwise() yields null for
    # non-matching rows, so each expression counts only the matching verdict.
    aggregations.extend(
        f.count(f.when(f.col("verdict") == verdict, True)).alias(name)
        for name, verdict in verdict_columns.items()
    )
    aggregations.append(f.sum(f.col("pass_count")).alias("te_pass_count"))
    aggregations.append(f.sum(f.col("fail_count")).alias("te_fail_count"))

    test_cases = teb.groupBy(["test_case_name", "test_case_environment"]).agg(*aggregations)

    # Sort AFTER repartitioning: a global sort followed by repartition() is
    # wasted work because the shuffle discards the ordering. Sorting within
    # partitions keeps the rows of each written file ordered.
    test_cases = (
        test_cases
        .repartition("test_case_name")
        .sortWithinPartitions(f.col("test_case_name"), f.col("test_case_environment"))
    )
    test_cases.write.format("parquet").save(output_path.as_posix())

    logger.info(f"Created test case cache at {output_path}")


def _create_test_execution_batch_cache(project: Project) -> None:
    """
    Build the cached test execution batch (TEB) files of a project.

    The study period is processed one day at a time. For each day the test
    executions are loaded, grouped into batches, enriched with validation
    results, labelled with a verdict and written (overwriting) to
    ``TEB_PATH / project.value / <ISO date>``. Days without input files are
    skipped with a warning.

    Args:
        project: The project whose TEB cache is created.
    """
    cache_root = TEB_PATH / project.value
    first_day, last_day = project.study_period

    # Columns read from the raw test executions.
    te_columns = [
        "test_case_name", "test_case_environment", "job_id",
        "generic_result", "duration", "retry_count",
        "job_name", "job_duration", "job_result", "job_trigger",
        "pipeline_id", "pipeline_result", "pipeline_created_at",
        "pipeline_duration", "pipeline_commit_sha", "pipeline_branch_name",
        "pipeline_trigger", "pipeline_metadata", "pipeline_name", "project_name"
    ]

    # Project-specific number of output files per day (same for every day).
    files_per_day = {
        Project.TEAMSCALE: 1,
        Project.CHROMIUM: 30,
        Project.GITLAB: 50,
        Project.PLAYWRIGHT: 1,
    }[project]

    # Day-by-day processing keeps the storage footprint low.
    for day in _daterange(first_day, last_day):
        logger.info(f"Creating TEB cache for {project.value} on {day}")
        day_dir = cache_root / day.isoformat()

        executions = load_test_executions(project, data_date=day, columns=te_columns)
        if executions is None:
            logger.warning(f"No files found for {project.value} on {day}")
            continue

        # Batches are grouped by job_id, so partition on it up front.
        executions = executions.repartition("job_id")

        batches = _create_test_execution_batches(executions)
        batches = _add_validation_data(batches, project, day)
        batches = _assign_verdicts(batches)

        # Fewer output files give better read performance later on.
        writer = batches.coalesce(files_per_day).write.mode("overwrite")
        writer.parquet(day_dir.as_posix())

        # NOTE(review): neither frame is cached or persisted in this function,
        # so these calls may be no-ops; kept to preserve behavior.
        executions.unpersist()
        batches.unpersist()

        logger.info(f"  ✓ Completed TEB cache for {day}")


def _create_test_execution_batches(te: DataFrame) -> DataFrame:
    """
    Group test executions into test execution batches (TEBs).

    One batch is produced per (test_case_name, test_case_environment, job_id).
    Each batch carries pass/fail counts, the durations and results of its
    executions ordered by retry count, and the first observed value of every
    job- and pipeline-level column. Batches without a single pass get the
    preliminary verdict "failed"; all others get a null verdict.
    """
    result = te["generic_result"]

    # Columns copied from the first execution of each group.
    carried_over = [
        "job_name", "job_duration", "job_result", "job_trigger",
        "pipeline_id", "pipeline_result", "pipeline_created_at",
        "pipeline_duration", "pipeline_commit_sha", "pipeline_branch_name",
        "pipeline_trigger", "pipeline_metadata", "pipeline_name", "project_name",
    ]

    aggregations = [
        f.count(f.when(result.like("pass-like"), 1)).alias("pass_count"),
        f.count(f.when(result.like("fail-like"), 1)).alias("fail_count"),
        f.collect_list("duration").alias("durations"),
        f.collect_list("generic_result").alias("generic_results"),
        f.collect_list("retry_count").alias("retry_counts"),
    ]
    aggregations.extend(f.first(name).alias(name) for name in carried_over)

    batches = te.groupBy("test_case_name", "test_case_environment", "job_id").agg(*aggregations)

    # Preliminary verdict; used later as a join key for validation data.
    no_pass = batches["pass_count"] == 0
    batches = batches.withColumn("verdict", f.when(no_pass, "failed").otherwise(None))

    return _sort_lists_by_retry_count(batches)


def _sort_lists_by_retry_count(teb: DataFrame) -> DataFrame:
    """
    Order the ``durations`` and ``generic_results`` lists by retry count.

    Both lists are reordered with the help of the ``retry_counts`` column,
    which is dropped afterwards.
    """
    def sort_by_retry_count(values: List[Any], retry_counts: List[int]) -> List[Any]:
        """Return ``values`` reordered by ascending ``retry_counts`` (ties broken by value)."""
        if not values or not retry_counts:
            return values
        ranked = sorted(zip(retry_counts, values))
        return [value for _, value in ranked]

    # Same Python function, registered once per element type.
    udf_by_column = {
        "durations": f.udf(sort_by_retry_count, returnType=ArrayType(FloatType())),
        "generic_results": f.udf(sort_by_retry_count, returnType=ArrayType(StringType())),
    }

    for column, sort_udf in udf_by_column.items():
        teb = teb.withColumn(column, sort_udf(column, "retry_counts"))

    return teb.drop("retry_counts")


def _add_validation_data(teb: DataFrame, project: Project, current_date: date) -> DataFrame:
    """
    Attach validation-run outcomes to failed test execution batches.

    Validation executions are reduced to one row per join key with a boolean
    ``validation_test_passed`` (True when at least one validation execution
    was pass-like) and left-joined onto ``teb``. Because the validation rows
    carry the constant verdict "failed" and ``verdict`` is a join key, only
    batches with the preliminary verdict "failed" can match.

    The returned frame ALWAYS contains ``validation_test_passed``. When no
    validation data is available, the column is added as all-null so that
    ``_assign_verdicts`` can classify failures as unvalidated instead of
    failing on a missing column.

    Args:
        teb: Test execution batches with ``verdict`` and ``pipeline_commit_sha``.
        project: Project the batches belong to (selects hash and join columns).
        current_date: Day for which validation data is requested.

    Returns:
        ``teb`` with an additional nullable boolean ``validation_test_passed``.
    """
    load_columns = _get_validation_columns(project)
    te_validation = load_test_executions(
        project, load_columns, validation=True, data_date=current_date
    )

    if te_validation is None:
        # No validation runs at all: every batch stays unvalidated (null).
        return teb.withColumn("validation_test_passed", f.lit(None).cast("boolean"))

    # Derive the short commit hash used as join key.
    te_validation = _add_hash_column(te_validation, project)

    # Constant verdict so the join only hits batches marked "failed".
    te_validation = te_validation.withColumn("verdict", f.lit("failed"))

    # Deduplicate: one row per join key, passed if any execution was pass-like.
    group_columns = _get_validation_group_columns(project)
    te_validation = te_validation.groupBy(group_columns).agg(
        (f.count(f.when(te_validation["generic_result"].like("pass-like"), 1)) >= 1)
        .alias("validation_test_passed")
    )

    # Temporary short hash on the batch side, removed again after the join.
    teb = teb.withColumn("hash_join", f.substring(teb["pipeline_commit_sha"], 0, HASH_LENGTH))

    join_columns = _get_join_columns(project)
    teb = teb.join(te_validation, join_columns, "left")
    teb = teb.drop("hash_join")

    return teb


def _assign_verdicts(teb: DataFrame) -> DataFrame:
    """
    Replace the preliminary verdict with the final one and drop the validation flag.

    Decision table (first match wins):

    - no failure                                   -> passed
    - failures and passes                          -> flaky (detected)
    - only failures, validation passed, >1 failure -> flaky, survived rerun
    - only failures, validation passed, 1 failure  -> flaky, unchallenged
    - only failures, validation failed, >1 failure -> failed, with reruns
    - only failures, validation failed, 1 failure  -> failed, no reruns
    - only failures, no validation result          -> failed (unvalidated)
    """
    fails = teb["fail_count"]
    passes = teb["pass_count"]
    validated = teb["validation_test_passed"]

    never_passed = passes == 0
    failed_repeatedly = fails > 1
    failed_once = fails == 1
    # These build Spark column expressions, hence "==" rather than "is".
    validation_passed = validated == True  # noqa: E712
    validation_failed = validated == False  # noqa: E712

    verdict = (
        f.when(fails == 0, VERDICT_PASSED)
        .when((fails > 0) & (passes > 0), VERDICT_FLAKY_DETECTED)
        .when(
            failed_repeatedly & never_passed & validation_passed,
            VERDICT_FLAKY_UNDETECTED_SURVIVED_IMMEDIATE_RERUN,
        )
        .when(
            failed_once & never_passed & validation_passed,
            VERDICT_FLAKY_UNDETECTED_UNCHALLENGED_BY_IMMEDIATE_RERUN,
        )
        .when(
            failed_repeatedly & never_passed & validation_failed,
            VERDICT_FAILED_SURVIVED_IMMEDIATE_RERUN,
        )
        .when(
            failed_once & never_passed & validation_failed,
            VERDICT_FAILED_UNCHALLENGED_BY_IMMEDIATE_RERUN,
        )
        .when(
            (fails > 0) & never_passed & validated.isNull(),
            VERDICT_FAILED_UNVALIDATED,
        )
    )

    return teb.withColumn("verdict", verdict).drop("validation_test_passed")


def _get_validation_columns(project: Project) -> List[str]:
    """List the columns to read from validation data for the given project."""
    has_environment = project in (Project.PLAYWRIGHT, Project.CHROMIUM, Project.GITLAB)

    columns = [
        "test_case_name",
        "job_name",
        "pipeline_branch_name",
        "generic_result",
        "pipeline_commit_sha",
    ]
    if has_environment:
        columns.append("test_case_environment")
    # GitLab's join hash is cut out of the pipeline name.
    if project == Project.GITLAB:
        columns.append("pipeline_name")
    return columns


def _get_validation_group_columns(project: Project) -> List[str]:
    """List the columns by which validation executions are grouped for the given project."""
    base_columns = ["test_case_name", "hash_join", "verdict"]
    if project in (Project.PLAYWRIGHT, Project.CHROMIUM, Project.GITLAB):
        return base_columns + ["test_case_environment"]
    return base_columns


def _add_hash_column(df: DataFrame, project: Project) -> DataFrame:
    """
    Add the ``hash_join`` column and drop the columns it was derived from.

    The hash is a HASH_LENGTH-character substring whose source column and
    start position depend on the project. Afterwards ``pipeline_commit_sha``,
    ``pipeline_branch_name`` and ``pipeline_name`` are removed if present.
    """
    # Project -> (source column, substring start position).
    hash_sources = {
        Project.PLAYWRIGHT: ("pipeline_branch_name", 12),
        Project.TEAMSCALE: ("pipeline_branch_name", 7),
        Project.CHROMIUM: ("pipeline_commit_sha", 0),
        Project.GITLAB: ("pipeline_name", 26),
    }

    source = hash_sources.get(project)
    if source is not None:
        source_column, start = source
        df = df.withColumn("hash_join", f.substring(source_column, start, HASH_LENGTH))

    # Remove the source columns that are no longer needed.
    obsolete = [
        name
        for name in ("pipeline_commit_sha", "pipeline_branch_name", "pipeline_name")
        if name in df.columns
    ]
    if obsolete:
        df = df.drop(*obsolete)

    return df


def _get_join_columns(project: Project) -> List[str]:
    """List the columns used to join validation data onto batches for the given project."""
    # The join keys are exactly the keys the validation data was grouped by.
    return list(_get_validation_group_columns(project))


def _load_parquet_with_columns(
    spark: SparkSession,
    files: List[Path],
    columns: Optional[List[str]],
    is_chromium: bool
) -> DataFrame:
    """
    Read parquet files, optionally restricted to a set of columns.

    For Chromium, columns needed to derive other fields are selected in
    addition to the requested ones: ``pipeline_id`` and ``job_name`` when
    ``job_id`` is requested, and always ``metadata``.

    Args:
        spark: Session used for reading.
        files: Parquet files to read.
        columns: Columns to select; None keeps every column.
        is_chromium: Whether the files belong to the Chromium project.
    """
    paths = [path.as_posix() for path in files]
    frame = spark.read.parquet(*paths)

    if columns is None:
        return frame

    extra_columns: List[str] = []
    if is_chromium:
        candidates = ["pipeline_id", "job_name"] if "job_id" in columns else []
        candidates.append("metadata")
        # Only add what the caller did not already request.
        extra_columns = [name for name in candidates if name not in columns]

    return frame.select(columns + extra_columns)


def _apply_chromium_transformations(
    te: DataFrame,
    columns: Optional[List[str]],
    validation: bool
) -> DataFrame:
    """
    Post-process Chromium test executions.

    Steps, in order:

    1. Parse the ``is_retry_shard`` flag out of ``metadata`` and keep only rows
       whose flag equals ``validation`` (rows without a parsable flag are
       dropped by the comparison).
    2. Build ``job_id`` as ``<pipeline_id>-<job_name>`` when it is requested.
    3. Drop helper columns (``pipeline_id``, ``job_name``, ``metadata``) that
       the caller did not request.
    4. Set ``test_case_environment`` to the second-to-last path segment of the
       input file when it is requested.

    A column counts as requested when ``columns`` is None or contains it.
    """
    def requested(name: str) -> bool:
        return columns is None or name in columns

    # 1. Retry-shard flag: "True"/"False" text -> boolean, anything else -> null.
    flag_text = f.regexp_extract("metadata", r"is_retry_shard': (False|True)", 1)
    flag = (
        f.when(flag_text == "True", True)
        .when(flag_text == "False", False)
        .otherwise(None)
    )
    te = (
        te.withColumn("is_retry_shard", flag)
        .filter(f.col("is_retry_shard") == validation)
        .drop("is_retry_shard")
    )

    # 2. Synthetic job id
    if requested("job_id"):
        te = te.withColumn("job_id", f.concat(te["pipeline_id"], f.lit("-"), te["job_name"]))

    # 3. Helper columns the caller did not ask for
    for helper in ("pipeline_id", "job_name", "metadata"):
        if not requested(helper) and helper in te.columns:
            te = te.drop(helper)

    # 4. Environment = name of the directory containing the input file
    if requested("test_case_environment"):
        path_segments = f.split(f.input_file_name(), '/')
        te = te.withColumn(
            "test_case_environment",
            f.element_at(f.reverse(path_segments), 2)
        )

    return te


def _daterange(start_date: date, end_date: date) -> List[date]:
    """
    Generate a range of dates from start_date to end_date inclusive.

    Args:
        start_date: The starting date of the range.
        end_date: The ending date of the range.

    Returns:
        List of dates in the range from start_date to end_date.
    """
    days = int((end_date - start_date).days) + 1
    return [start_date + timedelta(n) for n in range(days)]


def _get_files_by_date(
    project: Project,
    validation: bool,
    start_date: date,
    end_date: date
) -> List[Path]:
    """
    Collect the parquet files of a project whose file date lies in a range.

    The file date is the ISO date preceding the first underscore of the file
    name (``YYYY-MM-DD_*.parquet``). Files whose name does not start with a
    parsable date are skipped with a warning.

    Args:
        project: The project to load data from.
        validation: Whether validation-run data is requested.
        start_date: First day of the range (inclusive).
        end_date: Last day of the range (inclusive).

    Returns:
        Paths of all files dated within the range.
    """
    # Chromium's validation data comes from the CI system directly, so only
    # non-Chromium validation requests use the validation directory.
    use_validation_dir = validation and project != Project.CHROMIUM
    base_path = VALIDATION_DATA_PATH if use_validation_dir else RAW_DATA_PATH
    data_path = base_path / project.value

    matching_files = []
    for candidate in data_path.glob("**/*.parquet"):
        date_prefix = candidate.stem.partition("_")[0]
        try:
            file_date = date.fromisoformat(date_prefix)
        except ValueError:
            logger.warning(f"Could not parse date from filename: {candidate}")
            continue

        if start_date <= file_date <= end_date:
            matching_files.append(candidate)

    logger.info(f"Found {len(matching_files)} files between {start_date} and {end_date}")
    return matching_files


def _get_files_for_study(project: Project, validation: bool) -> List[Path]:
    """
    Collect the parquet files dated within the project's study period.

    Args:
        project: The project to load data from.
        validation: Whether validation-run data is requested.

    Returns:
        Paths of all files dated within the study period (bounds included).
    """
    first_day, last_day = project.study_period
    return _get_files_by_date(
        project,
        validation,
        start_date=first_day,
        end_date=last_day,
    )