Skip to content

Commit dc3c44e

Browse files
committed
Merge branch 'main' into fix/CM-1054-throttle-org-updated-at
2 parents 9ff36e7 + 6f9e65a commit dc3c44e

File tree

8 files changed

+403
-22
lines changed

8 files changed

+403
-22
lines changed

services/apps/git_integration/src/crowdgit/services/software_value/main.go

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"encoding/json"
6+
"flag"
67
"fmt"
78
"os"
89
"os/exec"
@@ -13,23 +14,27 @@ import (
1314
)
1415

1516
func main() {
16-
response := processRepository()
17+
noLarge := flag.Bool("no-large", false, "Skip files larger than 100MB to avoid OOM on large repos")
18+
flag.Parse()
19+
20+
response := processRepository(*noLarge)
1721
outputJSON(response)
1822

1923
// Always exit with code 0 - status details are in JSON response
2024
}
2125

2226
// processRepository handles the main logic and returns a StandardResponse
23-
func processRepository() StandardResponse {
27+
func processRepository(noLarge bool) StandardResponse {
2428
ctx := context.Background()
2529

26-
// Get target path from command line argument
30+
// Get target path from remaining non-flag arguments
31+
args := flag.Args()
2732
var targetPath string
28-
if len(os.Args) > 1 {
29-
targetPath = os.Args[1]
33+
if len(args) > 0 {
34+
targetPath = args[0]
3035
} else {
3136
errorCode := ErrorCodeInvalidArguments
32-
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
37+
errorMessage := fmt.Sprintf("Usage: %s [--no-large] <target-path>", os.Args[0])
3338
return StandardResponse{
3439
Status: StatusFailure,
3540
ErrorCode: &errorCode,
@@ -51,10 +56,10 @@ func processRepository() StandardResponse {
5156
// Process single repository (the target path argument)
5257
repoDir := config.TargetPath
5358

54-
insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
55-
if err != nil {
59+
insightsDb, dbErr := NewInsightsDB(ctx, config.InsightsDatabase)
60+
if dbErr != nil {
5661
errorCode := ErrorCodeDatabaseConnection
57-
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
62+
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
5863
return StandardResponse{
5964
Status: StatusFailure,
6065
ErrorCode: &errorCode,
@@ -76,7 +81,7 @@ func processRepository() StandardResponse {
7681
}
7782

7883
// Process the repository with SCC
79-
report, err := getSCCReport(config.SCCPath, repoDir)
84+
report, err := getSCCReport(config.SCCPath, repoDir, noLarge)
8085
if err != nil {
8186
errorCode := getErrorCodeFromSCCError(err)
8287
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
@@ -120,10 +125,10 @@ func processRepository() StandardResponse {
120125

121126

122127
// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
123-
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
124-
cost, err := getCost(sccPath, dirPath)
128+
func getSCCReport(sccPath, dirPath string, noLarge bool) (SCCReport, error) {
129+
cost, err := getCost(sccPath, dirPath, noLarge)
125130
if err != nil {
126-
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
131+
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
127132
}
128133

129134
// Skip saving to database if cost is 0 - do we want to do this?
@@ -133,7 +138,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
133138

134139
projectPath := filepath.Base(dirPath)
135140

136-
langStats, err := getLanguageStats(sccPath, dirPath)
141+
langStats, err := getLanguageStats(sccPath, dirPath, noLarge)
137142
if err != nil {
138143
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
139144
}
@@ -177,8 +182,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
177182
}
178183

179184
// getCost runs the scc command and parses the output to get the estimated cost.
180-
func getCost(sccPathPath, repoPath string) (float64, error) {
181-
output, err := runSCC(sccPathPath, "--format=short", repoPath)
185+
func getCost(sccPathPath, repoPath string, noLarge bool) (float64, error) {
186+
output, err := runSCC(sccPathPath, noLarge, "--format=short", repoPath)
182187
if err != nil {
183188
return 0, fmt.Errorf("failed to run scc command: %w", err)
184189
}
@@ -192,8 +197,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
192197
}
193198

194199
// getLanguageStats runs the scc command and parses the output to get language statistics.
195-
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
196-
output, err := runSCC(sccPathPath, "--format=json", repoPath)
200+
func getLanguageStats(sccPathPath, repoPath string, noLarge bool) ([]LanguageStats, error) {
201+
output, err := runSCC(sccPathPath, noLarge, "--format=json", repoPath)
197202
if err != nil {
198203
return nil, fmt.Errorf("failed to run scc command: %w", err)
199204
}
@@ -207,8 +212,14 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
207212
}
208213

209214
// runSCC executes the scc command with the given arguments and returns the output.
210-
func runSCC(sccPathPath string, args ...string) (string, error) {
211-
cmd := exec.Command(sccPathPath, args...)
215+
// When noLarge is true, files larger than 100MB are skipped to avoid OOM on large repos.
216+
func runSCC(sccPathPath string, noLarge bool, args ...string) (string, error) {
217+
var cmdArgs []string
218+
if noLarge {
219+
cmdArgs = append(cmdArgs, "--no-large", "--large-byte-count", "100000000")
220+
}
221+
cmdArgs = append(cmdArgs, args...)
222+
cmd := exec.Command(sccPathPath, cmdArgs...)
212223
output, err := cmd.Output()
213224
if err != nil {
214225
if exitErr, ok := err.(*exec.ExitError); ok {

services/apps/git_integration/src/crowdgit/services/software_value/software_value_service.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,21 @@
88
from crowdgit.services.base.base_service import BaseService
99
from crowdgit.services.utils import run_shell_command
1010

11+
_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB
12+
# Repos excluded from software value analysis.
13+
# f7f92577-f258-49f0-b5b4-ba07194ca040: data repo (not a code repo), produces misleading results.
14+
_SOFTWARE_VALUE_EXCLUDED_REPO_IDS = frozenset({"f7f92577-f258-49f0-b5b4-ba07194ca040"})
15+
16+
17+
async def _get_repo_size_bytes(repo_path: str) -> int:
18+
"""Return total disk usage of repo_path in bytes using du -sb."""
19+
try:
20+
output = await run_shell_command(["du", "-sb", repo_path], timeout=120)
21+
return int(output.split()[0])
22+
except Exception:
23+
pass
24+
return 0
25+
1126

1227
class SoftwareValueService(BaseService):
1328
"""Service for calculating software value metrics"""
@@ -20,16 +35,34 @@ def __init__(self):
2035
async def run(self, repo_id: str, repo_path: str) -> None:
2136
"""
2237
Triggers software value binary for given repo.
23-
Results are saved into insights database directly
38+
Results are saved into insights database directly.
39+
Repos in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS are skipped entirely.
40+
For repos larger than 10 GB, scc is run with --no-large (skipping files >100MB) to avoid OOM.
2441
"""
42+
if repo_id in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS:
43+
self.logger.info(f"Skipping software value for excluded repo {repo_id}")
44+
return
45+
2546
start_time = time.time()
2647
execution_status = ExecutionStatus.SUCCESS
2748
error_code = None
2849
error_message = None
2950

3051
try:
52+
cmd = [self.software_value_executable]
53+
54+
repo_size = await _get_repo_size_bytes(repo_path)
55+
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
56+
self.logger.info(
57+
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
58+
"running scc with no-large (skipping files >100MB)"
59+
)
60+
cmd += ["--no-large"]
61+
62+
cmd.append(repo_path)
63+
3164
self.logger.info("Running software value...")
32-
output = await run_shell_command([self.software_value_executable, repo_path])
65+
output = await run_shell_command(cmd)
3366
self.logger.info(f"Software value output: {output}")
3467

3568
# Parse JSON output and extract fields from StandardResponse structure
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_commits_ds` contains only authored-commit activities, pre-filtered from the full activities table.
3+
- Populated daily by `ai_code_tracker_commits_copy.pipe`.
4+
- Stores only the fields needed for AI tool detection: timestamp, title, body, attributes.
5+
- Reduces the dataset from ~1B rows to only commits, with sorting keys optimized for the AI pattern matching step.
6+
7+
TAGS "Report"
8+
9+
SCHEMA >
10+
`timestamp` DateTime,
11+
`title` String DEFAULT '',
12+
`body` String DEFAULT '',
13+
`attributes` String DEFAULT ''
14+
15+
ENGINE MergeTree
16+
ENGINE_PARTITION_KEY toYear(timestamp)
17+
ENGINE_SORTING_KEY timestamp
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_ds` contains pre-computed monthly aggregates of AI-assisted commits by tool.
3+
- Populated hourly by `ai_code_tracker_copy.pipe` which scans activities for AI tool signatures.
4+
- Each row represents one (month, toolKey) combination with commit counts.
5+
- Also stores total commits per month (toolKey = '__total__') for percentage calculations.
6+
- `monthStart` is the first day of the month (used for both monthly and yearly aggregation at query time).
7+
- `toolKey` identifies the AI tool (e.g., 'github-copilot', 'claude', 'cursor') or '__total__' for all commits.
8+
- `commitCount` is the number of commits for that tool in that month.
9+
10+
TAGS "Report"
11+
12+
SCHEMA >
13+
`monthStart` Date,
14+
`toolKey` LowCardinality(String),
15+
`commitCount` UInt64
16+
17+
ENGINE MergeTree
18+
ENGINE_PARTITION_KEY toYear(monthStart)
19+
ENGINE_SORTING_KEY monthStart, toolKey
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker.pipe` returns AI-assisted commit counts by tool and time period.
3+
- Reads from pre-computed `ai_code_tracker_ds` datasource (materialized hourly by `ai_code_tracker_copy.pipe`).
4+
- Parameters:
5+
- `granularity`: Required. Either 'monthly' or 'yearly'.
6+
- `startDate`: Optional DateTime filter for commits after this date.
7+
- `endDate`: Optional DateTime filter for commits before this date.
8+
- Response: toolKey, toolName, startDate, endDate, commitCount
9+
10+
TAGS "Report"
11+
12+
NODE ai_code_tracker_result
13+
DESCRIPTION >
14+
Aggregate pre-computed AI commit counts by tool and time period
15+
16+
SQL >
17+
%
18+
SELECT
19+
toolKey,
20+
multiIf(
21+
toolKey = 'github-copilot',
22+
'GitHub Copilot',
23+
toolKey = 'chatgpt',
24+
'ChatGPT',
25+
toolKey = 'claude',
26+
'Claude',
27+
toolKey = 'cursor',
28+
'Cursor',
29+
toolKey = 'codewhisperer',
30+
'CodeWhisperer',
31+
toolKey = 'gemini',
32+
'Gemini',
33+
toolKey = 'codeium',
34+
'Codeium',
35+
toolKey = 'aider',
36+
'Aider',
37+
toolKey = 'devin',
38+
'Devin',
39+
toolKey = 'tabnine',
40+
'Tabnine',
41+
toolKey = 'other',
42+
'Other AI',
43+
'Unknown'
44+
) AS toolName,
45+
formatDateTime(
46+
CASE
47+
WHEN
48+
{{
49+
String(
50+
granularity,
51+
description="Time aggregation: monthly or yearly",
52+
required=True,
53+
)
54+
}} = 'monthly'
55+
THEN monthStart
56+
ELSE toStartOfYear(monthStart)
57+
END,
58+
'%Y-%m-%d'
59+
) AS startDate,
60+
formatDateTime(
61+
CASE
62+
WHEN
63+
{{
64+
String(
65+
granularity,
66+
description="Time aggregation: monthly or yearly",
67+
required=True,
68+
)
69+
}} = 'monthly'
70+
THEN monthStart + INTERVAL 1 MONTH - INTERVAL 1 DAY
71+
ELSE toStartOfYear(monthStart) + INTERVAL 1 YEAR - INTERVAL 1 DAY
72+
END,
73+
'%Y-%m-%d'
74+
) AS endDate,
75+
sum(commitCount) AS commitCount
76+
FROM ai_code_tracker_ds
77+
WHERE
78+
toolKey != '__total__'
79+
{% if defined(startDate) %}
80+
AND monthStart >= toDate(
81+
{{ DateTime(startDate, description="Filter commits after this date", required=False) }}
82+
)
83+
{% end %}
84+
{% if defined(endDate) %}
85+
AND monthStart < toDate(
86+
{{ DateTime(endDate, description="Filter commits before this date", required=False) }}
87+
)
88+
{% end %}
89+
GROUP BY toolKey, startDate, endDate
90+
ORDER BY startDate ASC, commitCount DESC
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_commits_copy.pipe` extracts only authored-commit rows from `activities_deduplicated_ds`.
3+
- Runs daily to populate `ai_code_tracker_commits_ds` with a small subset (authored commits only) of the full 1B+ activities table.
4+
- This intermediate datasource is then used by `ai_code_tracker_copy.pipe` for fast AI pattern matching.
5+
6+
TAGS "Report"
7+
8+
NODE ai_code_tracker_commits_copy_result
9+
SQL >
10+
SELECT a.timestamp, a.title, a.body, a.attributes
11+
FROM activities_deduplicated_ds a
12+
WHERE a.type = 'authored-commit'
13+
14+
TYPE COPY
15+
TARGET_DATASOURCE ai_code_tracker_commits_ds
16+
COPY_MODE replace
17+
COPY_SCHEDULE 0 2 * * *

0 commit comments

Comments (0)