Skip to content

Commit dc3c44e

Browse files
committed
Merge branch 'main' into fix/CM-1054-throttle-org-updated-at
2 parents 9ff36e7 + 6f9e65a commit dc3c44e

File tree

8 files changed

+403
-22
lines changed

8 files changed

+403
-22
lines changed

services/apps/git_integration/src/crowdgit/services/software_value/main.go

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"encoding/json"
6+
"flag"
67
"fmt"
78
"os"
89
"os/exec"
@@ -13,23 +14,27 @@ import (
1314
)
1415

1516
func main() {
16-
response := processRepository()
17+
noLarge := flag.Bool("no-large", false, "Skip files larger than 100MB to avoid OOM on large repos")
18+
flag.Parse()
19+
20+
response := processRepository(*noLarge)
1721
outputJSON(response)
1822

1923
// Always exit with code 0 - status details are in JSON response
2024
}
2125

2226
// processRepository handles the main logic and returns a StandardResponse
23-
func processRepository() StandardResponse {
27+
func processRepository(noLarge bool) StandardResponse {
2428
ctx := context.Background()
2529

26-
// Get target path from command line argument
30+
// Get target path from remaining non-flag arguments
31+
args := flag.Args()
2732
var targetPath string
28-
if len(os.Args) > 1 {
29-
targetPath = os.Args[1]
33+
if len(args) > 0 {
34+
targetPath = args[0]
3035
} else {
3136
errorCode := ErrorCodeInvalidArguments
32-
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
37+
errorMessage := fmt.Sprintf("Usage: %s [--no-large] <target-path>", os.Args[0])
3338
return StandardResponse{
3439
Status: StatusFailure,
3540
ErrorCode: &errorCode,
@@ -51,10 +56,10 @@ func processRepository() StandardResponse {
5156
// Process single repository (the target path argument)
5257
repoDir := config.TargetPath
5358

54-
insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
55-
if err != nil {
59+
insightsDb, dbErr := NewInsightsDB(ctx, config.InsightsDatabase)
60+
if dbErr != nil {
5661
errorCode := ErrorCodeDatabaseConnection
57-
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
62+
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
5863
return StandardResponse{
5964
Status: StatusFailure,
6065
ErrorCode: &errorCode,
@@ -76,7 +81,7 @@ func processRepository() StandardResponse {
7681
}
7782

7883
// Process the repository with SCC
79-
report, err := getSCCReport(config.SCCPath, repoDir)
84+
report, err := getSCCReport(config.SCCPath, repoDir, noLarge)
8085
if err != nil {
8186
errorCode := getErrorCodeFromSCCError(err)
8287
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
@@ -120,10 +125,10 @@ func processRepository() StandardResponse {
120125

121126

122127
// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
123-
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
124-
cost, err := getCost(sccPath, dirPath)
128+
func getSCCReport(sccPath, dirPath string, noLarge bool) (SCCReport, error) {
129+
cost, err := getCost(sccPath, dirPath, noLarge)
125130
if err != nil {
126-
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
131+
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
127132
}
128133

129134
// Skip saving to database if cost is 0 - do we want to do this?
@@ -133,7 +138,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
133138

134139
projectPath := filepath.Base(dirPath)
135140

136-
langStats, err := getLanguageStats(sccPath, dirPath)
141+
langStats, err := getLanguageStats(sccPath, dirPath, noLarge)
137142
if err != nil {
138143
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
139144
}
@@ -177,8 +182,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
177182
}
178183

179184
// getCost runs the scc command and parses the output to get the estimated cost.
180-
func getCost(sccPathPath, repoPath string) (float64, error) {
181-
output, err := runSCC(sccPathPath, "--format=short", repoPath)
185+
func getCost(sccPathPath, repoPath string, noLarge bool) (float64, error) {
186+
output, err := runSCC(sccPathPath, noLarge, "--format=short", repoPath)
182187
if err != nil {
183188
return 0, fmt.Errorf("failed to run scc command: %w", err)
184189
}
@@ -192,8 +197,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
192197
}
193198

194199
// getLanguageStats runs the scc command and parses the output to get language statistics.
195-
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
196-
output, err := runSCC(sccPathPath, "--format=json", repoPath)
200+
func getLanguageStats(sccPathPath, repoPath string, noLarge bool) ([]LanguageStats, error) {
201+
output, err := runSCC(sccPathPath, noLarge, "--format=json", repoPath)
197202
if err != nil {
198203
return nil, fmt.Errorf("failed to run scc command: %w", err)
199204
}
@@ -207,8 +212,14 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
207212
}
208213

209214
// runSCC executes the scc command with the given arguments and returns the output.
210-
func runSCC(sccPathPath string, args ...string) (string, error) {
211-
cmd := exec.Command(sccPathPath, args...)
215+
// When noLarge is true, files larger than 100MB are skipped to avoid OOM on large repos.
216+
func runSCC(sccPathPath string, noLarge bool, args ...string) (string, error) {
217+
var cmdArgs []string
218+
if noLarge {
219+
cmdArgs = append(cmdArgs, "--no-large", "--large-byte-count", "100000000")
220+
}
221+
cmdArgs = append(cmdArgs, args...)
222+
cmd := exec.Command(sccPathPath, cmdArgs...)
212223
output, err := cmd.Output()
213224
if err != nil {
214225
if exitErr, ok := err.(*exec.ExitError); ok {

services/apps/git_integration/src/crowdgit/services/software_value/software_value_service.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,21 @@
88
from crowdgit.services.base.base_service import BaseService
99
from crowdgit.services.utils import run_shell_command
1010

11+
_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB
12+
# Repos excluded from software value analysis.
13+
# f7f92577-f258-49f0-b5b4-ba07194ca040: data repo (not a code repo), produces misleading results.
14+
_SOFTWARE_VALUE_EXCLUDED_REPO_IDS = frozenset({"f7f92577-f258-49f0-b5b4-ba07194ca040"})
15+
16+
17+
async def _get_repo_size_bytes(repo_path: str) -> int:
18+
"""Return total disk usage of repo_path in bytes using du -sb."""
19+
try:
20+
output = await run_shell_command(["du", "-sb", repo_path], timeout=120)
21+
return int(output.split()[0])
22+
except Exception:
23+
pass
24+
return 0
25+
1126

1227
class SoftwareValueService(BaseService):
1328
"""Service for calculating software value metrics"""
@@ -20,16 +35,34 @@ def __init__(self):
2035
async def run(self, repo_id: str, repo_path: str) -> None:
2136
"""
2237
Triggers software value binary for given repo.
23-
Results are saved into insights database directly
38+
Results are saved into insights database directly.
39+
Repos in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS are skipped entirely.
40+
For repos larger than 10 GB, scc is run with --no-large (skipping files >100MB) to avoid OOM.
2441
"""
42+
if repo_id in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS:
43+
self.logger.info(f"Skipping software value for excluded repo {repo_id}")
44+
return
45+
2546
start_time = time.time()
2647
execution_status = ExecutionStatus.SUCCESS
2748
error_code = None
2849
error_message = None
2950

3051
try:
52+
cmd = [self.software_value_executable]
53+
54+
repo_size = await _get_repo_size_bytes(repo_path)
55+
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
56+
self.logger.info(
57+
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
58+
"running scc with no-large (skipping files >100MB)"
59+
)
60+
cmd += ["--no-large"]
61+
62+
cmd.append(repo_path)
63+
3164
self.logger.info("Running software value...")
32-
output = await run_shell_command([self.software_value_executable, repo_path])
65+
output = await run_shell_command(cmd)
3366
self.logger.info(f"Software value output: {output}")
3467

3568
# Parse JSON output and extract fields from StandardResponse structure
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_commits_ds` contains only authored-commit activities, pre-filtered from the full activities table.
3+
- Populated daily by `ai_code_tracker_commits_copy.pipe`.
4+
- Stores only the fields needed for AI tool detection: timestamp, title, body, attributes.
5+
- Reduces the dataset from ~1B rows to only commits, with sorting keys optimized for the AI pattern matching step.
6+
7+
TAGS "Report"
8+
9+
SCHEMA >
10+
`timestamp` DateTime,
11+
`title` String DEFAULT '',
12+
`body` String DEFAULT '',
13+
`attributes` String DEFAULT ''
14+
15+
ENGINE MergeTree
16+
ENGINE_PARTITION_KEY toYear(timestamp)
17+
ENGINE_SORTING_KEY timestamp
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_ds` contains pre-computed monthly aggregates of AI-assisted commits by tool.
3+
- Populated hourly by `ai_code_tracker_copy.pipe` which scans activities for AI tool signatures.
4+
- Each row represents one (month, toolKey) combination with commit counts.
5+
- Also stores total commits per month (toolKey = '__total__') for percentage calculations.
6+
- `monthStart` is the first day of the month (used for both monthly and yearly aggregation at query time).
7+
- `toolKey` identifies the AI tool (e.g., 'github-copilot', 'claude', 'cursor') or '__total__' for all commits.
8+
- `commitCount` is the number of commits for that tool in that month.
9+
10+
TAGS "Report"
11+
12+
SCHEMA >
13+
`monthStart` Date,
14+
`toolKey` LowCardinality(String),
15+
`commitCount` UInt64
16+
17+
ENGINE MergeTree
18+
ENGINE_PARTITION_KEY toYear(monthStart)
19+
ENGINE_SORTING_KEY monthStart, toolKey
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker.pipe` returns AI-assisted commit counts by tool and time period.
3+
- Reads from pre-computed `ai_code_tracker_ds` datasource (materialized hourly by `ai_code_tracker_copy.pipe`).
4+
- Parameters:
5+
- `granularity`: Required. Either 'monthly' or 'yearly'.
6+
- `startDate`: Optional DateTime filter for commits after this date.
7+
- `endDate`: Optional DateTime filter for commits before this date.
8+
- Response: toolKey, toolName, startDate, endDate, commitCount
9+
10+
TAGS "Report"
11+
12+
NODE ai_code_tracker_result
13+
DESCRIPTION >
14+
Aggregate pre-computed AI commit counts by tool and time period
15+
16+
SQL >
17+
%
18+
SELECT
19+
toolKey,
20+
multiIf(
21+
toolKey = 'github-copilot',
22+
'GitHub Copilot',
23+
toolKey = 'chatgpt',
24+
'ChatGPT',
25+
toolKey = 'claude',
26+
'Claude',
27+
toolKey = 'cursor',
28+
'Cursor',
29+
toolKey = 'codewhisperer',
30+
'CodeWhisperer',
31+
toolKey = 'gemini',
32+
'Gemini',
33+
toolKey = 'codeium',
34+
'Codeium',
35+
toolKey = 'aider',
36+
'Aider',
37+
toolKey = 'devin',
38+
'Devin',
39+
toolKey = 'tabnine',
40+
'Tabnine',
41+
toolKey = 'other',
42+
'Other AI',
43+
'Unknown'
44+
) AS toolName,
45+
formatDateTime(
46+
CASE
47+
WHEN
48+
{{
49+
String(
50+
granularity,
51+
description="Time aggregation: monthly or yearly",
52+
required=True,
53+
)
54+
}} = 'monthly'
55+
THEN monthStart
56+
ELSE toStartOfYear(monthStart)
57+
END,
58+
'%Y-%m-%d'
59+
) AS startDate,
60+
formatDateTime(
61+
CASE
62+
WHEN
63+
{{
64+
String(
65+
granularity,
66+
description="Time aggregation: monthly or yearly",
67+
required=True,
68+
)
69+
}} = 'monthly'
70+
THEN monthStart + INTERVAL 1 MONTH - INTERVAL 1 DAY
71+
ELSE toStartOfYear(monthStart) + INTERVAL 1 YEAR - INTERVAL 1 DAY
72+
END,
73+
'%Y-%m-%d'
74+
) AS endDate,
75+
sum(commitCount) AS commitCount
76+
FROM ai_code_tracker_ds
77+
WHERE
78+
toolKey != '__total__'
79+
{% if defined(startDate) %}
80+
AND monthStart >= toDate(
81+
{{ DateTime(startDate, description="Filter commits after this date", required=False) }}
82+
)
83+
{% end %}
84+
{% if defined(endDate) %}
85+
AND monthStart < toDate(
86+
{{ DateTime(endDate, description="Filter commits before this date", required=False) }}
87+
)
88+
{% end %}
89+
GROUP BY toolKey, startDate, endDate
90+
ORDER BY startDate ASC, commitCount DESC
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
DESCRIPTION >
2+
- `ai_code_tracker_commits_copy.pipe` extracts only authored-commit rows from `activities_deduplicated_ds`.
3+
- Runs daily to populate `ai_code_tracker_commits_ds` with a small subset (authored commits only) of the full 1B+ activities table.
4+
- This intermediate datasource is then used by `ai_code_tracker_copy.pipe` for fast AI pattern matching.
5+
6+
TAGS "Report"
7+
8+
NODE ai_code_tracker_commits_copy_result
9+
SQL >
10+
SELECT a.timestamp, a.title, a.body, a.attributes
11+
FROM activities_deduplicated_ds a
12+
WHERE a.type = 'authored-commit'
13+
14+
TYPE COPY
15+
TARGET_DATASOURCE ai_code_tracker_commits_ds
16+
COPY_MODE replace
17+
COPY_SCHEDULE 0 2 * * *

0 commit comments

Comments (0)