Skip to content

Commit e6955b8

Browse files
committed
fix: disable concurrency on scc when repo size greater than 10G
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
1 parent 22868fc commit e6955b8

File tree

3 files changed

+64
-19
lines changed

3 files changed

+64
-19
lines changed

services/apps/git_integration/src/crowdgit/services/software_value/main.go

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"encoding/json"
6+
"flag"
67
"fmt"
78
"os"
89
"os/exec"
@@ -13,23 +14,27 @@ import (
1314
)
1415

1516
func main() {
16-
response := processRepository()
17+
numProcessors := flag.Int("num-processors", 0, "Number of parallel scc workers (0 = scc default, 1 = minimum for large repos)")
18+
flag.Parse()
19+
20+
response := processRepository(*numProcessors)
1721
outputJSON(response)
1822

1923
// Always exit with code 0 - status details are in JSON response
2024
}
2125

2226
// processRepository handles the main logic and returns a StandardResponse
23-
func processRepository() StandardResponse {
27+
func processRepository(numProcessors int) StandardResponse {
2428
ctx := context.Background()
2529

26-
// Get target path from command line argument
30+
// Get target path from remaining non-flag arguments
31+
args := flag.Args()
2732
var targetPath string
28-
if len(os.Args) > 1 {
29-
targetPath = os.Args[1]
33+
if len(args) > 0 {
34+
targetPath = args[0]
3035
} else {
3136
errorCode := ErrorCodeInvalidArguments
32-
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
37+
errorMessage := fmt.Sprintf("Usage: %s [--num-processors N] <target-path>", os.Args[0])
3338
return StandardResponse{
3439
Status: StatusFailure,
3540
ErrorCode: &errorCode,
@@ -84,7 +89,7 @@ func processRepository() StandardResponse {
8489
}
8590

8691
// Process the repository with SCC
87-
report, err := getSCCReport(config.SCCPath, repoDir)
92+
report, err := getSCCReport(config.SCCPath, repoDir, numProcessors)
8893
if err != nil {
8994
errorCode := getErrorCodeFromSCCError(err)
9095
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
@@ -136,8 +141,8 @@ func processRepository() StandardResponse {
136141

137142

138143
// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
139-
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
140-
cost, err := getCost(sccPath, dirPath)
144+
func getSCCReport(sccPath, dirPath string, numProcessors int) (SCCReport, error) {
145+
cost, err := getCost(sccPath, dirPath, numProcessors)
141146
if err != nil {
142147
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
143148
}
@@ -149,7 +154,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
149154

150155
projectPath := filepath.Base(dirPath)
151156

152-
langStats, err := getLanguageStats(sccPath, dirPath)
157+
langStats, err := getLanguageStats(sccPath, dirPath, numProcessors)
153158
if err != nil {
154159
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
155160
}
@@ -193,8 +198,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
193198
}
194199

195200
// getCost runs the scc command and parses the output to get the estimated cost.
196-
func getCost(sccPathPath, repoPath string) (float64, error) {
197-
output, err := runSCC(sccPathPath, "--format=short", repoPath)
201+
func getCost(sccPathPath, repoPath string, numProcessors int) (float64, error) {
202+
output, err := runSCC(sccPathPath, numProcessors, "--format=short", repoPath)
198203
if err != nil {
199204
return 0, fmt.Errorf("failed to run scc command: %w", err)
200205
}
@@ -208,8 +213,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
208213
}
209214

210215
// getLanguageStats runs the scc command and parses the output to get language statistics.
211-
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
212-
output, err := runSCC(sccPathPath, "--format=json", repoPath)
216+
func getLanguageStats(sccPathPath, repoPath string, numProcessors int) ([]LanguageStats, error) {
217+
output, err := runSCC(sccPathPath, numProcessors, "--format=json", repoPath)
213218
if err != nil {
214219
return nil, fmt.Errorf("failed to run scc command: %w", err)
215220
}
@@ -223,8 +228,18 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
223228
}
224229

225230
// runSCC executes the scc command with the given arguments and returns the output.
226-
func runSCC(sccPathPath string, args ...string) (string, error) {
227-
cmd := exec.Command(sccPathPath, args...)
231+
// When numProcessors > 0, scc is run with reduced parallelism to limit memory usage on large repos.
232+
func runSCC(sccPathPath string, numProcessors int, args ...string) (string, error) {
233+
var cmdArgs []string
234+
if numProcessors > 0 {
235+
n := strconv.Itoa(numProcessors)
236+
cmdArgs = append(cmdArgs,
237+
"--directory-walker-job-workers", n,
238+
"--file-process-job-workers", n,
239+
)
240+
}
241+
cmdArgs = append(cmdArgs, args...)
242+
cmd := exec.Command(sccPathPath, cmdArgs...)
228243
output, err := cmd.Output()
229244
if err != nil {
230245
if exitErr, ok := err.(*exec.ExitError); ok {

services/apps/git_integration/src/crowdgit/services/software_value/software_value_service.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import subprocess
23
import time
34
from decimal import Decimal
45

@@ -8,6 +9,21 @@
89
from crowdgit.services.base.base_service import BaseService
910
from crowdgit.services.utils import run_shell_command
1011

12+
_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB
13+
14+
15+
def _get_repo_size_bytes(repo_path: str) -> int:
16+
"""Return total disk usage of repo_path in bytes using du -sb."""
17+
try:
18+
result = subprocess.run(
19+
["du", "-sb", repo_path], capture_output=True, text=True, timeout=120
20+
)
21+
if result.returncode == 0:
22+
return int(result.stdout.split()[0])
23+
except Exception:
24+
pass
25+
return 0
26+
1127

1228
class SoftwareValueService(BaseService):
1329
"""Service for calculating software value metrics"""
@@ -20,16 +36,30 @@ def __init__(self):
2036
async def run(self, repo_id: str, repo_path: str) -> None:
2137
"""
2238
Triggers software value binary for given repo.
23-
Results are saved into insights database directly
39+
Results are saved into insights database directly.
40+
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
41+
to avoid OOM; results are identical.
2442
"""
2543
start_time = time.time()
2644
execution_status = ExecutionStatus.SUCCESS
2745
error_code = None
2846
error_message = None
2947

3048
try:
49+
cmd = [self.software_value_executable]
50+
51+
repo_size = _get_repo_size_bytes(repo_path)
52+
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
53+
self.logger.info(
54+
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
55+
"running scc with num-processors=1"
56+
)
57+
cmd += ["--num-processors", "1"]
58+
59+
cmd.append(repo_path)
60+
3161
self.logger.info("Running software value...")
32-
output = await run_shell_command([self.software_value_executable, repo_path])
62+
output = await run_shell_command(cmd)
3363
self.logger.info(f"Software value output: {output}")
3464

3565
# Parse JSON output and extract fields from StandardResponse structure

services/apps/git_integration/src/crowdgit/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,4 @@ def load_env_var(key: str, required=True, default=None):
4444
STUCK_RECURRENT_REPO_TIMEOUT_HOURS = int(
4545
load_env_var("STUCK_RECURRENT_REPO_TIMEOUT_HOURS", default="4")
4646
)
47-
IS_PROD_ENV: bool = load_env_var("NODE_ENV", required=False) == "production"
47+
IS_PROD_ENV: bool = load_env_var("NODE_ENV", required=False) == "production"

0 commit comments

Comments
 (0)