diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index b3d31765..0780061c 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -73,6 +73,8 @@ type Evaluate struct { Configuration string `long:"configuration" description:"Configuration file to set up an evaluation run."` // ExecutionTimeout holds the timeout for an execution. ExecutionTimeout uint `long:"execution-timeout" description:"Execution timeout for compilation and tests in minutes." default:"5"` + // RunIDStartsAt holds the offset increment for the run ID used in creating the result folders. + RunIDStartsAt uint `long:"run-id-starts-at" description:"Sets the starting index for the run ID." default:"1"` // Runs holds the number of runs to perform. Runs uint `long:"runs" description:"Number of runs to perform." default:"1"` // RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially. @@ -175,6 +177,8 @@ func (command *Evaluate) Initialize(args []string) (evaluationContext *evaluate. language.DefaultExecutionTimeout = time.Duration(command.ExecutionTimeout) * time.Minute } + evaluationContext.RunIDStartsAt = command.RunIDStartsAt + if command.Runs == 0 { command.logger.Panicf("number of configured runs must be greater than zero") } diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 919e944f..94b9dd18 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -38,6 +38,8 @@ type Context struct { // TestdataPath determines the testdata path where all repositories reside grouped by languages. TestdataPath string + // RunIDStartsAt holds the starting index of the run IDs assigned when running an evaluation multiple times. + RunIDStartsAt uint // Runs holds the number of runs to perform. Runs uint // RunsSequential indicates that interleaved runs are disabled and runs are performed sequentially. 
@@ -148,6 +150,7 @@ func Evaluate(ctx *Context) { } else { runCount = rl + 1 } + runID := ctx.RunIDStartsAt + runCount - 1 if err := temporaryRepository.Reset(logger); err != nil { logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -177,7 +180,7 @@ func Evaluate(ctx *Context) { } // Write the task assessment to the evaluation CSV file. - if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runCount, assessment); err != nil { + if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runID, assessment); err != nil { logger.Panicf("ERROR: cannot write evaluation record: %s", err) } } @@ -271,6 +274,7 @@ func Evaluate(ctx *Context) { } else { runCount = rl + 1 } + runID := ctx.RunIDStartsAt + runCount - 1 if err := temporaryRepository.Reset(logger); err != nil { logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -292,7 +296,7 @@ func Evaluate(ctx *Context) { } // Write the task assessment to the evaluation CSV file. 
- if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runCount, assessment); err != nil { + if err := evaluationFile.WriteEvaluationRecord(model, language, temporaryRepository.Name(), runID, assessment); err != nil { logger.Panicf("ERROR: cannot write evaluation record: %s", err) } } diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go index 8719a7bb..140c858f 100644 --- a/evaluate/evaluate_test.go +++ b/evaluate/evaluate_test.go @@ -29,6 +29,8 @@ import ( modeltesting "github.com/symflower/eval-dev-quality/model/testing" "github.com/symflower/eval-dev-quality/provider" providertesting "github.com/symflower/eval-dev-quality/provider/testing" + "github.com/symflower/gota/dataframe" + "github.com/symflower/gota/series" ) var ( @@ -1235,6 +1237,7 @@ func TestEvaluate(t *testing.T) { repositoryPath, }, + RunIDStartsAt: 11, Runs: 3, RunsSequential: false, }, @@ -1395,7 +1398,14 @@ func TestEvaluate(t *testing.T) { assert.Equal(t, 1, strings.Count(data, "creating temporary repository"), "create only one temporary repository") }, filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, - "evaluation.csv": nil, + "evaluation.csv": func(t *testing.T, filePath string, data string) { + dataFrame := dataframe.ReadCSV(strings.NewReader(data)) + assert.NoError(t, dataFrame.Err) + + expectedColumnRun := series.New([]int{11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13}, series.Int, "run") + actualColumnRun := dataFrame.Col("run") + assert.Equal(t, expectedColumnRun, actualColumnRun) + }, }, }) } @@ -1425,6 +1435,7 @@ func TestEvaluate(t *testing.T) { repositoryPath, }, + RunIDStartsAt: 21, Runs: 3, RunsSequential: true, }, @@ -1585,7 +1596,14 @@ func TestEvaluate(t *testing.T) { assert.Contains(t, data, "\"msg\":\"starting run\",\"count\":3,\"total\":3,") assert.NotRegexp(t, `\\\"msg\\\":\\\"starting 
run\\\",\\\"count\\\":\d+,\\\"total\\\":\d+\}`, data) }, - "evaluation.csv": nil, + "evaluation.csv": func(t *testing.T, filePath string, data string) { + dataFrame := dataframe.ReadCSV(strings.NewReader(data)) + assert.NoError(t, dataFrame.Err) + + expectedColumnRun := series.New([]int{21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23}, series.Int, "run") + actualColumnRun := dataFrame.Col("run") + assert.Equal(t, expectedColumnRun, actualColumnRun) + }, }, }) } diff --git a/go.mod b/go.mod index cc4e9605..778648dd 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/sashabaranov/go-openai v1.38.0 github.com/stretchr/testify v1.10.0 + github.com/symflower/gota v0.0.0-20250312083757-2306c53d6db1 github.com/symflower/lockfile v0.0.0-20240419143922-aa3b60940c84 github.com/zimmski/osutil v1.7.1 golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 @@ -68,6 +69,7 @@ require ( github.com/yuin/goldmark v1.7.8 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/exp/typeparams v0.0.0-20250305212735-054e65f0b394 // indirect + golang.org/x/net v0.37.0 // indirect golang.org/x/sync v0.12.0 // indirect golang.org/x/sys v0.31.0 // indirect golang.org/x/telemetry v0.0.0-20250310203348-fdfaad844314 // indirect diff --git a/go.sum b/go.sum index b34f1c76..e19fd5f4 100644 --- a/go.sum +++ b/go.sum @@ -128,6 +128,8 @@ github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOf github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/symflower/gota v0.0.0-20250312083757-2306c53d6db1 h1:I+amS+Ns1Jqe1pPeC+QzshwKqFT7K79SUkKEOwWdYX0= +github.com/symflower/gota v0.0.0-20250312083757-2306c53d6db1/go.mod h1:UltkFgAA4b+vETI4lB6yS2FmZG9SUSJRxJuPwmCLytg= github.com/symflower/lockfile 
v0.0.0-20240419143922-aa3b60940c84 h1:yhPz6r3LLBDjoV0rIDUlyuvWQg9L4MTfdksLVX6/q0s= github.com/symflower/lockfile v0.0.0-20240419143922-aa3b60940c84/go.mod h1:W/87GmsQmvlsvcXeuAlTGjIpTHrTTHDEIVH936LjnqI= github.com/symflower/pretty v1.0.0 h1:wYSv0CBazyyzHNiGTwjkLzcmUQUFjRafEyWf3A7LJCk= @@ -150,6 +152,8 @@ golang.org/x/exp/typeparams v0.0.0-20250305212735-054e65f0b394 h1:VI4qDpTkfFaCXE golang.org/x/exp/typeparams v0.0.0-20250305212735-054e65f0b394/go.mod h1:LKZHyeOpPuZcMgxeHjJp4p5yvxrCX1xDvH10zYHhjjQ= golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c= +golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=