Skip to content

Commit f9dfac1

Browse files
Klesh Wongd4x1
andcommitted
feat: gitlab extractors and convertors support incremental mode (#7997)
* feat: gitlab mr_extractor support incremental sync * feat: incr mode transformation support for deployment/issue/job/mr and others * feat: removed mr_enricher and all gitlab transformers support incr-mode * fix: some gitlab subtasks are missing connection in left join clause * fix: test cases failed due to gitlab transformers support incr-mode * fix: linting * fix: do not collect all accounts from jihulab.com * fix: typo * fix: gitlab mr comments won't be converted to domain layer till next run * feat: improve gitlab MR comments/commits collection performance * fix: gitlab issues/mrs child records are not deleted * docs: update stateful extractor doc * fix: gitlab mr detail test * refactor: unify stateful extractor and convertor helper * docs: update stateful extractor/convertor doc * refactor: remove useless type hint * fix: shoud not be deleting records not extracted by current extractor * fix: jira issue extractor should not deleting sprint_issue * refactor: remove commit related subtasks * fix: remove commit conversion e2e test * feat(gitlab): update ExtractApiChildPipelines (#8016) * fix: retransform should run in fullsync mode * fix: gitlab issue assignees are not being converted --------- Co-authored-by: Lynwee <1507509064@qq.com>
1 parent 83ec4a5 commit f9dfac1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+843
-1401
lines changed

backend/helpers/pluginhelper/api/api_extractor_stateful.go

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
package api
1919

2020
import (
21+
"encoding/json"
2122
"reflect"
2223

2324
"github.com/apache/incubator-devlake/core/dal"
@@ -27,30 +28,72 @@ import (
2728
)
2829

2930
// StatefulApiExtractorArgs is a struct that contains the arguments for a stateful api extractor
30-
type StatefulApiExtractorArgs struct {
31+
type StatefulApiExtractorArgs[InputType any] struct {
3132
*SubtaskCommonArgs
32-
Extract func(row *RawData) ([]any, errors.Error)
33+
BeforeExtract func(issue *InputType, stateManager *SubtaskStateManager) errors.Error
34+
Extract func(body *InputType, row *RawData) ([]any, errors.Error)
3335
}
3436

35-
type StatefulApiExtractor struct {
36-
*StatefulApiExtractorArgs
37+
// StatefulApiExtractor is a struct that manages the stateful API extraction process.
38+
// It facilitates extracting data from a single _raw_data table and saving it into multiple Tool Layer tables.
39+
// By default, the extractor operates in Incremental Mode, processing only new records added to the raw table since the previous run.
40+
// This approach reduces the amount of data to process, significantly decreasing the execution time.
41+
// The extractor automatically detects if the configuration has changed since the last run. If a change is detected,
42+
// it will automatically switch to Full-Sync mode.
43+
//
44+
// Example:
45+
//
46+
// extractor, err := api.NewStatefulApiExtractor(&api.StatefulApiExtractorArgs[apiv2models.Issue]{
47+
// SubtaskCommonArgs: &api.SubtaskCommonArgs{
48+
// SubTaskContext: subtaskCtx,
49+
// Table: RAW_ISSUE_TABLE,
50+
// Params: JiraApiParams{
51+
// ConnectionId: data.Options.ConnectionId,
52+
// BoardId: data.Options.BoardId,
53+
// },
54+
// SubtaskConfig: config, // The helper stores this configuration in the state and compares it with the previous one
55+
// // to determine the operating mode (Incremental/FullSync).
56+
// // Ensure that the configuration is serializable and contains only public fields.
57+
// // It is also recommended that the configuration includes only the necessary fields used by the extractor.
58+
// ..},
59+
// BeforeExtract: func(body *apiv2models.Issue, stateManager *api.SubtaskStateManager) errors.Error {
60+
// if stateManager.IsIncremental() {
61+
// // It is important to delete all existing child-records in Incremental Mode
62+
// err := db.Delete(
63+
// &models.JiraIssueLabel{},
64+
// dal.Where("connection_id = ? AND issue_id = ?", data.Options.ConnectionId, body.Id),
65+
// )
66+
// }
67+
// return nil
68+
// },
69+
// Extract: func(apiIssue *apiv2models.Issue, row *api.RawData) ([]interface{}, errors.Error) {
70+
// },
71+
// })
72+
//
73+
// if err != nil {
74+
// return err
75+
// }
76+
//
77+
// return extractor.Execute()
78+
type StatefulApiExtractor[InputType any] struct {
79+
*StatefulApiExtractorArgs[InputType]
3780
*SubtaskStateManager
3881
}
3982

4083
// NewStatefulApiExtractor creates a new StatefulApiExtractor
41-
func NewStatefulApiExtractor(args *StatefulApiExtractorArgs) (*StatefulApiExtractor, errors.Error) {
84+
func NewStatefulApiExtractor[InputType any](args *StatefulApiExtractorArgs[InputType]) (*StatefulApiExtractor[InputType], errors.Error) {
4285
stateManager, err := NewSubtaskStateManager(args.SubtaskCommonArgs)
4386
if err != nil {
4487
return nil, err
4588
}
46-
return &StatefulApiExtractor{
89+
return &StatefulApiExtractor[InputType]{
4790
StatefulApiExtractorArgs: args,
4891
SubtaskStateManager: stateManager,
4992
}, nil
5093
}
5194

5295
// Execute sub-task
53-
func (extractor *StatefulApiExtractor) Execute() errors.Error {
96+
func (extractor *StatefulApiExtractor[InputType]) Execute() errors.Error {
5497
// load data from database
5598
db := extractor.GetDal()
5699
logger := extractor.GetLogger()
@@ -103,7 +146,20 @@ func (extractor *StatefulApiExtractor) Execute() errors.Error {
103146
return errors.Default.Wrap(err, "error fetching row")
104147
}
105148

106-
results, err := extractor.Extract(row)
149+
body := new(InputType)
150+
err = errors.Convert(json.Unmarshal(row.Data, body))
151+
if err != nil {
152+
return err
153+
}
154+
155+
if extractor.BeforeExtract != nil {
156+
err = extractor.BeforeExtract(body, extractor.SubtaskStateManager)
157+
if err != nil {
158+
return err
159+
}
160+
}
161+
162+
results, err := extractor.Extract(body, row)
107163
if err != nil {
108164
return errors.Default.Wrap(err, "error calling plugin Extract implementation")
109165
}
@@ -137,4 +193,4 @@ func (extractor *StatefulApiExtractor) Execute() errors.Error {
137193
return extractor.SubtaskStateManager.Close()
138194
}
139195

140-
var _ plugin.SubTask = (*StatefulApiExtractor)(nil)
196+
var _ plugin.SubTask = (*StatefulApiExtractor[any])(nil)

backend/helpers/pluginhelper/api/api_rawdata.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ type RawData struct {
3434
Data []byte
3535
Url string
3636
Input json.RawMessage `gorm:"type:json"`
37-
CreatedAt time.Time
37+
CreatedAt time.Time `gorm:"index"`
3838
}
3939

4040
type TaskOptions interface {

backend/helpers/pluginhelper/api/data_convertor_stateful.go

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,82 @@ import (
2727

2828
type StatefulDataConverterArgs[InputType any] struct {
2929
*SubtaskCommonArgs
30-
Input func(*SubtaskStateManager) (dal.Rows, errors.Error)
31-
Convert func(row *InputType) ([]any, errors.Error)
32-
BatchSize int
30+
Input func(*SubtaskStateManager) (dal.Rows, errors.Error)
31+
BeforeConvert func(issue *InputType, stateManager *SubtaskStateManager) errors.Error
32+
Convert func(row *InputType) ([]any, errors.Error)
33+
BatchSize int
3334
}
3435

36+
// StatefulDataConverter is a struct that manages the stateful data conversion process.
37+
// It facilitates converting data from a database cursor and saving it into arbitrary tables.
38+
// The converter determines the operating mode (Incremental/FullSync) based on the stored state and configuration.
39+
// It then calls the provided `Input` function to obtain the `dal.Rows` (the database cursor) and processes each
40+
// record individually through the `Convert` function, saving the results to the database.
41+
//
42+
// For Incremental mode to work properly, it is crucial to check `stateManager.IsIncremental()` and utilize
43+
// `stateManager.GetSince()` to build your query in the `Input` function, ensuring that only the necessary
44+
// records are fetched.
45+
//
46+
// The converter automatically detects if the configuration has changed since the last run. If a change is detected,
47+
// it will automatically switch to Full-Sync mode.
48+
//
49+
// Example:
50+
//
51+
// converter, err := api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[models.JiraIssue]{
52+
// SubtaskCommonArgs: &api.SubtaskCommonArgs{
53+
// SubTaskContext: subtaskCtx,
54+
// Table: RAW_ISSUE_TABLE,
55+
// Params: JiraApiParams{
56+
// ConnectionId: data.Options.ConnectionId,
57+
// BoardId: data.Options.BoardId,
58+
// },
59+
// SubtaskConfig: mappings,
60+
// },
61+
// Input: func(stateManager *api.SubtaskStateManager) (dal.Rows, errors.Error) {
62+
// clauses := []dal.Clause{
63+
// dal.Select("_tool_jira_issues.*"),
64+
// dal.From("_tool_jira_issues"),
65+
// dal.Join(`left join _tool_jira_board_issues
66+
// on _tool_jira_board_issues.issue_id = _tool_jira_issues.issue_id
67+
// and _tool_jira_board_issues.connection_id = _tool_jira_issues.connection_id`),
68+
// dal.Where(
69+
// "_tool_jira_board_issues.connection_id = ? AND _tool_jira_board_issues.board_id = ?",
70+
// data.Options.ConnectionId,
71+
// data.Options.BoardId,
72+
// ),
73+
// }
74+
// if stateManager.IsIncremental() { // IMPORTANT: to filter records for Incremental Mode
75+
// since := stateManager.GetSince()
76+
// if since != nil {
77+
// clauses = append(clauses, dal.Where("_tool_jira_issues.updated_at >= ? ", since))
78+
// }
79+
// }
80+
// return db.Cursor(clauses...)
81+
// },
82+
// BeforeConvert: func(jiraIssue *models.JiraIssue, stateManager *api.SubtaskStateManager) errors.Error {
83+
// // It is important to delete all existing child-records in Incremental Mode
84+
// issueId := issueIdGen.Generate(data.Options.ConnectionId, jiraIssue.IssueId)
85+
// if err := db.Delete(&ticket.IssueAssignee{}, dal.Where("issue_id = ?", issueId)); err != nil {
86+
// return err
87+
// }
88+
// ...
89+
// return nil
90+
// },
91+
// Convert: func(jiraIssue *models.JiraIssue) ([]interface{}, errors.Error) {
92+
// },
93+
// })
94+
95+
// if err != nil {
96+
// return err
97+
// }
98+
99+
// return converter.Execute()
35100
type StatefulDataConverter[InputType any] struct {
36101
*StatefulDataConverterArgs[InputType]
37102
*SubtaskStateManager
38103
}
39104

40105
func NewStatefulDataConverter[
41-
OptType any,
42106
InputType any,
43107
](
44108
args *StatefulDataConverterArgs[InputType],
@@ -91,6 +155,13 @@ func (converter *StatefulDataConverter[InputType]) Execute() errors.Error {
91155
return errors.Default.Wrap(err, "error fetching rows")
92156
}
93157

158+
if converter.BeforeConvert != nil {
159+
err = converter.BeforeConvert(inputRow, converter.SubtaskStateManager)
160+
if err != nil {
161+
return err
162+
}
163+
}
164+
94165
results, err := converter.Convert(inputRow)
95166
if err != nil {
96167
return errors.Default.Wrap(err, "error calling Converter plugin implementation")

backend/helpers/pluginhelper/api/enrich_with_regex.go

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@ import (
2828
// TODO: remove Enricher from naming since it is more like a util function
2929
type RegexEnricher struct {
3030
// This field will store compiled regular expression for every pattern
31-
regexpMap map[string]*regexp.Regexp
31+
regexpMap map[string]*regexp.Regexp
32+
regexMapList map[string][]*regexp.Regexp
3233
}
3334

3435
// NewRegexEnricher initialize a regexEnricher
3536
func NewRegexEnricher() *RegexEnricher {
36-
return &RegexEnricher{regexpMap: make(map[string]*regexp.Regexp)}
37+
return &RegexEnricher{regexpMap: make(map[string]*regexp.Regexp), regexMapList: make(map[string][]*regexp.Regexp)}
3738
}
3839

3940
// AddRegexp will add compiled regular expression for pattern to regexpMap
@@ -105,3 +106,59 @@ func (r *RegexEnricher) ReturnNameIfOmittedOrMatched(name string, targets ...str
105106
}
106107
return r.ReturnNameIfMatched(name, targets...)
107108
}
109+
110+
func (r *RegexEnricher) PlainMap() map[string]string {
111+
m := make(map[string]string)
112+
for k, v := range r.regexpMap {
113+
m[k] = v.String()
114+
}
115+
return m
116+
}
117+
118+
// TryAdd a named regexp if given pattern is not empty
119+
func (r *RegexEnricher) TryAddList(name string, patterns ...string) errors.Error {
120+
if _, ok := r.regexMapList[name]; ok {
121+
return errors.Default.New(fmt.Sprintf("Regex pattern with name: %s already exists", name))
122+
}
123+
var regexList []*regexp.Regexp
124+
for _, pattern := range patterns {
125+
if pattern == "" {
126+
continue
127+
}
128+
regex, err := errors.Convert01(regexp.Compile(pattern))
129+
if err != nil {
130+
return errors.BadInput.Wrap(err, fmt.Sprintf("Fail to compile pattern for regex pattern: %s", pattern))
131+
}
132+
regexList = append(regexList, regex)
133+
}
134+
135+
// Only save non-empty regexList
136+
if len(regexList) > 0 {
137+
r.regexMapList[name] = regexList
138+
}
139+
return nil
140+
}
141+
142+
// ReturnNameIfMatched will return name if any of the targets matches the regex associated with the given name
143+
func (r *RegexEnricher) ReturnNameIfMatchedList(name string, targets ...string) string {
144+
if regexList, ok := r.regexMapList[name]; !ok {
145+
return ""
146+
} else {
147+
for _, regex := range regexList {
148+
for _, target := range targets {
149+
if regex.MatchString(target) {
150+
return name
151+
}
152+
}
153+
}
154+
return "" // If any regex fails to match, return ""
155+
}
156+
}
157+
158+
// ReturnNameIfOmittedOrMatched returns the given name if regex of the given name is omitted or fallback to ReturnNameIfMatched
159+
func (r *RegexEnricher) ReturnNameIfOmittedOrMatchedList(name string, targets ...string) string {
160+
if _, ok := r.regexMapList[name]; !ok {
161+
return name
162+
}
163+
return r.ReturnNameIfMatched(name, targets...)
164+
}

backend/helpers/pluginhelper/api/subtask_state_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ type SubtaskCommonArgs struct {
3434
plugin.SubTaskContext
3535
Table string // raw table name
3636
Params any // for filtering rows belonging to the scope (jira board, github repo) of the subtask
37-
SubtaskConfig any // for determining whether the subtask should run in incremental or full sync mode
37+
SubtaskConfig any // for determining whether the subtask should run in Incremental or Full-Sync mode by comparing with the previous config to see if it changed
3838
BatchSize int // batch size for saving data
3939
}
4040

backend/impls/dalgorm/encdec_serializer.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,11 @@ func (es *EncDecSerializer) Value(ctx context.Context, field *schema.Field, dst
9191
}
9292
target = string(b)
9393
}
94+
if field.GORMDataType == "string" {
95+
println("field.GORMDataType == string", field.Size)
96+
gormTag, ok := field.Tag.Lookup("gorm")
97+
println(ok, gormTag)
98+
}
9499
return plugin.Encrypt(es.encryptionSecret, target)
95100
}
96101

backend/plugins/gitlab/e2e/mr_commits_test.go

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -133,37 +133,4 @@ func TestGitlabMrCommitDataFlow(t *testing.T) {
133133
CSVRelPath: "./snapshot_tables/pull_request_commits.csv",
134134
IgnoreTypes: []interface{}{common.Model{}},
135135
})
136-
137-
// verify conversion
138-
dataflowTester.FlushTabler(&code.Commit{})
139-
dataflowTester.FlushTabler(&code.RepoCommit{})
140-
dataflowTester.Subtask(tasks.ConvertCommitsMeta, taskData)
141-
dataflowTester.VerifyTable(
142-
code.Commit{},
143-
"./snapshot_tables/commits.csv",
144-
e2ehelper.ColumnWithRawData(
145-
"sha",
146-
"additions",
147-
"deletions",
148-
"dev_eq",
149-
"message",
150-
"author_name",
151-
"author_email",
152-
"authored_date",
153-
"author_id",
154-
"committer_name",
155-
"committer_email",
156-
"committed_date",
157-
"committer_id",
158-
),
159-
)
160-
161-
dataflowTester.VerifyTable(
162-
code.RepoCommit{},
163-
"./snapshot_tables/repo_commits.csv",
164-
e2ehelper.ColumnWithRawData(
165-
"repo_id",
166-
"commit_sha",
167-
),
168-
)
169136
}

backend/plugins/gitlab/e2e/mr_detail_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ func TestGitlabMrDetailDataFlow(t *testing.T) {
4141
},
4242
}
4343
// import raw data table
44+
dataflowTester.FlushTabler(&code.PullRequestAssignee{})
45+
dataflowTester.FlushTabler(&code.PullRequestReviewer{})
4446
dataflowTester.ImportCsvIntoRawTable("./raw_tables/_raw_gitlab_api_merge_requests.csv",
4547
"_raw_gitlab_api_merge_request_details")
4648

0 commit comments

Comments
 (0)