Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion app/src/main/kotlin/io/klibs/app/Application.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package io.klibs.app
import io.klibs.app.configuration.properties.GoogleMavenCacheConfigurationProperties
import io.klibs.app.configuration.properties.ApiDocsProperties
import io.klibs.app.configuration.properties.AuthProperties
import io.klibs.core.scm.repository.health.OssHealthProperties
import org.springframework.boot.SpringApplication
import org.springframework.boot.autoconfigure.SpringBootApplication
import org.springframework.boot.autoconfigure.domain.EntityScan
Expand All @@ -13,7 +14,7 @@ fun main() {
SpringApplication.run(Application::class.java)
}

@EnableConfigurationProperties(value = [AuthProperties::class, ApiDocsProperties::class, GoogleMavenCacheConfigurationProperties::class])
@EnableConfigurationProperties(value = [AuthProperties::class, ApiDocsProperties::class, GoogleMavenCacheConfigurationProperties::class, OssHealthProperties::class])
@SpringBootApplication(scanBasePackages = ["io.klibs"])
@EntityScan(basePackages = ["io.klibs.**.entity"])
@EnableJpaRepositories(basePackages = ["io.klibs.**.repository"])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ package io.klibs.app.job

import io.klibs.app.indexing.GitHubIndexingService
import io.klibs.app.util.BackoffProvider
import io.klibs.core.scm.repository.ScmRepositoryEntity
import io.klibs.core.scm.repository.ScmRepositoryRepository
import io.klibs.core.scm.repository.health.OssHealthIssueOrPrSyncService
import io.klibs.core.scm.repository.health.OssHealthScoreService
import org.springframework.beans.factory.annotation.Qualifier
import net.javacrumbs.shedlock.core.LockAssert
import net.javacrumbs.shedlock.spring.annotation.SchedulerLock
Expand All @@ -11,14 +14,15 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Component
import org.springframework.stereotype.Service
import java.time.Instant
import java.util.concurrent.TimeUnit

@Component
@ConditionalOnProperty("klibs.indexing", havingValue = "true")
class GitHubRepositoryUpdatingJob(val gitHubRepositoryUpdatingService: GitHubRepositoryUpdatingService) {

@Scheduled(initialDelay = 30, fixedRate = 30, timeUnit = TimeUnit.SECONDS)
@SchedulerLock(name = "updateGitHubRepositoryLock", lockAtMostFor = "30s")
@SchedulerLock(name = "updateGitHubRepositoryLock", lockAtMostFor = "10m")
fun updateGitHubRepository() {
LockAssert.assertLocked()
gitHubRepositoryUpdatingService.syncRepositoryWithGitHub()
Expand All @@ -29,6 +33,8 @@ class GitHubRepositoryUpdatingJob(val gitHubRepositoryUpdatingService: GitHubRep
class GitHubRepositoryUpdatingService(
private val scmRepositoryRepository: ScmRepositoryRepository,
private val githubIndexingService: GitHubIndexingService,
private val ossHealthIssueOrPrSyncService: OssHealthIssueOrPrSyncService,
private val ossHealthScoreService: OssHealthScoreService,
@Value("\${klibs.integration.github.update-repos-per-iteration:3}")
private val reposUpdatedPerCall: Int,
@Qualifier("repoBackoffProvider")
Expand All @@ -51,7 +57,30 @@ class GitHubRepositoryUpdatingService(
} catch (e: Exception) {
logger.error("Error while updating a repo", e)
repoBackoffProvider.onFailure(repoToUpdate.idNotNull)
return@forEach
}

val now = Instant.now()
runOssIssueOrPrSyncIfDue(repoToUpdate, now)
runOssScoreIfDue(repoToUpdate, now)
}
}

/**
 * Runs the OSS-health issue/PR sync for [repo] if the sync service reports it as due at [now].
 *
 * An exception thrown by the sync itself is logged and swallowed, so the caller can carry on
 * with whatever it does next. An exception thrown by the `isDue` check is not caught here and
 * propagates to the caller.
 *
 * @param repo repository whose issues/PRs should be synced
 * @param now reference instant handed to both the due-check and the sync
 */
private fun runOssIssueOrPrSyncIfDue(repo: ScmRepositoryEntity, now: Instant) {
    if (!ossHealthIssueOrPrSyncService.isDue(repo, now)) return
    try {
        ossHealthIssueOrPrSyncService.syncOne(repo, now)
    } catch (e: Exception) {
        // Hand the throwable to the logger as well: `e.message` is nullable and on its own
        // drops the exception type and stack trace needed to diagnose the failure.
        logger.warn("OSS issue/PR sync failed for ${repo.ownerLogin}/${repo.name}: ${e.message}", e)
    }
}

/**
 * Recomputes the OSS-health score for [repo] if the score service reports it as due at [now].
 *
 * An exception thrown by the computation itself is logged and swallowed, so the caller can
 * carry on with whatever it does next. An exception thrown by the `isDue` check is not caught
 * here and propagates to the caller.
 *
 * @param repo repository whose score should be recomputed
 * @param now reference instant handed to both the due-check and the computation
 */
private fun runOssScoreIfDue(repo: ScmRepositoryEntity, now: Instant) {
    if (!ossHealthScoreService.isDue(repo, now)) return
    try {
        ossHealthScoreService.computeOne(repo, now)
    } catch (e: Exception) {
        // Hand the throwable to the logger as well: `e.message` is nullable and on its own
        // drops the exception type and stack trace needed to diagnose the failure.
        logger.warn("OSS score compute failed for ${repo.ownerLogin}/${repo.name}: ${e.message}", e)
    }
}

Expand Down
6 changes: 6 additions & 0 deletions app/src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ klibs:
index-endpoint: ${KLIBS_MAVEN_INDEX_ENDPOINT:https://repo1.maven.org/maven2}
index-dir: ${KLIBS_MAVEN_INDEX_CACHE_DIR:${user.dir}/cache/maven-index}
content-endpoint: ${KLIBS_MAVEN_CONTENT_ENDPOINT:https://repo1.maven.org/maven2/}
content-fallback-endpoint: ${KLIBS_MAVEN_CONTENT_FALLBACK_ENDPOINT:https://search.maven.org/remotecontent?filepath=}
github:
personal-access-token: ${KLIBS_GITHUB_TOKEN}
cache:
Expand All @@ -88,6 +89,11 @@ klibs:
bucket-name: ${BUCKET_NAME}
prefix: ${KLIBS_GMAVEN_S3_PREFIX:gmaven}
ai: true
oss-health:
commit-cv-denominator: 1.0
issue-median-days-threshold: 21.0
pr-median-days-threshold: 5.0
active-contributors-target: 5

management:
endpoints:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
databaseChangeLog:
- changeSet:
id: create scm_repo_issue_or_pr table
author: nikita.vlaev@jetbrains.com
comment: "KTL-4386: Sliding-window store of closed issues and merged PRs for OSS Health index."
Comment thread
dkrasnoff marked this conversation as resolved.
preConditions:
- onFail: MARK_RAN
- not:
tableExists:
tableName: scm_repo_issue_or_pr
changes:
- createTable:
tableName: scm_repo_issue_or_pr
columns:
- column:
name: id
type: BIGINT
constraints:
nullable: false
primaryKey: true
primaryKeyName: pk_scm_repo_issue_or_pr
- column:
name: scm_repo_id
type: INT
constraints:
nullable: false
foreignKeyName: fk_scm_repo_issue_or_pr_scm_repo_id
references: scm_repo(id)
- column:
name: gh_number
type: INT
remarks: "GitHub's per-repo issue/PR number (the integer in /issues/N URLs); together with scm_repo_id, this is the upsert dedup key."
constraints:
nullable: false
- column:
name: type
type: VARCHAR(8)
remarks: "ISSUE or PR. Aggregate queries filter by this since I and P sub-scores are computed separately."
constraints:
nullable: false
- column:
name: created_at
type: TIMESTAMP
constraints:
nullable: false
- column:
name: closed_at
type: TIMESTAMP
- column:
name: merged_at
type: TIMESTAMP
- column:
name: duration_days
type: INT
- addUniqueConstraint:
tableName: scm_repo_issue_or_pr
columnNames: scm_repo_id, gh_number
constraintName: uq_scm_repo_issue_or_pr_repo_number

# Creates the BIGINT sequence scm_repo_issue_or_pr_id_seq.
# Presumably it supplies values for scm_repo_issue_or_pr.id (that column is declared without a
# default) — confirm against the entity mapping.
- changeSet:
id: create scm_repo_issue_or_pr sequence
author: nikita.vlaev@jetbrains.com
# Skip, and record the changeSet as executed, when the sequence is already present.
preConditions:
- onFail: MARK_RAN
- not:
sequenceExists:
sequenceName: scm_repo_issue_or_pr_id_seq
changes:
# NOTE(review): INCREMENT BY 50 presumably mirrors the allocation size of the JPA sequence
# generator that uses this sequence — confirm the two values match.
- sql:
sql: |
CREATE SEQUENCE scm_repo_issue_or_pr_id_seq
AS BIGINT
INCREMENT BY 50
START WITH 1;

- changeSet:
id: index scm_repo_issue_or_pr on repo/type/closed
author: nikita.vlaev@jetbrains.com
preConditions:
- onFail: MARK_RAN
- not:
indexExists:
tableName: scm_repo_issue_or_pr
indexName: scm_repo_issue_or_pr_repo_type_closed_idx
changes:
- createIndex:
indexName: scm_repo_issue_or_pr_repo_type_closed_idx
tableName: scm_repo_issue_or_pr
columns:
- column:
name: scm_repo_id
- column:
name: type
- column:
name: closed_at

- changeSet:
id: index scm_repo_issue_or_pr on repo/type/created
author: nikita.vlaev@jetbrains.com
preConditions:
- onFail: MARK_RAN
- not:
indexExists:
tableName: scm_repo_issue_or_pr
indexName: scm_repo_issue_or_pr_repo_type_created_idx
changes:
- createIndex:
indexName: scm_repo_issue_or_pr_repo_type_created_idx
tableName: scm_repo_issue_or_pr
columns:
- column:
name: scm_repo_id
- column:
name: type
- column:
name: created_at

- changeSet:
id: create scm_repo_health_components table
author: nikita.vlaev@jetbrains.com
comment: "KTL-4386: Per-repo OSS Health components snapshot (one row per repo, overwritten each run)."
preConditions:
- onFail: MARK_RAN
- not:
tableExists:
tableName: scm_repo_health_components
changes:
- createTable:
tableName: scm_repo_health_components
columns:
- column:
name: scm_repo_id
type: INT
constraints:
nullable: false
primaryKey: true
foreignKeyName: fk_scm_repo_health_components_scm_repo_id
references: scm_repo(id)
- column:
name: score_recomputed_ts
type: TIMESTAMP
- column:
name: issue_opened_count
type: INT
- column:
name: issue_closed_count
type: INT
- column:
name: median_issue_close_days
type: DOUBLE PRECISION
- column:
name: pr_opened_count
type: INT
- column:
name: pr_merged_count
type: INT
- column:
name: median_pr_merge_days
type: DOUBLE PRECISION
- column:
name: active_contributors
type: INT
- column:
name: top_contributor_share
type: DOUBLE PRECISION
- column:
name: c_score
type: DOUBLE PRECISION
- column:
name: i_score
type: DOUBLE PRECISION
- column:
name: p_score
type: DOUBLE PRECISION
- column:
name: a_score
type: DOUBLE PRECISION
- column:
name: health_score
type: INT
- column:
name: last_issue_or_pr_sync_ts
type: TIMESTAMP
remarks: "Last time the issue/PR sync populated I/P inputs for this repo; drives the issue/PR sync queue."
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
-- Rebuilds the project_index materialized view from scratch.
-- The view has one row per project that has at least one package_index row, and exposes
-- health_score taken from scm_repo_health_components.
-- NOTE(review): dropping a materialized view also drops any indexes defined on it, and this
-- script does not create any — confirm that required indexes are recreated elsewhere.
DROP MATERIALIZED VIEW IF EXISTS project_index;

CREATE MATERIALIZED VIEW project_index AS
-- package_info: one row per project with its distinct platforms, weighted group/artifact id
-- vectors, and a targets vector whose entries are "platform_target", or just "platform" when
-- the platform has no targets listed.
WITH package_info AS (SELECT project.id,
array_agg(DISTINCT platform) AS platforms,
array_to_tsvector(array_agg(DISTINCT platform)) AS platforms_vector,
string_agg(format('%s:1', pckg.group_id), ' ')::tsvector AS group_ids_vector,
string_agg(format('%s:2', pckg.artifact_id), ' ')::tsvector AS artifact_ids_vector,
array_to_tsvector(array_remove(
array_agg(DISTINCT COALESCE(platform || '_' || target, platform)),
NULL)) AS targets_vector
FROM project
JOIN package_index pckg ON project.id = pckg.project_id
JOIN scm_owner owner ON project.owner_id = owner.id
CROSS JOIN LATERAL unnest(pckg.platforms) as platform
LEFT JOIN LATERAL jsonb_array_elements_text(COALESCE(pckg.targets -> platform, '[]'::jsonb)) AS target
ON true
-- Hard-coded exclusion: the WASM and JS platforms are dropped for androidx/room.
WHERE NOT (owner.login = 'androidx' AND project.name = 'room' AND platform IN ('WASM', 'JS'))
GROUP BY project.id),
-- markers_info: distinct marker types per project.
markers_info AS (SELECT project_marker.project_id,
array_agg(DISTINCT project_marker.type) AS markers
FROM project_marker
GROUP BY project_marker.project_id),
-- tags_info: tags per project from a single origin, chosen by priority USER, then GITHUB,
-- then AI (the first origin that has any tags wins).
tags_info AS (SELECT project_id,
COALESCE(
array_agg(DISTINCT value ORDER BY value DESC) FILTER (WHERE origin = 'USER'),
array_agg(DISTINCT value ORDER BY value DESC) FILTER (WHERE origin = 'GITHUB'),
array_agg(DISTINCT value ORDER BY value DESC) FILTER (WHERE origin = 'AI')
) AS tags
FROM project_tags
GROUP BY project_id)
SELECT project.id AS project_id,
owner.type AS owner_type,
owner.login AS owner_login,
repo.name AS repo_name, -- We still need this to form GH repo link and GH pages link, because androidx projects have different names for repository and project.
project.name,
repo.stars,
repo.license_name,
project.latest_version,
project.latest_version_ts,
package_info.platforms,
package_info.platforms_vector,
package_info.targets_vector,
coalesce(project.description, repo.description) AS plain_description,
tags_info.tags AS tags,
markers_info.markers AS markers,
project.dependent_count AS dependent_count,
-- NULL when the repo has no scm_repo_health_components row (LEFT JOIN below).
health.health_score AS health_score,
-- Full-text search vector: owner login and project name weigh 'A'; group ids, artifact ids and
-- tags 'B'; project and repo descriptions 'C'; owner name, owner description and the
-- minimized readme 'D'.
(setweight(to_tsvector(owner.login), 'A') ||
setweight(to_tsvector(project.name), 'A') ||
setweight(format('%s:1', project.name)::tsvector, 'A') ||
setweight(format('%s:1', owner.login)::tsvector, 'A') ||
setweight(package_info.group_ids_vector, 'B') ||
setweight(package_info.artifact_ids_vector, 'B') ||
setweight(to_tsvector(coalesce(owner.name, '')), 'D') ||
setweight(to_tsvector(coalesce(owner.description, '')), 'D') ||
setweight(to_tsvector(coalesce(project.minimized_readme, '')), 'D') ||
setweight(to_tsvector(coalesce(project.description, '')), 'C') ||
setweight(to_tsvector(coalesce(repo.description, '')), 'C') ||
setweight(to_tsvector(coalesce(array_to_string(tags_info.tags, ' '), '')), 'B')) AS fts
FROM project
JOIN package_info ON project.id = package_info.id
JOIN scm_owner owner ON project.owner_id = owner.id
JOIN scm_repo repo ON project.scm_repo_id = repo.id
LEFT JOIN markers_info on markers_info.project_id = project.id
LEFT JOIN tags_info on tags_info.project_id = project.id
-- Optional join: projects whose repo has no health row are still included.
LEFT JOIN scm_repo_health_components health on health.scm_repo_id = repo.id;
Loading
Loading