[ENH] Implement log forking #4326

Open · wants to merge 1 commit into base: sicheng/04-17-_enh_wire_up_collection_forking_for_python
41 changes: 41 additions & 0 deletions go/pkg/log/repository/log.go
@@ -126,6 +126,47 @@ func (r *LogRepository) PullRecords(ctx context.Context, collectionId string, of
return
}

func (r *LogRepository) ForkRecords(ctx context.Context, sourceCollectionID string, targetCollectionID string) (err error) {
var tx pgx.Tx
tx, err = r.conn.BeginTx(ctx, pgx.TxOptions{})
if err != nil {
trace_log.Error("Error in begin transaction for forking logs in log service", zap.Error(err), zap.String("sourceCollectionID", sourceCollectionID))
return
}
queriesWithTx := r.queries.WithTx(tx)
defer func() {
// Roll back on any failure; otherwise commit and surface the commit error through err.
if err != nil {
_ = tx.Rollback(ctx)
} else {
err = tx.Commit(ctx)
}
}()

// Lock the source collection row so concurrent writers cannot advance its offsets mid-copy.
err = queriesWithTx.LockCollection(ctx, sourceCollectionID)
if err != nil {
trace_log.Error("Error locking collection for fork", zap.Error(err), zap.String("sourceCollectionID", sourceCollectionID))
return
}
// Copy the source's compaction and enumeration offsets onto the target collection.
err = queriesWithTx.ForkCollectionOffset(ctx, log.ForkCollectionOffsetParams{
ID: sourceCollectionID,
ID_2: targetCollectionID,
})
if err != nil {
trace_log.Error("Error forking log offset", zap.Error(err), zap.String("sourceCollectionID", sourceCollectionID))
return
}
// Copy every log record from the source collection to the target, preserving offsets.
err = queriesWithTx.ForkCollectionRecord(ctx, log.ForkCollectionRecordParams{
CollectionID: sourceCollectionID,
CollectionID_2: targetCollectionID,
})
if err != nil {
trace_log.Error("Error forking log record", zap.Error(err), zap.String("sourceCollectionID", sourceCollectionID))
return
}

return
}
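
For reviewers, a minimal usage sketch — `repo` (a *LogRepository), `srcID`, and `dstID` are placeholder names for illustration, not part of this diff:

	// Fork dstID's log from srcID; on error the deferred handler above has
	// already rolled the transaction back, so the call is safe to retry.
	if err := repo.ForkRecords(ctx, srcID, dstID); err != nil {
		return fmt.Errorf("forking log from %s to %s: %w", srcID, dstID, err)
	}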

func (r *LogRepository) GetAllCollectionInfoToCompact(ctx context.Context, minCompactionSize uint64) (collectionToCompact []log.GetAllCollectionsToCompactRow, err error) {
collectionToCompact, err = r.queries.GetAllCollectionsToCompact(ctx, int64(minCompactionSize))
if collectionToCompact == nil {
23 changes: 22 additions & 1 deletion go/pkg/log/server/server.go
@@ -56,7 +56,7 @@ func (s *logServer) ScoutLogs(ctx context.Context, req *logservicepb.ScoutLogsRe
return
}
// +1 to convert from the (] bound to a [) bound.
res = &logservicepb.ScoutLogsResponse {
res = &logservicepb.ScoutLogsResponse{
FirstUninsertedRecordOffset: int64(limit + 1),
}
return
@@ -90,6 +90,27 @@ func (s *logServer) PullLogs(ctx context.Context, req *logservicepb.PullLogsRequ
return
}

func (s *logServer) ForkLogs(ctx context.Context, req *logservicepb.ForkLogsRequest) (res *logservicepb.ForkLogsResponse, err error) {
var sourceCollectionID types.UniqueID
var targetCollectionID types.UniqueID
sourceCollectionID, err = types.ToUniqueID(&req.SourceCollectionId)
if err != nil {
return
}
targetCollectionID, err = types.ToUniqueID(&req.TargetCollectionId)
if err != nil {
return
}

err = s.lr.ForkRecords(ctx, sourceCollectionID.String(), targetCollectionID.String())
if err != nil {
return
}

res = &logservicepb.ForkLogsResponse{}
return
}

func (s *logServer) GetAllCollectionInfoToCompact(ctx context.Context, req *logservicepb.GetAllCollectionInfoToCompactRequest) (res *logservicepb.GetAllCollectionInfoToCompactResponse, err error) {
var collectionToCompact []log.GetAllCollectionsToCompactRow
collectionToCompact, err = s.lr.GetAllCollectionInfoToCompact(ctx, req.MinCompactionSize)
47 changes: 44 additions & 3 deletions go/pkg/log/store/db/queries.sql.go

Some generated files are not rendered by default.

15 changes: 15 additions & 0 deletions go/pkg/log/store/queries/queries.sql
@@ -58,3 +58,18 @@ SELECT id FROM collection;

-- name: GetLastCompactedOffset :one
SELECT record_compaction_offset_position FROM collection c WHERE c.id = $1;

-- name: LockCollection :exec
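-- Row-level lock on the source collection; blocks concurrent offset updates while the fork copies data.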
SELECT * FROM collection WHERE id = $1 FOR UPDATE;

-- name: ForkCollectionOffset :exec
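-- Seed the target collection with the source's compaction and enumeration offsets.
-- If the source row does not exist, the SELECT yields no rows and nothing is inserted.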
INSERT INTO collection (id, record_compaction_offset_position, record_enumeration_offset_position)
SELECT $2, collection.record_compaction_offset_position, collection.record_enumeration_offset_position
FROM collection
WHERE collection.id = $1;

-- name: ForkCollectionRecord :exec
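-- Copy every log record to the target, preserving per-record offsets ("offset" is quoted because it is a reserved word).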
INSERT INTO record_log ("offset", collection_id, timestamp, record)
SELECT record_log.offset, $2, record_log.timestamp, record_log.record
FROM record_log
WHERE record_log.collection_id = $1;
8 changes: 3 additions & 5 deletions go/pkg/sysdb/coordinator/coordinator_test.go
@@ -1416,11 +1416,9 @@ func (suite *APIsTestSuite) TestForkCollection() {
}

sourceFlushCollectionCompaction := &model.FlushCollectionCompaction{
ID: sourceCreateCollection.ID,
TenantID: sourceCreateCollection.TenantID,
// TODO: Inherit log position after log fork is implemented
// LogPosition: 1000,
LogPosition: 0,
ID: sourceCreateCollection.ID,
TenantID: sourceCreateCollection.TenantID,
LogPosition: 1000,
CurrentCollectionVersion: 0,
FlushSegmentCompactions: []*model.FlushSegmentCompaction{
sourceFlushMetadataSegment,
22 changes: 10 additions & 12 deletions go/pkg/sysdb/coordinator/table_catalog.go
@@ -868,18 +868,16 @@ func (tc *Catalog) ForkCollection(ctx context.Context, forkCollection *model.For
}

createCollection := &model.CreateCollection{
ID: forkCollection.TargetCollectionID,
Name: forkCollection.TargetCollectionName,
ConfigurationJsonStr: sourceCollection.ConfigurationJsonStr,
Dimension: sourceCollection.Dimension,
Metadata: sourceCollection.Metadata,
GetOrCreate: false,
TenantID: sourceCollection.TenantID,
DatabaseName: sourceCollection.DatabaseName,
Ts: ts.Unix(),
// TODO: Inherit log position after log fork is implemented
// LogPosition: sourceCollection.LogPosition,
LogPosition: 0,
ID: forkCollection.TargetCollectionID,
Name: forkCollection.TargetCollectionName,
ConfigurationJsonStr: sourceCollection.ConfigurationJsonStr,
Dimension: sourceCollection.Dimension,
Metadata: sourceCollection.Metadata,
GetOrCreate: false,
TenantID: sourceCollection.TenantID,
DatabaseName: sourceCollection.DatabaseName,
Ts: ts.Unix(),
LogPosition: sourceCollection.LogPosition,
RootCollectionId: rootCollectionIDStr,
TotalRecordsPostCompaction: sourceCollection.TotalRecordsPostCompaction,
SizeBytesPostCompaction: sourceCollection.SizeBytesPostCompaction,
10 changes: 10 additions & 0 deletions idl/chromadb/proto/logservice.proto
@@ -42,6 +42,15 @@ message PullLogsResponse {
repeated LogRecord records = 1;
}

message ForkLogsRequest {
string source_collection_id = 1;
string target_collection_id = 2;
}

message ForkLogsResponse {
// Empty
}

message CollectionInfo {
string collection_id = 1;
// The log offset of the first log entry of the collection that needs to be compacted
@@ -81,6 +90,7 @@ service LogService {
rpc PushLogs(PushLogsRequest) returns (PushLogsResponse) {}
rpc ScoutLogs(ScoutLogsRequest) returns (ScoutLogsResponse) {}
rpc PullLogs(PullLogsRequest) returns (PullLogsResponse) {}
rpc ForkLogs(ForkLogsRequest) returns (ForkLogsResponse) {}
rpc GetAllCollectionInfoToCompact(GetAllCollectionInfoToCompactRequest) returns (GetAllCollectionInfoToCompactResponse) {}
rpc UpdateCollectionLogOffset(UpdateCollectionLogOffsetRequest) returns (UpdateCollectionLogOffsetResponse) {}
rpc PurgeDirtyForCollection(PurgeDirtyForCollectionRequest) returns (PurgeDirtyForCollectionResponse) {}
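
With the wire format above, a hedged sketch of calling the new RPC from Go — the generated-stub import path, the dial target, and the helper name are illustrative assumptions, not part of this PR:

	import (
		"context"

		"google.golang.org/grpc"
		"google.golang.org/grpc/credentials/insecure"

		"github.com/chroma-core/chroma/go/pkg/proto/logservicepb" // assumed stub path
	)

	// forkLogs asks the log service to copy srcID's records and offsets to dstID.
	func forkLogs(ctx context.Context, addr, srcID, dstID string) error {
		conn, err := grpc.Dial(addr, grpc.WithTransportCredentials(insecure.NewCredentials()))
		if err != nil {
			return err
		}
		defer conn.Close()
		// ForkLogsResponse carries no fields; a nil error means the fork committed.
		_, err = logservicepb.NewLogServiceClient(conn).ForkLogs(ctx, &logservicepb.ForkLogsRequest{
			SourceCollectionId: srcID,
			TargetCollectionId: dstID,
		})
		return err
	}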
3 changes: 3 additions & 0 deletions rust/frontend/src/impls/service_based_frontend.rs
@@ -552,6 +552,9 @@ impl ServiceBasedFrontend {
}: ForkCollectionRequest,
) -> Result<ForkCollectionResponse, ForkCollectionError> {
let target_collection_id = CollectionUuid::new();
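// Copy the source collection's log to the new collection before registering
// the fork in sysdb; if this call fails, no collection metadata exists yet.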
self.log_client
.fork_logs(source_collection_id, target_collection_id)
.await?;
let collection_and_segments = self
.sysdb_client
.fork_collection(
8 changes: 8 additions & 0 deletions rust/log-service/src/lib.rs
@@ -18,6 +18,7 @@ use chroma_types::chroma_proto::{
PushLogsRequest, PushLogsResponse, ScoutLogsRequest, ScoutLogsResponse,
UpdateCollectionLogOffsetRequest, UpdateCollectionLogOffsetResponse,
};
use chroma_types::chroma_proto::{ForkLogsRequest, ForkLogsResponse};
use chroma_types::CollectionUuid;
use figment::providers::{Env, Format, Yaml};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -763,6 +764,13 @@ impl LogService for LogServer {
.await
}

async fn fork_logs(
&self,
_request: Request<ForkLogsRequest>,
) -> Result<Response<ForkLogsResponse>, Status> {
unimplemented!("Log forking is unimplemented for WAL3 for now")
}

#[tracing::instrument(info, skip(self, request), err(Display))]
async fn get_all_collection_info_to_compact(
&self,
35 changes: 35 additions & 0 deletions rust/log/src/grpc_log.rs
@@ -57,6 +57,23 @@ impl ChromaError for GrpcPushLogsError {
}
}

#[derive(Error, Debug)]
pub enum GrpcForkLogsError {
#[error("Please backoff exponentially and retry")]
Backoff,
#[error("Failed to push logs")]
FailedToForkLogs(#[from] tonic::Status),
}

impl ChromaError for GrpcForkLogsError {
fn code(&self) -> ErrorCodes {
match self {
GrpcForkLogsError::Backoff => ErrorCodes::Unavailable,
GrpcForkLogsError::FailedToForkLogs(_) => ErrorCodes::Internal,
}
}
}

#[derive(Error, Debug)]
pub enum GrpcGetCollectionsWithNewDataError {
#[error("Failed to fetch")]
@@ -306,6 +323,24 @@ impl GrpcLog {
Ok(())
}

pub(super) async fn fork_logs(
&mut self,
source_collection_id: CollectionUuid,
target_collection_id: CollectionUuid,
) -> Result<(), GrpcForkLogsError> {
self.client_for(source_collection_id)
.fork_logs(chroma_proto::ForkLogsRequest {
source_collection_id: source_collection_id.to_string(),
target_collection_id: target_collection_id.to_string(),
})
.await
.map(|_| ())
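// Surface Unavailable as the dedicated Backoff variant so callers retry with exponential backoff.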
.map_err(|err| match err.code() {
tonic::Code::Unavailable => GrpcForkLogsError::Backoff,
_ => err.into(),
})
}

pub(crate) async fn get_collections_with_new_data(
&mut self,
min_compaction_size: u64,
20 changes: 19 additions & 1 deletion rust/log/src/log.rs
@@ -3,7 +3,9 @@ use crate::in_memory_log::InMemoryLog;
use crate::sqlite_log::SqliteLog;
use crate::types::CollectionInfo;
use chroma_error::ChromaError;
use chroma_types::{CollectionUuid, LogRecord, OperationRecord, ResetError, ResetResponse};
use chroma_types::{
CollectionUuid, ForkCollectionError, LogRecord, OperationRecord, ResetError, ResetResponse,
};
use std::fmt::Debug;

#[derive(Clone, Debug)]
@@ -92,6 +94,22 @@ impl Log {
}
}

#[tracing::instrument(skip(self))]
pub async fn fork_logs(
&mut self,
source_collection_id: CollectionUuid,
target_collection_id: CollectionUuid,
) -> Result<(), ForkCollectionError> {
match self {
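// Only the distributed gRPC log supports forking; local (SQLite/in-memory) logs report ForkCollectionError::Local.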
Log::Sqlite(_) => Err(ForkCollectionError::Local),
Log::Grpc(log) => log
.fork_logs(source_collection_id, target_collection_id)
.await
.map_err(|err| err.boxed().into()),
Log::InMemory(_) => Err(ForkCollectionError::Local),
}
}

#[tracing::instrument(skip(self))]
pub async fn get_collections_with_new_data(
&mut self,