Skip to content

Commit 6a235d2

Browse files
authored
[ENH] A route and tool to inspect the dirty log. (#4461)
## Description of changes

To get ground truth on the dirty log, add a route and a tool that print the dirty log as it is interpreted by the 'get all to compact' call.

## Test plan

- [ ] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust

## Documentation Changes

N/A
1 parent 7e50089 commit 6a235d2

File tree

4 files changed

+105
-4
lines changed

4 files changed

+105
-4
lines changed

go/pkg/log/server/server.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,12 @@ func (s *logServer) PurgeDirtyForCollection(ctx context.Context, req *logservice
152152
return
153153
}
154154

155+
func (s *logServer) InspectDirtyLog(ctx context.Context, req *logservicepb.InspectDirtyLogRequest) (res *logservicepb.InspectDirtyLogResponse, err error) {
156+
// no-op for now
157+
return
158+
}
159+
160+
155161
func NewLogServer(lr *repository.LogRepository) logservicepb.LogServiceServer {
156162
return &logServer{
157163
lr: lr,

idl/chromadb/proto/logservice.proto

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@ message PurgeDirtyForCollectionResponse {
8787
// Empty
8888
}
8989

90+
// Request message for the LogService.InspectDirtyLog RPC. It carries no fields.
message InspectDirtyLogRequest {
91+
// Empty
92+
}
93+
94+
// Response message for the LogService.InspectDirtyLog RPC.
message InspectDirtyLogResponse {
95+
// Dirty log records rendered as text. The Rust log service fills this with
// one entry per record it reads from the dirty log, decoded as UTF-8.
repeated string markers = 1;
96+
}
97+
9098
service LogService {
9199
rpc PushLogs(PushLogsRequest) returns (PushLogsResponse) {}
92100
rpc ScoutLogs(ScoutLogsRequest) returns (ScoutLogsResponse) {}
@@ -95,4 +103,5 @@ service LogService {
95103
rpc GetAllCollectionInfoToCompact(GetAllCollectionInfoToCompactRequest) returns (GetAllCollectionInfoToCompactResponse) {}
96104
rpc UpdateCollectionLogOffset(UpdateCollectionLogOffsetRequest) returns (UpdateCollectionLogOffsetResponse) {}
97105
rpc PurgeDirtyForCollection(PurgeDirtyForCollectionRequest) returns (PurgeDirtyForCollectionResponse) {}
106+
rpc InspectDirtyLog(InspectDirtyLogRequest) returns (InspectDirtyLogResponse) {}
98107
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
use tonic::transport::Channel;
2+
3+
use chroma_types::chroma_proto::log_service_client::LogServiceClient;
4+
use chroma_types::chroma_proto::InspectDirtyLogRequest;
5+
6+
#[tokio::main]
7+
async fn main() {
8+
let args = std::env::args().skip(1).collect::<Vec<_>>();
9+
if args.len() != 1 {
10+
eprintln!("USAGE: chroma-inspect-dirty-log [HOST]");
11+
std::process::exit(13);
12+
}
13+
let logservice = Channel::from_shared(args[0].clone())
14+
.expect("could not create channel")
15+
.connect()
16+
.await
17+
.expect("could not connect to log service");
18+
let mut client = LogServiceClient::new(logservice);
19+
let dirty = client
20+
.inspect_dirty_log(InspectDirtyLogRequest {})
21+
.await
22+
.expect("could not inspect dirty log");
23+
let dirty = dirty.into_inner();
24+
for line in dirty.markers {
25+
println!("{line}");
26+
}
27+
}

rust/log-service/src/lib.rs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ use chroma_storage::config::StorageConfig;
1313
use chroma_storage::Storage;
1414
use chroma_types::chroma_proto::{
1515
log_service_server::LogService, CollectionInfo, GetAllCollectionInfoToCompactRequest,
16-
GetAllCollectionInfoToCompactResponse, LogRecord, OperationRecord, PullLogsRequest,
17-
PullLogsResponse, PurgeDirtyForCollectionRequest, PurgeDirtyForCollectionResponse,
18-
PushLogsRequest, PushLogsResponse, ScoutLogsRequest, ScoutLogsResponse,
19-
UpdateCollectionLogOffsetRequest, UpdateCollectionLogOffsetResponse,
16+
GetAllCollectionInfoToCompactResponse, InspectDirtyLogRequest, InspectDirtyLogResponse,
17+
LogRecord, OperationRecord, PullLogsRequest, PullLogsResponse, PurgeDirtyForCollectionRequest,
18+
PurgeDirtyForCollectionResponse, PushLogsRequest, PushLogsResponse, ScoutLogsRequest,
19+
ScoutLogsResponse, UpdateCollectionLogOffsetRequest, UpdateCollectionLogOffsetResponse,
2020
};
2121
use chroma_types::chroma_proto::{ForkLogsRequest, ForkLogsResponse};
2222
use chroma_types::CollectionUuid;
@@ -1080,6 +1080,65 @@ impl LogService for LogServer {
10801080
.map_err(|err| Status::new(err.code().into(), err.to_string()))?;
10811081
Ok(Response::new(PurgeDirtyForCollectionResponse {}))
10821082
}
1083+
1084+
#[tracing::instrument(skip(self, _request))]
1085+
async fn inspect_dirty_log(
1086+
&self,
1087+
_request: Request<InspectDirtyLogRequest>,
1088+
) -> Result<Response<InspectDirtyLogResponse>, Status> {
1089+
let Some(reader) = self.dirty_log.reader(LogReaderOptions::default()) else {
1090+
return Err(Status::unavailable("Failed to get dirty log reader"));
1091+
};
1092+
let Some(cursors) = self.dirty_log.cursors(CursorStoreOptions::default()) else {
1093+
return Err(Status::unavailable("Failed to get dirty log cursors"));
1094+
};
1095+
let witness = match cursors.load(&STABLE_PREFIX).await {
1096+
Ok(witness) => witness,
1097+
Err(err) => {
1098+
return Err(Status::new(err.code().into(), err.to_string()));
1099+
}
1100+
};
1101+
let default = Cursor::default();
1102+
let cursor = witness.as_ref().map(|w| w.cursor()).unwrap_or(&default);
1103+
tracing::info!("cursoring from {cursor:?}");
1104+
let dirty_fragments = reader
1105+
.scan(
1106+
cursor.position,
1107+
Limits {
1108+
max_files: Some(1_000_000),
1109+
max_bytes: Some(1_000_000_000),
1110+
},
1111+
)
1112+
.await
1113+
.map_err(|err| Status::new(err.code().into(), err.to_string()))?;
1114+
let dirty_futures = dirty_fragments
1115+
.iter()
1116+
.map(|fragment| reader.read_parquet(fragment))
1117+
.collect::<Vec<_>>();
1118+
let dirty_raw = futures::future::try_join_all(dirty_futures)
1119+
.await
1120+
.map_err(|err| {
1121+
Status::new(
1122+
err.code().into(),
1123+
format!("Failed to fetch dirty parquet: {}", err),
1124+
)
1125+
})?;
1126+
let mut markers = vec![];
1127+
for (_, records, _) in dirty_raw {
1128+
let records = records
1129+
.into_iter()
1130+
.map(|x| String::from_utf8(x.1))
1131+
.collect::<Result<Vec<_>, _>>()
1132+
.map_err(|err| {
1133+
Status::new(
1134+
chroma_error::ErrorCodes::DataLoss.into(),
1135+
format!("Failed to extract records: {}", err),
1136+
)
1137+
})?;
1138+
markers.extend(records);
1139+
}
1140+
Ok(Response::new(InspectDirtyLogResponse { markers }))
1141+
}
10831142
}
10841143

10851144
fn parquet_to_records(parquet: Arc<Vec<u8>>) -> Result<Vec<(LogPosition, Vec<u8>)>, Status> {

0 commit comments

Comments
 (0)