-
Notifications
You must be signed in to change notification settings - Fork 17
refactor: Use real iterators over nodes #375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feat/faster-index-generation
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,10 +3,12 @@ package sqlrepo | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "iter" | ||
|
|
||
| "github.com/ipfs/go-cid" | ||
| "github.com/multiformats/go-multihash" | ||
| "github.com/storacha/go-ucanto/core/invocation" | ||
| "github.com/storacha/guppy/pkg/preparation/blobs" | ||
| "github.com/storacha/guppy/pkg/preparation/blobs/model" | ||
| "github.com/storacha/guppy/pkg/preparation/sqlrepo/util" | ||
| "github.com/storacha/guppy/pkg/preparation/types/id" | ||
|
|
@@ -350,39 +352,53 @@ func (r *Repo) ShardsForIndex(ctx context.Context, indexID id.IndexID) ([]*model | |
| return shards, nil | ||
| } | ||
|
|
||
| func (r *Repo) ForEachNodeInIndex(ctx context.Context, indexID id.IndexID, yield func(shardDigest multihash.Multihash, nodeCID cid.Cid, nodeSize uint64, shardOffset uint64) error) error { | ||
| stmt, err := r.prepareStmt(ctx, ` | ||
| func (r *Repo) ForEachNodeInIndex(ctx context.Context, indexID id.IndexID) iter.Seq2[blobs.NodeInIndex, error] { | ||
| return func(yield func(blobs.NodeInIndex, error) bool) { | ||
| stmt, err := r.prepareStmt(ctx, ` | ||
| SELECT s.id, s.digest, nodes.cid, nodes.size, nu.shard_offset | ||
| FROM shards_in_indexes si | ||
| JOIN shards s ON s.id = si.shard_id | ||
| JOIN node_uploads nu ON nu.shard_id = si.shard_id | ||
| JOIN nodes ON nodes.cid = nu.node_cid AND nodes.space_did = nu.space_did | ||
| WHERE si.index_id = ?`) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to prepare statement: %w", err) | ||
| } | ||
| rows, err := stmt.QueryContext(ctx, indexID) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to query nodes in index %s: %w", indexID, err) | ||
| } | ||
| defer rows.Close() | ||
|
|
||
| for rows.Next() { | ||
| var shardID id.ShardID | ||
| var shardDigest multihash.Multihash | ||
| var nodeCID cid.Cid | ||
| var nodeSize uint64 | ||
| var shardOffset uint64 | ||
| if err := rows.Scan(&shardID, util.DbBytes(&shardDigest), util.DbCID(&nodeCID), &nodeSize, &shardOffset); err != nil { | ||
| return fmt.Errorf("failed to scan node row in index %s: %w", indexID, err) | ||
| if err != nil { | ||
| yield(blobs.NodeInIndex{}, fmt.Errorf("failed to prepare statement: %w", err)) | ||
| return | ||
|
Comment on lines
+365
to
+366
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: iterator functions should check what In this specific case it's not a huge deal because these errors will likely happen for every element, but I think checking the result of
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| } | ||
| if len(shardDigest) == 0 { | ||
| return fmt.Errorf("failed to iterate nodes in index %s, because shard with ID %s has no digest set", indexID, shardID) | ||
| rows, err := stmt.QueryContext(ctx, indexID) | ||
| if err != nil { | ||
| yield(blobs.NodeInIndex{}, fmt.Errorf("failed to query nodes in index %s: %w", indexID, err)) | ||
| return | ||
| } | ||
| if err := yield(shardDigest, nodeCID, nodeSize, shardOffset); err != nil { | ||
| return fmt.Errorf("failed to yield node %s in index %s: %w", nodeCID, indexID, err) | ||
| defer rows.Close() | ||
|
|
||
| for rows.Next() { | ||
| var shardID id.ShardID | ||
| var shardDigest multihash.Multihash | ||
| var nodeCID cid.Cid | ||
| var nodeSize uint64 | ||
| var shardOffset uint64 | ||
| if err := rows.Scan(&shardID, util.DbBytes(&shardDigest), util.DbCID(&nodeCID), &nodeSize, &shardOffset); err != nil { | ||
| yield(blobs.NodeInIndex{}, fmt.Errorf("failed to scan node row in index %s: %w", indexID, err)) | ||
| return | ||
| } | ||
| if len(shardDigest) == 0 { | ||
| yield(blobs.NodeInIndex{}, fmt.Errorf("failed to iterate nodes in index %s, because shard with ID %s has no digest set", indexID, shardID)) | ||
| return | ||
| } | ||
| if !yield(blobs.NodeInIndex{ | ||
| NodeCID: nodeCID, | ||
| NodeSize: nodeSize, | ||
| ShardDigest: shardDigest, | ||
| ShardOffset: shardOffset, | ||
| }, nil) { | ||
| return | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return rows.Err() | ||
| err = rows.Err() | ||
| if err != nil { | ||
| yield(blobs.NodeInIndex{}, fmt.Errorf("error iterating node rows in index %s: %w", indexID, err)) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit, likely subjective preference: I'd find the name of iterator functions to be more idiomatic if the "ForEach" was dropped and the plural was used. I'd call them just
NodesInShardandNodesInIndex. Personally, the resulting "for node = range for each node..." feels a bit awkward and redundant 🙂.