Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions docs/state-stats.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Inspecting committed state size (`inferenced state-stats`)

`state-stats` is an offline analysis command that answers two questions about a
node's on-disk state:

1. Which module store consumes the most space?
2. Within the `inference` module, which prefix (feature) consumes the most
space, and which of those are legacy / cleanup candidates?

It was added for issue #1223 ("clean up the state") so we can drive cleanup
decisions from measured data instead of guessing.

## How it works

The command opens the node's `application.db` directly, loads the latest
committed height (or `--height`), and iterates every committed KV store,
summing key and value bytes. For the `inference` store it additionally
attributes every key to a named prefix using
`x/inference/types.StatePrefixCatalog`, and flags prefixes marked `legacy`.

Because it opens the database exclusively, **the node must be stopped** (or run
it against a restored snapshot copy).

## Usage

```bash
# Stop the node first, then:
inferenced state-stats --home /path/to/node/home

# Inspect a specific committed height:
inferenced state-stats --home /path/to/node/home --height 1234567

# Only the 20 largest inference prefixes:
inferenced state-stats --home /path/to/node/home --top 20

# Only legacy (cleanup-candidate) inference prefixes:
inferenced state-stats --home /path/to/node/home --legacy-only
```

## Output

Two tables:

- **Per-store summary** — every module KV store with key count, key bytes,
value bytes, and a humanized total, sorted largest first, plus a grand total.
- **Inference prefix breakdown** — every inference prefix with a `LEGACY`
column (`yes` = cleanup candidate, `?` = key not recognized by the catalog),
sorted largest first.

A non-empty `<unmatched:0xNN>` row means a key prefix is not yet in the
catalog; add it to `StatePrefixCatalog` so the attribution stays complete.

## Relationship to the v0.2.14 cleanup

The v0.2.14 upgrade handler already removes the known legacy prefixes
(`EpochGroupValidations` aggregate map, `TopMiner`, training state, legacy PoC
v2 prefixes). Run `state-stats --legacy-only` before and after the upgrade on a
snapshot to confirm those prefixes drop to zero, and use the full breakdown to
decide whether any additional large prefixes warrant their own cleanup task.
2 changes: 2 additions & 0 deletions inference-chain/app/upgrades/v0_2_12/upgrades_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func TestClearTrainingState(t *testing.T) {
store.Set([]byte(inferencetypes.TrainingTaskSequenceKey), []byte{1})
store.Set([]byte(inferencetypes.QueuedTrainingTaskKeyPrefix+"1"), []byte{1})
store.Set([]byte(inferencetypes.InProgressTrainingTaskKeyPrefix+"1"), []byte{1})
store.Set([]byte(inferencetypes.TrainingTaskKvRecordKeyPrefix+"1/key"), []byte{1})
store.Set([]byte("TrainingTask/sync/1/store/key/value"), []byte("value"))
store.Set([]byte("TrainingTask/sync/1/heartbeat/0/participant/node"), []byte("hb"))
store.Set([]byte("TrainingTask/sync/1/barrier/b1/0/participant/node/value"), []byte("barrier"))
Expand All @@ -50,6 +51,7 @@ func TestClearTrainingState(t *testing.T) {
[]byte(inferencetypes.TrainingTaskSequenceKey),
[]byte(inferencetypes.QueuedTrainingTaskKeyPrefix + "1"),
[]byte(inferencetypes.InProgressTrainingTaskKeyPrefix + "1"),
[]byte(inferencetypes.TrainingTaskKvRecordKeyPrefix + "1/key"),
[]byte("TrainingTask/sync/1/store/key/value"),
[]byte("TrainingTask/sync/1/heartbeat/0/participant/node"),
[]byte("TrainingTask/sync/1/barrier/b1/0/participant/node/value"),
Expand Down
24 changes: 23 additions & 1 deletion inference-chain/app/upgrades/v0_2_14/upgrades.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ func CreateUpgradeHandler(
fromVM["capability"] = mm.Modules["capability"].(module.HasConsensusVersion).ConsensusVersion()
}

// Future v0.2.14 migration steps land below this line.
if err := cleanupLeftoverState(ctx, k); err != nil {
return nil, err
}

toVM, err := mm.RunMigrations(ctx, configurator, fromVM)
if err != nil {
Expand All @@ -43,3 +45,23 @@ func CreateUpgradeHandler(
return toVM, nil
}
}

func cleanupLeftoverState(ctx context.Context, k keeper.Keeper) error {
k.LogInfo("cleaning up leftover state", types.Upgrades, "version", UpgradeName)

if err := k.MigrateEpochGroupValidationsToEntries(ctx); err != nil {
return err
}
if err := k.TopMiners.Clear(ctx, nil); err != nil {
return err
}
if err := k.ClearTrainingState(ctx); err != nil {
return err
}
if err := k.ClearLegacyPoCv2Data(ctx); err != nil {
return err
}

k.LogInfo("finished cleaning up leftover state", types.Upgrades, "version", UpgradeName)
return nil
}
128 changes: 128 additions & 0 deletions inference-chain/app/upgrades/v0_2_14/upgrades_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ package v0_2_14
import (
"testing"

"cosmossdk.io/collections"
"cosmossdk.io/store/prefix"
sdk "github.com/cosmos/cosmos-sdk/types"
keepertest "github.com/productscience/inference/testutil/keeper"
"github.com/productscience/inference/testutil/sample"
inferencekeeper "github.com/productscience/inference/x/inference/keeper"
inferencetypes "github.com/productscience/inference/x/inference/types"
"github.com/stretchr/testify/require"
)

Expand All @@ -11,3 +18,124 @@ import (
func TestUpgradeName(t *testing.T) {
require.Equal(t, "v0.2.14", UpgradeName)
}

func TestCleanupLeftoverState(t *testing.T) {
k, ctx, _ := keepertest.InferenceKeeperReturningMocks(t)
store := inferencekeeper.EmptyPrefixStore(ctx, &k)

currentEpoch := uint64(2)
previousEpoch := uint64(1)
oldEpoch := uint64(0)
require.NoError(t, k.SetEffectiveEpochIndex(ctx, currentEpoch))

currentValidation := inferencetypes.EpochGroupValidations{
Participant: "current-participant",
EpochIndex: currentEpoch,
ValidatedInferences: []string{"current-inf-1", "current-inf-2"},
}
previousValidation := inferencetypes.EpochGroupValidations{
Participant: "previous-participant",
EpochIndex: previousEpoch,
ValidatedInferences: []string{"previous-inf-1"},
}
oldValidation := inferencetypes.EpochGroupValidations{
Participant: "old-participant",
EpochIndex: oldEpoch,
ValidatedInferences: []string{"old-inf-1"},
}
require.NoError(t, k.EpochGroupValidationsMap.Set(ctx, collections.Join(currentEpoch, currentValidation.Participant), currentValidation))
require.NoError(t, k.EpochGroupValidationsMap.Set(ctx, collections.Join(previousEpoch, previousValidation.Participant), previousValidation))
require.NoError(t, k.EpochGroupValidationsMap.Set(ctx, collections.Join(oldEpoch, oldValidation.Participant), oldValidation))

topMinerAddr := sdk.MustAccAddressFromBech32(sample.AccAddress())
require.NoError(t, k.TopMiners.Set(ctx, topMinerAddr, inferencetypes.TopMiner{Address: topMinerAddr.String()}))

execAddr := sdk.MustAccAddressFromBech32(sample.AccAddress())
startAddr := sdk.MustAccAddressFromBech32(sample.AccAddress())
require.NoError(t, k.TrainingExecAllowListSet.Set(ctx, execAddr))
require.NoError(t, k.TrainingStartAllowListSet.Set(ctx, startAddr))

trainingKeys := [][]byte{
[]byte(inferencetypes.TrainingTaskKeyPrefix + "1"),
[]byte(inferencetypes.TrainingTaskSequenceKey),
[]byte(inferencetypes.QueuedTrainingTaskKeyPrefix + "1"),
[]byte(inferencetypes.InProgressTrainingTaskKeyPrefix + "1"),
[]byte(inferencetypes.TrainingTaskKvRecordKeyPrefix + "1/key"),
[]byte("TrainingTask/sync/1/store/key/value"),
[]byte("TrainingTask/sync/1/heartbeat/0/participant/node"),
[]byte("TrainingTask/sync/1/barrier/b1/0/participant/node/value"),
}
for _, key := range trainingKeys {
store.Set(key, []byte("training"))
}

legacyPoCPrefixes := [][]byte{
inferencetypes.LegacyPoCValidationV2Prefix,
inferencetypes.LegacyPoCV2StoreCommitPrefix,
inferencetypes.LegacyMLNodeWeightDistributionPrefix,
}
for i, pfx := range legacyPoCPrefixes {
key := append(append([]byte{}, pfx...), byte(i), byte(i+1))
store.Set(key, []byte("legacy-poc"))
require.Equal(t, 1, countPrefixEntries(t, store, pfx))
}

require.NoError(t, k.SetPocV2EnabledEpoch(ctx, 123))

require.NoError(t, cleanupLeftoverState(ctx, k))
require.NoError(t, cleanupLeftoverState(ctx, k))

migratedCurrent, found := k.GetEpochGroupValidations(ctx, currentValidation.Participant, currentEpoch)
require.True(t, found)
require.ElementsMatch(t, currentValidation.ValidatedInferences, migratedCurrent.ValidatedInferences)

migratedPrevious, found := k.GetEpochGroupValidations(ctx, previousValidation.Participant, previousEpoch)
require.True(t, found)
require.ElementsMatch(t, previousValidation.ValidatedInferences, migratedPrevious.ValidatedInferences)

_, found = k.GetEpochGroupValidations(ctx, oldValidation.Participant, oldEpoch)
require.False(t, found)

legacyIter, err := k.EpochGroupValidationsMap.Iterate(ctx, nil)
require.NoError(t, err)
legacyValues, err := legacyIter.Values()
require.NoError(t, err)
require.Empty(t, legacyValues)

hasTopMiner, err := k.TopMiners.Has(ctx, topMinerAddr)
require.NoError(t, err)
require.False(t, hasTopMiner)

hasExec, err := k.TrainingExecAllowListSet.Has(ctx, execAddr)
require.NoError(t, err)
require.False(t, hasExec)

hasStart, err := k.TrainingStartAllowListSet.Has(ctx, startAddr)
require.NoError(t, err)
require.False(t, hasStart)

for _, key := range trainingKeys {
require.Nil(t, store.Get(key), "expected key %q to be deleted", string(key))
}
for _, pfx := range legacyPoCPrefixes {
require.Equal(t, 0, countPrefixEntries(t, store, pfx))
}

pocV2Epoch, found := k.GetPocV2EnabledEpoch(ctx)
require.True(t, found)
require.Equal(t, uint64(123), pocV2Epoch)
}

func countPrefixEntries(t *testing.T, store *prefix.Store, pfx []byte) int {
t.Helper()

sub := prefix.NewStore(store, pfx)
iter := sub.Iterator(nil, nil)
defer iter.Close()

count := 0
for ; iter.Valid(); iter.Next() {
count++
}
return count
}
1 change: 1 addition & 0 deletions inference-chain/cmd/inferenced/cmd/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ func initRootCmd(
SetRpcServers(),
SetTrustedBlock(),
PatchToml(),
StateStatsCommand(),
)
}

Expand Down
Loading
Loading