Skip to content
Draft
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1cf5e6c
feat: add tracing
aaronc Oct 29, 2025
a207f94
work on otel tracer impl
aaronc Oct 29, 2025
ee0bfb3
add basic baseapp tracing
aaronc Oct 29, 2025
d5f5ea4
latest WIP
aaronc Oct 30, 2025
47f83e8
add trace exporter setup
aaronc Oct 30, 2025
7fafce3
fixes
aaronc Oct 30, 2025
bdba035
simapp setup, make tracers wrap loggers
aaronc Oct 30, 2025
25e3135
add test setup
aaronc Oct 30, 2025
5c7e464
fix shutdown order
aaronc Oct 30, 2025
d71f7c1
block trace nesting
aaronc Oct 30, 2025
56b215a
update metrics config and instrumentation
aaronc Oct 30, 2025
f9ce55c
start adding otel metric config
aaronc Oct 31, 2025
3fff00f
migrate to pure otel setup
aaronc Oct 31, 2025
5077567
fixes
aaronc Oct 31, 2025
31536b6
add basic metrics
aaronc Oct 31, 2025
c922688
add telemetry shutdown hook
aaronc Oct 31, 2025
ed891cc
docs, cleanup
aaronc Oct 31, 2025
f685bd4
WIP on removing go-metrics
aaronc Oct 31, 2025
42da2f7
Merge branch 'main' of github.com:cosmos/cosmos-sdk into aaronc/traci…
aaronc Oct 31, 2025
699f5d3
setup sim test flag
aaronc Oct 31, 2025
5df2460
integrate slog logging
aaronc Oct 31, 2025
1c84edb
update to use official env var
aaronc Oct 31, 2025
46e4bcb
add README.md
aaronc Nov 3, 2025
f0c3955
delete spaces
aaronc Nov 3, 2025
7dfb754
setup TestingMain
aaronc Nov 3, 2025
1ce344b
update suggested config in README.md
aaronc Nov 3, 2025
edbae92
add otel custom config options
aaronc Nov 3, 2025
0f8085a
add otel custom config options
aaronc Nov 3, 2025
03b6069
add more instrumentation
aaronc Nov 3, 2025
c4dbd07
remove pretty print
aaronc Nov 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,5 @@ debug_container.log
*.synctex.gz
/x/genutil/config/priv_validator_key.json
/x/genutil/data/priv_validator_state.json
/.envrc
/.env
46 changes: 39 additions & 7 deletions baseapp/abci.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
)

func (app *BaseApp) InitChain(req *abci.RequestInitChain) (*abci.ResponseInitChain, error) {
_, span := tracer.Start(context.Background(), "InitChain")
defer span.End()

if req.ChainId != app.chainID {
return nil, fmt.Errorf("invalid chain-id on InitChain; expected: %s, got: %s", app.chainID, req.ChainId)
}
Expand Down Expand Up @@ -152,7 +155,10 @@

// Query implements the ABCI interface. It delegates to CommitMultiStore if it
// implements Queryable.
func (app *BaseApp) Query(_ context.Context, req *abci.RequestQuery) (resp *abci.ResponseQuery, err error) {
func (app *BaseApp) Query(ctx context.Context, req *abci.RequestQuery) (resp *abci.ResponseQuery, err error) {
ctx, span := tracer.Start(ctx, "Query")

Check failure on line 159 in baseapp/abci.go

View workflow job for this annotation

GitHub Actions / golangci-lint

ineffectual assignment to ctx (ineffassign)

Check warning

Code scanning / CodeQL

Useless assignment to local variable Warning

This definition of ctx is never used.

Copilot Autofix

AI 4 days ago

To fix the problem, the assignment to `ctx` in `ctx, span := tracer.Start(ctx, "Query")` should be replaced with the blank identifier `_`, since the new value is not used afterward. Thus, the line should be updated to `_, span := tracer.Start(ctx, "Query")`. This keeps the logic correct and makes clear that the new context is irrelevant and only `span` is wanted. No additional import, method, or definition changes are necessary. Only a single line in `baseapp/abci.go` needs updating.

Suggested changeset 1
baseapp/abci.go

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/baseapp/abci.go b/baseapp/abci.go
--- a/baseapp/abci.go
+++ b/baseapp/abci.go
@@ -156,7 +156,7 @@
 // Query implements the ABCI interface. It delegates to CommitMultiStore if it
 // implements Queryable.
 func (app *BaseApp) Query(ctx context.Context, req *abci.RequestQuery) (resp *abci.ResponseQuery, err error) {
-	ctx, span := tracer.Start(ctx, "Query")
+	_, span := tracer.Start(ctx, "Query")
 	defer span.End()
 
 	// add panic recovery for all queries
EOF
@@ -156,7 +156,7 @@
// Query implements the ABCI interface. It delegates to CommitMultiStore if it
// implements Queryable.
func (app *BaseApp) Query(ctx context.Context, req *abci.RequestQuery) (resp *abci.ResponseQuery, err error) {
ctx, span := tracer.Start(ctx, "Query")
_, span := tracer.Start(ctx, "Query")
defer span.End()

// add panic recovery for all queries
Copilot is powered by AI and may make mistakes. Always verify output.
defer span.End()

// add panic recovery for all queries
//
// Ref: https://github.com/cosmos/cosmos-sdk/pull/8039
Expand Down Expand Up @@ -342,6 +348,9 @@
// will contain relevant error information. Regardless of tx execution outcome,
// the ResponseCheckTx will contain the relevant gas execution context.
func (app *BaseApp) CheckTx(req *abci.RequestCheckTx) (*abci.ResponseCheckTx, error) {
_, span := tracer.Start(context.Background(), "CheckTx")
defer span.End()

var mode sdk.ExecMode

switch req.Type {
Expand Down Expand Up @@ -454,7 +463,10 @@
}
}()

resp, err = app.abciHandlers.PrepareProposalHandler(prepareProposalState.Context(), req)
ctx := prepareProposalState.Context()
ctx, span := ctx.StartSpan(tracer, "PrepareProposal")
defer span.End()
resp, err = app.abciHandlers.PrepareProposalHandler(ctx, req)
if err != nil {
app.logger.Error("failed to prepare proposal", "height", req.Height, "time", req.Time, "err", err)
return &abci.ResponsePrepareProposal{Txs: req.Txs}, nil
Expand Down Expand Up @@ -513,7 +525,10 @@
}

processProposalState := app.stateManager.GetState(execModeProcessProposal)
processProposalState.SetContext(app.getContextForProposal(processProposalState.Context(), req.Height).
ctx := processProposalState.Context()
ctx, span := ctx.StartSpan(tracer, "ProcessProposal")
defer span.End()
processProposalState.SetContext(app.getContextForProposal(ctx, req.Height).
WithVoteInfos(req.ProposedLastCommit.Votes). // this is a set of votes that are not finalized yet, wait for commit
WithBlockHeight(req.Height).
WithBlockTime(req.Time).
Expand Down Expand Up @@ -595,6 +610,9 @@
return nil, errors.New("application ExtendVote handler not set")
}

ctx, span := ctx.StartSpan(tracer, "ExtendVote")
defer span.End()

// If vote extensions are not enabled, as a safety precaution, we return an
// error.
cp := app.GetConsensusParams(ctx)
Expand Down Expand Up @@ -666,6 +684,9 @@
ctx = sdk.NewContext(ms, emptyHeader, false, app.logger).WithStreamingManager(app.streamingManager)
}

ctx, span := ctx.StartSpan(tracer, "VerifyVoteExtension")
defer span.End()

// If vote extensions are not enabled, as a safety precaution, we return an
// error.
cp := app.GetConsensusParams(ctx)
Expand Down Expand Up @@ -716,7 +737,7 @@
// Execution flow or by the FinalizeBlock ABCI method. The context received is
// only used to handle early cancellation; for anything related to state,
// app.stateManager.GetState(execModeFinalize).Context() must be used.
func (app *BaseApp) internalFinalizeBlock(ctx context.Context, req *abci.RequestFinalizeBlock) (*abci.ResponseFinalizeBlock, error) {
func (app *BaseApp) internalFinalizeBlock(goCtx context.Context, req *abci.RequestFinalizeBlock) (*abci.ResponseFinalizeBlock, error) {
var events []abci.Event

if err := app.checkHalt(req.Height, req.Time); err != nil {
Expand Down Expand Up @@ -750,9 +771,12 @@
app.stateManager.SetState(execModeFinalize, app.cms, header, app.logger, app.streamingManager)
finalizeState = app.stateManager.GetState(execModeFinalize)
}
ctx := finalizeState.Context().WithContext(goCtx)
ctx, span := ctx.StartSpan(tracer, "internalFinalizeBlock")
defer span.End()

// Context is now updated with Header information.
finalizeState.SetContext(finalizeState.Context().
finalizeState.SetContext(ctx.
WithBlockHeader(header).
WithHeaderHash(req.Hash).
WithHeaderInfo(coreheader.Info{
Expand Down Expand Up @@ -846,7 +870,7 @@
WithBlockGasUsed(blockGasUsed).
WithBlockGasWanted(blockGasWanted),
)
endBlock, err := app.endBlock(finalizeState.Context())
endBlock, err := app.endBlock()
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -959,7 +983,11 @@
// height.
func (app *BaseApp) Commit() (*abci.ResponseCommit, error) {
finalizeState := app.stateManager.GetState(execModeFinalize)
header := finalizeState.Context().BlockHeader()
ctx := finalizeState.Context()
ctx, span := ctx.StartSpan(tracer, "Commit")
defer span.End()

header := ctx.BlockHeader()
retainHeight := app.GetBlockRetentionHeight(header.Height)

if app.abciHandlers.Precommiter != nil {
Expand Down Expand Up @@ -1005,6 +1033,10 @@
// The SnapshotIfApplicable method will create the snapshot by starting the goroutine
app.snapshotManager.SnapshotIfApplicable(header.Height)

blockCnt.Add(ctx, 1)
blockTime.Record(ctx, time.Since(app.blockStartTime).Seconds())
app.blockStartTime = time.Now()

Check warning

Code scanning / CodeQL

Calling the system time Warning

Calling the system time may be a possible source of non-determinism

return resp, nil
}

Expand Down
85 changes: 79 additions & 6 deletions baseapp/baseapp.go
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
package baseapp

import (
"context"
"fmt"
"maps"
"math"
"slices"
"strconv"
"sync"
"time"

"github.com/cockroachdb/errors"
abci "github.com/cometbft/cometbft/abci/types"
"github.com/cometbft/cometbft/crypto/tmhash"
cmtproto "github.com/cometbft/cometbft/proto/tendermint/types"
dbm "github.com/cosmos/cosmos-db"
"github.com/cosmos/gogoproto/proto"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/trace"
protov2 "google.golang.org/protobuf/proto"

errorsmod "cosmossdk.io/errors"

Check failure on line 24 in baseapp/baseapp.go

View workflow job for this annotation

GitHub Actions / golangci-lint

File is not properly formatted (gci)
"cosmossdk.io/log"
"cosmossdk.io/store"
storemetrics "cosmossdk.io/store/metrics"
"cosmossdk.io/store/snapshots"
storetypes "cosmossdk.io/store/types"

"cosmossdk.io/log"

"github.com/cosmos/cosmos-sdk/baseapp/config"
"github.com/cosmos/cosmos-sdk/baseapp/oe"
"github.com/cosmos/cosmos-sdk/baseapp/state"
"github.com/cosmos/cosmos-sdk/codec"
codectypes "github.com/cosmos/cosmos-sdk/codec/types"
servertypes "github.com/cosmos/cosmos-sdk/server/types"
"github.com/cosmos/cosmos-sdk/telemetry"

Check failure on line 38 in baseapp/baseapp.go

View workflow job for this annotation

GitHub Actions / golangci-lint

ST1019: package "github.com/cosmos/cosmos-sdk/telemetry" is being imported more than once (staticcheck)
_ "github.com/cosmos/cosmos-sdk/telemetry" // need to initialize telemetry before we declare tracer and metrics

Check failure on line 39 in baseapp/baseapp.go

View workflow job for this annotation

GitHub Actions / golangci-lint

ST1019(related information): other import of "github.com/cosmos/cosmos-sdk/telemetry" (staticcheck)
sdk "github.com/cosmos/cosmos-sdk/types"
sdkerrors "github.com/cosmos/cosmos-sdk/types/errors"
"github.com/cosmos/cosmos-sdk/types/mempool"
Expand Down Expand Up @@ -59,6 +65,41 @@

var _ servertypes.ABCI = (*BaseApp)(nil)

// Package-level OpenTelemetry handles used to instrument baseapp. The tracer
// and meter are obtained from the global otel providers under the "baseapp"
// name; the metric instruments are declared here and assigned in init().
var (
// tracer starts the spans created by the ABCI and tx-execution code paths.
tracer = otel.Tracer("baseapp")
// meter is the factory for the instruments declared below.
meter = otel.Meter("baseapp")
// blockCnt is the "block.count" counter; Commit adds 1 to it.
blockCnt metric.Int64Counter
// txCnt is the "tx.count" counter; it is incremented in RunTx.
txCnt metric.Int64Counter
// blockTime is the "block.time" histogram; Commit records the seconds
// elapsed since app.blockStartTime.
blockTime metric.Float64Histogram
// txTime is the "tx.time" histogram; RunTx records the microseconds
// elapsed since it started.
txTime metric.Int64Histogram
)

// init creates the baseapp metric instruments on the package-level meter and
// stores them in blockCnt, txCnt, blockTime and txTime.
//
// Any error returned while creating an instrument aborts start-up with a
// panic, since the package cannot record metrics without them.
func init() {
	var err error

	// Counters: number of blocks and number of transactions.
	if blockCnt, err = meter.Int64Counter("block.count"); err != nil {
		panic(err)
	}
	if txCnt, err = meter.Int64Counter("tx.count"); err != nil {
		panic(err)
	}

	// Histograms: block duration in seconds, tx duration in microseconds.
	if blockTime, err = meter.Float64Histogram(
		"block.time",
		metric.WithUnit("s"),
		metric.WithDescription("Block time in seconds"),
	); err != nil {
		panic(err)
	}
	if txTime, err = meter.Int64Histogram(
		"tx.time",
		metric.WithUnit("us"),
		metric.WithDescription("Transaction time in microseconds"),
	); err != nil {
		panic(err)
	}
}

// BaseApp reflects the ABCI application implementation.
type BaseApp struct {
// initialized on creation
Expand Down Expand Up @@ -164,6 +205,8 @@

// Optional alternative tx runner, used for block-stm parallel transaction execution. If nil, default txRunner is used.
txRunner sdk.TxRunner

blockStartTime time.Time
}

// NewBaseApp returns a reference to an initialized BaseApp. It accepts a
Expand All @@ -184,8 +227,11 @@
fauxMerkleMode: false,
sigverifyTx: true,
gasConfig: config.GasConfig{QueryGasLimit: math.MaxUint64},
blockStartTime: time.Now(),

Check warning

Code scanning / CodeQL

Calling the system time Warning

Calling the system time may be a possible source of non-determinism
}

// initialize tracer

for _, option := range options {
option(app)
}
Expand Down Expand Up @@ -656,6 +702,8 @@
if app.abciHandlers.PreBlocker != nil {
finalizeState := app.stateManager.GetState(execModeFinalize)
ctx := finalizeState.Context().WithEventManager(sdk.NewEventManager())
ctx, span := ctx.StartSpan(tracer, "preBlock")
defer span.End()
rsp, err := app.abciHandlers.PreBlocker(ctx, req)
if err != nil {
return nil, err
Expand All @@ -681,7 +729,10 @@
)

if app.abciHandlers.BeginBlocker != nil {
resp, err = app.abciHandlers.BeginBlocker(app.stateManager.GetState(execModeFinalize).Context())
ctx := app.stateManager.GetState(execModeFinalize).Context()
ctx, span := ctx.StartSpan(tracer, "beginBlock")
defer span.End()
resp, err = app.abciHandlers.BeginBlocker(ctx)
if err != nil {
return resp, err
}
Expand Down Expand Up @@ -739,11 +790,14 @@

// endBlock is an application-defined function that is called after transactions
// have been processed in FinalizeBlock.
func (app *BaseApp) endBlock(_ context.Context) (sdk.EndBlock, error) {
func (app *BaseApp) endBlock() (sdk.EndBlock, error) {
var endblock sdk.EndBlock

if app.abciHandlers.EndBlocker != nil {
eb, err := app.abciHandlers.EndBlocker(app.stateManager.GetState(execModeFinalize).Context())
ctx := app.stateManager.GetState(execModeFinalize).Context()
ctx, span := ctx.StartSpan(tracer, "endBlock")
defer span.End()
eb, err := app.abciHandlers.EndBlocker(ctx)
if err != nil {
return endblock, err
}
Expand Down Expand Up @@ -773,12 +827,16 @@
// both txbytes and the decoded tx are passed to runTx to avoid the state machine encoding the tx and decoding the transaction twice
// passing the decoded tx to runTX is optional, it will be decoded if the tx is nil
func (app *BaseApp) RunTx(mode sdk.ExecMode, txBytes []byte, tx sdk.Tx, txIndex int, txMultiStore storetypes.MultiStore, incarnationCache map[string]any) (gInfo sdk.GasInfo, result *sdk.Result, anteEvents []abci.Event, err error) {
startTime := time.Now()

Check warning

Code scanning / CodeQL

Calling the system time Warning

Calling the system time may be a possible source of non-determinism
ctx := app.getContextForTx(mode, txBytes, txIndex)
ctx, span := ctx.StartSpan(tracer, "runTx")
defer span.End()

// NOTE: GasWanted should be returned by the AnteHandler. GasUsed is
// determined by the GasMeter. We need access to the context to get the gas
// meter, so we initialize upfront.
var gasWanted uint64

ctx := app.getContextForTx(mode, txBytes, txIndex)
if incarnationCache != nil {
ctx = ctx.WithIncarnationCache(incarnationCache)
}
Expand Down Expand Up @@ -861,7 +919,9 @@
// performance benefits, but it'll be more difficult to get right.
anteCtx, msCache = app.cacheTxContext(ctx, txBytes)
anteCtx = anteCtx.WithEventManager(sdk.NewEventManager())
anteCtx, anteSpan := anteCtx.StartSpan(tracer, "anteHandler")
newCtx, err := app.anteHandler(anteCtx, tx, mode == execModeSimulate)
anteSpan.End()

if !newCtx.IsZero() {
// At this point, newCtx.MultiStore() is a store branch, or something else
Expand Down Expand Up @@ -951,6 +1011,9 @@
consumeBlockGas()

msCache.Write()

txCnt.Add(ctx, 1)
txTime.Record(ctx, time.Since(startTime).Microseconds())
}

if len(anteEvents) > 0 && (mode == execModeFinalize || mode == execModeSimulate) {
Expand All @@ -968,6 +1031,9 @@
// Handler does not exist for a given message route. Otherwise, a reference to a
// Result is returned. The caller must not commit state if an error is returned.
func (app *BaseApp) runMsgs(ctx sdk.Context, msgs []sdk.Msg, msgsV2 []protov2.Message, mode sdk.ExecMode) (*sdk.Result, error) {
ctx, span := ctx.StartSpan(tracer, "runMsgs")
defer span.End()

events := sdk.EmptyEvents()
var msgResponses []*codectypes.Any

Expand All @@ -984,11 +1050,18 @@
return nil, errorsmod.Wrapf(sdkerrors.ErrUnknownRequest, "no message handler found for %T", msg)
}

ctx, msgSpan := ctx.StartSpan(tracer, "msgHandler",
trace.WithAttributes(
attribute.String("msg_type", sdk.MsgTypeURL(msg)),
attribute.Int("msg_index", i),
),
)
// ADR 031 request type routing
msgResult, err := handler(ctx, msg)
if err != nil {
return nil, errorsmod.Wrapf(err, "failed to execute message; message index: %d", i)
}
msgSpan.End()

// create message events
msgEvents, err := createEvents(app.cdc, msgResult.GetEvents(), msg, msgsV2[i])
Expand Down
Loading
Loading