Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ cmd/exporter/exporter.go # Entrypoint: flags, provider selection, HT
pkg/provider/provider.go # Provider, Collector, Registry interfaces
pkg/aws/aws.go # AWS: S3, EC2, RDS, NATGATEWAY, ELB, VPC
pkg/google/gcp.go # GCP: GCS, GKE, CLB, SQL, VPC
pkg/azure/azure.go # Azure: AKS
pkg/azure/azure.go # Azure: AKS, blob
pkg/gatherer/gatherer.go # Wraps Collect(): duration, errors, metadata metrics
pkg/utils/consts.go # Shared metric suffixes, HoursInMonth, GenerateDesc()
cmd/dashboards/main.go # Dashboard generation (grafana-foundation-sdk)
Expand Down Expand Up @@ -58,7 +58,8 @@ Rule: Never push to `main`.
```bash
go run cmd/exporter/exporter.go -provider gcp -project-id=$GCP_PROJECT_ID -gcp.services GKE,GCS
go run cmd/exporter/exporter.go -provider aws -aws.profile $AWS_PROFILE -aws.services EC2,S3
go run cmd/exporter/exporter.go -provider azure -azure.subscription-id $AZ_SUBSCRIPTION_ID
go run cmd/exporter/exporter.go -provider azure -azure.subscription-id $AZ_SUBSCRIPTION_ID -azure.services AKS
go run cmd/exporter/exporter.go -provider azure -azure.subscription-id $AZ_SUBSCRIPTION_ID -azure.services blob
```

### Adding a collector
Expand Down
18 changes: 16 additions & 2 deletions cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func providerFlags(fs *flag.FlagSet, cfg *config.Config) {
fs.Var(config.NewDeprecatedStringSliceFlag(&cfg.Providers.GCP.Projects, &cfg.Providers.GCP.BucketProjectsDeprecated), "gcp.bucket-projects", "GCP project(s). (deprecated: use --gcp.projects instead)")
fs.Var(&cfg.Providers.AWS.Services, "aws.services", "AWS service(s).")
fs.Var(&cfg.Providers.AWS.ExcludeRegions, "aws.exclude-regions", "AWS region(s) to exclude from cost collection.")
fs.Var(&cfg.Providers.Azure.Services, "azure.services", "Azure service(s).")
fs.Var(&cfg.Providers.Azure.Services, "azure.services", "Azure service(s): AKS, blob (comma-separated and/or repeat flag; case-insensitive).")
fs.Var(&cfg.Providers.GCP.Services, "gcp.services", "GCP service(s).")
flag.StringVar(&cfg.Providers.AWS.Region, "aws.region", "", "AWS region")
flag.StringVar(&cfg.Providers.AWS.RoleARN, "aws.roleARN", "", "Optional AWS role ARN to assume for cross-account access.")
Expand Down Expand Up @@ -224,6 +224,19 @@ func selectProvider(ctx context.Context, cfg *config.Config) (provider.Provider,

type newProviderFunc[T any] func(context.Context, T) (provider.Provider, error)

// expandAzureServices flattens repeated -azure.services values and comma-separated tokens (parity with AWS/GCP flags).
func expandAzureServices(flags config.StringSliceFlag) []string {
var out []string
for _, s := range flags {
for _, part := range strings.Split(s, ",") {
if t := strings.TrimSpace(part); t != "" {
out = append(out, t)
}
}
}
return out
}

func selectProviderWith(
ctx context.Context,
cfg *config.Config,
Expand All @@ -242,7 +255,8 @@ func selectProviderWith(
return newAzure(ctx, &azure.Config{
Logger: cfg.Logger,
SubscriptionId: cfg.Providers.Azure.SubscriptionId,
Services: cfg.Providers.Azure.Services,
ScrapeInterval: cfg.Collector.ScrapeInterval,
Services: expandAzureServices(cfg.Providers.Azure.Services),
Copy link
Copy Markdown
Contributor

@rooneyshuman rooneyshuman Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not just use the inline pattern used by the other providers? eg, strings.Split(cfg.Providers.Azure.Services.String(), ","),

Copy link
Copy Markdown
Contributor

@stephan-rayner stephan-rayner Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Context:
There are two ways in which this could be argued to be better:

If we get whitespace in the list, split would do this "thing, blob" => "thing", " blob" while this option would do "thing, blob" => "thing", "blob". But since later in this flow we trim whitespace, this isn't a huge save.

If for some reason cfg.Providers.Azure.Services == "" then this will be mapped to [""] in the string split option and nil in this option. In the split option this will lead to "unknown service" being logged eventually... In the case of nil being returned it would be a no-op (could be made to send a metric or log or something more communicative)

Suggestion:
AWS and GCP already use the string split approach and we don't have problems there, so lets do that, it also means we can pull the test out.

Future:
That said, I think this would be better served as a helper utility we would incorporate across the codebase to avoid the extra confusing log line (because "" isn't an unknown service it is no service). I think the utility-ification of this could be done in a future PR.

CollectorTimeout: collectorTimeout,
})
case "aws":
Expand Down
22 changes: 22 additions & 0 deletions cmd/exporter/exporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"net/http"
"net/http/httptest"
"slices"
"testing"
"time"

Expand Down Expand Up @@ -140,6 +141,27 @@ func Test_selectProvider(t *testing.T) {
}
}

// Test_expandAzureServices covers flattening of repeated flags, comma-separated
// tokens, whitespace trimming, and empty-token elision.
func Test_expandAzureServices(t *testing.T) {
	cases := map[string]struct {
		input config.StringSliceFlag
		want  []string
	}{
		"empty":                       {input: nil, want: nil},
		"single token":                {input: config.StringSliceFlag{"blob"}, want: []string{"blob"}},
		"comma separated in one flag": {input: config.StringSliceFlag{"AKS, blob"}, want: []string{"AKS", "blob"}},
		"repeat flag":                 {input: config.StringSliceFlag{"AKS", "blob"}, want: []string{"AKS", "blob"}},
		"trims and skips empty":       {input: config.StringSliceFlag{" AKS , ", ""}, want: []string{"AKS"}},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			if got := expandAzureServices(tc.input); !slices.Equal(got, tc.want) {
				t.Fatalf("expandAzureServices(%#v) = %#v, want %#v", tc.input, got, tc.want)
			}
		})
	}
}

func Test_regionFromConfig(t *testing.T) {
tests := map[string]struct {
provider string
Expand Down
2 changes: 1 addition & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
- [Providers](metrics/providers.md)
- **AWS:** [EC2](metrics/aws/ec2.md), [S3](metrics/aws/s3.md), [RDS](metrics/aws/rds.md), [MSK](metrics/aws/msk.md), [ELB](metrics/aws/elb.md), [NAT Gateway](metrics/aws/natgateway.md), [VPC](metrics/aws/vpc.md)
- **GCP:** [GKE](metrics/gcp/gke.md), [GCS](metrics/gcp/gcs.md), [Cloud SQL](metrics/gcp/cloudsql.md), [Managed Kafka](metrics/gcp/managedkafka.md), [CLB](metrics/gcp/clb.md), [VPC](metrics/gcp/vpc.md)
- **Azure:** [AKS](metrics/azure/aks.md)
- **Azure:** [AKS](metrics/azure/aks.md), [blob](metrics/azure/blob.md)
- [Deploying](deploying/aws/README.md) - Run the exporter
- [AWS](deploying/aws/README.md) — IRSA, Helm, cross-account access
1 change: 1 addition & 0 deletions docs/metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@
## Azure Services

- **[AKS](./azure/aks.md)** - Azure Kubernetes Service VM instances and managed disks
- **[Blob](./azure/blob.md)** - Azure Blob Storage (cost metrics defined; no series until Cost Management integration)
31 changes: 31 additions & 0 deletions docs/metrics/azure/blob.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Azure Blob Storage metrics

## Status

Pass `blob` in `--azure.services` (matching is case-insensitive) to register the Blob Collector. The collector does not query Azure Cost Management yet: `Describe` lists the storage cost `GaugeVec`, the exporter does not `MustRegister` it on the root registry (same pattern as `azure_aks`), and `Collect` forwards `StorageGauge.Collect(ch)` to the parent Azure gatherer. No samples appear until `Collect` calls `Set` on label values (not implemented yet).
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
The Blob Collector is registered when you pass `blob` in `--azure.services` (matching is case-insensitive). The collector **does not query Azure Cost Management yet**. The storage cost `GaugeVec` appears in `Describe` but is **not** `MustRegister`'d on the root registry (same pattern as `azure_aks`); **`Collect` forwards** `StorageGauge.Collect(ch)` to the parent Azure gatherer. **There are no samples** until **`Collect` calls `Set`** on label values (not implemented yet).
Pass blob to -azure.services to enable this collector (matching is case-insensitive).

Let's use active voice here. We can iterate on this doc, let's not add what the collector doesn't do


## Planned behavior (see issue #54):

### Metric shape (AWS S3 and GCP GCS in this repo)

- **Storage rate:** Same naming pattern as S3 and GCS: `storage_by_location_usd_per_gibyte_hour` under the service subsystem (here `azure_blob` → `cloudcost_azure_blob_storage_by_location_usd_per_gibyte_hour`). Implemented today as `cloudcost_aws_s3_*` in `pkg/aws/s3/s3.go` and `cloudcost_gcp_gcs_*` in `pkg/google/metrics/metrics.go`.
- **Operations rate (optional):** S3 and GCS also define `operation_by_location_usd_per_krequest` (`pkg/aws/s3/s3.go`, `pkg/google/metrics/metrics.go`). **Not registered in the exporter yet** (see Planned future work under [Cost metrics](#cost-metrics)). A Blob Collector may expose an analogous gauge when the chosen Azure dataset supports it.
- **Labels:** S3 uses `region` and `class` for storage (and `tier` for operations); GCS uses `location` and `storage_class` (and operation class labels for ops). Azure label sets would follow whatever dimensions the Cost Management query returns; the above collectors show the cross-cloud “by location + storage class/tier” pattern only.

### Azure Cost Management

- **Query API:** Azure documentation mentions `POST https://management.azure.com/{scope}/providers/Microsoft.CostManagement/query` and states that `{scope}` includes **`/subscriptions/{subscriptionId}/`** for subscription scope. Sample responses in that snapshot include cost columns such as **`PreTaxCost`** alongside dimensions like **`ResourceLocation`** in example filters.
- **Permissions:** Azure documentation states that you must have **Cost Management permissions at the appropriate scope** to use Cost Management APIs (the captured article links to Microsoft’s assign-access documentation for details). No specific Azure role name is asserted here.
Comment on lines +7 to +18
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd remove future work plans from this doc. The docs is meant to describe what functionality currently exists. We could add this info to the issue though!


### Provider operational metrics

- **`cloudcost_exporter_collector_*` with `collector="azure_blob"`** — same mechanism as other Azure collectors (e.g. `azure_aks`) via `pkg/azure/azure.go` and the shared gatherer pattern.
- **Registry:** Blob cost `GaugeVec`s are not registered separately on the Prometheus registry; `Describe` lists them for the parent Azure collector, and **`Collect` forwards** `StorageGauge.Collect(ch)` so metric descriptors are not duplicated.

## Cost metrics

| Metric name | Type | Labels | Samples |
|-------------|------|--------|---------|
| `cloudcost_azure_blob_storage_by_location_usd_per_gibyte_hour` | Gauge | `region`, `class` | None; exporter does not populate this metric yet (no billing API calls in `Collect`) |

**Planned future work:** `cloudcost_azure_blob_operation_by_location_usd_per_krequest` (Gauge; `region`, `class`, `tier`) — parity with S3/GCS operation metrics; commented out in `pkg/azure/blob/blob.go` until operation pricing is implemented. How we group cost rows into labels is TBD when we add the API integration.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment here on including future work

27 changes: 23 additions & 4 deletions pkg/azure/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/prometheus/client_golang/prometheus"

"github.com/grafana/cloudcost-exporter/pkg/azure/aks"
"github.com/grafana/cloudcost-exporter/pkg/azure/blob"
"github.com/grafana/cloudcost-exporter/pkg/azure/client"
"github.com/grafana/cloudcost-exporter/pkg/collectormetrics"
"github.com/grafana/cloudcost-exporter/pkg/provider"
Expand Down Expand Up @@ -65,6 +66,7 @@ type Config struct {
Region string

SubscriptionId string
ScrapeInterval time.Duration

CollectorTimeout time.Duration
Services []string
Expand All @@ -90,15 +92,32 @@ func New(ctx context.Context, config *Config) (*Azure, error) {
return nil, err
}

// Collector Registration
// Collector Registration (--azure.services matching is case-insensitive).
for _, svc := range config.Services {
switch strings.ToUpper(svc) {
case "AKS":
svc = strings.TrimSpace(svc)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to TrimSpace() here again? I think the new expandAzureServices() already trims it before the services are passed to azure.New()

switch {
case strings.EqualFold(svc, "AKS"):
collector, err := aks.New(ctx, &aks.Config{
Logger: logger,
}, azClientWrapper)
if err != nil {
return nil, err
logger.LogAttrs(ctx, slog.LevelError, "Error creating collector",
slog.String("service", svc),
slog.String("message", err.Error()))
continue
}
collectors = append(collectors, collector)
case strings.EqualFold(svc, "blob"):
collector, err := blob.New(&blob.Config{
Logger: logger,
SubscriptionId: config.SubscriptionId,
ScrapeInterval: config.ScrapeInterval,
})
if err != nil {
logger.LogAttrs(ctx, slog.LevelError, "Error creating collector",
slog.String("service", svc),
slog.String("message", err.Error()))
continue
}
collectors = append(collectors, collector)
default:
Expand Down
99 changes: 99 additions & 0 deletions pkg/azure/blob/blob.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package blob

import (
"context"
"log/slog"
"time"

"github.com/grafana/cloudcost-exporter/pkg/provider"
"github.com/prometheus/client_golang/prometheus"

cloudcost_exporter "github.com/grafana/cloudcost-exporter"
)

const subsystem = "azure_blob"

// metrics holds Prometheus collectors for blob cost rates. Vectors are not registered on the root registry;
// Azure's top-level Collector gathers them via Collect → GaugeVec.Collect (same pattern as pkg/azure/aks).
type metrics struct {
StorageGauge *prometheus.GaugeVec
// Planned future work: operation request rate (parity with S3/GCS cloudcost_*_operation_by_location_usd_per_krequest).
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we might already have that with existing metrics, see https://ops.grafana-ops.net/goto/ffio5ap805csgf?orgId=stacks-27821

but it depends what kind of operation rate we are talking about :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good to know. I'll keep that in mind when digging into the next metric. Thanks!

// OperationsGauge *prometheus.GaugeVec
}

// newMetrics constructs the blob cost gauge vectors. They are deliberately not
// registered on the root registry; the parent Azure collector gathers them
// through Collect → GaugeVec.Collect (same pattern as pkg/azure/aks).
func newMetrics() metrics {
	storageOpts := prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(cloudcost_exporter.MetricPrefix, subsystem, "storage_by_location_usd_per_gibyte_hour"),
		Help: "Storage cost of blob objects by region and class. Cost represented in USD/(GiB*h). No samples until Cost Management is integrated.",
	}

	// Planned future work: an operation cost gauge,
	// cloudcost_azure_blob_operation_by_location_usd_per_krequest
	// (labels region, class, tier), once Cost Management dimensions support it.

	return metrics{
		StorageGauge: prometheus.NewGaugeVec(storageOpts, []string{"region", "class"}),
	}
}

// Collector implements provider.Collector for Azure Blob Storage cost metrics.
// Cost Management integration is not implemented yet; there are no labeled series until Collect calls Set on the vec.
type Collector struct {
	logger         *slog.Logger  // collector-scoped logger (New adds collector=blob)
	metrics        metrics       // cost gauge vectors; gathered via Collect, never registered on the root registry
	subscriptionID string        // stored for the future Cost Management query scope
	scrapeInterval time.Duration // stored for the future Cost Management query; defaulted to one hour in New
}

// Config holds settings for the blob collector.
type Config struct {
	Logger         *slog.Logger  // parent logger; New derives a collector-scoped child from it
	SubscriptionId string        // Azure subscription id, stored for Cost Management integration
	ScrapeInterval time.Duration // desired collection cadence; New defaults non-positive values to one hour
}

// New builds a blob collector. It does not call Azure APIs yet; subscription and
// interval are stored for Cost Management integration.
//
// A nil cfg.Logger falls back to slog.Default() so New never panics on a
// partially populated Config; a non-positive ScrapeInterval defaults to one hour.
func New(cfg *Config) (*Collector, error) {
	logger := cfg.Logger
	if logger == nil {
		// Defensive default: cfg.Logger.With on a nil *slog.Logger would panic.
		logger = slog.Default()
	}
	interval := cfg.ScrapeInterval
	if interval <= 0 {
		interval = time.Hour
	}
	return &Collector{
		logger:         logger.With("collector", "blob"),
		metrics:        newMetrics(),
		subscriptionID: cfg.SubscriptionId,
		scrapeInterval: interval,
	}, nil
}

// Collect satisfies provider.Collector. Does not call Set on cost vectors yet; still forwards the vec on ch for the parent gatherer.
func (c *Collector) Collect(ctx context.Context, ch chan<- prometheus.Metric) error {
	// Log each scrape so the (currently no-op) collector is visible in operational logs.
	c.logger.LogAttrs(ctx, slog.LevelInfo, "collecting metrics")
	// With no Set calls yet, the vec holds no series, so nothing is emitted on ch.
	c.metrics.StorageGauge.Collect(ch)
	return nil
}

// Describe satisfies provider.Collector by emitting the storage gauge's descriptor on ch
// (the parent Azure collector fans this out; the vec itself is never registered directly).
func (c *Collector) Describe(ch chan<- *prometheus.Desc) error {
	c.metrics.StorageGauge.Describe(ch)
	// Planned future work: c.metrics.OperationsGauge.Describe(ch)
	return nil
}

// Name returns the collector subsystem name ("azure_blob") used to label operational metrics.
func (c *Collector) Name() string {
	return subsystem
}

// Register satisfies provider.Collector. Does not register cost metrics on the registry (avoids duplicate Desc
// with Azure's Describe fan-out; metrics are collected via Collect → StorageGauge.Collect).
// context.Background() is used because registration happens once at startup with no caller-supplied context.
func (c *Collector) Register(_ provider.Registry) error {
	c.logger.LogAttrs(context.Background(), slog.LevelInfo, "registering collector")
	return nil
}
84 changes: 84 additions & 0 deletions pkg/azure/blob/blob_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package blob

import (
"context"
"log/slog"
"os"
"strings"
"testing"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

cloudcost_exporter "github.com/grafana/cloudcost-exporter"
)

var testLogger = slog.New(slog.NewTextHandler(os.Stdout, nil))

// testCollectSink builds a buffered metric channel so Collect can run in tests
// without a concurrent reader draining it.
func testCollectSink() chan prometheus.Metric {
	sink := make(chan prometheus.Metric, 8)
	return sink
}

// TestCollector_Describe asserts that exactly one descriptor — the storage
// gauge under the azure_blob subsystem — is emitted by Describe.
func TestCollector_Describe(t *testing.T) {
	collector, err := New(&Config{Logger: testLogger})
	require.NoError(t, err)

	descs := make(chan *prometheus.Desc, 4)
	require.NoError(t, collector.Describe(descs))
	close(descs)

	var names []string
	for desc := range descs {
		names = append(names, desc.String())
	}

	require.Len(t, names, 1)
	wantPrefix := cloudcost_exporter.MetricPrefix + "_azure_blob_"
	assert.Contains(t, strings.Join(names, " "), wantPrefix+"storage_by_location_usd_per_gibyte_hour")
}

// TestCollector_Register checks that Register succeeds with a nil registry:
// cost metrics follow the AKS pattern and are gathered via Collect, so
// Register must never call registry.MustRegister.
func TestCollector_Register(t *testing.T) {
	collector, err := New(&Config{Logger: testLogger})
	require.NoError(t, err)
	require.NoError(t, collector.Register(nil))
}

// TestCollector_Collect_forwardsStorageGauge verifies Collect completes without
// error; since nothing calls Set yet the buffered sink absorbs any output.
func TestCollector_Collect_forwardsStorageGauge(t *testing.T) {
	collector, err := New(&Config{Logger: testLogger})
	require.NoError(t, err)
	require.NoError(t, collector.Collect(context.Background(), testCollectSink()))
}

// TestNew_configPlumbing checks that New stores the subscription id verbatim
// and defaults a non-positive scrape interval to one hour.
func TestNew_configPlumbing(t *testing.T) {
	const subUUID = "11111111-1111-1111-1111-111111111111"
	cases := map[string]struct {
		subscriptionID string
		scrapeInterval time.Duration
		wantInterval   time.Duration
	}{
		"zero scrape interval defaults to one hour": {
			subscriptionID: "sub-1",
			wantInterval:   time.Hour,
		},
		"explicit subscription and interval": {
			subscriptionID: subUUID,
			scrapeInterval: 30 * time.Minute,
			wantInterval:   30 * time.Minute,
		},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			collector, err := New(&Config{
				Logger:         testLogger,
				SubscriptionId: tc.subscriptionID,
				ScrapeInterval: tc.scrapeInterval,
			})
			require.NoError(t, err)
			assert.Equal(t, tc.subscriptionID, collector.subscriptionID)
			assert.Equal(t, tc.wantInterval, collector.scrapeInterval)
		})
	}
}
Loading