Skip to content

Commit 40ff36d

Browse files
committed
feat: implement actual download statistics scraping from package pages
1 parent 0117253 commit 40ff36d

File tree

6 files changed

+419
-5
lines changed

6 files changed

+419
-5
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ A Prometheus exporter for GitHub Container Registry (GHCR) metrics.
66

77
- Collects package download statistics from GitHub Container Registry (GHCR)
88
- Tracks package version counts and last published timestamps
9+
- Scrapes actual download counts from GitHub package pages
910
- Monitors collection performance and success rates
1011
- Supports both user and organization packages
1112
- Prometheus metrics endpoint with health checks
@@ -62,12 +63,17 @@ make build
6263
The exporter provides the following metrics:
6364

6465
- `ghcr_exporter_info` - Information about the exporter
65-
- `ghcr_package_downloads_total` - Version count (proxy for package activity)
66+
- `ghcr_package_version_count` - Total number of versions for a package
67+
- `ghcr_package_downloads` - **Actual download count** scraped from package pages
6668
- `ghcr_package_last_published_timestamp` - Last published timestamp
6769
- `ghcr_collection_duration_seconds` - Collection duration
6870
- `ghcr_collection_success_total` - Successful collections
6971
- `ghcr_collection_failed_total` - Failed collections
7072

73+
### Important Note About Download Statistics
74+
75+
The `ghcr_package_downloads` metric reports the **actual download count**, scraped from the package page HTML, so it matches what GitHub displays (e.g., "Total downloads 176K"). If the package page cannot be fetched or parsed, the metric is set to `-1` to indicate that no data is available. This differs from `ghcr_package_version_count`, which only reports the number of versions/tags available for the package.
76+
7177
## Development
7278

7379
```bash

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ require (
1111
)
1212

1313
require (
14+
github.com/antchfx/htmlquery v1.3.4 // indirect
15+
github.com/antchfx/xpath v1.3.3 // indirect
1416
github.com/beorn7/perks v1.0.1 // indirect
1517
github.com/bytedance/sonic v1.11.6 // indirect
1618
github.com/bytedance/sonic/loader v0.1.1 // indirect
@@ -23,6 +25,7 @@ require (
2325
github.com/go-playground/universal-translator v0.18.1 // indirect
2426
github.com/go-playground/validator/v10 v10.20.0 // indirect
2527
github.com/goccy/go-json v0.10.2 // indirect
28+
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
2629
github.com/json-iterator/go v1.1.12 // indirect
2730
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
2831
github.com/kr/text v0.2.0 // indirect

go.sum

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ=
2+
github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM=
3+
github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
4+
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
15
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
26
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
37
github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
@@ -30,6 +34,9 @@ github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBEx
3034
github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
3135
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
3236
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
37+
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
38+
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
39+
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
3340
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
3441
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
3542
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
@@ -89,21 +96,84 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
8996
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
9097
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
9198
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
99+
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
92100
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
93101
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
94102
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
95103
golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
96104
golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
105+
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
106+
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
107+
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
108+
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
109+
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
110+
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
97111
golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4=
98112
golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc=
113+
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
114+
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
115+
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
116+
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
117+
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
118+
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
119+
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
120+
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
121+
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
122+
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
123+
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
124+
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
125+
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
126+
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
99127
golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs=
100128
golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8=
129+
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
130+
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
131+
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
132+
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
133+
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
134+
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
135+
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
136+
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
137+
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
138+
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
139+
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
140+
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
101141
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
102142
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
143+
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
144+
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
145+
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
146+
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
147+
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
103148
golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
104149
golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
150+
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
151+
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
152+
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
153+
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
154+
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
155+
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
156+
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
157+
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
158+
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
159+
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
160+
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
161+
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
162+
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
163+
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
164+
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
165+
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
166+
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
167+
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
105168
golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng=
106169
golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU=
170+
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
171+
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
172+
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
173+
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
174+
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
175+
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
176+
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
107177
google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A=
108178
google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
109179
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

internal/collectors/ghcr_collector.go

Lines changed: 183 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package collectors
22

33
import (
4+
"compress/gzip"
45
"context"
56
"encoding/json"
67
"fmt"
8+
"io"
79
"log/slog"
810
"net/http"
911
"strconv"
@@ -171,7 +173,7 @@ func (gc *GHCRCollector) collectPackageMetrics(ctx context.Context, repo string,
171173
}
172174

173175
// Update metrics
174-
gc.updatePackageMetrics(pkg, packageInfo, versions)
176+
gc.updatePackageMetrics(ctx, pkg, packageInfo, versions)
175177

176178
return nil
177179
}
@@ -288,7 +290,7 @@ func (gc *GHCRCollector) getPackageVersions(ctx context.Context, owner, repo, pa
288290
return versions, nil
289291
}
290292

291-
func (gc *GHCRCollector) updatePackageMetrics(pkg config.PackageGroup, packageInfo *GHCRPackageResponse, versions []GHCRVersionResponse) {
293+
func (gc *GHCRCollector) updatePackageMetrics(ctx context.Context, pkg config.PackageGroup, packageInfo *GHCRPackageResponse, versions []GHCRVersionResponse) {
292294
// Update package-level metrics with real data
293295
// Note: GitHub API doesn't provide download statistics for packages
294296
// We'll use version count as a proxy metric and track last published time
@@ -308,13 +310,24 @@ func (gc *GHCRCollector) updatePackageMetrics(pkg config.PackageGroup, packageIn
308310
// Use version count as a proxy for activity (more versions = more activity)
309311
gc.metrics.PackageDownloadsGauge.WithLabelValues(pkg.Owner, pkg.Repo).Set(float64(packageInfo.VersionCount))
310312

313+
// Try to get actual download statistics from the package page
314+
downloadCount, err := gc.getPackageDownloadStats(ctx, pkg.Owner, pkg.Repo)
315+
if err != nil {
316+
slog.Warn("Failed to get download statistics", "package", pkg.Repo, "error", err)
317+
// Set to -1 to indicate no data available
318+
gc.metrics.PackageDownloadStatsGauge.WithLabelValues(pkg.Owner, pkg.Repo).Set(-1)
319+
} else {
320+
gc.metrics.PackageDownloadStatsGauge.WithLabelValues(pkg.Owner, pkg.Repo).Set(float64(downloadCount))
321+
}
322+
311323
if !lastPublished.IsZero() {
312324
gc.metrics.PackageLastPublishedGauge.WithLabelValues(pkg.Owner, pkg.Repo).Set(float64(lastPublished.Unix()))
313325
}
314326

315327
slog.Info("Updated package metrics",
316328
"package", pkg.Repo,
317329
"version_count", packageInfo.VersionCount,
330+
"download_count", downloadCount,
318331
"last_published", lastPublished.Format(time.RFC3339))
319332
}
320333

@@ -338,3 +351,171 @@ func (gc *GHCRCollector) retryWithBackoff(operation func() error, maxRetries int
338351

339352
return fmt.Errorf("operation failed after %d retries: %w", maxRetries, lastErr)
340353
}
354+
355+
// getPackageDownloadStats scrapes the package page to get actual download statistics
356+
func (gc *GHCRCollector) getPackageDownloadStats(ctx context.Context, owner, packageName string) (int64, error) {
357+
slog.Info("Starting download statistics collection", "owner", owner, "package", packageName)
358+
359+
// Construct the package page URL
360+
packageURL := fmt.Sprintf("https://github.com/%s/%s/pkgs/container/%s", owner, packageName, packageName)
361+
slog.Debug("Constructed package URL", "url", packageURL)
362+
363+
// Create request to the package page
364+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, packageURL, nil)
365+
if err != nil {
366+
slog.Error("Failed to create HTTP request", "owner", owner, "package", packageName, "error", err)
367+
return 0, fmt.Errorf("failed to create request: %w", err)
368+
}
369+
370+
slog.Debug("Created HTTP request successfully")
371+
372+
// Set headers to mimic a browser request
373+
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
374+
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
375+
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
376+
req.Header.Set("Accept-Encoding", "gzip, deflate, br")
377+
req.Header.Set("DNT", "1")
378+
req.Header.Set("Connection", "keep-alive")
379+
req.Header.Set("Upgrade-Insecure-Requests", "1")
380+
req.Header.Set("Sec-Fetch-Dest", "document")
381+
req.Header.Set("Sec-Fetch-Mode", "navigate")
382+
req.Header.Set("Sec-Fetch-Site", "none")
383+
req.Header.Set("Sec-Fetch-User", "?1")
384+
req.Header.Set("Cache-Control", "max-age=0")
385+
slog.Debug("Set browser-like headers", "user_agent", req.Header.Get("User-Agent"))
386+
387+
// Make the request
388+
slog.Debug("Making HTTP request to package page")
389+
390+
resp, err := gc.client.Do(req)
391+
if err != nil {
392+
slog.Error("Failed to fetch package page", "owner", owner, "package", packageName, "url", packageURL, "error", err)
393+
return 0, fmt.Errorf("failed to fetch package page: %w", err)
394+
}
395+
396+
defer func() {
397+
if closeErr := resp.Body.Close(); closeErr != nil {
398+
slog.Warn("Failed to close response body", "error", closeErr)
399+
}
400+
}()
401+
402+
slog.Debug("Received HTTP response", "status_code", resp.StatusCode, "content_length", resp.ContentLength, "content_type", resp.Header.Get("Content-Type"))
403+
404+
if resp.StatusCode != http.StatusOK {
405+
slog.Error("Package page returned non-OK status", "owner", owner, "package", packageName, "status_code", resp.StatusCode, "url", packageURL)
406+
return 0, fmt.Errorf("package page returned status %d", resp.StatusCode)
407+
}
408+
409+
// Read the response body
410+
slog.Debug("Reading response body")
411+
412+
body, err := io.ReadAll(resp.Body)
413+
if err != nil {
414+
slog.Error("Failed to read response body", "owner", owner, "package", packageName, "error", err)
415+
return 0, fmt.Errorf("failed to read response body: %w", err)
416+
}
417+
418+
// Handle gzip decompression if needed
419+
if resp.Header.Get("Content-Encoding") == "gzip" {
420+
slog.Debug("Decompressing gzipped response")
421+
422+
gzReader, err := gzip.NewReader(strings.NewReader(string(body)))
423+
if err != nil {
424+
slog.Error("Failed to create gzip reader", "owner", owner, "package", packageName, "error", err)
425+
return 0, fmt.Errorf("failed to create gzip reader: %w", err)
426+
}
427+
428+
defer func() {
429+
if closeErr := gzReader.Close(); closeErr != nil {
430+
slog.Warn("Failed to close gzip reader", "error", closeErr)
431+
}
432+
}()
433+
434+
// Read the decompressed content
435+
decompressedBody, err := io.ReadAll(gzReader)
436+
if err != nil {
437+
slog.Error("Failed to read decompressed body", "owner", owner, "package", packageName, "error", err)
438+
return 0, fmt.Errorf("failed to read decompressed body: %w", err)
439+
}
440+
441+
body = decompressedBody
442+
slog.Debug("Gzip decompression successful", "original_size", len(body), "decompressed_size", len(decompressedBody))
443+
}
444+
445+
bodySize := len(body)
446+
slog.Debug("Response body read successfully", "body_size_bytes", bodySize)
447+
448+
if bodySize == 0 {
449+
slog.Error("Response body is empty", "owner", owner, "package", packageName, "url", packageURL)
450+
return 0, fmt.Errorf("response body is empty")
451+
}
452+
453+
// Parse the HTML document
454+
slog.Debug("Parsing HTML document", "body_size_bytes", bodySize)
455+
456+
// Simple grep-like approach: find "Total downloads" and get the next line
457+
htmlContent := string(body)
458+
lines := strings.Split(htmlContent, "\n")
459+
460+
var downloadLine string
461+
462+
for i, line := range lines {
463+
if strings.Contains(line, "Total downloads") {
464+
if i+1 < len(lines) {
465+
downloadLine = strings.TrimSpace(lines[i+1])
466+
slog.Debug("Found download line after 'Total downloads'", "line", downloadLine)
467+
468+
break
469+
}
470+
}
471+
}
472+
473+
if downloadLine == "" {
474+
slog.Error("Download statistics not found", "owner", owner, "package", packageName)
475+
476+
// Log a few lines around where "Total downloads" should be for debugging
477+
for i, line := range lines {
478+
if strings.Contains(line, "download") {
479+
slog.Debug("Found line with 'download'", "line_number", i, "content", strings.TrimSpace(line))
480+
481+
if i+1 < len(lines) {
482+
slog.Debug("Next line content", "line_number", i+1, "content", strings.TrimSpace(lines[i+1]))
483+
}
484+
}
485+
}
486+
487+
return 0, fmt.Errorf("download statistics not found in package page")
488+
}
489+
490+
slog.Debug("Found download line", "line", downloadLine)
491+
492+
// Extract the title attribute which contains the full number
493+
// Look for title="123456" in the line (e.g., from <h3 title="123456">123K</h3>)
494+
titleStart := strings.Index(downloadLine, `title="`)
495+
if titleStart == -1 {
496+
slog.Error("Download count title attribute not found", "owner", owner, "package", packageName, "line", downloadLine)
497+
return 0, fmt.Errorf("download count title attribute not found")
498+
}
499+
500+
titleStart += 7 // Skip 'title="'
501+
502+
titleEnd := strings.Index(downloadLine[titleStart:], `"`)
503+
if titleEnd == -1 {
504+
slog.Error("Download count title attribute malformed", "owner", owner, "package", packageName, "line", downloadLine)
505+
return 0, fmt.Errorf("download count title attribute malformed")
506+
}
507+
508+
title := downloadLine[titleStart : titleStart+titleEnd]
509+
slog.Debug("Extracted title attribute", "title", title)
510+
511+
// Parse the download count from the title attribute
512+
downloadCount, err := strconv.ParseInt(title, 10, 64)
513+
if err != nil {
514+
slog.Error("Failed to parse download count", "owner", owner, "package", packageName, "title", title, "error", err)
515+
return 0, fmt.Errorf("failed to parse download count %s: %w", title, err)
516+
}
517+
518+
slog.Info("Successfully extracted download statistics", "owner", owner, "package", packageName, "download_count", downloadCount, "raw_title", title)
519+
520+
return downloadCount, nil
521+
}

0 commit comments

Comments
 (0)