11package collectors
22
33import (
4+ "compress/gzip"
45 "context"
56 "encoding/json"
67 "fmt"
8+ "io"
79 "log/slog"
810 "net/http"
911 "strconv"
@@ -171,7 +173,7 @@ func (gc *GHCRCollector) collectPackageMetrics(ctx context.Context, repo string,
171173 }
172174
173175 // Update metrics
174- gc .updatePackageMetrics (pkg , packageInfo , versions )
176+ gc .updatePackageMetrics (ctx , pkg , packageInfo , versions )
175177
176178 return nil
177179}
@@ -288,7 +290,7 @@ func (gc *GHCRCollector) getPackageVersions(ctx context.Context, owner, repo, pa
288290 return versions , nil
289291}
290292
291- func (gc * GHCRCollector ) updatePackageMetrics (pkg config.PackageGroup , packageInfo * GHCRPackageResponse , versions []GHCRVersionResponse ) {
293+ func (gc * GHCRCollector ) updatePackageMetrics (ctx context. Context , pkg config.PackageGroup , packageInfo * GHCRPackageResponse , versions []GHCRVersionResponse ) {
292294 // Update package-level metrics with real data
293295 // Note: GitHub API doesn't provide download statistics for packages
294296 // We'll use version count as a proxy metric and track last published time
@@ -308,13 +310,24 @@ func (gc *GHCRCollector) updatePackageMetrics(pkg config.PackageGroup, packageIn
308310 // Use version count as a proxy for activity (more versions = more activity)
309311 gc .metrics .PackageDownloadsGauge .WithLabelValues (pkg .Owner , pkg .Repo ).Set (float64 (packageInfo .VersionCount ))
310312
313+ // Try to get actual download statistics from the package page
314+ downloadCount , err := gc .getPackageDownloadStats (ctx , pkg .Owner , pkg .Repo )
315+ if err != nil {
316+ slog .Warn ("Failed to get download statistics" , "package" , pkg .Repo , "error" , err )
317+ // Set to -1 to indicate no data available
318+ gc .metrics .PackageDownloadStatsGauge .WithLabelValues (pkg .Owner , pkg .Repo ).Set (- 1 )
319+ } else {
320+ gc .metrics .PackageDownloadStatsGauge .WithLabelValues (pkg .Owner , pkg .Repo ).Set (float64 (downloadCount ))
321+ }
322+
311323 if ! lastPublished .IsZero () {
312324 gc .metrics .PackageLastPublishedGauge .WithLabelValues (pkg .Owner , pkg .Repo ).Set (float64 (lastPublished .Unix ()))
313325 }
314326
315327 slog .Info ("Updated package metrics" ,
316328 "package" , pkg .Repo ,
317329 "version_count" , packageInfo .VersionCount ,
330+ "download_count" , downloadCount ,
318331 "last_published" , lastPublished .Format (time .RFC3339 ))
319332}
320333
@@ -338,3 +351,171 @@ func (gc *GHCRCollector) retryWithBackoff(operation func() error, maxRetries int
338351
339352 return fmt .Errorf ("operation failed after %d retries: %w" , maxRetries , lastErr )
340353}
354+
355+ // getPackageDownloadStats scrapes the package page to get actual download statistics
356+ func (gc * GHCRCollector ) getPackageDownloadStats (ctx context.Context , owner , packageName string ) (int64 , error ) {
357+ slog .Info ("Starting download statistics collection" , "owner" , owner , "package" , packageName )
358+
359+ // Construct the package page URL
360+ packageURL := fmt .Sprintf ("https://github.com/%s/%s/pkgs/container/%s" , owner , packageName , packageName )
361+ slog .Debug ("Constructed package URL" , "url" , packageURL )
362+
363+ // Create request to the package page
364+ req , err := http .NewRequestWithContext (ctx , http .MethodGet , packageURL , nil )
365+ if err != nil {
366+ slog .Error ("Failed to create HTTP request" , "owner" , owner , "package" , packageName , "error" , err )
367+ return 0 , fmt .Errorf ("failed to create request: %w" , err )
368+ }
369+
370+ slog .Debug ("Created HTTP request successfully" )
371+
372+ // Set headers to mimic a browser request
373+ req .Header .Set ("User-Agent" , "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" )
374+ req .Header .Set ("Accept" , "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" )
375+ req .Header .Set ("Accept-Language" , "en-US,en;q=0.9" )
376+ req .Header .Set ("Accept-Encoding" , "gzip, deflate, br" )
377+ req .Header .Set ("DNT" , "1" )
378+ req .Header .Set ("Connection" , "keep-alive" )
379+ req .Header .Set ("Upgrade-Insecure-Requests" , "1" )
380+ req .Header .Set ("Sec-Fetch-Dest" , "document" )
381+ req .Header .Set ("Sec-Fetch-Mode" , "navigate" )
382+ req .Header .Set ("Sec-Fetch-Site" , "none" )
383+ req .Header .Set ("Sec-Fetch-User" , "?1" )
384+ req .Header .Set ("Cache-Control" , "max-age=0" )
385+ slog .Debug ("Set browser-like headers" , "user_agent" , req .Header .Get ("User-Agent" ))
386+
387+ // Make the request
388+ slog .Debug ("Making HTTP request to package page" )
389+
390+ resp , err := gc .client .Do (req )
391+ if err != nil {
392+ slog .Error ("Failed to fetch package page" , "owner" , owner , "package" , packageName , "url" , packageURL , "error" , err )
393+ return 0 , fmt .Errorf ("failed to fetch package page: %w" , err )
394+ }
395+
396+ defer func () {
397+ if closeErr := resp .Body .Close (); closeErr != nil {
398+ slog .Warn ("Failed to close response body" , "error" , closeErr )
399+ }
400+ }()
401+
402+ slog .Debug ("Received HTTP response" , "status_code" , resp .StatusCode , "content_length" , resp .ContentLength , "content_type" , resp .Header .Get ("Content-Type" ))
403+
404+ if resp .StatusCode != http .StatusOK {
405+ slog .Error ("Package page returned non-OK status" , "owner" , owner , "package" , packageName , "status_code" , resp .StatusCode , "url" , packageURL )
406+ return 0 , fmt .Errorf ("package page returned status %d" , resp .StatusCode )
407+ }
408+
409+ // Read the response body
410+ slog .Debug ("Reading response body" )
411+
412+ body , err := io .ReadAll (resp .Body )
413+ if err != nil {
414+ slog .Error ("Failed to read response body" , "owner" , owner , "package" , packageName , "error" , err )
415+ return 0 , fmt .Errorf ("failed to read response body: %w" , err )
416+ }
417+
418+ // Handle gzip decompression if needed
419+ if resp .Header .Get ("Content-Encoding" ) == "gzip" {
420+ slog .Debug ("Decompressing gzipped response" )
421+
422+ gzReader , err := gzip .NewReader (strings .NewReader (string (body )))
423+ if err != nil {
424+ slog .Error ("Failed to create gzip reader" , "owner" , owner , "package" , packageName , "error" , err )
425+ return 0 , fmt .Errorf ("failed to create gzip reader: %w" , err )
426+ }
427+
428+ defer func () {
429+ if closeErr := gzReader .Close (); closeErr != nil {
430+ slog .Warn ("Failed to close gzip reader" , "error" , closeErr )
431+ }
432+ }()
433+
434+ // Read the decompressed content
435+ decompressedBody , err := io .ReadAll (gzReader )
436+ if err != nil {
437+ slog .Error ("Failed to read decompressed body" , "owner" , owner , "package" , packageName , "error" , err )
438+ return 0 , fmt .Errorf ("failed to read decompressed body: %w" , err )
439+ }
440+
441+ body = decompressedBody
442+ slog .Debug ("Gzip decompression successful" , "original_size" , len (body ), "decompressed_size" , len (decompressedBody ))
443+ }
444+
445+ bodySize := len (body )
446+ slog .Debug ("Response body read successfully" , "body_size_bytes" , bodySize )
447+
448+ if bodySize == 0 {
449+ slog .Error ("Response body is empty" , "owner" , owner , "package" , packageName , "url" , packageURL )
450+ return 0 , fmt .Errorf ("response body is empty" )
451+ }
452+
453+ // Parse the HTML document
454+ slog .Debug ("Parsing HTML document" , "body_size_bytes" , bodySize )
455+
456+ // Simple grep-like approach: find "Total downloads" and get the next line
457+ htmlContent := string (body )
458+ lines := strings .Split (htmlContent , "\n " )
459+
460+ var downloadLine string
461+
462+ for i , line := range lines {
463+ if strings .Contains (line , "Total downloads" ) {
464+ if i + 1 < len (lines ) {
465+ downloadLine = strings .TrimSpace (lines [i + 1 ])
466+ slog .Debug ("Found download line after 'Total downloads'" , "line" , downloadLine )
467+
468+ break
469+ }
470+ }
471+ }
472+
473+ if downloadLine == "" {
474+ slog .Error ("Download statistics not found" , "owner" , owner , "package" , packageName )
475+
476+ // Log a few lines around where "Total downloads" should be for debugging
477+ for i , line := range lines {
478+ if strings .Contains (line , "download" ) {
479+ slog .Debug ("Found line with 'download'" , "line_number" , i , "content" , strings .TrimSpace (line ))
480+
481+ if i + 1 < len (lines ) {
482+ slog .Debug ("Next line content" , "line_number" , i + 1 , "content" , strings .TrimSpace (lines [i + 1 ]))
483+ }
484+ }
485+ }
486+
487+ return 0 , fmt .Errorf ("download statistics not found in package page" )
488+ }
489+
490+ slog .Debug ("Found download line" , "line" , downloadLine )
491+
492+ // Extract the title attribute which contains the full number
493+ // Look for title="123456" in the line (e.g., from <h3 title="123456">123K</h3>)
494+ titleStart := strings .Index (downloadLine , `title="` )
495+ if titleStart == - 1 {
496+ slog .Error ("Download count title attribute not found" , "owner" , owner , "package" , packageName , "line" , downloadLine )
497+ return 0 , fmt .Errorf ("download count title attribute not found" )
498+ }
499+
500+ titleStart += 7 // Skip 'title="'
501+
502+ titleEnd := strings .Index (downloadLine [titleStart :], `"` )
503+ if titleEnd == - 1 {
504+ slog .Error ("Download count title attribute malformed" , "owner" , owner , "package" , packageName , "line" , downloadLine )
505+ return 0 , fmt .Errorf ("download count title attribute malformed" )
506+ }
507+
508+ title := downloadLine [titleStart : titleStart + titleEnd ]
509+ slog .Debug ("Extracted title attribute" , "title" , title )
510+
511+ // Parse the download count from the title attribute
512+ downloadCount , err := strconv .ParseInt (title , 10 , 64 )
513+ if err != nil {
514+ slog .Error ("Failed to parse download count" , "owner" , owner , "package" , packageName , "title" , title , "error" , err )
515+ return 0 , fmt .Errorf ("failed to parse download count %s: %w" , title , err )
516+ }
517+
518+ slog .Info ("Successfully extracted download statistics" , "owner" , owner , "package" , packageName , "download_count" , downloadCount , "raw_title" , title )
519+
520+ return downloadCount , nil
521+ }
0 commit comments