Skip to content

Commit a640403

Browse files
authored
Improve scraper reliability (#36)
1 parent 67610e9 commit a640403

30 files changed

+686
-581
lines changed

cache/cache.go

+64-20
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,104 @@
11
package cache
22

33
import (
4+
"net/http"
45
"strings"
6+
"sync"
57
"time"
68

79
goCache "github.com/patrickmn/go-cache"
10+
messagebus "github.com/vardius/message-bus"
811
"golang.org/x/exp/maps"
912

1013
"bdo-rest-api/config"
14+
"bdo-rest-api/models"
1115
"bdo-rest-api/utils"
1216
)
1317

14-
type cacheEntry[T any] struct {
15-
data T
16-
date time.Time
17-
status int
18+
type CacheEntry[T any] struct {
19+
Data T
20+
Date time.Time
21+
Status int
1822
}
1923

2024
type cache[T any] struct {
25+
Bus messagebus.MessageBus
2126
internalCache *goCache.Cache
2227
}
2328

2429
func joinKeys(keys []string) string {
2530
return strings.Join(keys, ",")
2631
}
2732

28-
func NewCache[T any]() *cache[T] {
33+
func newCache[T any]() *cache[T] {
2934
cacheTTL := config.GetCacheTTL()
3035

3136
return &cache[T]{
37+
Bus: messagebus.New(100), // Idk what buffer size is optimal
3238
internalCache: goCache.New(cacheTTL, min(time.Hour, cacheTTL)),
3339
}
3440
}
3541

36-
func (c *cache[T]) AddRecord(keys []string, data T, status int) (date string, expires string) {
37-
cacheTTL := config.GetCacheTTL()
38-
entry := cacheEntry[T]{
39-
data: data,
40-
date: time.Now(),
41-
status: status,
42+
func (c *cache[T]) AddRecord(keys []string, data T, status int, taskId string) (date string, expires string) {
43+
ttl := config.GetCacheTTL()
44+
entry := CacheEntry[T]{
45+
Data: data,
46+
Date: time.Now(),
47+
Status: status,
4248
}
4349

44-
c.internalCache.Add(joinKeys(keys), entry, cacheTTL)
45-
expirationDate := entry.date.Add(cacheTTL)
50+
c.internalCache.Add(joinKeys(keys), entry, ttl)
51+
c.Bus.Publish(taskId, entry)
4652

47-
return utils.FormatDateForHeaders(entry.date), utils.FormatDateForHeaders(expirationDate)
53+
return utils.FormatDateForHeaders(entry.Date), utils.FormatDateForHeaders(entry.Date.Add(ttl))
4854
}
4955

50-
func (c *cache[T]) GetRecord(keys []string) (data T, status int, date string, expires string, found bool) {
51-
var anyEntry interface{}
52-
var expirationDate time.Time
56+
func (c *cache[T]) SignalBypassCache(status int, taskId string) {
57+
var data T
58+
entry := CacheEntry[T]{
59+
Data: data,
60+
Date: time.Now(),
61+
Status: status,
62+
}
5363

54-
anyEntry, expirationDate, found = c.internalCache.GetWithExpiration(joinKeys(keys))
64+
c.Bus.Publish(taskId, entry)
65+
}
66+
67+
func (c *cache[T]) GetRecord(keys []string) (data T, status int, date string, expires string, found bool) {
68+
cacheTTL := config.GetCacheTTL()
69+
anyEntry, found := c.internalCache.Get(joinKeys(keys))
5570

5671
if !found {
5772
return
5873
}
5974

60-
entry := anyEntry.(cacheEntry[T])
75+
entry := anyEntry.(CacheEntry[T])
6176

62-
return entry.data, entry.status, utils.FormatDateForHeaders(entry.date), utils.FormatDateForHeaders(expirationDate), found
77+
return entry.Data, entry.Status, utils.FormatDateForHeaders(entry.Date), utils.FormatDateForHeaders(entry.Date.Add(cacheTTL)), found
78+
}
79+
80+
func (c *cache[T]) WaitForRecord(taskId string) (data T, status int, date string, expires string) {
81+
var wg sync.WaitGroup
82+
wg.Add(1)
83+
84+
c.Bus.Subscribe(taskId, func(entry CacheEntry[T]) {
85+
data = entry.Data
86+
status = entry.Status
87+
date = utils.FormatDateForHeaders(entry.Date)
88+
89+
if entry.Status == http.StatusInternalServerError {
90+
expires = date
91+
} else if entry.Status == http.StatusServiceUnavailable {
92+
expires = utils.FormatDateForHeaders(entry.Date.Add(config.GetMaintenanceStatusTTL()))
93+
} else {
94+
expires = utils.FormatDateForHeaders(entry.Date.Add(config.GetCacheTTL()))
95+
}
96+
97+
wg.Done()
98+
})
99+
100+
wg.Wait()
101+
return
63102
}
64103

65104
func (c *cache[T]) GetItemCount() int {
@@ -69,3 +108,8 @@ func (c *cache[T]) GetItemCount() int {
69108
func (c *cache[T]) GetKeys() []string {
70109
return maps.Keys(c.internalCache.Items())
71110
}
111+
112+
var GuildProfiles = newCache[models.GuildProfile]()
113+
var GuildSearch = newCache[[]models.GuildProfile]()
114+
var Profiles = newCache[models.Profile]()
115+
var ProfileSearch = newCache[[]models.Profile]()

cache/cache_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,15 @@ func init() {
1313

1414
func TestCache(t *testing.T) {
1515
// Create a cache instance for testing
16-
testCache := NewCache[string]()
16+
testCache := newCache[string]()
1717

1818
// Test AddRecord and GetRecord
1919
keys := []string{"key1", "key2"}
2020
data := "test data"
2121
status := 200
22+
taskId := "task-id"
2223

23-
date, expires := testCache.AddRecord(keys, data, status)
24+
date, expires := testCache.AddRecord(keys, data, status, taskId)
2425

2526
// Validate AddRecord results
2627
if date == "" || expires == "" {

config/config.go

-11
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@ import (
44
"fmt"
55
"sync"
66
"time"
7-
8-
"github.com/gocolly/colly/v2"
9-
"github.com/gocolly/colly/v2/proxy"
107
)
118

129
type config struct {
@@ -16,7 +13,6 @@ type config struct {
1613
mu sync.RWMutex
1714
port int
1815
proxyList []string
19-
proxySwitcher colly.ProxyFunc
2016
rateLimit int64
2117
verbosity bool
2218
}
@@ -78,7 +74,6 @@ func SetProxyList(proxies []string) {
7874
getInstance().mu.Lock()
7975
defer getInstance().mu.Unlock()
8076
getInstance().proxyList = proxies
81-
getInstance().proxySwitcher, _ = proxy.RoundRobinProxySwitcher(proxies...)
8277
}
8378

8479
func GetProxyList() []string {
@@ -87,12 +82,6 @@ func GetProxyList() []string {
8782
return getInstance().proxyList
8883
}
8984

90-
func GetProxySwitcher() colly.ProxyFunc {
91-
getInstance().mu.RLock()
92-
defer getInstance().mu.RUnlock()
93-
return getInstance().proxySwitcher
94-
}
95-
9685
func SetVerbosity(verbosity bool) {
9786
getInstance().mu.Lock()
9887
defer getInstance().mu.Unlock()

docs/openapi.json

+33-35
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,19 @@
3131
},
3232
"responses": {
3333
"400": {
34-
"description": "Bad Request: a required parameter is either missing or in a wrong format."
34+
"description": "Bad Request. A required query parameter is either missing or in a wrong format."
3535
},
3636
"404": {
37-
"description": "Not Found: try requesting something else."
37+
"description": "Not Found. Request something else, or contact the instance owner if you're sure that it's a mistake."
38+
},
39+
"429": {
40+
"description": "Too Many Requests. Retry the same request after waiting some time."
41+
},
42+
"500": {
43+
"description": "Internal Server Error. Retry the same request after waiting some time; contact the instance owner if the problem persists."
3844
},
3945
"503": {
40-
"description": "Service Unavailable: [https://naeu.playblackdesert.com](naeu.playblackdesert.com) is currently under maintenance and requested data is not cached."
46+
"description": "Service Unavailable. The BDO website is currently under maintenance and requested data is temporarily not available."
4147
}
4248
}
4349
},
@@ -310,6 +316,12 @@
310316
"404": {
311317
"$ref": "#/components/responses/404"
312318
},
319+
"429": {
320+
"$ref": "#/components/responses/429"
321+
},
322+
"500": {
323+
"$ref": "#/components/responses/500"
324+
},
313325
"503": {
314326
"$ref": "#/components/responses/503"
315327
}
@@ -346,16 +358,6 @@
346358
],
347359
"example": "familyName"
348360
}
349-
},
350-
{
351-
"name": "page",
352-
"in": "query",
353-
"description": "This parameter is understood by the API, but you should either omit it or set to 1. Because of how search currently works, there is never more than one page.",
354-
"deprecated": true,
355-
"schema": {
356-
"type": "number",
357-
"default": 1
358-
}
359361
}
360362
],
361363
"responses": {
@@ -439,6 +441,12 @@
439441
"404": {
440442
"$ref": "#/components/responses/404"
441443
},
444+
"429": {
445+
"$ref": "#/components/responses/429"
446+
},
447+
"500": {
448+
"$ref": "#/components/responses/500"
449+
},
442450
"503": {
443451
"$ref": "#/components/responses/503"
444452
}
@@ -537,6 +545,12 @@
537545
"404": {
538546
"$ref": "#/components/responses/404"
539547
},
548+
"429": {
549+
"$ref": "#/components/responses/429"
550+
},
551+
"500": {
552+
"$ref": "#/components/responses/500"
553+
},
540554
"503": {
541555
"$ref": "#/components/responses/503"
542556
}
@@ -560,16 +574,6 @@
560574
},
561575
{
562576
"$ref": "#/components/parameters/region"
563-
},
564-
{
565-
"name": "page",
566-
"in": "query",
567-
"description": "This parameter is understood by the API, but you should either omit it or set to 1. Because of how search currently works, there is never more than one page.",
568-
"deprecated": true,
569-
"schema": {
570-
"type": "number",
571-
"default": 1
572-
}
573577
}
574578
],
575579
"responses": {
@@ -635,6 +639,12 @@
635639
"404": {
636640
"$ref": "#/components/responses/404"
637641
},
642+
"429": {
643+
"$ref": "#/components/responses/429"
644+
},
645+
"500": {
646+
"$ref": "#/components/responses/500"
647+
},
638648
"503": {
639649
"$ref": "#/components/responses/503"
640650
}
@@ -680,12 +690,6 @@
680690
"items": {
681691
"type": "object",
682692
"properties": {
683-
"page": {
684-
"deprecated": true,
685-
"nullable": false,
686-
"type": "number",
687-
"example": 1
688-
},
689693
"query": {
690694
"nullable": false,
691695
"type": "string",
@@ -738,12 +742,6 @@
738742
"items": {
739743
"type": "object",
740744
"properties": {
741-
"page": {
742-
"deprecated": true,
743-
"nullable": false,
744-
"type": "number",
745-
"example": 1
746-
},
747745
"query": {
748746
"nullable": false,
749747
"type": "string",

go.mod

+21-17
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,43 @@
11
module bdo-rest-api
22

3-
go 1.22
3+
go 1.24.0
44

55
require (
6-
github.com/gocolly/colly/v2 v2.1.0
6+
github.com/gocolly/colly/v2 v2.1.1-0.20240605174350-99b7fb1b87d1
7+
github.com/google/uuid v1.6.0
78
github.com/patrickmn/go-cache v2.1.0+incompatible
89
github.com/sa-/slicefunk v0.1.4
910
github.com/ulule/limiter/v3 v3.11.2
10-
go.mongodb.org/mongo-driver/v2 v2.0.0
11-
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
11+
go.mongodb.org/mongo-driver/v2 v2.1.0
12+
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
1213
)
1314

1415
require (
15-
github.com/PuerkitoBio/goquery v1.8.1 // indirect
16-
github.com/andybalholm/cascadia v1.3.2 // indirect
17-
github.com/antchfx/htmlquery v1.3.0 // indirect
18-
github.com/antchfx/xmlquery v1.3.18 // indirect
19-
github.com/antchfx/xpath v1.2.5 // indirect
16+
github.com/PuerkitoBio/goquery v1.10.2 // indirect
17+
github.com/andybalholm/cascadia v1.3.3 // indirect
18+
github.com/antchfx/htmlquery v1.3.4 // indirect
19+
github.com/antchfx/xmlquery v1.4.4 // indirect
20+
github.com/antchfx/xpath v1.3.3 // indirect
21+
github.com/bits-and-blooms/bitset v1.21.0 // indirect
2022
github.com/gobwas/glob v0.2.3 // indirect
21-
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
22-
github.com/golang/protobuf v1.5.3 // indirect
23+
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
24+
github.com/golang/protobuf v1.5.4 // indirect
2325
github.com/golang/snappy v0.0.4 // indirect
2426
github.com/kennygrant/sanitize v1.2.4 // indirect
25-
github.com/klauspost/compress v1.16.7 // indirect
27+
github.com/klauspost/compress v1.18.0 // indirect
28+
github.com/nlnwa/whatwg-url v0.5.1 // indirect
2629
github.com/pkg/errors v0.9.1 // indirect
2730
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
2831
github.com/temoto/robotstxt v1.1.2 // indirect
32+
github.com/vardius/message-bus v1.1.5
2933
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
3034
github.com/xdg-go/scram v1.1.2 // indirect
3135
github.com/xdg-go/stringprep v1.0.4 // indirect
3236
github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect
33-
golang.org/x/crypto v0.29.0 // indirect
34-
golang.org/x/net v0.21.0 // indirect
35-
golang.org/x/sync v0.9.0 // indirect
36-
golang.org/x/text v0.20.0 // indirect
37+
golang.org/x/crypto v0.35.0 // indirect
38+
golang.org/x/net v0.35.0 // indirect
39+
golang.org/x/sync v0.11.0 // indirect
40+
golang.org/x/text v0.22.0 // indirect
3741
google.golang.org/appengine v1.6.8 // indirect
38-
google.golang.org/protobuf v1.31.0 // indirect
42+
google.golang.org/protobuf v1.36.5 // indirect
3943
)

0 commit comments

Comments
 (0)