Skip to content

Commit 261cf60

Browse files
vdimirpaskal
authored andcommitted
Full text search
1 parent d1ea664 commit 261cf60

File tree

16 files changed

+1124
-2
lines changed

16 files changed

+1124
-2
lines changed

backend/app/cmd/server.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/go-pkgz/lcw/eventbus"
1919
log "github.com/go-pkgz/lgr"
2020
ntf "github.com/go-pkgz/notify"
21+
"github.com/go-pkgz/syncs"
2122
"github.com/golang-jwt/jwt"
2223
"github.com/kyokomi/emoji/v2"
2324
bolt "go.etcd.io/bbolt"
@@ -38,6 +39,7 @@ import (
3839
"github.com/umputun/remark42/backend/app/store/admin"
3940
"github.com/umputun/remark42/backend/app/store/engine"
4041
"github.com/umputun/remark42/backend/app/store/image"
42+
"github.com/umputun/remark42/backend/app/store/search"
4143
"github.com/umputun/remark42/backend/app/store/service"
4244
"github.com/umputun/remark42/backend/app/templates"
4345
)
@@ -57,6 +59,7 @@ type ServerCommand struct {
5759
Image ImageGroup `group:"image" namespace:"image" env-namespace:"IMAGE"`
5860
SSL SSLGroup `group:"ssl" namespace:"ssl" env-namespace:"SSL"`
5961
ImageProxy ImageProxyGroup `group:"image-proxy" namespace:"image-proxy" env-namespace:"IMAGE_PROXY"`
62+
Search SearchGroup `group:"search" namespace:"search" env-namespace:"SEARCH"`
6063

6164
Sites []string `long:"site" env:"SITE" default:"remark" description:"site names" env-delim:","`
6265
AnonymousVote bool `long:"anon-vote" env:"ANON_VOTE" description:"enable anonymous votes (works only with VOTES_IP enabled)"`
@@ -280,6 +283,13 @@ type AdminRPCGroup struct {
280283
SecretPerSite bool `long:"secret_per_site" env:"SECRET_PER_SITE" description:"enable JWT secret retrieval per aud, which is site_id in this case"`
281284
}
282285

286+
// SearchGroup defines options group for search engine
287+
type SearchGroup struct {
288+
Enable bool `long:"enable" env:"ENABLE" description:"enable search engine"`
289+
IndexPath string `long:"index_path" env:"INDEX_PATH" description:"search index location" default:"./var/search_index"`
290+
Analyzer string `long:"analyzer" env:"ANALYZER" description:"text analyzer type, set language-specific one to improve search quality" choice:"standard" choice:"ar" choice:"de" choice:"en" choice:"es" choice:"fi" choice:"fr" choice:"it" choice:"ru" default:"standard"` //nolint
291+
}
292+
283293
// LoadingCache defines interface for caching
284294
type LoadingCache interface {
285295
Get(key cache.Key, fn func() ([]byte, error)) (data []byte, err error) // load from cache if found or put to cache and return
@@ -574,6 +584,12 @@ func (s *ServerCommand) newServerApp(ctx context.Context) (*serverApp, error) {
574584
return nil, fmt.Errorf("failed to make config of ssl server params: %w", err)
575585
}
576586

587+
dataService.SearchService, err = s.makeSearchService()
588+
if err != nil {
589+
_ = dataService.Close()
590+
return nil, fmt.Errorf("failed to create search service: %w", err)
591+
}
592+
577593
srv := &api.Rest{
578594
Version: s.Revision,
579595
DataService: dataService,
@@ -656,6 +672,20 @@ func (a *serverApp) run(ctx context.Context) error {
656672
log.Printf("[WARN] failed to resubmit comments with staging images, %s", e)
657673
}
658674

675+
numWorkersForIndexing := 8
676+
grp := syncs.NewErrSizedGroup(numWorkersForIndexing)
677+
for _, siteID := range a.Sites {
678+
a.dataService.RunSiteIndexers(ctx, siteID, grp) // index comments for the first time run
679+
}
680+
681+
// don't lock here, wait in background to log error if any
682+
go func() {
683+
err := grp.Wait()
684+
if err != nil {
685+
log.Printf("[WARN] background task for indexing existing comments failed: %s", err)
686+
}
687+
}()
688+
659689
go a.imageService.Cleanup(ctx) // pictures cleanup for staging images
660690

661691
a.restSrv.Run(a.Address, a.Port)
@@ -1194,6 +1224,22 @@ func (s *ServerCommand) getAuthenticator(ds *service.DataStore, avas avatar.Stor
11941224
})
11951225
}
11961226

1227+
func (s *ServerCommand) makeSearchService() (*search.Service, error) {
1228+
if !s.Search.Enable {
1229+
return nil, nil
1230+
}
1231+
log.Printf("[INFO] creating search service")
1232+
1233+
if s.Search.IndexPath == "" {
1234+
return nil, fmt.Errorf("search index path is not set")
1235+
}
1236+
1237+
return search.NewService(s.Sites, search.ServiceParams{
1238+
IndexPath: s.Search.IndexPath,
1239+
Analyzer: s.Search.Analyzer,
1240+
})
1241+
}
1242+
11971243
func (s *ServerCommand) parseSameSite(ss string) http.SameSite {
11981244
switch strings.ToLower(ss) {
11991245
case "default":

backend/app/cmd/server_test.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -828,5 +828,11 @@ func createAppFromCmd(t *testing.T, cmd ServerCommand) (*serverApp, context.Cont
828828

829829
func TestMain(m *testing.M) {
830830
// ignore is added only for GitHub Actions, can't reproduce locally
831-
goleak.VerifyTestMain(m, goleak.IgnoreTopFunction("net/http.(*Server).Shutdown"))
831+
goleak.VerifyTestMain(m,
832+
goleak.IgnoreTopFunction("net/http.(*Server).Shutdown"),
833+
834+
// we should call bleve.Config.Shutdown() to close all the workers,
835+
// but we don't do it for each server instance because it's global per application
836+
goleak.IgnoreTopFunction("github.com/blevesearch/bleve_index_api.AnalysisWorker"),
837+
)
832838
}

backend/app/main_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,5 +157,10 @@ func TestMain(m *testing.M) {
157157
m,
158158
goleak.IgnoreTopFunction("github.com/umputun/remark42/backend/app.init.0.func1"),
159159
goleak.IgnoreTopFunction("net/http.(*Server).Shutdown"),
160+
161+
// we should call bleve.Config.Shutdown() to close all the workers,
162+
// but it's global per application and cannot be reinitialized multiple times
163+
// so we do not terminate them, ignore the leak
164+
goleak.IgnoreTopFunction("github.com/blevesearch/bleve_index_api.AnalysisWorker"),
160165
)
161166
}

backend/app/rest/api/rest.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,20 @@ type LoadingCache interface {
9191
const hardBodyLimit = 1024 * 64 // limit size of body
9292

9393
const lastCommentsScope = "last"
94+
const searchScope = "search"
9495

9596
type commentsWithInfo struct {
9697
Comments []store.Comment `json:"comments"`
9798
Info store.PostInfo `json:"info,omitempty"`
9899
}
99100

101+
type commentsWithTotal struct {
102+
// List of comments matching the query and selected by `limit` and `skip` parameters.
103+
Comments []store.Comment `json:"comments"`
104+
// Total number of comments in the index matching query.
105+
Total uint64 `json:"total"`
106+
}
107+
100108
// Run the lister and request's router, activate rest server
101109
func (s *Rest) Run(address string, port int) {
102110
if address == "*" {
@@ -263,6 +271,7 @@ func (s *Rest) routes() chi.Router {
263271
ropen.Get("/list", s.pubRest.listCtrl)
264272
ropen.Get("/info", s.pubRest.infoCtrl)
265273
ropen.Get("/img", s.ImageProxy.Handler)
274+
ropen.Get("/search", s.pubRest.searchQueryCtrl)
266275

267276
ropen.Route("/rss", func(rrss chi.Router) {
268277
rrss.Get("/post", s.rssRest.postCommentsCtrl)

backend/app/rest/api/rest_public.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type pubStore interface {
4242
Count(locator store.Locator) (int, error)
4343
List(siteID string, limit, skip int) ([]store.PostInfo, error)
4444
Info(locator store.Locator, readonlyAge int) (store.PostInfo, error)
45+
Search(siteID, query, sortBy string, limit, skip int) ([]store.Comment, uint64, error)
4546

4647
ValidateComment(c *store.Comment) error
4748
IsReadOnly(locator store.Locator) bool
@@ -386,6 +387,73 @@ func (s *public) telegramQrCtrl(w http.ResponseWriter, r *http.Request) {
386387
}
387388
}
388389

390+
// GET /search?site=siteID&query=query&limit=20&skip=10&sort=[-]field - search documents
391+
// site - site ID
392+
// query - user-provided search query
393+
// limit - number of documents to return, default is 20, maximum is 100
394+
// skip - number of documents to skip, default is 0 (do not skip anything), maximum is 1000
395+
// sort - sort by specified field, can be prefixed with "-" to reverse the order
396+
func (s *public) searchQueryCtrl(w http.ResponseWriter, r *http.Request) {
397+
getIntParamInRange := func(min, max, defaultVal int, name string) (int, error) {
398+
s := r.URL.Query().Get(name)
399+
if s == "" {
400+
return defaultVal, nil
401+
}
402+
403+
value, err := strconv.Atoi(s)
404+
if err != nil {
405+
return 0, fmt.Errorf("%s parameter should be an integer", name)
406+
}
407+
408+
if min <= value && value <= max {
409+
return value, nil
410+
}
411+
return 0, fmt.Errorf("%s parameter should be between %d and %d", name, min, max)
412+
}
413+
414+
var err error
415+
416+
var limit int
417+
if limit, err = getIntParamInRange(1, 100, 20, "limit"); err != nil {
418+
rest.SendErrorJSON(w, r, http.StatusBadRequest, fmt.Errorf("wrong param"), err.Error(), rest.ErrInternal)
419+
return
420+
}
421+
422+
var skip int
423+
if skip, err = getIntParamInRange(0, 1000, 0, "skip"); err != nil {
424+
rest.SendErrorJSON(w, r, http.StatusBadRequest, fmt.Errorf("wrong param"), err.Error(), rest.ErrInternal)
425+
return
426+
}
427+
428+
siteID := r.URL.Query().Get("site")
429+
query := r.URL.Query().Get("query")
430+
sortBy := r.URL.Query().Get("sort")
431+
432+
if siteID == "" || query == "" {
433+
rest.SendErrorJSON(w, r, http.StatusBadRequest, fmt.Errorf("missing param"), "site and query parameters are required", rest.ErrInternal)
434+
return
435+
}
436+
437+
key := cache.NewKey(siteID).ID(URLKey(r)).Scopes(siteID, searchScope)
438+
data, err := s.cache.Get(key, func() ([]byte, error) {
439+
comments, total, searchErr := s.dataService.Search(siteID, query, sortBy, limit, skip)
440+
if searchErr != nil {
441+
return nil, searchErr
442+
}
443+
444+
return encodeJSONWithHTML(commentsWithTotal{Comments: comments, Total: total})
445+
})
446+
447+
if err != nil {
448+
rest.SendErrorJSON(w, r, http.StatusBadRequest, err, "can't perform search request", rest.ErrInternal)
449+
return
450+
}
451+
452+
if err = R.RenderJSONFromBytes(w, r, data); err != nil {
453+
log.Printf("[WARN] can't render search results for site %s", siteID)
454+
}
455+
}
456+
389457
func (s *public) applyView(comments []store.Comment, view string) []store.Comment {
390458
if strings.EqualFold(view, "user") {
391459
projection := make([]store.Comment, 0, len(comments))

backend/app/rest/api/rest_test.go

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"net"
1111
"net/http"
1212
"net/http/httptest"
13+
urlpkg "net/url"
1314
"os"
1415
"strconv"
1516
"strings"
@@ -34,6 +35,7 @@ import (
3435
adminstore "github.com/umputun/remark42/backend/app/store/admin"
3536
"github.com/umputun/remark42/backend/app/store/engine"
3637
"github.com/umputun/remark42/backend/app/store/image"
38+
"github.com/umputun/remark42/backend/app/store/search"
3739
"github.com/umputun/remark42/backend/app/store/service"
3840
)
3941

@@ -212,6 +214,94 @@ func TestRest_rejectAnonUser(t *testing.T) {
212214
assert.Equal(t, http.StatusOK, resp.StatusCode, "real user")
213215
}
214216

217+
func TestRest_Search(t *testing.T) {
218+
var searchIndexPath string
219+
ts, _, teardown := startupT(t, func(srv *Rest) {
220+
var err error
221+
searchIndexPath, err = randomPath(srv.WebRoot, "test-search-remark", "")
222+
require.NoError(t, err)
223+
srv.DataService.SearchService, err = search.NewService([]string{"remark42"}, search.ServiceParams{
224+
IndexPath: searchIndexPath,
225+
Analyzer: "standard",
226+
})
227+
require.NoError(t, err)
228+
})
229+
defer teardown()
230+
defer func() { _ = os.RemoveAll(searchIndexPath) }()
231+
232+
t0 := time.Date(2018, 12, 20, 15, 18, 22, 0, time.Local)
233+
234+
id1 := addComment(t, store.Comment{Text: "test test", Timestamp: t0.Add(1 * time.Minute), Locator: store.Locator{SiteID: "remark42", URL: "https://radio-t.com/abc"}}, ts)
235+
id2 := addComment(t, store.Comment{Text: "test hello", Timestamp: t0.Add(2 * time.Minute), Locator: store.Locator{SiteID: "remark42", URL: "https://radio-t.com/blah"}}, ts)
236+
id3 := addComment(t, store.Comment{Text: "hello world", Timestamp: t0.Add(3 * time.Minute), Locator: store.Locator{SiteID: "remark42", URL: "https://radio-t.com/blah"}}, ts)
237+
id4 := addComment(t, store.Comment{Text: "# title\n\nok\n", Timestamp: t0.Add(4 * time.Minute), Locator: store.Locator{SiteID: "remark42", URL: "https://radio-t.com/blah"}}, ts)
238+
239+
idToPos := map[string]int{id1: 1, id2: 2, id3: 3, id4: 4}
240+
tbl := []struct {
241+
query string
242+
extraParams string
243+
ids []int
244+
status int
245+
}{
246+
{"blah", "", []int{}, http.StatusOK},
247+
{"h1", "", []int{}, http.StatusOK},
248+
{"world", "", []int{3}, http.StatusOK},
249+
{"title", "", []int{4}, http.StatusOK},
250+
{"\"hello world\"", "", []int{3}, http.StatusOK},
251+
{"test", "&sort=time", []int{1, 2}, http.StatusOK},
252+
{"hello", "&sort=-time", []int{3, 2}, http.StatusOK},
253+
{"hello world", "&sort=time", []int{2, 3}, http.StatusOK},
254+
{"hello world test", "&sort=time", []int{1, 2, 3}, http.StatusOK},
255+
{"hello world test", "&sort=time&skip=1", []int{2, 3}, http.StatusOK},
256+
{"hello world test", "&sort=time&skip=1&limit=1", []int{2}, http.StatusOK},
257+
{"", "", []int{}, http.StatusBadRequest},
258+
{"test", "&sort=text", []int{}, http.StatusBadRequest},
259+
{"test", "&skip=-1", []int{}, http.StatusBadRequest},
260+
{"test", "&limit=999999", []int{}, http.StatusBadRequest},
261+
}
262+
263+
cnt := 0
264+
for i, tt := range tbl {
265+
tt := tt
266+
267+
cnt++
268+
if (cnt % 10) == 0 {
269+
// wait for rate limiter
270+
time.Sleep(1 * time.Second)
271+
}
272+
t.Run(strconv.Itoa(i), func(t *testing.T) {
273+
resp, err := http.Get(fmt.Sprintf("%s/api/v1/search?site=remark42&query=%s%s",
274+
ts.URL, urlpkg.QueryEscape(tt.query), tt.extraParams))
275+
require.NoError(t, err)
276+
277+
defer resp.Body.Close()
278+
body, err := io.ReadAll(resp.Body)
279+
require.NoError(t, err)
280+
assert.Equal(t, tt.status, resp.StatusCode)
281+
282+
result := commentsWithTotal{}
283+
err = json.Unmarshal(body, &result)
284+
require.NoError(t, err)
285+
286+
foundIds := make([]int, len(result.Comments))
287+
for i, c := range result.Comments {
288+
foundIds[i] = idToPos[c.ID]
289+
}
290+
require.Equal(t, tt.ids, foundIds)
291+
})
292+
}
293+
}
294+
295+
func TestRest_SearchDisabledFeature(t *testing.T) {
296+
ts, _, teardown := startupT(t)
297+
defer teardown()
298+
299+
resp, err := http.Get(ts.URL + "/api/v1/search?site=remark42&query=test")
300+
require.NoError(t, err)
301+
require.NoError(t, resp.Body.Close())
302+
assert.Equal(t, http.StatusBadRequest, resp.StatusCode)
303+
}
304+
215305
func Test_URLKey(t *testing.T) {
216306
tbl := []struct {
217307
url string
@@ -633,5 +723,9 @@ func waitForHTTPSServerStart(port int) {
633723
}
634724

635725
func TestMain(m *testing.M) {
636-
goleak.VerifyTestMain(m)
726+
goleak.VerifyTestMain(m,
727+
// we should call bleve.Config.Shutdown() to close all the workers,
728+
// but we don't do it for each server instance because it's global per application
729+
goleak.IgnoreTopFunction("github.com/blevesearch/bleve_index_api.AnalysisWorker"),
730+
)
637731
}

0 commit comments

Comments
 (0)