@@ -21,15 +21,15 @@ import (
2121 "context"
2222 "fmt"
2323 "hash/fnv"
24+ "io"
2425 "os"
2526 "sort"
2627 "strings"
2728 "sync"
2829
29- "github.com/pkg/errors"
30-
3130 "github.com/RoaringBitmap/roaring"
3231 cmap "github.com/orcaman/concurrent-map/v2"
32+ "github.com/pkg/errors"
3333 "github.com/spf13/cast"
3434
3535 "github.com/CocaineCong/tangseng/app/index_platform/analyzer"
@@ -234,3 +234,112 @@ func iHash(key string) int64 { // nolint:golint,unused
234234 _ , _ = h .Write ([]byte (key ))
235235 return int64 (h .Sum32 () & 0x7fffffff )
236236}
237+
238+ func (s * IndexPlatformSrv ) UploadFile (stream pb.IndexPlatformService_UploadFileServer ) (err error ) {
239+ ctx := stream .Context ()
240+ // 时间估计
241+ invertedIndex := cmap .New [* roaring.Bitmap ]() // 倒排索引
242+ dictTrie := trie .NewTrie () // 前缀树
243+ // mapreduce 这个是用chan和goroutine来代替master和worker的rpc调用,避免了频繁的rpc调用
244+ _ , _ = mapreduce .MapReduce (func (source chan <- []byte ) {
245+ chunk , err := stream .Recv ()
246+ if err == io .EOF {
247+ _ = stream .SendAndClose (& pb.UploadResponse {
248+ Code : e .SUCCESS ,
249+ Message : cconsts .IndexPlatformUploadSuccess ,
250+ })
251+ }
252+ source <- chunk .Content
253+ }, func (item []byte , writer mapreduce.Writer [[]* types.KeyValue ], cancel func (error )) {
254+ // 控制并发
255+ var wg sync.WaitGroup
256+ ch := make (chan struct {}, 3 )
257+
258+ keyValueList := make ([]* types.KeyValue , 0 , 1e3 )
259+ lines := strings .Split (string (item ), "\r \n " )
260+ for _ , line := range lines [1 :] {
261+ ch <- struct {}{}
262+ wg .Add (1 )
263+ docStruct , _ := input_data .Doc2Struct (line ) // line 转 docs struct
264+ if docStruct .DocId == 0 {
265+ continue
266+ }
267+
268+ // 分词
269+ tokens , _ := analyzer .GseCutForBuildIndex (docStruct .DocId , docStruct .Body )
270+ for _ , v := range tokens {
271+ if v .Token == "" || v .Token == " " {
272+ continue
273+ }
274+ keyValueList = append (keyValueList , & types.KeyValue {Key : v .Token , Value : cast .ToString (v .DocId )})
275+ dictTrie .Insert (v .Token )
276+ }
277+
278+ // 建立正排索引
279+ go func (docStruct * types.Document ) {
280+ err = input_data .DocData2Kfk (docStruct )
281+ if err != nil {
282+ logs .LogrusObj .Error (err )
283+ }
284+ defer wg .Done ()
285+ <- ch
286+ }(docStruct )
287+ }
288+ wg .Wait ()
289+
290+ // // 构建前缀树 // TODO: kafka异步处理一下前缀树的插入,不然占着这里的资源
291+ // go func(tokenList []string) {
292+ // err = input_data.DocTrie2Kfk(tokenList)
293+ // if err != nil {
294+ // logs.LogrusObj.Error("DocTrie2Kfk", err)
295+ // }
296+ // }(tokenList)
297+
298+ // shuffle 排序过程
299+ sort .Sort (types .ByKey (keyValueList ))
300+ writer .Write (keyValueList )
301+ }, func (pipe <- chan []* types.KeyValue , writer mapreduce.Writer [string ], cancel func (error )) {
302+ for values := range pipe {
303+ for _ , v := range values { // 构建倒排索引
304+ if value , ok := invertedIndex .Get (v .Key ); ok {
305+ value .AddInt (cast .ToInt (v .Value ))
306+ invertedIndex .Set (v .Key , value )
307+ } else {
308+ docIds := roaring .NewBitmap ()
309+ docIds .AddInt (cast .ToInt (v .Value ))
310+ invertedIndex .Set (v .Key , docIds )
311+ }
312+ }
313+ }
314+ })
315+
316+ // 存储倒排索引
317+ go func () {
318+ newCtx := clone .NewContextWithoutDeadline ()
319+ newCtx .Clone (ctx )
320+ err = storeInvertedIndexByHash (newCtx , invertedIndex )
321+ if err != nil {
322+ logs .LogrusObj .Error ("storeInvertedIndexByHash error " , err )
323+ }
324+ }()
325+
326+ logs .LogrusObj .Infoln ("storeInvertedIndexByHash End" )
327+
328+ // 存储前缀树
329+ go func () {
330+ newCtx := clone .NewContextWithoutDeadline ()
331+ newCtx .Clone (ctx )
332+ err = storeDictTrieByHash (newCtx , dictTrie )
333+ if err != nil {
334+ logs .LogrusObj .Error ("storeDictTrieByHash error " , err )
335+ logs .LogrusObj .Errorf ("stack trace: \n %+v\n " , err )
336+ }
337+ }()
338+
339+ return nil
340+ }
341+
342+ func (s * IndexPlatformSrv ) DownloadFile (file * pb.FileRequest , req pb.IndexPlatformService_DownloadFileServer ) (err error ) {
343+
344+ return nil
345+ }
0 commit comments