Skip to content

Commit 1eed11f

Browse files
committed
fix: ignore non-utf8 words when saving to db
1 parent 53d8b13 commit 1eed11f

File tree

7 files changed

+404
-3
lines changed

7 files changed

+404
-3
lines changed

Makefile

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,42 @@
1-
.PHONY: build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama
1+
.PHONY: help build run test clean docker-build docker-run migrate-up migrate-down docker-restart docker-stop start-all stop-all start-ollama stop-ollama build-images build-images-app build-images-docreader build-images-frontend clean-images
2+
3+
# Show help
4+
help:
5+
@echo "WeKnora Makefile 帮助"
6+
@echo ""
7+
@echo "基础命令:"
8+
@echo " build 构建应用"
9+
@echo " run 运行应用"
10+
@echo " test 运行测试"
11+
@echo " clean 清理构建文件"
12+
@echo ""
13+
@echo "Docker 命令:"
14+
@echo " docker-build 构建 Docker 镜像"
15+
@echo " docker-run 运行 Docker 容器"
16+
@echo " docker-stop 停止 Docker 容器"
17+
@echo " docker-restart 重启 Docker 容器"
18+
@echo ""
19+
@echo "服务管理:"
20+
@echo " start-all 启动所有服务"
21+
@echo " stop-all 停止所有服务"
22+
@echo " start-ollama 仅启动 Ollama 服务"
23+
@echo ""
24+
@echo "镜像构建:"
25+
@echo " build-images 从源码构建所有镜像"
26+
@echo " build-images-app 从源码构建应用镜像"
27+
@echo " build-images-docreader 从源码构建文档读取器镜像"
28+
@echo " build-images-frontend 从源码构建前端镜像"
29+
@echo " clean-images 清理本地镜像"
30+
@echo ""
31+
@echo "数据库:"
32+
@echo " migrate-up 执行数据库迁移"
33+
@echo " migrate-down 回滚数据库迁移"
34+
@echo ""
35+
@echo "开发工具:"
36+
@echo " fmt 格式化代码"
37+
@echo " lint 代码检查"
38+
@echo " deps 安装依赖"
39+
@echo " docs 生成 API 文档"
240

341
# Go related variables
442
BINARY_NAME=WeKnora
@@ -53,6 +91,22 @@ stop-all:
5391
docker-stop:
5492
docker-compose down
5593

94+
# 从源码构建镜像相关命令
95+
build-images:
96+
./scripts/build_images.sh
97+
98+
build-images-app:
99+
./scripts/build_images.sh --app
100+
101+
build-images-docreader:
102+
./scripts/build_images.sh --docreader
103+
104+
build-images-frontend:
105+
./scripts/build_images.sh --frontend
106+
107+
clean-images:
108+
./scripts/build_images.sh --clean
109+
56110
# Restart Docker container (stop, rebuild, start)
57111
docker-restart:
58112
docker-compose stop -t 60

frontend/.dockerignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
node_modules
2+
dist
3+
.git
4+
.gitignore
5+
README.md
6+
.vscode
7+
*.log
8+
.DS_Store

internal/application/repository/chunk.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66

7+
"github.com/Tencent/WeKnora/internal/common"
78
"github.com/Tencent/WeKnora/internal/types"
89
"github.com/Tencent/WeKnora/internal/types/interfaces"
910
"gorm.io/gorm"
@@ -21,6 +22,9 @@ func NewChunkRepository(db *gorm.DB) interfaces.ChunkRepository {
2122

2223
// CreateChunks sanitizes the Content of every chunk with
// common.CleanInvalidUTF8 and then inserts the chunks in batches of 100.
// Because chunks holds pointers, the sanitized Content is written back to the
// caller's chunk values. It returns the error, if any, from the batched insert.
2324
func (r *chunkRepository) CreateChunks(ctx context.Context, chunks []*types.Chunk) error {
25+
for _, chunk := range chunks {
26+
chunk.Content = common.CleanInvalidUTF8(chunk.Content)
27+
}
2428
return r.db.WithContext(ctx).CreateInBatches(chunks, 100).Error
2529
}
2630

internal/application/repository/retriever/postgres/structs.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strconv"
77
"time"
88

9+
"github.com/Tencent/WeKnora/internal/common"
910
"github.com/Tencent/WeKnora/internal/types"
1011
"github.com/pgvector/pgvector-go"
1112
)
@@ -59,7 +60,7 @@ func toDBVectorEmbedding(indexInfo *types.IndexInfo, additionalParams map[string
5960
ChunkID: indexInfo.ChunkID,
6061
KnowledgeID: indexInfo.KnowledgeID,
6162
KnowledgeBaseID: indexInfo.KnowledgeBaseID,
62-
Content: indexInfo.Content,
63+
Content: common.CleanInvalidUTF8(indexInfo.Content),
6364
}
6465
// Add embedding data if available in additionalParams
6566
if additionalParams != nil && slices.Contains(slices.Collect(maps.Keys(additionalParams)), "embedding") {

internal/common/tools.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"regexp"
77
"slices"
88
"strings"
9+
"unicode/utf8"
910
)
1011

1112
// ToInterfaceSlice converts a slice of strings to a slice of empty interfaces.
@@ -73,3 +74,27 @@ func ParseLLMJsonResponse(content string, target interface{}) error {
7374
// If no code block found, return the original error
7475
return err
7576
}
77+
78+
// CleanInvalidUTF8 returns s with every byte that is not part of a valid
// UTF-8 sequence removed, and with every NUL character (\x00) removed.
//
// A correctly encoded U+FFFD replacement character is valid UTF-8 and is kept;
// only bytes that fail to decode are dropped.
func CleanInvalidUTF8(s string) string {
	// Fast path: most input is already clean, so return it unchanged without
	// allocating a copy.
	if utf8.ValidString(s) && strings.IndexByte(s, 0) < 0 {
		return s
	}

	// Drop undecodable bytes first. What remains is valid UTF-8, in which a
	// 0x00 byte can only be the NUL character itself, so a plain byte-level
	// replace is safe.
	valid := strings.ToValidUTF8(s, "")
	return strings.ReplaceAll(valid, "\x00", "")
}

0 commit comments

Comments
 (0)