Skip to content

Commit 7312d09

Browse files
author
anyihao
committed
2.3.0
1 parent ef2cc6d commit 7312d09

37 files changed

+2288
-529
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
2424
<dependency>
2525
<groupId>com.tencent.tcvectordb</groupId>
2626
<artifactId>vectordatabase-sdk-java</artifactId>
27-
<version>2.2.2</version>
27+
<version>2.3.0</version>
2828
</dependency>
2929
```
3030

3131
- Gradle/Grails
3232

3333
```gradle
34-
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.2.2'
34+
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.3.0'
3535
```
3636

3737
### Examples

tcvdb_text/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
2020
<dependency>
2121
<groupId>com.tencent.tcvectordb</groupId>
2222
<artifactId>tcvdb-text</artifactId>
23-
<version>1.0.1</version>
23+
<version>1.0.2</version>
2424
</dependency>
2525
```
2626

2727
- Gradle/Grails
2828

2929
```gradle
30-
compile 'com.tencent.tcvectordb:tcvdb-text:1.0.1'
30+
compile 'com.tencent.tcvectordb:tcvdb-text:1.0.2'
3131
```
3232

3333
### Examples

tcvdb_text/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>com.tencent.tcvectordb</groupId>
88
<artifactId>tcvdb-text</artifactId>
9-
<version>1.0.1</version>
9+
<version>1.0.2</version>
1010
<packaging>jar</packaging>
1111
<name>tcvdb-text</name>
1212
<dependencies>

tcvdb_text/src/main/java/com/tencent/tcvdbtext/encoder/SparseVectorBm25Encoder.java

+17
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,23 @@ public void setParams(String paramsFile) {
346346
public void setDict(String dictFile) {
347347
this.tokenizer.loadDict(dictFile);
348348
}
349+
350+
/**
351+
* set tokenizer cutAll mode
352+
* @param cutAll
353+
*/
354+
public void setCutAll(Boolean cutAll) {
355+
this.tokenizer.setCutAll(cutAll);
356+
}
357+
358+
/**
359+
* set stop words file
360+
* @param stopWordsFile
361+
*/
362+
public void setStopWords(String stopWordsFile) {
363+
this.tokenizer.setStopWords(stopWordsFile);
364+
}
365+
349366
// Builder pattern
350367
public static class Builder {
351368
private BaseTokenizer tokenizer;

tcvdb_text/src/main/java/com/tencent/tcvdbtext/example.java

+37-1
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,18 @@ public static void main(String[] args) {
4040
quickStart();
4141
fitStart();
4242
userDict();
43+
cutAll();
44+
stopWord();
4345
}
4446
public static void quickStart() {
4547
SparseVectorBm25Encoder encoder = SparseVectorBm25Encoder.getBm25Encoder("zh");
4648
List<String> texts = Arrays.asList("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。",
4749
"作为专门为处理输入向量查询而设计的数据库,它支持多种索引类型和相似度计算方法,单索引支持10亿级向量规模,高达百万级 QPS 及毫秒级查询延迟。",
4850
"不仅能为大模型提供外部知识库,提高大模型回答的准确性,还可广泛应用于推荐系统、NLP 服务、计算机视觉、智能客服等 AI 领域。");
51+
System.out.println("cut all:" + encoder.getTokenizer().getCutAll());
52+
System.out.println("encode texts: "+ encoder.encodeTexts(texts));
53+
encoder.setCutAll(true);
54+
System.out.println("cut all:" + encoder.getTokenizer().getCutAll());
4955
System.out.println("encode texts: "+ encoder.encodeTexts(texts));
5056

5157
System.out.println("encode multiple quires: "+ encoder.encodeQueries(Arrays.asList("什么是腾讯云向量数据库?", "腾讯云向量数据库有什么优势?", "腾讯云向量数据库能做些什么?")));
@@ -63,7 +69,6 @@ public static void fitStart() {
6369
}
6470
public static void userDict(){
6571
JiebaTokenizer tokenizer = new JiebaTokenizer();
66-
tokenizer.setEnableStopWords(true);
6772
String projectPath = example.class.getProtectionDomain().getCodeSource().getLocation().getPath();
6873

6974
// 创建一个 File 对象来表示工程路径
@@ -74,6 +79,7 @@ public static void userDict(){
7479
String path = projectAbsolutePath.replace("target/classes", "") +
7580
"src/main/resources/data/user_dict/userdict_example.txt";
7681
System.out.println(tokenizer.tokenize("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
82+
7783
tokenizer.loadDict(path);
7884
System.out.println(tokenizer.tokenize("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
7985

@@ -83,4 +89,34 @@ public static void userDict(){
8389
System.out.println(tokenizer.tokenize("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
8490

8591
}
92+
93+
94+
public static void cutAll(){
95+
SparseVectorBm25Encoder encoder = SparseVectorBm25Encoder.getBm25Encoder("zh");
96+
System.out.println("cut all : " + encoder.getTokenizer().getCutAll());
97+
System.out.println(encoder.getTokenizer().tokenize("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
98+
99+
encoder.setCutAll(true);
100+
System.out.println("cut all: " + encoder.getTokenizer().getCutAll());
101+
System.out.println(encoder.getTokenizer().tokenize("腾讯云向量数据库(Tencent Cloud VectorDB)是一款全托管的自研企业级分布式数据库服务,专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
102+
}
103+
104+
public static void stopWord(){
105+
SparseVectorBm25Encoder encoder = SparseVectorBm25Encoder.getBm25Encoder("zh");
106+
System.out.println(encoder.getTokenizer().tokenize("什么是腾讯云向量数据库。"));
107+
108+
encoder.setEnableStopWords(false);
109+
System.out.println(encoder.getTokenizer().tokenize("什么是腾讯云向量数据库。"));
110+
String projectPath = example.class.getProtectionDomain().getCodeSource().getLocation().getPath();
111+
112+
// 创建一个 File 对象来表示工程路径
113+
File projectDirectory = new File(projectPath);
114+
String projectAbsolutePath = projectDirectory.getAbsolutePath();
115+
String path = projectAbsolutePath.replace("target/classes", "") +
116+
"src/main/resources/data/user_stopwords.txt";
117+
encoder.setStopWords(path);
118+
encoder.setEnableStopWords(true);
119+
System.out.println(encoder.getTokenizer().tokenize("什么是腾讯云向量数据库。"));
120+
121+
}
86122
}

tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/BaseTokenizer.java

+37
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ public abstract class BaseTokenizer {
4242

4343
protected Boolean enableStopWords;
4444

45+
protected Boolean cutAll;
46+
47+
protected String stopWordsFile;
48+
4549

4650
public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase, String dictFilePath) {
4751
this.hash = hash;
@@ -51,6 +55,14 @@ public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWor
5155
this.enableStopWords = enableStopWords;
5256
}
5357

58+
/**
 * Creates a tokenizer and stores every argument in the field of the same name.
 *
 * @param hash            stored in {@code this.hash}
 * @param enableStopWords stored in {@code this.enableStopWords}
 * @param stopWords       stored in {@code this.stopWords}
 * @param lowerCase       stored in {@code this.lowerCase}
 * @param cutAll          stored in {@code this.cutAll}
 * @param dictFilePath    stored in {@code this.dictFilePath}
 */
public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase,Boolean cutAll, String dictFilePath) {
    this.cutAll = cutAll;
    this.enableStopWords = enableStopWords;
    this.dictFilePath = dictFilePath;
    this.lowerCase = lowerCase;
    this.stopWords = stopWords;
    this.hash = hash;
}
5466
public BaseTokenizer() {
5567
}
5668

@@ -73,6 +85,7 @@ public Map<String, Object> getParameter(){
7385
param.put("lowerCase", lowerCase);
7486
param.put("dictFilePath", dictFilePath);
7587
param.put("enableStopWords", enableStopWords);
88+
param.put("cutAll", cutAll);
7689
return param;
7790
}
7891

@@ -84,11 +97,35 @@ public void updateParameter(BaseHash hash, Set<String> stopWords, Boolean enable
8497
this.enableStopWords = enableStopWords;
8598
}
8699

100+
public void updateParameter(BaseHash hash, Set<String> stopWords, Boolean enableStopWords, Boolean lowerCase,Boolean cutAll, String dictFilePath){
101+
this.hash = hash;
102+
this.stopWords = stopWords;
103+
this.lowerCase = lowerCase;
104+
this.dictFilePath = dictFilePath;
105+
this.enableStopWords = enableStopWords;
106+
this.cutAll = cutAll;
107+
}
108+
87109
public boolean isStopWord(String word){
88110
if (stopWords.isEmpty()) return false;
89111
return stopWords.contains(word);
90112
}
113+
114+
public Boolean getCutAll() {
115+
return cutAll;
116+
}
117+
118+
public void setCutAll(Boolean cutAll) {
119+
this.cutAll = cutAll;
120+
}
121+
122+
public void setStopWords(Set<String> stopWords) {
123+
this.stopWords = stopWords;
124+
}
125+
91126
public abstract void loadDict(String dictFile);
92127

93128
public abstract void setLowerCase(Boolean lowerCase);
129+
130+
public abstract void setStopWords(String stopWordsFile);
94131
}

tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/JiebaTokenizer.java

+27-2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@ public JiebaTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWo
5050
}
5151
this.segmenter = new JiebaSegmenter();
5252

53+
}
54+
55+
/**
 * Creates a Jieba tokenizer with an explicit cutAll setting.
 *
 * <p>All arguments are passed to the superclass constructor. When a non-empty
 * dictionary path is supplied, it is also handed to
 * {@code WordDictionary.getInstance().init(...)} before the segmenter is created.
 *
 * @param hash            passed to the superclass constructor
 * @param enableStopWords passed to the superclass constructor
 * @param stopWords       passed to the superclass constructor
 * @param lowerCase       passed to the superclass constructor
 * @param cutAll          passed to the superclass constructor
 * @param dictFilePath    dictionary file path; {@code null} or empty skips the dictionary init
 */
public JiebaTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase, Boolean cutAll, String dictFilePath) {
    super(hash, enableStopWords, stopWords, lowerCase, cutAll, dictFilePath);
    // The Builder's dictFilePath field has no initializer and build() passes it straight
    // through, so it can be null here; guard before calling isEmpty().
    if (dictFilePath != null && !dictFilePath.isEmpty()) {
        WordDictionary.getInstance().init(Paths.get(dictFilePath));
    }
    this.segmenter = new JiebaSegmenter();
}
5463
public JiebaTokenizer(){
5564
super();
@@ -72,6 +81,12 @@ public void setDict(String dicFile) {
7281
}
7382
}
7483

84+
@Override
85+
public void setStopWords(String stopWordsFile) {
86+
if (!stopWordsFile.isEmpty()) {
87+
this.stopWords = StopWords.getStopWordsFromFilePath(stopWordsFile);
88+
}
89+
}
7590

7691
@Override
7792
public List<String> tokenize(String sentence) {
@@ -82,7 +97,12 @@ public List<String> tokenize(String sentence) {
8297
sentence = sentence.toLowerCase();
8398
}
8499
List<String> words;
85-
words = segmenter.sentenceProcess(sentence);
100+
if (this.cutAll!=null && this.cutAll) {
101+
words = segmenter.process(sentence, JiebaSegmenter.SegMode.INDEX).stream().map(word -> word.word).collect(Collectors.toList());
102+
}else{
103+
words = segmenter.process(sentence, JiebaSegmenter.SegMode.SEARCH).stream().map(word -> word.word).collect(Collectors.toList());
104+
}
105+
86106
words = words.stream().filter(word -> {
87107
if(word.equals(" ") || word.equals(" ")) {
88108
return false;
@@ -136,6 +156,7 @@ public static class Builder{
136156
private Boolean lowerCase;
137157
private String dictFilePath;
138158
private Boolean enableStopWords;
159+
private Boolean cutAll;
139160
public Builder withHash(BaseHash hash){
140161
this.hash = hash;
141162
return this;
@@ -156,8 +177,12 @@ public Builder withEnableStopWords(Boolean enableStopWords){
156177
this.enableStopWords = enableStopWords;
157178
return this;
158179
}
180+
public Builder withCutAll(Boolean cutAll){
181+
this.cutAll = cutAll;
182+
return this;
183+
}
159184
public JiebaTokenizer build(){
160-
return new JiebaTokenizer(hash, enableStopWords, stopWords, lowerCase, dictFilePath);
185+
return new JiebaTokenizer(hash, enableStopWords, stopWords, lowerCase, cutAll, dictFilePath);
161186
}
162187
}
163188

tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/StopWords.java

+32
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,38 @@ public static Set<String> getStopWordsFromFile(String filePath) {
4343
}
4444
} catch (IOException e) {
4545
e.printStackTrace();
46+
}finally {
47+
if(inputStream != null){
48+
try {
49+
inputStream.close();
50+
} catch (IOException e) {
51+
e.printStackTrace();
52+
}
53+
}
54+
}
55+
return lines;
56+
}
57+
58+
public static Set<String> getStopWordsFromFilePath(String filePath) {
59+
Set<String> lines = new HashSet<>();
60+
InputStream inputStream = null;
61+
try {
62+
inputStream = new FileInputStream(filePath);
63+
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
64+
String line;
65+
while ((line = reader.readLine()) != null) {
66+
lines.add(line);
67+
}
68+
} catch (IOException e) {
69+
e.printStackTrace();
70+
}finally {
71+
if(inputStream != null){
72+
try {
73+
inputStream.close();
74+
} catch (IOException e) {
75+
e.printStackTrace();
76+
}
77+
}
4678
}
4779
return lines;
4880
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
什么
2+

tcvectordb/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
2424
<dependency>
2525
<groupId>com.tencent.tcvectordb</groupId>
2626
<artifactId>vectordatabase-sdk-java</artifactId>
27-
<version>2.2.2</version>
27+
<version>2.3.0</version>
2828
</dependency>
2929
```
3030

3131
- Gradle/Grails
3232

3333
```gradle
34-
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.2.2'
34+
compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.3.0'
3535
```
3636

3737
### Examples

tcvectordb/pom.xml

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
<groupId>com.tencent.tcvectordb</groupId>
66
<artifactId>vectordatabase-sdk-java</artifactId>
7-
<version>2.2.2</version>
7+
<version>2.3.0</version>
88
<packaging>jar</packaging>
99

1010
<name>vectordatabase-sdk-java</name>
@@ -102,7 +102,7 @@
102102
<dependency>
103103
<groupId>com.tencent.tcvectordb</groupId>
104104
<artifactId>tcvdb-text</artifactId>
105-
<version>1.0.1</version>
105+
<version>1.0.2</version>
106106
</dependency>
107107
</dependencies>
108108
<build>

0 commit comments

Comments
 (0)