Tencent
diff --git a/‎README.md
+2-2 b/‎README.md
+2-2
diff --git a/‎tcvdb_text/README.md
+2-2 b/‎tcvdb_text/README.md
+2-2
diff --git a/‎tcvdb_text/pom.xml
+1-1 b/‎tcvdb_text/pom.xml
+1-1
diff --git a/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/encoder/SparseVectorBm25Encoder.java
+17 b/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/encoder/SparseVectorBm25Encoder.java
+17
diff --git a/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/example.java
+37-1 b/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/example.java
+37-1
diff --git a/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/BaseTokenizer.java
+37 b/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/BaseTokenizer.java
+37
diff --git a/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/JiebaTokenizer.java
+27-2 b/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/JiebaTokenizer.java
+27-2
diff --git a/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/StopWords.java
+32 b/‎tcvdb_text/src/main/java/com/tencent/tcvdbtext/tokenizer/StopWords.java
+32
diff --git a/‎tcvdb_text/src/main/resources/data/user_stopwords.txt
+2 b/‎tcvdb_text/src/main/resources/data/user_stopwords.txt
+2
diff --git a/‎tcvectordb/README.md
+2-2 b/‎tcvectordb/README.md
+2-2
diff --git a/‎tcvectordb/pom.xml
+2-2 b/‎tcvectordb/pom.xml
+2-2
@@ -24,14 +24,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
         <dependency>
             <groupId>com.tencent.tcvectordb</groupId>
             <artifactId>vectordatabase-sdk-java</artifactId>
-            <version>2.2.2</version>
+            <version>2.3.0</version>
         </dependency>
        ```
 
    - Gradle/Grails
 
         ```gradle
-        compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.2.2'
+        compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.3.0'
         ```
 
 ### Examples
 
@@ -20,14 +20,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
         <dependency>
             <groupId>com.tencent.tcvectordb</groupId>
             <artifactId>tcvdb-text</artifactId>
-            <version>1.0.1</version>
+            <version>1.0.2</version>
         </dependency>
        ```
 
    - Gradle/Grails
 
         ```gradle
-        compile 'com.tencent.tcvectordb:tcvdb-text:1.0.1'
+        compile 'com.tencent.tcvectordb:tcvdb-text:1.0.2'
         ```
 
 ### Examples
 
@@ -6,7 +6,7 @@
 
     <groupId>com.tencent.tcvectordb</groupId>
     <artifactId>tcvdb-text</artifactId>
-    <version>1.0.1</version>
+    <version>1.0.2</version>
     <packaging>jar</packaging>
     <name>tcvdb-text</name>
     <dependencies>
 
@@ -346,6 +346,23 @@ public void setParams(String paramsFile) {
     /**
      * Loads a dictionary file into the underlying tokenizer.
      *
      * @param dictFile dictionary file location, passed unchanged to {@code tokenizer.loadDict}
      */
     public void setDict(String dictFile) {
         this.tokenizer.loadDict(dictFile);
     }
+
+    /**
+     * Sets the tokenizer's cutAll mode by forwarding the flag to {@code tokenizer.setCutAll}.
+     *
+     * @param cutAll the cutAll flag to store on the tokenizer
+     */
+    public void setCutAll(Boolean cutAll) {
+        this.tokenizer.setCutAll(cutAll);
+    }
+
+    /**
+     * Sets the stop-word file by forwarding the path to {@code tokenizer.setStopWords}.
+     *
+     * @param stopWordsFile location of the stop-word file, passed unchanged to the tokenizer
+     */
+    public void setStopWords(String stopWordsFile)  {
+        this.tokenizer.setStopWords(stopWordsFile);
+    }
+
     // Builder pattern
     public static class Builder {
         private BaseTokenizer tokenizer;
 
@@ -40,12 +40,18 @@ public static void main(String[] args) {
         quickStart();
         fitStart();
         userDict();
+        cutAll();
+        stopWord();
     }
     public static void quickStart() {
         SparseVectorBm25Encoder encoder = SparseVectorBm25Encoder.getBm25Encoder("zh");
         List<String> texts = Arrays.asList("腾讯云向量数据库（Tencent Cloud VectorDB）是一款全托管的自研企业级分布式数据库服务，专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。",
                 "作为专门为处理输入向量查询而设计的数据库，它支持多种索引类型和相似度计算方法，单索引支持10亿级向量规模，高达百万级 QPS 及毫秒级查询延迟。",
                    "不仅能为大模型提供外部知识库，提高大模型回答的准确性，还可广泛应用于推荐系统、NLP 服务、计算机视觉、智能客服等 AI 领域。");
+        System.out.println("cut all:" + encoder.getTokenizer().getCutAll());
+        System.out.println("encode texts: "+ encoder.encodeTexts(texts));
+        encoder.setCutAll(true);
+        System.out.println("cut all:" + encoder.getTokenizer().getCutAll());
         System.out.println("encode texts: "+ encoder.encodeTexts(texts));
 
         System.out.println("encode multiple quires: "+ encoder.encodeQueries(Arrays.asList("什么是腾讯云向量数据库？", "腾讯云向量数据库有什么优势？", "腾讯云向量数据库能做些什么？")));
@@ -63,7 +69,6 @@ public static void fitStart() {
     }
     public static void userDict(){
         JiebaTokenizer tokenizer = new JiebaTokenizer();
-        tokenizer.setEnableStopWords(true);
         String projectPath = example.class.getProtectionDomain().getCodeSource().getLocation().getPath();
 
         // Create a File object that represents the project path
@@ -74,6 +79,7 @@ public static void userDict(){
         String path = projectAbsolutePath.replace("target/classes", "") +
                 "src/main/resources/data/user_dict/userdict_example.txt";
         System.out.println(tokenizer.tokenize("腾讯云向量数据库（Tencent Cloud VectorDB）是一款全托管的自研企业级分布式数据库服务，专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
+
         tokenizer.loadDict(path);
         System.out.println(tokenizer.tokenize("腾讯云向量数据库（Tencent Cloud VectorDB）是一款全托管的自研企业级分布式数据库服务，专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
 
@@ -83,4 +89,34 @@ public static void userDict(){
         System.out.println(tokenizer.tokenize("腾讯云向量数据库（Tencent Cloud VectorDB）是一款全托管的自研企业级分布式数据库服务，专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。"));
 
     }
+
+
+    /**
+     * Example: tokenizes the same sentence before and after enabling cutAll on the encoder,
+     * printing the current flag value and the resulting tokens each time.
+     */
+    public static void cutAll(){
+        final String sentence = "腾讯云向量数据库（Tencent Cloud VectorDB）是一款全托管的自研企业级分布式数据库服务，专用于存储、索引、检索、管理由深度神经网络或其他机器学习模型生成的大量多维嵌入向量。";
+        SparseVectorBm25Encoder bm25 = SparseVectorBm25Encoder.getBm25Encoder("zh");
+
+        // First pass: whatever cutAll value the freshly created encoder carries.
+        System.out.println("cut all : " + bm25.getTokenizer().getCutAll());
+        System.out.println(bm25.getTokenizer().tokenize(sentence));
+
+        // Second pass: same sentence with cutAll switched on.
+        bm25.setCutAll(true);
+        System.out.println("cut all: " + bm25.getTokenizer().getCutAll());
+        System.out.println(bm25.getTokenizer().tokenize(sentence));
+    }
+
+    /**
+     * Example: tokenizes one question three times - with the encoder's initial stop-word
+     * settings, with stop words disabled, and with the stop words loaded from
+     * user_stopwords.txt enabled - printing the tokens after each step.
+     */
+    public static void stopWord(){
+        final String question = "什么是腾讯云向量数据库。";
+        SparseVectorBm25Encoder bm25 = SparseVectorBm25Encoder.getBm25Encoder("zh");
+        System.out.println(bm25.getTokenizer().tokenize(question));
+
+        bm25.setEnableStopWords(false);
+        System.out.println(bm25.getTokenizer().tokenize(question));
+
+        // Build the stop-word file path from the location this class was loaded from,
+        // with the "target/classes" segment removed.
+        String classesLocation = example.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+        String projectRoot = new File(classesLocation).getAbsolutePath().replace("target/classes", "");
+        String stopWordsPath = projectRoot + "src/main/resources/data/user_stopwords.txt";
+
+        bm25.setStopWords(stopWordsPath);
+        bm25.setEnableStopWords(true);
+        System.out.println(bm25.getTokenizer().tokenize(question));
+    }
 }
@@ -42,6 +42,10 @@ public abstract class BaseTokenizer {
 
     protected Boolean enableStopWords;
 
+    protected Boolean cutAll;
+
+    protected String stopWordsFile;
+
 
     public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase, String dictFilePath) {
         this.hash = hash;
@@ -51,6 +55,14 @@ public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWor
         this.enableStopWords = enableStopWords;
     }
 
+    /**
+     * Creates a tokenizer with every setting supplied explicitly, including the cutAll flag.
+     * Each argument is stored as-is in the field of the same name.
+     *
+     * @param hash            hash implementation kept by this tokenizer
+     * @param enableStopWords value stored in {@code enableStopWords}
+     * @param stopWords       stop-word set consulted by {@code isStopWord}
+     * @param lowerCase       value stored in {@code lowerCase}
+     * @param cutAll          value stored in {@code cutAll}
+     * @param dictFilePath    value stored in {@code dictFilePath}
+     */
+    public BaseTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase, Boolean cutAll, String dictFilePath) {
+        // Stop-word settings.
+        this.enableStopWords = enableStopWords;
+        this.stopWords = stopWords;
+        // Text and segmentation settings.
+        this.lowerCase = lowerCase;
+        this.cutAll = cutAll;
+        this.dictFilePath = dictFilePath;
+        // Hashing.
+        this.hash = hash;
+    }
     /** Creates a tokenizer without assigning any field in this constructor. */
     public BaseTokenizer() {
     }
 
@@ -73,6 +85,7 @@ public Map<String, Object> getParameter(){
         param.put("lowerCase", lowerCase);
         param.put("dictFilePath", dictFilePath);
         param.put("enableStopWords", enableStopWords);
+        param.put("cutAll", cutAll);
         return param;
     }
 
@@ -84,11 +97,35 @@ public void updateParameter(BaseHash hash, Set<String> stopWords, Boolean enable
         this.enableStopWords = enableStopWords;
     }
 
+    /**
+     * Overwrites all tokenizer settings, including the cutAll flag, with the given values.
+     * Each argument is stored as-is in the field of the same name.
+     *
+     * @param hash            hash implementation kept by this tokenizer
+     * @param stopWords       stop-word set consulted by {@code isStopWord}
+     * @param enableStopWords value stored in {@code enableStopWords}
+     * @param lowerCase       value stored in {@code lowerCase}
+     * @param cutAll          value stored in {@code cutAll}
+     * @param dictFilePath    value stored in {@code dictFilePath}
+     */
+    public void updateParameter(BaseHash hash, Set<String> stopWords, Boolean enableStopWords, Boolean lowerCase, Boolean cutAll, String dictFilePath){
+        // Stop-word settings.
+        this.enableStopWords = enableStopWords;
+        this.stopWords = stopWords;
+        // Text and segmentation settings.
+        this.lowerCase = lowerCase;
+        this.cutAll = cutAll;
+        this.dictFilePath = dictFilePath;
+        // Hashing.
+        this.hash = hash;
+    }
+
     /**
      * Reports whether {@code word} is in the configured stop-word set.
      *
      * @param word token to look up
      * @return {@code true} if the stop-word set contains {@code word}; {@code false} if it does
      *         not, or if the set is empty or has not been assigned
      */
     public boolean isStopWord(String word){
         // The no-arg constructor assigns nothing, so stopWords may still be null here.
         if (stopWords == null || stopWords.isEmpty()) {
             return false;
         }
         return stopWords.contains(word);
     }
+
+    /**
+     * @return the current cutAll flag; may be {@code null} if it was never assigned
+     */
+    public Boolean getCutAll() {
+        return cutAll;
+    }
+
+    /**
+     * Stores the cutAll flag on this tokenizer.
+     *
+     * @param cutAll new value of the flag
+     */
+    public void setCutAll(Boolean cutAll) {
+        this.cutAll = cutAll;
+    }
+
+    /**
+     * Replaces the stop-word set with the given set (stored by reference, not copied).
+     *
+     * @param stopWords new stop-word set
+     */
+    public void setStopWords(Set<String> stopWords) {
+        this.stopWords = stopWords;
+    }
+
     /**
      * Loads a dictionary file; the behavior is defined by the concrete tokenizer.
      *
      * @param dictFile dictionary file location
      */
     public abstract void loadDict(String dictFile);
 
     /**
      * Sets the lowerCase option; the behavior is defined by the concrete tokenizer.
      *
      * @param lowerCase new value of the option
      */
     public abstract  void setLowerCase(Boolean lowerCase);
+
+    /**
+     * Sets the stop words from a file; the loading is defined by the concrete tokenizer.
+     *
+     * @param stopWordsFile location of the stop-word file
+     */
+    public abstract void setStopWords(String stopWordsFile);
 }
@@ -50,6 +50,15 @@ public JiebaTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWo
         }
         this.segmenter = new JiebaSegmenter();
 
+    }
+
+    /**
+     * Creates a Jieba-backed tokenizer with every setting supplied explicitly, including the
+     * cutAll flag. All arguments are forwarded to the {@code BaseTokenizer} constructor.
+     *
+     * @param hash            hash implementation kept by this tokenizer
+     * @param enableStopWords value stored in {@code enableStopWords}
+     * @param stopWords       stop-word set
+     * @param lowerCase       value stored in {@code lowerCase}
+     * @param cutAll          value stored in {@code cutAll}
+     * @param dictFilePath    dictionary path; when non-null and non-empty it is handed to
+     *                        {@code WordDictionary.getInstance().init(...)}
+     */
+    public JiebaTokenizer(BaseHash hash, Boolean enableStopWords, Set<String> stopWords, Boolean lowerCase, Boolean cutAll, String dictFilePath) {
+        super(hash, enableStopWords, stopWords, lowerCase, cutAll, dictFilePath);
+        // Builder.dictFilePath has no initializer, so build() can pass null here;
+        // treat null like the empty string instead of throwing a NullPointerException.
+        if (dictFilePath != null && !dictFilePath.isEmpty()) {
+            WordDictionary.getInstance().init(Paths.get(dictFilePath));
+        }
+        this.segmenter = new JiebaSegmenter();
     }
     public JiebaTokenizer(){
         super();
@@ -72,6 +81,12 @@ public void setDict(String dicFile) {
         }
     }
 
+    /**
+     * Replaces the stop-word set with the lines read from {@code stopWordsFile} via
+     * {@code StopWords.getStopWordsFromFilePath}, and remembers the path in
+     * {@code this.stopWordsFile}. A null or empty path leaves the current stop words untouched.
+     *
+     * @param stopWordsFile path of the stop-word file
+     */
+    @Override
+    public void setStopWords(String stopWordsFile) {
+        // Guard against null as well as "": calling isEmpty() on null would throw.
+        if (stopWordsFile == null || stopWordsFile.isEmpty()) {
+            return;
+        }
+        this.stopWordsFile = stopWordsFile;
+        this.stopWords = StopWords.getStopWordsFromFilePath(stopWordsFile);
+    }
 
     @Override
     public List<String> tokenize(String sentence) {
@@ -82,7 +97,12 @@ public List<String> tokenize(String sentence) {
             sentence = sentence.toLowerCase();
         }
         List<String> words;
-        words = segmenter.sentenceProcess(sentence);
+        if (this.cutAll!=null && this.cutAll) {
+            words = segmenter.process(sentence, JiebaSegmenter.SegMode.INDEX).stream().map(word -> word.word).collect(Collectors.toList());
+        }else{
+            words = segmenter.process(sentence, JiebaSegmenter.SegMode.SEARCH).stream().map(word -> word.word).collect(Collectors.toList());
+        }
+
         words = words.stream().filter(word -> {
             if(word.equals(" ") || word.equals("　")) {
                 return false;
@@ -136,6 +156,7 @@ public static class Builder{
         private Boolean lowerCase;
         private String dictFilePath;
         private Boolean enableStopWords;
+        private Boolean cutAll;
         public Builder withHash(BaseHash hash){
             this.hash = hash;
             return this;
@@ -156,8 +177,12 @@ public Builder withEnableStopWords(Boolean enableStopWords){
             this.enableStopWords = enableStopWords;
             return this;
         }
+        /**
+         * Sets the cutAll flag that {@code build()} passes to the {@code JiebaTokenizer} constructor.
+         *
+         * @param cutAll value of the flag; stays {@code null} if this method is never called
+         * @return this builder, for chaining
+         */
+        public Builder withCutAll(Boolean cutAll){
+            this.cutAll = cutAll;
+            return this;
+        }
         /**
          * Creates a {@code JiebaTokenizer} from the values collected by this builder.
          *
          * @return the new tokenizer
          */
         public JiebaTokenizer build(){
-            return new JiebaTokenizer(hash, enableStopWords, stopWords, lowerCase, dictFilePath);
+            return new JiebaTokenizer(hash, enableStopWords, stopWords, lowerCase, cutAll, dictFilePath);
         }
     }
 
 
@@ -43,6 +43,38 @@ public static Set<String> getStopWordsFromFile(String filePath) {
             }
         } catch (IOException e) {
             e.printStackTrace();
+        }finally {
+            if(inputStream != null){
+                try {
+                    inputStream.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return lines;
+    }
+
+    /**
+     * Reads stop words from a file on the file system, one entry per line, decoded as UTF-8.
+     * Lines are added exactly as read (no trimming or filtering).
+     *
+     * @param filePath path opened with {@code FileInputStream}
+     * @return the distinct lines of the file; if an {@code IOException} occurs (including a
+     *         missing file) its stack trace is printed and the lines read so far, possibly
+     *         none, are returned
+     */
+    public static Set<String> getStopWordsFromFilePath(String filePath) {
+        Set<String> lines = new HashSet<>();
+        // try-with-resources closes the reader, and the stream it wraps, on every exit path.
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                lines.add(line);
+            }
+        } catch (IOException e) {
+            // Best-effort load: report the failure and fall through to return what was read.
+            e.printStackTrace();
         }
         return lines;
     }
 
@@ -0,0 +1,2 @@
+什么
+是
@@ -24,14 +24,14 @@ You can use **Apache Maven** or **Gradle**/**Grails** to download the SDK.
         <dependency>
             <groupId>com.tencent.tcvectordb</groupId>
             <artifactId>vectordatabase-sdk-java</artifactId>
-            <version>2.2.2</version>
+            <version>2.3.0</version>
         </dependency>
        ```
 
    - Gradle/Grails
 
         ```gradle
-        compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.2.2'
+        compile 'com.tencent.tcvectordb:vectordatabase-sdk-java:2.3.0'
         ```
 
 ### Examples
 
@@ -4,7 +4,7 @@
 
     <groupId>com.tencent.tcvectordb</groupId>
     <artifactId>vectordatabase-sdk-java</artifactId>
-    <version>2.2.2</version>
+    <version>2.3.0</version>
     <packaging>jar</packaging>
 
     <name>vectordatabase-sdk-java</name>
@@ -102,7 +102,7 @@
         <dependency>
             <groupId>com.tencent.tcvectordb</groupId>
             <artifactId>tcvdb-text</artifactId>
-            <version>1.0.1</version>
+            <version>1.0.2</version>
         </dependency>
     </dependencies>
     <build>