1616import json
1717import logging
1818import random
19+ import re
1920
2021from flask import request
2122from flask_login import login_required , current_user
@@ -847,8 +848,13 @@ def sample_random_chunks_with_vectors(
847848 "position_int" : full_doc .get ("position_int" ),
848849 "top_int" : full_doc .get ("top_int" ),
849850 "content_with_weight" : full_doc .get ("content_with_weight" ) or "" ,
851+ "question_kwd" : full_doc .get ("question_kwd" ) or []
850852 })
851853 return out
854+
855+ def _clean (s : str ) -> str :
856+ s = re .sub (r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>" , " " , s or "" )
857+ return s if s else "None"
852858 req = request .json
853859 kb_id = req .get ("kb_id" , "" )
854860 embd_id = req .get ("embd_id" , "" )
@@ -861,8 +867,10 @@ def sample_random_chunks_with_vectors(
861867
862868 results , eff_sims = [], []
863869 for ck in samples :
864- txt = (ck .get ("content_with_weight" ) or "" ).strip ()
865- if not txt :
870+ title = ck .get ("doc_name" ) or "Title"
871+ txt_in = "\n " .join (ck .get ("question_kwd" ) or []) or ck .get ("content_with_weight" ) or ""
872+ txt_in = _clean (txt_in )
873+ if not txt_in :
866874 results .append ({"chunk_id" : ck ["chunk_id" ], "reason" : "no_text" })
867875 continue
868876
@@ -871,8 +879,16 @@ def sample_random_chunks_with_vectors(
871879 continue
872880
873881 try :
874- qv , _ = emb_mdl .encode_queries (txt )
875- sim = _cos_sim (qv , ck ["vector" ])
882+ v , _ = emb_mdl .encode ([title , txt_in ])
883+ sim_content = _cos_sim (v [1 ], ck ["vector" ])
884+ title_w = 0.1
885+ qv_mix = title_w * v [0 ] + (1 - title_w ) * v [1 ]
886+ sim_mix = _cos_sim (qv_mix , ck ["vector" ])
887+ sim = sim_content
888+ mode = "content_only"
889+ if sim_mix > sim :
890+ sim = sim_mix
891+ mode = "title+content"
876892 except Exception :
877893 return get_error_data_result (message = "embedding failure" )
878894
@@ -894,8 +910,9 @@ def sample_random_chunks_with_vectors(
894910 "avg_cos_sim" : round (float (np .mean (eff_sims )) if eff_sims else 0.0 , 6 ),
895911 "min_cos_sim" : round (float (np .min (eff_sims )) if eff_sims else 0.0 , 6 ),
896912 "max_cos_sim" : round (float (np .max (eff_sims )) if eff_sims else 0.0 , 6 ),
913+ "match_mode" : mode ,
897914 }
898- if summary ["avg_cos_sim" ] > 0.99 :
915+ if summary ["avg_cos_sim" ] > 0.9 :
899916 return get_json_result (data = {"summary" : summary , "results" : results })
900917 return get_json_result (code = RetCode .NOT_EFFECTIVE , message = "failed" , data = {"summary" : summary , "results" : results })
901918
0 commit comments