@@ -41,6 +41,67 @@ def build_index(
4141 ) -> Dict [str , List [str ]]: ...
4242
4343
@dataclass
class ProgressPrinter:
    """Print progress in both interactive (TTY) and non-interactive environments.

    - TTY: renders a single-line progress bar on ``tty_stream``, redrawn
      in place with a leading carriage return.
    - Non-TTY (CI/server): prints one line to stdout each time the integer
      percentage changes.

    When ``enabled`` is False every method is a no-op.
    """

    # Master switch; when False nothing is ever written.
    enabled: bool
    # Label written before the bar (TTY) or before the percentage (non-TTY).
    prefix: str
    # Width of the bar in characters (number of '#' plus '-' cells).
    width: int = 24
    # Deliberately left unannotated: this is a plain class attribute, not a
    # dataclass field, so it is not a constructor parameter. It is bound to
    # whatever ``sys.stderr`` is when the class body executes.
    tty_stream = sys.stderr
    # Last percentage printed in non-TTY mode; -1 means nothing printed yet.
    _last_pct: int = -1
    # Whether ``tty_stream`` reported itself as a terminal; recomputed in
    # ``__post_init__`` regardless of the value passed to the constructor.
    _is_tty: bool = False

    def __post_init__(self) -> None:
        """Detect once whether in-place bar rendering should be used."""
        if not self.enabled:
            self._is_tty = False
            return
        # Streams without an ``isatty`` method are treated as non-interactive.
        self._is_tty = bool(getattr(self.tty_stream, "isatty", lambda: False)())

    @staticmethod
    def _percent(done: int, total: int) -> int:
        """Return the completed percentage as an integer in ``0..100``.

        Uses integer arithmetic on purpose: ``int(done / total * 100)``
        truncates one too low for values such as 29/100, because
        ``0.29 * 100`` evaluates to ``28.999999999999996``.
        A non-positive ``total`` yields 0.
        """
        if total <= 0:
            return 0
        clamped = max(0, min(done, total))
        return int(clamped * 100 // total)

    def _render_bar(self, done: int, total: int) -> str:
        """Return the bar text, e.g. ``[####----] 5/10  50%``.

        ``done`` is shown as given; only the filled fraction and the
        percentage are clamped. A non-positive ``total`` renders an empty
        bar labelled ``0/0``.
        """
        if total <= 0:
            return "[" + ("-" * self.width) + "] 0/0"
        frac = max(0.0, min(1.0, done / float(total)))
        filled = int(round(frac * self.width))
        bar = "#" * filled + "-" * (self.width - filled)
        pct = self._percent(done, total)
        return f"[{bar}] {done}/{total} {pct:3d}%"

    def update(self, done: int, total: int) -> None:
        """Report that ``done`` out of ``total`` items are finished.

        Negative values are raised to 0 and ``done`` is capped at ``total``,
        so the reported percentage never exceeds 100 (including when
        ``total`` is 0, which is reported as ``0/0``).
        """
        if not self.enabled:
            return
        total = max(0, total)
        done = max(0, min(done, total))

        if self._is_tty:
            # '\r' returns to column 0 so the bar overwrites the previous one.
            self.tty_stream.write(f"\r{self.prefix} {self._render_bar(done, total)}")
            self.tty_stream.flush()
            return

        pct = self._percent(done, total)
        # Non-interactive: print only when the integer percentage changes,
        # which bounds the output to roughly one line per percent.
        if pct != self._last_pct:
            print(f"{self.prefix}: {pct}% ({done}/{total})")
            self._last_pct = pct

    def close(self, total: int) -> None:
        """Emit the final ``total/total`` state and end the TTY line."""
        if not self.enabled:
            return
        self.update(total, total)
        if self._is_tty:
            # Move off the in-place bar so later output starts on a new line.
            self.tty_stream.write("\n")
            self.tty_stream.flush()
103+
104+
44105def _clean_text_for_tags (text : str ) -> str :
45106 # Remove common non-content segments to reduce noisy tokens.
46107 clean = text
@@ -117,15 +178,18 @@ def _build_tfidf_tag_index(
117178 ) from exc
118179
119180 # Build corpus: jieba tokens joined by spaces.
181+ progress_corpus = ProgressPrinter (enabled = verbose , prefix = '[tags tfidf] corpus' )
120182 rel_paths : List [str ] = []
121183 corpus : List [str ] = []
122- for p in files :
184+ for i , p in enumerate ( files , start = 1 ) :
123185 rel = p .relative_to (target_root ).as_posix ()
124186 rel_paths .append (rel )
125187 raw = _read_text_best_effort (p )
126188 body = _strip_front_matter_if_any (raw )
127189 tokens = _jieba_tokenize (body , dedupe = True )
128190 corpus .append (' ' .join (tokens ))
191+ progress_corpus .update (i , len (files ))
192+ progress_corpus .close (len (files ))
129193
130194 vectorizer = TfidfVectorizer (
131195 # We already tokenized; keep tokens as-is.
@@ -139,11 +203,13 @@ def _build_tfidf_tag_index(
139203
140204 used_global : Set [str ] = set ()
141205 tags_by_path : Dict [str , List [str ]] = {}
206+ progress_pick = ProgressPrinter (enabled = verbose , prefix = '[tags tfidf] pick' )
142207
143208 for i , rel in enumerate (rel_paths ):
144209 row = X .getrow (i )
145210 if row .nnz == 0 :
146211 tags_by_path [rel ] = []
212+ progress_pick .update (i + 1 , len (rel_paths ))
147213 continue
148214 # Sort terms by TF-IDF score descending.
149215 pairs = sorted (zip (row .indices , row .data ), key = lambda x : x [1 ], reverse = True )
@@ -164,6 +230,8 @@ def _build_tfidf_tag_index(
164230 if len (picked ) >= tag_count :
165231 break
166232 tags_by_path [rel ] = picked
233+ progress_pick .update (i + 1 , len (rel_paths ))
234+ progress_pick .close (len (rel_paths ))
167235
168236 if verbose :
169237 print (f"[tags] unique={ len (used_global )} /{ max_unique_tags } " )
@@ -190,11 +258,12 @@ def _build_textrank_tag_index(
190258 "或:pip install textrank4zh"
191259 ) from exc
192260 print (f"[tags] start building TextRank index for { len (files )} files..." )
261+ progress = ProgressPrinter (enabled = verbose , prefix = '[tags textrank]' )
193262
194263 used_global : Set [str ] = set ()
195264 tags_by_path : Dict [str , List [str ]] = {}
196265
197- for p in files :
266+ for i , p in enumerate ( files , start = 1 ) :
198267 rel = p .relative_to (target_root ).as_posix ()
199268 raw = _read_text_best_effort (p )
200269 body = _strip_front_matter_if_any (raw )
@@ -239,6 +308,8 @@ def _build_textrank_tag_index(
239308 if len (picked ) >= tag_count :
240309 break
241310 tags_by_path [rel ] = picked
311+ progress .update (i , len (files ))
312+ progress .close (len (files ))
242313
243314 if verbose :
244315 print (f"[tags] end building TextRank index, unique={ len (used_global )} /{ max_unique_tags } " )
@@ -280,6 +351,7 @@ def _build_keybert_tag_index(
280351 ) from exc
281352
282353 print (f"[tags] start building KeyBERT index for { len (files )} files..." )
354+ progress = ProgressPrinter (enabled = verbose , prefix = '[tags keybert]' )
283355
284356 # A multilingual model works for Chinese and English mixed content.
285357 # Keep it as an internal default to avoid expanding CLI surface.
@@ -297,11 +369,16 @@ def _build_keybert_tag_index(
297369 used_global : Set [str ] = set ()
298370 tags_by_path : Dict [str , List [str ]] = {}
299371
300- for p in files :
372+ for i , p in enumerate ( files , start = 1 ) :
301373 rel = p .relative_to (target_root ).as_posix ()
302374 raw = _read_text_best_effort (p )
303375 body = _strip_front_matter_if_any (raw )
304376 clean = _clean_text_for_tags (body )
377+ tokens = _jieba_tokenize (clean , dedupe = False )
378+ if not tokens :
379+ tags_by_path [rel ] = []
380+ continue
381+ doc = ' ' .join (tokens )
305382
306383 # KeyBERT returns list[(keyword, score)]. Use keyphrase_ngram_range=(1,1)
307384 # to align with the existing per-tag token behavior.
@@ -351,6 +428,8 @@ def _build_keybert_tag_index(
351428 break
352429
353430 tags_by_path [rel ] = picked
431+ progress .update (i , len (files ))
432+ progress .close (len (files ))
354433
355434 if verbose :
356435 print (f"[tags] unique={ len (used_global )} /{ max_unique_tags } " )
@@ -782,6 +861,8 @@ def _build_git_time_index(repo_dir: Path, interest_paths: Set[str], date_kind: s
782861 created : Dict [str , int ] = {}
783862 done : Set [str ] = set ()
784863 current_ts : Optional [int ] = None
864+ progress = ProgressPrinter (enabled = verbose , prefix = '[git] batch' )
865+ last_filled = - 1
785866
786867 try :
787868 with tempfile .NamedTemporaryFile (prefix = 'git-log-' , suffix = '.txt' , delete = False , mode = 'w' , encoding = 'utf-8' ) as fp :
@@ -856,6 +937,14 @@ def _build_git_time_index(repo_dir: Path, interest_paths: Set[str], date_kind: s
856937 done .add (final )
857938 if len (done ) == len (interest_paths ):
858939 break
940+
941+ filled = 0
942+ for k in interest_paths :
943+ if k in updated and k in created :
944+ filled += 1
945+ if filled != last_filled :
946+ progress .update (filled , len (interest_paths ))
947+ last_filled = filled
859948 finally :
860949 if out_path :
861950 try :
@@ -869,6 +958,7 @@ def _build_git_time_index(repo_dir: Path, interest_paths: Set[str], date_kind: s
869958 u = updated .get (k )
870959 if c is not None and u is not None :
871960 result [k ] = (c , u )
961+ progress .close (len (interest_paths ))
872962 return result
873963
874964
@@ -1032,40 +1122,16 @@ def process_directory(cfg: ProcessConfig) -> int:
10321122 if cfg .verbose :
10331123 print (f"[git] batch index: filled={ len (idx )} /{ len (missing )} " )
10341124
1035- def render_bar (done : int , total_count : int , width : int = 24 ) -> str :
1036- if total_count <= 0 :
1037- return "[" + ("-" * width ) + "] 0/0"
1038- frac = max (0.0 , min (1.0 , done / float (total_count )))
1039- filled = int (round (frac * width ))
1040- bar = "#" * filled + "-" * (width - filled )
1041- pct = int (frac * 100 )
1042- return f"[{ bar } ] { done } /{ total_count } { pct :3d} %"
1043-
1044- show_progress = (not cfg .verbose )
1045- is_tty = bool (getattr (sys .stderr , "isatty" , lambda : False )())
1046- last_reported_pct = - 1
1125+ progress = ProgressPrinter (enabled = cfg .verbose , prefix = '[process]' )
10471126
10481127 count = 0
10491128 for i , p in enumerate (files , start = 1 ):
10501129 if process_file (cfg , p ):
10511130 count += 1
10521131 if cfg .verbose :
10531132 print (f"[ok] { p .relative_to (cfg .target_dir ).as_posix ()} " )
1054-
1055- if show_progress :
1056- if is_tty :
1057- msg = f"\r { render_bar (i , total )} "
1058- sys .stderr .write (msg )
1059- sys .stderr .flush ()
1060- else :
1061- # Non-interactive output: print on percentage change (1% steps) and at the end.
1062- pct = int ((i / max (1 , total )) * 100 )
1063- if pct != last_reported_pct and (pct % 5 == 0 or i == total ):
1064- print (f"progress: { pct } % ({ i } /{ total } )" )
1065- last_reported_pct = pct
1066-
1067- if show_progress and is_tty :
1068- sys .stderr .write ("\n " )
1133+ progress .update (i , total )
1134+ progress .close (total )
10691135
10701136 print (f"processed: { count } files, target={ cfg .target_dir } " )
10711137 return 0
0 commit comments