1
- import argparse , difflib , json , os .path , sys
1
+ import argparse , difflib , json , logging , os .path , sys
2
2
from collections import Counter
3
3
from pathlib import Path
4
4
from typing import Dict , List
5
5
6
6
from xklb import usage
7
- from xklb .scripts import mcda
8
- from xklb .utils import consts , file_utils , iterables , nums , objects , printing , strings
7
+ from xklb .scripts import eda , mcda
8
+ from xklb .utils import consts , db_utils , file_utils , iterables , nums , objects , printing , strings
9
9
from xklb .utils .consts import DBType
10
10
from xklb .utils .log_utils import Timer , log
11
11
@@ -73,33 +73,7 @@ def parse_args() -> argparse.Namespace:
73
73
return args
74
74
75
75
76
- def cluster_paths (paths , n_clusters = None ):
77
- if len (paths ) < 2 :
78
- return paths
79
-
80
- from sklearn .cluster import KMeans
81
- from sklearn .feature_extraction .text import TfidfVectorizer
82
-
83
- sentence_strings = (strings .path_to_sentence (s ) for s in paths )
84
-
85
- try :
86
- vectorizer = TfidfVectorizer (min_df = 2 , strip_accents = "unicode" , stop_words = "english" )
87
- X = vectorizer .fit_transform (sentence_strings )
88
- except ValueError :
89
- try :
90
- vectorizer = TfidfVectorizer (strip_accents = "unicode" , stop_words = "english" )
91
- X = vectorizer .fit_transform (sentence_strings )
92
- except ValueError :
93
- try :
94
- vectorizer = TfidfVectorizer ()
95
- X = vectorizer .fit_transform (sentence_strings )
96
- except ValueError :
97
- vectorizer = TfidfVectorizer (analyzer = "char_wb" )
98
- X = vectorizer .fit_transform (sentence_strings )
99
-
100
- clusterizer = KMeans (n_clusters = n_clusters or int (X .shape [0 ] ** 0.5 ), random_state = 0 , n_init = 10 ).fit (X )
101
- clusters = clusterizer .labels_
102
-
76
+ def map_cluster_to_paths (paths , clusters ):
103
77
grouped_strings = {}
104
78
for i , group_string in enumerate (paths ):
105
79
cluster_id = clusters [i ]
@@ -108,6 +82,11 @@ def cluster_paths(paths, n_clusters=None):
108
82
grouped_strings [cluster_id ] = []
109
83
110
84
grouped_strings [cluster_id ].append (group_string )
85
+ return grouped_strings
86
+
87
+
88
+ def group_paths (paths , clusters ):
89
+ grouped_strings = map_cluster_to_paths (paths , clusters )
111
90
112
91
result = []
113
92
for _cluster_id , paths in grouped_strings .items ():
@@ -131,15 +110,70 @@ def cluster_paths(paths, n_clusters=None):
131
110
"grouped_paths" : paths ,
132
111
}
133
112
result .append (metadata )
113
+ return result
114
+
115
+
116
def find_clusters(n_clusters, sentence_strings):
    """Vectorize documents with TF-IDF and cluster them with KMeans.

    Args:
        n_clusters: Desired number of clusters; a falsy value defaults to
            sqrt(number of documents).
        sentence_strings: Iterable of strings, one document per item.
            Callers pass a generator, so it is materialized exactly once
            here (see note below).

    Returns:
        Array of integer cluster labels, one per input document.

    Raises:
        ValueError: If even the most permissive vectorizer cannot build a
            vocabulary from the input (e.g. all documents are empty).
    """
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # BUG FIX: the input is a generator in both call sites. The previous
    # code re-ran fit_transform on it in each except-branch, but the first
    # attempt had already exhausted it, so every fallback saw an empty
    # iterable and raised again. Materialize it once so retries see the
    # same documents.
    documents = list(sentence_strings)

    # Progressively more permissive vectorizer settings; the strictest
    # configuration that can build a vocabulary wins. "char_wb" character
    # n-grams are the last resort for inputs with no usable word tokens.
    fallback_configs = (
        {"min_df": 2, "strip_accents": "unicode", "stop_words": "english"},
        {"strip_accents": "unicode", "stop_words": "english"},
        {},
        {"analyzer": "char_wb"},
    )
    X = None
    for attempt, config in enumerate(fallback_configs):
        try:
            X = TfidfVectorizer(**config).fit_transform(documents)
            break
        except ValueError:
            if attempt == len(fallback_configs) - 1:
                raise  # nothing more permissive left to try

    clusterizer = KMeans(
        n_clusters=n_clusters or int(X.shape[0] ** 0.5),  # default: sqrt of corpus size
        random_state=0,  # deterministic labels across runs
        n_init=10,
    ).fit(X)
    return clusterizer.labels_
138
+
139
+
140
def cluster_paths(paths, n_clusters=None):
    """Cluster similar-looking paths and return grouped metadata.

    With fewer than two paths there is nothing to cluster, so the input
    is returned unchanged; otherwise each path is converted to a sentence
    string, clustered, and grouped via group_paths().
    """
    # Guard clause: clustering needs at least two documents.
    if len(paths) < 2:
        return paths

    sentences = (strings.path_to_sentence(p) for p in paths)
    labels = find_clusters(n_clusters, sentences)
    return group_paths(paths, labels)
136
149
137
150
138
151
def cluster_dicts (args , media ):
139
152
if len (media ) < 2 :
140
153
return media
154
+
155
+ n_clusters = getattr (args , "clusters" , None )
156
+ search_columns = {
157
+ col
158
+ for _table , table_config in db_utils .config .items ()
159
+ if "search_columns" in table_config
160
+ for col in table_config ["search_columns" ]
161
+ }
162
+
141
163
media_keyed = {d ["path" ]: d for d in media }
142
- groups = cluster_paths ([d ["path" ] for d in media ], n_clusters = getattr (args , "clusters" , None ))
164
+ paths = [d ["path" ] for d in media ]
165
+ sentence_strings = (
166
+ strings .path_to_sentence (" " .join (str (v ) for k , v in d .items () if v and k in search_columns )) for d in media
167
+ )
168
+
169
+ clusters = find_clusters (n_clusters , sentence_strings )
170
+
171
+ if log .getEffectiveLevel () >= logging .DEBUG :
172
+ from pandas import DataFrame
173
+
174
+ eda .print_info (objects .NoneSpace (end_row = "inf" ), DataFrame (clusters ))
175
+
176
+ groups = group_paths (paths , clusters )
143
177
groups = sorted (groups , key = lambda d : (- len (d ["grouped_paths" ]), - len (d ["common_prefix" ])))
144
178
145
179
if getattr (args , "sort_groups_by" , None ) is not None :
0 commit comments