
Commit d2650bf

2.4.004

1 parent 439305d

8 files changed: +89 -45 lines changed


.github/README.md (+1 -1)

@@ -95,7 +95,7 @@ To stop playing press Ctrl+C in either the terminal or mpv
 <details><summary>List all subcommands</summary>

     $ library
-    xk media library subcommands (v2.4.003)
+    xk media library subcommands (v2.4.004)

     Create database subcommands:
     ╭───────────────┬────────────────────────────────────────────────────╮

xklb/__init__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "2.4.003"
+__version__ = "2.4.004"

xklb/play_actions.py (+3 -3)

@@ -499,9 +499,6 @@ def process_playqueue(args) -> None:
     if args.fetch_siblings:
         media = db_media.get_sibling_media(args, media)

-    if args.play_in_order:
-        media = db_media.natsort_media(args, media)
-
     if args.partial:
         media = history_sort(args, media)
         log.debug("utils.history_sort: %s", t.elapsed())
@@ -567,6 +564,9 @@ def process_playqueue(args) -> None:
             media.append(media_keyed[key])
         log.debug("double for loop compare_block_strings: %s", t.elapsed())

+    if args.play_in_order:
+        media = db_media.natsort_media(args, media)
+
     if args.cluster_sort:
         from xklb.scripts.cluster_sort import cluster_dicts

xklb/scripts/cluster_sort.py (+65 -31)

@@ -1,11 +1,11 @@
-import argparse, difflib, json, os.path, sys
+import argparse, difflib, json, logging, os.path, sys
 from collections import Counter
 from pathlib import Path
 from typing import Dict, List

 from xklb import usage
-from xklb.scripts import mcda
-from xklb.utils import consts, file_utils, iterables, nums, objects, printing, strings
+from xklb.scripts import eda, mcda
+from xklb.utils import consts, db_utils, file_utils, iterables, nums, objects, printing, strings
 from xklb.utils.consts import DBType
 from xklb.utils.log_utils import Timer, log

@@ -73,33 +73,7 @@ def parse_args() -> argparse.Namespace:
     return args


-def cluster_paths(paths, n_clusters=None):
-    if len(paths) < 2:
-        return paths
-
-    from sklearn.cluster import KMeans
-    from sklearn.feature_extraction.text import TfidfVectorizer
-
-    sentence_strings = (strings.path_to_sentence(s) for s in paths)
-
-    try:
-        vectorizer = TfidfVectorizer(min_df=2, strip_accents="unicode", stop_words="english")
-        X = vectorizer.fit_transform(sentence_strings)
-    except ValueError:
-        try:
-            vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
-            X = vectorizer.fit_transform(sentence_strings)
-        except ValueError:
-            try:
-                vectorizer = TfidfVectorizer()
-                X = vectorizer.fit_transform(sentence_strings)
-            except ValueError:
-                vectorizer = TfidfVectorizer(analyzer="char_wb")
-                X = vectorizer.fit_transform(sentence_strings)
-
-    clusterizer = KMeans(n_clusters=n_clusters or int(X.shape[0] ** 0.5), random_state=0, n_init=10).fit(X)
-    clusters = clusterizer.labels_
-
+def map_cluster_to_paths(paths, clusters):
     grouped_strings = {}
     for i, group_string in enumerate(paths):
         cluster_id = clusters[i]
@@ -108,6 +82,11 @@ def cluster_paths(paths, n_clusters=None):
             grouped_strings[cluster_id] = []

         grouped_strings[cluster_id].append(group_string)
+    return grouped_strings
+
+
+def group_paths(paths, clusters):
+    grouped_strings = map_cluster_to_paths(paths, clusters)

     result = []
     for _cluster_id, paths in grouped_strings.items():
@@ -131,15 +110,70 @@
             "grouped_paths": paths,
         }
         result.append(metadata)
+    return result
+
+
+def find_clusters(n_clusters, sentence_strings):
+    from sklearn.cluster import KMeans
+    from sklearn.feature_extraction.text import TfidfVectorizer
+
+    try:
+        vectorizer = TfidfVectorizer(min_df=2, strip_accents="unicode", stop_words="english")
+        X = vectorizer.fit_transform(sentence_strings)
+    except ValueError:
+        try:
+            vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
+            X = vectorizer.fit_transform(sentence_strings)
+        except ValueError:
+            try:
+                vectorizer = TfidfVectorizer()
+                X = vectorizer.fit_transform(sentence_strings)
+            except ValueError:
+                vectorizer = TfidfVectorizer(analyzer="char_wb")
+                X = vectorizer.fit_transform(sentence_strings)
+
+    clusterizer = KMeans(n_clusters=n_clusters or int(X.shape[0] ** 0.5), random_state=0, n_init=10).fit(X)
+    clusters = clusterizer.labels_
+    return clusters
+
+
+def cluster_paths(paths, n_clusters=None):
+    if len(paths) < 2:
+        return paths
+
+    sentence_strings = (strings.path_to_sentence(s) for s in paths)
+    clusters = find_clusters(n_clusters, sentence_strings)
+    result = group_paths(paths, clusters)

     return result


 def cluster_dicts(args, media):
     if len(media) < 2:
         return media
+
+    n_clusters = getattr(args, "clusters", None)
+    search_columns = {
+        col
+        for _table, table_config in db_utils.config.items()
+        if "search_columns" in table_config
+        for col in table_config["search_columns"]
+    }
+
     media_keyed = {d["path"]: d for d in media}
-    groups = cluster_paths([d["path"] for d in media], n_clusters=getattr(args, "clusters", None))
+    paths = [d["path"] for d in media]
+    sentence_strings = (
+        strings.path_to_sentence(" ".join(str(v) for k, v in d.items() if v and k in search_columns)) for d in media
+    )
+
+    clusters = find_clusters(n_clusters, sentence_strings)
+
+    if log.getEffectiveLevel() >= logging.DEBUG:
+        from pandas import DataFrame
+
+        eda.print_info(objects.NoneSpace(end_row="inf"), DataFrame(clusters))
+
+    groups = group_paths(paths, clusters)
     groups = sorted(groups, key=lambda d: (-len(d["grouped_paths"]), -len(d["common_prefix"])))

     if getattr(args, "sort_groups_by", None) is not None:
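
This refactor splits the old monolithic cluster_paths into find_clusters (TF-IDF vectorization plus KMeans, retrying with progressively looser vectorizer settings whenever a stricter configuration raises ValueError on a small or unusual corpus) and map_cluster_to_paths/group_paths (regrouping and group metadata). cluster_dicts also now builds its TF-IDF input from every configured search column rather than from the path alone. A minimal usage sketch of the public entry point, with hypothetical file paths; it assumes scikit-learn is installed:

    from xklb.scripts.cluster_sort import cluster_paths

    # hypothetical paths; each is tokenized into a "sentence"
    # by strings.path_to_sentence before TF-IDF vectorization
    paths = [
        "/media/Movies/Alien.1979.mkv",
        "/media/Movies/Aliens.1986.mkv",
        "/media/Music/loops/dlp1.flac",
        "/media/Music/loops/dlp2.flac",
    ]

    for group in cluster_paths(paths, n_clusters=2):
        # each group dict includes "common_prefix" and "grouped_paths"
        print(group["common_prefix"], group["grouped_paths"])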

xklb/scripts/open_links.py (+5 -5)

@@ -5,7 +5,7 @@

 from xklb import db_media, history, usage
 from xklb.media import media_printer
-from xklb.utils import arg_utils, consts, db_utils, iterables, objects, processes
+from xklb.utils import arg_utils, consts, db_utils, iterables, objects, processes, web
 from xklb.utils.log_utils import log


@@ -182,14 +182,14 @@ def make_souffle(args, media):
     for m in media:
         m_urls = set()
         if args.title:
-            for pre in args.title_prefix:
+            for engine in args.title_prefix:
                 suffix = m.get("title") or m["path"]
-                m_urls.add(suffix if suffix.startswith("http") else pre.replace("%", suffix))
+                m_urls.add(suffix if suffix.startswith("http") else web.construct_search(engine, suffix))

         if not args.title or args.path:
             if not m["path"].startswith("http"):
-                for pre in args.title_prefix:
-                    m_urls.add(pre.replace("%", m["path"]))
+                for engine in args.title_prefix:
+                    m_urls.add(web.construct_search(engine, m["path"]))
             else:
                 m_urls.add(m["path"])
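
The switch from pre.replace("%", ...) to web.construct_search means the title or path is percent-encoded before being spliced into the search-engine template, so URL-unsafe characters no longer corrupt the query string. A sketch of the difference, using a hypothetical template and title:

    import urllib.parse

    engine = "https://www.google.com/search?q=%"  # hypothetical template
    title = "Tom & Jerry"

    # old behavior: raw substitution produces an ambiguous query
    print(engine.replace("%", title))
    # https://www.google.com/search?q=Tom & Jerry

    # new behavior, as in web.construct_search: encode first
    print(engine.replace("%", urllib.parse.quote(title, safe=""), 1))
    # https://www.google.com/search?q=Tom%20%26%20Jerry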

xklb/utils/objects.py (+6 -1)

@@ -1,11 +1,16 @@
-import json
+import json, types
 from contextlib import contextmanager
 from functools import wraps
 from typing import Dict, Optional

 from xklb.utils.log_utils import log


+class NoneSpace(types.SimpleNamespace):
+    def __getattr__(self, name):
+        return None
+
+
 def fallback(func, fallback):
     @wraps(func)
     def wrapped(*args, **kwargs):
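
Python only calls __getattr__ when normal attribute lookup fails, so NoneSpace behaves like a plain types.SimpleNamespace for attributes that were set and quietly returns None for everything else instead of raising AttributeError. That is what lets cluster_sort.py pass objects.NoneSpace(end_row="inf") to eda.print_info in place of a fully populated argparse namespace. A quick sketch of the behavior (attribute names here are arbitrary):

    from xklb.utils.objects import NoneSpace

    args = NoneSpace(end_row="inf")
    print(args.end_row)  # inf
    print(args.sort_by)  # None -- unset attribute, no AttributeError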

xklb/utils/strings.py (+3 -3)

@@ -142,12 +142,12 @@ def strip_enclosing_quotes(s):
     if len(s) < 2:
         return s

-    for q in ['"', "'", "’", "“", '‛', '‟', '”', '‚', '〞', '〝', '〟', '„', '⹂', '❟', '❜', '❛', '❝', '❞']:
+    for q in ['"', "'", "’", "“", "‛", "‟", "”", "‚", "〞", "〝", "〟", "„", "⹂", "❟", "❜", "❛", "❝", "❞"]:
         if s[0] == q and s[-1] == q:
             return s[1:-1]

-    ls = ['‘', '“', '❮', '‹', '«']
-    rs = ['’', '”', '❯', '›', '»']
+    ls = ["‘", "“", "❮", "‹", "«"]
+    rs = ["’", "”", "❯", "›", "»"]
     for l, r in zip(ls, rs):
         if s[0] == l and s[-1] == r:
             return s[1:-1]
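
The function's behavior is unchanged by this quote-style cleanup: a string wrapped in the same symmetric quote character on both ends is stripped via the first list, while directional pairs are matched positionally through the aligned ls/rs lists. For example:

    from xklb.utils.strings import strip_enclosing_quotes

    print(strip_enclosing_quotes('"hello"'))  # hello
    print(strip_enclosing_quotes("«hello»"))  # hello -- ls/rs pair
    print(strip_enclosing_quotes('"hello»'))  # "hello» -- mismatched, unchanged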

xklb/utils/web.py (+5 -0)

@@ -464,6 +464,11 @@ def infinite_scroll(driver):
         yield selenium_extract_html(driver)


+def construct_search(engine, s):
+    s = urllib.parse.quote(s, safe="")
+    return engine.replace("%", s, 1)
+
+
 def safe_unquote(url):
     # https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
     # we aren't writing HTML so we can unquote
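
construct_search percent-encodes the query with safe="" (so even "/" and "&" are escaped) and substitutes it for the first "%" in the engine template; the count argument of 1 leaves any literal "%" later in the template untouched. A sketch with a hypothetical template:

    from xklb.utils import web

    engine = "https://duckduckgo.com/?q=%"  # "%" marks the query slot
    print(web.construct_search(engine, "foo bar/baz"))
    # https://duckduckgo.com/?q=foo%20bar%2Fbaz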
