1
- import argparse , difflib , json , os .path , sys
1
+ import argparse , difflib , json , logging , os .path , sys
2
2
from collections import Counter
3
3
from pathlib import Path
4
4
from typing import Dict , List
5
5
6
6
from xklb import usage
7
- from xklb .scripts import mcda
8
- from xklb .utils import consts , file_utils , iterables , nums , objects , printing , strings
7
+ from xklb .scripts import eda , mcda
8
+ from xklb .utils import consts , db_utils , file_utils , iterables , nums , objects , printing , strings
9
9
from xklb .utils .consts import DBType
10
10
from xklb .utils .log_utils import Timer , log
11
11
@@ -73,33 +73,7 @@ def parse_args() -> argparse.Namespace:
73
73
return args
74
74
75
75
76
- def cluster_paths (paths , n_clusters = None ):
77
- if len (paths ) < 2 :
78
- return paths
79
-
80
- from sklearn .cluster import KMeans
81
- from sklearn .feature_extraction .text import TfidfVectorizer
82
-
83
- sentence_strings = (strings .path_to_sentence (s ) for s in paths )
84
-
85
- try :
86
- vectorizer = TfidfVectorizer (min_df = 2 , strip_accents = "unicode" , stop_words = "english" )
87
- X = vectorizer .fit_transform (sentence_strings )
88
- except ValueError :
89
- try :
90
- vectorizer = TfidfVectorizer (strip_accents = "unicode" , stop_words = "english" )
91
- X = vectorizer .fit_transform (sentence_strings )
92
- except ValueError :
93
- try :
94
- vectorizer = TfidfVectorizer ()
95
- X = vectorizer .fit_transform (sentence_strings )
96
- except ValueError :
97
- vectorizer = TfidfVectorizer (analyzer = "char_wb" )
98
- X = vectorizer .fit_transform (sentence_strings )
99
-
100
- clusterizer = KMeans (n_clusters = n_clusters or int (X .shape [0 ] ** 0.5 ), random_state = 0 , n_init = 10 ).fit (X )
101
- clusters = clusterizer .labels_
102
-
76
+ def map_cluster_to_paths (paths , clusters ):
103
77
grouped_strings = {}
104
78
for i , group_string in enumerate (paths ):
105
79
cluster_id = clusters [i ]
@@ -108,6 +82,11 @@ def cluster_paths(paths, n_clusters=None):
108
82
grouped_strings [cluster_id ] = []
109
83
110
84
grouped_strings [cluster_id ].append (group_string )
85
+ return grouped_strings
86
+
87
+
88
+ def group_paths (paths , clusters ):
89
+ grouped_strings = map_cluster_to_paths (paths , clusters )
111
90
112
91
result = []
113
92
for _cluster_id , paths in grouped_strings .items ():
@@ -131,15 +110,70 @@ def cluster_paths(paths, n_clusters=None):
131
110
"grouped_paths" : paths ,
132
111
}
133
112
result .append (metadata )
113
+ return result
114
+
115
+
116
def find_clusters(n_clusters, sentence_strings):
    """Vectorize documents with TF-IDF and cluster them with KMeans.

    Args:
        n_clusters: Desired number of clusters; a falsy value defaults to
            sqrt(number of documents).
        sentence_strings: Iterable of strings, one document per item.
            Callers pass a generator, so it is materialized exactly once
            here (see note below).

    Returns:
        Array of integer cluster labels, one per input document.

    Raises:
        ValueError: If even the most permissive vectorizer cannot build a
            vocabulary from the input (e.g. all documents are empty).
    """
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # BUG FIX: the input is a generator in both call sites. The previous
    # code re-ran fit_transform on it in each except-branch, but the first
    # attempt had already exhausted it, so every fallback saw an empty
    # iterable and raised again. Materialize it once so retries see the
    # same documents.
    documents = list(sentence_strings)

    # Progressively more permissive vectorizer settings; the strictest
    # configuration that can build a vocabulary wins. "char_wb" character
    # n-grams are the last resort for inputs with no usable word tokens.
    fallback_configs = (
        {"min_df": 2, "strip_accents": "unicode", "stop_words": "english"},
        {"strip_accents": "unicode", "stop_words": "english"},
        {},
        {"analyzer": "char_wb"},
    )
    X = None
    for attempt, config in enumerate(fallback_configs):
        try:
            X = TfidfVectorizer(**config).fit_transform(documents)
            break
        except ValueError:
            if attempt == len(fallback_configs) - 1:
                raise  # nothing more permissive left to try

    clusterizer = KMeans(
        n_clusters=n_clusters or int(X.shape[0] ** 0.5),  # default: sqrt of corpus size
        random_state=0,  # deterministic labels across runs
        n_init=10,
    ).fit(X)
    return clusterizer.labels_
138
+
139
+
140
def cluster_paths(paths, n_clusters=None):
    """Cluster similar-looking paths and return grouped metadata.

    With fewer than two paths there is nothing to cluster, so the input
    is returned unchanged; otherwise each path is converted to a sentence
    string, clustered, and grouped via group_paths().
    """
    # Guard clause: clustering needs at least two documents.
    if len(paths) < 2:
        return paths

    sentences = (strings.path_to_sentence(p) for p in paths)
    labels = find_clusters(n_clusters, sentences)
    return group_paths(paths, labels)
136
149
137
150
138
151
def cluster_dicts (args , media ):
139
152
if len (media ) < 2 :
140
153
return media
154
+
155
+ n_clusters = getattr (args , "clusters" , None )
156
+ search_columns = {
157
+ col
158
+ for _table , table_config in db_utils .config .items ()
159
+ if "search_columns" in table_config
160
+ for col in table_config ["search_columns" ]
161
+ }
162
+
141
163
media_keyed = {d ["path" ]: d for d in media }
142
- groups = cluster_paths ([d ["path" ] for d in media ], n_clusters = getattr (args , "clusters" , None ))
164
+ paths = [d ["path" ] for d in media ]
165
+ sentence_strings = (
166
+ strings .path_to_sentence (" " .join (str (v ) for k , v in d .items () if v and k in search_columns )) for d in media
167
+ )
168
+
169
+ clusters = find_clusters (n_clusters , sentence_strings )
170
+
171
+ if log .getEffectiveLevel () >= logging .DEBUG :
172
+ from pandas import DataFrame
173
+
174
+ eda .print_info (objects .NoneSpace (end_row = "inf" ), DataFrame (clusters ))
175
+
176
+ groups = group_paths (paths , clusters )
143
177
groups = sorted (groups , key = lambda d : (- len (d ["grouped_paths" ]), - len (d ["common_prefix" ])))
144
178
145
179
if getattr (args , "sort_groups_by" , None ) is not None :
0 commit comments