2.4.006

chapmanjacobd · chapmanjacobd · commit 2aa8d8eb2a17 · 2024-01-28T13:40:25.000-06:00
diff --git a/.github/README.md b/.github/README.md
@@ -95,7 +95,7 @@ To stop playing press Ctrl+C in either the terminal or mpv
 <details><summary>List all subcommands</summary>
 
     $ library
-    xk media library subcommands (v2.4.005)
+    xk media library subcommands (v2.4.006)
 
     Create database subcommands:
     ╭───────────────┬────────────────────────────────────────────────────╮
@@ -132,6 +132,8 @@ To stop playing press Ctrl+C in either the terminal or mpv
     ├───────────────┼─────────────────────────────────┤
     │ tubeupdate    │ Update online video media       │
     ├───────────────┼─────────────────────────────────┤
+    │ webupdate     │ Update open-directory media     │
+    ├───────────────┼─────────────────────────────────┤
     │ galleryupdate │ Update online gallery media     │
     ├───────────────┼─────────────────────────────────┤
     │ links-update  │ Update a link-scraping database │
@@ -258,11 +260,6 @@ To stop playing press Ctrl+C in either the terminal or mpv
     │ nouns          │ Unstructured text -> compound nouns (stdin) │
     ╰────────────────┴─────────────────────────────────────────────╯
 
-    Other subcommands:
-    ╭───────────┬───────────╮
-    │ webupdate │ webupdate │
-    ╰───────────┴───────────╯
-
 
 </details>
 
@@ -593,6 +590,8 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
 
     Scan open directories
 
+    library download open_dir.db --fs --prefix ~/d/dump/video/ --relative -vv -s factory -p
+
 
 
 </details>
@@ -916,6 +915,19 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
         lb dedupe-db video.db playlists --bk extractor_playlist_id
 
 
+</details>
+
+###### webupdate
+
+<details><summary>Update open-directory media</summary>
+
+    $ library webupdate -h
+    usage: library web-update DATABASE
+
+    Update saved open directories
+
+
+
 </details>
 
 ###### galleryupdate
@@ -2593,21 +2605,6 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
         library process-audio --split-longer-than 36mins audiobook.m4b audiobook2.mp3
 
 
-</details>
-
-### Other subcommands
-
-###### webupdate
-
-<details><summary>webupdate</summary>
-
-    $ library webupdate -h
-    usage: library web-update DATABASE
-
-    Update saved open directories
-
-
-
 </details>
 
 
diff --git a/pdm.lock b/pdm.lock
diff --git a/xklb/__init__.py b/xklb/__init__.py
@@ -1 +1 @@
-__version__ = "2.4.005"
+__version__ = "2.4.006"
diff --git a/xklb/dl_extract.py b/xklb/dl_extract.py
@@ -1,7 +1,7 @@
 import argparse, os, sys
 from typing import List, Tuple
 
-from xklb import gdl_backend, tube_backend, usage
+from xklb import db_media, gdl_backend, tube_backend, usage
 from xklb.media import media_printer
 from xklb.utils import arg_utils, consts, db_utils, iterables, nums, objects, printing, processes, sql_utils, web
 from xklb.utils.consts import SC, DBType
@@ -71,6 +71,7 @@ def parse_args():
     parser.add_argument("--subtitle-languages", "--subtitle-language", "--sl", action=arg_utils.ArgparseList)
 
     parser.add_argument("--prefix", default=os.getcwd(), help=argparse.SUPPRESS)
+    parser.add_argument("--relative", action="store_true", help="Replicate website file tree")
     parser.add_argument("--ext")
 
     parser.add_argument("--print", "-p", default="", const="p", nargs="?", help=argparse.SUPPRESS)
@@ -340,7 +341,8 @@ def dl_download(args=None) -> None:
             elif args.profile == DBType.image:
                 gdl_backend.download(args, m)
             elif args.profile == DBType.filesystem:
-                web.download_url(m["path"], output_prefix=args.prefix)
+                local_path = web.download_url(m["path"], output_prefix=args.prefix, relative=args.relative)
+                db_media.download_add(args, m["path"], {}, local_path)
             else:
                 raise NotImplementedError
         except Exception:
diff --git a/xklb/lb.py b/xklb/lb.py
@@ -25,6 +25,7 @@
     "Update database subcommands": {
         "fsupdate": "Update local media",
         "tubeupdate": "Update online video media",
+        "webupdate": "Update open-directory media",
         "galleryupdate": "Update online gallery media",
         "links_update": "Update a link-scraping database",
         "redditupdate": "Update reddit media",
@@ -223,6 +224,7 @@ def add_parser(subparsers, func, aliases=None):
     add_parser(subparsers, "xklb.scripts.search_db.search_db", ["s", "sdb", "search-dbs"])
     add_parser(subparsers, "xklb.scripts.streaming_tab_loader.streaming_tab_loader", ["surf"])
     add_parser(subparsers, "xklb.scripts.web_add.web_add", ["web-dir-add"])
+    add_parser(subparsers, "xklb.scripts.web_update.web_update", ["web-dir-update"])
     add_parser(subparsers, "xklb.search.search", ["sc", "search-captions"])
     add_parser(subparsers, "xklb.site_extract.site_add", ["sa", "sql-site", "site-sql"])
     add_parser(subparsers, "xklb.tabs_actions.tabs", ["tb"])
diff --git a/xklb/scripts/web_add.py b/xklb/scripts/web_add.py
@@ -214,6 +214,10 @@ def spider(args, paths: Set):
         traversed_paths.add(path)
         log.info("Loading %s", path)
 
+        printing.print_overwrite(
+            f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]"
+        )
+
         if path.endswith("/"):
             for a_ref in get_urls(args, path):
                 if a_ref is None:
@@ -243,34 +247,46 @@ def spider(args, paths: Set):
                 else:
                     new_paths[path] = None  # add key to map; title: None
 
-        printing.print_overwrite(
-            f"Pages to scan {len(paths)} link scan: {len(new_paths)} new [{len(known_paths)} known]"
-        )
-
         media = [{"path": k, "title": v} for k, v in new_paths.items()]
-        for m in media:
+        new_media_count += len(media)
+        for i, m in enumerate(media, start=1):
+            printing.print_overwrite(
+                f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; basic metadata {i} of {len(media)}"
+            )
+
             m |= web.stat(m["path"])
-            m["type"] = file_utils.mimetype(path)
+            m["type"] = file_utils.mimetype(m["path"])
 
             if getattr(args, "hash", False):
                 # TODO: use head_foot_stream
                 m["hash"] = sample_hash.sample_hash_file(path)
 
-        if args.profile in [DBType.audio, DBType.video]:
-            for m in media:
+        for i, m in enumerate(media, start=1):
+            printing.print_overwrite(
+                f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; {args.profile} metadata {i} of {len(media)}"
+            )
+
+            extension = m["path"].rsplit(".", 1)[-1].lower()
+            remote_path = m["path"]
+            if args.profile == DBType.video and extension in consts.VIDEO_EXTENSIONS:
+                m |= av.munge_av_tags(args, m["path"])
+            elif args.profile == DBType.audio and extension in consts.AUDIO_ONLY_EXTENSIONS:
                 m |= av.munge_av_tags(args, m["path"])
-        elif args.profile == DBType.text:
-            for m in media:
+            elif args.profile == DBType.text and extension in consts.TEXTRACT_EXTENSIONS:
                 with web.PartialContent(m["path"]) as temp_file_path:
                     m |= books.munge_book_tags_fast(temp_file_path)
-        elif args.profile == DBType.image:
-            for m in media:
+            elif args.profile == DBType.image and extension in consts.IMAGE_EXTENSIONS:
                 with web.PartialContent(m["path"], max_size=32 * 1024) as temp_file_path:
                     m |= books.extract_image_metadata_chunk([{"path": temp_file_path}])[0]
+            m["path"] = remote_path  # required for temp file extraction
 
         if media:
             add_media(args, media)
-            new_media_count += len(media)
+
+        printing.print_overwrite(
+            f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]"
+        )
+
     return new_media_count
 
 
@@ -342,7 +358,7 @@ def web_update(args=None) -> None:
             extractor_config = json.loads(playlist.get("extractor_config") or "{}")
             args_env = arg_utils.override_config(parser, extractor_config, args)
 
-            # TODO: use directory Last-Modified header to skip file trees which don't need to be scanned
+            # TODO: use directory Last-Modified header to skip file trees which don't need to be updated
             new_media = spider(args_env, {playlist["path"]})
 
             if new_media > 0:
diff --git a/xklb/usage.py b/xklb/usage.py
@@ -1687,6 +1687,8 @@ def play(action) -> str:
 
     Scan open directories
 
+    library download open_dir.db --fs --prefix ~/d/dump/video/ --relative -vv -s factory -p
+
 """
 
 webupdate = """library web-update DATABASE
diff --git a/xklb/utils/file_utils.py b/xklb/utils/file_utils.py
@@ -6,6 +6,8 @@
 from shutil import which
 from typing import List, Optional, Set, Tuple, Union
 
+import urllib3
+
 from xklb.utils import consts, file_utils, printing, processes, web
 from xklb.utils.log_utils import log
 
@@ -269,9 +271,12 @@ def head_foot_stream(url, head_len, foot_len):
 
     head_bytes = head_response.raw.read(head_len)
 
-    foot_response = web.session.get(url, stream=True, headers={"Range": f"bytes={-foot_len}"})
+    foot_response = web.session.get(url, stream=True, headers={"Range": f"bytes=-{foot_len}"})
     foot_response.raw.decode_content = True
-    foot_bytes = foot_response.raw.read(foot_len)
+    try:
+        foot_bytes = foot_response.raw.read(foot_len)
+    except urllib3.exceptions.DecodeError:
+        foot_bytes = b""
 
     stream = io.BytesIO(head_bytes + foot_bytes)
     return stream
@@ -333,8 +338,12 @@ def mimetype(path):
                 file_type = "block device"
             elif p.is_char_device():
                 file_type = "char device"
-            elif Path(path).stat().st_size == 0:
-                file_type = "empty file"
+            try:
+                if Path(path).stat().st_size == 0:
+                    file_type = "empty file"
+            except Exception:
+                pass
+
         except FileNotFoundError:
             return
 
@@ -427,12 +436,12 @@ def read_file_to_dataframes(
         "netcdf",
         "application/x-netcdf",
     ):
-        import xarray as xr
+        import xarray as xr  # type: ignore
 
         ds = xr.open_dataset(path)
         dfs = [ds.to_dataframe()]
     elif mimetype in ("zarr",):
-        import xarray as xr
+        import xarray as xr  # type: ignore
 
         ds = xr.open_zarr(path)
         dfs = [ds.to_dataframe()]
diff --git a/xklb/utils/web.py b/xklb/utils/web.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "2.4.005"` |
|  | `1` | `+__version__ = "2.4.006"` |