Skip to content

Commit 2aa8d8e

Browse files
committed
2.4.006
1 parent f64a343 commit 2aa8d8e

File tree

9 files changed

+128
-82
lines changed

9 files changed

+128
-82
lines changed

.github/README.md

+18-21
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,7 @@ To stop playing press Ctrl+C in either the terminal or mpv
9595
<details><summary>List all subcommands</summary>
9696

9797
$ library
98-
xk media library subcommands (v2.4.005)
98+
xk media library subcommands (v2.4.006)
9999

100100
Create database subcommands:
101101
╭───────────────┬────────────────────────────────────────────────────╮
@@ -132,6 +132,8 @@ To stop playing press Ctrl+C in either the terminal or mpv
132132
├───────────────┼─────────────────────────────────┤
133133
│ tubeupdate │ Update online video media │
134134
├───────────────┼─────────────────────────────────┤
135+
│ webupdate │ Update open-directory media │
136+
├───────────────┼─────────────────────────────────┤
135137
│ galleryupdate │ Update online gallery media │
136138
├───────────────┼─────────────────────────────────┤
137139
│ links-update │ Update a link-scraping database │
@@ -258,11 +260,6 @@ To stop playing press Ctrl+C in either the terminal or mpv
258260
│ nouns │ Unstructured text -> compound nouns (stdin) │
259261
╰────────────────┴─────────────────────────────────────────────╯
260262

261-
Other subcommands:
262-
╭───────────┬───────────╮
263-
│ webupdate │ webupdate │
264-
╰───────────┴───────────╯
265-
266263

267264
</details>
268265

@@ -593,6 +590,8 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
593590

594591
Scan open directories
595592

593+
library download open_dir.db --fs --prefix ~/d/dump/video/ --relative -vv -s factory -p
594+
596595

597596

598597
</details>
@@ -916,6 +915,19 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
916915
lb dedupe-db video.db playlists --bk extractor_playlist_id
917916

918917

918+
</details>
919+
920+
###### webupdate
921+
922+
<details><summary>Update open-directory media</summary>
923+
924+
$ library webupdate -h
925+
usage: library web-update DATABASE
926+
927+
Update saved open directories
928+
929+
930+
919931
</details>
920932

921933
###### galleryupdate
@@ -2593,21 +2605,6 @@ BTW, for some cols like time_deleted you'll need to specify a where clause so th
25932605
library process-audio --split-longer-than 36mins audiobook.m4b audiobook2.mp3
25942606

25952607

2596-
</details>
2597-
2598-
### Other subcommands
2599-
2600-
###### webupdate
2601-
2602-
<details><summary>webupdate</summary>
2603-
2604-
$ library webupdate -h
2605-
usage: library web-update DATABASE
2606-
2607-
Update saved open directories
2608-
2609-
2610-
26112608
</details>
26122609

26132610

pdm.lock

+5-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

xklb/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "2.4.005"
1+
__version__ = "2.4.006"

xklb/dl_extract.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import argparse, os, sys
22
from typing import List, Tuple
33

4-
from xklb import gdl_backend, tube_backend, usage
4+
from xklb import db_media, gdl_backend, tube_backend, usage
55
from xklb.media import media_printer
66
from xklb.utils import arg_utils, consts, db_utils, iterables, nums, objects, printing, processes, sql_utils, web
77
from xklb.utils.consts import SC, DBType
@@ -71,6 +71,7 @@ def parse_args():
7171
parser.add_argument("--subtitle-languages", "--subtitle-language", "--sl", action=arg_utils.ArgparseList)
7272

7373
parser.add_argument("--prefix", default=os.getcwd(), help=argparse.SUPPRESS)
74+
parser.add_argument("--relative", action="store_true", help="Replicate website file tree")
7475
parser.add_argument("--ext")
7576

7677
parser.add_argument("--print", "-p", default="", const="p", nargs="?", help=argparse.SUPPRESS)
@@ -340,7 +341,8 @@ def dl_download(args=None) -> None:
340341
elif args.profile == DBType.image:
341342
gdl_backend.download(args, m)
342343
elif args.profile == DBType.filesystem:
343-
web.download_url(m["path"], output_prefix=args.prefix)
344+
local_path = web.download_url(m["path"], output_prefix=args.prefix, relative=args.relative)
345+
db_media.download_add(args, m["path"], {}, local_path)
344346
else:
345347
raise NotImplementedError
346348
except Exception:

xklb/lb.py

+2
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
"Update database subcommands": {
2626
"fsupdate": "Update local media",
2727
"tubeupdate": "Update online video media",
28+
"webupdate": "Update open-directory media",
2829
"galleryupdate": "Update online gallery media",
2930
"links_update": "Update a link-scraping database",
3031
"redditupdate": "Update reddit media",
@@ -223,6 +224,7 @@ def add_parser(subparsers, func, aliases=None):
223224
add_parser(subparsers, "xklb.scripts.search_db.search_db", ["s", "sdb", "search-dbs"])
224225
add_parser(subparsers, "xklb.scripts.streaming_tab_loader.streaming_tab_loader", ["surf"])
225226
add_parser(subparsers, "xklb.scripts.web_add.web_add", ["web-dir-add"])
227+
add_parser(subparsers, "xklb.scripts.web_update.web_update", ["web-dir-update"])
226228
add_parser(subparsers, "xklb.search.search", ["sc", "search-captions"])
227229
add_parser(subparsers, "xklb.site_extract.site_add", ["sa", "sql-site", "site-sql"])
228230
add_parser(subparsers, "xklb.tabs_actions.tabs", ["tb"])

xklb/scripts/web_add.py

+30-14
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,10 @@ def spider(args, paths: Set):
214214
traversed_paths.add(path)
215215
log.info("Loading %s", path)
216216

217+
printing.print_overwrite(
218+
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]"
219+
)
220+
217221
if path.endswith("/"):
218222
for a_ref in get_urls(args, path):
219223
if a_ref is None:
@@ -243,34 +247,46 @@ def spider(args, paths: Set):
243247
else:
244248
new_paths[path] = None # add key to map; title: None
245249

246-
printing.print_overwrite(
247-
f"Pages to scan {len(paths)} link scan: {len(new_paths)} new [{len(known_paths)} known]"
248-
)
249-
250250
media = [{"path": k, "title": v} for k, v in new_paths.items()]
251-
for m in media:
251+
new_media_count += len(media)
252+
for i, m in enumerate(media, start=1):
253+
printing.print_overwrite(
254+
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; basic metadata {i} of {len(media)}"
255+
)
256+
252257
m |= web.stat(m["path"])
253-
m["type"] = file_utils.mimetype(path)
258+
m["type"] = file_utils.mimetype(m["path"])
254259

255260
if getattr(args, "hash", False):
256261
# TODO: use head_foot_stream
257262
m["hash"] = sample_hash.sample_hash_file(path)
258263

259-
if args.profile in [DBType.audio, DBType.video]:
260-
for m in media:
264+
for i, m in enumerate(media, start=1):
265+
printing.print_overwrite(
266+
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; {args.profile} metadata {i} of {len(media)}"
267+
)
268+
269+
extension = m["path"].rsplit(".", 1)[-1].lower()
270+
remote_path = m["path"]
271+
if args.profile == DBType.video and extension in consts.VIDEO_EXTENSIONS:
272+
m |= av.munge_av_tags(args, m["path"])
273+
elif args.profile == DBType.audio and extension in consts.AUDIO_ONLY_EXTENSIONS:
261274
m |= av.munge_av_tags(args, m["path"])
262-
elif args.profile == DBType.text:
263-
for m in media:
275+
elif args.profile == DBType.text and extension in consts.TEXTRACT_EXTENSIONS:
264276
with web.PartialContent(m["path"]) as temp_file_path:
265277
m |= books.munge_book_tags_fast(temp_file_path)
266-
elif args.profile == DBType.image:
267-
for m in media:
278+
elif args.profile == DBType.image and extension in consts.IMAGE_EXTENSIONS:
268279
with web.PartialContent(m["path"], max_size=32 * 1024) as temp_file_path:
269280
m |= books.extract_image_metadata_chunk([{"path": temp_file_path}])[0]
281+
m["path"] = remote_path # required for temp file extraction
270282

271283
if media:
272284
add_media(args, media)
273-
new_media_count += len(media)
285+
286+
printing.print_overwrite(
287+
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]"
288+
)
289+
274290
return new_media_count
275291

276292

@@ -342,7 +358,7 @@ def web_update(args=None) -> None:
342358
extractor_config = json.loads(playlist.get("extractor_config") or "{}")
343359
args_env = arg_utils.override_config(parser, extractor_config, args)
344360

345-
# TODO: use directory Last-Modified header to skip file trees which don't need to be scanned
361+
# TODO: use directory Last-Modified header to skip file trees which don't need to be updated
346362
new_media = spider(args_env, {playlist["path"]})
347363

348364
if new_media > 0:

xklb/usage.py

+2
Original file line number | Diff line number | Diff line change
@@ -1687,6 +1687,8 @@ def play(action) -> str:
16871687
16881688
Scan open directories
16891689
1690+
library download open_dir.db --fs --prefix ~/d/dump/video/ --relative -vv -s factory -p
1691+
16901692
"""
16911693

16921694
webupdate = """library web-update DATABASE

xklb/utils/file_utils.py

+15-6
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
from shutil import which
77
from typing import List, Optional, Set, Tuple, Union
88

9+
import urllib3
10+
911
from xklb.utils import consts, file_utils, printing, processes, web
1012
from xklb.utils.log_utils import log
1113

@@ -269,9 +271,12 @@ def head_foot_stream(url, head_len, foot_len):
269271

270272
head_bytes = head_response.raw.read(head_len)
271273

272-
foot_response = web.session.get(url, stream=True, headers={"Range": f"bytes={-foot_len}"})
274+
foot_response = web.session.get(url, stream=True, headers={"Range": f"bytes=-{foot_len}"})
273275
foot_response.raw.decode_content = True
274-
foot_bytes = foot_response.raw.read(foot_len)
276+
try:
277+
foot_bytes = foot_response.raw.read(foot_len)
278+
except urllib3.exceptions.DecodeError:
279+
foot_bytes = b""
275280

276281
stream = io.BytesIO(head_bytes + foot_bytes)
277282
return stream
@@ -333,8 +338,12 @@ def mimetype(path):
333338
file_type = "block device"
334339
elif p.is_char_device():
335340
file_type = "char device"
336-
elif Path(path).stat().st_size == 0:
337-
file_type = "empty file"
341+
try:
342+
if Path(path).stat().st_size == 0:
343+
file_type = "empty file"
344+
except Exception:
345+
pass
346+
338347
except FileNotFoundError:
339348
return
340349

@@ -427,12 +436,12 @@ def read_file_to_dataframes(
427436
"netcdf",
428437
"application/x-netcdf",
429438
):
430-
import xarray as xr
439+
import xarray as xr # type: ignore
431440

432441
ds = xr.open_dataset(path)
433442
dfs = [ds.to_dataframe()]
434443
elif mimetype in ("zarr",):
435-
import xarray as xr
444+
import xarray as xr # type: ignore
436445

437446
ds = xr.open_zarr(path)
438447
dfs = [ds.to_dataframe()]

0 commit comments

Comments
 (0)