Skip to content

Commit 040f1b1

Browse files
committed
webtext: --save flag
1 parent 8eb827b commit 040f1b1

File tree

3 files changed

+47
-24
lines changed

3 files changed

+47
-24
lines changed

pdm.lock

+21-21
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

xklb/scripts/mining/extract_text.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import argparse, re
2+
from pathlib import Path
23

34
from bs4 import BeautifulSoup, NavigableString
45

@@ -10,6 +11,7 @@
1011
def parse_args():
1112
parser = argparse.ArgumentParser(prog="library extract-text", usage=usage.extract_text)
1213
parser.add_argument("--skip-links", action="store_true")
14+
parser.add_argument("--save", action="store_true")
1315

1416
parser.add_argument("--cookies", help="path to a Netscape formatted cookies file")
1517
parser.add_argument("--cookies-from-browser", metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]")
@@ -99,12 +101,21 @@ def extract_text() -> None:
99101
web.load_selenium(args)
100102
try:
101103
for url in arg_utils.gen_paths(args):
104+
output_lines = []
102105
for s in iterables.return_unique(get_text)(args, url):
103106
if s is None:
104107
break
105108

106-
printing.pipe_print(s)
109+
if args.save:
110+
output_lines.append(s)
111+
else:
112+
printing.pipe_print(s)
107113

114+
if args.save:
115+
save_path = web.url_to_local_path(url)
116+
Path(save_path).parent.mkdir(exist_ok=True, parents=True)
117+
with open(save_path, "w") as f:
118+
f.writelines(s + "\n" for s in output_lines)
108119
finally:
109120
if args.selenium:
110121
web.quit_selenium(args)

xklb/utils/web.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,19 @@ def quit_selenium(args):
305305
pass
306306

307307

308-
def set_output_path(url, output_path, output_prefix, relative, response):
308+
def url_to_local_path(url):
    """Derive a relative local file path from *url*.

    The result has the shape ``<netloc>/<url path>`` and is passed through
    ``path_utils.clean_path`` (as bytes, matching the existing call
    convention) before being returned.

    Compared with splitting the raw URL on ``/``:

    * the file name is taken from the URL *path*, so a ``/`` inside the query
      string or fragment can no longer be mistaken for a path separator;
    * the query string and fragment are still appended to the file name, so
      URLs that differ only in those parts keep mapping to different files;
    * a URL whose path is empty or ends in ``/`` gets the conventional
      ``index.html`` file name instead of an empty one (which would make the
      result point at a directory);
    * input without a network location no longer yields a path starting with
      ``/``, so the result always stays relative.

    Args:
        url: The URL (or URL-like string) to convert.

    Returns:
        Whatever ``path_utils.clean_path`` returns for the encoded path.
    """
    parsed_url = urlparse(url)

    # Strip leading slashes so a missing netloc cannot produce an absolute path.
    relative_path = (parsed_url.netloc + "/" + parsed_url.path.lstrip("/")).lstrip("/")
    base_path, filename = os.path.split(relative_path)

    if not filename:
        # Directory-style URL ("https://host/dir/") or bare host.
        filename = "index.html"

    # Keep query/fragment for uniqueness, but do not let them add directories.
    if parsed_url.query:
        filename += "?" + parsed_url.query.replace("/", "_")
    if parsed_url.fragment:
        filename += "#" + parsed_url.fragment.replace("/", "_")

    output_path = os.path.join(base_path, filename)
    # NOTE(review): clean_path is handed bytes, as before; presumably it
    # sanitises characters such as "?" and "#" -- confirm against path_utils.
    output_path = path_utils.clean_path(output_path.encode())
    return output_path
318+
319+
320+
def gen_output_path_from_response(url, output_path, output_prefix, relative, response):
309321
if output_path is None:
310322
content_d = response.headers.get("Content-Disposition")
311323
if content_d:
@@ -341,7 +353,7 @@ def download_url(
341353

342354
remote_size = nums.safe_int(r.headers.get("Content-Length"))
343355

344-
output_path = set_output_path(url, output_path, output_prefix, relative, r)
356+
output_path = gen_output_path_from_response(url, output_path, output_prefix, relative, r)
345357
if output_path == ".":
346358
log.warning("Skipping directory %s", url)
347359
return

0 commit comments

Comments
 (0)