
Commit e4f2b85

Merge pull request #308 from dclong/dev
Merge dev into main
2 parents cb92d5e + a2e8ab6 commit e4f2b85

8 files changed: +200 additions, -34 deletions

dsutil/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 from . import git
 from . import poetry
 
-__version__ = "0.65.4"
+__version__ = "0.66.0"

dsutil/hadoop/log.py

Lines changed: 25 additions & 17 deletions
@@ -232,6 +232,31 @@ def _error_priority(line: str) -> tuple[int, str, str]:
     :return: The priority of the error line.
     """
     patterns = [
+        (
+            "(?i)libc.*not found", 1,
+            "http://www.legendu.net/misc/blog/spark-issue-libc-not-found/"
+        ),
+        (
+            r"(?i)ArrowInvalid", 1,
+            "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
+        ),
+        (
+            r"(?i)TypeError: .*has no len()", 1,
+            "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
+        ),
+        (
+            r"(?i)CalledProcessError: Command .* returned non-zero exit status", 1,
+            "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
+        ),
+        (
+            r"(?i)error: Found argument .* which wasn't expected", 1,
+            "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
+        ),
+        (
+            r"(?i)RuntimeError: Result vector from pandas_udf was not the required length",
+            1,
+            "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
+        ),
         (
             "(?i)object has no attribute", 1,
             "http://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
@@ -268,10 +293,6 @@ def _error_priority(line: str) -> tuple[int, str, str]:
             "(?i)table not found", 1,
             "http://www.legendu.net/misc/blog/spark-issue-table-not-found/"
         ),
-        (
-            "(?i)libc.*not found", 1,
-            "http://www.legendu.net/misc/blog/spark-issue-libc-not-found/"
-        ),
         (
             "(?i)SparkContext: A master URL must be set", 1,
             "http://www.legendu.net/misc/blog/spark-issue-a-master-url-must-be-set-in-your-configuration/"
@@ -303,19 +324,6 @@ def _error_priority(line: str) -> tuple[int, str, str]:
             r"(?i)IllegalArgumentException: System memory \d* must be at least", 1,
             "http://www.legendu.net/misc/blog/spark-issue:-IllegalArgumentException:-System-memory-must-be-at-least"
         ),
-        (
-            r"(?i)CalledProcessError: Command .* returned non-zero exit status", 1,
-            "http://www.legendu.net/misc/blog/spark-issue:-CalledProcessError:-Command-returned-non-zero-exit-status"
-        ),
-        (
-            r"(?i)error: Found argument .* which wasn't expected", 1,
-            "http://www.legendu.net/misc/blog/spark-issue:-error:-Found-argument-which-was-not-expected"
-        ),
-        (
-            r"(?i)RuntimeError: Result vector from pandas_udf was not the required length",
-            1,
-            "http://www.legendu.net/misc/blog/spark-issue:-RuntimeError:-Result-vector-of-pandas_udf-was-not-the-required-length"
-        ),
         (
             r"(?i)InvalidResourceRequestException", 1,
             "http://www.legendu.net/misc/blog/spark-issue:-InvalidResourceRequestException"

dsutil/hadoop/logf.py

Lines changed: 68 additions & 0 deletions
@@ -1,6 +1,7 @@
 """Script for fetch and filtering Spark application logs.
 """
 from typing import Optional
+from pathlib import Path
 import re
 from argparse import ArgumentParser, Namespace
 import subprocess as sp
@@ -54,6 +55,58 @@ def fetch(args):
     filter_(args)
 
 
+def status(args):
+    """Get status of a Spark application.
+
+    :param args: A Namespace object containing command-line options.
+    """
+    if "app_id" in args:
+        cmd = ["yarn", "application", "-status", args.app_id]
+        sp.run(cmd, check=True)
+        return
+    report = """Application Report :
+    Application-Id : {APP_ID}
+    Application-Name : {APP_NAME}
+    Application-Type : {APP_TYPE}
+    User : {USER}
+    Queue : {QUEUE}
+    Application Priority : {PRIORITY}
+    Start-Time : {START_TIME}
+    Finish-Time : {FINISH_TIME}
+    Progress : {PROGRESS}
+    State : {STATE}
+    Final-State : {STATUS}
+    Tracking-URL : {URL}
+    RPC Port : {PORT}
+    AM Host : {HOST}
+    Aggregate Resource Allocation : {RESOURCE}
+    Log Aggregation Status : {LOG_STATUS}
+    Diagnostics :
+    Unmanaged Application : {UNMANAGED}
+    Application Node Label Expression : {APP_NODE_LABEL}
+    AM container Node Label Expression : {CON_NODE_LABEL}
+    """
+    with args.log_file.open() as fin:
+        for line in fin:
+            if "{APP_ID}" in report:
+                match = re.search(r"(application_\d+_\d+)", line)
+                if match:
+                    report = report.replace("{APP_ID}", match.group())
+            if "{APP_NAME}" in report:
+                match = re.search(r"--primary-py-file (.*) ", line)
+                if match:
+                    report = report.replace("{APP_NAME}", match.group())
+            if "{USER}" in report:
+                match = re.search(r"local/usercache/(.*)/", line)
+                if match:
+                    report = report.replace("{USER}", match.group())
+            if "{STATUS}" in report:
+                match = re.search(r"Final app status: (.*),", line)
+                if match:
+                    report = report.replace("{STATUS}", match.group())
+    print(report)
+
+
 def parse_args(args=None, namespace=None) -> Namespace:
     """Parse command-line arguments.
 
@@ -66,9 +119,24 @@ def parse_args(args=None, namespace=None) -> Namespace:
     subparsers = parser.add_subparsers(help="Sub commands.")
     _subparser_fetch(subparsers)
     _subparser_filter(subparsers)
+    _subparser_status(subparsers)
     return parser.parse_args(args=args, namespace=namespace)
 
 
+def _subparser_status(subparsers):
+    subparser_status = subparsers.add_parser(
+        "status", help="filter key information from a Spark/Hive application log."
+    )
+    mutex_group = subparser_status.add_mutually_exclusive_group(required=True)
+    mutex_group.add_argument(
+        "-i", "--id", "--app-id", dest="app_id", type=str, help="An application ID."
+    )
+    mutex_group.add_argument(
+        "-l", "-f", "--log-file", dest="log_file", type=Path, help="An application ID."
+    )
+    subparser_status.set_defaults(func=status)
+
+
 def _option_filter(subparser) -> None:
     subparser.add_argument(
         "-k",

dsutil/hadoop/pyspark_submit.py

Lines changed: 20 additions & 10 deletions
@@ -319,14 +319,20 @@ def _submit_cluster(args, config: dict[str, Any]) -> bool:
     if not os.path.isfile(spark_submit):
         raise ValueError(f"{spark_submit} does not exist!")
     opts = (
-        "files", "master", "deploy-mode", "queue", "num-executors", "executor-memory",
-        "driver-memory", "executor-cores", "archives"
+        "files",
+        "master",
+        "deploy-mode",
+        "queue",
+        "num-executors",
+        "executor-memory",
+        "driver-memory",
+        "executor-cores",
+        "archives",
+        "jars",
     )
     lines = [config["spark-submit"]] + [
-        f"--{opt} {config[opt]}" for opt in opts if opt in config
+        f"--{opt} {config[opt]}" for opt in opts if opt in config and config[opt]
     ] + [f"--conf {k}={v}" for k, v in config["conf"].items()]
-    if config["jars"]:
-        lines.append(f"--jars {config['jars']}")
     lines.extend(args.cmd)
     for idx in range(1, len(lines)):
         lines[idx] = " " * 4 + lines[idx]
@@ -358,11 +364,14 @@ def submit(args: Namespace) -> None:
     config["python-local"] = args.python_local
     if "files" not in config:
         config["files"] = []
-    config["files"] = _files(config) + args.files
-    if "jars" not in config:
-        config["jars"] = ""
-    if isinstance(config["jars"], (list, tuple)):
-        config["jars"] = ",".join(config["jars"])
+    config["files"].extend(args.files)
+    config["files"] = _files(config)
+    if "archives" in config:
+        if isinstance(config["archives"], (list, tuple)):
+            config["archives"] = ",".join(config["archives"])
+    if "jars" in config:
+        if isinstance(config["jars"], (list, tuple)):
+            config["jars"] = ",".join(config["jars"])
     # submit Spark applications
     if _submit_local(args, config):
         _submit_cluster(args, config)
@@ -407,6 +416,7 @@ def parse_args(args=None, namespace=None) -> Namespace:
         help="Specify a path for generating a configration example."
     )
     mutex_group.add_argument(
+        "--cmd",
        dest="cmd",
        nargs="+",
        help="The command (of PySpark script) to submit to Spark to run."

dsutil/hash.py

Lines changed: 11 additions & 3 deletions
@@ -35,10 +35,18 @@ def _rmd5(path: Path, res: list[tuple[str]]) -> None:
     :param res: A list to record the result.
     """
     if path.is_file():
-        md5sum = hashlib.md5(path.read_bytes()).hexdigest()
+        try:
+            md5sum = hashlib.md5(path.read_bytes()).hexdigest()
+        except:
+            md5sum = "FAILED!"
         line = f"{str(path)}: {md5sum}"
         res.append(line)
         logger.info(line)
         return
-    for p in path.iterdir():
-        _rmd5(p, res)
+    try:
+        for p in path.iterdir():
+            _rmd5(p, res)
+    except:
+        line = f"{str(path)}: FAILED!"
+        res.append(line)
+        logger.info(line)
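With this change `_rmd5` no longer aborts the whole walk when a file or directory cannot be read; it records a `FAILED!` entry and continues. A small illustrative call follows; the public wrapper around `_rmd5` is not part of this diff, and the path is hypothetical.

```python
from pathlib import Path
from dsutil.hash import _rmd5

res = []  # collects "path: md5" lines, with "FAILED!" for unreadable entries
_rmd5(Path("/tmp"), res)
print("\n".join(res))
```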

dsutil/text.py

Lines changed: 72 additions & 0 deletions
@@ -6,6 +6,7 @@
 from typing import Union
 import sys
 from pathlib import Path
+import re
 from loguru import logger
 
 
@@ -182,3 +183,74 @@ def prune_json(input: Union[str, Path], output: Union[str, Path] = ""):
             else:
                 fout.write(line)
     logger.info("The pruned JSON file is written to {}.", output)
+
+
+def _filter_num(path: Union[str, Path], pattern: str, num_lines: int):
+    if isinstance(path, str):
+        path = Path(path)
+    results = []
+    res = []
+    count = 0
+    for line in path.open():
+        if count > 0:
+            res.append(line)
+            count -= 1
+            continue
+        if re.search(pattern, line):
+            if res:
+                results.append(res)
+                res = []
+            res.append(line)
+            count = num_lines
+    if res:
+        results.append(res)
+    return results
+
+
+def _filter_sp(path: Union[str, Path], pattern: str, sub_pattern: str):
+    if isinstance(path, str):
+        path = Path(path)
+    results = []
+    res = []
+    sub = False
+    for line in path.open():
+        if sub:
+            if re.search(sub_pattern, line):
+                res.append(line)
+            else:
+                sub = False
+        if re.search(pattern, line):
+            if res:
+                results.append(res)
+                res = []
+            res.append(line)
+            sub = True
+    if res:
+        results.append(res)
+    return results
+
+
+def filter(
+    path: Union[str, Path],
+    pattern: str,
+    sub_pattern: str = "",
+    num_lines: int = 0
+) -> list[list[str]]:
+    """Filter lines from a file.
+    A main regex pattern is used to identify main rows.
+    For each matched main row,
+    a sub regex pattern or a fixed number of lines can be provided.
+    If a sub regex pattern is provided,
+    then lines matching the sub regex pattern following a main line are kept together with the main line.
+    If a fixed number of lines is provided, e.g., ``num_lines=k``,
+    then ``k`` additional lines after a main line are kept together with the main line.
+
+    :param path: Path to a text file from which to filter lines.
+    :param pattern: The main regex pattern.
+    :param sub_pattern: The sub regex pattern (defaults to "", i.e., no sub pattern by default).
+    :param num_lines: The num of additional lines (0 by default) to keep after a main line.
+    :return: A list of list of lines.
+    """
+    if sub_pattern:
+        return _filter_sp(path, pattern=pattern, sub_pattern=sub_pattern)
+    return _filter_num(path, pattern=pattern, num_lines=num_lines)
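A short usage sketch for the new `dsutil.text.filter` function, following its docstring; the log file and regex patterns below are hypothetical.

```python
from dsutil import text

# keep every line containing "error" together with the 2 lines that follow it
groups = text.filter("app.log", pattern=r"(?i)error", num_lines=2)

# or keep each "error" line together with the stack-trace lines right after it
groups = text.filter("app.log", pattern=r"(?i)error", sub_pattern=r"^\s+(at |Caused by)")

for group in groups:
    print("".join(group), end="")
```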

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dsutil"
-version = "0.65.4"
+version = "0.66.0"
 description = "A utils Python package for data scientists."
 authors = ["Benjamin Du <[email protected]>"]
 

readme.md

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ Currently, Python 3.7 and 3.8 are supported.
 
 You can download a copy of the latest release and install it using pip.
 ```bash
-pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.65.4/dsutil-0.65.4-py3-none-any.whl
+pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.66.0/dsutil-0.66.0-py3-none-any.whl
 ```
 Or you can use the following command to install the latest master branch
 if you have pip 20.0+.
@@ -35,7 +35,7 @@ pip3 install --user -U git+https://github.com/dclong/dsutil@main
 Use one of the following commands if you want to install all components of dsutil.
 Available additional components are `cv`, `docker`, `pdf`, `jupyter`, `admin` and `all`.
 ```bash
-pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.65.4/dsutil-0.65.4-py3-none-any.whl"
+pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.66.0/dsutil-0.66.0-py3-none-any.whl"
 # or
 pip3 install --user -U "dsutil[all] @ git+https://github.com/dclong/dsutil@main"
 ```
