1 | 1 | """Utils functions for Hadoop. |
2 | 2 | """ |
3 | 3 | from __future__ import annotations |
4 | | -from typing import Union |
| 4 | +from typing import Optional, Union |
5 | 5 | import sys |
6 | 6 | import datetime |
7 | 7 | from pyspark.sql import DataFrame, Window |
8 | | -from pyspark.sql.functions import col, spark_partition_id, rank, coalesce, lit, max, sum |
| 8 | +import pyspark.sql.functions as sf |
9 | 9 |
10 | 10 |
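Presumably one motivation for the aliased import: the old from-import shadowed the Python builtins max and sum at module level, and the new repart_hdfs below relies on the builtin max(num_parts, min_num_parts). A minimal sketch of the distinction, illustrative only and assuming nothing beyond a local SparkSession:

# Illustrative sketch, not part of the diff: with the aliased import, the Python
# builtins max/sum stay available while the Spark aggregates live under sf.
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.master("local[1]").appName("alias-demo").getOrCreate()

num_parts = max(4, 1)  # builtin max on plain ints -> 4

frame = spark.createDataFrame([(1,), (2,), (3,)], ["x"])
frame.agg(sf.max("x").alias("max_x"), sf.sum("x").alias("sum_x")).show()  # Spark aggregates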
11 | 11 | def sample( |
@@ -50,55 +50,81 @@ def calc_global_rank(frame: DataFrame, order_by: Union[str, list[str]]) -> DataF |
50 | 50 | # calculate local rank |
51 | 51 | wspec1 = Window.partitionBy("part_id").orderBy(*order_by) |
52 | 52 | frame_local_rank = frame.orderBy(order_by).withColumn( |
53 | | - "part_id", spark_partition_id() |
| 53 | + "part_id", sf.spark_partition_id() |
54 | 54 | ).withColumn("local_rank", |
55 | | - rank().over(wspec1)).persist() |
| 55 | + sf.rank().over(wspec1)).persist() |
56 | 56 | # calculate accumulative rank |
57 | 57 | wspec2 = Window.orderBy("part_id").rowsBetween( |
58 | 58 | Window.unboundedPreceding, Window.currentRow |
59 | 59 | ) |
60 | 60 | stat = frame_local_rank.groupBy("part_id").agg( |
61 | | - max("local_rank").alias("max_rank") |
| 61 | + sf.max("local_rank").alias("max_rank") |
62 | 62 | ).withColumn("cum_rank", |
63 | | - sum("max_rank").over(wspec2)) |
| 63 | + sf.sum("max_rank").over(wspec2)) |
64 | 64 | # self join and shift 1 row to get sum factor |
65 | 65 | stat2 = stat.alias("l").join( |
66 | 66 | stat.alias("r"), |
67 | | - col("l.part_id") == col("r.part_id") + 1, "left_outer" |
68 | | - ).select(col("l.part_id"), |
69 | | - coalesce(col("r.cum_rank"), lit(0)).alias("sum_factor")) |
| 67 | + sf.col("l.part_id") == sf.col("r.part_id") + 1, "left_outer" |
| 68 | + ).select( |
| 69 | + sf.col("l.part_id"), |
| 70 | + sf.coalesce(sf.col("r.cum_rank"), sf.lit(0)).alias("sum_factor") |
| 71 | + ) |
70 | 72 | return frame_local_rank.join( |
71 | 73 | #broadcast(stat2), |
72 | 74 | stat2, |
73 | 75 | ["part_id"], |
74 | 76 | ).withColumn("rank", |
75 | | - col("local_rank") + col("sum_factor")) |
| 77 | + sf.col("local_rank") + sf.col("sum_factor")) |
76 | 78 |
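For context, a usage sketch of calc_global_rank; the module name hadoop_utils and the sample data are placeholders, not taken from the diff:

from pyspark.sql import SparkSession
from hadoop_utils import calc_global_rank  # hypothetical module name

spark = SparkSession.builder.master("local[2]").appName("rank-demo").getOrCreate()
frame = spark.createDataFrame(
    [("a", 30), ("b", 10), ("c", 20), ("d", 10)], ["key", "score"]
)

# Each row's global rank = its rank within its partition ("local_rank")
# plus the cumulative max rank of all earlier partitions ("sum_factor").
ranked = calc_global_rank(frame, order_by=["score"])
ranked.select("key", "score", "part_id", "local_rank", "rank").show()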
77 | 79 |
78 | | -def repart_hdfs(spark, path: str, num_parts: int, coalesce: bool = False) -> None: |
| 80 | +def repart_hdfs( |
| 81 | + spark, |
| 82 | + src_path: str, |
| 83 | + dst_path: str = "", |
| 84 | + num_parts: Optional[int] = None, |
| 85 | + mb_per_part: float = 64, |
| 86 | + min_num_parts: int = 1, |
| 87 | + coalesce: bool = False |
| 88 | +) -> None: |
79 | 89 | """Repartition a HDFS path of the Parquet format. |
80 | 90 |
81 | 91 | :param spark: A SparkSession object. |
82 | | - :param path: The HDFS path to repartition.
83 | | - :param num_parts: The new number of partitions.
| 92 | + :param src_path: The source HDFS path (Parquet) to repartition; the result goes to dst_path if given, otherwise src_path is replaced in place.
| 93 | + :param num_parts: The new number of partitions; if None, it is derived from the data size so that each partition holds roughly mb_per_part MB, with at least min_num_parts partitions.
84 | 94 | :param coalesce: If True, use coalesce instead of repartition. |
85 | 95 | """ |
86 | | - path = path.rstrip("/") |
87 | | - ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") |
88 | | - path_tmp = path + f"_repart_tmp_{ts}" |
| 96 | + sc = spark.sparkContext |
| 97 | + hdfs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration()) # pylint: disable=W0212 |
| 98 | + src_path = src_path.rstrip("/") |
| 99 | + src_path_hdfs = sc._jvm.org.apache.hadoop.fs.Path(src_path) # pylint: disable=W0212 |
| 100 | + # num of partitions |
| 101 | + if num_parts is None: |
| 102 | + bytes_path = hdfs.getContentSummary(src_path_hdfs).getLength() |
| 103 | + num_parts = round(bytes_path / 1_048_576 / mb_per_part) |
| 104 | + num_parts = max(num_parts, min_num_parts) |
| 105 | + # temp path for repartitioned table |
| 106 | + if dst_path == src_path: |
| 107 | + dst_path = "" |
| 108 | + if dst_path: |
| 109 | + path_tmp = dst_path |
| 110 | + else: |
| 111 | + ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") |
| 112 | + path_tmp = src_path + f"_repart_tmp_{ts}" |
| 113 | + # repartition |
89 | 114 | if coalesce: |
90 | | - spark.read.parquet(path).coalesce(num_parts) \ |
| 115 | + spark.read.parquet(src_path).coalesce(num_parts) \ |
91 | 116 | .write.mode("overwrite").parquet(path_tmp) |
92 | 117 | else: |
93 | | - spark.read.parquet(path).repartition(num_parts) \ |
| 118 | + spark.read.parquet(src_path).repartition(num_parts) \ |
94 | 119 | .write.mode("overwrite").parquet(path_tmp) |
95 | | - sc = spark.sparkContext |
96 | | - fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration()) # pylint: disable=W0212 |
97 | | - if fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True): # pylint: disable=W0212 |
98 | | - if not fs.rename( |
| 120 | + # path_tmp --> src_path |
| 121 | + if dst_path: |
| 122 | + return |
| 123 | + if hdfs.delete(src_path_hdfs, True): |
| 124 | + if not hdfs.rename( |
99 | 125 | sc._jvm.org.apache.hadoop.fs.Path(path_tmp), # pylint: disable=W0212 |
100 | | - sc._jvm.org.apache.hadoop.fs.Path(path), # pylint: disable=W0212 |
| 126 | + src_path_hdfs,
101 | 127 | ): |
102 | | - sys.exit(f"Failed to rename the HDFS path {path_tmp} to {path}!") |
| 128 | + sys.exit(f"Failed to rename the HDFS path {path_tmp} to {src_path}!") |
103 | 129 | else: |
104 | | - sys.exit(f"Failed to remove the (old) HDFS path: {path}!") |
| 130 | + sys.exit(f"Failed to remove the (old) HDFS path: {src_path}!") |
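A usage sketch of the new repart_hdfs signature; the module name and HDFS paths are placeholders, and the calls assume the source directory already holds Parquet data:

from pyspark.sql import SparkSession
from hadoop_utils import repart_hdfs  # hypothetical module name

spark = SparkSession.builder.appName("repart-demo").getOrCreate()

# In-place compaction: the partition count is derived from the directory size,
# targeting roughly 64 MB per output file (and at least one file).
repart_hdfs(spark, "/user/example/events")

# Explicit partition count, written to a separate destination path; the source
# path is left untouched when dst_path is given.
repart_hdfs(
    spark,
    src_path="/user/example/events",
    dst_path="/user/example/events_16p",
    num_parts=16,
    coalesce=True,  # coalesce avoids a full shuffle when only reducing partitions
)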