Commit 3a0126d

Merge pull request #336 from dclong/dev
merge dev into main
2 parents 49cb7eb + 3c149db commit 3a0126d

File tree

7 files changed: +83 -31 lines changed

dsutil/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 from . import git
 from . import poetry
 
-__version__ = "0.72.1"
+__version__ = "0.73.0"

dsutil/hadoop/log.py

Lines changed: 27 additions & 3 deletions
@@ -249,6 +249,14 @@ def _error_priority(line: str) -> tuple[int, str, str]:
     r"(?i)ArrowInvalid", 1,
     "https://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
 ),
+(
+    r"(?i)ArrowTypeError: Expected", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-ArrowTypeError:-Expect-a-type-but-got-a-different-type"
+),
+(
+    r"(?i)Arrow legacy IPC format is not supported", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-RuntimeError:-Arrow-legacy-IPC-format-is-not-supported"
+),
 (
     r"(?i)TypeError: .*has no len()", 1,
     "https://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
@@ -270,6 +278,14 @@ def _error_priority(line: str) -> tuple[int, str, str]:
     1,
     "https://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
 ),
+(
+    "(?i)ViewFs: Cannot initialize: Empty Mount table in config", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-ViewFs:-Cannot-initialize:-Empty-Mount-table-in-config"
+),
+(
+    "(?i)IllegalArgumentException: Wrong FS", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-IllegalArgumentException:-Wrong-FS"
+),
 (
     "(?i)object has no attribute", 1,
     "https://www.legendu.net/misc/blog/Spark-issue:-Pure-Python-code-errors"
@@ -321,11 +337,11 @@ def _error_priority(line: str) -> tuple[int, str, str]:
 ),
 (
     "(?i)URISyntaxException", 1,
-    "http://www.legendu.net/misc/blog/spark-issue:-URISyntaxException"
+    "https://www.legendu.net/misc/blog/spark-issue:-URISyntaxException"
 ),
 (
     "(?i)Could not find any configured addresses for URI", 1,
-    "http://www.legendu.net/misc/blog/spark-issue:-RuntimeException:-Could-not-find-any-configured-addresses-for-URI"
+    "https://www.legendu.net/misc/blog/spark-issue:-RuntimeException:-Could-not-find-any-configured-addresses-for-URI"
 ),
 (
     "(?i)table not found", 1,
@@ -366,9 +382,17 @@ def _error_priority(line: str) -> tuple[int, str, str]:
     r"(?i)InvalidResourceRequestException", 1,
     "https://www.legendu.net/misc/blog/spark-issue:-InvalidResourceRequestException"
 ),
+(
+    r"(?i)The quota system is disabled", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-getQuotaUsage"
+),
+(
+    r"(?i)AnalysisException: Found duplicate column(s)", 1,
+    "https://www.legendu.net/misc/blog/spark-issue:-AnalysisException:-Found-duplicated-columns"
+),
 (
     "(?i)broadcastTimeout", 2,
-    "http://www.legendu.net/misc/blog/spark-issue:-could-not-execute-broadcast-in-300s"
+    "https://www.legendu.net/misc/blog/spark-issue:-could-not-execute-broadcast-in-300s"
 ),
 (
     "(?i)serialized results is bigger than spark.driver.maxResultSize", 2,

dsutil/hadoop/logf.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@
 import re
 from argparse import ArgumentParser, Namespace
 import subprocess as sp
+from loguru import logger
 from .log import LogFilter
 
 YARN = "/apache/hadoop/bin/yarn"
@@ -49,6 +50,7 @@ def fetch(args):
     cmd = [YARN, "logs", "-size_limit_mb", "-1", "-applicationId", app_id]
     if args.user:
         cmd = cmd + ["-appOwner", args.user]
+    logger.info(f"Fetching log of the application {app_id} ...")
     with open(output, "w", encoding="utf-8") as fout:
         sp.run(cmd, stdout=fout, check=True)
     args.log_file = output

dsutil/hadoop/repart_hdfs.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def main(args: Optional[Namespace] = None):
     """
     if args is None:
         args = parse_args()
-    repart_hdfs(spark, path=args.hdfs_path, num_parts=args.num_parts)
+    repart_hdfs(spark, src_path=args.hdfs_path, num_parts=args.num_parts)
 
 
 if __name__ == "__main__":
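
For context, a hypothetical programmatic call of this entry point: the attribute names `hdfs_path` and `num_parts` are taken from the diff above, while the path and partition count are placeholders.

```python
from argparse import Namespace

from dsutil.hadoop.repart_hdfs import main

# Placeholder values; main() forwards hdfs_path and num_parts to repart_hdfs.
main(Namespace(hdfs_path="/warehouse/events", num_parts=200))
```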

dsutil/hadoop/utils.py

Lines changed: 49 additions & 23 deletions
@@ -1,11 +1,11 @@
 """Utils functions for Hadoop.
 """
 from __future__ import annotations
-from typing import Union
+from typing import Optional, Union
 import sys
 import datetime
 from pyspark.sql import DataFrame, Window
-from pyspark.sql.functions import col, spark_partition_id, rank, coalesce, lit, max, sum
+import pyspark.sql.functions as sf
 
 
 def sample(
@@ -50,55 +50,81 @@ def calc_global_rank(frame: DataFrame, order_by: Union[str, list[str]]) -> DataF
     # calculate local rank
     wspec1 = Window.partitionBy("part_id").orderBy(*order_by)
     frame_local_rank = frame.orderBy(order_by).withColumn(
-        "part_id", spark_partition_id()
+        "part_id", sf.spark_partition_id()
     ).withColumn("local_rank",
-        rank().over(wspec1)).persist()
+        sf.rank().over(wspec1)).persist()
     # calculate accumulative rank
     wspec2 = Window.orderBy("part_id").rowsBetween(
         Window.unboundedPreceding, Window.currentRow
     )
     stat = frame_local_rank.groupBy("part_id").agg(
-        max("local_rank").alias("max_rank")
+        sf.max("local_rank").alias("max_rank")
     ).withColumn("cum_rank",
-        sum("max_rank").over(wspec2))
+        sf.sum("max_rank").over(wspec2))
     # self join and shift 1 row to get sum factor
     stat2 = stat.alias("l").join(
         stat.alias("r"),
-        col("l.part_id") == col("r.part_id") + 1, "left_outer"
-    ).select(col("l.part_id"),
-        coalesce(col("r.cum_rank"), lit(0)).alias("sum_factor"))
+        sf.col("l.part_id") == sf.col("r.part_id") + 1, "left_outer"
+    ).select(
+        sf.col("l.part_id"),
+        sf.coalesce(sf.col("r.cum_rank"), sf.lit(0)).alias("sum_factor")
+    )
     return frame_local_rank.join(
         #broadcast(stat2),
         stat2,
         ["part_id"],
     ).withColumn("rank",
-        col("local_rank") + col("sum_factor"))
+        sf.col("local_rank") + sf.col("sum_factor"))
 
 
-def repart_hdfs(spark, path: str, num_parts: int, coalesce: bool = False) -> None:
+def repart_hdfs(
+    spark,
+    src_path: str,
+    dst_path: str = "",
+    num_parts: Optional[int] = None,
+    mb_per_part: float = 64,
+    min_num_parts: int = 1,
+    coalesce: bool = False
+) -> None:
     """Repartition a HDFS path of the Parquet format.
 
     :param spark: A SparkSession object.
     :param path: The HDFS path to repartition.
     :param num_parts: The new number of partitions.
     :param coalesce: If True, use coalesce instead of repartition.
     """
-    path = path.rstrip("/")
-    ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
-    path_tmp = path + f"_repart_tmp_{ts}"
+    sc = spark.sparkContext
+    hdfs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration()) # pylint: disable=W0212
+    src_path = src_path.rstrip("/")
+    src_path_hdfs = sc._jvm.org.apache.hadoop.fs.Path(src_path) # pylint: disable=W0212
+    # num of partitions
+    if num_parts is None:
+        bytes_path = hdfs.getContentSummary(src_path_hdfs).getLength()
+        num_parts = round(bytes_path / 1_048_576 / mb_per_part)
+        num_parts = max(num_parts, min_num_parts)
+    # temp path for repartitioned table
+    if dst_path == src_path:
+        dst_path = ""
+    if dst_path:
+        path_tmp = dst_path
+    else:
+        ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
+        path_tmp = src_path + f"_repart_tmp_{ts}"
+    # repartition
     if coalesce:
-        spark.read.parquet(path).coalesce(num_parts) \
+        spark.read.parquet(src_path).coalesce(num_parts) \
             .write.mode("overwrite").parquet(path_tmp)
     else:
-        spark.read.parquet(path).repartition(num_parts) \
+        spark.read.parquet(src_path).repartition(num_parts) \
             .write.mode("overwrite").parquet(path_tmp)
-    sc = spark.sparkContext
-    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration()) # pylint: disable=W0212
-    if fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True): # pylint: disable=W0212
-        if not fs.rename(
+    # path_tmp --> src_path
+    if dst_path:
+        return
+    if hdfs.delete(src_path_hdfs, True):
+        if not hdfs.rename(
            sc._jvm.org.apache.hadoop.fs.Path(path_tmp), # pylint: disable=W0212
-           sc._jvm.org.apache.hadoop.fs.Path(path), # pylint: disable=W0212
+           src_path_hdfs, # pylint: disable=W0212
         ):
-            sys.exit(f"Failed to rename the HDFS path {path_tmp} to {path}!")
+            sys.exit(f"Failed to rename the HDFS path {path_tmp} to {src_path}!")
     else:
-        sys.exit(f"Failed to remove the (old) HDFS path: {path}!")
+        sys.exit(f"Failed to remove the (old) HDFS path: {src_path}!")

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dsutil"
-version = "0.72.1"
+version = "0.73.0"
 description = "A utils Python package for data scientists."
 authors = ["Benjamin Du <[email protected]>"]
 

readme.md

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ Currently, Python 3.7 and 3.8 are supported.
 
 You can download a copy of the latest release and install it using pip.
 ```bash
-pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.72.1/dsutil-0.72.1-py3-none-any.whl
+pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.73.0/dsutil-0.73.0-py3-none-any.whl
 ```
 Or you can use the following command to install the latest master branch
 if you have pip 20.0+.
@@ -48,7 +48,7 @@ pip3 install --user -U git+https://github.com/dclong/dsutil@main
 Use one of the following commands if you want to install all components of dsutil.
 Available additional components are `cv`, `docker`, `pdf`, `jupyter`, `admin` and `all`.
 ```bash
-pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.72.1/dsutil-0.72.1-py3-none-any.whl"
+pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.73.0/dsutil-0.73.0-py3-none-any.whl"
 # or
 pip3 install --user -U "dsutil[all] @ git+https://github.com/dclong/dsutil@main"
 ```
