Commit 3e2e3be

Merge pull request #318 from dclong/dev
Merge dev into main
2 parents: bb9959e + cf86b00

5 files changed: +80 −4 lines

dsutil/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,4 +3,4 @@
 from . import git
 from . import poetry
 
-__version__ = "0.68.3"
+__version__ = "0.69.0"
```

dsutil/hadoop/repart_hdfs.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -0,0 +1,49 @@
+"""Repartition an HDFS path in the Parquet format.
+"""
+from typing import Optional
+from argparse import ArgumentParser, Namespace
+from pyspark.sql import SparkSession
+from .utils import repart_hdfs
+
+spark = SparkSession.builder.appName("Repart_HDFS").enableHiveSupport().getOrCreate()
+
+
+def parse_args(args=None, namespace=None) -> Namespace:
+    """Parse command-line arguments.
+
+    :param args: The arguments to parse.
+        If None, the command-line arguments are parsed.
+    :param namespace: An initial Namespace object.
+    :return: A Namespace object containing parsed options.
+    """
+    parser = ArgumentParser(
+        description="Repartition an HDFS path which is in the Parquet format."
+    )
+    parser.add_argument(
+        "-p",
+        "--path",
+        "--hdfs-path",
+        dest="hdfs_path",
+        type=str,
+        help="The HDFS path (in the Parquet format) to repartition."
+    )
+    parser.add_argument(
+        "-n",
+        "--num-parts",
+        dest="num_parts",
+        type=int,
+        help="The new number of partitions."
+    )
+    return parser.parse_args(args=args, namespace=namespace)
+
+
+def main(args: Optional[Namespace] = None):
+    """The main function for script usage.
+    """
+    if args is None:
+        args = parse_args()
+    repart_hdfs(spark, path=args.hdfs_path, num_parts=args.num_parts)
+
+
+if __name__ == "__main__":
+    main()
```
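For context, the new script can also be driven programmatically instead of through the command line. A minimal sketch, assuming a Spark-enabled environment (importing the module creates the global SparkSession at import time); the HDFS path and partition count below are hypothetical examples:

```python
from dsutil.hadoop.repart_hdfs import parse_args, main

# Parse an explicit argument list instead of sys.argv.
# The path "/user/hive/warehouse/events" and the count 64
# are made up for illustration.
args = parse_args(["--hdfs-path", "/user/hive/warehouse/events", "--num-parts", "64"])
main(args)
```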

dsutil/hadoop/utils.py

Lines changed: 26 additions & 0 deletions
```diff
@@ -2,6 +2,8 @@
 """
 from __future__ import annotations
 from typing import Union
+import sys
+import datetime
 from pyspark.sql import DataFrame, Window
 from pyspark.sql.functions import col, spark_partition_id, rank, coalesce, lit, max, sum
 
@@ -71,3 +73,27 @@ def calc_global_rank(frame: DataFrame, order_by: Union[str, list[str]]) -> DataFrame:
         ["part_id"],
     ).withColumn("rank",
                  col("local_rank") + col("sum_factor"))
+
+
+def repart_hdfs(spark, path: str, num_parts: int) -> None:
+    """Repartition an HDFS path in the Parquet format.
+
+    :param spark: A SparkSession object.
+    :param path: The HDFS path to repartition.
+    :param num_parts: The new number of partitions.
+    """
+    path = path.rstrip("/")
+    ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
+    path_tmp = path + f"_repart_tmp_{ts}"
+    spark.read.parquet(path).repartition(num_parts) \
+        .write.mode("overwrite").parquet(path_tmp)
+    sc = spark.sparkContext
+    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())  # pylint: disable=W0212
+    if fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True):  # pylint: disable=W0212
+        if not fs.rename(
+            sc._jvm.org.apache.hadoop.fs.Path(path_tmp),  # pylint: disable=W0212
+            sc._jvm.org.apache.hadoop.fs.Path(path),  # pylint: disable=W0212
+        ):
+            sys.exit(f"Failed to rename the HDFS path {path_tmp} to {path}!")
+    else:
+        sys.exit(f"Failed to remove the (old) HDFS path: {path}!")
```
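The temporary-path dance exists because Spark cannot safely overwrite a Parquet path it is still reading from in the same job: the repartitioned copy is written to a timestamped temp path first, and only then is the original deleted and the temp path renamed into place. A usage sketch of the new function; the dataset path and target partition count are hypothetical:

```python
from pyspark.sql import SparkSession

from dsutil.hadoop.utils import repart_hdfs

spark = SparkSession.builder.appName("compact_events").enableHiveSupport().getOrCreate()

# Rewrite a (hypothetical) Parquet dataset as 32 files.
# Note: a failure between the delete and the rename would leave the data
# only under the timestamped temp path, which would need manual cleanup.
repart_hdfs(spark, path="/data/events/2021-01-01", num_parts=32)
```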

pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,11 +1,12 @@
 [tool.poetry]
 name = "dsutil"
-version = "0.68.3"
+version = "0.69.0"
 description = "A utils Python package for data scientists."
 authors = ["Benjamin Du <[email protected]>"]
 
 [tool.poetry.scripts]
 logf = "dsutil.hadoop:logf.main"
+repart_hdfs = "dsutil.hadoop:repart_hdfs.main"
 pyspark_submit = "dsutil.hadoop:pyspark_submit.main"
 pykinit = "dsutil.hadoop:kerberos.main"
 match_memory = "dsutil:memory.main"
```
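The new `[tool.poetry.scripts]` entry makes installed copies of the package expose a `repart_hdfs` console command, e.g. `repart_hdfs -p /hdfs/path -n 200` (hypothetical arguments). Roughly, the entry-point spec `"dsutil.hadoop:repart_hdfs.main"` resolves as sketched below; this is a sketch of the lookup, not the literal wrapper generated on install:

```python
# "dsutil.hadoop:repart_hdfs.main" means: import the module dsutil.hadoop,
# then call the dotted attribute repart_hdfs.main on it.
from dsutil.hadoop import repart_hdfs

repart_hdfs.main()
```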

readme.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -25,7 +25,7 @@ Currently, Python 3.7 and 3.8 are supported.
 
 You can download a copy of the latest release and install it using pip.
 ```bash
-pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.68.3/dsutil-0.68.3-py3-none-any.whl
+pip3 install --user -U https://github.com/dclong/dsutil/releases/download/v0.69.0/dsutil-0.69.0-py3-none-any.whl
 ```
 Or you can use the following command to install the latest master branch
 if you have pip 20.0+.
@@ -35,7 +35,7 @@ pip3 install --user -U git+https://github.com/dclong/dsutil@main
 Use one of the following commands if you want to install all components of dsutil.
 Available additional components are `cv`, `docker`, `pdf`, `jupyter`, `admin` and `all`.
 ```bash
-pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.68.3/dsutil-0.68.3-py3-none-any.whl"
+pip3 install "dsutil[cv] @ https://github.com/dclong/dsutil/releases/download/v0.69.0/dsutil-0.69.0-py3-none-any.whl"
 # or
 pip3 install --user -U "dsutil[all] @ git+https://github.com/dclong/dsutil@main"
 ```
````
