gz_splitter.py
#!/usr/bin/env python3
"""
Gzip file splitter.

Reads a .gz compressed file and splits it into smaller files of N lines each,
written to a target directory.

Three processing modes are supported:
1. single:       sequential read/write in a single process
2. threaded:     single-process reading + asynchronous writes via a thread pool
3. multiprocess: single-process reading + writes handled by a process pool

Output filename format: original_name(without suffix)_(file index)_(start line)_(end line)(original suffix)
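
Example invocations ("input.txt.gz" and "./output" are placeholder paths):
    python gz_splitter.py input.txt.gz --lines 1000 --output-dir ./output --mode threaded
    python gz_splitter.py input.txt.gz --benchmark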
"""
import gzip
import time
import click
import tempfile
import shutil
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Iterator, Tuple, List


def generate_output_filename(
    input_path: Path,
    index: int,
    start_line: int,
    end_line: int,
) -> str:
    """Build an output filename: original_name_index_startline_endline.txt.gz"""
    stem = input_path.stem
    if stem.endswith('.txt'):
        stem = stem[:-4]
    return f"{stem}_{index}_{start_line}_{end_line}.txt.gz"
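
# For example, chunk 3 of "logs.txt.gz" covering lines 2001-3000 is named
# "logs_3_2001_3000.txt.gz" ("logs.txt.gz" is a hypothetical input name).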


def write_chunk_to_gz(output_path: Path, lines: List[str]) -> None:
    """Write a list of lines to a gzip file."""
    with gzip.open(output_path, 'wt', encoding='utf-8') as f:
        f.writelines(lines)


def read_gz_lines(input_path: Path) -> Iterator[Tuple[int, str]]:
    """Read a gzip file, yielding (line number, line content) pairs."""
    with gzip.open(input_path, 'rt', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            yield line_num, line
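
# A quick way to sanity-check the streaming reader ("sample.txt.gz" is a
# hypothetical path):
#     for num, text in read_gz_lines(Path("sample.txt.gz")):
#         print(num, text, end="")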


def split_single_process(input_path: Path, output_dir: Path, lines_per_file: int) -> int:
    """
    Mode 1: sequential read/write in a single process.
    Returns the number of files produced.
    """
    file_count = 0
    current_lines: List[str] = []
    start_line = 1
    line_num = 0
    for line_num, line in read_gz_lines(input_path):
        current_lines.append(line)
        if len(current_lines) >= lines_per_file:
            file_count += 1
            end_line = line_num
            filename = generate_output_filename(input_path, file_count, start_line, end_line)
            output_path_file = output_dir / filename
            write_chunk_to_gz(output_path_file, current_lines)
            current_lines = []
            start_line = line_num + 1
    # Flush any remaining lines as a final, smaller chunk
    if current_lines:
        file_count += 1
        end_line = line_num
        filename = generate_output_filename(input_path, file_count, start_line, end_line)
        output_path_file = output_dir / filename
        write_chunk_to_gz(output_path_file, current_lines)
    return file_count
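
# Example call (placeholder paths; note the output directory must already
# exist, since main() below creates it before dispatching):
#     split_single_process(Path("input.txt.gz"), Path("./output"), 1000)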


def split_threaded(input_path: Path, output_dir: Path, lines_per_file: int, max_workers: int = 4) -> int:
    """
    Mode 2: single-process reading + asynchronous writes via a thread pool.
    Returns the number of files produced.
    """
    file_count = 0
    current_lines: List[str] = []
    start_line = 1
    line_num = 0
    futures = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for line_num, line in read_gz_lines(input_path):
            current_lines.append(line)
            if len(current_lines) >= lines_per_file:
                file_count += 1
                end_line = line_num
                filename = generate_output_filename(input_path, file_count, start_line, end_line)
                output_path_file = output_dir / filename
                # Submit the write; copy the list so the reader can keep appending
                future = executor.submit(write_chunk_to_gz, output_path_file, current_lines.copy())
                futures.append(future)
                current_lines = []
                start_line = line_num + 1
        # Flush any remaining lines as a final, smaller chunk
        if current_lines:
            file_count += 1
            end_line = line_num
            filename = generate_output_filename(input_path, file_count, start_line, end_line)
            output_path_file = output_dir / filename
            future = executor.submit(write_chunk_to_gz, output_path_file, current_lines)
            futures.append(future)
        # Wait for all writes to finish (also surfaces any worker exceptions)
        for future in futures:
            future.result()
    return file_count


def _queue_worker(queue) -> bool:
    """
    Process-pool worker: consume (path string, lines) tasks from the queue until
    a None sentinel arrives. Defined at module level so it can be pickled;
    paths travel as strings rather than Path objects for better compatibility.
    """
    while True:
        task = queue.get()  # block until a task or the sentinel is available
        if task is None:  # shutdown signal
            return True
        output_path_str, lines = task
        write_chunk_to_gz(Path(output_path_str), lines)


def split_multiprocess(input_path: Path, output_dir: Path, lines_per_file: int, max_workers: int = 4) -> int:
    """
    Mode 3: multiprocess handling, streaming tasks through a Manager queue.
    Key points:
    1. The main process reads and decompresses (I/O bound, one thread suffices).
    2. A process pool compresses and writes in parallel (CPU bound; gzip
       compression is the bottleneck).
    3. A bounded queue avoids loading all the data into memory at once.
    Returns the number of files produced.
    """
    from multiprocessing import Manager, Pool

    chunk_index = 0
    start_line = 1
    current_lines: List[str] = []
    line_num = 0
    # Create the task queue via a Manager so worker processes can share it
    with Manager() as manager:
        task_queue = manager.Queue(maxsize=max_workers * 2)
        with Pool(max_workers) as pool:
            # Start all workers asynchronously; each loops until its sentinel
            async_results = [pool.apply_async(_queue_worker, (task_queue,)) for _ in range(max_workers)]
            # Main process reads the input and hands out chunks
            for line_num, line in read_gz_lines(input_path):
                current_lines.append(line)
                if len(current_lines) >= lines_per_file:
                    chunk_index += 1
                    end_line = line_num
                    filename = generate_output_filename(input_path, chunk_index, start_line, end_line)
                    output_path_file = output_dir / filename
                    # Enqueue the task (blocks while the queue is full)
                    task_queue.put((str(output_path_file), current_lines.copy()))
                    current_lines = []
                    start_line = line_num + 1
            # Flush any remaining lines as a final, smaller chunk
            if current_lines:
                chunk_index += 1
                end_line = line_num
                filename = generate_output_filename(input_path, chunk_index, start_line, end_line)
                output_path_file = output_dir / filename
                task_queue.put((str(output_path_file), current_lines))
            # Send one shutdown sentinel per worker
            for _ in range(max_workers):
                task_queue.put(None)
            # Wait for every worker to finish
            for r in async_results:
                r.get(timeout=300)
    return chunk_index
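
# Design note: the queue is capped at max_workers * 2, so when the compressor
# workers fall behind, the reader blocks on put() instead of buffering the
# whole input file in memory.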


@click.command()
@click.argument('input_file', type=click.Path(exists=True, path_type=Path))
@click.option('--lines', '-n', default=1000, help='Number of lines per output file')
@click.option('--output-dir', '-o', default='./output', type=click.Path(path_type=Path), help='Output directory')
@click.option('--mode', '-m',
              type=click.Choice(['single', 'threaded', 'multiprocess', 'all'], case_sensitive=False),
              default='single', help='Processing mode')
@click.option('--workers', '-w', default=4, help='Thread/process pool size')
@click.option('--benchmark', is_flag=True, help='Run a benchmark comparing all modes')
def main(
    input_file: Path,
    lines: int,
    output_dir: Path,
    mode: str,
    workers: int,
    benchmark: bool,
):
    """
    Split a gzip file into multiple smaller files.

    INPUT_FILE: path to the input .gz file
    """
    if input_file.suffix != '.gz':
        click.echo("Error: the input file must be in .gz format", err=True)
        return
    if benchmark or mode == 'all':
        run_benchmark(input_file, output_dir, lines, workers)
    else:
        # Make sure the output directory exists
        output_dir.mkdir(parents=True, exist_ok=True)
        click.echo(f"Input file: {input_file}")
        click.echo(f"Output directory: {output_dir}")
        click.echo(f"Lines per file: {lines}")
        click.echo(f"Mode: {mode}")
        click.echo(f"Workers: {workers}")
        click.echo("-" * 50)
        start_time = time.time()
        if mode == 'single':
            count = split_single_process(input_file, output_dir, lines)
        elif mode == 'threaded':
            count = split_threaded(input_file, output_dir, lines, workers)
        else:  # 'multiprocess' is the only remaining choice
            count = split_multiprocess(input_file, output_dir, lines, workers)
        elapsed = time.time() - start_time
        click.echo(f"✓ Done! Produced {count} files in {elapsed:.3f} s")


def run_benchmark(input_file: Path, output_dir: Path, lines: int, workers: int):
    """Benchmark the three processing modes against each other."""
    click.echo("=" * 60)
    click.echo("Benchmark - comparing the three processing modes")
    click.echo("=" * 60)
    click.echo(f"Test file: {input_file}")
    click.echo(f"File size: {input_file.stat().st_size / 1024 / 1024:.2f} MB")
    click.echo(f"Lines per file: {lines}")
    click.echo(f"Thread/process pool size: {workers}")
    click.echo()
    results = []
    # Mode 1: single process
    click.echo("[1/3] Testing single-process sequential mode...")
    test_dir = tempfile.mkdtemp()
    try:
        start = time.time()
        count = split_single_process(input_file, Path(test_dir), lines)
        elapsed = time.time() - start
        results.append(("single-process sequential", elapsed, count))
        click.echo(f"  Done: {elapsed:.3f} s, {count} files")
    finally:
        shutil.rmtree(test_dir)
    # Mode 2: thread pool
    click.echo("[2/3] Testing single-process read + thread-pool write mode...")
    test_dir = tempfile.mkdtemp()
    try:
        start = time.time()
        count = split_threaded(input_file, Path(test_dir), lines, workers)
        elapsed = time.time() - start
        results.append(("read + thread-pool write", elapsed, count))
        click.echo(f"  Done: {elapsed:.3f} s, {count} files")
    finally:
        shutil.rmtree(test_dir)
    # Mode 3: process pool
    click.echo("[3/3] Testing single-process read + process-pool mode...")
    test_dir = tempfile.mkdtemp()
    try:
        start = time.time()
        count = split_multiprocess(input_file, Path(test_dir), lines, workers)
        elapsed = time.time() - start
        results.append(("read + process-pool write", elapsed, count))
        click.echo(f"  Done: {elapsed:.3f} s, {count} files")
    finally:
        shutil.rmtree(test_dir)
    # Print the comparison
    click.echo()
    click.echo("=" * 60)
    click.echo("Benchmark results")
    click.echo("=" * 60)
    click.echo(f"{'Mode':<25} {'Time (s)':<12} {'Files':<10} {'Relative':<10}")
    click.echo("-" * 60)
    best_time = min(r[1] for r in results)
    for name, elapsed, count in results:
        ratio = elapsed / best_time
        speed = f"{ratio:.2f}x" if ratio > 1 else "1.00x (fastest)"
        click.echo(f"{name:<25} {elapsed:<12.3f} {count:<10} {speed:<10}")
    click.echo("=" * 60)
    # Pick the winner
    fastest = min(results, key=lambda x: x[1])
    click.echo(f"\n🏆 Fastest mode: {fastest[0]} ({fastest[1]:.3f} s)")


# ==================== Tests ====================
import unittest


class TestGzSplitter(unittest.TestCase):
    """Tests for the gzip splitting functions."""

    @classmethod
    def setUpClass(cls):
        """Create the test fixture data."""
        cls.test_dir = Path(tempfile.mkdtemp())
        cls.test_file = cls.test_dir / "test_data.txt.gz"
        cls.output_dir = cls.test_dir / "output"
        # Fixture: 10000 lines, split into chunks of 1000
        cls.total_lines = 10000
        cls.lines_per_file = 1000
        with gzip.open(cls.test_file, 'wt', encoding='utf-8') as f:
            for i in range(cls.total_lines):
                f.write(f"Line {i + 1}: This is a test line with some content.\n")
        print(f"\nCreated test file: {cls.test_file}")
        print(f"Total lines: {cls.total_lines}")

    @classmethod
    def tearDownClass(cls):
        """Remove the test fixture data."""
        shutil.rmtree(cls.test_dir)
        print(f"\nRemoved test directory: {cls.test_dir}")

    def setUp(self):
        """Reset the output directory before each test."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def test_01_single_process(self):
        """Single-process mode."""
        print("\n[test] single-process sequential mode...")
        count = split_single_process(self.test_file, self.output_dir, self.lines_per_file)
        # Check the file count
        expected_files = self.total_lines // self.lines_per_file  # 10
        self.assertEqual(count, expected_files)
        # Check contents, sorting by the index embedded in the filename
        # (split('_')[2] assumes the fixture stem "test_data" with one underscore)
        files = sorted(self.output_dir.glob("*.gz"),
                       key=lambda p: int(p.name.split('_')[2]))
        self.assertEqual(len(files), expected_files)
        total_read = 0
        for i, f in enumerate(files, 1):
            with gzip.open(f, 'rt', encoding='utf-8') as fp:
                lines = fp.readlines()
                total_read += len(lines)
            # Check the filename format
            self.assertIn(f"_{i}_", f.name)
        self.assertEqual(total_read, self.total_lines)
        print(f"  ✓ split into {count} files")

    def test_02_threaded(self):
        """Thread-pool mode."""
        print("\n[test] thread-pool asynchronous write mode...")
        count = split_threaded(self.test_file, self.output_dir, self.lines_per_file, max_workers=4)
        expected_files = self.total_lines // self.lines_per_file
        self.assertEqual(count, expected_files)
        # All files exist with the right contents, sorted by filename index
        files = sorted(self.output_dir.glob("*.gz"),
                       key=lambda p: int(p.name.split('_')[2]))
        self.assertEqual(len(files), expected_files)
        total_read = 0
        for f in files:
            with gzip.open(f, 'rt', encoding='utf-8') as fp:
                total_read += len(fp.readlines())
        self.assertEqual(total_read, self.total_lines)
        print(f"  ✓ split into {count} files")

    def test_03_multiprocess(self):
        """Process-pool mode."""
        print("\n[test] process-pool mode...")
        count = split_multiprocess(self.test_file, self.output_dir, self.lines_per_file, max_workers=4)
        expected_files = self.total_lines // self.lines_per_file
        self.assertEqual(count, expected_files)
        # All files exist with the right contents, sorted by filename index
        files = sorted(self.output_dir.glob("*.gz"),
                       key=lambda p: int(p.name.split('_')[2]))
        self.assertEqual(len(files), expected_files)
        total_read = 0
        for f in files:
            with gzip.open(f, 'rt', encoding='utf-8') as fp:
                total_read += len(fp.readlines())
        self.assertEqual(total_read, self.total_lines)
        print(f"  ✓ split into {count} files")

    def test_04_filename_generation(self):
        """Filename generation."""
        test_path = Path("/data/test_file.txt.gz")
        filename = generate_output_filename(test_path, 1, 1, 100)
        self.assertEqual(filename, "test_file_1_1_100.txt.gz")
        filename = generate_output_filename(test_path, 5, 401, 500)
        self.assertEqual(filename, "test_file_5_401_500.txt.gz")
        # Input without a .txt part in the name
        test_path2 = Path("/data/test_file.gz")
        filename2 = generate_output_filename(test_path2, 1, 1, 100)
        self.assertEqual(filename2, "test_file_1_1_100.txt.gz")
        print("\n[test] filename generation... ✓")

    def test_05_performance_comparison(self):
        """Performance comparison across the three modes."""
        print("\n[benchmark] comparing the three modes...")
        results = []
        # Single process
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        start = time.time()
        split_single_process(self.test_file, self.output_dir, self.lines_per_file)
        single_time = time.time() - start
        results.append(("single-process sequential", single_time))
        print(f"  single-process sequential: {single_time:.3f} s")
        # Thread pool
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        start = time.time()
        split_threaded(self.test_file, self.output_dir, self.lines_per_file, max_workers=4)
        thread_time = time.time() - start
        results.append(("read + thread-pool write", thread_time))
        print(f"  thread-pool asynchronous write: {thread_time:.3f} s")
        # Process pool
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        start = time.time()
        split_multiprocess(self.test_file, self.output_dir, self.lines_per_file, max_workers=4)
        process_time = time.time() - start
        results.append(("read + process-pool write", process_time))
        print(f"  process-pool write: {process_time:.3f} s")
        # Print the comparison
        print("\n  Comparison:")
        best = min(results, key=lambda x: x[1])
        for name, t in results:
            ratio = t / best[1]
            marker = " 🏆" if t == best[1] else ""
            print(f"    {name}: {t:.3f}s ({ratio:.2f}x){marker}")
        # Only assert that every mode ran; actual speeds are machine-dependent
        self.assertTrue(all(t > 0 for _, t in results))


def run_tests():
    """Run the full test suite."""
    loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(TestGzSplitter)
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)
    return result.wasSuccessful()


if __name__ == '__main__':
    import sys

    # Decide whether to run the tests or the CLI
    if len(sys.argv) > 1 and sys.argv[1] in ['--test', '-t']:
        # Explicit test mode
        print("Running tests...\n")
        success = run_tests()
        sys.exit(0 if success else 1)
    elif len(sys.argv) == 1:
        # No arguments: default to running the tests
        print("Running tests... (use --help for CLI usage)\n")
        success = run_tests()
        if success:
            print("\nAll tests passed!")
            print("\nUsage examples:")
            print(f"  python {sys.argv[0]} input.txt.gz --lines 1000 --output-dir ./output --mode single")
            print(f"  python {sys.argv[0]} input.txt.gz --benchmark")
        sys.exit(0 if success else 1)
    else:
        # Anything else goes to the CLI (file arguments, --help, etc.)
        main()