# Source: email-evidence-tools/clean_evidence_csv.py (k3rt4s/email-evidence-tools, master branch)
"""
clean_evidence_csv.py
=====================
Project : email-evidence-tools
Purpose : Post-processing step for the output of scan_mbox_for_evidence.py.
          Reads the raw evidence CSV, strips HTML tags from the `exact_text` column,
          and collapses excess whitespace so hits are readable in a spreadsheet or
          when pasted into correspondence.

Input   : --input-file or INPUT_FILE
Output  : --output-file or OUTPUT_FILE

Usage   : python clean_evidence_csv.py --input-file evidence_hits.csv --output-file evidence_hits_clean.csv

Note    : This script is non-destructive; the original CSV is not modified.
"""

import argparse
import html
import os
import re

import pandas as pd

DEFAULT_INPUT_FILE = "mbox_evidence_hits.csv"
DEFAULT_OUTPUT_FILE = "mbox_evidence_hits_clean.csv"


# Matches anything shaped like an HTML/XML tag, e.g. <p>, </div>, <br/>.
_TAG_RE = re.compile(r"<[^>]+>")
# Matches any run of whitespace. For str patterns this is Unicode-aware, so it
# also covers the non-breaking spaces produced by decoding &nbsp;.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Return *text* as plain, single-spaced text.

    Processing steps, in order:

    1. Missing values (``None``, ``NaN``, ``pd.NA``) become an empty string.
    2. HTML tags are replaced by a space so adjacent words do not fuse.
    3. HTML character references (``&amp;``, ``&nbsp;``, ``&#39;`` ...) are
       decoded to the characters they stand for.
    4. Runs of whitespace collapse to one space; the ends are trimmed.

    Args:
        text: A scalar cell value. Non-string values are converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    without_tags = _TAG_RE.sub(" ", str(text))
    # Decode entities only AFTER tag removal: text that was escaped in the
    # source (e.g. "&lt;b&gt;") is literal content, not markup, and must
    # survive as "<b>" rather than being stripped as a tag.
    decoded = html.unescape(without_tags)
    return _WHITESPACE_RE.sub(" ", decoded).strip()


def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Each option falls back first to an environment variable and then to a
    module-level default:

    * ``--input-file``  -> ``INPUT_FILE``  -> ``DEFAULT_INPUT_FILE``
    * ``--output-file`` -> ``OUTPUT_FILE`` -> ``DEFAULT_OUTPUT_FILE``

    Returns:
        argparse.Namespace: with ``input_file`` and ``output_file`` attributes.
    """
    cli = argparse.ArgumentParser(
        description="Clean text fields in an evidence CSV."
    )

    # Resolve the environment fallbacks up front so the option definitions
    # below stay short.
    input_fallback = os.getenv("INPUT_FILE", DEFAULT_INPUT_FILE)
    output_fallback = os.getenv("OUTPUT_FILE", DEFAULT_OUTPUT_FILE)

    input_help = f"Input CSV path. Defaults to INPUT_FILE or {DEFAULT_INPUT_FILE}."
    output_help = f"Output CSV path. Defaults to OUTPUT_FILE or {DEFAULT_OUTPUT_FILE}."

    cli.add_argument("--input-file", default=input_fallback, help=input_help)
    cli.add_argument("--output-file", default=output_fallback, help=output_help)

    parsed = cli.parse_args()
    return parsed


def main():
    """Clean the ``exact_text`` column of the input CSV and write a new CSV.

    Reads the file named by ``--input-file``, applies ``clean_text`` to every
    value in the ``exact_text`` column, and writes the result to
    ``--output-file`` without the pandas index.

    Raises:
        SystemExit: if the input and output paths resolve to the same file
            (which would overwrite the original), or if the input has no
            ``exact_text`` column.
    """
    args = parse_args()

    # Keep the run non-destructive: never write on top of the input file.
    if os.path.realpath(args.input_file) == os.path.realpath(args.output_file):
        raise SystemExit(
            "Input and output paths are the same file; "
            "choose a different --output-file so the original is not overwritten."
        )

    # Read every column as text and disable NA detection so untouched columns
    # round-trip unchanged: no dropped leading zeros, no int -> float
    # conversion, and literal values such as "NA" or "null" are not blanked.
    df = pd.read_csv(args.input_file, dtype=str, keep_default_na=False)

    if "exact_text" not in df.columns:
        raise SystemExit(
            f"Column 'exact_text' not found in {args.input_file}; nothing to clean."
        )

    row_count = len(df)
    df["exact_text"] = df["exact_text"].apply(clean_text)

    df.to_csv(args.output_file, index=False)
    print(f"Done. {row_count:,} rows cleaned -> {args.output_file}")


if __name__ == "__main__":
    main()