Heartie-fastGPT-analyzer/log_parser.py at main · PancrePal-xiaoyibao/Heartie-fastGPT-analyzer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
import pandas as pd
from datetime import datetime

class LogParser:
    def __init__(self, log_file_path):
        self.log_file_path = log_file_path
        self.data = []

    def parse(self):
        """
        Parses the log file and extracts Q&A pairs.
        Assumes sequential logging: Query comes first, then Reply.
        """
        print(f"Parsing log file: {self.log_file_path}")
        try:
            with open(self.log_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(f"Error: File not found {self.log_file_path}")
            return pd.DataFrame()

        current_entry = {}

        # Regex patterns
        # [INFO][2025-05-31 14:53:42]...
        time_pattern = r'\[INFO\]\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]'

        # [CHATGPT] query=...
        query_pattern = r'\[CHATGPT\] query=(.*)'

        # [gewechat] Do send text to ...: ... (reply content)
        reply_start_pattern = r'\[gewechat\] Do send text to .*?: (.*)'

        for i, line in enumerate(lines):
            # Extract timestamp
            time_match = re.search(time_pattern, line)
            timestamp = time_match.group(1) if time_match else None

            # Check for User Query
            query_match = re.search(query_pattern, line)
            if query_match:
                # If we have a pending entry (query without reply), we might save it as unanswered or just overwrite
                if current_entry.get('query'):
                    # Discard previous unanswered query or handle as needed
                    pass

                current_entry = {
                    'timestamp': timestamp,
                    'query': query_match.group(1).strip(),
                    'reply': None
                }
                continue

            # Check for Bot Reply Start
            reply_match = re.search(reply_start_pattern, line)
            if reply_match and current_entry.get('query'):
                # Found a potential reply block
                reply_content = [reply_match.group(1).strip()]

                # Check if this reply belongs to the current query?
                # In simple sequential logs, we assume yes.

                # Read subsequent lines until next log header or empty line
                j = i + 1
                while j < len(lines):
                    next_line = lines[j]
                    # Stop if next line is a log header
                    if re.match(r'\[(INFO|WARNING|ERROR)\]', next_line):
                        break
                    reply_content.append(next_line.strip())
                    j += 1

                current_entry['reply'] = '\n'.join(reply_content).strip()

                # Ignore "received image" processing logs if they don't look like real answers
                # But here we assume all [gewechat] Do send text to... are answers.

                # Add valid entry
                self.data.append(current_entry)
                current_entry = {} # Reset

        df = pd.DataFrame(self.data)
        # Clean up column names to match what XiaoXinBaoDataProcessor expects or at least be useful
        if not df.empty:
            df.rename(columns={'query': 'dialogue_content', 'reply': 'bot_reply'}, inplace=True)
            # Add dummy columns if needed for strict compatibility, or just let existing processor handle flexible columns
            df['source'] = 'log_parser'

        return df

if __name__ == '__main__':
    import sys
    import os

    input_file = 'input/xyanb.yaml'
    output_file = 'input/parsed_logs.csv'

    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]

    parser = LogParser(input_file)
    df = parser.parse()

    if not df.empty:
        print(f"Parsed {len(df)} Q&A pairs.")
        print(df.head())
        df.to_csv(output_file, index=False)
        print(f"Saved to {output_file}")
    else:
        print("No data parsed.")