-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathannotate_dialogues.py
More file actions
executable file
·351 lines (294 loc) · 13.7 KB
/
Copy pathannotate_dialogues.py
File metadata and controls
executable file
·351 lines (294 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
#!/usr/bin/env python3
"""
Interactive Dialogue Annotation Tool
This script provides a command-line interface for annotating dialogues with task information
and error types. It allows users to manually mark which tasks were successfully completed,
identify errors, and define ground truth dialogues.

Usage:
    python annotate_dialogues.py --input raw_dialogues.txt --output annotated_dialogues.json
    python annotate_dialogues.py --create-sample --sample-output sample_raw_dialogues.txt
"""
import os
import sys
import json
import logging
import argparse
from typing import Dict, List, Any
# Route every log record at INFO level or above to standard output so log
# lines appear in the same stream as the interactive prompts.
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the functions in this script.
logger = logging.getLogger(__name__)
# Task categories an annotator can assign to a bot turn. They are presented
# to the annotator as a 1-based numbered menu.
TASK_TYPES = [
    "provide_information",
    "answer_query",
    "provide_entity_details",
    "list_entities",
    "explain_relationship",
    "other",
]

# Error labels an annotator can attach to a bot turn, presented as a 0-based
# menu. Entry 0 (None) means "no error" and is the default choice.
ERROR_TYPES = [
    None,
    "substitution",
    "deletion",
    "insertion",
]
def load_raw_dialogues(input_file: str) -> List[Dict[str, Any]]:
    """
    Load raw dialogues from a text file.

    The file is expected to contain one turn per line, each starting with
    ``User:`` or ``Bot:``. One or more blank lines separate dialogues.
    Lines that carry neither prefix are skipped (and reported with a
    warning so that dropped content does not go unnoticed).

    Args:
        input_file: Path to raw dialogue file

    Returns:
        List of parsed dialogues. Each dialogue is a dict with an ``id``
        (``dialogue_<n>``, numbered from 1) and a ``turns`` list; each turn
        has ``id`` (``turn_<n>``), ``speaker`` (``"user"`` or ``"bot"``),
        ``text`` (the full stripped line) and ``content`` (the line without
        its speaker prefix). An empty list is returned when the file cannot
        be read.
    """
    # Only the file access can fail in an expected way, so keep the try
    # block limited to it instead of wrapping the whole parser.
    try:
        # utf-8-sig transparently drops a leading byte-order mark; with
        # plain utf-8 the BOM stays attached to the first line, which then
        # fails the "User:" prefix check and the first turn is lost.
        with open(input_file, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as e:
        logger.error("Failed to load raw dialogues from %s: %s", input_file, e)
        return []

    # Recognised line prefixes and the speaker label each one maps to.
    speaker_prefixes = (("User:", "user"), ("Bot:", "bot"))

    dialogues: List[Dict[str, Any]] = []
    turns: List[Dict[str, Any]] = []  # turns of the dialogue being built

    def close_dialogue() -> None:
        """Store the turns collected so far as one dialogue, if any."""
        nonlocal turns
        if turns:
            dialogues.append({"id": f"dialogue_{len(dialogues)+1}", "turns": turns})
            turns = []

    for line_number, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line:
            # A blank line ends the current dialogue; repeated blank lines
            # are harmless because an empty dialogue is never stored.
            close_dialogue()
            continue

        for prefix, speaker in speaker_prefixes:
            if line.startswith(prefix):
                turns.append({
                    "id": f"turn_{len(turns)+1}",
                    "speaker": speaker,
                    "text": line,
                    "content": line[len(prefix):].strip(),
                })
                break
        else:
            logger.warning(
                "Skipping line %d of %s: expected it to start with 'User:' or 'Bot:'",
                line_number, input_file,
            )

    # The last dialogue is usually not followed by a blank line.
    close_dialogue()

    logger.info("Loaded %d raw dialogues from %s", len(dialogues), input_file)
    return dialogues
def _print_task_types() -> None:
    """Print the 1-based numbered menu of TASK_TYPES."""
    print("\nTask types:")
    for number, task_type in enumerate(TASK_TYPES, 1):
        print(f" {number}. {task_type}")


def _print_error_types() -> None:
    """Print the 0-based numbered menu of ERROR_TYPES (None shown as 'None')."""
    print("\nError types:")
    for number, error_type in enumerate(ERROR_TYPES, 0):
        print(f" {number}. {error_type if error_type else 'None'}")


def _prompt_int(prompt: str, low: int, high: int, default: Any = None,
                show_menu: Any = None) -> int:
    """Ask until the user enters an integer in [low, high]; blank picks *default* if given.

    *show_menu*, when provided, is a zero-argument callable invoked before
    every attempt so the choices stay visible after an invalid answer.
    """
    while True:
        if show_menu is not None:
            show_menu()
        answer = input(prompt)
        try:
            if default is not None and not answer.strip():
                value = default
            else:
                value = int(answer)
        except ValueError:
            print("Please enter a valid number")
            continue
        if low <= value <= high:
            return value
        print(f"Please enter a number between {low} and {high}")


def _prompt_yes_no(prompt: str, default: bool) -> bool:
    """Ask a y/n question until answered; a blank answer returns *default*."""
    while True:
        # Surrounding whitespace is ignored so that " y" or a line of
        # spaces behaves like "y" / a blank answer.
        answer = input(prompt).strip().lower()
        if not answer:
            return default
        if answer in ("y", "yes"):
            return True
        if answer in ("n", "no"):
            return False
        print("Please enter 'y' or 'n'")


def _annotate_bot_turn(turn: Dict[str, Any], annotated_turn: Dict[str, Any]) -> None:
    """Collect the tasks and the optional error type of one bot turn into *annotated_turn*."""
    num_tasks = _prompt_int("Number of tasks in this turn (0-5) [0]: ", 0, 5, default=0)

    for task_number in range(1, num_tasks + 1):
        task = {"id": f"task_{turn['id']}_{task_number}"}
        chosen = _prompt_int(
            f"Task {task_number} type (1-{len(TASK_TYPES)}): ",
            1, len(TASK_TYPES),
            show_menu=_print_task_types,
        )
        task["type"] = TASK_TYPES[chosen - 1]  # menu is 1-based
        task["success"] = _prompt_yes_no(
            f"Was task {task_number} successful? (y/n) [y]: ", default=True
        )
        annotated_turn["tasks"].append(task)

    # The error label applies to the turn as a whole, not to a single task.
    error_idx = _prompt_int(
        f"Error type (0-{len(ERROR_TYPES)-1}) [0]: ",
        0, len(ERROR_TYPES) - 1,
        default=0,
        show_menu=_print_error_types,
    )
    error_type = ERROR_TYPES[error_idx]
    if error_type:
        # The key is only present when an actual error was selected.
        annotated_turn["error_type"] = error_type


def _build_ground_truth(turns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Build the ideal dialogue: user turns are copied, non-user turns may be corrected."""
    ground_truth = []
    for turn in turns:
        text = turn.get("text", "")
        content = turn.get("content", "")
        if turn.get("speaker") == "user":
            speaker = "user"
        else:
            speaker = "bot"
            print("\n" + "-"*30)
            print(f"Bot turn: {text}")
            if _prompt_yes_no(
                "Does this turn need correction in the ideal dialogue? (y/n) [n]: ",
                default=False,
            ):
                content = input("Enter the ideal bot response: ")
                text = f"Bot: {content}"
        ground_truth.append({
            "id": f"gt_{turn['id']}",
            "speaker": speaker,
            "text": text,
            "content": content,
        })
    return ground_truth


def _annotate_dialogue(dialogue: Dict[str, Any], position: int, total: int) -> Dict[str, Any]:
    """Show one dialogue, annotate each of its turns and collect its ground truth."""
    turns = dialogue.get("turns", [])

    print("\n" + "="*50)
    print(f"DIALOGUE {position}/{total}: {dialogue['id']}")
    print("="*50)

    # Print the whole dialogue first so the annotator has the full context.
    for number, turn in enumerate(turns, 1):
        print(f"{number}. {turn.get('text', '')}")

    annotated_dialogue = {
        "id": dialogue["id"],
        "turns": [],
        "ground_truth_turns": []
    }

    for number, turn in enumerate(turns, 1):
        print("\n" + "-"*30)
        print(f"Turn {number}: {turn.get('text', '')}")
        annotated_turn = {
            "id": turn["id"],
            "text": turn.get("text", ""),
            "speaker": turn.get("speaker", ""),
            "content": turn.get("content", ""),
            "tasks": []
        }
        # Tasks and errors are only asked for bot turns.
        if turn.get("speaker") == "bot":
            _annotate_bot_turn(turn, annotated_turn)
        annotated_dialogue["turns"].append(annotated_turn)

    print("\n" + "-"*30)
    print("Now let's define the ground truth turns (ideal dialogue)")
    print("For each bot turn, indicate if it should be different in the ideal case")
    annotated_dialogue["ground_truth_turns"] = _build_ground_truth(annotated_dialogue["turns"])
    return annotated_dialogue


def annotate_dialogues_interactive(dialogues: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Interactively annotate dialogues with task and error information.

    For every dialogue the annotator is shown all turns, then asked, for
    each bot turn, how many tasks it contains (0-5), the type and success
    of each task, and an optional turn-level error type. Afterwards the
    annotator defines the ground-truth (ideal) dialogue by optionally
    replacing bot responses. Between dialogues the annotator may stop.

    Args:
        dialogues: List of raw dialogues; each needs an ``id`` and may
            carry a ``turns`` list whose items have ``id`` and optionally
            ``text``, ``speaker`` and ``content``.

    Returns:
        List of annotated dialogues, each with ``id``, ``turns`` (every
        turn gains a ``tasks`` list and, when an error was chosen, an
        ``error_type``) and ``ground_truth_turns``. Only dialogues that
        were annotated completely are included: the list is shorter than
        the input when the annotator stops early or standard input ends.
    """
    annotated_dialogues = []

    print("\n" + "="*50)
    print("DIALOGUE ANNOTATION TOOL FOR ONTOLOGY PIPELINE EVALUATION")
    print("="*50)
    _print_task_types()
    _print_error_types()

    total = len(dialogues)
    for dialogue_idx, dialogue in enumerate(dialogues, 1):
        try:
            annotated_dialogues.append(_annotate_dialogue(dialogue, dialogue_idx, total))
            print("\nDialogue annotation complete!")

            # Offer to stop, except after the last dialogue.
            if dialogue_idx < total and not _prompt_yes_no(
                "Continue with next dialogue? (y/n) [y]: ", default=True
            ):
                print("Annotation stopped by user")
                return annotated_dialogues
        except EOFError:
            # input() raises EOFError when stdin is exhausted (Ctrl-D or a
            # finished pipe). Return the finished dialogues instead of
            # crashing and losing the annotations collected so far.
            print("\nInput ended; keeping the dialogues annotated so far")
            return annotated_dialogues

    return annotated_dialogues
def save_annotated_dialogues(dialogues: List[Dict[str, Any]], output_file: str) -> None:
    """
    Save annotated dialogues to a JSON file.

    The output is a single JSON object of the form
    ``{"dialogues": [...]}``, indented by two spaces and written as UTF-8.
    Saving is best-effort: any failure is logged and not raised.

    Args:
        dialogues: List of annotated dialogues
        output_file: Path to save the annotated dialogues
    """
    try:
        # Serialise before opening the file: opening with 'w' truncates it,
        # so a serialisation error must not be allowed to wipe out an
        # existing annotation file. ensure_ascii=False keeps non-ASCII
        # dialogue text readable in the UTF-8 output.
        payload = json.dumps({"dialogues": dialogues}, indent=2, ensure_ascii=False)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info("Saved %d annotated dialogues to %s", len(dialogues), output_file)
    except Exception as e:
        # Deliberately broad: the caller gets no error signal from this
        # function, so every failure is reported through the log.
        logger.error("Failed to save annotated dialogues to %s: %s", output_file, e)
def create_sample_raw_dialogues(output_file: str, num_dialogues: int = 2) -> None:
    """
    Create a sample raw dialogues file for testing.

    The file uses the format read by the loader: one ``User:`` / ``Bot:``
    turn per line, with a single blank line between dialogues.

    Args:
        output_file: Path to save the sample data
        num_dialogues: Number of sample dialogues to create. Values outside
            the range 0..(number of built-in samples) are clamped to it.
    """
    sample_dialogues = [
        [
            "User: What facilities are available in Camp A?",
            "Bot: Camp A has 3 water points, 1 health facility, and 2 food distribution centers.",
            "User: How many people can the health facility serve?",
            "Bot: The health facility in Camp A can serve approximately 500 people per day."
        ],
        [
            "User: Where can I find water in Sector B?",
            "Bot: There are 2 water points in Sector B: one at the north entrance and one near the community center.",
            "User: Is the water safe to drink?",
            "Bot: Yes, all water points are treated and tested regularly for safety."
        ]
    ]

    # Clamp the request to what is available so that the separator logic
    # and the log message reflect the dialogues actually written (a
    # negative value would otherwise slice from the end of the list).
    count = max(0, min(num_dialogues, len(sample_dialogues)))
    if count != num_dialogues:
        logger.warning(
            "Requested %d sample dialogues but only %d can be written",
            num_dialogues, count,
        )

    # Each dialogue becomes a block of newline-terminated turns; joining
    # the blocks with "\n" yields exactly one blank line between them and
    # none after the last.
    blocks = [
        "".join(f"{turn}\n" for turn in dialogue)
        for dialogue in sample_dialogues[:count]
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(blocks))

    logger.info("Created sample raw dialogues file with %d dialogues at %s", count, output_file)
def main(argv=None):
    """
    Command-line entry point.

    With ``--create-sample`` a sample raw dialogues file is written and
    nothing else happens. Otherwise ``--input`` is required: the raw
    dialogues are loaded, annotated interactively and saved to
    ``--output``.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When omitted, the arguments are taken from ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when no dialogues could be
        loaded from the input file. Invalid command-line usage exits
        through ``argparse`` as usual.
    """
    parser = argparse.ArgumentParser(description="Annotate dialogues for ontology pipeline evaluation")
    parser.add_argument("--input", help="Path to raw dialogues file")
    parser.add_argument("--output", default="annotated_dialogues.json", help="Path to save annotated dialogues")
    parser.add_argument("--create-sample", action="store_true", help="Create a sample raw dialogues file")
    parser.add_argument("--sample-output", default="sample_raw_dialogues.txt", help="Path to save sample raw dialogues")
    parser.add_argument("--sample-size", type=int, default=2, help="Number of sample dialogues to create")
    args = parser.parse_args(argv)

    if args.create_sample:
        create_sample_raw_dialogues(args.sample_output, args.sample_size)
        return 0

    if not args.input:
        # parser.error prints the usage message and exits the process.
        parser.error("--input is required unless using --create-sample")

    raw_dialogues = load_raw_dialogues(args.input)
    if not raw_dialogues:
        logger.error("No dialogues loaded from %s", args.input)
        # Report the failure to the shell instead of exiting with status 0.
        return 1

    annotated_dialogues = annotate_dialogues_interactive(raw_dialogues)
    save_annotated_dialogues(annotated_dialogues, args.output)
    return 0


if __name__ == "__main__":
    sys.exit(main())