forked from dxlong2000/LongGuide
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.py
More file actions
executable file
·140 lines (109 loc) · 4.81 KB
/
run.py
File metadata and controls
executable file
·140 lines (109 loc) · 4.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
import yaml
import json
import argparse
from pathlib import Path
from longguide import MetricsGuidelines, OutputConstraintsGuidelines
from standardize_data import standardize_dataset
def get_task_instruction(data_path):
    """Return the natural-language task instruction for a dataset.

    The dataset is identified by the last path component of *data_path*
    (e.g. ``data/SAMSum`` -> ``SAMSum``). Unrecognised datasets fall back
    to a generic instruction.
    """
    per_dataset = {
        "SWiPE": "Simplify this text.",
        "SAMSum": "Summarize the following dialogue.",
        "CNN": "Summarize the following news.",
        "xlsum": "Summarize the following document.",
        "IWSLT": "Translate the following from English to Japanese.",
        "CommonGen": "Generate the text from the following table.",
        "SyntheticDialogue": "Generate the next dialogue response.",
    }
    key = Path(data_path).name
    return per_dataset.get(key, "Process this text.")
def get_data_path(task_type):
    """Return the default dataset directory for *task_type*.

    Supported task types: summarization, translation, dialogue generation,
    table-to-text generation, text simplification. Anything else falls back
    to the SAMSum summarization data.
    """
    if task_type == "translation":
        return "data/IWSLT"
    if task_type == "text simplification":
        return "data/SWiPE"
    if task_type == "table-to-text generation":
        return "data/CommonGen"
    if task_type == "dialogue generation":
        return "data/SyntheticDialogue"
    # "summarization" (could also point at data/CNN or data/xlsum) and the
    # fallback for unknown task types share the same default.
    return "data/SAMSum"
def load_config(config_path):
    """Parse the YAML file at *config_path* and return its contents."""
    with open(config_path, 'r') as handle:
        config = yaml.safe_load(handle)
    return config
def load_dataset(data_path):
    """Load, merge, and standardize every ``*.json`` file under *data_path*.

    Args:
        data_path: Directory whose name identifies the dataset
            (e.g. ``data/SAMSum``) and which contains JSON list files.

    Returns:
        The dataset as returned by ``standardize_dataset``.
    """
    data_dir = Path(data_path)
    dataset_name = data_dir.name
    data = []
    # sorted() makes the merge order deterministic; Path.glob order is
    # filesystem-dependent otherwise.
    for json_file in sorted(data_dir.glob("*.json")):
        with open(json_file, 'r') as f:
            data.extend(json.load(f))
    # standardize_dataset expects a file path, so round-trip the merged
    # data through a temp file. try/finally ensures the temp file is
    # removed even if standardization raises (the original leaked it).
    temp_file = Path(f"temp_{dataset_name}.json")
    try:
        with open(temp_file, 'w') as f:
            json.dump(data, f)
        return standardize_dataset(dataset_name, str(temp_file))
    finally:
        if temp_file.exists():
            temp_file.unlink()
def run_longguide(config_path):
    """Run the LongGuide prompting pipeline for the task in *config_path*.

    Loads the YAML config, resolves the dataset from the configured task
    type, derives metric and constraint guidelines, builds the three prompt
    variants (full attributes / only metrics / only constraints) for every
    example, and writes the results to ``outputs/results_<task_type>.json``.

    Args:
        config_path: Path to a YAML config containing at least ``task_type``.
    """
    config = load_config(config_path)
    task_type = config['task_type']

    # Auto-determine data path from task type
    data_path = get_data_path(task_type)

    # Load dataset
    dataset = load_dataset(data_path)
    print(f"Loaded {len(dataset)} examples from {data_path}")

    # Initialize guidelines with task type and config
    metrics_guidelines = MetricsGuidelines(task_type, config)
    constraints_guidelines = OutputConstraintsGuidelines(task_type, config)

    # Get task-specific guidelines
    metrics = metrics_guidelines.get_guidelines()
    constraints = constraints_guidelines.get_guidelines(dataset[:10])  # Test with first 10 examples

    print(f"Using guidelines for task: {task_type}")
    print(f"Metrics: {metrics}")
    print(f"Constraints: {constraints}")

    # The instruction depends only on the dataset, so hoist it out of the loop.
    task_instruction = get_task_instruction(data_path)

    # Process dataset
    results = []
    for i, item in enumerate(dataset):
        print(f"Processing example {i+1}/{len(dataset)}")
        input_text = item['input']

        # Full attributes prompt
        full_prompt = f"""{task_instruction} Your generated output must strictly fulfill the following task metrics. {constraints}
{metrics}
Input: {input_text}"""

        # Only metrics prompt
        only_metrics_prompt = f"""{task_instruction} Your generated output must strictly fulfill the following task metrics.
{metrics}
Input: {input_text}"""

        # Only constraints prompt
        only_constraints_prompt = f"""{task_instruction} {constraints}
Input: {input_text}"""

        # TODO: Replace with actual LLM calls
        results.append({
            'input': input_text,
            'target': item['output'],
            'full_attributes_prompt': full_prompt,
            'only_metrics_prompt': only_metrics_prompt,
            'only_constraints_prompt': only_constraints_prompt,
            'full_attributes': f"[Full attributes output for: {input_text[:30]}...]",
            'only_metrics': f"[Only metrics output for: {input_text[:30]}...]",
            'only_constraints': f"[Only constraints output for: {input_text[:30]}...]"
        })

    # Save results; create the output directory first (the original crashed
    # with FileNotFoundError when outputs/ did not exist).
    Path("outputs").mkdir(parents=True, exist_ok=True)
    output_path = f"outputs/results_{task_type}.json"
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_path}")
def main():
    """CLI entry point: parse the --config option and launch a run."""
    arg_parser = argparse.ArgumentParser(description='Run LongGuide with configuration')
    arg_parser.add_argument(
        '--config',
        default='configs/default.yaml',
        help='Path to configuration file',
    )
    parsed = arg_parser.parse_args()
    run_longguide(parsed.config)


if __name__ == "__main__":
    main()