|
3 | 3 | import logging |
4 | 4 | import os |
5 | 5 | from collections import defaultdict |
| 6 | +from typing import Any |
| 7 | +from typing import List |
6 | 8 |
|
7 | 9 | import numpy as np |
8 | 10 | import pandas as pd |
9 | 11 |
|
10 | 12 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) |
11 | 13 |
|
12 | 14 |
|
13 | | -def outrank_task_result_summary(args): |
14 | | - triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv') |
def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame:
    """Load the tab-separated triplets file, ordered by descending 'Score'.

    Args:
        triplets_path: Path of the TSV file to read.

    Returns:
        A DataFrame whose rows are sorted so the highest 'Score' comes first.
    """
    raw_triplets = pd.read_csv(triplets_path, sep='\t')
    ranked_triplets = raw_triplets.sort_values(by='Score', ascending=False)
    return ranked_triplets
| 19 | + |
17 | 20 |
|
def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]:
    """Collect [feature, score] pairs for every triplet that involves the label.

    For each row, the part of 'FeatureA' / 'FeatureB' before the first '-' is
    compared with ``label_column``. When one side matches, the *other* side is
    recorded together with the row's 'Score'. 'FeatureA' is checked first, and
    rows where neither side matches are skipped.

    Args:
        triplets: Frame with 'FeatureA', 'FeatureB' and 'Score' columns.
        label_column: Name to match against the prefix of each feature.

    Returns:
        A list of two-element lists ``[feature, score]`` in row order.
    """
    ranking = []
    rows = zip(triplets['FeatureA'], triplets['FeatureB'], triplets['Score'])
    for left, right, score in rows:
        if left.split('-')[0] == label_column:
            ranking.append([right, score])
        elif right.split('-')[0] == label_column:
            ranking.append([left, score])
    return ranking
26 | 32 |
|
27 | | - final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}']) |
| 33 | + |
def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame:
    """Aggregate per-feature scores by median and normalize MI-based scores.

    Args:
        final_ranking: ``[feature, score]`` pairs; a feature may occur
            several times and is collapsed to the median of its scores.
        heuristic: Heuristic name, used in the score column name
            (``'Score <heuristic>'``). If it contains ``'MI'``, the scores
            are min-max rescaled to the [0, 1] interval.

    Returns:
        A DataFrame with columns 'Feature' and 'Score <heuristic>', one row
        per feature, sorted by descending score.
    """
    score_column = f'Score {heuristic}'
    final_df = pd.DataFrame(final_ranking, columns=['Feature', score_column])
    final_df = (
        final_df.groupby('Feature')
        .median()
        .reset_index()
        .sort_values(by=score_column, ascending=False)
    )

    if 'MI' in heuristic:
        min_score = final_df[score_column].min()
        score_range = final_df[score_column].max() - min_score
        if score_range == 0:
            # A single feature, or all medians equal: dividing by the zero
            # range would turn every score into NaN, so report 0.0 instead.
            final_df[score_column] = 0.0
        else:
            final_df[score_column] = (final_df[score_column] - min_score) / score_range

    return final_df
39 | 50 |
|
40 | | - logging.info(f'Storing summary files to {args.output_folder}') |
| 51 | + |
def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None:
    """Write the per-feature ranking to disk and optionally preview it.

    The frame is saved tab-separated, without its index, as
    'feature_singles.tsv' inside ``output_folder``. As a side effect the
    pandas 'display.max_rows' and 'display.max_columns' options are set to
    None for the whole process.

    Args:
        final_df: Ranking to store.
        output_folder: Directory that receives the file.
        heuristic: Accepted for interface compatibility; not used here.
        tldr: When truthy, print the first 20 rows of ``final_df``.
    """
    logging.info(f'Storing summary files to {output_folder}')
    # Lift pandas' display limits so a printed frame is not truncated.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    final_df.to_csv(os.path.join(output_folder, 'feature_singles.tsv'), sep='\t', index=False)

    if not tldr:
        return
    print(final_df.head(20))
| 62 | + |
| 63 | + |
def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None:
    """Aggregate interaction scores per constituent feature and store them.

    Nothing happens unless ``interaction_order`` is greater than 1. Otherwise
    every interaction feature (a name whose part before the first '-' joins
    several features with ' AND ') contributes its score to each of its
    constituents. The median score per constituent is written, tab-separated
    and without index, to 'feature_singles_aggregated.tsv' in
    ``output_folder``.

    Args:
        final_df: Frame with 'Feature' and 'Score <heuristic>' columns.
        output_folder: Directory that receives the aggregated file.
        heuristic: Heuristic name; selects the score column and is part of
            the output column name.
        interaction_order: Order of the interactions; also part of the
            output column name.
    """
    if interaction_order <= 1:
        return

    feature_store = defaultdict(list)
    for fname, score in zip(final_df['Feature'], final_df[f'Score {heuristic}']):
        combined_name = fname.split('-')[0]
        # Require the exact ' AND ' separator: a bare 'AND' substring test
        # would also pick up plain features such as 'BRAND' or 'LANDING'.
        if ' AND ' not in combined_name:
            continue
        for el in combined_name.split(' AND '):
            feature_store[el].append(score)

    combined_column = f'Combined score (order: {interaction_order}, {heuristic})'
    # Fixing the columns keeps the header in the file even when no
    # interaction feature was found.
    final_aggregate_df = pd.DataFrame(
        [{'Feature': k, combined_column: np.median(v)} for k, v in feature_store.items()],
        columns=['Feature', combined_column],
    )
    final_aggregate_df.to_csv(
        os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
    )
65 | 85 |
|
66 | | - transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv') |
| 86 | + |
def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None:
    """Store only the rows whose 'Feature' value contains '_tr_'.

    The subset is written tab-separated, without the index, to
    'feature_singles_transformers_only_imp.tsv' inside ``output_folder``.

    Args:
        final_df: Frame with a string 'Feature' column.
        output_folder: Directory that receives the filtered file.
    """
    is_transformer = final_df['Feature'].str.contains('_tr_')
    destination = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv')
    final_df[is_transformer].to_csv(destination, sep='\t', index=False)
| 91 | + |
| 92 | + |
def outrank_task_result_summary(args) -> None:
    """Summarize the pairwise ranks stored in ``args.output_folder``.

    Reads 'pairwise_ranks.tsv' from the output folder, builds the per-feature
    ranking for ``args.label_column`` under ``args.heuristic``, and then runs
    the three storing steps: the summary file (with an optional preview
    controlled by ``args.tldr``), the interaction aggregation (driven by
    ``args.interaction_order``) and the '_tr_'-only subset.
    """
    sorted_triplets = read_and_sort_triplets(
        os.path.join(args.output_folder, 'pairwise_ranks.tsv'),
    )
    ranking_df = create_final_dataframe(
        generate_final_ranking(sorted_triplets, args.label_column),
        args.heuristic,
    )

    store_summary_files(ranking_df, args.output_folder, args.heuristic, args.tldr)
    handle_interaction_order(ranking_df, args.output_folder, args.heuristic, args.interaction_order)
    filter_transformers_only(ranking_df, args.output_folder)
0 commit comments