Skip to content

Commit b5f4a2c

Browse files
authored
REF-1523-TLDR mode (#85)

* tldr mode
* tldr
1 parent 9b07ab8 commit b5f4a2c

File tree

4 files changed

+69
-26
lines changed

4 files changed

+69
-26
lines changed

outrank/__main__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,13 @@ def main():
204204
help='Relevant for task data_generator -- how many features.',
205205
)
206206

207+
parser.add_argument(
208+
'--tldr',
209+
type=str,
210+
default='True',
211+
help='If enabled, it will output some of the main results on the screen after finishing.',
212+
)
213+
207214
parser.add_argument(
208215
'--num_synthetic_rows',
209216
type=int,

outrank/task_summary.py

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,65 +3,101 @@
33
import logging
44
import os
55
from collections import defaultdict
6+
from typing import Any
7+
from typing import List
68

79
import numpy as np
810
import pandas as pd
911

1012
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
1113

1214

13-
def outrank_task_result_summary(args):
14-
triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
15+
def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame:
16+
"""Read triplets from a file and sort by the 'Score' column."""
1517
triplets = pd.read_csv(triplets_path, sep='\t')
16-
triplets = triplets.sort_values(by='Score', ascending=False)
18+
return triplets.sort_values(by='Score', ascending=False)
19+
1720

21+
def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]:
22+
"""Generate final ranking based on the label column."""
1823
final_ranking = []
1924
for _, row in triplets.iterrows():
2025
feature_a, feature_b = row['FeatureA'], row['FeatureB']
2126
score = row['Score']
22-
if args.label_column == feature_a.split('-')[0]:
27+
if label_column == feature_a.split('-')[0]:
2328
final_ranking.append([feature_b, score])
24-
elif args.label_column == feature_b.split('-')[0]:
29+
elif label_column == feature_b.split('-')[0]:
2530
final_ranking.append([feature_a, score])
31+
return final_ranking
2632

27-
final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}'])
33+
34+
def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame:
35+
"""Create a final DataFrame and normalize if necessary."""
36+
final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {heuristic}'])
2837
final_df = (
2938
final_df.groupby('Feature')
3039
.median()
3140
.reset_index()
32-
.sort_values(by=f'Score {args.heuristic}', ascending=False)
41+
.sort_values(by=f'Score {heuristic}', ascending=False)
3342
)
3443

35-
if "MI" in args.heuristic:
36-
min_score = final_df[f'Score {args.heuristic}'].min()
37-
max_score = final_df[f'Score {args.heuristic}'].max()
38-
final_df[f'Score {args.heuristic}'] = (final_df[f'Score {args.heuristic}'] - min_score) / (max_score - min_score)
44+
if 'MI' in heuristic:
45+
min_score = final_df[f'Score {heuristic}'].min()
46+
max_score = final_df[f'Score {heuristic}'].max()
47+
final_df[f'Score {heuristic}'] = (final_df[f'Score {heuristic}'] - min_score) / (max_score - min_score)
48+
49+
return final_df
3950

40-
logging.info(f'Storing summary files to {args.output_folder}')
51+
52+
def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None:
53+
"""Store the summary files and optionally print the head of the DataFrame."""
54+
logging.info(f'Storing summary files to {output_folder}')
4155
pd.set_option('display.max_rows', None, 'display.max_columns', None)
4256

43-
singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
57+
singles_path = os.path.join(output_folder, 'feature_singles.tsv')
4458
final_df.to_csv(singles_path, sep='\t', index=False)
4559

46-
if args.interaction_order > 1:
60+
if tldr:
61+
print(final_df.head(20))
62+
63+
64+
def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None:
65+
"""Handle the interaction order if it is greater than 1."""
66+
if interaction_order > 1:
4767
feature_store = defaultdict(list)
4868
for _, row in final_df.iterrows():
4969
fname = row['Feature']
50-
score = row[f'Score {args.heuristic}']
70+
score = row[f'Score {heuristic}']
5171
if 'AND' in fname:
5272
for el in fname.split('-')[0].split(' AND '):
5373
feature_store[el].append(score)
5474

5575
final_aggregate_df = pd.DataFrame([
5676
{
5777
'Feature': k,
58-
f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(v),
78+
f'Combined score (order: {interaction_order}, {heuristic})': np.median(v),
5979
}
6080
for k, v in feature_store.items()
6181
])
6282
final_aggregate_df.to_csv(
63-
os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False
83+
os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
6484
)
6585

66-
transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv')
86+
87+
def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None:
88+
"""Filter the DataFrame to include only transformer features and store the result."""
89+
transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv')
6790
final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False)
91+
92+
93+
def outrank_task_result_summary(args) -> None:
94+
"""Main function to generate a summary of outrank task results."""
95+
triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
96+
triplets = read_and_sort_triplets(triplets_path)
97+
98+
final_ranking = generate_final_ranking(triplets, args.label_column)
99+
final_df = create_final_dataframe(final_ranking, args.heuristic)
100+
101+
store_summary_files(final_df, args.output_folder, args.heuristic, args.tldr)
102+
handle_interaction_order(final_df, args.output_folder, args.heuristic, args.interaction_order)
103+
filter_transformers_only(final_df, args.output_folder)

outrank/visualizations/ranking_visualization.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def visualize_hierarchical_clusters(
4343
values='Score',
4444
index='FeatureA',
4545
columns='FeatureB',
46-
aggfunc=np.mean,
46+
aggfunc='mean', # Updated from np.mean to 'mean'
4747
)
4848

4949
pivot_table.fillna(0, inplace=True)
@@ -59,7 +59,7 @@ def visualize_hierarchical_clusters(
5959
)
6060
plt.title(f'Linkage function: {linkage_heuristic}')
6161
with warnings.catch_warnings():
62-
warnings.simplefilter("ignore", UserWarning)
62+
warnings.simplefilter('ignore', UserWarning)
6363
plt.tight_layout()
6464
out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}'
6565
plt.savefig(out_path, dpi=300)
@@ -95,7 +95,7 @@ def visualize_hierarchical_clusters(
9595
dfx.columns = ['Silhouette', 'threshold', 'numClusters']
9696
sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black')
9797
with warnings.catch_warnings():
98-
warnings.simplefilter("ignore", UserWarning)
98+
warnings.simplefilter('ignore', UserWarning)
9999
plt.tight_layout()
100100
out_path = f'{output_folder}/SilhouetteProfile.{image_format}'
101101
plt.savefig(out_path, dpi=300)
@@ -113,7 +113,7 @@ def visualize_hierarchical_clusters(
113113
projected_data['ClusterID'] = top_clustering.astype(str)
114114
sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2')
115115
with warnings.catch_warnings():
116-
warnings.simplefilter("ignore", UserWarning)
116+
warnings.simplefilter('ignore', UserWarning)
117117
plt.tight_layout()
118118
plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300)
119119
plt.clf()
@@ -130,7 +130,7 @@ def visualize_heatmap(
130130
sns.set(font_scale=2)
131131
fig, ax = plt.subplots()
132132
pivot_table = pd.pivot_table(
133-
triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean,
133+
triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean', # Updated from np.mean to 'mean'
134134
)
135135
mask = np.zeros_like(pivot_table.values)
136136
mask[np.triu_indices_from(mask)] = True
@@ -160,7 +160,7 @@ def visualize_heatmap(
160160
plt.xlabel('')
161161
plt.ylabel('')
162162
with warnings.catch_warnings():
163-
warnings.simplefilter("ignore", UserWarning)
163+
warnings.simplefilter('ignore', UserWarning)
164164
plt.tight_layout()
165165
plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500)
166166
plt.clf()
@@ -245,7 +245,7 @@ def visualize_barplots(
245245
plt.xlabel(f'Feature importance (based on heuristic {heuristic})')
246246
plt.ylabel('')
247247
with warnings.catch_warnings():
248-
warnings.simplefilter("ignore", UserWarning)
248+
warnings.simplefilter('ignore', UserWarning)
249249
plt.tight_layout()
250250
plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300)
251251
plt.clf()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _read_description():
2323
packages = [x for x in setuptools.find_packages() if x != 'test']
2424
setuptools.setup(
2525
name='outrank',
26-
version='0.97.3',
26+
version='0.97.4',
2727
description='OutRank: Feature ranking for massive sparse data sets.',
2828
long_description=_read_description(),
2929
long_description_content_type='text/markdown',

0 commit comments

Comments (0)