Skip to content

Commit 7648236

Browse files
committed
(#343)(skip-models)(skip-preps)(skip-embs)
1 parent dbb95ee commit 7648236

File tree

3 files changed

+20
-8
lines changed

3 files changed

+20
-8
lines changed

src/Adila

src/main.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,6 @@ def run(cfg):
117117
# if a list, all see the exact splits of teams.
118118
# if individual, they see different teams in splits. But as we show the average results, no big deal, esp., as we do n-fold
119119
models = {}
120-
# model names t* will follow the streaming scenario
121-
# model names *_ts have timestamp (year) as a single added feature
122-
# model names *_ts2v learn temporal skill vectors via d2v when each doc is a stream of (skills: year of the team)
123-
# non-temporal (no streaming scenario, bag of teams)
124120
assert len(cfg.models.instances) > 0, f'{opentf.textcolor["red"]}No model instance for training! Check ./src/__config__.yaml and models.instances ... {opentf.textcolor["reset"]}'
125121

126122
# Get command-line overrides for models. Kinda tricky as we dynamically override a subconfig.
@@ -145,7 +141,7 @@ def run(cfg):
145141
# t2v object knows the embedding method and ...
146142
skill_vecs = t2v.get_dense_vecs(teamsvecs, vectype='skill')
147143
assert skill_vecs.shape[0] == teamsvecs['skill'].shape[0], f'{opentf.textcolor["red"]}Incorrect number of embeddings for teams subset of skills!{opentf.textcolor["reset"]}'
148-
teamsvecs['original_skill'] = teamsvecs['skill'] #to accomodate skill_coverage metric and future use cases like in nmt
144+
teamsvecs['original_skill'] = teamsvecs['skill'] #to accommodate skill_coverage metric and future use cases like in nmt
149145
teamsvecs['skill'] = skill_vecs
150146

151147
for m in cfg.models.instances:
@@ -189,8 +185,9 @@ def run(cfg):
189185
log.info(f'{opentf.textcolor["green"]}Aggregating the test results under {cfg.data.output} per splits from test.pred.eval.mean.csv files ... {opentf.textcolor["reset"]}')
190186
aggregate(cfg.data.output)
191187

192-
# sample runs for different configs, including different prep, embeddings, model training, ..., are available as unit-test in
188+
# for sample runs of different configs, including different prep, embeddings, model training, ..., see the unit-tests and scripts in
193189
# ./github/workflows/*.yml
190+
# ./ipynb/*.ipynb
194191

195192
# To run on compute canada servers you can use the following command: (time is in minutes)
196193
#sbatch --account=def-hfani --mem=96000MB --time=2880 computecanada.sh

src/mdl/ntf.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,20 @@ def plot_roc(self, splits, on_train=False):
105105
# plt.show()
106106

107107
def adila(self, teamsvecs, splits, faircfg):
108+
def _avg_per_fold(files):
    """Average the per-fold evaluation csv files into one csv file per evaluation kind.

    For each kind in ('fair', 'utility') and each fold key in the enclosing `splits['folds']`,
    the file `<f>.eval.<kind>.mean.csv` is read for the single `f` in `files` whose basename
    starts with `f<fold>.`. The rows are averaged across folds by index label and the result
    is written next to the last fold's file, under the same name without the `f<fold>.` prefix.

    Args:
        files: path prefixes of the per-fold outputs; per-epoch entries (basename contains
            `.e<digits>.`) are ignored. Presumably the paths returned by Adila's `_` -- verify against caller.

    Raises:
        ValueError: if `splits['folds']` is empty, or if a fold does not match exactly one entry of `files`.
    """
    pd = opentf.install_import('pandas')
    files = [f for f in files if not re.search(r'\.e\d+\.', os.path.basename(f))] #filter out per epoch files
    folds = list(splits['folds'].keys())
    if not folds: raise ValueError('No fold found in splits to average the evaluation files over!')
    for kind in ['fair', 'utility']:
        dfs, mean_file = [], None
        for fold in folds:
            prefix = f'f{fold}.'
            fold_file = [f'{f}.eval.{kind}.mean.csv' for f in files if os.path.basename(f).startswith(prefix)]
            if len(fold_file) != 1: raise ValueError(f'Expected one file for fold {fold} ({kind}), found {len(fold_file)}!')
            dfs.append(pd.read_csv(fold_file[0], index_col=0))
            # strip the fold prefix from the basename only; a replace over the full path would also
            # remove look-alike substrings such as the `f0.` in `coef0.5` or in a folder name
            folder, name = os.path.split(fold_file[0])
            mean_file = os.path.join(folder, name[len(prefix):])
        combined = pd.concat(dfs)
        # rows sharing the same index label across folds are averaged together
        combined.groupby(combined.index).mean().to_csv(mean_file)
121+
108122
from Adila.src.adila import Adila
109123
from Adila.src.main import _
110124
if not scipy.sparse.issparse(teamsvecs['skill']): teamsvecs['skill'] = teamsvecs['original_skill'] # to accommodate dense emb vecs of skills
@@ -116,4 +130,5 @@ def adila(self, teamsvecs, splits, faircfg):
116130
stats, minorities, ratios = adila.prep(self.output, notion, attribute, is_popular_alg, faircfg.is_popular_coef, plot)
117131
if notion == 'dp' and faircfg.dp_ratio: ratios = [1 - faircfg.ratio if attribute == 'popularity' else faircfg.ratio]
118132
for algorithm in faircfg.algorithm:
119-
_(adila, self.output, minorities, ratios, algorithm, faircfg.k_max, faircfg.alpha, faircfg.acceleration, faircfg.eval)
133+
outputs = _(adila, self.output, minorities, ratios, algorithm, faircfg.k_max, faircfg.alpha, faircfg.acceleration, faircfg.eval)
134+
_avg_per_fold(outputs) # ideally, all the outputs should be in the same folder/path.

0 commit comments

Comments
 (0)