Skip to content

Commit 11876ee

Browse files
made some changes to the user embs, the model arch (classifier), added
some req to the text file, and wrote a comparison script
1 parent 47772c8 commit 11876ee

File tree

9 files changed

+580
-151
lines changed

9 files changed

+580
-151
lines changed

acaris_comparer.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, log_loss
2+
from sklearn.preprocessing import label_binarize
3+
from inferDistilBERT import InferACARISBERT
4+
from inferACARIS import InferACARIS
5+
import pandas as pd
6+
import numpy as np
7+
from tqdm import tqdm
8+
9+
10+
11+
class ACARISComparer:
    """Run a baseline model and the ACARIS model on the same rows and print metrics for both."""

    def __init__(self, baseline, acarisModel):
        """Store the two models to compare.

        Args:
            baseline: object exposing ``prepare_input(content)`` and ``predict(inputs=...)``.
            acarisModel: object exposing ``predict(uids=[...], contents=[...])``.
        """
        self.baseline = baseline
        self.acarisModel = acarisModel

    def compare(self, data):
        """Predict every row of ``data`` with both models and print their metrics.

        Args:
            data: table-like object with ``uid``, ``content`` and ``sentiment``
                columns; each sentiment must be ``neg``, ``neu`` or ``pos``
                (anything else raises ``ValueError`` from ``list.index``).
        """
        uids = list(data["uid"])
        contents = list(data["content"])
        classLabels = ["neg", "neu", "pos"]
        y_true = np.array([classLabels.index(sentiment) for sentiment in data["sentiment"]])

        baselineIN = [self.baseline.prepare_input(content) for content in contents]

        baselinePreds = [self.baseline.predict(inputs=input_) for input_ in tqdm(baselineIN, desc="Predicting using baseline", ncols=100)]
        acarisPreds = [self.acarisModel.predict(uids=[uid], contents=[content]) for uid, content in tqdm(zip(uids, contents), total=len(uids), desc="Predicting using ACARIS", ncols=100)]

        # The arg-max is taken per prediction *before* concatenation, exactly as
        # the per-sample outputs arrive.
        # NOTE(review): assumes each baseline prediction is a (1, n_classes)
        # array so that concatenation yields (n_samples, n_classes) - confirm.
        y_pred1 = np.array([np.argmax(prob) for prob in baselinePreds])
        probs1 = np.concatenate(baselinePreds)

        # The ACARIS prediction is indexed with [1] to obtain the probabilities.
        probs2 = np.vstack([pred[1] for pred in acarisPreds])
        y_pred2 = [np.argmax(prob) for prob in probs2]

        print("Baseline metrics:")
        self.compute_metrics(y_true, y_pred1, probs1, classLabels)

        print("ACARIS metrics:")
        self.compute_metrics(y_true, y_pred2, probs2, classLabels)

    def compute_metrics(self, y_true, y_pred, y_probs, classLabels):
        """Print classification report, confusion matrix, ROC AUC and log loss.

        Args:
            y_true: integer class ids of the ground truth.
            y_pred: integer class ids predicted by the model.
            y_probs: per-class scores, one row per sample and one column per
                entry of ``classLabels``.
            classLabels: class names; the position in the list is the class id.
        """
        # Pin the label set explicitly: otherwise scikit-learn infers it from
        # the data, and a class missing from this sample makes the report
        # reject ``target_names``, shrinks the confusion matrix and breaks the
        # column alignment assumed by ``log_loss``.
        labelIds = list(range(len(classLabels)))

        print(classification_report(y_true, y_pred, labels=labelIds, target_names=classLabels))

        confMatrix = confusion_matrix(y_true, y_pred, labels=labelIds)
        print(confMatrix)

        y_true_bin = label_binarize(y_true, classes=labelIds)
        roc_auc = roc_auc_score(y_true_bin, y_probs, multi_class="ovr")
        print(f"ROC AUC: {roc_auc}")

        logloss = log_loss(y_true, y_probs, labels=labelIds)
        print(f"Log loss: {logloss}")
53+
54+
55+
if __name__ == "__main__":
    # Load both checkpoints, then score them on the pipe-separated test split.
    baselineModel = InferACARISBERT("ongknsro/ACARIS_BASELINE-DistilBERT-evalSpecimen1-batchSize32")
    acarisModel = InferACARIS("ongknsro/ACARIS-DistilBERT_MLPUserEmbs-iter1-batchSize32")

    testFrame = pd.read_csv("./datasets/test.csv", sep="|", encoding="utf-8")
    ACARISComparer(baselineModel, acarisModel).compare(testFrame)

dualTrainACARIS.py

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
"""
2+
This module fine-tunes a DistilBERT model on the ACARIS dataset for comparison with ACARISMdl.
3+
"""
4+
5+
6+
7+
8+
9+
10+
11+
12+
13+
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+
##################### !!! WORK IN PROGRESS !!! DO NOT USE !!! #####################
27+
28+
29+
30+
31+
32+
33+
34+
35+
36+
37+
38+
39+
40+
41+
42+
43+
44+
45+
46+
import torch
47+
from torch import nn
48+
from torch.utils.data import Dataset, DataLoader
49+
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer, AdamW, EarlyStoppingCallback, PreTrainedModel, DistilBertModel
50+
from transformers.modeling_outputs import SequenceClassifierOutput
51+
from datasets import load_dataset, Dataset
52+
import pandas as pd
53+
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
54+
import wandb
55+
import huggingface_hub
56+
import os
57+
import random
58+
import numpy as np
59+
60+
# Hyperparameters read by the trainer classes in this module.
config = dict(
    mdl="distilbert-base-uncased",
    epochs=5,
    batchSize=14,
    maxLen=512,
    warmupSteps=0.1,  # proportion of total steps, NOT absolute
    weightDecay=0.02,
    outputDir="./output",
    earlyStopping=True,
    earlyStoppingPatience=2,
    dropout=0.1,
    initlr=5e-5,
    epsilon=1e-8,
)
74+
75+
# Runs at module import (not only under __main__): starts the wandb run and
# passes `config` to it.
wandb.init(project="MarkIII_ACARIS", entity="simtoonia", config=config)
76+
77+
78+
def lockSeed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # The cuDNN switches are process-wide flags and have no effect without
    # CUDA, so they are set unconditionally.
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run, which
    # would defeat the fixed seed; keep it off whenever the seed is locked.
    torch.backends.cudnn.benchmark = False
85+
86+
#0 disabled, as determinism is not guaranteed and lowers performance
87+
#lockSeed(69) # setting a fixed seed for *some* reproducibility
88+
89+
class DistilBertForMulticlassSequenceClassification(DistilBertForSequenceClassification):
    """DistilBERT sequence classifier whose loss is always cross-entropy.

    The encoder and the classification head (``pre_classifier``, ``dropout``,
    ``classifier``) come from the parent class; only ``forward`` is overridden.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        """Encode the inputs, classify the first token and optionally compute the loss.

        Args:
            input_ids, attention_mask, head_mask, inputs_embeds,
            output_attentions, output_hidden_states: forwarded unchanged to
                the DistilBERT encoder.
            labels: optional class ids; when given, a cross-entropy loss over
                ``self.num_labels`` classes is returned as well.
            return_dict: return a ``SequenceClassifierOutput`` instead of a
                tuple; defaults to ``self.config.use_return_dict``.

        Returns:
            ``SequenceClassifierOutput`` or, when ``return_dict`` is false, the
            tuple ``(loss?, logits, *encoder_extras)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.distilbert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_state = outputs[0]
        pooled_output = hidden_state[:, 0]  # representation of the first token
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = nn.functional.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # The DistilBERT encoder has no pooled output, so everything after
            # the last hidden state (index 0) is hidden_states / attentions.
            # Slicing from 2 (the BERT layout) would silently drop the first
            # of them.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
115+
116+
117+
118+
class ACARISUserEmber:
    """Fine-tune a DistilBERT three-class sentiment classifier with the Hugging Face ``Trainer``.

    Hyperparameters are read from the module-level ``config`` dict. Input files
    are ``|``-separated CSVs with ``content`` and ``sentiment`` columns, where
    ``sentiment`` is one of ``neg``, ``neu`` or ``pos``.
    """

    def __init__(self, trainPath, valPath):
        """Load the tokenizer and the pretrained model named by ``config["mdl"]``.

        Args:
            trainPath: path of the training CSV.
            valPath: path of the validation CSV.
        """
        self.trainPath = trainPath
        self.valPath = valPath
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(config["mdl"])
        self.model = DistilBertForMulticlassSequenceClassification.from_pretrained(
            config["mdl"],
            num_labels=3,
            id2label={0: "neg", 1: "neu", 2: "pos"},
            label2id={"neg": 0, "neu": 1, "pos": 2},
            dropout=config["dropout"],
            attention_dropout=config["dropout"],
        )

    def read_data(self, path):
        """Read the ``content`` and ``sentiment`` columns of a ``|``-separated CSV into a ``Dataset``."""
        df = pd.read_csv(path, sep="|", usecols=["content", "sentiment"])
        return Dataset.from_pandas(df)

    def tokenize_data(self, dataset):
        """Tokenize ``content`` and turn ``sentiment`` into integer ``labels``.

        Every text is truncated and padded to ``config["maxLen"]`` tokens. The
        returned dataset yields torch tensors for ``input_ids``,
        ``attention_mask`` and ``labels``. An unknown sentiment raises ``KeyError``.
        """
        sentMapping = {"pos": 2, "neg": 0, "neu": 1}

        def encode(batch):
            # Plain Python ints are enough here: set_format below converts the
            # column to torch tensors when rows are read.
            return {
                **self.tokenizer(batch["content"], truncation=True, padding="max_length", max_length=config["maxLen"]),
                "labels": [sentMapping[sent] for sent in batch["sentiment"]],
            }

        tokenized = dataset.map(encode, batched=True, remove_columns=["content", "sentiment"])
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        return tokenized

    def get_data_loaders(self, trainDS, valDS):
        """Wrap both datasets in unshuffled ``DataLoader``s of ``config["batchSize"]``."""
        trainLoader = DataLoader(trainDS, batch_size=config["batchSize"], shuffle=False)
        valLoader = DataLoader(valDS, batch_size=config["batchSize"], shuffle=False)
        return trainLoader, valLoader

    def compute_metrics(self, evalPred):
        """Compute accuracy, one-vs-rest ROC AUC and per-class precision/recall/F1.

        Args:
            evalPred: ``(logits, labels)`` pair as passed by the ``Trainer``.

        Returns:
            dict with ``accuracy``, ``roc_auc`` and ``{precision,recall,f1}_{neg,neu,pos}``.
        """
        logits, labels = evalPred
        labelNames = ["neg", "neu", "pos"]
        classIds = list(range(len(labelNames)))

        logitsT = torch.Tensor(logits)
        preds = torch.argmax(logitsT, dim=1).numpy()
        probs = torch.nn.functional.softmax(logitsT, dim=1).numpy()

        # Explicit class ids keep the per-class arrays three entries long and
        # aligned with labelNames even if a class is absent from this batch of
        # evaluation data.
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, labels=classIds, average=None)
        accuracy = accuracy_score(labels, preds)
        rocAUC = roc_auc_score(labels, probs, multi_class="ovr", labels=classIds)

        metrics = {
            "accuracy": accuracy,
            "roc_auc": rocAUC
        }
        for metricName, metricValue in zip(["precision", "recall", "f1"], [precision, recall, f1]):
            for labelName, value in zip(labelNames, metricValue):
                metrics[f"{metricName}_{labelName}"] = float(value)
        return metrics

    def train(self):
        """Tokenize both splits, evaluate once, fine-tune, and save to ``config["outputDir"]``."""
        trainDS = self.tokenize_data(self.read_data(self.trainPath))
        valDS = self.tokenize_data(self.read_data(self.valPath))

        # config["warmupSteps"] is a proportion; convert it to an absolute count.
        totalSteps = len(trainDS) // config["batchSize"] * config["epochs"]
        warmupSteps = int(totalSteps * config["warmupSteps"])

        trainingArgs = TrainingArguments(
            output_dir=config["outputDir"],
            num_train_epochs=config["epochs"],
            per_device_train_batch_size=config["batchSize"],
            per_device_eval_batch_size=config["batchSize"],
            warmup_steps=warmupSteps,
            weight_decay=config["weightDecay"],
            logging_dir="./logs",
            logging_steps=100,
            learning_rate=config["initlr"],
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=5,
            adam_epsilon=config["epsilon"],
            report_to="wandb",
            # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
            # instead of failing while the arguments are built.
            fp16=torch.cuda.is_available()
        )

        # Honour the earlyStopping switch instead of always installing the callback.
        callbacks = []
        if config["earlyStopping"]:
            callbacks.append(EarlyStoppingCallback(early_stopping_patience=config["earlyStoppingPatience"]))

        trainer = Trainer(
            model=self.model,
            args=trainingArgs,
            train_dataset=trainDS,
            eval_dataset=valDS,
            compute_metrics=self.compute_metrics,
            callbacks=callbacks
        )
        print(f"Number of parameters: {trainer.model.num_parameters()}")
        print("Running eval ...")
        trainer.evaluate()
        print("Running training ...")
        trainer.train()
        print("Saving model ...")
        trainer.save_model(config["outputDir"])
207+
208+
209+
210+
class ACARISLabeler:
    """Fine-tune a DistilBERT three-class sentiment classifier with the Hugging Face ``Trainer``.

    Hyperparameters are read from the module-level ``config`` dict. Input files
    are ``|``-separated CSVs with ``content`` and ``sentiment`` columns, where
    ``sentiment`` is one of ``neg``, ``neu`` or ``pos``.
    """

    def __init__(self, trainPath, valPath):
        """Load the tokenizer and the pretrained model named by ``config["mdl"]``.

        Args:
            trainPath: path of the training CSV.
            valPath: path of the validation CSV.
        """
        self.trainPath = trainPath
        self.valPath = valPath
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(config["mdl"])
        self.model = DistilBertForMulticlassSequenceClassification.from_pretrained(
            config["mdl"],
            num_labels=3,
            id2label={0: "neg", 1: "neu", 2: "pos"},
            label2id={"neg": 0, "neu": 1, "pos": 2},
            dropout=config["dropout"],
            attention_dropout=config["dropout"],
        )

    def read_data(self, path):
        """Read the ``content`` and ``sentiment`` columns of a ``|``-separated CSV into a ``Dataset``."""
        df = pd.read_csv(path, sep="|", usecols=["content", "sentiment"])
        return Dataset.from_pandas(df)

    def tokenize_data(self, dataset):
        """Tokenize ``content`` and turn ``sentiment`` into integer ``labels``.

        Every text is truncated and padded to ``config["maxLen"]`` tokens. The
        returned dataset yields torch tensors for ``input_ids``,
        ``attention_mask`` and ``labels``. An unknown sentiment raises ``KeyError``.
        """
        sentMapping = {"pos": 2, "neg": 0, "neu": 1}

        def encode(batch):
            # Plain Python ints are enough here: set_format below converts the
            # column to torch tensors when rows are read.
            return {
                **self.tokenizer(batch["content"], truncation=True, padding="max_length", max_length=config["maxLen"]),
                "labels": [sentMapping[sent] for sent in batch["sentiment"]],
            }

        tokenized = dataset.map(encode, batched=True, remove_columns=["content", "sentiment"])
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        return tokenized

    def get_data_loaders(self, trainDS, valDS):
        """Wrap both datasets in unshuffled ``DataLoader``s of ``config["batchSize"]``."""
        trainLoader = DataLoader(trainDS, batch_size=config["batchSize"], shuffle=False)
        valLoader = DataLoader(valDS, batch_size=config["batchSize"], shuffle=False)
        return trainLoader, valLoader

    def compute_metrics(self, evalPred):
        """Compute accuracy, one-vs-rest ROC AUC and per-class precision/recall/F1.

        Args:
            evalPred: ``(logits, labels)`` pair as passed by the ``Trainer``.

        Returns:
            dict with ``accuracy``, ``roc_auc`` and ``{precision,recall,f1}_{neg,neu,pos}``.
        """
        logits, labels = evalPred
        labelNames = ["neg", "neu", "pos"]
        classIds = list(range(len(labelNames)))

        logitsT = torch.Tensor(logits)
        preds = torch.argmax(logitsT, dim=1).numpy()
        probs = torch.nn.functional.softmax(logitsT, dim=1).numpy()

        # Explicit class ids keep the per-class arrays three entries long and
        # aligned with labelNames even if a class is absent from this batch of
        # evaluation data.
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, labels=classIds, average=None)
        accuracy = accuracy_score(labels, preds)
        rocAUC = roc_auc_score(labels, probs, multi_class="ovr", labels=classIds)

        metrics = {
            "accuracy": accuracy,
            "roc_auc": rocAUC
        }
        for metricName, metricValue in zip(["precision", "recall", "f1"], [precision, recall, f1]):
            for labelName, value in zip(labelNames, metricValue):
                metrics[f"{metricName}_{labelName}"] = float(value)
        return metrics

    def train(self):
        """Tokenize both splits, evaluate once, fine-tune, and save to ``config["outputDir"]``."""
        trainDS = self.tokenize_data(self.read_data(self.trainPath))
        valDS = self.tokenize_data(self.read_data(self.valPath))

        # config["warmupSteps"] is a proportion; convert it to an absolute count.
        totalSteps = len(trainDS) // config["batchSize"] * config["epochs"]
        warmupSteps = int(totalSteps * config["warmupSteps"])

        trainingArgs = TrainingArguments(
            output_dir=config["outputDir"],
            num_train_epochs=config["epochs"],
            per_device_train_batch_size=config["batchSize"],
            per_device_eval_batch_size=config["batchSize"],
            warmup_steps=warmupSteps,
            weight_decay=config["weightDecay"],
            logging_dir="./logs",
            logging_steps=100,
            learning_rate=config["initlr"],
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_total_limit=5,
            adam_epsilon=config["epsilon"],
            report_to="wandb",
            # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
            # instead of failing while the arguments are built.
            fp16=torch.cuda.is_available()
        )

        # Honour the earlyStopping switch instead of always installing the callback.
        callbacks = []
        if config["earlyStopping"]:
            callbacks.append(EarlyStoppingCallback(early_stopping_patience=config["earlyStoppingPatience"]))

        trainer = Trainer(
            model=self.model,
            args=trainingArgs,
            train_dataset=trainDS,
            eval_dataset=valDS,
            compute_metrics=self.compute_metrics,
            callbacks=callbacks
        )
        print(f"Number of parameters: {trainer.model.num_parameters()}")
        print("Running eval ...")
        trainer.evaluate()
        print("Running training ...")
        trainer.train()
        print("Saving model ...")
        trainer.save_model(config["outputDir"])
299+
300+
301+
if __name__ == "__main__":
    # NOTE(review): this entry point used to instantiate `ACARISBERT`, a name
    # that is not defined in this module (NameError at start-up). ACARISLabeler
    # is a trainer class that is defined here and takes the same two paths -
    # confirm it is the one this script is meant to run.
    acaris_bert = ACARISLabeler("./datasets/train.csv", "./datasets/val.csv")
    acaris_bert.train()
    wandb.finish()

0 commit comments

Comments
 (0)