
Commit 2975bb7

Merge pull request #55 from ua-datalab/add_spanish_uspantek_dictionary_for_usp_data_code
Add spanish uspantek dictionary for usp data code
2 parents d5b5fef + d3d2b39 commit 2975bb7

File tree

2 files changed: +59 -10 lines


classify.py

Lines changed: 58 additions & 9 deletions
@@ -21,6 +21,7 @@
 
 
 import argparse
+import json
 from lambeq.text2diagram.ccg_parser import CCGParser
 from lambeq.ansatz import BaseAnsatz
 from lambeq.training.model import Model
@@ -60,7 +61,7 @@
 from keras import layers
 import os.path
 
-
+USPANTEK_SPANISH_DICT_FILENAME="refined_usp_to_spanish.json"
 
 
 def f1(y_hat, y):
@@ -98,20 +99,42 @@ def get_max_word_param_length_all_other_ansatz(input_circuits):
         lengths.append(int(symb.name[-1]))
     return lengths
 
-
+"""
+Given a word in Uspantek (from training or dev), look it up in a dictionary, find its corresponding Spanish word, and return it.
+The dictionary was provided to us by the professor who went to Guatemala and built it with Uspantek speakers.
+
+:param usp_word: the word in Uspantek
+
+:return: the corresponding Spanish word
+"""
+def get_spanish_word_given_usp_word(usp_word):
+
+    pass
+
+
 """
-Given a word from the training or dev vocab, get the corresponding embedding using fastText.
+Given a word from the training or dev vocab, get the corresponding embedding using fastText.
 
 :param vocab: vocabulary
+:param embedding_model: the embedding model to use (i.e. model 3), e.g. fastText for Spanish or English
+:param dataset_name: which language, spanish or uspantek. If it is uspantek, one extra step is needed: map each Uspantek word to its Spanish counterpart via usp_spanish_dictionary; only then does pulling the corresponding Spanish embedding make sense.
+
 :return: returns a dictionary of each word and its corresponding embedding
 """
-def get_vocab_emb_dict(vocab,embedding_model):
+def get_vocab_emb_dict(vocab,embedding_model,dataset_name,usp_spanish_dictionary=None):
     embed_dict={}
     for wrd in vocab:
         cleaned_wrd_just_plain_text,cleaned_wrd_with_type=clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
         if cleaned_wrd_with_type in embed_dict :
             print(f"error. the word {cleaned_wrd_with_type} was already in dict")
         else:
+            if(dataset_name == "uspantek"):
+                assert usp_spanish_dictionary != None
+                #then for each uspantek word, get the corresponding spanish word
+                if cleaned_wrd_just_plain_text in usp_spanish_dictionary:
+                    spanish_equivalent_word_or_sent = usp_spanish_dictionary[cleaned_wrd_just_plain_text]
+                else:
+                    pass
             embed_dict[cleaned_wrd_with_type]= embedding_model[cleaned_wrd_just_plain_text]
     return embed_dict

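The look-up helper above is committed as a stub (`pass`), and the Uspantek branch in get_vocab_emb_dict computes spanish_equivalent_word_or_sent without feeding it into the embedding look-up yet. As a rough sketch of how the pieces could eventually connect (the extra dictionary parameter, the None fallback, and the helper name embed_usp_word are assumptions, not code from this commit):

# Hypothetical sketch only; this commit leaves get_spanish_word_given_usp_word as a stub.
def get_spanish_word_given_usp_word(usp_word, usp_spanish_dictionary):
    # Return the Spanish translation, or None when the word is not in the
    # dictionary (the fallback behaviour is assumed, not specified here).
    return usp_spanish_dictionary.get(usp_word)

def embed_usp_word(usp_word, usp_spanish_dictionary, embedding_model):
    # Map Uspantek -> Spanish first, then pull the Spanish fastText vector,
    # mirroring the intent described in the docstrings above.
    spanish_word = get_spanish_word_given_usp_word(usp_word, usp_spanish_dictionary)
    if spanish_word is None:
        return None  # caller decides how to handle out-of-vocabulary words
    return embedding_model[spanish_word]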
@@ -154,7 +177,7 @@ def create_vocab_from_circuits(circuits,ansatz):
 
 """
 
-def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class):
+def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class,usp_spanish_dictionary,dataset_name):
 
 
     train_vocab=create_vocab_from_circuits(train_circuits,ansatz)
@@ -204,9 +227,9 @@ def generate_initial_parameterisation(train_circuits, val_circuits, embedding_mo
 
     assert max_word_param_length!=0
 
-    if(ansatz==SpiderAnsatz):
+    if(ansatz==SpiderAnsatz):
         #for each word in the train and test vocab, get its embedding from fasttext
-        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model)
+        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,dataset_name,usp_spanish_dictionary)
         val_vocab_embeddings = get_vocab_emb_dict(val_vocab,embedding_model)
     else:
         #for words created by other ansatzes, the formatting is different
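Note that get_vocab_emb_dict now takes dataset_name as a required third positional argument, while the validation call above still passes only two; presumably it would need the same extra arguments as the train call, along the lines of:

# Hypothetical symmetric call, not part of this commit:
val_vocab_embeddings = get_vocab_emb_dict(val_vocab, embedding_model,
                                          dataset_name, usp_spanish_dictionary)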
@@ -528,6 +551,24 @@ def read_glue_data(dataset_downloaded,split,lines_to_read=0):
 
 
 
+"""
+The dictionary/translator between Uspantek and Spanish originally came as a txt file; the refined version loaded here is JSON.
+We load that file and convert it into key-value pairs.
+
+:param file_path: path to the dictionary
+
+:return: returns a dictionary mapping each Uspantek word to its corresponding Spanish translation
+"""
+
+
+def read_usp_spanish_dictionary(file_path):
+    with open(file_path,'r') as json_file:
+        uspantek_to_spanish = json.load(json_file)
+
+    return uspantek_to_spanish
+
+
+
 def read_data(filename,lines_to_read):
     labels, sentences = [], []
     line_counter=0
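For illustration, the file is expected to be a flat JSON object mapping Uspantek words to Spanish words. A hypothetical shape and usage follow; the path and the placeholder entries are invented, not taken from the real dictionary:

# refined_usp_to_spanish.json (hypothetical shape):
#   {"<usp_word_1>": "<spanish_word_1>", "<usp_word_2>": "<spanish_word_2>"}
usp_to_es = read_usp_spanish_dictionary("data/refined_usp_to_spanish.json")
print(len(usp_to_es))                 # number of dictionary entries
print(usp_to_es.get("<usp_word_1>"))  # -> "<spanish_word_1>" (placeholder)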
@@ -600,7 +641,7 @@ def convert_diagram_to_circuits_with_try_catch(diagrams, ansatz, labels,split):
     return list_circuits, list_labels
 
 
-def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb):
+def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb,usp_spanish_dictionary,dataset_name):
     if ansatz_class in [IQPAnsatz,Sim15Ansatz, Sim14Ansatz]:
         ansatz_obj = ansatz_class ({AtomicType.NOUN: base_dimension_for_noun,
                                     AtomicType.SENTENCE: base_dimension_for_sent,
@@ -710,7 +751,7 @@ def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_d
 
 
 
-    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use)
+    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use,usp_spanish_dictionary,dataset_name)
 
     global MAX_PARAM_LENGTH
     MAX_PARAM_LENGTH = max_w_param_length
@@ -911,6 +952,12 @@ def perform_task(args):
     if args.dataset in ["uspantek","spanish"]:
         spanish_tokeniser=spacy.load("es_core_news_sm")
         spacy_tokeniser.tokeniser = spanish_tokeniser
+
+        if args.dataset == "uspantek":
+            #load the usp-spanish dictionary
+            usp_spanish_dictionary= read_usp_spanish_dictionary(os.path.join(args.data_base_folder,USPANTEK_SPANISH_DICT_FILENAME))
+
+
     else:
         english_tokenizer = spacy.load("en_core_web_sm")
         spacy_tokeniser.tokeniser =english_tokenizer
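One caveat: usp_spanish_dictionary is only bound inside the uspantek branch, yet perform_task later passes it to run_experiment for every dataset. A defensive variant (a suggestion, not what the commit does) would bind the name up front:

# Hypothetical hardening, not part of this commit: initialise the variable
# before the branch so non-Uspantek runs pass None instead of hitting a NameError.
usp_spanish_dictionary = None
if args.dataset == "uspantek":
    usp_spanish_dictionary = read_usp_spanish_dictionary(
        os.path.join(args.data_base_folder, USPANTEK_SPANISH_DICT_FILENAME))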
@@ -928,6 +975,7 @@ def perform_task(args):
     test_labels, test_data = read_data(os.path.join(args.data_base_folder,TEST),lines_to_read= args.no_of_training_data_points_to_use)
 
 
+
 
 
 """#some datasets like spanish, uspantek, sst2 have some sentences which bobcat doesnt like. putting it
@@ -979,7 +1027,7 @@ def perform_task(args):
     # But commenting out due to lack of ram in laptop
     tf_seed = args.seed
     tf.random.set_seed(tf_seed)
-    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb)
+    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb,usp_spanish_dictionary,args.dataset)
 
 def parse_name_model(val):
     try:
@@ -1092,6 +1140,7 @@ def parse_arguments():
     parser.add_argument('--do_debug', action= "store_true",help="to run debug or not to debug. If yes, will uncomment the attachment code")
     parser.add_argument('--use_wandb', action= "store_true",help="turn on wandb. making it optional since wandb doesnt work well with cyverse")
 
+
 
     return parser.parse_args()

test_oov_no_pair.py

Lines changed: 1 addition & 1 deletion
@@ -249,7 +249,7 @@ def test_uspantek_quantum1_no_expose_val(monkeypatch):
     assert type(ex) == KeyError
 
 
-# python classify.py --dataset uspantek --parser BobCatParser --ansatz IQPAnsatz --model14type TketModel --trainer QuantumTrainer --epochs_train_model1 30 --no_of_training_data_points_to_use 70 --no_of_val_data_points_to_use 30 --max_tokens_per_sent 10 ----expose_model1_val_during_model_initialization
+# python classify.py --dataset uspantek --parser BobCatParser --ansatz IQPAnsatz --model14type TketModel --trainer QuantumTrainer --epochs_train_model1 30 --no_of_training_data_points_to_use 70 --no_of_val_data_points_to_use 30 --max_tokens_per_sent 10 --expose_model1_val_during_model_initialization
 
 
 def test_uspantek_quantum1_yes_expose_val(monkeypatch):
