 
 
 import argparse
+import json
 from lambeq.text2diagram.ccg_parser import CCGParser
 from lambeq.ansatz import BaseAnsatz
 from lambeq.training.model import Model
 from keras import layers
 import os.path
 
-
+USPANTEK_SPANISH_DICT_FILENAME = "refined_usp_to_spanish.json"
 
 
 def f1(y_hat, y):
@@ -98,20 +99,42 @@ def get_max_word_param_length_all_other_ansatz(input_circuits):
         lengths.append(int(symb.name[-1]))
     return lengths
 
-
+"""
+Given a word in Uspantek (from training or dev), look it up in a dictionary,
+find its corresponding Spanish word, and return it.
+The dictionary was provided to us by the professor who went to Guatemala and
+created it with Uspantek-speaking people.
+
+:param usp_word: the word in Uspantek
+:param usp_spanish_dictionary: the Uspantek-to-Spanish lookup dictionary
+
+:return: the corresponding Spanish word, or None if the word is not in the dictionary
+"""
+def get_spanish_word_given_usp_word(usp_word, usp_spanish_dictionary):
+    # a plain dictionary lookup; assumes the loaded Uspantek-to-Spanish dict is
+    # passed in explicitly. Returns None for out-of-vocabulary words.
+    return usp_spanish_dictionary.get(usp_word)
+
+
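+# Usage sketch ("jun"/"uno" are hypothetical entries, not taken from the real
+# dictionary file):
+#
+#     usp_dict = read_usp_spanish_dictionary("refined_usp_to_spanish.json")
+#     get_spanish_word_given_usp_word("jun", usp_dict)   # -> "uno"
+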
 """
 Given a word from the training or dev vocab, get the corresponding embedding using fastText.
 
 :param vocab: vocabulary
+:param embedding_model: the embedding model to use (i.e. model 3), e.g. fastText for Spanish or English
+:param dataset_name: which dataset the vocab comes from, e.g. "spanish" or "uspantek". If it is
+Uspantek we need one extra step: take every Uspantek word and find its corresponding Spanish
+word, since only then does pulling the Spanish embedding make sense.
+:param usp_spanish_dictionary: Uspantek-to-Spanish lookup, required when dataset_name is "uspantek"
+
 :return: returns a dictionary of each word and its corresponding embedding
 """
-def get_vocab_emb_dict(vocab, embedding_model):
+def get_vocab_emb_dict(vocab, embedding_model, dataset_name, usp_spanish_dictionary=None):
     embed_dict = {}
     for wrd in vocab:
         cleaned_wrd_just_plain_text, cleaned_wrd_with_type = clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
         if cleaned_wrd_with_type in embed_dict:
             print(f"error. the word {cleaned_wrd_with_type} was already in dict")
         else:
+            if dataset_name == "uspantek":
+                assert usp_spanish_dictionary is not None
+                # for each Uspantek word, swap in the corresponding Spanish word
+                # so that the Spanish embedding lookup below makes sense; words
+                # missing from the dictionary fall through and are looked up as-is
+                if cleaned_wrd_just_plain_text in usp_spanish_dictionary:
+                    cleaned_wrd_just_plain_text = usp_spanish_dictionary[cleaned_wrd_just_plain_text]
             embed_dict[cleaned_wrd_with_type] = embedding_model[cleaned_wrd_just_plain_text]
     return embed_dict
 
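+# Usage sketch (illustrative names; assumes a gensim-style fastText model that
+# supports `model[word]` lookups):
+#
+#     train_emb = get_vocab_emb_dict(train_vocab, fasttext_es, "spanish")
+#     train_emb = get_vocab_emb_dict(train_vocab, fasttext_es, "uspantek", usp_dict)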
@@ -154,7 +177,7 @@ def create_vocab_from_circuits(circuits,ansatz):
 
 """
 
-def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model, ansatz, model_type_class):
+def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model, ansatz, model_type_class, usp_spanish_dictionary, dataset_name):
 
 
     train_vocab = create_vocab_from_circuits(train_circuits, ansatz)
@@ -204,9 +227,9 @@ def generate_initial_parameterisation(train_circuits, val_circuits, embedding_mo
 
     assert max_word_param_length != 0
 
     if (ansatz == SpiderAnsatz):
         # for each word in the train and val vocab, get its embedding from fastText
-        train_vocab_embeddings = get_vocab_emb_dict(train_vocab, embedding_model)
+        train_vocab_embeddings = get_vocab_emb_dict(train_vocab, embedding_model, dataset_name, usp_spanish_dictionary)
-        val_vocab_embeddings = get_vocab_emb_dict(val_vocab, embedding_model)
+        val_vocab_embeddings = get_vocab_emb_dict(val_vocab, embedding_model, dataset_name, usp_spanish_dictionary)
     else:
         # the words created by other ansatz classes are formatted differently
@@ -528,6 +551,24 @@ def read_glue_data(dataset_downloaded,split,lines_to_read=0):
 
 
 
+"""
+The dictionary/translator between Uspantek and Spanish comes as a JSON file.
+Here we load that file and turn it into a key-value mapping.
+
+:param file_path: path to the dictionary
+
+:return: returns a dictionary of each Uspantek word with its corresponding Spanish translation
+"""
+
+
+def read_usp_spanish_dictionary(file_path):
+    with open(file_path, 'r') as json_file:
+        uspantek_to_spanish = json.load(json_file)
+
+    return uspantek_to_spanish
+
+
+
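+# Expected file shape and usage (the entries shown are hypothetical examples,
+# not taken from the real refined_usp_to_spanish.json):
+#
+#     {"jun": "uno", "kab'": "dos"}
+#
+#     usp_dict = read_usp_spanish_dictionary("data/refined_usp_to_spanish.json")
+#     usp_dict["jun"]   # -> "uno"
+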
 def read_data(filename, lines_to_read):
     labels, sentences = [], []
     line_counter = 0
@@ -600,7 +641,7 @@ def convert_diagram_to_circuits_with_try_catch(diagrams, ansatz, labels,split):
     return list_circuits, list_labels
 
 
-def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels, test_diagrams, test_labels, eval_metrics, seed, embedding_model, ansatz_class, single_qubit_params, base_dimension_for_noun, base_dimension_for_sent, base_dimension_for_prep_phrase, no_of_layers_in_ansatz, expose_model1_val_during_model_initialization, batch_size, learning_rate_model1, model_class_to_use, epochs_train_model1, trainer_class_to_use, do_model3_tuning, learning_rate_model3, maxparams, epochs_model3_oov_model, model14type, use_wandb):
+def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels, test_diagrams, test_labels, eval_metrics, seed, embedding_model, ansatz_class, single_qubit_params, base_dimension_for_noun, base_dimension_for_sent, base_dimension_for_prep_phrase, no_of_layers_in_ansatz, expose_model1_val_during_model_initialization, batch_size, learning_rate_model1, model_class_to_use, epochs_train_model1, trainer_class_to_use, do_model3_tuning, learning_rate_model3, maxparams, epochs_model3_oov_model, model14type, use_wandb, usp_spanish_dictionary, dataset_name):
     if ansatz_class in [IQPAnsatz, Sim15Ansatz, Sim14Ansatz]:
         ansatz_obj = ansatz_class({AtomicType.NOUN: base_dimension_for_noun,
                                    AtomicType.SENTENCE: base_dimension_for_sent,
@@ -710,7 +751,7 @@ def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_d
 
 
 
-    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj, ansatz_class, model_class_to_use)
+    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj, ansatz_class, model_class_to_use, usp_spanish_dictionary, dataset_name)
 
     global MAX_PARAM_LENGTH
     MAX_PARAM_LENGTH = max_w_param_length
@@ -911,6 +952,12 @@ def perform_task(args):
+    # default to None so that run_experiment always receives this argument,
+    # even for datasets that have no Uspantek-to-Spanish dictionary
+    usp_spanish_dictionary = None
     if args.dataset in ["uspantek", "spanish"]:
         spanish_tokeniser = spacy.load("es_core_news_sm")
         spacy_tokeniser.tokeniser = spanish_tokeniser
+
+        if args.dataset == "uspantek":
+            # load the usp_spanish dictionary
+            usp_spanish_dictionary = read_usp_spanish_dictionary(os.path.join(args.data_base_folder, USPANTEK_SPANISH_DICT_FILENAME))
+
+
     else:
         english_tokenizer = spacy.load("en_core_web_sm")
         spacy_tokeniser.tokeniser = english_tokenizer
@@ -928,6 +975,7 @@ def perform_task(args):
     test_labels, test_data = read_data(os.path.join(args.data_base_folder, TEST), lines_to_read=args.no_of_training_data_points_to_use)
 
 
+
 
 
     """#some datasets like spanish, uspantek, sst2 have some sentences which bobcat doesn't like. putting it
@@ -979,7 +1027,7 @@ def perform_task(args):
     # But commenting out due to lack of RAM in laptop
     tf_seed = args.seed
     tf.random.set_seed(tf_seed)
-    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels, test_diagrams, test_labels, eval_metrics, tf_seed, embedding_model, args.ansatz, args.single_qubit_params, args.base_dimension_for_noun, args.base_dimension_for_sent, args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz, args.expose_model1_val_during_model_initialization, args.batch_size, args.learning_rate_model1, args.model14type, args.epochs_train_model1, args.trainer, args.do_model3_tuning, args.learning_rate_model3, args.maxparams, args.epochs_model3_oov_model, args.model14type, args.use_wandb)
+    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels, test_diagrams, test_labels, eval_metrics, tf_seed, embedding_model, args.ansatz, args.single_qubit_params, args.base_dimension_for_noun, args.base_dimension_for_sent, args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz, args.expose_model1_val_during_model_initialization, args.batch_size, args.learning_rate_model1, args.model14type, args.epochs_train_model1, args.trainer, args.do_model3_tuning, args.learning_rate_model3, args.maxparams, args.epochs_model3_oov_model, args.model14type, args.use_wandb, usp_spanish_dictionary, args.dataset)
 
 def parse_name_model(val):
     try:
@@ -1092,6 +1140,7 @@ def parse_arguments():
     parser.add_argument('--do_debug', action="store_true", help="whether to run in debug mode; if yes, the debugger-attach code will be uncommented")
     parser.add_argument('--use_wandb', action="store_true", help="turn on wandb; making it optional since wandb doesn't work well with cyverse")
 
+
 
     return parser.parse_args()
 