|
21 | 21 |
|
22 | 22 |
|
23 | 23 | import argparse |
| 24 | +import json |
24 | 25 | from lambeq.text2diagram.ccg_parser import CCGParser |
25 | 26 | from lambeq.ansatz import BaseAnsatz |
26 | 27 | from lambeq.training.model import Model |
|
60 | 61 | from keras import layers |
61 | 62 | import os.path |
62 | 63 |
|
63 | | -USPANTEK_SPANISH_DICT_FILENAME="usp_spanish_dictionary.txt" |
| 64 | +USPANTEK_SPANISH_DICT_FILENAME="refined_usp_to_spanish.json" |
64 | 65 |
|
65 | 66 |
|
66 | 67 | def f1(y_hat, y): |
@@ -120,17 +121,20 @@ def get_spanish_word_given_usp_word(usp_word): |
120 | 121 |
|
121 | 122 | :return: returns a dictionary of each word and its corresponding embedding |
122 | 123 | """ |
123 | | -def get_vocab_emb_dict(vocab,embedding_model,language): |
| 124 | +def get_vocab_emb_dict(vocab,embedding_model,dataset_name,usp_spanish_dictionary=None): |
124 | 125 | embed_dict={} |
125 | 126 | for wrd in vocab: |
126 | 127 | cleaned_wrd_just_plain_text,cleaned_wrd_with_type=clean_wrd_for_spider_ansatz_coming_from_vocab(wrd) |
127 | 128 | if cleaned_wrd_with_type in embed_dict : |
128 | 129 | print(f"error. the word {cleaned_wrd_with_type} was already in dict") |
129 | 130 | else: |
130 | | - if(language == "uspantek"): |
131 | | - pass |
132 | | - |
| 131 | + if(dataset_name == "uspantek"): |
| 132 | + assert usp_spanish_dictionary is not None |
133 | 133 | #then for each uspantek word, get the corresponding spanish word |
| 134 | + if cleaned_wrd_just_plain_text in usp_spanish_dictionary: |
| 135 | + #swap in the spanish equivalent so the spanish embedding model can resolve the word |
| 136 | + cleaned_wrd_just_plain_text = usp_spanish_dictionary[cleaned_wrd_just_plain_text] |
| 137 | + #else: keep the surface form and fall through to the lookup below |
134 | 138 | embed_dict[cleaned_wrd_with_type]= embedding_model[cleaned_wrd_just_plain_text] |
135 | 139 | return embed_dict |
136 | 140 |
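
The uspantek branch above routes each token through the bilingual dictionary before querying the embedding model. A minimal standalone sketch of that lookup logic, using a hypothetical helper name (`lookup_embedding` is not part of this file) and assuming `embedding_model` is a fastText-style word-to-vector mapping where dictionary misses fall back to the surface form:

```python
def lookup_embedding(word, embedding_model, usp_spanish_dictionary=None):
    # translate Uspantek -> Spanish when a dictionary entry exists;
    # otherwise query the surface form (may raise KeyError for true OOV words)
    if usp_spanish_dictionary and word in usp_spanish_dictionary:
        word = usp_spanish_dictionary[word]
    return embedding_model[word]
```
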
|
@@ -173,7 +177,7 @@ def create_vocab_from_circuits(circuits,ansatz): |
173 | 177 |
|
174 | 178 | """ |
175 | 179 |
|
176 | | -def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class): |
| 180 | +def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class,usp_spanish_dictionary,dataset_name): |
177 | 181 |
|
178 | 182 |
|
179 | 183 | train_vocab=create_vocab_from_circuits(train_circuits,ansatz) |
@@ -223,9 +227,9 @@ def generate_initial_parameterisation(train_circuits, val_circuits, embedding_mo |
223 | 227 |
|
224 | 228 | assert max_word_param_length!=0 |
225 | 229 |
|
226 | | - if(ansatz==SpiderAnsatz): |
| 230 | + if(ansatz==SpiderAnsatz): |
227 | 231 | #for each word in train and test vocab get its embedding from fasttext |
228 | | - train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,language) |
| 232 | + train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,dataset_name,usp_spanish_dictionary) |
229 | | - val_vocab_embeddings = get_vocab_emb_dict(val_vocab,embedding_model) |
| 233 | + val_vocab_embeddings = get_vocab_emb_dict(val_vocab,embedding_model,dataset_name,usp_spanish_dictionary) |
230 | 234 | else: |
231 | 235 | #for the words created by other ansatz other formatting is different |
@@ -558,16 +562,9 @@ def read_glue_data(dataset_downloaded,split,lines_to_read=0): |
558 | 562 |
|
559 | 563 |
|
560 | 564 | def read_usp_spanish_dictionary(file_path): |
561 | | - with open(file_path) as f: |
562 | | - # Extracting word pairs |
563 | | - uspantek_to_spanish = {} |
564 | | - for text_input in f: |
565 | | - for line in text_input.strip().split("\n"): |
566 | | - parts = line.split(maxsplit=2) |
567 | | - if len(parts) >= 3: |
568 | | - key = parts[0].strip() |
569 | | - value = parts[2].strip() |
570 | | - uspantek_to_spanish[key] = value |
| 565 | + with open(file_path,'r') as json_file: |
| 566 | + uspantek_to_spanish = json.load(json_file) |
| 567 | + |
571 | 568 | return uspantek_to_spanish |
572 | 569 |
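
`read_usp_spanish_dictionary` now expects the dictionary as a JSON object mapping Uspantek words to their Spanish equivalents, replacing the old whitespace-delimited text format. A usage sketch with illustrative placeholder entries (not real data):

```python
# hypothetical shape of refined_usp_to_spanish.json: {"usp_word": "spanish_word", ...}
usp_spanish_dictionary = read_usp_spanish_dictionary("refined_usp_to_spanish.json")
assert isinstance(usp_spanish_dictionary, dict)  # Uspantek word -> Spanish equivalent
```
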
|
573 | 570 |
|
@@ -644,7 +641,7 @@ def convert_diagram_to_circuits_with_try_catch(diagrams, ansatz, labels,split): |
644 | 641 | return list_circuits, list_labels |
645 | 642 |
|
646 | 643 |
|
647 | | -def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb): |
| 644 | +def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb,usp_spanish_dictionary,dataset_name): |
648 | 645 | if ansatz_class in [IQPAnsatz,Sim15Ansatz, Sim14Ansatz]: |
649 | 646 | ansatz_obj = ansatz_class ({AtomicType.NOUN: base_dimension_for_noun, |
650 | 647 | AtomicType.SENTENCE: base_dimension_for_sent, |
@@ -754,7 +751,7 @@ def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_d |
754 | 751 |
|
755 | 752 |
|
756 | 753 |
|
757 | | - train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use) |
| 754 | + train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use,usp_spanish_dictionary,dataset_name) |
758 | 755 |
|
759 | 756 | global MAX_PARAM_LENGTH |
760 | 757 | MAX_PARAM_LENGTH = max_w_param_length |
@@ -1030,7 +1027,7 @@ def perform_task(args): |
1030 | 1027 | # But commenting out due to lack of ram in laptop |
1031 | 1028 | tf_seed = args.seed |
1032 | 1029 | tf.random.set_seed(tf_seed) |
1033 | | - return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb) |
| 1030 | + return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb,usp_spanish_dictionary,args.dataset) |
1034 | 1031 |
|
1035 | 1032 | def parse_name_model(val): |
1036 | 1033 | try: |
|