
Commit d3d2b39 (1 parent: 2cef170)

able to load Spanish dictionary, but it's not getting all words correctly converted

classify.py

Lines changed: 18 additions & 21 deletions
@@ -21,6 +21,7 @@
 
 
 import argparse
+import json
 from lambeq.text2diagram.ccg_parser import CCGParser
 from lambeq.ansatz import BaseAnsatz
 from lambeq.training.model import Model
@@ -60,7 +61,7 @@
 from keras import layers
 import os.path
 
-USPANTEK_SPANISH_DICT_FILENAME="usp_spanish_dictionary.txt"
+USPANTEK_SPANISH_DICT_FILENAME="refined_usp_to_spanish.json"
 
 
 def f1(y_hat, y):
@@ -120,17 +121,20 @@ def get_spanish_word_given_usp_word(usp_word):
 
 :return: returns a dictionary of each word and its corresponding embedding
 """
-def get_vocab_emb_dict(vocab,embedding_model,language):
+def get_vocab_emb_dict(vocab,embedding_model,dataset_name,usp_spanish_dictionary=None):
     embed_dict={}
     for wrd in vocab:
         cleaned_wrd_just_plain_text,cleaned_wrd_with_type=clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
         if cleaned_wrd_with_type in embed_dict :
             print(f"error. the word {cleaned_wrd_with_type} was already in dict")
         else:
-            if(language == "uspantek"):
-                pass
-
+            if(dataset_name == "uspantek"):
+                assert usp_spanish_dictionary != None
                 #then for each uspantek word, get the corresponding spanish word
+                if cleaned_wrd_just_plain_text in usp_spanish_dictionary:
+                    spanish_equivalent_word_or_sent = usp_spanish_dictionary[cleaned_wrd_just_plain_text]
+                else:
+                    pass
             embed_dict[cleaned_wrd_with_type]= embedding_model[cleaned_wrd_just_plain_text]
     return embed_dict
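
A note on the caveat in the commit message: inside the new uspantek branch, spanish_equivalent_word_or_sent is assigned but never used, and a dictionary miss falls through silently, so the embedding is still looked up with the Uspantek surface form. Below is a minimal sketch of how the Spanish equivalent could drive the lookup instead; the helper name lookup_embedding is hypothetical, and embedding_model is assumed to behave like a fastText-style mapping from surface forms to vectors:

    # Hypothetical helper, not part of this commit: route the embedding
    # lookup through the Spanish equivalent when the dictionary has one,
    # falling back to the Uspantek surface form on a miss.
    def lookup_embedding(plain_word, embedding_model, dataset_name,
                         usp_spanish_dictionary=None):
        key = plain_word
        if dataset_name == "uspantek" and usp_spanish_dictionary is not None:
            key = usp_spanish_dictionary.get(plain_word, plain_word)
        return embedding_model[key]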

@@ -173,7 +177,7 @@ def create_vocab_from_circuits(circuits,ansatz):
 
 """
 
-def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class):
+def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model,ansatz,model_type_class,usp_spanish_dictionary,dataset_name):
 
 
     train_vocab=create_vocab_from_circuits(train_circuits,ansatz)
@@ -223,9 +227,9 @@ def generate_initial_parameterisation(train_circuits, val_circuits, embedding_mo
 
     assert max_word_param_length!=0
 
-    if(ansatz==SpiderAnsatz):
+    if(ansatz==SpiderAnsatz):
         #for each word in train and test vocab get its embedding from fasttext
-        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,language)
+        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,dataset_name,usp_spanish_dictionary)
         val_vocab_embeddings = get_vocab_emb_dict(val_vocab,embedding_model)
     else:
         #for the words created by other ansatz other formatting is different
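
A second mismatch in this hunk: get_vocab_emb_dict now takes dataset_name as a required positional parameter, but the val-side call above still passes only two arguments, so it would raise a TypeError whenever the SpiderAnsatz branch runs. The presumably intended call mirrors the train side:

    # Presumed fix, not in this commit: thread the same arguments through
    # for the validation vocabulary.
    val_vocab_embeddings = get_vocab_emb_dict(val_vocab, embedding_model,
                                              dataset_name, usp_spanish_dictionary)
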
@@ -558,16 +562,9 @@ def read_glue_data(dataset_downloaded,split,lines_to_read=0):
 
 
 def read_usp_spanish_dictionary(file_path):
-    with open(file_path) as f:
-        # Extracting word pairs
-        uspantek_to_spanish = {}
-        for text_input in f:
-            for line in text_input.strip().split("\n"):
-                parts = line.split(maxsplit=2)
-                if len(parts) >= 3:
-                    key = parts[0].strip()
-                    value = parts[2].strip()
-                    uspantek_to_spanish[key] = value
+    with open(file_path,'r') as json_file:
+        uspantek_to_spanish = json.load(json_file)
+
     return uspantek_to_spanish
 
 
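
Since json.load returns the decoded top-level object directly, read_usp_spanish_dictionary now assumes refined_usp_to_spanish.json holds a flat JSON object mapping Uspantek words to their Spanish equivalents. A usage sketch under that assumption; the example key is illustrative, not taken from the data:

    # Minimal usage sketch, assuming a flat {uspantek: spanish} mapping.
    usp_to_es = read_usp_spanish_dictionary(USPANTEK_SPANISH_DICT_FILENAME)
    spanish = usp_to_es.get("some_usp_word")  # None on a dictionary miss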

@@ -644,7 +641,7 @@ def convert_diagram_to_circuits_with_try_catch(diagrams, ansatz, labels,split):
     return list_circuits, list_labels
 
 
-def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb):
+def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,seed,embedding_model,ansatz_class, single_qubit_params,base_dimension_for_noun,base_dimension_for_sent,base_dimension_for_prep_phrase,no_of_layers_in_ansatz,expose_model1_val_during_model_initialization,batch_size,learning_rate_model1,model_class_to_use, epochs_train_model1, trainer_class_to_use,do_model3_tuning,learning_rate_model3 ,maxparams,epochs_model3_oov_model,model14type,use_wandb,usp_spanish_dictionary,dataset_name):
     if ansatz_class in [IQPAnsatz,Sim15Ansatz, Sim14Ansatz]:
         ansatz_obj = ansatz_class ({AtomicType.NOUN: base_dimension_for_noun,
                                     AtomicType.SENTENCE: base_dimension_for_sent,
@@ -754,7 +751,7 @@ def run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_d
 
 
 
-    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use)
+    train_embeddings, val_embeddings, max_w_param_length, oov_word_count = generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, model1_obj,ansatz_class, model_class_to_use,usp_spanish_dictionary,dataset_name)
 
     global MAX_PARAM_LENGTH
     MAX_PARAM_LENGTH = max_w_param_length
@@ -1030,7 +1027,7 @@ def perform_task(args):
     # But commenting out due to lack of ram in laptop
     tf_seed = args.seed
     tf.random.set_seed(tf_seed)
-    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb)
+    return run_experiment(train_diagrams, train_labels, val_diagrams, val_labels,test_diagrams,test_labels, eval_metrics,tf_seed,embedding_model,args.ansatz,args.single_qubit_params,args.base_dimension_for_noun,args.base_dimension_for_sent,args.base_dimension_for_prep_phrase, args.no_of_layers_in_ansatz,args.expose_model1_val_during_model_initialization , args.batch_size,args.learning_rate_model1,args.model14type, args.epochs_train_model1,args.trainer,args.do_model3_tuning,args.learning_rate_model3,args.maxparams,args.epochs_model3_oov_model, args.model14type, args.use_wandb,usp_spanish_dictionary,args.dataset)
 
 def parse_name_model(val):
     try:
