
Commit ea0191e

committed
In the middle of loading the Uspantek-Spanish dictionary. The txt file Robert provided is a mess. Next TODO: give it to ChatGPT and ask it to create a TSV file that has ONLY each Uspantek word and its corresponding Spanish translation, none of the grammar-related cruft.
1 parent 4da347d commit ea0191e
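
For reference, a minimal sketch of what loading the planned cleaned-up TSV could look like, assuming one tab-separated pair of Uspantek word and Spanish translation per line. The function name, file name, and format are assumptions; no such TSV exists in this commit.

import csv

# Hypothetical loader for the planned TSV (uspantek_word <TAB> spanish_translation per line).
# The file does not exist yet; its format is an assumption based on the TODO above.
def read_usp_spanish_tsv(file_path):
    usp_to_spanish = {}
    with open(file_path, encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="\t"):
            if len(row) < 2:
                continue  # skip blank or malformed lines
            usp_to_spanish[row[0].strip()] = row[1].strip()
    return usp_to_spanish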

File tree

2 files changed (+48, -5 lines)


classify.py

Lines changed: 47 additions & 4 deletions
@@ -60,7 +60,7 @@
 from keras import layers
 import os.path
 
-
+USPANTEK_SPANISH_DICT_FILENAME="usp_spanish_dictionary.txt"
 
 
 def f1(y_hat, y):
@@ -98,20 +98,39 @@ def get_max_word_param_length_all_other_ansatz(input_circuits):
        lengths.append(int(symb.name[-1]))
    return lengths
 
-
+"""
+Given a word in Uspantek (from training or dev), look it up in the dictionary, find its corresponding Spanish word, and return it.
+The dictionary was provided to us by the professor who went to Guatemala and built it with Uspantek speakers.
+
+:param usp_word: the word in Uspantek
+
+:return: returns the corresponding Spanish word
+"""
+def get_spanish_word_given_usp_word(usp_word):
+
+    pass
+
+
 """
 given a word from training and dev vocab, get the corresponding embedding using fast text.
 
 :param vocab: vocabulary
+:param embedding_model: the embedding model to use, i.e. model 3, e.g. fastText for Spanish or English
+:param language: which language; options are spanish or uspantek. If it is Uspantek, we need one extra step: take every Uspantek word and find its corresponding Spanish word, since only then does pulling the Spanish embedding make sense.
+
 :return: returns a dictionary of each word and its corresponding embedding
 """
-def get_vocab_emb_dict(vocab,embedding_model):
+def get_vocab_emb_dict(vocab,embedding_model,language):
    embed_dict={}
    for wrd in vocab:
        cleaned_wrd_just_plain_text,cleaned_wrd_with_type=clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
        if cleaned_wrd_with_type in embed_dict :
            print(f"error. the word {cleaned_wrd_with_type} was already in dict")
        else:
+            if(language == "uspantek"):
+                pass
+
+            #then for each uspantek word, get the corresponding spanish word
            embed_dict[cleaned_wrd_with_type]= embedding_model[cleaned_wrd_just_plain_text]
    return embed_dict
 
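The two additions above are still stubs (both branches end in pass). One way they could be completed, assuming the loaded dictionary is exposed as a module-level dict USP_SPANISH_DICT (an assumption; this commit does not wire that up) and that clean_wrd_for_spider_ansatz_coming_from_vocab behaves as in the existing code:

USP_SPANISH_DICT = {}  # assumed to be filled by read_usp_spanish_dictionary at startup

def get_spanish_word_given_usp_word(usp_word):
    # Look the Uspantek word up; return None when it is missing from the dictionary.
    return USP_SPANISH_DICT.get(usp_word)

def get_vocab_emb_dict(vocab, embedding_model, language):
    embed_dict = {}
    for wrd in vocab:
        cleaned_wrd_just_plain_text, cleaned_wrd_with_type = clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
        if cleaned_wrd_with_type in embed_dict:
            print(f"error. the word {cleaned_wrd_with_type} was already in dict")
        else:
            lookup_word = cleaned_wrd_just_plain_text
            if language == "uspantek":
                # Map the Uspantek word to its Spanish counterpart first, so that
                # the Spanish fastText lookup below makes sense.
                spanish_word = get_spanish_word_given_usp_word(cleaned_wrd_just_plain_text)
                if spanish_word is not None:
                    lookup_word = spanish_word
            embed_dict[cleaned_wrd_with_type] = embedding_model[lookup_word]
    return embed_dict

Whether to fall back to the raw Uspantek word (as above) or to skip or flag out-of-dictionary words is an open design choice.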
@@ -206,7 +225,7 @@ def generate_initial_parameterisation(train_circuits, val_circuits, embedding_mo
 
    if(ansatz==SpiderAnsatz):
        #for each word in train and test vocab get its embedding from fasttext
-        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model)
+        train_vocab_embeddings = get_vocab_emb_dict(train_vocab,embedding_model,language)
        val_vocab_embeddings = get_vocab_emb_dict(val_vocab,embedding_model)
    else:
        #for the words created by other ansatz other formatting is different
@@ -528,6 +547,22 @@ def read_glue_data(dataset_downloaded,split,lines_to_read=0):
 
 
 
+"""
+The dictionary/translator between Uspantek and Spanish came in a txt file.
+Here we load that file and convert it into key-value pairs.
+
+:param file_path: path to the dictionary
+
+:return: returns a dictionary mapping each Uspantek word to its corresponding Spanish translation
+"""
+
+
+def read_usp_spanish_dictionary(file_path):
+    with open(file_path) as f:
+        for line in f:
+            line_split = line.split(" ")
+            print(line_split)
+
 def read_data(filename,lines_to_read):
    labels, sentences = [], []
    line_counter=0
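
As committed, read_usp_spanish_dictionary only prints the split lines and returns nothing. A minimal sketch of a version that actually builds the key-value pairs, assuming each usable line starts with the Uspantek headword followed by its Spanish gloss (the real txt file is messier than that, per the commit message, so this is a placeholder until the cleaned TSV exists):

def read_usp_spanish_dictionary(file_path):
    usp_spanish_dictionary = {}
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line_split = line.strip().split()
            if len(line_split) < 2:
                continue  # skip blank lines and entries without a translation
            usp_word = line_split[0]
            spanish_translation = " ".join(line_split[1:])
            usp_spanish_dictionary[usp_word] = spanish_translation
    return usp_spanish_dictionary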
@@ -911,6 +946,12 @@ def perform_task(args):
    if args.dataset in ["uspantek","spanish"]:
        spanish_tokeniser=spacy.load("es_core_news_sm")
        spacy_tokeniser.tokeniser = spanish_tokeniser
+
+        if args.dataset == "uspantek":
+            #load the usp_spanish dictionary
+            usp_spanish_dictionary= read_usp_spanish_dictionary(os.path.join(args.data_base_folder,USPANTEK_SPANISH_DICT_FILENAME))
+
+
    else:
        english_tokenizer = spacy.load("en_core_web_sm")
        spacy_tokeniser.tokeniser =english_tokenizer
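
The loaded dictionary is currently only assigned to a local variable inside perform_task. If the module-level-dict approach from the earlier sketch were used, the wiring could look like this (names are assumptions):

if args.dataset == "uspantek":
    #load the usp_spanish dictionary and expose it to the lookup helper
    USP_SPANISH_DICT.update(
        read_usp_spanish_dictionary(
            os.path.join(args.data_base_folder, USPANTEK_SPANISH_DICT_FILENAME)))

Passing the dictionary down as an explicit argument to get_vocab_emb_dict would work just as well and avoids module-level state.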
@@ -928,6 +969,7 @@ def perform_task(args):
    test_labels, test_data = read_data(os.path.join(args.data_base_folder,TEST),lines_to_read= args.no_of_training_data_points_to_use)
 
 
+
 
 
    """#some datasets like spanish, uspantek, sst2 have some sentences which bobcat doesnt like. putting it
@@ -1092,6 +1134,7 @@ def parse_arguments():
    parser.add_argument('--do_debug', action= "store_true",help="to run debug or not to debug. If yes, will uncomment the attachment code")
    parser.add_argument('--use_wandb', action= "store_true",help="turn on wandb. making it optional since wandb doesnt work well with cyverse")
 
+
 
    return parser.parse_args()
 
test_oov_no_pair.py

Lines changed: 1 addition & 1 deletion
@@ -249,7 +249,7 @@ def test_uspantek_quantum1_no_expose_val(monkeypatch):
    assert type(ex) == KeyError
 
 
-# python classify.py --dataset uspantek --parser BobCatParser --ansatz IQPAnsatz --model14type TketModel --trainer QuantumTrainer --epochs_train_model1 30 --no_of_training_data_points_to_use 70 --no_of_val_data_points_to_use 30 --max_tokens_per_sent 10 ----expose_model1_val_during_model_initialization
+# python classify.py --dataset uspantek --parser BobCatParser --ansatz IQPAnsatz --model14type TketModel --trainer QuantumTrainer --epochs_train_model1 30 --no_of_training_data_points_to_use 70 --no_of_val_data_points_to_use 30 --max_tokens_per_sent 10 --expose_model1_val_during_model_initialization
 
 
 def test_uspantek_quantum1_yes_expose_val(monkeypatch):
