Skip to content

Commit 06f8017

Browse files
Merge pull request #93 from Living-with-machines/dependabot/pip/pyyaml-5.4
v1.2.3
2 parents db1bcf9 + 5145997 commit 06f8017

File tree

8 files changed

+227
-7
lines changed

8 files changed

+227
-7
lines changed

.github/workflows/dm_ci.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Continuous-integration workflow: run the DeezyMatch test suite with the
# conda-provided Python toolchain on every push and pull request.
name: Continuous integration

on: [push, pull_request]

jobs:
  build-linux:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          $CONDA/bin/pip install -r requirements.txt
      - name: Test with pytest
        run: |
          $CONDA/bin/conda install pytest
          $CONDA/bin/pip install .
          $CONDA/bin/pytest ./DeezyMatch/tests

DeezyMatch/rnn_networks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import time
2828
#from tqdm import tqdm, tnrange
2929
from tqdm.autonotebook import tqdm
30-
from tqdm import tnrange
30+
from tqdm.notebook import tnrange
3131

3232
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
3333

DeezyMatch/tests/test_import.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/usr/bin/env python
2+
# -*- coding: UTF-8 -*-
3+
import pytest
4+
5+
def test_import():
    """Smoke test: the DeezyMatch package must be importable."""
    import DeezyMatch  # noqa: F401  - import itself is the assertion

DeezyMatch/tests/test_pipeline.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/usr/bin/env python
2+
# -*- coding: UTF-8 -*-
3+
import pytest
4+
5+
def test_train():
    """Train a brand-new DeezyMatch model (test001) from the pytest config."""
    from DeezyMatch import train as dm_train

    # Train end-to-end; artifacts are written under ./models/test001
    dm_train(
        input_file_path="./inputs/input_dfm_pytest_001.yaml",
        dataset_path="./dataset/dataset-string-similarity_test.txt",
        model_name="test001",
    )
11+
12+
def test_finetune():
    """Fine-tune the pretrained test001 model and save it as finetuned_test001."""
    from DeezyMatch import finetune as dm_finetune

    # Continue training from the model/vocab written by the training step
    finetune_kwargs = dict(
        input_file_path="./inputs/input_dfm_pytest_001.yaml",
        dataset_path="./dataset/dataset-string-similarity_test.txt",
        model_name="finetuned_test001",
        pretrained_model_path="./models/test001/test001.model",
        pretrained_vocab_path="./models/test001/test001.vocab",
    )
    dm_finetune(**finetune_kwargs)
20+
21+
def test_inference():
    """Run prediction-mode inference with the fine-tuned model."""
    from DeezyMatch import inference as dm_inference

    # Score the pairs in dataset_path using the fine-tuned model and vocab
    dm_inference(
        input_file_path="./inputs/input_dfm_pytest_001.yaml",
        dataset_path="./dataset/dataset-string-similarity_test.txt",
        pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
        pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
    )
29+
30+
def test_generate_query_vecs():
    """Generate query vectors (inference_mode='vect') into queries/test."""
    from DeezyMatch import inference as dm_inference

    # Vectorize the queries listed in dataset_path with the fine-tuned model
    vect_kwargs = dict(
        input_file_path="./inputs/input_dfm_pytest_001.yaml",
        dataset_path="./dataset/dataset-string-similarity_test.txt",
        pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
        pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
        inference_mode="vect",
        scenario="queries/test",
    )
    dm_inference(**vect_kwargs)
41+
42+
def test_generate_candidate_vecs():
    """Generate candidate vectors (inference_mode='vect') into candidates/test."""
    from DeezyMatch import inference as dm_inference

    # Vectorize the candidates listed in dataset_path with the fine-tuned model
    vect_kwargs = dict(
        input_file_path="./inputs/input_dfm_pytest_001.yaml",
        dataset_path="./dataset/dataset-string-similarity_test.txt",
        pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
        pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
        inference_mode="vect",
        scenario="candidates/test",
    )
    dm_inference(**vect_kwargs)
53+
54+
def test_assemble_queries():
    """Combine the fwd/bwd query vectors into combined/queries_test."""
    from DeezyMatch import combine_vecs

    # Merge the per-pass RNN vectors saved in queries/test
    combine_vecs(
        rnn_passes=["fwd", "bwd"],
        input_scenario="queries/test",
        output_scenario="combined/queries_test",
        print_every=10,
    )
62+
63+
def test_assemble_candidates():
    """Combine the fwd/bwd candidate vectors into combined/candidates_test."""
    from DeezyMatch import combine_vecs

    # Merge the per-pass RNN vectors saved in candidates/test
    combine_vecs(
        rnn_passes=["fwd", "bwd"],
        input_scenario="candidates/test",
        output_scenario="combined/candidates_test",
        print_every=10,
    )
71+
72+
def test_candidate_ranker():
    """Rank candidates against queries using the faiss (L2-norm) metric."""
    from DeezyMatch import candidate_ranker

    # For each query in query_scenario, pick candidates from candidate_scenario
    # by L2-norm distance (selection_threshold=5. — presumably a distance cap;
    # confirm against the DeezyMatch docs), at most 2 per query, searching 10
    # vectors at a time, and only for the first 5 query rows.
    ranker_kwargs = dict(
        query_scenario="./combined/queries_test",
        candidate_scenario="./combined/candidates_test",
        ranking_metric="faiss",
        selection_threshold=5.,
        num_candidates=2,
        search_size=10,
        output_path="ranker_results/test_candidates_deezymatch",
        pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
        pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
        number_test_rows=5,
    )
    ranked = candidate_ranker(**ranker_kwargs)

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
<a href="https://mybinder.org/v2/gh/Living-with-machines/DeezyMatch/HEAD?filepath=examples">
1818
<img alt="Binder" src="https://mybinder.org/badge_logo.svg">
1919
</a>
<a href="https://github.com/Living-with-machines/DeezyMatch/actions">
  <img alt="Continuous integration badge" src="https://github.com/Living-with-machines/DeezyMatch/workflows/Continuous%20integration/badge.svg">
</a>
2023
<br/>
2124
</p>
2225

inputs/input_dfm_pytest_001.yaml

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
general:
  use_gpu: True # only if available
  # specify CUDA device, these are 0-indexed, e.g.,
  # cuda:0, cuda:1 or others. "cuda" is the default CUDA device
  gpu_device: "cuda"
  # Parent dir to save trained models
  models_dir: "./models"

preprocessing:
  # normalizing text to the ASCII encoding standard
  uni2ascii: True
  lowercase: True
  # removing both the leading and the trailing empty characters
  strip: True
  only_latin_letters: False
  # Strings in the inputs will be prefix + string + suffix
  prefix_suffix: ["|", "|"]
  # Accepted proportion of characters in a string that are not present in our vocabulary, i.e.,
  # String is accepted if:
  # (number of its characters found in the vocabulary)/(total number its characters) >= missing_char_threshold
  missing_char_threshold: 0.5
  # read a list of characters and add to the vocabulary
  read_list_chars: "./dataset/characters_v001.vocab"

# --- RNN/GRU/LSTM architecture/misc info
gru_lstm:
  main_architecture: "gru" # rnn, gru, lstm
  mode: # Tokenization mode
    # choices: "char", "ngram", "word"
    # for example: tokenize: ["char", "ngram", "word"] or ["char", "word"]
    tokenize: ["char"]
    # ONLY if "ngram" is selected in tokenize, the following args will be used:
    min_gram: 2
    max_gram: 3
  bidirectional: True # if True, becomes a bidirectional RNN/GRU/LSTM
  # num_layers
  # number of recurrent layers. e.g., setting num_layers=2 means stacking two
  # RNN/GRU/LSTMs together to form a stacked RNN/GRU/LSTM,
  # with the second RNN/GRU/LSTM taking in outputs of the first RNN/GRU/LSTM and computing the final results.
  num_layers: 2
  # number of dimensions of the first fully connected network
  fc1_out_dim: 120
  # pooling_mode:
  # hstates_layers_simple, hstates_layers, hstates
  # hstates_subtract, hstates_l2_distance, hstates_cosine
  # average, max
  # attention
  pooling_mode: 'hstates_layers_simple'
  # rnn_dropout:
  # if non-zero, introduces a Dropout layer on the outputs of each RNN/LSTM/GRU layer except the last layer,
  # with dropout probability equal to rnn_dropout.
  rnn_dropout: 0.01
  # fully-connected layers dropout depends on the number of fc layers (currently there are two)
  fc_dropout: [0.01, 0.01]
  # attention layer dropout depends on the number of attention layers (currently there are two)
  att_dropout: [0.01, 0.01]
  # Add bias to all learnable parameters
  bias: True

  rnn_hidden_dim: 60
  max_seq_len: 120
  embedding_dim: 60
  output_dim: 2

  learning_rate: 0.001 # 3e-4
  optimizer: adam
  epochs: 3
  batch_size: 32
  # shuffle when creating DataLoader
  dl_shuffle: True
  random_seed: 123
  # Early stopping:
  # Number of epochs with no improvement after which training will be stopped and
  # the model with the least validation loss will be saved
  # If 0 or negative, early stopping will be deactivated
  early_stopping_patience: -1

  # if -1 or 1, perform the validation step in every epoch;
  # if 0, no validation will be done
  # otherwise, specify the interval (integer)
  validation: 1
  # split dataset
  train_proportion: 0.5
  val_proportion: 0.3
  test_proportion: 0.2

  # False or path to a directory to create tensor-board
  #create_tensor_board: "./tb_gru_test"
  create_tensor_board: False

  # Layers to freeze during fine-tuning
  layers_to_freeze: ["emb", "rnn_1", "attn"]

inference:
  # Output predictions and save the results in output_preds_file
  output_preds: True
  # either a path or "default"
  # "default" saves the prediction output inside the model directory
  output_preds_file: "default"
  # change it to true to have Mean Average Precision as an eval metric. Note that this would have an impact on computational time
  eval_map_metric: False

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
torch==1.5.0
22
torchvision==0.6.0
33
ipywidgets==7.5.1
4-
PyYAML==5.3.1
4+
PyYAML==5.4
55
scikit-learn==0.23.1
6-
pandas==1.0.3
6+
pandas==1.0.5
77
faiss-cpu==1.6.3
88
tqdm==4.46.0
99
tensorboard==2.2.2
1010
matplotlib==3.2.1
1111
jupyter-client==6.1.5
12-
jupyter-core==4.6.3
12+
jupyter-core==4.6.3

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setuptools.setup(
44
name="DeezyMatch",
5-
version="1.2.2",
5+
version="1.2.3",
66
description="A Flexible Deep Learning Approach to Fuzzy String Matching",
77
author=u"The LwM Development Team",
88
#author_email="",
@@ -21,9 +21,9 @@
2121
"torch==1.5.0",
2222
"torchvision==0.6.0",
2323
"ipywidgets==7.5.1",
24-
"PyYAML==5.3.1",
24+
"PyYAML==5.4",
2525
"scikit-learn==0.23.1",
26-
"pandas==1.0.3",
26+
"pandas==1.0.5",
2727
"faiss-cpu==1.6.3",
2828
"tqdm==4.46.0",
2929
"tensorboard==2.2.2",

0 commit comments

Comments
 (0)