50 changes: 48 additions & 2 deletions .github/workflows/py_test.yml
@@ -49,7 +49,7 @@
run: |
black --check $(git ls-files '*.py')

python-tests:
similarity-tests:
env:
TEST_FILES: tests/similarity_framework/test_similarity* tests/column2vec/test_column2vec_cache.py tests/runner/test_runner*
name: Run Python Tests
@@ -91,6 +91,52 @@
- uses: actions/upload-artifact@v4
if: github.event_name == 'pull_request'
with:
name: coverage
name: Similarity coverage
path: coverage.xml
retention-days: 1

column2vec-tests:
env:
TEST_FILES: tests/column2vec/test_column2vec_cache.py tests/column2vec/test_column2vec.py
name: Run Column2Vec Tests
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install coverage pytest pytest-xdist

- name: Run tests
run: coverage run --source='column2Vec' -m pytest -v $TEST_FILES

- name: Show coverage
run: coverage report -m --omit=".*.ipynb"

- name: Create coverage file
if: github.event_name == 'pull_request'
run: coverage xml

- name: Get Cover
if: github.event_name == 'pull_request'
uses: orgoro/[email protected]
with:
coverageFile: coverage.xml
token: ${{ secrets.GITHUB_TOKEN }}
thresholdAll: 0.7
thresholdNew: 0.9

- uses: actions/upload-artifact@v4
if: github.event_name == 'pull_request'
with:
name: Column2Vec coverage
path: coverage.xml
retention-days: 1

Check warning (Code scanning / CodeQL): Workflow does not contain permissions (severity: Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}
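A minimal sketch of how such a permissions block could look near the top of .github/workflows/py_test.yml. The warning itself only prescribes contents: read; the pull-requests: write scope below is an assumption, included because the orgoro/coverage step in this workflow posts a coverage comment on the pull request and would otherwise lack write access:

permissions:
  contents: read          # minimal scope suggested by the CodeQL warning
  pull-requests: write    # assumption: lets the coverage comment action post on the PR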
4 changes: 1 addition & 3 deletions .gitignore
@@ -122,6 +122,7 @@ celerybeat.pid
*.sage.py

# Environments
.env
.config
.venv
env/
@@ -165,6 +166,3 @@ cython_debug/
# Custom for this project
fingerprints/
**/.DS_Store

column2vec/research
measurement
8 changes: 2 additions & 6 deletions column2vec/src/functions.py
@@ -9,6 +9,7 @@
import numpy as np
import pandas as pd
import plotly.express as px
from memory_profiler import profile
from sentence_transformers import (
SentenceTransformer,
)
@@ -41,7 +42,6 @@ def get_nonnumerical_data(
metadata1 = metadata_creator.get_metadata(MetadataCreatorInput(dataframe=data))
column_names = metadata1.get_column_names_by_type(NONNUMERICAL)
for name in column_names:
print(f" {i} : {name}")
result[name + str(index)] = data[name]
return result

@@ -147,9 +147,5 @@ def compute_distances(vectors: dict):
:param vectors: Dictionary of embedding vectors
:return: matrix with distances
"""
res = {}
for key1, vec1 in vectors.items():
res[key1] = {}
for key2, vec2 in vectors.items():
res[key1][key2] = 1 - cosine_sim(vec1, vec2)
res = {key1: {key2: 1 - cosine_sim(vec1, vec2) for key2, vec2 in vectors.items()} for key1, vec1 in vectors.items()}
return res
30 changes: 22 additions & 8 deletions tests/column2vec/test_column2vec.py
@@ -1,8 +1,10 @@
import os
import sys
import unittest
import time

import pandas as pd
from memory_profiler import profile
from sentence_transformers import SentenceTransformer

from column2vec.src.column2vec import (column2vec_as_sentence, column2vec_as_sentence_clean,
@@ -19,7 +21,16 @@
# MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens
MODEL = 'sentence-transformers/all-mpnet-base-v2' #
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
TRANSFORMER = SentenceTransformer(MODEL)

def get_size(foo):
num = sys.getsizeof(foo)
suffix="B"
for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
if abs(num) < 1024.0:
return f"{num:3.1f}{unit}{suffix}"
num /= 1024.0
return f"{num:.1f}Yi{suffix}"

def vectors_are_same(vec1, vec2):
for i, j in zip(vec1, vec2):
@@ -61,7 +72,7 @@ def get_data(files):
class TestSimilarityOfVectors(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = SentenceTransformer(MODEL)
cls.model = TRANSFORMER
file_m2 = os.path.join(THIS_DIR, os.pardir, 'data/netflix_titles.csv')
# make an array of all the files
files = [file_m2]
@@ -77,7 +88,7 @@ def setUpClass(cls):
stop += 1

def test_column2vec_as_sentence(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(
vectors_are_same(column2vec_as_sentence(self.first, model, "a"),
column2vec_as_sentence(self.first, self.model, "b")))
@@ -88,7 +99,7 @@ def test_column2vec_as_sentence(self):
column2vec_as_sentence(self.third, self.model, "f")))

def test_column2vec_as_sentence_clean(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.first, model, "g"),
column2vec_as_sentence_clean(self.first, self.model, "h")))
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean(self.second, model, "i"),
@@ -97,7 +108,7 @@ def test_column2vec_as_sentence_clean(self):
column2vec_as_sentence_clean(self.third, self.model, "l")))

def test_column2vec_as_sentence_clean_uniq(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.first, model, "m"),
column2vec_as_sentence_clean_uniq(self.first, self.model, "n")))
self.assertTrue(vectors_are_same(column2vec_as_sentence_clean_uniq(self.second, model, "o"),
@@ -106,14 +117,14 @@ def test_column2vec_as_sentence_clean_uniq(self):
column2vec_as_sentence_clean_uniq(self.third, self.model, "r")))

def test_column2vec_avg(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_avg(self.first, model, "v"),
column2vec_avg(self.first, self.model, "s")))
# self.assertTrue(vectors_are_same(column2vec_avg(self.second, model), column2vec_avg(self.second, self.model)))
# self.assertTrue(vectors_are_same(column2vec_avg(self.third, model), column2vec_avg(self.third, self.model)))

def test_column2vec_weighted_avg(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.first, model, "u"),
column2vec_weighted_avg(self.first, self.model, "w")))
# self.assertTrue(vectors_are_same(column2vec_weighted_avg(self.second, model),
@@ -122,12 +133,12 @@ def test_column2vec_weighted_avg(self):
# column2vec_weighted_avg(self.third, self.model)))

def test_column2vec_sum(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_sum(self.first, model, "x"),
column2vec_sum(self.first, self.model, "y")))

def test_column2vec_weighted_sum(self):
model = SentenceTransformer(MODEL)
model = TRANSFORMER
self.assertTrue(vectors_are_same(column2vec_weighted_sum(self.first, model, "z"),
column2vec_weighted_sum(self.first, self.model, "ab")))

@@ -269,6 +280,9 @@ def test_column2vec_as_sentence_clean_uniq(self):

@unittest.skipIf(SKIP_SIMILAR, "Skipping test_column2vec_avg ...")
def test_column2vec_avg(self):
# slow one 1:30
vectors_sentence = get_vectors(column2vec_avg, self.data)
distances = compute_distances(vectors_sentence)
self.print_accuracy(distances, "AVG")
27 changes: 2 additions & 25 deletions tests/column2vec/test_column2vec_cache.py
@@ -31,8 +31,7 @@ def setUpClass(cls):
for i in cls.data:
cls.first = cls.data[i].head(100)
break
cache.set_file("generated/test.csv")
cls.model = SentenceTransformer(MODEL)
cache.set_file("cache_test2.csv")

def setUp(self):
cache.clear_cache()
@@ -47,7 +46,6 @@ def test_column2vec_as_sentence(self):
cache.off()
third = time_measure_function(column2vec_as_sentence, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -60,7 +58,6 @@ def test_column2vec_as_sentence_clean(self):
cache.off()
third = time_measure_function(column2vec_as_sentence_clean, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -72,7 +69,6 @@ def test_column2vec_as_sentence_clean_uniq(self):
cache.off()
third = time_measure_function(column2vec_as_sentence_clean_uniq, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -84,7 +80,6 @@ def test_column2vec_avg(self):
cache.off()
third = time_measure_function(column2vec_avg, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -96,8 +91,6 @@ def test_column2vec_weighted_avg(self):
cache.off()
third = time_measure_function(column2vec_weighted_avg, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -109,8 +102,6 @@ def test_column2vec_sum(self):
cache.off()
third = time_measure_function(column2vec_sum, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -123,8 +114,6 @@ def test_column2vec_weighted_sum(self):
cache.off()
third = time_measure_function(column2vec_weighted_sum, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -140,13 +129,12 @@ def setUpClass(cls):
cls.data = get_nonnumerical_data(files)
skip = True
for i in cls.data:
if skip:
if skip:  # skip the first column so this test uses a different column than the previous test
skip = False
continue
cls.first = cls.data[i].head(100)
break
cache.set_file("cache_test.csv")
cls.model = SentenceTransformer(MODEL)

def setUp(self):
cache.clear_cache()
@@ -163,7 +151,6 @@ def test_column2vec_as_sentence(self):
cache.off()
third = time_measure_function(column2vec_as_sentence, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -178,7 +165,6 @@ def test_column2vec_as_sentence_clean(self):
cache.off()
third = time_measure_function(column2vec_as_sentence_clean, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -192,7 +178,6 @@ def test_column2vec_as_sentence_clean_uniq(self):
cache.off()
third = time_measure_function(column2vec_as_sentence_clean_uniq, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -206,7 +191,6 @@ def test_column2vec_avg(self):
cache.off()
third = time_measure_function(column2vec_avg, self.first, self.model, "a")

print(f"{first} : {second} : {third}")
self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -220,7 +204,6 @@ def test_column2vec_weighted_avg(self):
cache.off()
third = time_measure_function(column2vec_weighted_avg, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)
@@ -230,13 +213,9 @@ def test_column2vec_sum(self):
first = time_measure_function(column2vec_sum, self.first, self.model, "a")
cache.save_persistently()
cache.clear_cache()

second = time_measure_function(column2vec_sum, self.first, self.model, "a")
cache.off()
third = time_measure_function(column2vec_sum, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)

@@ -251,8 +230,6 @@ def test_column2vec_weighted_sum(self):
cache.off()
third = time_measure_function(column2vec_weighted_sum, self.first, self.model, "a")

print(f"{first} : {second} : {third}")

self.assertGreater(first, second)
self.assertGreater(third, second)
