Skip to content

Commit b1536ed

Browse files
moilerat, Victor Reutenauer, and grololo06
authored
first and maybe sufficiently minimal usage of pgvector to store deepfeature (#85)
* first and sufficiently minimal usage of pgvector to store deepfeature --------- Co-authored-by: Victor Reutenauer <victor@fotonower.com> Co-authored-by: grololo06 <laurent.salinas@laposte.net>
1 parent a8969ea commit b1536ed

File tree

13 files changed

+130
-41
lines changed

13 files changed

+130
-41
lines changed

.github/workflows/auto_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
# Label used to access the service container
2929
postgres:
3030
# Docker Hub image
31-
image: pgvector/pgvector:pg14
31+
image: pgvector/pgvector:0.7.4-pg14
3232
# Provide the password for postgres
3333
env:
3434
POSTGRES_PASSWORD: postgres12

QA/py/pg_files/schem_prod.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ CREATE EXTENSION IF NOT EXISTS tsm_system_time WITH SCHEMA public;
4444

4545
COMMENT ON EXTENSION tsm_system_time IS 'TABLESAMPLE method which accepts time in milliseconds as a limit';
4646

47+
--
48+
-- Name: vector; Type: EXTENSION; Schema: -; Owner: -
49+
--
50+
51+
-- doesn't work and should be there in ankane
52+
-- CREATE EXTENSION IF NOT EXISTS vector;
53+
4754

4855
SET default_tablespace = '';
4956

QA/py/pg_files/upgrade_prod.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2219,6 +2219,31 @@ UPDATE alembic_version SET version_num='4e25988b1e56' WHERE alembic_version.vers
22192219
ALTER TABLE users ADD COLUMN orcid VARCHAR(20) DEFAULT NULL;
22202220
UPDATE alembic_version SET version_num='0a3132f436fb' WHERE alembic_version.version_num = '4e25988b1e56';
22212221

2222+
-- Running upgrade 0a3132f436fb -> a9dd3c62b7b0
2223+
2224+
CREATE TABLE obj_cnn_features_vector (
2225+
objcnnid BIGINT NOT NULL,
2226+
features VECTOR(50),
2227+
PRIMARY KEY (objcnnid),
2228+
FOREIGN KEY(objcnnid) REFERENCES obj_head (objid) ON DELETE CASCADE
2229+
);
2230+
2231+
INSERT INTO obj_cnn_features_vector (objcnnid, features)
2232+
SELECT objcnnid, ARRAY[cnn01, cnn02, cnn03, cnn04, cnn05, cnn06, cnn07, cnn08, cnn09, cnn10,
2233+
cnn11, cnn12, cnn13, cnn14, cnn15, cnn16, cnn17, cnn18, cnn19, cnn20,
2234+
cnn21, cnn22, cnn23, cnn24, cnn25, cnn26, cnn27, cnn28, cnn29, cnn30,
2235+
cnn31, cnn32, cnn33, cnn34, cnn35, cnn36, cnn37, cnn38, cnn39, cnn40,
2236+
cnn41, cnn42, cnn43, cnn44, cnn45, cnn46, cnn47, cnn48, cnn49, cnn50]::vector
2237+
FROM obj_cnn_features;
2238+
2239+
GRANT SELECT ON obj_cnn_features_vector TO readerole;
2240+
2241+
DROP TABLE obj_cnn_features;
2242+
2243+
UPDATE alembic_version SET version_num='a9dd3c62b7b0' WHERE alembic_version.version_num = '0a3132f436fb';
2244+
2245+
COMMIT;
2246+
22222247
------- Leave on tail
22232248

22242249
ALTER TABLE alembic_version REPLICA IDENTITY FULL;

QA/py/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,5 @@ stringcase==1.2.0
8585
# For python API client generated classes
8686
# Fails to build on GH due to obscure compilation issue. Nevermind for the moment.
8787
#backports-datetime-fromisoformat==1.0.0
88+
# pgvector for similarity search
89+
pgvector==0.2.4

py/API_operations/Subset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from BO.TSVFile import TSVFile
1616
from BO.helpers.ImportHelpers import ImportHow
1717
from DB.Acquisition import Acquisition
18-
from DB.CNNFeature import ObjectCNNFeature
18+
from DB.CNNFeatureVector import ObjectCNNFeatureVector
1919
from DB.Image import Image
2020
from DB.Object import ObjectHeader, ObjectFields, ObjectsClassifHisto
2121
from DB.Process import Process
@@ -35,7 +35,7 @@
3535
# Useful typings
3636
# TODO: Put somewhere else if reused in other classes
3737
DBObjectTupleT = Tuple[
38-
ObjectHeader, ObjectFields, ObjectCNNFeature, Image, Sample, Acquisition, Process
38+
ObjectHeader, ObjectFields, ObjectCNNFeatureVector, Image, Sample, Acquisition, Process
3939
]
4040
DBObjectTupleListT = List[DBObjectTupleT]
4141

@@ -164,15 +164,15 @@ def _db_fetch(self, object_ids: ObjectIDListT) -> Iterable[DBObjectTupleT]:
164164
)
165165
ret = (
166166
ret.outerjoin(Image, ObjectHeader.all_images)
167-
.outerjoin(ObjectCNNFeature)
167+
.outerjoin(ObjectCNNFeatureVector)
168168
.join(ObjectFields)
169169
)
170170
ret = ret.filter(ObjectHeader.objid == any_(object_ids))
171171
ret = ret.order_by(ObjectHeader.objid, Image.imgid)
172172
ret = ret.with_entities(
173173
ObjectHeader,
174174
ObjectFields,
175-
ObjectCNNFeature,
175+
ObjectCNNFeatureVector,
176176
Image,
177177
Sample,
178178
Acquisition,

py/BO/Prediction.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# This file is part of Ecotaxa, see license.md in the application root directory for license informations.
3-
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
3+
# Copyright (C) 2015-2024 Picheral, Colin, Irisson (UPMC-CNRS), Amblard (LOVNOWER)
44
#
55

66
#
@@ -13,7 +13,7 @@
1313
from numpy import ndarray
1414

1515
from DB.Acquisition import Acquisition
16-
from DB.CNNFeature import DEEP_FEATURES, ObjectCNNFeaturesBean, ObjectCNNFeature
16+
from DB.CNNFeatureVector import N_DEEP_FEATURES, ObjectCNNFeaturesVectorBean, ObjectCNNFeatureVector
1717
from DB.Image import Image
1818
from DB.Object import ObjectHeader, ObjectIDT
1919
from DB.Project import ProjectIDT
@@ -33,7 +33,7 @@ class DeepFeatures(object):
3333
3434
OTOH, it can also _generate_ features, using another class of machine learning algorithm: CNN
3535
@see https://en.wikipedia.org/wiki/Convolutional_neural_network
36-
These other features are stored in a dedicated DB table @see ObjectCNNFeature.
36+
These other features are stored in a dedicated DB table @see ObjectCNNFeatureVector.
3737
"""
3838

3939
SAVE_EVERY: ClassVar = 500
@@ -53,8 +53,8 @@ def delete_all(session: Session, proj_id: ProjectIDT) -> int:
5353
Sample.sampleid == Acquisition.acq_sample_id, Sample.projid == proj_id
5454
),
5555
)
56-
qry = session.query(ObjectCNNFeature)
57-
qry = qry.filter(ObjectCNNFeature.objcnnid.in_(sub_qry))
56+
qry = session.query(ObjectCNNFeatureVector)
57+
qry = qry.filter(ObjectCNNFeatureVector.objcnnid.in_(sub_qry))
5858
nb_deleted = qry.delete(synchronize_session=False)
5959
return nb_deleted
6060

@@ -72,9 +72,9 @@ def find_missing(session: Session, proj_id: ProjectIDT) -> Dict[ObjectIDT, str]:
7272
),
7373
)
7474
qry = qry.outerjoin(Image) # For detecting missing images
75-
qry = qry.outerjoin(ObjectCNNFeature) # For detecting missing features
75+
qry = qry.outerjoin((ObjectCNNFeatureVector)) # For detecting missing features
7676
# noinspection PyComparisonWithNone
77-
qry = qry.filter(ObjectCNNFeature.objcnnid == None) # SQLAlchemy
77+
qry = qry.filter(ObjectCNNFeatureVector.objcnnid == None) # SQLAlchemy
7878
qry = qry.order_by(ObjectHeader.objid, Image.imgrank)
7979
ret = {}
8080
for a_res in session.execute(qry):
@@ -97,7 +97,7 @@ def save(cls, session: Session, features: Any) -> int:
9797
# for a_rec in features.to_records(index=True): # This is nice and can produce tuple()
9898
# but I found no way to feed them into DBWriter without going low-level.
9999
for obj_id, row in features.iterrows():
100-
bean = ObjectCNNFeaturesBean(obj_id, row)
100+
bean = ObjectCNNFeaturesVectorBean(obj_id, row)
101101
writer.add_cnn_features_with_pk(bean)
102102
nb_rows += 1
103103
if nb_rows % cls.SAVE_EVERY == 0:
@@ -112,10 +112,10 @@ def read_for_objects(
112112
"""
113113
Read CNN lines AKA features, in order, for given object_ids
114114
"""
115-
fk_to_objid = ObjectCNNFeature.objcnnid.name
115+
fk_to_objid = ObjectCNNFeatureVector.objcnnid.name
116116
sql = "WITH ordr (seq, objid) AS (select * from UNNEST(:seq, :oids)) "
117-
sql += "SELECT " + ",".join(DEEP_FEATURES)
118-
sql += " FROM " + ObjectCNNFeature.__tablename__
117+
sql += "SELECT " + " features "
118+
sql += " FROM " + ObjectCNNFeatureVector.__tablename__
119119
sql += " JOIN ordr ON " + fk_to_objid + " = ordr.objid "
120120
sql += " ORDER BY ordr.seq "
121121
params = {"seq": list(range(len(oid_lst))), "oids": oid_lst}
@@ -128,12 +128,13 @@ def np_read_for_objects(cls, session: Session, oid_lst: List[int]) -> ndarray:
128128
Read CNN lines AKA features, in order, for given object_ids, into a NumPy array
129129
"""
130130
res = cls.read_for_objects(session, oid_lst)
131-
ret = np.ndarray(shape=(len(oid_lst), len(res.keys())), dtype=np.float32)
131+
ret = np.ndarray(shape=(len(oid_lst), N_DEEP_FEATURES), dtype=np.float32)
132132
ndx = 0
133133
for a_row in res:
134-
ret[ndx] = a_row
134+
all_feats = a_row["features"].strip("[]").split(",") if type(a_row["features"]) == str else a_row["features"]
135+
ret[ndx] = [float(x) for x in all_feats]
135136
ndx += 1
136137
assert ndx == len(
137138
oid_lst
138-
), "No enough CNN features in DB: expected %d read %d" % (len(oid_lst), ndx)
139+
), "Not enough CNN features in DB: expected %d read %d" % (len(oid_lst), ndx)
139140
return ret
Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,36 @@
11
# -*- coding: utf-8 -*-
22
# This file is part of Ecotaxa, see license.md in the application root directory for license informations.
3-
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
3+
# Copyright (C) 2022-2024 LOVNOWER : Amblard, Colin, Irisson, Reutenauer (UPMC-CNRS-FOTONOWER)
44
#
55
from typing import List
6+
from pgvector.sqlalchemy import Vector
67

78
from .Object import ObjectIDT
89
from .helpers.Bean import Bean
910
from .helpers.DDL import ForeignKey
1011
from .helpers.ORM import Column, relationship, Model
1112
from .helpers.Postgres import BIGINT, REAL
1213

14+
N_DEEP_FEATURES = 50
1315

14-
class ObjectCNNFeature(Model):
15-
__tablename__ = "obj_cnn_features"
16+
17+
class ObjectCNNFeatureVector(Model):
18+
__tablename__ = "obj_cnn_features_vector"
1619
objcnnid: int = Column(
1720
BIGINT, ForeignKey("obj_head.objid", ondelete="CASCADE"), primary_key=True
1821
)
22+
features: Vector = Column(Vector(N_DEEP_FEATURES))
1923
# The relationships are created in Relations.py but the typing here helps the IDE
2024
object: relationship
2125

2226

23-
# The features in _each_ row
24-
DEEP_FEATURES = ["cnn%02d" % i for i in range(1, 51)]
25-
26-
for a_feat in DEEP_FEATURES:
27-
setattr(ObjectCNNFeature, a_feat, Column(REAL))
28-
29-
30-
class ObjectCNNFeaturesBean(Bean):
27+
class ObjectCNNFeaturesVectorBean(Bean):
3128
"""
3229
A bean for feeding DBWriter.
3330
"""
3431

3532
def __init__(self, obj_id: ObjectIDT, features: List[float]):
36-
super().__init__(zip(DEEP_FEATURES, features))
37-
self["objcnnid"] = obj_id
33+
super().__init__({
34+
"objcnnid": obj_id,
35+
"features": features,
36+
})

py/DB/Relations.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# Trick to prevent accidental re-export of the DB Models involved
1010
# Note: The trick doesn't work :(
1111
from .Acquisition import Acquisition
12-
from .CNNFeature import ObjectCNNFeature
12+
from .CNNFeatureVector import ObjectCNNFeatureVector
1313
from .Collection import (
1414
Collection,
1515
CollectionProject,
@@ -126,13 +126,13 @@
126126
uselist=False,
127127
)
128128

129-
ObjectCNNFeature.object = relationship(
129+
ObjectCNNFeatureVector.object = relationship(
130130
ObjectHeader,
131131
foreign_keys="ObjectHeader.objid",
132-
primaryjoin="ObjectCNNFeature.objcnnid==ObjectHeader.objid",
132+
primaryjoin="ObjectCNNFeatureVector.objcnnid==ObjectHeader.objid",
133133
uselist=False,
134134
)
135-
ObjectHeader.cnn_features = relationship(ObjectCNNFeature, uselist=False)
135+
ObjectHeader.cnn_features = relationship(ObjectCNNFeatureVector, uselist=False)
136136

137137
ObjectHeader.all_images = relationship(Image)
138138

py/DB/helpers/DBWriter.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
# This file is part of Ecotaxa, see license.md in the application root directory for license informations.
33
# Copyright (C) 2015-2020 Picheral, Colin, Irisson (UPMC-CNRS)
44
#
5-
from typing import List, Optional, ClassVar
5+
from typing import Dict, Tuple, List, Type, Optional, ClassVar
66

77
from helpers.DynamicLogs import get_logger
88
from .Bean import Bean
99
from .Direct import text
10-
from .ORM import Session, MetaData, minimal_table_of
10+
from .ORM import Session, Table, MetaData, minimal_table_of
1111
from .Postgres import SequenceCache
12-
from ..CNNFeature import ObjectCNNFeature
12+
from ..CNNFeatureVector import ObjectCNNFeatureVector
1313
from ..Image import Image
1414
from ..Object import ObjectHeader, ObjectFields, ObjectsClassifHisto
1515

@@ -35,7 +35,7 @@ def __init__(self, session: Session):
3535
self.obj_tbl = ObjectHeader.__table__
3636
self.obj_fields_tbl = ObjectFields.__table__ # Slow by default @see narrow_to
3737
self.img_tbl = Image.__table__
38-
self.obj_cnn_tbl = ObjectCNNFeature.__table__
38+
self.obj_cnn_vector_tbl = ObjectCNNFeatureVector.__table__
3939
self.obj_history_tbl = ObjectsClassifHisto.__table__
4040
# Data
4141
self.obj_bulks: List[Bean] = []
@@ -70,7 +70,7 @@ def do_bulk_save(self) -> None:
7070
inserts = [
7171
self.obj_tbl.insert(),
7272
self.obj_fields_tbl.insert(),
73-
self.obj_cnn_tbl.insert(),
73+
self.obj_cnn_vector_tbl.insert(),
7474
self.img_tbl.insert(),
7575
self.obj_history_tbl.insert(),
7676
]
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""similarity search
2+
3+
Revision ID: a9dd3c62b7b0
4+
Revises: 0a3132f436fb
5+
Create Date: 2024-02-19 16:53:45.397975
6+
7+
"""
8+
9+
# revision identifiers, used by Alembic.
10+
revision = "a9dd3c62b7b0"
11+
down_revision = "0a3132f436fb"
12+
13+
import sqlalchemy as sa
14+
from alembic import op
15+
from pgvector.sqlalchemy import Vector # type:ignore
16+
17+
18+
def upgrade():
19+
# ### commands auto generated by Alembic - please adjust! ###
20+
op.create_table(
21+
"obj_cnn_features_vector",
22+
sa.Column("objcnnid", sa.BIGINT(), nullable=False),
23+
sa.Column("features", Vector(dim=50), nullable=True),
24+
sa.ForeignKeyConstraint(["objcnnid"], ["obj_head.objid"], ondelete="CASCADE"),
25+
sa.PrimaryKeyConstraint("objcnnid"),
26+
)
27+
op.execute(
28+
"""
29+
INSERT INTO obj_cnn_features_vector (objcnnid, features)
30+
SELECT objcnnid, ARRAY[cnn01, cnn02, cnn03, cnn04, cnn05, cnn06, cnn07, cnn08, cnn09, cnn10,
31+
cnn11, cnn12, cnn13, cnn14, cnn15, cnn16, cnn17, cnn18, cnn19, cnn20,
32+
cnn21, cnn22, cnn23, cnn24, cnn25, cnn26, cnn27, cnn28, cnn29, cnn30,
33+
cnn31, cnn32, cnn33, cnn34, cnn35, cnn36, cnn37, cnn38, cnn39, cnn40,
34+
cnn41, cnn42, cnn43, cnn44, cnn45, cnn46, cnn47, cnn48, cnn49, cnn50]::vector
35+
FROM obj_cnn_features
36+
"""
37+
)
38+
op.execute(
39+
"""
40+
GRANT SELECT ON obj_cnn_features_vector TO readerole
41+
"""
42+
)
43+
op.drop_table("obj_cnn_features")
44+
# ### end Alembic commands ###
45+
46+
47+
def downgrade():
48+
# ### commands auto generated by Alembic - please adjust! ###
49+
op.drop_table("obj_cnn_features_vector")
50+
# ### end Alembic commands ###

0 commit comments

Comments
 (0)