@@ -4,9 +4,11 @@ import { Tables } from "./supabase.types.generated";
44import { supabase } from "./supabase.server" ;
55import { slog } from "../modules/observability.server" ;
66import { authorIdSchema } from "./author.shape" ;
7- import { uniqBy } from "es-toolkit" ;
7+ import { groupBy , uniqBy } from "es-toolkit" ;
88import TTLCache from "@isaacs/ttlcache" ;
99import { format , hoursToMilliseconds } from "date-fns" ;
10+ import { embed } from "ai" ;
11+ import { google } from "@ai-sdk/google" ;
1012
1113type Package = Tables < "cran_packages" > ;
1214
@@ -164,37 +166,129 @@ export class PackageService {
164166 ) {
165167 const { limit = 20 } = options || { } ;
166168
167- const [ fts , exact ] = await Promise . all ( [
168- supabase . rpc ( "find_closest_packages" , {
169- search_term : query ,
170- result_limit : limit ,
171- } ) ,
172- // ! ilike is expensive, but we want to make sure we get the exact match w/o case sensitivity.
173- supabase
174- . from ( "cran_packages" )
175- . select ( "id,name" )
176- . ilike ( "name" , query )
177- . maybeSingle ( ) ,
178- ] ) ;
179-
180- if ( fts . error ) {
181- slog . error ( "Error in searchPackages" , fts . error ) ;
182- return [ ] ;
169+ const isSimilaritySearchEnabled = query . length >= 3 ;
170+
171+ const [ packageFTS , packageExact , embeddingSimilarity , embeddingFTS ] =
172+ await Promise . all ( [
173+ supabase . rpc ( "find_closest_packages" , {
174+ search_term : query ,
175+ result_limit : limit ,
176+ } ) ,
177+ // ! ilike is expensive, but we want to make sure we get the exact match w/o case sensitivity.
178+ supabase
179+ . from ( "cran_packages" )
180+ . select ( "id,name,synopsis" )
181+ . ilike ( "name" , query )
182+ . maybeSingle ( ) ,
183+ isSimilaritySearchEnabled
184+ ? supabase . rpc ( "match_package_embeddings" , {
185+ query_embedding : await embed ( {
186+ value : query ,
187+ model : google . textEmbeddingModel ( "text-embedding-004" ) ,
188+ } ) . then ( ( res ) => res . embedding as unknown as string ) ,
189+ match_threshold : 0.4 ,
190+ match_count : limit ,
191+ } )
192+ : null ,
193+ isSimilaritySearchEnabled
194+ ? supabase . rpc ( "find_closest_package_embeddings" , {
195+ search_term : query ,
196+ result_limit : limit ,
197+ } )
198+ : null ,
199+ ] ) ;
200+
201+ if ( packageFTS . error ) {
202+ slog . error ( "Error in searchPackages" , packageFTS . error ) ;
203+ throw packageFTS . error ;
183204 }
184205
185- if ( exact . error ) {
186- slog . error ( "Error in searchPackages" , exact . error ) ;
187- return [ ] ;
206+ if ( packageExact . error ) {
207+ slog . error ( "Error in searchPackages" , packageExact . error ) ;
208+ throw packageExact . error ;
188209 }
189210
190- if ( exact . data ) {
191- fts . data . unshift ( {
192- ...exact . data ,
193- levenshtein_distance : 0 ,
211+ if ( packageExact . data ) {
212+ packageFTS . data . unshift ( {
213+ ...packageExact . data ,
214+ levenshtein_distance : 0.4 ,
194215 } ) ;
195216 }
196217
197- return uniqBy ( fts . data , ( item ) => item . id ) ;
218+ if ( embeddingSimilarity ) {
219+ if ( embeddingSimilarity . error ) {
220+ slog . error ( "Error in searchPackages" , embeddingSimilarity . error ) ;
221+ }
222+ }
223+
224+ // Prefer the exact match over the similarity match.
225+ // Therefore we filter out the similarity match if it's the same as the exact match.
226+ const sources = [
227+ ...( embeddingFTS ?. data || [ ] ) ,
228+ ...( embeddingSimilarity ?. data || [ ] ) ,
229+ ] . filter ( ( item ) => {
230+ const hasExactMatch =
231+ packageExact . data && packageExact . data . id === item . cran_package_id ;
232+ if ( hasExactMatch ) {
233+ return false ;
234+ }
235+ return true ;
236+ } ) ;
237+
238+ const lexical = uniqBy ( packageFTS . data , ( item ) => item . id )
239+ . filter ( ( item ) => {
240+ return ! sources . some ( ( s ) => s . cran_package_id === item . id ) ;
241+ } )
242+ . map ( ( item ) => ( {
243+ name : item . name ,
244+ synopsis : item . synopsis ,
245+ } ) ) ;
246+
247+ // Group sources by package id, then by source name, so that multiple hits per source & package
248+ // can be grouped together. `Object.entries` is used to convert the grouped objects back to arrays.
249+ const sourcesByPackage = groupBy ( sources , ( item ) => item . cran_package_id ) ;
250+ const groupedSourcesByPackageIds = Object . entries ( sourcesByPackage ) . map (
251+ ( [ packageId , sources ] ) => ( {
252+ packageId,
253+ sources : groupBy ( sources , ( item ) => item . source_name ) ,
254+ } ) ,
255+ ) ;
256+
257+ // Fetch the package name for each package id.
258+ // This is not done inside the RPC call as we could
259+ // potentially have different package families (CRAN, Bioconductor, etc.).
260+ const groupedSourcesByPackage = await Promise . all (
261+ groupedSourcesByPackageIds . map ( async ( item ) => {
262+ const { data, error } = await supabase
263+ . from ( "cran_packages" )
264+ . select ( "name,synopsis" )
265+ . eq ( "id" , item . packageId )
266+ . maybeSingle ( ) ;
267+
268+ if ( error || ! data ) {
269+ slog . error ( "Error in searchPackages" , error ) ;
270+ return null ;
271+ }
272+
273+ return {
274+ name : data . name ,
275+ synopsis : data . synopsis ,
276+ sources : Object . entries ( item . sources ) ,
277+ } ;
278+ } ) ,
279+ ) ;
280+
281+ const isSemanticPreferred =
282+ ! packageExact . data && isSimilaritySearchEnabled && sources . length > 0 ;
283+
284+ return {
285+ lexical,
286+ semantic : groupedSourcesByPackage ,
287+ combined : isSemanticPreferred
288+ ? [ ...groupedSourcesByPackage , ...lexical ]
289+ : [ ...lexical , ...groupedSourcesByPackage ] ,
290+ isSemanticPreferred,
291+ } ;
198292 }
199293
200294 private static sanitizeSitemapName ( name : string ) {
0 commit comments