Skip to content

Commit b6a09dd

Browse files
committed
Allow OpenSearch embedding field name to be configurable
To support multiple embedding providers (so that we can evaluate different ones), we need to store each provider's embeddings in a separate field in OpenSearch. This means specifying the embedding provider in use and deriving the field name from it. Currently only OpenAI is supported, with a view to extending this soon.
1 parent 31f65e9 commit b6a09dd

4 files changed

Lines changed: 36 additions & 7 deletions

File tree

lib/search/chunked_content_repository.rb

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,21 @@ def id_digest_hash(base_path, batch_size: 100)
141141
items
142142
end
143143

144-
def search_by_embedding(embedding, max_chunks:)
144+
def search_by_embedding(embedding, max_chunks:, llm_provider:)
145+
field_name = case llm_provider.to_sym
146+
when :openai
147+
:openai_embedding
148+
else
149+
raise "Unknown provider: #{llm_provider}"
150+
end
151+
145152
response = client.search(
146153
index:,
147154
body: {
148155
size: max_chunks,
149156
query: {
150157
knn: {
151-
openai_embedding: {
158+
"#{field_name}": {
152159
vector: embedding,
153160
k: max_chunks,
154161
},

lib/search/results_for_question.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@ def self.call(question_message)
1515
metrics[:embedding_duration] = Clock.monotonic_time - embedding_start_time
1616

1717
search_start_time = Clock.monotonic_time
18-
results = ChunkedContentRepository.new.search_by_embedding(embedding, max_chunks:)
18+
results = ChunkedContentRepository.new.search_by_embedding(
19+
embedding,
20+
max_chunks:,
21+
llm_provider: provider,
22+
)
1923
metrics[:search_duration] = Clock.monotonic_time - search_start_time
2024
metrics[:embedding_provider] = provider
2125

spec/lib/search/chunked_content_repository_spec.rb

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,11 @@
220220
end
221221

222222
it "returns an array of Result objects" do
223-
result = repository.search_by_embedding(openai_embedding, max_chunks: 10)
223+
result = repository.search_by_embedding(
224+
openai_embedding,
225+
max_chunks: 10,
226+
llm_provider: :openai,
227+
)
224228
expected_attributes = chunked_content_records.first
225229
.except(:openai_embedding)
226230
.merge(score: a_value_between(0.9, 1))
@@ -229,12 +233,26 @@
229233
expect(result.first).to have_attributes(**expected_attributes)
230234
end
231235

232-
context "when there are more then the maxiumum chunks" do
236+
it "raises an error if the llm provider is not recognised" do
237+
expect {
238+
repository.search_by_embedding(
239+
openai_embedding,
240+
max_chunks: 10,
241+
llm_provider: :unknown,
242+
)
243+
}.to raise_error("Unknown provider: unknown")
244+
end
245+
246+
context "when there are more than the maxiumum chunks" do
233247
let(:max_chunks) { 10 }
234248
let(:chunked_content_records) { build_list(:chunked_content_record, 11, openai_embedding:) }
235249

236250
it "only returns the first max_chunks" do
237-
result = repository.search_by_embedding(openai_embedding, max_chunks:)
251+
result = repository.search_by_embedding(
252+
openai_embedding,
253+
max_chunks:,
254+
llm_provider: :openai,
255+
)
238256
expect(result.count).to eq max_chunks
239257
end
240258
end

spec/requests/admin/search_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
it "includes score calculation and back links in links to results" do
6868
get admin_search_path, params: { search_text: }
6969

70-
results = Search::ChunkedContentRepository.new.search_by_embedding(openai_embedding, max_chunks: 2)
70+
results = Search::ChunkedContentRepository.new.search_by_embedding(openai_embedding, max_chunks: 2, llm_provider: :openai)
7171
result = results.detect { |r| r.digest == chunk_to_find[:digest] }
7272
document_type_weight = Search::ResultsForQuestion::Reranker::DOCUMENT_TYPE_WEIGHTINGS[chunk_to_find[:document_type]]
7373
weighted_score = result.score * document_type_weight

0 commit comments

Comments
 (0)