
Commit faaf552

optimize sequence number calculation and reduce search requests in doc level monitor execution (opensearch-project#1445)
* optimize sequence number calculation and reduce search requests by n, where n is the number of shards being queried in the execution
  Signed-off-by: Surya Sashank Nistala <[email protected]>
* fix tests
  Signed-off-by: Surya Sashank Nistala <[email protected]>
* optimize check indices and execute to query only the write index of aliases and datastreams during monitor creation
  Signed-off-by: Surya Sashank Nistala <[email protected]>
* fix test
  Signed-off-by: Surya Sashank Nistala <[email protected]>
* add javadoc
  Signed-off-by: Surya Sashank Nistala <[email protected]>
* add tests to verify seq_no calculation
  Signed-off-by: Surya Sashank Nistala <[email protected]>

---------

Signed-off-by: Surya Sashank Nistala <[email protected]>
1 parent d93c163 commit faaf552
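The gist of the optimization: rather than issuing one extra search per shard just to learn the shard's max sequence number, the regular shard search is now sorted by `_seq_no` descending, so the first hit of the first page already carries the shard's max sequence number and later pages walk progressively older documents. A minimal Kotlin sketch of that pattern follows; it is not the plugin's code, and `fetchShardPage`, its parameters, and the blocking client call are illustrative assumptions.

    import org.opensearch.action.search.SearchRequest
    import org.opensearch.client.Client
    import org.opensearch.index.query.QueryBuilders
    import org.opensearch.index.seqno.SequenceNumbers
    import org.opensearch.search.builder.SearchSourceBuilder
    import org.opensearch.search.sort.SortOrder

    // Hypothetical helper: one search per page, sorted by _seq_no descending, so the first hit
    // of the first page doubles as the shard's max sequence number. No separate
    // "get max seq_no" request per shard is needed.
    fun fetchShardPage(
        client: Client,
        index: String,
        shard: String,
        fromSeqNo: Long, // exclusive lower bound: the checkpoint from the previous run
        toSeqNo: Long,   // inclusive upper bound: Long.MAX_VALUE on the first page
        pageSize: Int
    ): Pair<Long, List<Long>> { // (max seq_no seen on this page, seq_nos of the page)
        val source = SearchSourceBuilder()
            .query(QueryBuilders.rangeQuery("_seq_no").gt(fromSeqNo).lte(toSeqNo))
            .sort("_seq_no", SortOrder.DESC)
            .seqNoAndPrimaryTerm(true)
            .size(pageSize)
        val response = client.search(
            SearchRequest(index).preference("_shards:$shard").source(source)
        ).actionGet()
        val hits = response.hits.hits
        if (hits.isEmpty()) return SequenceNumbers.NO_OPS_PERFORMED to emptyList()
        return hits[0].seqNo to hits.map { it.seqNo }
    }

Because the first page is bounded above by Long.MAX_VALUE, its top hit is the shard's current max sequence number, which can be recorded as the next run's checkpoint.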

File tree

9 files changed, +187 -98 lines changed


alerting/src/main/kotlin/org/opensearch/alerting/AlertingPlugin.kt

+2
@@ -46,6 +46,7 @@ import org.opensearch.alerting.resthandler.RestSearchMonitorAction
 import org.opensearch.alerting.script.TriggerScript
 import org.opensearch.alerting.service.DeleteMonitorService
 import org.opensearch.alerting.settings.AlertingSettings
+import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE
 import org.opensearch.alerting.settings.DestinationSettings
 import org.opensearch.alerting.settings.LegacyOpenDistroAlertingSettings
 import org.opensearch.alerting.settings.LegacyOpenDistroDestinationSettings
@@ -323,6 +324,7 @@ internal class AlertingPlugin : PainlessExtension, ActionPlugin, ScriptPlugin, R
             AlertingSettings.ALERT_HISTORY_RETENTION_PERIOD,
             AlertingSettings.ALERTING_MAX_MONITORS,
             AlertingSettings.PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT,
+            DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
             AlertingSettings.PERCOLATE_QUERY_MAX_NUM_DOCS_IN_MEMORY,
             AlertingSettings.REQUEST_TIMEOUT,
             AlertingSettings.MAX_ACTION_THROTTLE_VALUE,

alerting/src/main/kotlin/org/opensearch/alerting/DocumentLevelMonitorRunner.kt

+79 -73
@@ -17,8 +17,8 @@ import org.opensearch.action.index.IndexRequest
 import org.opensearch.action.search.SearchAction
 import org.opensearch.action.search.SearchRequest
 import org.opensearch.action.search.SearchResponse
-import org.opensearch.alerting.model.DocumentExecutionContext
 import org.opensearch.alerting.model.DocumentLevelTriggerRunResult
+import org.opensearch.alerting.model.IndexExecutionContext
 import org.opensearch.alerting.model.InputRunResults
 import org.opensearch.alerting.model.MonitorMetadata
 import org.opensearch.alerting.model.MonitorRunResult
@@ -30,7 +30,6 @@ import org.opensearch.alerting.util.IndexUtils
 import org.opensearch.alerting.util.defaultToPerExecutionAction
 import org.opensearch.alerting.util.getActionExecutionPolicy
 import org.opensearch.alerting.workflow.WorkflowRunContext
-import org.opensearch.client.Client
 import org.opensearch.client.node.NodeClient
 import org.opensearch.cluster.metadata.IndexMetadata
 import org.opensearch.cluster.routing.Preference
@@ -59,6 +58,8 @@ import org.opensearch.index.IndexNotFoundException
 import org.opensearch.index.query.BoolQueryBuilder
 import org.opensearch.index.query.Operator
 import org.opensearch.index.query.QueryBuilders
+import org.opensearch.index.seqno.SequenceNumbers
+import org.opensearch.indices.IndexClosedException
 import org.opensearch.percolator.PercolateQueryBuilderExt
 import org.opensearch.search.SearchHit
 import org.opensearch.search.SearchHits
@@ -207,7 +208,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
                 }

                 // Prepare updatedLastRunContext for each index
-                val indexUpdatedRunContext = updateLastRunContext(
+                val indexUpdatedRunContext = initializeNewLastRunContext(
                     indexLastRunContext.toMutableMap(),
                     monitorCtx,
                     concreteIndexName,
@@ -255,25 +256,29 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
                         "${fieldsToBeQueried.joinToString()} instead of entire _source of documents"
                     )
                 }
-
-                // Prepare DocumentExecutionContext for each index
-                val docExecutionContext = DocumentExecutionContext(queries, indexLastRunContext, indexUpdatedRunContext)
-
-                fetchShardDataAndMaybeExecutePercolateQueries(
-                    monitor,
-                    monitorCtx,
-                    docExecutionContext,
+                val indexExecutionContext = IndexExecutionContext(
+                    queries,
+                    indexLastRunContext,
+                    indexUpdatedRunContext,
                     updatedIndexName,
                     concreteIndexName,
                     conflictingFields.toList(),
                     matchingDocIdsPerIndex?.get(concreteIndexName),
+                )
+
+                fetchShardDataAndMaybeExecutePercolateQueries(
+                    monitor,
+                    monitorCtx,
+                    indexExecutionContext,
                     monitorMetadata,
                     inputRunResults,
                     docsToQueries,
                     updatedIndexNames,
                     concreteIndicesSeenSoFar,
                     ArrayList(fieldsToBeQueried)
-                )
+                ) { shard, maxSeqNo -> // function passed to update last run context with new max sequence number
+                    indexExecutionContext.updatedLastRunContext[shard] = maxSeqNo
+                }
             }
         }
         /* if all indices are covered still in-memory docs size limit is not breached we would need to submit
@@ -615,7 +620,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
             )
         }

-    private suspend fun updateLastRunContext(
+    private fun initializeNewLastRunContext(
         lastRunContext: Map<String, Any>,
         monitorCtx: MonitorRunnerExecutionContext,
         index: String,
@@ -624,8 +629,7 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         val updatedLastRunContext = lastRunContext.toMutableMap()
         for (i: Int in 0 until count) {
             val shard = i.toString()
-            val maxSeqNo: Long = getMaxSeqNo(monitorCtx.client!!, index, shard)
-            updatedLastRunContext[shard] = maxSeqNo.toString()
+            updatedLastRunContext[shard] = SequenceNumbers.UNASSIGNED_SEQ_NO.toString()
         }
         return updatedLastRunContext
     }
@@ -657,33 +661,6 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         return indexCreationDate > lastExecutionTime.toEpochMilli()
     }

-    /**
-     * Get the current max seq number of the shard. We find it by searching the last document
-     * in the primary shard.
-     */
-    private suspend fun getMaxSeqNo(client: Client, index: String, shard: String): Long {
-        val request: SearchRequest = SearchRequest()
-            .indices(index)
-            .preference("_shards:$shard")
-            .source(
-                SearchSourceBuilder()
-                    .version(true)
-                    .sort("_seq_no", SortOrder.DESC)
-                    .seqNoAndPrimaryTerm(true)
-                    .query(QueryBuilders.matchAllQuery())
-                    .size(1)
-            )
-        val response: SearchResponse = client.suspendUntil { client.search(request, it) }
-        if (response.status() !== RestStatus.OK) {
-            throw IOException("Failed to get max seq no for shard: $shard")
-        }
-        nonPercolateSearchesTimeTakenStat += response.took.millis
-        if (response.hits.hits.isEmpty())
-            return -1L
-
-        return response.hits.hits[0].seqNo
-    }
-
     private fun getShardsCount(clusterService: ClusterService, index: String): Int {
         val allShards: List<ShardRouting> = clusterService!!.state().routingTable().allShards(index)
         return allShards.filter { it.primary() }.size
@@ -697,51 +674,79 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
     private suspend fun fetchShardDataAndMaybeExecutePercolateQueries(
         monitor: Monitor,
         monitorCtx: MonitorRunnerExecutionContext,
-        docExecutionCtx: DocumentExecutionContext,
-        indexName: String,
-        concreteIndexName: String,
-        conflictingFields: List<String>,
-        docIds: List<String>? = null,
+        indexExecutionCtx: IndexExecutionContext,
         monitorMetadata: MonitorMetadata,
         inputRunResults: MutableMap<String, MutableSet<String>>,
         docsToQueries: MutableMap<String, MutableList<String>>,
         monitorInputIndices: List<String>,
         concreteIndices: List<String>,
         fieldsToBeQueried: List<String>,
+        updateLastRunContext: (String, String) -> Unit
     ) {
-        val count: Int = docExecutionCtx.updatedLastRunContext["shards_count"] as Int
+        val count: Int = indexExecutionCtx.updatedLastRunContext["shards_count"] as Int
         for (i: Int in 0 until count) {
             val shard = i.toString()
             try {
-                val maxSeqNo: Long = docExecutionCtx.updatedLastRunContext[shard].toString().toLong()
-                val prevSeqNo = docExecutionCtx.lastRunContext[shard].toString().toLongOrNull()
-
-                val hits: SearchHits = searchShard(
-                    monitorCtx,
-                    concreteIndexName,
-                    shard,
-                    prevSeqNo,
-                    maxSeqNo,
-                    docIds,
-                    fieldsToBeQueried
-                )
-                val startTime = System.currentTimeMillis()
-                transformedDocs.addAll(
-                    transformSearchHitsAndReconstructDocs(
-                        hits,
-                        indexName,
-                        concreteIndexName,
-                        monitor.id,
-                        conflictingFields,
+                val prevSeqNo = indexExecutionCtx.lastRunContext[shard].toString().toLongOrNull()
+                val from = prevSeqNo ?: SequenceNumbers.NO_OPS_PERFORMED
+                var to: Long = Long.MAX_VALUE
+                while (to >= from) {
+                    val hits: SearchHits = searchShard(
+                        monitorCtx,
+                        indexExecutionCtx.concreteIndexName,
+                        shard,
+                        from,
+                        to,
+                        indexExecutionCtx.docIds,
+                        fieldsToBeQueried,
                     )
-                )
-                docTransformTimeTakenStat += System.currentTimeMillis() - startTime
+                    if (hits.hits.isEmpty()) {
+                        if (to == Long.MAX_VALUE) {
+                            updateLastRunContext(shard, (prevSeqNo ?: SequenceNumbers.NO_OPS_PERFORMED).toString()) // didn't find any docs
+                        }
+                        break
+                    }
+                    if (to == Long.MAX_VALUE) { // max sequence number of shard needs to be computed
+                        updateLastRunContext(shard, hits.hits[0].seqNo.toString())
+                    }
+                    val leastSeqNoFromHits = hits.hits.last().seqNo
+                    to = leastSeqNoFromHits - 1
+                    val startTime = System.currentTimeMillis()
+                    transformedDocs.addAll(
+                        transformSearchHitsAndReconstructDocs(
+                            hits,
+                            indexExecutionCtx.indexName,
+                            indexExecutionCtx.concreteIndexName,
+                            monitor.id,
+                            indexExecutionCtx.conflictingFields,
+                        )
+                    )
+                    if (
+                        transformedDocs.isNotEmpty() &&
+                        shouldPerformPercolateQueryAndFlushInMemoryDocs(transformedDocs.size, monitorCtx)
+                    ) {
+                        performPercolateQueryAndResetCounters(
+                            monitorCtx,
+                            monitor,
+                            monitorMetadata,
+                            monitorInputIndices,
+                            concreteIndices,
+                            inputRunResults,
+                            docsToQueries,
+                        )
+                    }
+                    docTransformTimeTakenStat += System.currentTimeMillis() - startTime
+                }
             } catch (e: Exception) {
                 logger.error(
                     "Monitor ${monitor.id} :" +
-                        " Failed to run fetch data from shard [$shard] of index [$concreteIndexName]. Error: ${e.message}",
+                        "Failed to run fetch data from shard [$shard] of index [${indexExecutionCtx.concreteIndexName}]. " +
+                        "Error: ${e.message}",
                     e
                 )
+                if (e is IndexClosedException) {
+                    throw e
+                }
             }
             if (
                 transformedDocs.isNotEmpty() &&
@@ -833,8 +838,10 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
             .source(
                 SearchSourceBuilder()
                     .version(true)
+                    .sort("_seq_no", SortOrder.DESC)
+                    .seqNoAndPrimaryTerm(true)
                     .query(boolQueryBuilder)
-                    .size(10000)
+                    .size(monitorCtx.docLevelMonitorShardFetchSize)
             )
             .preference(Preference.PRIMARY_FIRST.type())

@@ -846,7 +853,6 @@ class DocumentLevelMonitorRunner : MonitorRunner() {
         }
         val response: SearchResponse = monitorCtx.client!!.suspendUntil { monitorCtx.client!!.search(request, it) }
         if (response.status() !== RestStatus.OK) {
-            logger.error("Failed search shard. Response: $response")
             throw IOException("Failed to search shard: [$shard] in index [$index]. Response status is ${response.status()}")
         }
         nonPercolateSearchesTimeTakenStat += response.took.millis

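For readability, the rewritten per-shard loop above can be reduced to the following skeleton. It is a standalone sketch, not the plugin's API: `fetchPage`, `processPage`, and `updateLastRunContext` stand in for the real `searchShard` call, the transform/percolate step, and the callback now passed into `fetchShardDataAndMaybeExecutePercolateQueries`.

    import org.opensearch.index.seqno.SequenceNumbers
    import org.opensearch.search.SearchHits

    // Walk a shard's documents from newest to oldest in pages; only the first page
    // (bounded above by Long.MAX_VALUE) records the shard's new max sequence number.
    fun pageShardDescending(
        prevSeqNo: Long?,                                // checkpoint from the previous run, if any
        fetchPage: (from: Long, to: Long) -> SearchHits, // page sorted by _seq_no descending
        processPage: (SearchHits) -> Unit,               // transform docs, maybe percolate
        updateLastRunContext: (String) -> Unit           // persist the new checkpoint
    ) {
        val from = prevSeqNo ?: SequenceNumbers.NO_OPS_PERFORMED
        var to = Long.MAX_VALUE
        while (to >= from) {
            val hits = fetchPage(from, to)
            if (hits.hits.isEmpty()) {
                // Nothing newer than the checkpoint: keep the previous value on the first page.
                if (to == Long.MAX_VALUE) updateLastRunContext(from.toString())
                break
            }
            // First page only: the top hit carries the shard's current max sequence number.
            if (to == Long.MAX_VALUE) updateLastRunContext(hits.hits[0].seqNo.toString())
            processPage(hits)
            to = hits.hits.last().seqNo - 1 // next page covers strictly older documents
        }
    }

This is also why the shard entries in the run context are now initialized to `SequenceNumbers.UNASSIGNED_SEQ_NO` up front: the true max is filled in from the first page of results instead of being looked up with a dedicated search per shard.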
alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerExecutionContext.kt

+2
@@ -55,4 +55,6 @@ data class MonitorRunnerExecutionContext(
     @Volatile var percQueryMaxNumDocsInMemory: Int = AlertingSettings.DEFAULT_PERCOLATE_QUERY_NUM_DOCS_IN_MEMORY,
     @Volatile var percQueryDocsSizeMemoryPercentageLimit: Int =
         AlertingSettings.DEFAULT_PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT,
+    @Volatile var docLevelMonitorShardFetchSize: Int =
+        AlertingSettings.DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
 )

alerting/src/main/kotlin/org/opensearch/alerting/MonitorRunnerService.kt

+8
@@ -26,6 +26,7 @@ import org.opensearch.alerting.settings.AlertingSettings
 import org.opensearch.alerting.settings.AlertingSettings.Companion.ALERT_BACKOFF_COUNT
 import org.opensearch.alerting.settings.AlertingSettings.Companion.ALERT_BACKOFF_MILLIS
 import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_FETCH_ONLY_QUERY_FIELDS_ENABLED
+import org.opensearch.alerting.settings.AlertingSettings.Companion.DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE
 import org.opensearch.alerting.settings.AlertingSettings.Companion.FINDINGS_INDEXING_BATCH_SIZE
 import org.opensearch.alerting.settings.AlertingSettings.Companion.INDEX_TIMEOUT
 import org.opensearch.alerting.settings.AlertingSettings.Companion.MAX_ACTIONABLE_ALERT_COUNT
@@ -202,6 +203,13 @@ object MonitorRunnerService : JobRunner, CoroutineScope, AbstractLifecycleCompon
             monitorCtx.percQueryDocsSizeMemoryPercentageLimit = it
         }

+        monitorCtx.docLevelMonitorShardFetchSize =
+            DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE.get(monitorCtx.settings)
+        monitorCtx.clusterService!!.clusterSettings
+            .addSettingsUpdateConsumer(DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE) {
+                monitorCtx.docLevelMonitorShardFetchSize = it
+            }
+
         return this
     }

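The two added statements follow the usual dynamic-setting pattern: read the registered setting once when the runner is configured, then register an update consumer so the cached value follows cluster setting changes without a restart. A minimal standalone sketch (the holder class and field names are placeholders, not part of the plugin):

    import org.opensearch.common.settings.ClusterSettings
    import org.opensearch.common.settings.Setting
    import org.opensearch.common.settings.Settings

    // Cache a dynamic int setting and keep it current via the cluster settings update consumer.
    class FetchSizeHolder(settings: Settings, clusterSettings: ClusterSettings, setting: Setting<Int>) {
        @Volatile var fetchSize: Int = setting.get(settings)

        init {
            clusterSettings.addSettingsUpdateConsumer(setting) { fetchSize = it }
        }
    }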
alerting/src/main/kotlin/org/opensearch/alerting/model/DocumentExecutionContext.kt

-14
This file was deleted.
alerting/src/main/kotlin/org/opensearch/alerting/model/IndexExecutionContext.kt

+19

@@ -0,0 +1,19 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.alerting.model
+
+import org.opensearch.commons.alerting.model.DocLevelQuery
+
+/** DTO that contains all the necessary context for fetching data from shard and performing percolate queries */
+data class IndexExecutionContext(
+    val queries: List<DocLevelQuery>,
+    val lastRunContext: MutableMap<String, Any>,
+    val updatedLastRunContext: MutableMap<String, Any>,
+    val indexName: String,
+    val concreteIndexName: String,
+    val conflictingFields: List<String>,
+    val docIds: List<String>? = null,
+)

alerting/src/main/kotlin/org/opensearch/alerting/settings/AlertingSettings.kt

+11
@@ -20,6 +20,7 @@ class AlertingSettings {
         const val DEFAULT_FINDINGS_INDEXING_BATCH_SIZE = 1000
         const val DEFAULT_PERCOLATE_QUERY_NUM_DOCS_IN_MEMORY = 50000
         const val DEFAULT_PERCOLATE_QUERY_DOCS_SIZE_MEMORY_PERCENTAGE_LIMIT = 10
+        const val DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE = 10000

         val ALERTING_MAX_MONITORS = Setting.intSetting(
             "plugins.alerting.monitor.max_monitors",
@@ -38,6 +39,16 @@
             Setting.Property.NodeScope, Setting.Property.Dynamic
         )

+        /** Purely a setting used to verify seq_no calculation
+         */
+        val DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE = Setting.intSetting(
+            "plugins.alerting.monitor.doc_level_monitor_shard_fetch_size",
+            DEFAULT_DOC_LEVEL_MONITOR_SHARD_FETCH_SIZE,
+            1,
+            10000,
+            Setting.Property.NodeScope, Setting.Property.Dynamic
+        )
+
         /** Defines the threshold of the maximum number of docs accumulated in memory to query against percolate query index in document
          * level monitor execution. The docs are being collected from searching on shards of indices mentioned in the
          * monitor input indices field. When the number of in-memory docs reaches or exceeds threshold we immediately perform percolate

alerting/src/main/kotlin/org/opensearch/alerting/transport/TransportIndexMonitorAction.kt

+9 -1
@@ -189,7 +189,15 @@ class TransportIndexMonitorAction @Inject constructor(
                 else (it as DocLevelMonitorInput).indices
             indices.addAll(inputIndices)
         }
-        val searchRequest = SearchRequest().indices(*indices.toTypedArray())
+        val updatedIndices = indices.map { index ->
+            if (IndexUtils.isAlias(index, clusterService.state()) || IndexUtils.isDataStream(index, clusterService.state())) {
+                val metadata = clusterService.state().metadata.indicesLookup[index]?.writeIndex
+                metadata?.index?.name ?: index
+            } else {
+                index
+            }
+        }
+        val searchRequest = SearchRequest().indices(*updatedIndices.toTypedArray())
             .source(SearchSourceBuilder.searchSource().size(1).query(QueryBuilders.matchAllQuery()))
         client.search(
             searchRequest,

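The alias and data stream handling above amounts to resolving each name to its write index through the cluster state's indices lookup, so monitor creation validates a single concrete index instead of fanning out to every backing index of an alias or data stream. A rough standalone sketch (`resolveToWriteIndex` is illustrative, not a function in the plugin):

    import org.opensearch.cluster.ClusterState

    // Resolve an alias or data stream to its write index; concrete index names pass through.
    fun resolveToWriteIndex(state: ClusterState, indexOrAlias: String): String {
        val abstraction = state.metadata.indicesLookup[indexOrAlias] ?: return indexOrAlias
        val writeIndex = abstraction.writeIndex ?: return indexOrAlias
        return writeIndex.index.name
    }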