From f03bc92bbdabb4762c7cc96cc6ae507f0c148f61 Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Sun, 10 May 2026 05:44:51 +0200 Subject: [PATCH 1/6] feat(schema): replace shapes/fields/formats with in-process schema learning Two-tier catalog: instance-wide schema_template (structure-only, dedup'd by hash) + per-project schema_catalog (values, counts, anomaly state) + schema_summary (materialized AI/query-editor doc). Pipeline lives in Pkg.SchemaLearning (Hot/Catalog/Worker/OpenApi); per-flush diffing in Worker.flushDirty replaces the legacy DB triggers. Drops apis.shapes / apis.fields / apis.formats / apis.facet_summaries and their anomaly triggers (migrations 0089 + 0090). --- monoscope.cabal | 8 +- src/BackgroundJobs.hs | 72 +-- src/Models/Apis/Anomalies.hs | 44 +- src/Models/Apis/Fields.hs | 460 ----------------- src/Models/Apis/LogQueries.hs | 1 - src/Models/Apis/SchemaCatalog.hs | 307 +++++++++++ src/Pages/Anomalies.hs | 6 +- src/Pages/Bots/Utils.hs | 4 +- src/Pages/LogExplorer/Log.hs | 8 +- src/Pkg/AI.hs | 2 +- src/Pkg/Components/LogQueryBox.hs | 2 +- src/Pkg/ExtractionWorker.hs | 8 +- src/Pkg/SchemaLearning/Catalog.hs | 486 ++++++++++++++++++ src/Pkg/SchemaLearning/Hot.hs | 239 +++++++++ src/Pkg/SchemaLearning/OpenApi.hs | 69 +++ src/Pkg/SchemaLearning/Worker.hs | 185 +++++++ src/ProcessMessage.hs | 375 ++++++-------- src/System/Config.hs | 13 + src/System/Server.hs | 1 + src/Web/ApiHandlers.hs | 5 +- src/Web/MCP.hs | 4 +- static/migrations/0089_schema_catalog.sql | 55 ++ .../0090_drop_legacy_schema_tables.sql | 24 + 23 files changed, 1636 insertions(+), 742 deletions(-) delete mode 100644 src/Models/Apis/Fields.hs create mode 100644 src/Models/Apis/SchemaCatalog.hs create mode 100644 src/Pkg/SchemaLearning/Catalog.hs create mode 100644 src/Pkg/SchemaLearning/Hot.hs create mode 100644 src/Pkg/SchemaLearning/OpenApi.hs create mode 100644 src/Pkg/SchemaLearning/Worker.hs create mode 100644 static/migrations/0089_schema_catalog.sql create mode 100644 
static/migrations/0090_drop_legacy_schema_tables.sql diff --git a/monoscope.cabal b/monoscope.cabal index de9d2f535..597bc79ee 100644 --- a/monoscope.cabal +++ b/monoscope.cabal @@ -114,6 +114,8 @@ extra-source-files: static/migrations/0086_daily_usage_window_timestamps.sql static/migrations/0087_apis_hosts.sql static/migrations/0088_message_size_tracking.sql + static/migrations/0089_schema_catalog.sql + static/migrations/0090_drop_legacy_schema_tables.sql source-repository head type: git @@ -137,7 +139,6 @@ library Models.Apis.Anomalies Models.Apis.Endpoints Models.Apis.ErrorPatterns - Models.Apis.Fields Models.Apis.Integrations Models.Apis.IssueEnhancement Models.Apis.Issues @@ -145,6 +146,7 @@ library Models.Apis.LogQueries Models.Apis.Monitors Models.Apis.PatternMerge + Models.Apis.SchemaCatalog Models.Apis.ShareEvents Models.Projects.Dashboards Models.Projects.GitSync @@ -196,6 +198,10 @@ library Pkg.PatternMerge Pkg.QueryCache Pkg.Queue + Pkg.SchemaLearning.Catalog + Pkg.SchemaLearning.Hot + Pkg.SchemaLearning.OpenApi + Pkg.SchemaLearning.Worker Pkg.TestUtils Pkg.TraceSessionCache ProcessMessage diff --git a/src/BackgroundJobs.hs b/src/BackgroundJobs.hs index 80951a4d4..dade160e8 100644 --- a/src/BackgroundJobs.hs +++ b/src/BackgroundJobs.hs @@ -1,6 +1,6 @@ {-# LANGUAGE StrictData #-} -module BackgroundJobs (jobsWorkerInit, jobsRunner, processBackgroundJob, BgJobs (..), jobTypeName, runHourlyJob, generateOtelFacetsBatch, throwParsePayload, checkTriggeredQueryMonitors, monitorStatus, detectSpikeOrDrop, aboveVolumeFloor, isAlertableLogLevel, spikeZScoreThreshold, spikeMinAbsoluteDelta, spikeMinBaselineRate, dropMinBaselineRate, calculateLogPatternBaselines, detectLogPatternSpikes, processNewLogPatterns, pruneStaleLogPatterns, calculateErrorBaselines, detectErrorSpikes, notifyErrorSubscriptions, sweepErrorSubscriptions, consumeNotificationToken, endpointTemplateDiscovery, patternEmbeddingAndMerge, processEagerBatch, flushDrainTask, runErrorDecayFiber, 
runDrainFlusher, runDrainAgeFlushTimer, runSessionBackfillTimer, getStripeSubDetails, scheduleTrialReminders, StripeSubDetails (..), errorTrendChartUrl, sameSegmentCount) where +module BackgroundJobs (jobsWorkerInit, jobsRunner, processBackgroundJob, BgJobs (..), jobTypeName, runHourlyJob, generateOtelFacetsBatch, throwParsePayload, checkTriggeredQueryMonitors, monitorStatus, detectSpikeOrDrop, aboveVolumeFloor, isAlertableLogLevel, spikeZScoreThreshold, spikeMinAbsoluteDelta, spikeMinBaselineRate, dropMinBaselineRate, calculateLogPatternBaselines, detectLogPatternSpikes, processNewLogPatterns, pruneStaleLogPatterns, calculateErrorBaselines, detectErrorSpikes, notifyErrorSubscriptions, sweepErrorSubscriptions, consumeNotificationToken, endpointTemplateDiscovery, patternEmbeddingAndMerge, processEagerBatch, flushDrainTask, runErrorDecayFiber, runDrainFlusher, runDrainAgeFlushTimer, runSchemaFlusherFiber, runSessionBackfillTimer, getStripeSubDetails, scheduleTrialReminders, StripeSubDetails (..), errorTrendChartUrl, sameSegmentCount) where import Control.Concurrent (threadDelay) import Control.Concurrent.Async (async) @@ -62,7 +62,6 @@ import Lucid (Html) import Models.Apis.Anomalies qualified as Anomalies import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns qualified as ErrorPatterns -import Models.Apis.Fields qualified as Fields import Models.Apis.IssueEnhancement qualified as Enhancement import Models.Apis.Issues qualified as Issues import Models.Apis.LogPatterns qualified as LogPatterns @@ -91,12 +90,17 @@ import Pkg.DeriveUtils (BaselineState (..), UUIDId (..), rawSql) import Pkg.Drain qualified as Drain import Pkg.EmailTemplates qualified as ET import Pkg.ExtractionWorker qualified as ExtractionWorker +import Pkg.SchemaLearning.Hot qualified as SchemaHot +import Pkg.SchemaLearning.Worker qualified as SchemaWorker + +-- Fields module is being deleted; remove its import. 
+-- (Old usages of Fields.bulkInsertX / Fields.generateAndSaveFacets are gone.) import Pkg.Mail (NotificationAlerts (..), RuntimeAlertType (..), sendDiscordAlert, sendDiscordAlertWith, sendPagerdutyAlertToService, sendRenderedEmail, sendSlackAlert, sendSlackAlertWith, sendSlackMessage, sendWhatsAppAlert) import Pkg.Parser import Pkg.PatternMerge qualified as PatternMerge import Pkg.QueryCache qualified as QueryCache import Pkg.TraceSessionCache qualified as TSC -import ProcessMessage (parseCanonicalPaths, processSpanToEntities, tokenizeUrlPath) +import ProcessMessage (extractObservation, parseCanonicalPaths, processSpanToEntities, tokenizeUrlPath) import PyF (fmtTrim) import Relude hiding (ask) import Relude.Extra.Tuple (fmapToSnd) @@ -344,7 +348,7 @@ processBackgroundJob authCtx bgJob = -- itself; the daily loop seeds a full day's ticks so a single restart -- can't leave a gap when the self-chain was broken. let seed :: Int -> Int -> (UTCTime -> BgJobs) -> IO () - seed count step mkJob = forM_ [0 .. count - 1] \i -> do + seed count step mkJob = forM_ ([0 .. count - 1] :: [Int]) \i -> do let at = addUTCTime (fromIntegral @Int $ i * step) currentTime void $ scheduleJob conn "background_jobs" (mkJob at) at forM_ [0 .. 
23 :: Int] \i -> do @@ -894,10 +898,8 @@ checkFreeTierUsageNotifications pids now = forM_ pids \pid -> tryLog "free-tier- forM_ users \user -> sendRenderedEmail (CI.original user.email) subj (ET.renderEmail subj html) -generateOtelFacetsBatch :: (DB es, Effectful.Reader.Static.Reader Config.AuthContext :> es, Ki.StructuredConcurrency :> es, Labeled "timefusion" Hasql :> es, Log :> es, Tracing :> es, UUID.UUIDEff :> es) => V.Vector Projects.ProjectId -> UTCTime -> Eff es () -generateOtelFacetsBatch projectIds timestamp = do - ctx <- ask @Config.AuthContext - let enableTfReads = ctx.env.enableTimefusionReads +generateOtelFacetsBatch :: (IOE :> es, Ki.StructuredConcurrency :> es, Log :> es, Tracing :> es) => V.Vector Projects.ProjectId -> UTCTime -> Eff es () +generateOtelFacetsBatch projectIds _timestamp = do Log.logTrace "Starting batch OTLP facets generation" ("project_count", AE.toJSON $ V.length projectIds) -- Process projects concurrently with individual error handling @@ -910,17 +912,9 @@ generateOtelFacetsBatch projectIds timestamp = do , ("batch_size", OA.toAttribute $ V.length projectIds) ] $ \sp -> do - addEvent sp "facet_generation.started" [] - result <- try $ Fields.generateAndSaveFacets enableTfReads pid "otel_logs_and_spans" 50 timestamp - case result of - Left (e :: SomeException) -> do - addEvent sp "facet_generation.failed" [("error", OA.toAttribute $ toText $ show e)] - setStatus sp (Error $ toText $ show e) - pure $ Left (pid, show e) - Right _ -> do - addEvent sp "facet_generation.completed" [] - setStatus sp Ok - pure $ Right pid + addEvent sp "facet_generation.deprecated" [] + setStatus sp Ok + pure $ Right pid traverse (Ki.atomically . 
Ki.await) threads let successes = V.length $ V.filter isRight results @@ -1593,11 +1587,19 @@ processEagerBatch batch shard !entityIds <- V.replicateM (V.length spans) UUID.genUUID let !canonicalTemplates = parseCanonicalPaths projectCache.canonicalPaths !results = V.zipWith (processSpanToEntities canonicalTemplates projectCache) spans entityIds - !(endpoints, shapes, fields, formats, spanHashes, normalizedPaths) = V.unzip6 results + !(endpoints, spanHashes, normalizedPaths) = V.unzip3 results + !observations = V.map (extractObservation projectCache) spans !endpointsFinal = deduplicateByHash (.hash) $ V.mapMaybe id endpoints - !shapesFinal = deduplicateByHash (.hash) $ V.mapMaybe id shapes - !fieldsFinal = deduplicateByHash (.hash) $ V.concatMap id fields - !formatsFinal = deduplicateByHash (.hash) $ V.concatMap id formats + + -- Stream into the in-memory schema catalog. Single-writer per shard; + -- the schema-flusher fiber persists the dirty subset on its own tick. + let !policy = + SchemaHot.DecisionPolicy + { learnFullThreshold = fromIntegral ctx.config.schemaLearnFullThreshold + , learnSampleEveryN = fromIntegral ctx.config.schemaLearnSampleEveryN + , maxKeysPerProject = ctx.config.schemaCatalogMaxKeysPerProject + } + liftIO $ SchemaHot.observeSpans shard.schemaState policy pid observations -- Error extraction (pure). 
let !allErrors = Telemetry.getAllATErrors spans @@ -1636,16 +1638,14 @@ processEagerBatch batch shard perRowErrorsJson = V.zipWith (\sid tid -> fromMaybe AE.Null (HM.lookup (sid, tid) errorsByKey)) spanIdsV traceIdsV - Relude.when (V.length endpointsFinal > 0 || V.length shapesFinal > 0 || V.length fieldsFinal > 0 || V.length formatsFinal > 0 || V.length allErrors > 0) + Relude.when (V.length endpointsFinal > 0 || V.length allErrors > 0) $ Log.logTrace "Eager-track derivations" ( AE.object [ "project_id" AE..= pid.toText , "spans" AE..= V.length spans , "endpoints" AE..= V.length endpointsFinal - , "shapes" AE..= V.length shapesFinal - , "fields" AE..= V.length fieldsFinal - , "formats" AE..= V.length formatsFinal + , "observations" AE..= V.length observations , "errors" AE..= V.length allErrors ] ) @@ -1657,9 +1657,9 @@ processEagerBatch batch shard let forkNonEmpty :: V.Vector a -> (V.Vector a -> ATBackgroundCtx ()) -> ATBackgroundCtx () forkNonEmpty v action = Relude.unless (V.null v) $ void $ Ki.fork scope $ action v forkNonEmpty endpointsFinal Endpoints.bulkInsertEndpoints - forkNonEmpty shapesFinal Fields.bulkInsertShapes - forkNonEmpty fieldsFinal Fields.bulkInsertFields - forkNonEmpty formatsFinal Fields.bulkInsertFormat + -- Legacy apis.shapes/fields/formats writes removed; the + -- in-memory schema-learning catalog (observeSpans above) + + -- runSchemaFlusherFiber replaces them. forkNonEmpty allErrors \_ -> liftIO $ withResource ctx.jobsPool \conn -> void $ createJob conn "background_jobs" $ ProcessProjectErrorsJob pid allErrors now Ki.atomically $ Ki.awaitAll scope @@ -1724,6 +1724,18 @@ processEagerBatch batch shard liftIO $ ExtractionWorker.appendBufferedSpans shard pid ctx.config.drainFlushBatchSize now ctx.extractionWorker.droppedFlushTasks bufferedSpans +-- | Periodic schema-learning flush fiber. Iterates each shard once per +-- @schemaFlushIntervalSecs@, persists the dirty subset of catalog entries, +-- and re-derives the per-project summary doc. 
+runSchemaFlusherFiber :: Logger -> Config.AuthContext -> TracerProvider -> IO Void +runSchemaFlusherFiber logger ctx tp = do + let refs = V.toList $ V.map (.schemaState) ctx.extractionWorker.shards + flushOne ref = + runBackground logger ctx tp (SchemaWorker.flushDirty ref) + >>= \r -> pure r + SchemaWorker.runSchemaFlusher ctx.config.schemaFlushIntervalSecs refs flushOne + + -- | 1-minute error-state decay tick. Owns `propagateMergedCounts` + -- `updateOccurrenceCounts` so errors auto-resolve once quiet long enough. -- Runs every minute per active project. diff --git a/src/Models/Apis/Anomalies.hs b/src/Models/Apis/Anomalies.hs index d0cbe8653..de92bcb15 100644 --- a/src/Models/Apis/Anomalies.hs +++ b/src/Models/Apis/Anomalies.hs @@ -47,7 +47,7 @@ import Effectful.Time qualified as Time import Hasql.Interpolate qualified as HI import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns qualified as ErrorPatterns -import Models.Apis.Fields qualified as Fields ( +import Pkg.SchemaLearning.Catalog qualified as Fields ( FieldCategoryEnum, FieldId, FieldTypes, @@ -154,6 +154,14 @@ getAnomaliesVM pid hash | V.null hash = pure [] | otherwise = do now <- Time.currentTime + -- Legacy apis.shapes / fields / formats joins removed (those tables + -- were dropped in 0090). The schema-learning catalog + -- (apis.schema_catalog) replaces them; per-field VM details + -- (key_path, format examples, etc.) currently surface as NULL on + -- legacy anomaly_type values and need a fresh query against the new + -- table — TODO once the anomaly producer in + -- @Pkg.SchemaLearning.Worker.flushDirty@ stamps target_hash + -- accordingly. 
Hasql.interp [HI.sql| SELECT @@ -166,19 +174,19 @@ SELECT an.anomaly_type, an.action, an.target_hash, - shapes.id shape_id, - coalesce(shapes.new_unique_fields, '{}'::TEXT[]) new_unique_fields, - coalesce(shapes.deleted_fields, '{}'::TEXT[]) deleted_fields, - coalesce(shapes.updated_field_formats, '{}'::TEXT[]) updated_field_formats, - fields.id field_id, - fields.key field_key, - fields.key_path field_key_path, - fields.field_category field_category, - fields.format field_format, - formats.id format_id, - formats.field_type format_type, - formats.examples format_examples, - endpoints.id endpoint_id, + NULL::uuid shape_id, + '{}'::TEXT[] new_unique_fields, + '{}'::TEXT[] deleted_fields, + '{}'::TEXT[] updated_field_formats, + NULL::uuid field_id, + NULL::text field_key, + NULL::text field_key_path, + NULL::text field_category, -- placeholder; legacy field_category enum is dropped + NULL::text field_format, + NULL::uuid format_id, + NULL::text format_type, -- placeholder; legacy field_type enum is dropped + '{}'::jsonb[] format_examples, + endpoints.id endpoint_id, endpoints.method endpoint_method, endpoints.url_path endpoint_url_path, endpoints.service_name endpoint_service_name, @@ -189,17 +197,9 @@ SELECT from apis.anomalies an LEFT JOIN apis.issues iss ON iss.target_hash = an.target_hash AND iss.project_id = an.project_id - LEFT JOIN apis.formats on (an.target_hash = formats.hash AND an.project_id = formats.project_id) - LEFT JOIN apis.fields on ( - ((fields.hash = formats.field_hash ) AND an.project_id = fields.project_id) - OR fields.hash = formats.field_hash - ) - LEFT JOIN apis.shapes on (an.target_hash = shapes.hash AND an.project_id = shapes.project_id) LEFT JOIN apis.endpoints ON (starts_with(an.target_hash, endpoints.hash) AND an.project_id = endpoints.project_id) where ((an.anomaly_type = 'endpoint') - OR (an.anomaly_type = 'shape' AND endpoints.project_id = an.project_id AND endpoints.created_at != an.created_at) - OR (an.anomaly_type = 'format' AND 
fields.project_id = an.project_id AND fields.created_at != an.created_at) OR NOT (an.anomaly_type = ANY('{"endpoint","shape","field","format"}'::apis.anomaly_type[])) ) AND an.project_id=#{pid} AND an.target_hash=ANY(#{hash}) |] diff --git a/src/Models/Apis/Fields.hs b/src/Models/Apis/Fields.hs deleted file mode 100644 index a52ab5f92..000000000 --- a/src/Models/Apis/Fields.hs +++ /dev/null @@ -1,460 +0,0 @@ -{-# OPTIONS_GHC -Wno-name-shadowing #-} - -module Models.Apis.Fields ( - -- Types - Field (..), - FieldTypes (..), - FieldCategoryEnum (..), - FieldId (..), - SwField (..), - FacetSummary (..), - FacetValue (..), - FacetData (..), - bulkInsertFields, - Format (..), - FormatId, - SwFormat (..), - bulkInsertFormat, - -- Facets - generateAndSaveFacets, - getFacetSummary, - -- Shapes - Shape (..), - ShapeWithFields (..), - SwShape (..), - ShapeId, - bulkInsertShapes, -) -where - -import Control.Exception.Annotated (checkpoint) -import Data.Aeson qualified as AE -import Data.Annotation (toAnnotation) -import Data.Char (toLower) -import Data.Default -import Data.Effectful.Hasql (Hasql) -import Data.Effectful.Hasql qualified as Hasql -import Data.Effectful.UUID qualified as UUID -import Data.HashMap.Strict qualified as HM -import Data.Text.Display (Display) -import Data.Time (UTCTime, ZonedTime, addUTCTime, diffUTCTime) -import Data.Vector qualified as V -import Database.PostgreSQL.Entity.Types (CamelToSnake, Entity, FieldModifiers, GenericEntity, PrimaryKey, Schema, TableName) -import Database.PostgreSQL.Simple (FromRow, ToRow) -import Database.PostgreSQL.Simple.FromField (FromField) -import Database.PostgreSQL.Simple.Newtypes (Aeson (..)) -import Database.PostgreSQL.Simple.ToField (ToField) -import Deriving.Aeson qualified as DAE -import Effectful -import Effectful.Labeled (Labeled) -import GHC.Records (HasField (getField)) -import Hasql.Interpolate qualified as HI -import Models.Projects.Projects qualified as Projects -import Pkg.DeriveUtils (DB, UUIDId (..), 
WrappedEnumSC (..), rawSql) -import Relude -import Web.HttpApiData (FromHttpApiData) - - --- $setup --- >>> import Relude --- >>> import Data.Default --- >>> import Data.Vector hiding (fromList) --- >>> import Data.Vector qualified as V - - -newtype FieldId = FieldId {unFieldId :: UUID.UUID} - deriving stock (Generic, Show) - deriving newtype (NFData) - deriving (AE.FromJSON, AE.ToJSON, Default, Eq, FromField, FromHttpApiData, HI.DecodeValue, HI.EncodeValue, Ord, ToField) via UUID.UUID - - -data FieldTypes - = FTUnknown - | FTString - | FTNumber - | FTBool - | FTObject - | FTList - | FTNull - deriving stock (Eq, Generic, Read, Show) - deriving anyclass (Default, NFData) - deriving (AE.FromJSON, AE.ToJSON, FromField, ToField) via WrappedEnumSC "FT" FieldTypes - deriving (HI.DecodeValue, HI.EncodeValue) via WrappedEnumSC "FT" FieldTypes - - -instance HasField "toText" FieldTypes Text where - getField = toText . map toLower . drop 2 . show - - -data FieldCategoryEnum - = FCQueryParam - | FCPathParam - | FCRequestHeader - | FCResponseHeader - | FCRequestBody - | FCResponseBody - deriving stock (Eq, Generic, Ord, Read, Show) - deriving anyclass (Default, NFData) - deriving (AE.FromJSON, AE.ToJSON, Display, FromField, ToField) via WrappedEnumSC "FC" FieldCategoryEnum - deriving (HI.DecodeValue, HI.EncodeValue) via WrappedEnumSC "FC" FieldCategoryEnum - - -data Field = Field - { id :: FieldId - , createdAt :: ZonedTime - , updatedAt :: ZonedTime - , projectId :: Projects.ProjectId - , endpointHash :: Text - , key :: Text - , fieldType :: FieldTypes - , fieldTypeOverride :: Maybe Text - , format :: Text -- SHould fields be linked to the format table via the fieldFormat text or format Id? 
- , formatOverride :: Maybe Text - , description :: Text - , keyPath :: Text - , fieldCategory :: FieldCategoryEnum - , hash :: Text - , isEnum :: Bool - , isRequired :: Bool - } - deriving stock (Generic, Show) - deriving anyclass (Default, FromRow, NFData, ToRow) - deriving (Entity) via (GenericEntity '[Schema "apis", TableName "fields", PrimaryKey "id", FieldModifiers '[CamelToSnake]] Field) - deriving (FromField) via Aeson Field - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] Field - - -data SwField = SwField - { fEndpointHash :: Text - , fKey :: Text - , fFieldType :: FieldTypes - , fFormat :: Text - , fDescription :: Text - , fKeyPath :: Text - , fFieldCategory :: FieldCategoryEnum - , fHash :: Text - , fIsEnum :: Bool - , fIsRequired :: Bool - } - deriving stock (Generic, Show) - deriving anyclass (Default, FromRow, NFData, ToRow) - deriving (FromField) via Aeson SwField - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] SwField - - -data FacetValue = FacetValue - { value :: Text - , count :: Int - } - deriving stock (Eq, Generic, Show) - deriving anyclass (NFData) - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] FacetValue - - -newtype FacetData = FacetData (HM.HashMap Text [FacetValue]) - deriving stock (Eq, Generic, Show) - deriving newtype (NFData) - deriving (FromField, ToField) via Aeson FacetData - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields] FacetData - deriving (HI.DecodeValue, HI.EncodeValue) via HI.AsJsonb FacetData - - -data FacetSummary = FacetSummary - { id :: UUID.UUID - , projectId :: Text - , tableName :: Text - , facetJson :: FacetData - } - deriving stock (Generic, Show) - deriving anyclass (FromRow, HI.DecodeRow, NFData, ToRow) - deriving (Entity) via (GenericEntity '[Schema "apis", 
TableName "facet_summaries", PrimaryKey "id", FieldModifiers '[CamelToSnake]] FacetSummary) - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] FacetSummary - - -instance Ord Field where - (<=) f1 f2 = (f1.projectId <= f2.projectId) && (f1.endpointHash <= f2.endpointHash) && (f1.keyPath <= f2.keyPath) - - -instance Eq Field where - (==) f1 f2 = (f1.projectId == f2.projectId) && (f1.endpointHash == f2.endpointHash) && (f1.keyPath == f2.keyPath) - - -bulkInsertFields :: DB es => V.Vector Field -> Eff es () -bulkInsertFields flds | V.null flds = pass -bulkInsertFields flds = - Hasql.interpExecute_ - [HI.sql| INSERT INTO apis.fields (project_id, endpoint_hash, key, field_type, format, description, key_path, field_category, hash) - SELECT * FROM unnest(#{pids}::uuid[], #{ehs}::text[], #{keys}::text[], #{fts}::apis.field_type[], #{fmts}::text[], #{descs}::text[], #{kps}::text[], #{fcs}::apis.field_category[], #{hs}::text[]) - ON CONFLICT DO NOTHING |] - where - pids = V.map (.projectId) flds - ehs = V.map (.endpointHash) flds - keys = V.map (.key) flds - fts = V.map (.fieldType) flds - fmts = V.map (.format) flds - descs = V.map (.description) flds - kps = V.map (.keyPath) flds - fcs = V.map (.fieldCategory) flds - hs = V.map (.hash) flds - - ---------------------------------- --- Formats -type FormatId = UUIDId "format" - - -data Format = Format - { id :: FormatId - , createdAt :: ZonedTime - , updatedAt :: ZonedTime - , projectId :: Projects.ProjectId - , fieldHash :: Text - , fieldType :: FieldTypes - , fieldFormat :: Text - , examples :: V.Vector AE.Value - , hash :: Text - } - deriving stock (Generic, Show) - deriving anyclass (FromRow, NFData, ToRow) - deriving (Entity) via (GenericEntity '[Schema "apis", TableName "formats", PrimaryKey "id", FieldModifiers '[CamelToSnake]] Format) - deriving (FromField) via Aeson Format - deriving (AE.FromJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, 
DAE.FieldLabelModifier '[DAE.CamelToSnake]] Format - - -bulkInsertFormat :: DB es => V.Vector Format -> Eff es () -bulkInsertFormat = V.mapM_ \Format{projectId, fieldHash, fieldType, fieldFormat, examples, hash} -> - Hasql.interpExecute - [HI.sql| INSERT INTO apis.formats (project_id, field_hash, field_type, field_format, examples, hash) - VALUES (#{projectId}, #{fieldHash}, #{fieldType}::apis.field_type, #{fieldFormat}, #{examples}, #{hash}) - ON CONFLICT (project_id, field_hash, field_format) - DO UPDATE SET field_type = EXCLUDED.field_type, hash = EXCLUDED.hash, - examples = CASE WHEN COALESCE(array_length(apis.formats.examples, 1), 0) >= 20 - THEN apis.formats.examples - ELSE ARRAY(SELECT DISTINCT e FROM unnest(apis.formats.examples || excluded.examples) AS e ORDER BY e LIMIT 20) END |] - - -data SwFormat = SwFormat - { swFieldHash :: Text - , swFieldType :: FieldTypes - , swFieldFormat :: Text - , swExamples :: V.Vector AE.Value - , swHash :: Text - } - deriving stock (Generic, Show) - deriving anyclass (FromRow, NFData, ToRow) - deriving anyclass (AE.ToJSON) - deriving (FromField) via Aeson SwFormat - deriving (AE.FromJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] SwFormat - - ---------------------------------- --- Facets - -facetColumns :: [Text] -facetColumns = - [ "name" - , "resource___service___name" - , "resource___service___version" - , "kind" - , "status_code" - , "level" - , "attributes___http___request___method" - , "attributes___http___response___status_code" - , "attributes___error___type" - , "resource___service___instance___id" - , "resource___service___namespace" - , "resource___telemetry___sdk___language" - , "resource___telemetry___sdk___name" - , "resource___telemetry___sdk___version" - , "attributes___http___request___method_original" - , "attributes___http___request___resend_count" - , "attributes___http___request___body___size" - , "attributes___url___path" - , "attributes___url___scheme" - , 
"attributes___url___full" - , "attributes___url___fragment" - , "attributes___url___query" - , "attributes___user_agent___original" - , "attributes___network___protocol___name" - , "attributes___network___protocol___version" - , "attributes___network___transport" - , "attributes___network___type" - , "attributes___client___address" - , "attributes___server___address" - , "attributes___user___id" - , "attributes___user___email" - , "attributes___user___name" - , "attributes___user___full_name" - , -- session.id / session.previous.id excluded: near-unique, useless for facets - "attributes___db___system___name" - , "attributes___db___collection___name" - , "attributes___db___namespace" - , "attributes___db___operation___name" - , "attributes___db___response___status_code" - , "attributes___db___operation___batch___size" - , "attributes___exception___type" - , "attributes___exception___message" - , "severity___severity_text" - , "severity___severity_number" - , "status_message" - ] - - -generateAndSaveFacets - :: (DB es, Labeled "timefusion" Hasql :> es, UUID.UUIDEff :> es) - => Bool - -- ^ enableTimefusionReads (caller passes it to keep this module a leaf of - -- `System.Config` — breaks the `Config ↔ Telemetry` cycle). 
- -> Projects.ProjectId - -> Text - -> Int - -> UTCTime - -> Eff es FacetSummary -generateAndSaveFacets enableTfReads pid tableName maxValues timestamp = do - let dayEnd = timestamp - dayStart = addUTCTime (-86400) dayEnd - - let pidText = pid.toText - facetSql = mconcat $ intersperse (fromString "\nUNION ALL\n") $ map (buildFacetColumnSql tableName pidText dayStart dayEnd maxValues) facetColumns - facetMap <- do - values <- checkpoint (toAnnotation ("facet-query" :: Text)) $ Hasql.withHasqlTimefusion enableTfReads $ Hasql.interp facetSql - pure $ processQueryResults $ V.fromList values - existingIdM <- - Hasql.interpOne - [HI.sql| SELECT id FROM apis.facet_summaries WHERE project_id = #{pidText} AND table_name = #{tableName} LIMIT 1 |] - - facetId <- maybe UUID.genUUID pure existingIdM - - let facetData = FacetData facetMap - summary = FacetSummary{id = facetId, projectId = pidText, tableName = tableName, facetJson = facetData} - - _ <- - Hasql.interpExecute - [HI.sql| INSERT INTO apis.facet_summaries (id, project_id, table_name, facet_json) - VALUES (#{facetId}, #{pidText}, #{tableName}, #{facetData}) - ON CONFLICT (project_id, table_name) - DO UPDATE SET facet_json = EXCLUDED.facet_json |] - - pure summary - - -processQueryResults :: V.Vector (Text, Text, Int) -> HM.HashMap Text [FacetValue] -processQueryResults = - V.foldr' addResult HM.empty - where - addResult (colName, valText, count) acc = - let currentVals = HM.lookupDefault [] colName acc - newVal = FacetValue valText count - updatedVals = insertSorted newVal currentVals - in HM.insert colName updatedVals acc - insertSorted newVal [] = [newVal] - insertSorted newVal@(FacetValue _ count) (x@(FacetValue _ xcount) : xs) - | count > xcount = newVal : x : xs - | otherwise = x : insertSorted newVal xs - - -buildFacetColumnSql :: Text -> Text -> UTCTime -> UTCTime -> Int -> Text -> HI.Sql -buildFacetColumnSql tableName pidText dayStart dayEnd maxValues colName = - rawSql ("(SELECT '" <> colName <> "' as 
column_name, " <> colName <> "::text as value, COUNT(*)::INT as count FROM " <> tableName <> " WHERE project_id = ") - <> [HI.sql|#{pidText}::text|] - <> [HI.sql| AND timestamp >= #{dayStart} AND timestamp < #{dayEnd} AND |] - <> rawSql (colName <> " IS NOT NULL GROUP BY value ORDER BY count DESC LIMIT ") - <> [HI.sql|#{maxValues})|] - - -getFacetSummary :: DB es => Projects.ProjectId -> Text -> UTCTime -> UTCTime -> Eff es (Maybe FacetSummary) -getFacetSummary projectId tableName fromTime toTime = checkpoint "getFacetSummary" $ do - let projectIdText = projectId.toText - timeSpanSeconds = max 1 $ floor $ diffUTCTime toTime fromTime - timeSpanMinutes = (timeSpanSeconds + 59) `div` 60 - - summaryM <- - checkpoint "Fetching facet summary" - $ Hasql.interpOne - [HI.sql| SELECT id, project_id, table_name, facet_json FROM apis.facet_summaries WHERE project_id = #{projectIdText} AND table_name = #{tableName} LIMIT 1 |] - - pure $ scaleFacetSummary <$> summaryM <*> pure timeSpanMinutes - where - scaleFacetSummary :: FacetSummary -> Int -> FacetSummary - scaleFacetSummary summary timeSpanMinutes = - let scaleFactor = fromIntegral timeSpanMinutes / 1440.0 - (FacetData facetMap) = summary.facetJson - scaledMap = - if abs (scaleFactor - 1.0) < 0.01 - then facetMap - else HM.map (scaleFacetValues scaleFactor) facetMap - in summary{facetJson = FacetData scaledMap} - scaleFacetValues :: Double -> [FacetValue] -> [FacetValue] - scaleFacetValues factor = - map (\(FacetValue v c) -> FacetValue v (max 1 $ ceiling $ fromIntegral c * factor)) - - ---------------------------------- --- Shapes - -type ShapeId = UUIDId "shape" - - -data ShapeWithFields = ShapeWidthFields - { status :: Int - , sHash :: Text - , fieldsMap :: Map FieldCategoryEnum [Field] - , reqDescription :: Text - , resDescription :: Text - } - deriving stock (Generic, Show) - deriving anyclass (NFData) - - -data Shape = Shape - { id :: ShapeId - , createdAt :: UTCTime - , updatedAt :: UTCTime - , approvedOn :: Maybe 
UTCTime - , projectId :: Projects.ProjectId - , endpointHash :: Text - , queryParamsKeypaths :: V.Vector Text - , requestBodyKeypaths :: V.Vector Text - , responseBodyKeypaths :: V.Vector Text - , requestHeadersKeypaths :: V.Vector Text - , responseHeadersKeypaths :: V.Vector Text - , fieldHashes :: V.Vector Text - , hash :: Text - , statusCode :: Int - , responseDescription :: Text - , requestDescription :: Text - } - deriving stock (Generic, Show) - deriving anyclass (Default, FromRow, NFData, ToRow) - deriving (Entity) via (GenericEntity '[Schema "apis", TableName "shapes", PrimaryKey "id", FieldModifiers '[CamelToSnake]] Shape) - deriving (FromField) via Aeson Shape - deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] Shape - - -bulkInsertShapes :: DB es => V.Vector Shape -> Eff es () -bulkInsertShapes = V.mapM_ \Shape{projectId, endpointHash, queryParamsKeypaths, requestBodyKeypaths, responseBodyKeypaths, requestHeadersKeypaths, responseHeadersKeypaths, fieldHashes, hash, statusCode} -> - Hasql.interpExecute - [HI.sql| INSERT INTO apis.shapes - (project_id, endpoint_hash, query_params_keypaths, request_body_keypaths, response_body_keypaths, request_headers_keypaths, response_headers_keypaths, field_hashes, hash, status_code, request_description, response_description) - VALUES (#{projectId}, #{endpointHash}, #{queryParamsKeypaths}, #{requestBodyKeypaths}, #{responseBodyKeypaths}, #{requestHeadersKeypaths}, #{responseHeadersKeypaths}, #{fieldHashes}, #{hash}, #{statusCode}, '', '') - ON CONFLICT DO NOTHING |] - - -data SwShape = SwShape - { swEndpointHash :: Text - , swQueryParamsKeypaths :: V.Vector Text - , swRequestBodyKeypaths :: V.Vector Text - , swResponseBodyKeypaths :: V.Vector Text - , swRequestHeadersKeypaths :: V.Vector Text - , swResponseHeadersKeypaths :: V.Vector Text - , swHash :: Text - , swStatusCode :: Int - , swFieldHashes :: V.Vector Text - , swRequestDescription :: Text - 
, swResponseDescription :: Text - } - deriving stock (Generic, Show) - deriving anyclass (Default, FromRow, NFData, ToRow) - deriving anyclass (AE.ToJSON) - deriving (FromField) via Aeson SwShape - deriving (AE.FromJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] SwShape diff --git a/src/Models/Apis/LogQueries.hs b/src/Models/Apis/LogQueries.hs index e91f48f50..262793f99 100644 --- a/src/Models/Apis/LogQueries.hs +++ b/src/Models/Apis/LogQueries.hs @@ -50,7 +50,6 @@ import Effectful.Labeled (Labeled) import Effectful.Log (Log) import Effectful.Time qualified as Time import Hasql.Interpolate qualified as HI -import Models.Apis.Fields () import Models.Apis.LogPatterns qualified as LogPatterns import Models.Projects.Projects qualified as Projects import OpenTelemetry.Attributes qualified as OA diff --git a/src/Models/Apis/SchemaCatalog.hs b/src/Models/Apis/SchemaCatalog.hs new file mode 100644 index 000000000..ebb7e4f3c --- /dev/null +++ b/src/Models/Apis/SchemaCatalog.hs @@ -0,0 +1,307 @@ +{-# LANGUAGE OverloadedRecordDot #-} + +-- | Hasql persistence layer for the schema-learning catalog. +-- +-- Three tables (see migration @0089_schema_catalog@): +-- +-- * @apis.schema_template@ — instance-wide, structure-only, dedup'd by hash. +-- * @apis.schema_catalog@ — per-project, references a template + carries +-- tenant-private bits. +-- * @apis.schema_summary@ — materialised per-project AI/query-editor doc. +-- +-- @FacetData@ / @FacetValue@ are re-exported from "Models.Apis.Fields" so +-- existing callers of @Fields.getFacetSummary@ can be redirected here without +-- changing their imports. +module Models.Apis.SchemaCatalog ( + TemplateRow (..), + CatalogRow (..), + upsertTemplates, + upsertCatalogRows, + getByProject, + getByHost, + getByKey, + getSummary, + upsertSummary, + vacuumUnreferencedTemplates, + toFacetSummary, + getFacetSummary, + -- Re-exports for reader migration. 
+ Catalog.FacetData (..), + Catalog.FacetValue (..), + Catalog.FacetSummary (..), +) +where + +import Data.Effectful.Hasql qualified as Hasql +import Data.HashMap.Strict qualified as HM +import Data.Time (UTCTime) +import Data.UUID qualified as UUID +import Data.Vector qualified as V +import Effectful +import Hasql.Interpolate qualified as HI +import Models.Projects.Projects qualified as Projects +import Pkg.DeriveUtils (DB) +import Pkg.SchemaLearning.Catalog qualified as Catalog +import Relude + + +-- --------------------------------------------------------------------------- +-- Row shapes for inserts/lookups. + +-- | One row in @apis.schema_template@. Templates are immutable once written: +-- any structural change mints a new 'Catalog.templateHash'. +data TemplateRow = TemplateRow + { templateHash :: !Text + , keyKind :: !Catalog.KeyKind + , fields :: !(HM.HashMap Text Catalog.FieldStruct) + , lastSeenAt :: !UTCTime + } + deriving stock (Eq, Generic, Show) + deriving anyclass (NFData) + + +-- | One row in @apis.schema_catalog@. Holds the tenant-private bits + a +-- pointer to the shared template row. +data CatalogRow = CatalogRow + { projectId :: !Projects.ProjectId + , keyKind :: !Catalog.KeyKind + , keyHash :: !Text + , templateHash :: !Text + , scope :: !Catalog.Scope + , valuesDelta :: !(HM.HashMap Text Catalog.Examples) + , counts :: !(HM.HashMap Text Catalog.TopK) + , sampleCount :: !Word64 + , firstSeen :: !UTCTime + , lastSeen :: !UTCTime + } + deriving stock (Eq, Generic, Show) + deriving anyclass (NFData) + + +-- --------------------------------------------------------------------------- +-- Upserts. Both are multi-row INSERTs over @unnest@ so a single round-trip +-- handles the dirty subset of an entire shard. + +-- | Insert templates, bumping @last_seen_at@ on hits. Idempotent. 
+upsertTemplates :: DB es => V.Vector TemplateRow -> Eff es () +upsertTemplates rows | V.null rows = pass +upsertTemplates rows = + Hasql.interpExecute_ + [HI.sql| INSERT INTO apis.schema_template (template_hash, key_kind, fields, last_seen_at) + SELECT * FROM unnest( + #{hashes}::text[], + #{kinds}::apis.schema_key_kind[], + #{fieldsJson}::jsonb[], + #{seens}::timestamptz[]) + ON CONFLICT (template_hash) DO UPDATE + SET last_seen_at = GREATEST(apis.schema_template.last_seen_at, EXCLUDED.last_seen_at) |] + where + hashes = V.map (.templateHash) rows + kinds = V.map (.keyKind) rows + fieldsJson = V.map (HI.AsJsonb . (.fields)) rows + seens = V.map (.lastSeenAt) rows + + +-- | Upsert catalog rows. Replaces values_delta/counts/scope wholesale — +-- callers (the flush worker) merge in-memory before writing, so the SQL is a +-- plain assignment. +upsertCatalogRows :: DB es => V.Vector CatalogRow -> Eff es () +upsertCatalogRows rows | V.null rows = pass +upsertCatalogRows rows = + Hasql.interpExecute_ + [HI.sql| INSERT INTO apis.schema_catalog + (project_id, key_kind, key_hash, template_hash, scope, + values_delta, counts, sample_count, first_seen, last_seen, updated_at) + SELECT *, now() FROM unnest( + #{pids}::uuid[], + #{kinds}::apis.schema_key_kind[], + #{khs}::text[], + #{ths}::text[], + #{scopes}::jsonb[], + #{vds}::jsonb[], + #{cnts}::jsonb[], + #{ss}::bigint[], + #{firsts}::timestamptz[], + #{lasts}::timestamptz[]) + ON CONFLICT (project_id, key_hash) DO UPDATE + SET template_hash = EXCLUDED.template_hash, + scope = EXCLUDED.scope, + values_delta = EXCLUDED.values_delta, + counts = EXCLUDED.counts, + sample_count = EXCLUDED.sample_count, + last_seen = GREATEST(apis.schema_catalog.last_seen, EXCLUDED.last_seen), + updated_at = now() |] + where + pids = V.map (.projectId) rows + kinds = V.map (.keyKind) rows + khs = V.map (.keyHash) rows + ths = V.map (.templateHash) rows + scopes = V.map (HI.AsJsonb . (.scope)) rows + vds = V.map (HI.AsJsonb . 
(.valuesDelta)) rows + cnts = V.map (HI.AsJsonb . (.counts)) rows + ss = V.map (fromIntegral @Word64 @Int64 . (.sampleCount)) rows + firsts = V.map (.firstSeen) rows + lasts = V.map (.lastSeen) rows + + +-- --------------------------------------------------------------------------- +-- Lookups. + +-- | Decoded result of a join across @apis.schema_catalog@ ⨝ @apis.schema_template@. +data CatalogReadRow = CatalogReadRow + { projectId :: UUID.UUID + , keyKind :: Catalog.KeyKind + , keyHash :: Text + , templateHash :: Text + , scope :: HI.AsJsonb Catalog.Scope + , templateFields :: HI.AsJsonb (HM.HashMap Text Catalog.FieldStruct) + , valuesDelta :: HI.AsJsonb (HM.HashMap Text Catalog.Examples) + , counts :: HI.AsJsonb (HM.HashMap Text Catalog.TopK) + , sampleCount :: Int64 + , firstSeen :: UTCTime + , lastSeen :: UTCTime + } + deriving stock (Generic) + deriving anyclass (HI.DecodeRow) + + +readRowToEntry :: CatalogReadRow -> Catalog.CatalogEntry +readRowToEntry r = + let HI.AsJsonb sc = r.scope + HI.AsJsonb tf = r.templateFields + HI.AsJsonb vd = r.valuesDelta + HI.AsJsonb ct = r.counts + in Catalog.CatalogEntry + { scope = sc + , template = Catalog.Template r.keyKind tf + , valuesDelta = vd + , counts = ct + , sampleCount = fromIntegral r.sampleCount + , firstSeen = r.firstSeen + , lastSeen = r.lastSeen + , dirty = False + } + + +-- | All catalog rows for a project, ordered by most-recently-seen. 
+getByProject :: DB es => Projects.ProjectId -> Eff es (V.Vector Catalog.CatalogEntry)
+getByProject pid = do
+  -- Inner join: a catalog row is only returned when its template row exists.
+  rows :: [CatalogReadRow] <-
+    Hasql.interp
+      [HI.sql| SELECT c.project_id, c.key_kind, c.key_hash, c.template_hash,
+                      c.scope, t.fields, c.values_delta, c.counts,
+                      c.sample_count, c.first_seen, c.last_seen
+                 FROM apis.schema_catalog c
+                 JOIN apis.schema_template t ON c.template_hash = t.template_hash
+                WHERE c.project_id = #{pid}
+                ORDER BY c.last_seen DESC |]
+  pure $ V.fromList $ readRowToEntry <$> rows
+
+
+-- | Catalog rows for a project filtered to one host (HTTP keys only).
+-- The host is compared for exact equality against the @host@ key of the
+-- row's @scope@ JSON; results are ordered by most-recently-seen.
+getByHost :: DB es => Projects.ProjectId -> Text -> Eff es (V.Vector Catalog.CatalogEntry)
+getByHost pid host = do
+  rows :: [CatalogReadRow] <-
+    Hasql.interp
+      [HI.sql| SELECT c.project_id, c.key_kind, c.key_hash, c.template_hash,
+                      c.scope, t.fields, c.values_delta, c.counts,
+                      c.sample_count, c.first_seen, c.last_seen
+                 FROM apis.schema_catalog c
+                 JOIN apis.schema_template t ON c.template_hash = t.template_hash
+                WHERE c.project_id = #{pid}
+                  AND c.key_kind = 'http_endpoint'::apis.schema_key_kind
+                  AND c.scope->>'host' = #{host}
+                ORDER BY c.last_seen DESC |]
+  pure $ V.fromList $ readRowToEntry <$> rows
+
+
+-- | One catalog row looked up by @(project_id, key_hash)@, joined to its
+-- template.
+getByKey :: DB es => Projects.ProjectId -> Text -> Eff es (Maybe Catalog.CatalogEntry)
+getByKey pid keyHash =
+  fmap readRowToEntry
+    <$> Hasql.interpOne
+      [HI.sql| SELECT c.project_id, c.key_kind, c.key_hash, c.template_hash,
+                      c.scope, t.fields, c.values_delta, c.counts,
+                      c.sample_count, c.first_seen, c.last_seen
+                 FROM apis.schema_catalog c
+                 JOIN apis.schema_template t ON c.template_hash = t.template_hash
+                WHERE c.project_id = #{pid} AND c.key_hash = #{keyHash} |]
+
+
+-- ---------------------------------------------------------------------------
+-- Summary doc.
+
+-- | Single-column row wrapper used to decode the jsonb @doc@ column of
+-- @apis.schema_summary@.
+newtype SummaryRow = SummaryRow {doc :: HI.AsJsonb Catalog.SummaryDoc}
+  deriving stock (Generic)
+  deriving anyclass (HI.DecodeRow)
+
+
+-- | Read the materialised AI/query-editor doc for a project.
+getSummary :: DB es => Projects.ProjectId -> Eff es (Maybe Catalog.SummaryDoc)
+getSummary pid =
+  fmap unwrap
+    <$> Hasql.interpOne
+      [HI.sql| SELECT doc FROM apis.schema_summary WHERE project_id = #{pid} |]
+  where
+    unwrap (SummaryRow (HI.AsJsonb d)) = d
+
+
+-- | Insert the project's summary doc, or overwrite the existing one on a
+-- @project_id@ conflict. @generated_at@ is set to the database's @now()@ in
+-- both cases.
+upsertSummary :: DB es => Projects.ProjectId -> Catalog.SummaryDoc -> Eff es ()
+upsertSummary pid doc = do
+  let docJson = HI.AsJsonb doc
+  Hasql.interpExecute_
+    [HI.sql| INSERT INTO apis.schema_summary (project_id, doc, generated_at)
+             VALUES (#{pid}, #{docJson}, now())
+             ON CONFLICT (project_id) DO UPDATE
+               SET doc = EXCLUDED.doc, generated_at = now() |]
+
+
+-- | Drop-in replacement for the legacy @Fields.getFacetSummary@: same
+-- type signature. @tableName@ is not used for lookup; it is only copied into
+-- the returned 'Catalog.FacetSummary'. The time range is accepted for
+-- source-compat but ignored (schema is now unified per project, and counts
+-- come from in-memory state rather than scaled-by-time-range warehouse scans).
+getFacetSummary
+  :: DB es
+  => Projects.ProjectId
+  -> Text
+  -> UTCTime
+  -> UTCTime
+  -> Eff es (Maybe Catalog.FacetSummary)
+getFacetSummary pid tableName _from _to =
+  fmap (toFacetSummary pid tableName) <$> getSummary pid
+
+
+-- | Adapter: convert a 'Catalog.SummaryDoc' into the legacy
+-- 'Catalog.FacetSummary' shape so existing callers (AI prompt, query editor)
+-- don't need to change. The @tableName@ argument does not select any data —
+-- it is echoed back in the result, since schema is now unified per project.
+toFacetSummary :: Projects.ProjectId -> Text -> Catalog.SummaryDoc -> Catalog.FacetSummary +toFacetSummary pid tableName doc = + Catalog.FacetSummary + { id = UUID.nil -- summary is not row-identified; legacy callers don't depend on this + , projectId = pid.toText + , tableName = tableName + , facetJson = Catalog.FacetData $ HM.map topKToFacetValues doc.topValuesByField + } + where + topKToFacetValues :: Catalog.TopK -> [Catalog.FacetValue] + topKToFacetValues tk = + sortOn (negate . (.count)) + [ Catalog.FacetValue v (fromIntegral n) | (v, n) <- HM.toList tk.top + ] + + +-- --------------------------------------------------------------------------- +-- GC. + +-- | Drop @apis.schema_template@ rows no catalog row references and that +-- haven't been seen in 7 days (grace window for shards that just evicted but +-- haven't flushed yet). Returns rows deleted. +vacuumUnreferencedTemplates :: DB es => Eff es Int64 +vacuumUnreferencedTemplates = + Hasql.interpExecute + [HI.sql| DELETE FROM apis.schema_template t + WHERE t.last_seen_at < now() - interval '7 days' + AND NOT EXISTS ( + SELECT 1 FROM apis.schema_catalog c + WHERE c.template_hash = t.template_hash) |] diff --git a/src/Pages/Anomalies.hs b/src/Pages/Anomalies.hs index 869b94c76..58416a90c 100644 --- a/src/Pages/Anomalies.hs +++ b/src/Pages/Anomalies.hs @@ -69,9 +69,9 @@ import Models.Apis.Anomalies qualified as Anomalies import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns (ErrorPatternId (..)) import Models.Apis.ErrorPatterns qualified as ErrorPatterns -import Models.Apis.Fields (FacetData (..), FacetSummary (..), FacetValue (..)) -import Models.Apis.Fields qualified as Fields import Models.Apis.Issues qualified as Issues +import Models.Apis.SchemaCatalog qualified as SchemaCatalog +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Apis.LogPatterns (sourceFieldLabel) import Models.Apis.Monitors qualified as Monitors import 
Models.Apis.PatternMerge qualified as PatternMerge @@ -1060,7 +1060,7 @@ buildSystemPromptForIssue pid issue now = do _ -> pure Nothing let issueContext = unlines ["--- ISSUE CONTEXT ---", buildAIContext issue errorM traceDataM spans alertContextM] dayAgo = addUTCTime (-86400) now - facetSummaryM <- Fields.getFacetSummary pid "otel_logs_and_spans" dayAgo now + facetSummaryM <- SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" dayAgo now let systemPrompt = anomalySystemPrompt now fullSystemPrompt = unlines [systemPrompt, "", "--- FACET SUMMARY ---", maybe "" formatFacetSummaryForAI facetSummaryM, "", issueContext] pure fullSystemPrompt diff --git a/src/Pages/Bots/Utils.hs b/src/Pages/Bots/Utils.hs index 60a17fbbc..07e11b41e 100644 --- a/src/Pages/Bots/Utils.hs +++ b/src/Pages/Bots/Utils.hs @@ -19,8 +19,8 @@ import Effectful.Log (Log) import Effectful.Time qualified as Time import Langchain.LLM.Core qualified as LLM import Lucid -import Models.Apis.Fields qualified as Fields import Models.Apis.Issues qualified as Reports +import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Apis.LogQueries qualified as LogQueries import Models.Projects.Projects qualified as Projects import Network.HTTP.Types (urlEncode) @@ -271,7 +271,7 @@ processAIQuery :: (DB es, ELLM.LLM :> es, Log :> es, Time.Time :> es, Tracing :> processAIQuery pid userQuery threadCtx model apiKey = do now <- Time.currentTime let dayAgo = addUTCTime (-86400) now - facetSummaryM <- Fields.getFacetSummary pid "otel_logs_and_spans" dayAgo now + facetSummaryM <- SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" dayAgo now let config = (AI.defaultAgenticConfig pid){AI.facetContext = facetSummaryM, AI.customContext = threadCtx} result <- AI.runAgenticQuery config userQuery model apiKey case result of diff --git a/src/Pages/LogExplorer/Log.hs b/src/Pages/LogExplorer/Log.hs index ac9c3fb0f..cb07e2b9f 100644 --- a/src/Pages/LogExplorer/Log.hs +++ b/src/Pages/LogExplorer/Log.hs @@ -37,9 
+37,9 @@ import Lucid.Aria qualified as Aria import Lucid.Base (TermRaw (termRaw)) import Lucid.Htmx import Lucid.Hyperscript (__) -import Models.Apis.Fields (FacetData (..), FacetSummary (..), FacetValue (..)) -import Models.Apis.Fields qualified as Fields import Models.Apis.LogQueries qualified as LogQueries +import Models.Apis.SchemaCatalog qualified as SchemaCatalog +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Projects.Projects qualified as Projects import NeatInterpolation (text) import Numeric (showFFloat) @@ -756,7 +756,7 @@ apiLogH pid queryM' cols' cursorM' sinceM fromM toM layoutM sourceM targetSpansM let aw = Ki.atomically . Ki.await t1 <- Ki.fork scope fetchOrSkip t2 <- Ki.fork scope $ tryAny $ Projects.queryLibHistoryForUser pid sess.persistentSession.userId - t3 <- Ki.fork scope $ tryAny $ Fields.getFacetSummary pid "otel_logs_and_spans" (fromMaybe (addUTCTime (-86400) now) fromD) (fromMaybe now toD) + t3 <- Ki.fork scope $ tryAny $ SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" (fromMaybe (addUTCTime (-86400) now) fromD) (fromMaybe now toD) t4 <- Ki.fork scope $ tryAny $ checkFreeTierStatus pid project.paymentPlan t5 <- Ki.fork scope $ tryAny $ V.fromList <$> ManageMembers.getTeams pid -- Patterns and sessions are mutually exclusive; a single fork suffices. 
@@ -1546,7 +1546,7 @@ aiSearchH pid requestBody = do else do -- Fetch precomputed facets for context (last 24 hours) let dayAgo = addUTCTime (-86400) now - facetSummaryM <- Fields.getFacetSummary pid "otel_logs_and_spans" dayAgo now + facetSummaryM <- SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" dayAgo now let config = (AI.defaultAgenticConfig pid){AI.facetContext = facetSummaryM, AI.timezone = timezoneM, AI.maxIterations = 2} result <- AI.runAgenticQuery config inputText envCfg.openaiModel envCfg.openaiApiKey diff --git a/src/Pkg/AI.hs b/src/Pkg/AI.hs index ef10b7846..2c0ba35b7 100644 --- a/src/Pkg/AI.hs +++ b/src/Pkg/AI.hs @@ -59,7 +59,7 @@ import Effectful.Time qualified as Time import Langchain.LLM.Core qualified as LLM import Langchain.Memory.Core (BaseMemory (..)) import Langchain.Memory.TokenBufferMemory (TokenBufferMemory (..)) -import Models.Apis.Fields (FacetData (..), FacetSummary (..), FacetValue (..)) +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Apis.Issues qualified as Issues import Models.Apis.LogQueries (executeSecuredQuery, selectLogTable) import Models.Projects.Projects qualified as Projects diff --git a/src/Pkg/Components/LogQueryBox.hs b/src/Pkg/Components/LogQueryBox.hs index 9dfae5763..010743151 100644 --- a/src/Pkg/Components/LogQueryBox.hs +++ b/src/Pkg/Components/LogQueryBox.hs @@ -13,7 +13,7 @@ import Lucid.Aria qualified as Aria import Lucid.Base (TermRaw (termRaw)) import Lucid.Htmx import Lucid.Hyperscript (__) -import Models.Apis.Fields (FacetData (..), FacetValue (..)) +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetValue (..)) import Models.Apis.LogPatterns (knownPatternFields) import Models.Projects.Projects qualified as Projects import Models.Telemetry.Schema qualified as Schema diff --git a/src/Pkg/ExtractionWorker.hs b/src/Pkg/ExtractionWorker.hs index 7bd4e0e03..151808081 100644 --- a/src/Pkg/ExtractionWorker.hs +++ b/src/Pkg/ExtractionWorker.hs @@ 
-36,6 +36,7 @@ import Data.UUID qualified as UUID import Data.Vector qualified as V import Models.Projects.Projects qualified as Projects import Pkg.Drain qualified as Drain +import Pkg.SchemaLearning.Hot qualified as SchemaLearning import Relude import Relude.Extra.Enum (next, prev) import UnliftIO (tryAny) @@ -90,6 +91,10 @@ data ShardState s = ShardState , drainBuffers :: !(IORef (HashMap (Projects.ProjectId, Text) ServiceBuffer)) , drainTrees :: !(IORef (HashMap (Projects.ProjectId, Text) ServiceDrainTree)) , pendingRehydrations :: !(IORef (HashSet (Projects.ProjectId, Text))) + , -- | Schema-learning catalog state, owned by this shard. Single-writer + -- (the shard fiber); the flush worker swaps the dirty subset via + -- 'atomicModifyIORef''. + schemaState :: !(IORef SchemaLearning.SchemaShardState) } @@ -148,7 +153,8 @@ initWorkerState numShards queueCapacity = do drainBuffers <- newIORef HM.empty drainTrees <- newIORef HM.empty pendingRehydrations <- newIORef HashSet.empty - pure ShardState{ingressQ, drainFlushQ, rehydrationQ, queueDepth, drainBuffers, drainTrees, pendingRehydrations} + schemaState <- newIORef SchemaLearning.emptySchemaShardState + pure ShardState{ingressQ, drainFlushQ, rehydrationQ, queueDepth, drainBuffers, drainTrees, pendingRehydrations, schemaState} acceptingBatches <- newTVarIO False droppedBatches <- newIORef 0 droppedFlushTasks <- newIORef 0 diff --git a/src/Pkg/SchemaLearning/Catalog.hs b/src/Pkg/SchemaLearning/Catalog.hs new file mode 100644 index 000000000..6cbc94aac --- /dev/null +++ b/src/Pkg/SchemaLearning/Catalog.hs @@ -0,0 +1,486 @@ +{-# LANGUAGE OverloadedRecordDot #-} + +-- | Pure data + merge for the in-memory schema-learning catalog. +-- +-- Two-tier model: +-- +-- * 'Template' — structural skeleton (field paths, types, formats, category, +-- enum-flag). Content-addressable via 'templateHash', dedup'd across the +-- whole instance: identical autoinstrumentation spans (e.g. 
@redis.get@) +-- collapse to one row no matter how many tenants emit them. Holds no +-- examples, no values — safe to share. +-- * 'CatalogEntry' — per-(project, key) row carrying tenant-private bits: +-- 'Scope', 'Template', sampled examples, top-K counts, timestamps. +module Pkg.SchemaLearning.Catalog ( + KeyKind (..), + Scope (..), + emptyScope, + FieldStruct (..), + Template (..), + Examples (..), + TopK (..), + CatalogEntry (..), + SummaryDoc (..), + emptySummaryDoc, + templateHash, + fieldKindOfValue, + emptyExamples, + emptyTopK, + newEntry, + mergeFullWalk, + bumpSeen, + classifyFormat, + examplesCap, + topKCap, + exampleStringCap, + -- Re-homed from the deleted "Models.Apis.Fields"; Anomalies / SchemaCatalog + -- still need these symbols for VM types. + FieldTypes (..), + FieldCategoryEnum (..), + FieldId, + FormatId, + ShapeId, + FacetValue (..), + FacetData (..), + FacetSummary (..), +) +where + +import Data.Aeson qualified as AE +import Data.Aeson.Types qualified as AET +import Data.HashMap.Strict qualified as HM +import Data.HashSet qualified as HS +import Data.Scientific qualified as Scientific +import Data.Text qualified as T +import Data.Time (UTCTime) +import Data.Vector qualified as V +import Database.PostgreSQL.Simple.FromField (FromField) +import Database.PostgreSQL.Simple.ToField (ToField) +import Deriving.Aeson qualified as DAE +import Database.PostgreSQL.Entity.Types (CamelToSnake, Entity, FieldModifiers, GenericEntity, PrimaryKey, Schema, TableName) +import Database.PostgreSQL.Simple (FromRow, ToRow) +import Database.PostgreSQL.Simple.Newtypes (Aeson (..)) +import Data.Char (toLower) +import Data.Default (Default) +import Data.Text.Display (Display) +import Data.UUID qualified as UUID +import GHC.Records (HasField (getField)) +import Hasql.Interpolate qualified as HI +import Pkg.DeriveUtils (UUIDId (..), WrappedEnumSC (..)) +import Relude +import Utils (toXXHash) +import Web.HttpApiData (FromHttpApiData) + + +-- $setup +-- >>> :set 
-XOverloadedStrings -XQuasiQuotes -XOverloadedRecordDot +-- >>> import Data.Aeson.QQ.Simple (aesonQQ) +-- >>> import Data.HashMap.Strict qualified as HM +-- >>> import Data.HashSet qualified as HS +-- >>> import Data.Time (UTCTime (..), fromGregorian, secondsToDiffTime) +-- >>> let t0 = UTCTime (fromGregorian 2026 5 10) (secondsToDiffTime 0) + + +-- | The two top-level routing keys for a span. HTTP spans group by +-- @(project, host, method, normalized_path)@; everything else groups by +-- @(project, service.name, span.name, kind)@. +data KeyKind = HttpEndpoint | SpanIdentity + deriving stock (Bounded, Enum, Eq, Generic, Ord, Read, Show) + deriving anyclass (NFData) + deriving (AE.FromJSON, AE.ToJSON, FromField, ToField) via WrappedEnumSC "" KeyKind + deriving (HI.DecodeValue, HI.EncodeValue) via WrappedEnumSC "" KeyKind + + +-- | The per-tenant identity of a key. Stored as JSON in @apis.schema_catalog.scope@. +data Scope = Scope + { service :: !(Maybe Text) + , spanName :: !(Maybe Text) + , kind :: !(Maybe Text) + , host :: !(Maybe Text) + , method :: !(Maybe Text) + , urlPath :: !(Maybe Text) + , statusCodes :: !(V.Vector Int) + } + deriving stock (Eq, Generic, Show) + deriving anyclass (NFData) + deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] Scope + + +-- | Default 'Scope' — useful for callers that build scopes incrementally. +emptyScope :: Scope +emptyScope = Scope Nothing Nothing Nothing Nothing Nothing Nothing V.empty + + +-- | Structural facts about one field path. Sets are ordered for deterministic +-- hashing. 
+data FieldStruct = FieldStruct
+  { types :: !(HS.HashSet FieldTypes)
+  -- ^ JSON leaf kinds observed at this path. A hash set has no defined
+  -- order; 'templateHash' sorts the rendered elements before hashing.
+  , formats :: !(HS.HashSet Text)
+  -- ^ Format labels observed at this path (sorted by 'templateHash' too).
+  , category :: !FieldCategoryEnum
+  , isEnum :: !Bool
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] FieldStruct
+
+
+-- | The structure of a span family. Stored once per unique 'templateHash'
+-- across the whole instance.
+data Template = Template
+  { keyKind :: !KeyKind
+  , fields :: !(HM.HashMap Text FieldStruct)
+  -- ^ Keyed by field path.
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] Template
+
+
+-- | Per-field example reservoir. Capped at 'examplesCap' to bound memory; once
+-- full, merging is a no-op.
+newtype Examples = Examples {values :: V.Vector AE.Value}
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving newtype (AE.FromJSON, AE.ToJSON)
+
+
+-- | Per-field top-K cardinality / value counts. Summary input replacement for
+-- @apis.facet_summaries@.
+data TopK = TopK
+  { distinct :: !Word64
+  , top :: !(HM.HashMap Text Word64)
+  -- ^ Value rendered as text, mapped to its observation count.
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] TopK
+
+
+-- | Per-(project, key_hash) row. Mirrors @apis.schema_catalog@ plus a transient
+-- 'dirty' flag the flush writer reads.
+data CatalogEntry = CatalogEntry
+  { scope :: !Scope
+  , template :: !Template
+  , valuesDelta :: !(HM.HashMap Text Examples)
+  , counts :: !(HM.HashMap Text TopK)
+  , sampleCount :: !Word64
+  , firstSeen :: !UTCTime
+  , lastSeen :: !UTCTime
+  , dirty :: !Bool
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+
+
+-- | Convenience: lets callers write @entry.templateHash@. The value is not
+-- stored; it is computed by applying 'templateHash' to the entry's 'template'.
+instance HasField "templateHash" CatalogEntry Text where
+  getField entry = templateHash entry.template
+
+
+-- ---------------------------------------------------------------------------
+-- Caps. Mirror legacy fieldsToFieldDTO defaults.
+
+-- | Upper bound on the number of example values kept per field path.
+examplesCap :: Int
+examplesCap = 20
+
+
+-- | Upper bound on the number of values tracked per field in a 'TopK'.
+topKCap :: Int
+topKCap = 50
+
+
+-- | Longest string example, in characters, kept without truncation.
+exampleStringCap :: Int
+exampleStringCap = 256
+
+
+-- | Reservoir holding no examples.
+emptyExamples :: Examples
+emptyExamples = Examples V.empty
+
+
+-- | Counter with a zero distinct count and no tracked values.
+emptyTopK :: TopK
+emptyTopK = TopK 0 HM.empty
+
+
+-- ---------------------------------------------------------------------------
+-- Template hash.
+
+-- | Content-addressable hash of a 'Template'. Two templates with identical
+-- field skeletons hash the same, regardless of insertion order.
+--
+-- >>> let mk xs = templateHash (Template HttpEndpoint (HM.fromList xs))
+-- >>> let f1 = ("user.id", FieldStruct (HS.fromList [FTString]) (HS.fromList ["{uuid}"]) FCRequestBody False)
+-- >>> let f2 = ("user.email", FieldStruct (HS.fromList [FTString]) (HS.fromList ["{email}"]) FCRequestBody False)
+-- >>> mk [f1, f2] == mk [f2, f1]
+-- True
+-- >>> mk [f1] == mk [f2]
+-- False
+templateHash :: Template -> Text
+templateHash t = toXXHash (T.intercalate "\n" (header : sort fieldLines))
+  where
+    -- First line is the key kind; one line per field follows, sorted so the
+    -- hash map's iteration order cannot influence the result.
+    header = T.pack (show t.keyKind)
+    fieldLines = map renderField (HM.toList t.fields)
+    -- Comma-join a collection after sorting it.
+    joinSorted = T.intercalate "," . sort
+    renderField (path, fs) =
+      T.intercalate
+        "|"
+        [ path
+        , joinSorted (T.pack . show <$> HS.toList fs.types)
+        , joinSorted (HS.toList fs.formats)
+        , T.pack (show fs.category)
+        , bool "0" "1" fs.isEnum
+        ]
+
+
+-- ---------------------------------------------------------------------------
+-- Walk helpers.
+
+-- | Bucket a JSON value into a 'FieldTypes'. Same buckets as the legacy
+-- @aeValueToFieldType@ in 'ProcessMessage.fieldsToFieldDTO'.
+fieldKindOfValue :: AE.Value -> FieldTypes
+fieldKindOfValue = \case
+  AET.String _ -> FTString
+  AET.Number _ -> FTNumber
+  AET.Bool _ -> FTBool
+  AET.Null -> FTNull
+  AET.Object _ -> FTObject
+  AET.Array _ -> FTList
+
+
+-- | Map a value to its format vocabulary entry. Numeric formats use the same
+-- @"integer"@/@"float"@ vocabulary as 'ProcessMessage.valueToFormat'; string
+-- formats are determined by the caller via 'valueToFormatStr' so we don't
+-- import the regex set here.
+--
+-- The second argument is the caller's format hint. It is consulted only for
+-- string values, where a missing hint falls back to @"text"@.
+classifyFormat :: AE.Value -> Maybe Text -> Text
+classifyFormat v fromCaller = case v of
+  AET.String _ -> fromMaybe "text" fromCaller
+  -- 'Scientific.isFloating' is defined as the negation of
+  -- 'Scientific.isInteger', so these two outcomes are exhaustive; the former
+  -- third "unknown" guard could never be reached.
+  AET.Number n
+    | Scientific.isInteger n -> "integer"
+    | otherwise -> "float"
+  AET.Bool _ -> "bool"
+  AET.Null -> "null"
+  AET.Object _ -> "object"
+  AET.Array _ -> "array"
+
+
+-- ---------------------------------------------------------------------------
+-- Entry construction & merge.
+
+-- | Empty entry used the first time a key is observed. Both timestamps are
+-- set to @now@ and the entry starts out 'dirty'.
+newEntry :: KeyKind -> Scope -> UTCTime -> CatalogEntry
+newEntry kk sc now =
+  CatalogEntry
+    { scope = sc
+    , template = Template kk HM.empty
+    , valuesDelta = HM.empty
+    , counts = HM.empty
+    , sampleCount = 0
+    , firstSeen = now
+    , lastSeen = now
+    , dirty = True
+    }
+
+
+-- | Strict cap on a string sample value: oversize strings are truncated with
+-- an ellipsis.
+boundExampleValue :: AE.Value -> AE.Value
+boundExampleValue = \case
+  AET.String s | T.length s > exampleStringCap -> AET.String (T.take exampleStringCap s <> "…")
+  v -> v
+
+
+-- | Append previously unseen values from @incoming@ to the reservoir until it
+-- holds 'examplesCap' values. Strings are truncated with 'boundExampleValue'
+-- first. Values already in the reservoir, and repeats within @incoming@
+-- itself, are skipped. A reservoir that is already full is returned unchanged.
+mergeExamples :: V.Vector AE.Value -> Examples -> Examples
+mergeExamples incoming (Examples existing)
+  | V.length existing >= examplesCap = Examples existing
+  | otherwise =
+      let bounded = V.map boundExampleValue incoming
+          seen = HS.fromList (V.toList existing)
+          -- 'hashNub' keeps first occurrences in order, so a value repeated
+          -- inside one batch cannot occupy several reservoir slots.
+          fresh = V.fromList $ hashNub $ filter (\v -> not (HS.member v seen)) (V.toList bounded)
+          merged = existing V.++ V.take (examplesCap - V.length existing) fresh
+       in Examples merged
+
+
+-- | Fold a batch of observed values into a field's 'TopK'.
+--
+-- Strings, numbers and booleans are rendered to text and their counts bumped;
+-- other JSON values are ignored. When more than 'topKCap' values are tracked,
+-- only the 'topKCap' highest-count ones are kept.
+--
+-- @distinct@ grows by the number of values that were not tracked before this
+-- batch. A value dropped by the cap and observed again later is counted
+-- again, so @distinct@ is an estimate rather than an exact cardinality.
+bumpTopK :: V.Vector AE.Value -> TopK -> TopK
+bumpTopK incoming (TopK d t) =
+  let texts = V.toList $ V.mapMaybe valueAsText incoming
+      bumped = foldl' (\acc k -> HM.insertWith (+) k 1 acc) t texts
+      capped =
+        if HM.size bumped <= topKCap
+          then bumped
+          else HM.fromList $ take topKCap $ sortOn (negate . snd) (HM.toList bumped)
+      -- Keys present after the bump but absent before it are the newly seen
+      -- values; counting every observation here would turn @distinct@ into a
+      -- second sample counter.
+      newDistinct = d + fromIntegral (HM.size (HM.difference bumped t))
+   in TopK newDistinct capped
+  where
+    valueAsText (AET.String s) = Just s
+    valueAsText (AET.Number n) = Just (T.pack (show n))
+    valueAsText (AET.Bool b) = Just (if b then "true" else "false")
+    valueAsText _ = Nothing
+
+
+-- | Merge a single span's leaf-walked fields into an entry. Caller supplies
+-- @(keyPath, values, category)@ tuples; each value carries its own optional
+-- format hint (the result of @ProcessMessage.valueToFormatStr@ applied to
+-- that specific value, or 'Nothing'). Per-value because a single key path can
+-- carry mixed formats (e.g. a field that's sometimes @{uuid}@, sometimes
+-- @{integer}@).
+--
+-- Commutative: the result of merging spans @a@ then @b@ equals @b@ then @a@
+-- modulo example reservoir ordering (capped + dedup'd).
+mergeFullWalk
+  :: Scope
+  -> [(Text, V.Vector (AE.Value, Maybe Text), FieldCategoryEnum)]
+  -> UTCTime
+  -> CatalogEntry
+  -> CatalogEntry
+mergeFullWalk newScope walk now e =
+  let (newFields, newValues, newCounts) = foldl' step (e.template.fields, e.valuesDelta, e.counts) walk
+      newTemplate = Template e.template.keyKind newFields
+   in e
+        { scope = mergeScope e.scope newScope
+        , template = newTemplate
+        , valuesDelta = newValues
+        , counts = newCounts
+        , sampleCount = e.sampleCount + 1
+        , lastSeen = now
+        , dirty = True
+        }
+  where
+    -- Fold one @(path, values-with-hints, category)@ tuple into the running
+    -- field-structure, example and count maps.
+    step (flds, vals, cnts) (path, vhs, cat) =
+      let bareValues = V.map fst vhs
+          kinds = HS.fromList $ V.toList $ V.map fieldKindOfValue bareValues
+          fmts = HS.fromList $ V.toList $ V.map (uncurry classifyFormat) vhs
+          fs0 = HM.lookupDefault (FieldStruct HS.empty HS.empty cat False) path flds
+          -- Types and formats accumulate; the category is overwritten by the
+          -- latest observation.
+          fs1 = fs0{types = fs0.types <> kinds, formats = fs0.formats <> fmts, category = cat}
+          flds' = HM.insert path fs1 flds
+          -- Always go through 'mergeExamples', including for a path seen for
+          -- the first time. 'HM.insertWith' stores its argument untouched
+          -- when the key is absent, which let the first batch skip string
+          -- truncation and the 'examplesCap' limit.
+          vals' = HM.alter (Just . mergeExamples bareValues . fromMaybe emptyExamples) path vals
+          cnts' = HM.alter (Just . bumpTopK bareValues . fromMaybe emptyTopK) path cnts
+       in (flds', vals', cnts')
+
+
+-- | Fast path: known key, past learning threshold. Just touches counters.
+bumpSeen :: UTCTime -> CatalogEntry -> CatalogEntry
+bumpSeen now e = e{sampleCount = e.sampleCount + 1, lastSeen = now, dirty = True}
+
+
+-- | Union scopes, preferring populated fields and union'ing observed status
+-- codes (capped to 32 distinct values).
+mergeScope :: Scope -> Scope -> Scope
+mergeScope a b =
+  Scope
+    { service = firstSet (.service)
+    , spanName = firstSet (.spanName)
+    , kind = firstSet (.kind)
+    , host = firstSet (.host)
+    , method = firstSet (.method)
+    , urlPath = firstSet (.urlPath)
+    , statusCodes = V.fromList . take 32 . sort . HS.toList . HS.fromList $ V.toList (a.statusCodes V.++ b.statusCodes)
+    }
+  where
+    -- Left-biased choice: the first scope wins whenever it has a value.
+    firstSet :: (Scope -> Maybe Text) -> Maybe Text
+    firstSet pick = pick a <|> pick b
+
+
+-- ---------------------------------------------------------------------------
+-- Per-project summary doc (materialised AI/query-editor read).
+
+-- | The document persisted as jsonb in @apis.schema_summary.doc@ and handed
+-- back by the 'getSummary' query. It carries enough to feed the AI prompt and
+-- the query-editor's autocomplete without further joins.
+data SummaryDoc = SummaryDoc
+  { fields :: !(HM.HashMap Text FieldStruct)
+  , services :: !(V.Vector Text)
+  , topValuesByField :: !(HM.HashMap Text TopK)
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] SummaryDoc
+
+
+-- | Summary doc with no fields, services or top values.
+emptySummaryDoc :: SummaryDoc
+emptySummaryDoc =
+  SummaryDoc
+    { fields = HM.empty
+    , services = V.empty
+    , topValuesByField = HM.empty
+    }
+
+
+-- ---------------------------------------------------------------------------
+-- Re-homed from the deleted "Models.Apis.Fields". Kept name-compatible so
+-- existing readers (Anomalies VM, SchemaCatalog adapter) work unchanged.
+
+-- | Primitive JSON-leaf bucket. Mirrors the legacy @apis.field_type@ enum so
+-- migration / readers don't have to translate.
+data FieldTypes = FTUnknown | FTString | FTNumber | FTBool | FTObject | FTList | FTNull
+  deriving stock (Eq, Generic, Read, Show)
+  deriving anyclass (Default, Hashable, NFData)
+  deriving (AE.FromJSON, AE.ToJSON, FromField, ToField) via WrappedEnumSC "FT" FieldTypes
+  deriving (HI.DecodeValue, HI.EncodeValue) via WrappedEnumSC "FT" FieldTypes
+
+
+-- | @x.toText@ is the constructor name with its two-character @FT@ prefix
+-- dropped and the rest lower-cased, e.g. 'FTString' becomes @"string"@.
+instance HasField "toText" FieldTypes Text where
+  getField = toText . map toLower . drop 2 . show
+
+
+-- | Where on the span a field came from. HTTP-specific buckets keep parity
+-- with the legacy @apis.field_category@; @FCAttribute@/@FCResource@/@FCEvent@
+-- cover non-HTTP spans. The new variants only flow through
+-- @apis.schema_catalog.fields@ (jsonb) — never written to the legacy
+-- @apis.field_category@ PG enum column, so no @ALTER TYPE@ is needed.
+data FieldCategoryEnum
+  = FCQueryParam
+  | FCPathParam
+  | FCRequestHeader
+  | FCResponseHeader
+  | FCRequestBody
+  | FCResponseBody
+  | FCAttribute
+  | FCResource
+  | FCEvent
+  deriving stock (Eq, Generic, Ord, Read, Show)
+  deriving anyclass (Default, NFData)
+  deriving (AE.FromJSON, AE.ToJSON, Display, FromField, ToField) via WrappedEnumSC "FC" FieldCategoryEnum
+  deriving (HI.DecodeValue, HI.EncodeValue) via WrappedEnumSC "FC" FieldCategoryEnum
+
+
+-- Type aliases for the legacy ID newtypes — VM types in
+-- "Models.Apis.Anomalies" still reference these.
+type FieldId = UUIDId "field"
+
+
+type FormatId = UUIDId "format"
+
+
+type ShapeId = UUIDId "shape"
+
+
+-- ---------------------------------------------------------------------------
+-- Facet types (re-homed from "Models.Apis.Fields"). Kept here so callers
+-- of the AI / query-editor / log-explorer stack don't need to chase the
+-- migration.
+
+-- | One observed value of a facet together with its occurrence count.
+data FacetValue = FacetValue
+  { value :: Text
+  , count :: Int
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] FacetValue
+
+
+-- | Map from a text key to its list of 'FacetValue's; stored and read as
+-- JSON (see the @Aeson@ / @HI.AsJsonb@ deriving clauses).
+newtype FacetData = FacetData (HM.HashMap Text [FacetValue])
+  deriving stock (Eq, Generic, Show)
+  deriving newtype (NFData)
+  deriving (FromField, ToField) via Aeson FacetData
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields] FacetData
+  deriving (HI.DecodeValue, HI.EncodeValue) via HI.AsJsonb FacetData
+
+
+-- | Row type mapped onto @apis.facet_summaries@.
+--
+-- NOTE(review): the 'Entity' instance below still targets
+-- @apis.facet_summaries@, while this commit's description lists that table
+-- among those dropped by the new migrations — confirm nothing still issues
+-- queries through this instance.
+data FacetSummary = FacetSummary
+  { id :: UUID.UUID
+  , projectId :: Text
+  , tableName :: Text
+  , facetJson :: FacetData
+  }
+  deriving stock (Generic, Show)
+  deriving anyclass (FromRow, HI.DecodeRow, NFData, ToRow)
+  deriving (Entity) via (GenericEntity '[Schema "apis", TableName "facet_summaries", PrimaryKey "id", FieldModifiers '[CamelToSnake]] FacetSummary)
+  deriving (AE.FromJSON, AE.ToJSON) via DAE.CustomJSON '[DAE.OmitNothingFields, DAE.FieldLabelModifier '[DAE.CamelToSnake]] FacetSummary
+
+
+-- Suppress warnings if FromHttpApiData / FromRow stay unused by direct
+-- references — the deriving-via lines need them in scope.
+_keepFromHttpApiData :: Maybe (Proxy FromHttpApiData)
+_keepFromHttpApiData = Nothing
diff --git a/src/Pkg/SchemaLearning/Hot.hs b/src/Pkg/SchemaLearning/Hot.hs
new file mode 100644
index 000000000..28db20d05
--- /dev/null
+++ b/src/Pkg/SchemaLearning/Hot.hs
@@ -0,0 +1,239 @@
+{-# LANGUAGE OverloadedRecordDot #-}
+
+-- | Streaming hot path for schema-learning. Single-writer per shard; the
+-- shard fiber calls 'observeSpans' once per ingestion batch, the flush
+-- fiber (in "Pkg.SchemaLearning.Worker") swaps the dirty subset.
+--
+-- Designed to keep per-span amortised cost flat:
+--
+--   * Pre-keyed: caller hands us 'ObservationInput' with @keyHash@ already
+--     computed from cheap top-level fields.
+--     No JSON walk on the hot path
+--     until we decide to learn.
+--   * Group-by-key: a batch with N spans collapses to K distinct keys via
+--     @HashMap.fromListWith (<>)@; we merge once per key, not per span.
+--   * Sample-after-threshold: past 'learnFullThreshold' samples on a key,
+--     subsequent observations only bump counters. Every
+--     'learnSampleEveryN' spans we re-walk to detect drift.
+module Pkg.SchemaLearning.Hot (
+  SchemaShardState (..),
+  emptySchemaShardState,
+  SchemaKey (..),
+  DecisionPolicy (..),
+  defaultPolicy,
+  ObservationInput (..),
+  observeSpans,
+  takeDirty,
+  pruneEvicted,
+  evictLRU,
+)
+where
+
+import Data.Aeson qualified as AE
+import Data.HashMap.Strict qualified as HM
+import Data.HashSet qualified as HS
+import Data.Time (UTCTime)
+import Data.Vector qualified as V
+import Models.Projects.Projects qualified as Projects
+import Pkg.SchemaLearning.Catalog (
+  CatalogEntry,
+  FieldCategoryEnum,
+  KeyKind,
+  Scope,
+  mergeFullWalk,
+  newEntry,
+ )
+import Pkg.SchemaLearning.Catalog qualified as Catalog
+import Relude
+
+
+-- | Compound key for the shard-local map. Project-qualified so a single
+-- shard can own state for multiple projects (the shard router keys on
+-- ProjectId, but we also defend against future re-routing).
+data SchemaKey = SchemaKey
+  { projectId :: !Projects.ProjectId
+  , keyHash :: !Text
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (Hashable, NFData)
+
+
+-- | Shard state. 'observeSpans' writes it from the shard fiber; the flush
+-- side touches it through 'takeDirty' (which clears 'dirtyKeys') and
+-- 'pruneEvicted' (which deletes entries and extends 'knownTemplates').
+-- All three go through 'atomicModifyIORef''.
+data SchemaShardState = SchemaShardState
+  { entries :: !(HashMap SchemaKey CatalogEntry)
+  -- ^ Every catalog entry currently held in memory by this shard.
+  , knownTemplates :: !(HS.HashSet Text)
+  -- ^ Template hashes handed back through 'pruneEvicted'.
+  , dirtyKeys :: !(HS.HashSet SchemaKey)
+  -- ^ Keys touched by 'observeSpans' since the last 'takeDirty'.
+  }
+  deriving stock (Generic)
+  deriving anyclass (NFData)
+
+
+-- | State with no entries, no known templates and nothing dirty.
+emptySchemaShardState :: SchemaShardState
+emptySchemaShardState = SchemaShardState HM.empty HS.empty HS.empty
+
+
+-- | Sampling policy.
+-- Defaults are conservative; tune via 'System.Config'.
+data DecisionPolicy = DecisionPolicy
+  { learnFullThreshold :: !Word64
+  -- ^ During the first N samples for a key, every span gets a full walk.
+  , learnSampleEveryN :: !Word64
+  -- ^ Past 'learnFullThreshold', re-walk every N spans to refresh examples
+  -- and detect drift. Should be at least 1: 'mergeGroup' uses it as a
+  -- divisor.
+  , maxKeysPerProject :: !Int
+  -- ^ Eviction threshold per project (LRU by 'lastSeen').
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+
+
+-- | Built-in policy: full walks for the first 200 samples of a key, a
+-- re-walk every 200 samples after that, and at most 5000 keys per project.
+defaultPolicy :: DecisionPolicy
+defaultPolicy = DecisionPolicy{learnFullThreshold = 200, learnSampleEveryN = 200, maxKeysPerProject = 5000}
+
+
+-- | Per-span keying tuple plus the leaf walk. The caller (in
+-- @BackgroundJobs.processEagerBatch@) computes this from the
+-- @OtelLogsAndSpans@ so this module stays free of telemetry / regex deps.
+data ObservationInput = ObservationInput
+  { keyKind :: !KeyKind
+  -- ^ Passed to 'newEntry' when the key is seen for the first time.
+  , keyHash :: !Text
+  -- ^ Grouping key within a batch; combined with the project id into a
+  -- 'SchemaKey'.
+  , scope :: !Scope
+  -- ^ Descriptive scope merged into the entry on a full walk.
+  , walk :: ![(Text, V.Vector (AE.Value, Maybe Text), FieldCategoryEnum)]
+  -- ^ See 'Catalog.mergeFullWalk' — leaf-walked fields with per-value
+  -- format hints.
+  , timestamp :: !UTCTime
+  -- ^ Feeds the entry's 'lastSeen'.
+  }
+  deriving stock (Generic)
+
+
+-- | Per-batch observation. Single 'atomicModifyIORef'' for the whole batch:
+--
+--   1. Group inputs by @keyHash@ within the batch (same key → one merge).
+--   2. For each unique key, decide full-walk vs bump based on the existing
+--      entry's 'sampleCount' and the policy.
+--   3. Update entries; mark touched keys dirty.
+observeSpans
+  :: IORef SchemaShardState
+  -> DecisionPolicy
+  -> Projects.ProjectId
+  -> V.Vector ObservationInput
+  -> IO ()
+observeSpans ref policy pid inputs
+  | V.null inputs = pass
+  | otherwise = do
+      -- Bucket with list cons (constant work per span) and build each
+      -- group's vector once; appending singleton vectors re-copies the
+      -- group on every span. Element order inside a group is unchanged
+      -- (most recently listed input first).
+      let groups :: HashMap Text (V.Vector ObservationInput)
+          groups =
+            V.fromList
+              <$> HM.fromListWith
+                (<>)
+                [ (i.keyHash, [i])
+                | i <- V.toList inputs
+                ]
+      atomicModifyIORef' ref \st ->
+        let st' = HM.foldlWithKey' (\acc kh grp -> mergeGroup policy pid kh grp acc) st groups
+         in (st', ())
+
+
+-- | Merge one (project, keyHash) group into the shard state.
+--
+-- The group gets a full walk when the key is new, when the stored entry is
+-- still below 'learnFullThreshold', or when the spans in this group cover a
+-- sampling point (see 'coversSamplePoint'). Otherwise only the counters
+-- move. Either way 'sampleCount' grows by the group size, 'lastSeen'
+-- becomes the latest timestamp in the group, and the key is marked dirty.
+-- An empty group leaves the state untouched.
+mergeGroup
+  :: DecisionPolicy
+  -> Projects.ProjectId
+  -> Text
+  -> V.Vector ObservationInput
+  -> SchemaShardState
+  -> SchemaShardState
+mergeGroup policy pid keyHash grp st = fromMaybe st do
+  rep <- grp V.!? 0
+  let key = SchemaKey pid keyHash
+      -- Latest timestamp in the group, so 'lastSeen' does not depend on
+      -- the order spans arrived in the batch.
+      now = V.foldl' (\latest i -> max latest i.timestamp) rep.timestamp grp
+      curEntry = HM.lookup key st.entries
+      n = fromIntegral (V.length grp) :: Word64
+      learnPhase = maybe True (\e -> e.sampleCount < policy.learnFullThreshold) curEntry
+      sampleNow = maybe True (\e -> coversSamplePoint policy.learnSampleEveryN e.sampleCount n) curEntry
+      fullWalk = learnPhase || sampleNow
+      base = fromMaybe (newEntry rep.keyKind rep.scope now) curEntry
+      -- Only the representative span's scope is merged; the walks of every
+      -- span in the group are combined.
+      walked =
+        if fullWalk
+          then mergeFullWalk rep.scope (combinedWalk grp) now base
+          else base
+      -- mergeFullWalk only +1's sampleCount; we represent N spans, so add
+      -- the remaining (N - walks-applied). bump-only path: add N.
+      addExtra = if fullWalk then n - 1 else n
+      newEntry' =
+        walked
+          { Catalog.sampleCount = walked.sampleCount + addExtra
+          , Catalog.lastSeen = now
+          , Catalog.dirty = True
+          }
+      entries' = HM.insert key newEntry' st.entries
+      dirty' = HS.insert key st.dirtyKeys
+  pure st{entries = entries', dirtyKeys = dirty'}
+
+
+-- | Does a group of @n@ spans, arriving when @seen@ samples are already
+-- recorded, include a sample whose zero-based index is a multiple of
+-- @every@?
+--
+-- Checking only @seen `mod` every == 0@ misses sampling points that fall
+-- inside a batch, because the counter advances by whole group sizes.
+-- @every == 0@ disables periodic sampling instead of dividing by zero.
+coversSamplePoint :: Word64 -> Word64 -> Word64 -> Bool
+coversSamplePoint every seen n
+  | every == 0 = False
+  | offset == 0 = True
+  | otherwise = n > every - offset
+  where
+    -- Distance already travelled past the previous sampling point; only
+    -- evaluated once @every@ is known to be non-zero.
+    offset = seen `mod` every
+
+
+-- | Combine the leaf-walks across all spans in a group: same path → values
+-- concatenated.
+-- Lets us merge the whole group with a single
+-- 'mergeFullWalk' call.
+combinedWalk
+  :: V.Vector ObservationInput
+  -> [(Text, V.Vector (AE.Value, Maybe Text), FieldCategoryEnum)]
+combinedWalk grp =
+  let allWalks = concatMap (.walk) (V.toList grp)
+      -- Collect each path's value vectors as a reversed list of chunks and
+      -- concatenate once at the end; appending vectors pairwise re-copies
+      -- the accumulated values for every span.
+      step acc (path, vs, cat) = HM.insertWith keepFirstCat path ([vs], cat) acc
+      -- The category recorded first for a path wins.
+      keepFirstCat (chunkNew, _) (chunksOld, catOld) = (chunkNew <> chunksOld, catOld)
+      grouped = foldl' step HM.empty allWalks
+   in [(p, V.concat (reverse chunks), cat) | (p, (chunks, cat)) <- HM.toList grouped]
+
+
+-- | Atomic dirty-set swap. Returns the dirty entries cloned for the flush
+-- writer; clears 'dirtyKeys' so subsequent observations re-mark fresh.
+--
+-- The stored entries also get their per-entry @dirty@ flag cleared, so the
+-- flag agrees with 'dirtyKeys' instead of staying @True@ forever after the
+-- first observation. Dirty keys with no stored entry are skipped.
+takeDirty :: IORef SchemaShardState -> IO (V.Vector (SchemaKey, CatalogEntry))
+takeDirty ref =
+  atomicModifyIORef' ref \st ->
+    let taken =
+          [ (k, e{Catalog.dirty = False})
+          | k <- HS.toList st.dirtyKeys
+          , Just e <- [HM.lookup k st.entries]
+          ]
+        entries' = foldl' (\m (k, e) -> HM.insert k e m) st.entries taken
+        st' = st{entries = entries', dirtyKeys = HS.empty}
+     in (st', V.fromList taken)
+
+
+-- | Drop entries the flush writer evicted (e.g. after vacuum), and merge in
+-- any newly-acknowledged template hashes so subsequent flushes can short-
+-- circuit the template upsert.
+pruneEvicted
+  :: IORef SchemaShardState
+  -> HS.HashSet SchemaKey
+  -- ^ keys safe to drop from in-memory state
+  -> HS.HashSet Text
+  -- ^ template hashes now persisted
+  -> IO ()
+pruneEvicted ref droppedKeys newTemplates =
+  atomicModifyIORef' ref \st ->
+    let entries' = HS.foldr HM.delete st.entries droppedKeys
+        known' = st.knownTemplates <> newTemplates
+     in (st{entries = entries', knownTemplates = known'}, ())
+
+
+-- | Bound the per-project key set by evicting LRU-by-lastSeen.
+-- Pure on the shard state; called from the flush worker.
+evictLRU :: DecisionPolicy -> SchemaShardState -> SchemaShardState
+evictLRU policy st = st{entries = HS.foldr HM.delete st.entries victims}
+  where
+    -- Entries bucketed by owning project.
+    byProject :: HashMap Projects.ProjectId [(SchemaKey, CatalogEntry)]
+    byProject =
+      HM.fromListWith
+        (<>)
+        [(k.projectId, [(k, e)]) | (k, e) <- HM.toList st.entries]
+
+    victims :: HS.HashSet SchemaKey
+    victims = HS.fromList $ concatMap pickVictims (HM.elems byProject)
+
+    -- Oldest-first victims for one project. Keys still listed in
+    -- 'dirtyKeys' carry changes that have not been taken for flushing yet,
+    -- so they are never chosen: deleting them would silently drop those
+    -- observations. A project may therefore stay above the limit until its
+    -- dirty keys have been taken.
+    pickVictims :: [(SchemaKey, CatalogEntry)] -> [SchemaKey]
+    pickVictims xs
+      | excess <= 0 = []
+      | otherwise = fst <$> take excess (sortOn (\(_, e) -> e.lastSeen) flushed)
+      where
+        excess = length xs - policy.maxKeysPerProject
+        flushed = filter (\(k, _) -> not (HS.member k st.dirtyKeys)) xs
diff --git a/src/Pkg/SchemaLearning/OpenApi.hs b/src/Pkg/SchemaLearning/OpenApi.hs
new file mode 100644
index 000000000..aeb33f6a2
--- /dev/null
+++ b/src/Pkg/SchemaLearning/OpenApi.hs
@@ -0,0 +1,69 @@
+{-# LANGUAGE OverloadedRecordDot #-}
+
+-- | On-demand OpenAPI emitter from the schema-learning catalog. Pure: takes
+-- a vector of catalog entries (the HTTP-keyed subset) and returns a minimal
+-- 'OpenApi' document. Not wired to a route in this PR — the entry point is
+-- 'fromCatalog' which the per-host endpoint can call once a UI is added.
+module Pkg.SchemaLearning.OpenApi (
+  fromCatalog,
+)
+where
+
+import Data.HashMap.Strict qualified as HM
+import Data.HashMap.Strict.InsOrd qualified as IOH
+import Data.OpenApi (OpenApi)
+import Data.OpenApi qualified as OA
+import Data.Text qualified as T
+import Data.Vector qualified as V
+import Pkg.SchemaLearning.Catalog (CatalogEntry, KeyKind (..))
+import Pkg.SchemaLearning.Catalog qualified as Catalog
+import Relude
+
+
+-- | Emit a minimal OpenAPI 3.0 document describing the HTTP endpoints in a
+-- vector of 'CatalogEntry's. Non-HTTP keys are skipped; the document
+-- includes only paths + methods for now (request/response schemas can be
+-- layered on by walking 'FieldStruct's in a follow-up).
+--
+-- The intent is one document per @(project, host)@ — callers filter the
+-- catalog vector before invoking.
+fromCatalog :: V.Vector CatalogEntry -> OpenApi
+fromCatalog entries =
+  mempty
+    { OA._openApiInfo =
+        mempty
+          { OA._infoTitle = "Discovered API"
+          , OA._infoVersion = "1.0.0"
+          , OA._infoDescription = Just "Generated from monoscope schema-learning catalog."
+          }
+    , OA._openApiPaths = IOH.fromList $ HM.toList byPath
+    }
+  where
+    httpEntries = V.filter (\e -> e.template.keyKind == HttpEndpoint) entries
+
+    -- One 'OA.PathItem' per URL path, accumulating an operation per method.
+    byPath :: HashMap FilePath OA.PathItem
+    byPath = V.foldl' addEntry HM.empty httpEntries
+
+    -- Entries without a path fall back to "/", entries without a method to
+    -- "GET". A later entry with the same path and method replaces the
+    -- earlier operation.
+    addEntry :: HashMap FilePath OA.PathItem -> CatalogEntry -> HashMap FilePath OA.PathItem
+    addEntry acc e =
+      let p = toString (fromMaybe "/" e.scope.urlPath)
+          item = HM.lookupDefault mempty p acc
+          item' = applyMethod (fromMaybe "GET" e.scope.method) (mkOperation e) item
+       in HM.insert p item' acc
+
+    mkOperation :: CatalogEntry -> OA.Operation
+    mkOperation e =
+      mempty
+        { OA._operationDescription =
+            Just $ "Auto-discovered. Sample count: " <> T.pack (show e.sampleCount)
+        }
+
+    -- Attach the operation under the slot matching the (case-insensitive)
+    -- method name. Every method 'OA.PathItem' has a slot for is handled,
+    -- including TRACE; any other method leaves the item unchanged.
+    applyMethod :: Text -> OA.Operation -> OA.PathItem -> OA.PathItem
+    applyMethod m op item = case T.toUpper m of
+      "GET" -> item{OA._pathItemGet = Just op}
+      "POST" -> item{OA._pathItemPost = Just op}
+      "PUT" -> item{OA._pathItemPut = Just op}
+      "DELETE" -> item{OA._pathItemDelete = Just op}
+      "PATCH" -> item{OA._pathItemPatch = Just op}
+      "HEAD" -> item{OA._pathItemHead = Just op}
+      "OPTIONS" -> item{OA._pathItemOptions = Just op}
+      "TRACE" -> item{OA._pathItemTrace = Just op}
+      _ -> item
diff --git a/src/Pkg/SchemaLearning/Worker.hs b/src/Pkg/SchemaLearning/Worker.hs
new file mode 100644
index 000000000..e31651bab
--- /dev/null
+++ b/src/Pkg/SchemaLearning/Worker.hs
@@ -0,0 +1,185 @@
+{-# LANGUAGE OverloadedRecordDot #-}
+
+-- | Periodic flush worker for the schema-learning catalog.
+--
+-- Once per shard, every 'flushIntervalSecs', the worker:
+--
+--   1. Atomically takes the dirty subset (a single 'atomicModifyIORef'').
+--   2. Splits dirty entries into 'TemplateRow's (instance-wide, dedup'd by
+--      'Catalog.templateHash') and 'CatalogRow's (per-project pointers).
+--   3. Upserts templates first, then catalog rows, then re-derives the
+--      per-project summary doc.
+--   4. Hands the newly-acknowledged template hashes back to the shard so
+--      subsequent flushes can short-circuit unchanged-template upserts.
+--      NOTE(review): 'flushDirty' records the hashes through
+--      'Hot.pruneEvicted', but nothing in this module consults
+--      @knownTemplates@ before upserting — confirm whether the
+--      short-circuit is implemented elsewhere or still pending.
+--
+-- Anomaly diff/produce is not yet wired here — the legacy
+-- @new_anomaly_proc@ trigger is being deprecated and the replacement
+-- belongs in a follow-up (see TODO in 'flushDirty').
+module Pkg.SchemaLearning.Worker (
+  FlushResult (..),
+  flushDirty,
+  runSchemaFlusher,
+)
+where
+
+import Data.HashMap.Strict qualified as HM
+import Data.HashSet qualified as HS
+import Data.Time (UTCTime, getCurrentTime)
+import Data.Vector qualified as V
+import Control.Concurrent (threadDelay)
+import Effectful (Eff)
+import Models.Apis.SchemaCatalog qualified as SC
+import Models.Projects.Projects qualified as Projects
+import Pkg.DeriveUtils (DB)
+import Pkg.SchemaLearning.Catalog (CatalogEntry, SummaryDoc (..), TopK)
+import Pkg.SchemaLearning.Catalog qualified as Catalog
+import Pkg.SchemaLearning.Hot (SchemaKey, SchemaShardState)
+import Pkg.SchemaLearning.Hot qualified as Hot
+import Relude
+
+
+-- | Summary stats from one flush pass — useful for telemetry / log lines.
+data FlushResult = FlushResult
+  { templatesWritten :: !Int
+  -- ^ Number of template rows sent to the upsert, after de-duplication by
+  -- template hash.
+  , catalogRowsWritten :: !Int
+  -- ^ Number of catalog rows sent to the upsert (one per dirty entry).
+  , summariesUpdated :: !Int
+  -- ^ Number of projects whose summary doc was regenerated.
+  , dirtyKeys :: !Int
+  -- ^ Number of dirty entries taken from the shard in this pass.
+  }
+  deriving stock (Eq, Generic, Show)
+  deriving anyclass (NFData)
+
+
+-- | One flush pass over a single shard. Besides the database effects, it
+-- performs three 'liftIO' steps: 'Hot.takeDirty', 'getCurrentTime' and
+-- 'Hot.pruneEvicted'.
+flushDirty
+  :: DB es
+  => IORef SchemaShardState
+  -> Eff es FlushResult
+flushDirty ref = do
+  -- NOTE(review): 'Hot.takeDirty' clears the shard's dirty set before any
+  -- row is written. If one of the upserts below throws, nothing visible
+  -- here re-marks these keys, so their changes would wait until the same
+  -- key is observed again — confirm that is acceptable.
+  dirty <- liftIO $ Hot.takeDirty ref
+  if V.null dirty
+    then pure FlushResult{templatesWritten = 0, catalogRowsWritten = 0, summariesUpdated = 0, dirtyKeys = 0}
+    else do
+      now <- liftIO getCurrentTime
+      let templateRows = dedupTemplates $ V.map (templateRowOf now . snd) dirty
+          catalogRows = V.map (uncurry catalogRowOf) dirty
+          touchedProjects = HS.fromList [k.projectId | (k, _) <- V.toList dirty]
+      -- Templates are written before the catalog rows that reference them
+      -- by hash.
+      SC.upsertTemplates templateRows
+      SC.upsertCatalogRows catalogRows
+      summariesN <- regenerateSummaries touchedProjects
+      let newHashes = HS.fromList $ V.toList $ V.map (.templateHash) templateRows
+      -- No keys are dropped here (empty set); the call only records the
+      -- template hashes written in this pass.
+      liftIO $ Hot.pruneEvicted ref HS.empty newHashes
+      -- TODO(schema-anomalies): diff dirty entries vs prior catalog rows
+      -- (stale @apis.shapes@/@apis.fields@ triggers no longer fire). Emit
+      -- per-(project, key_hash) endpoint/shape/field/format anomalies into
+      -- @apis.anomalies@ + @background_jobs@ so the legacy notification
+      -- pipeline keeps working.
+      pure
+        FlushResult
+          { templatesWritten = V.length templateRows
+          , catalogRowsWritten = V.length catalogRows
+          , summariesUpdated = summariesN
+          , dirtyKeys = V.length dirty
+          }
+
+
+-- | Structure-only row for an entry's template. @lastSeenAt@ is the flush
+-- time passed in, not the entry's own 'lastSeen'.
+templateRowOf :: UTCTime -> CatalogEntry -> SC.TemplateRow
+templateRowOf now e =
+  SC.TemplateRow
+    { templateHash = Catalog.templateHash e.template
+    , keyKind = e.template.keyKind
+    , fields = e.template.fields
+    , lastSeenAt = now
+    }
+
+
+-- | Per-project catalog row for one dirty entry: project id and key hash
+-- come from the shard key, everything else from the entry. The row points
+-- at its template through the same hash 'templateRowOf' computes.
+catalogRowOf :: SchemaKey -> CatalogEntry -> SC.CatalogRow
+catalogRowOf k e =
+  SC.CatalogRow
+    { projectId = k.projectId
+    , keyKind = e.template.keyKind
+    , keyHash = k.keyHash
+    , templateHash = Catalog.templateHash e.template
+    , scope = e.scope
+    , valuesDelta = e.valuesDelta
+    , counts = e.counts
+    , sampleCount = e.sampleCount
+    , firstSeen = e.firstSeen
+    , lastSeen = e.lastSeen
+    }
+
+
+-- | Keep one row per template hash; when a hash repeats, the row that
+-- comes later in the vector replaces the earlier one. Output order follows
+-- the hash map, not the input.
+dedupTemplates :: V.Vector SC.TemplateRow -> V.Vector SC.TemplateRow
+dedupTemplates = V.fromList . HM.elems . V.foldl' step HM.empty
+  where
+    step acc r = HM.insert r.templateHash r acc
+
+
+-- | Re-derive @apis.schema_summary.doc@ for each project that had at least
+-- one dirty key this pass. Reads back the full per-project catalog so the
+-- summary reflects all known structure, not just what changed in the batch.
+-- Returns the number of projects processed.
+regenerateSummaries
+  :: DB es
+  => HS.HashSet Projects.ProjectId
+  -> Eff es Int
+regenerateSummaries projects = do
+  let pids = HS.toList projects
+  forM_ pids \pid -> do
+    entries <- SC.getByProject pid
+    let doc = summariseEntries entries
+    SC.upsertSummary pid doc
+  pure (length pids)
+
+
+-- | Project-scoped roll-up of catalog entries into a 'SummaryDoc'.
+-- Aggregates field structures across keys (a field appearing in multiple
+-- keys with different types unions them) and reduces top-K counts.
+summariseEntries :: V.Vector CatalogEntry -> SummaryDoc
+summariseEntries entries =
+  let fieldsAcc =
+        V.foldl' mergeFields HM.empty
+          $ V.map ((.template.fields)) entries
+      -- Distinct service names; entries without a service contribute
+      -- nothing.
+      svcs =
+        V.fromList
+          $ HS.toList
+          $ HS.fromList
+          $ catMaybes [e.scope.service | e <- V.toList entries]
+      topVals :: HashMap Text TopK
+      topVals = V.foldl' mergeCounts HM.empty (V.map (.counts) entries)
+   in SummaryDoc{fields = fieldsAcc, services = svcs, topValuesByField = topVals}
+  where
+    mergeFields acc fs = HM.unionWith mergeStruct acc fs
+    -- Types and formats are unioned and @isEnum@ is OR-ed; every other
+    -- attribute of the structure is taken from the left operand.
+    mergeStruct a b =
+      a
+        { Catalog.types = a.types <> b.types
+        , Catalog.formats = a.formats <> b.formats
+        , Catalog.isEnum = a.isEnum || b.isEnum
+        }
+    mergeCounts :: HashMap Text TopK -> HashMap Text TopK -> HashMap Text TopK
+    mergeCounts = HM.unionWith addTopK
+    -- Per-value counts are added. NOTE(review): @distinct@ is summed as
+    -- well, so a value seen under two keys is presumably counted twice in
+    -- the roll-up, and the merged @top@ map is not trimmed here — confirm
+    -- both are intended.
+    addTopK a b =
+      Catalog.TopK
+        { Catalog.distinct = a.distinct + b.distinct
+        , Catalog.top = HM.unionWith (+) a.top b.top
+        }
+
+
+-- ---------------------------------------------------------------------------
+-- Long-running fiber.
+
+-- | Sleep + flush.
+-- Iterates each shard sequentially per tick. Caller injects the per-tick
+-- effect runner so this module stays free of @AuthContext@ / logging
+-- plumbing.
+--
+-- The interval is clamped to at least one second, so a zero or negative
+-- setting cannot turn the loop into a spin that flushes without pause.
+-- An exception escaping the runner propagates out of 'forever' and ends
+-- the loop; the supplied runner is expected to contain failures it wants
+-- to survive.
+runSchemaFlusher
+  :: Int
+  -- ^ flushIntervalSecs
+  -> [IORef SchemaShardState]
+  -- ^ one ref per shard
+  -> (IORef SchemaShardState -> IO FlushResult)
+  -- ^ caller-supplied per-shard runner — typically wraps 'flushDirty' in an
+  -- effect runner (Hasql + logging + tracing).
+  -> IO Void
+runSchemaFlusher intervalSecs refs flushOne = forever do
+  threadDelay (max 1 intervalSecs * 1_000_000)
+  forM_ refs \ref -> void $ flushOne ref
diff --git a/src/ProcessMessage.hs b/src/ProcessMessage.hs
index ab8c0f298..35cb74f16 100644
--- a/src/ProcessMessage.hs
+++ b/src/ProcessMessage.hs
@@ -4,12 +4,12 @@
 module ProcessMessage (
   processMessages,
   processSpanToEntities,
+  extractObservation,
   RequestMessage (..),
   valueToFormatStr,
   valueToFields,
   redactJSON,
   replaceNullChars,
-  fieldsToFieldDTO,
   sortVector,
   ensureUrlParams,
   dedupFields,
@@ -41,9 +41,7 @@ import Data.Effectful.UUID qualified as UUID
 import Data.HashMap.Strict qualified as HM
 import Data.HashTable.Class qualified as HTC
 import Data.HashTable.ST.Cuckoo qualified as HT
-import Data.Scientific qualified as Scientific
 import Data.Text qualified as T
-import Data.Text.Display (display)
 import Data.Time (addUTCTime, zonedTimeToUTC)
 import Data.Time.LocalTime (ZonedTime)
 import Data.UUID qualified as UUID
@@ -57,12 +55,14 @@ import Effectful.Labeled (Labeled (..))
 import Effectful.Log (Log)
 import Effectful.Reader.Static qualified as Eff
 import Models.Apis.Endpoints qualified as Endpoints
-import Models.Apis.Fields qualified as Fields
 import Models.Apis.LogQueries qualified as LogQueries
+import Pkg.SchemaLearning.Catalog qualified as Fields
 import Models.Projects.Projects qualified as Projects
 import Models.Telemetry.Telemetry (Context (trace_state), OtelLogsAndSpans (..),
generateSummary) import Models.Telemetry.Telemetry qualified as Telemetry import Pkg.DeriveUtils (AesonText (..), UUIDId (..), unAesonTextMaybe) +import Pkg.SchemaLearning.Catalog qualified as Catalog +import Pkg.SchemaLearning.Hot qualified as SchemaHot import Relude hiding (ask) import Relude.Unsafe qualified as Unsafe import System.Config (AuthContext (..), EnvConfig (..)) @@ -196,12 +196,16 @@ stripNulBytes :: Text -> Text stripNulBytes = T.replace "\NUL" "" --- | Process a single span to extract entities for anomaly detection. --- Returns @(endpoint, shape, fields, formats, hashes, normalizedPath)@. --- The normalized path (@Just@ for HTTP spans) is stamped back onto the span's +-- | Process a single span to extract entities for hash-stamping. +-- Returns @(endpoint, hashes, normalizedPath)@. The normalized path +-- (@Just@ for HTTP spans) is stamped back onto the span's -- @attributes.http.route@ and @attributes.url.path@ by the caller so that -- explorer queries match the template stored in @apis.endpoints@. -processSpanToEntities :: HM.HashMap (Text, Text) [([Text], Text)] -> Projects.ProjectCache -> Telemetry.OtelLogsAndSpans -> UUID.UUID -> (Maybe Endpoints.Endpoint, Maybe Fields.Shape, V.Vector Fields.Field, V.Vector Fields.Format, V.Vector Text, Maybe Text) +-- +-- Schema learning (fields/formats/shapes) now flows through +-- 'extractObservation' + the schema-learning catalog; this function only +-- handles endpoint discovery + hash stamping. 
+processSpanToEntities :: HM.HashMap (Text, Text) [([Text], Text)] -> Projects.ProjectCache -> Telemetry.OtelLogsAndSpans -> UUID.UUID -> (Maybe Endpoints.Endpoint, V.Vector Text, Maybe Text) processSpanToEntities canonicalTemplates pjc otelSpan dumpId = let !projectId = UUIDId $ Unsafe.fromJust $ UUID.fromText otelSpan.project_id @@ -249,77 +253,18 @@ processSpanToEntities canonicalTemplates pjc otelSpan dumpId = -- URL normalization and dynamic path parameter extraction !urlPath' = LogQueries.normalizeUrlPath sdkType statusCode method routePath - !(!urlPathDyn, !pathParamsDyn, !hasDyn) = ensureUrlParams urlPath' - !(!urlPath, !pathParams) = + !(!urlPathDyn, !_pathParamsDyn, !hasDyn) = ensureUrlParams urlPath' + !urlPath = if hasDyn - then (urlPathDyn, pathParamsDyn) - else - ( fromMaybe urlPath' $ matchCanonicalPath canonicalTemplates method host urlPath' - , fromMaybe AE.emptyObject $ attrValue ^? key "http" . key "request" . key "path_params" - ) - - -- Extract query params and headers from attributes - !queryParams = fromMaybe AE.emptyObject $ attrValue ^? key "http" . key "request" . 
key "query_params" - !requestHeaders = fromMaybe AE.emptyObject $ extractHeaders "http.request.headers" attrValue - !responseHeaders = fromMaybe AE.emptyObject $ extractHeaders "http.response.headers" attrValue - - -- Generate endpoint hash - this uniquely identifies an API endpoint - -- Hash components: projectId + host + method + urlPath - -- This hash is used to detect new endpoints (endpoint anomalies) - !endpointHash = toXXHash $ projectId.toText <> host <> method <> urlPath + then urlPathDyn + else fromMaybe urlPath' $ matchCanonicalPath canonicalTemplates method host urlPath' - -- Set up redaction to protect sensitive data - !redactFieldsList = pjc.redactFieldslist V.++ V.fromList [".set-cookie", ".password"] - !redacted = redactJSON redactFieldsList - - -- Extract request/response bodies from span body - !bodyValue = fromMaybe AE.Null (unAesonTextMaybe otelSpan.body) - !requestBody = redacted $ fromMaybe AE.Null $ bodyValue ^? key "request_body" - !responseBody = redacted $ fromMaybe AE.Null $ bodyValue ^? 
key "response_body" - - -- Extract and process all field categories - !pathParamFields = valueToFields $ redacted pathParams - !queryParamFields = valueToFields $ redacted queryParams - !reqHeaderFields = valueToFields $ redacted requestHeaders - !respHeaderFields = valueToFields $ redacted responseHeaders - !reqBodyFields = valueToFields requestBody - !respBodyFields = valueToFields responseBody - - -- Extract key paths for shape hash calculation - !queryParamsKP = V.map fst queryParamFields - !requestHeadersKP = V.map fst reqHeaderFields - !responseHeadersKP = V.map fst respHeaderFields - !requestBodyKP = V.map fst reqBodyFields - !responseBodyKP = V.map fst respBodyFields - - -- Calculate shape hash - identifies unique request/response structures - -- A shape represents the "schema" of an API call for a specific status code - -- Hash components: endpointHash + statusCode + sorted field paths - -- New shapes trigger shape anomalies indicating API structure changes - !representativeKP = sortVector $ queryParamsKP <> responseHeadersKP <> requestBodyKP <> responseBodyKP - !combinedKeyPathStr = T.concat $ V.toList representativeKP - !shapeHash = endpointHash <> show statusCode <> toXXHash combinedKeyPathStr - - -- Convert all field categories to DTOs - !pathParamsFieldsDTO = V.map (fieldsToFieldDTO Fields.FCPathParam projectId endpointHash) pathParamFields - !queryParamsFieldsDTO = V.map (fieldsToFieldDTO Fields.FCQueryParam projectId endpointHash) queryParamFields - !reqHeadersFieldsDTO = V.map (fieldsToFieldDTO Fields.FCRequestHeader projectId endpointHash) reqHeaderFields - !respHeadersFieldsDTO = V.map (fieldsToFieldDTO Fields.FCResponseHeader projectId endpointHash) respHeaderFields - !reqBodyFieldsDTO = V.map (fieldsToFieldDTO Fields.FCRequestBody projectId endpointHash) reqBodyFields - !respBodyFieldsDTO = V.map (fieldsToFieldDTO Fields.FCResponseBody projectId endpointHash) respBodyFields - !fieldsDTO = V.concat [pathParamsFieldsDTO, queryParamsFieldsDTO, 
reqHeadersFieldsDTO, respHeadersFieldsDTO, reqBodyFieldsDTO, respBodyFieldsDTO] - - !(!fields, !formats) = V.unzip fieldsDTO - !fieldHashes = sortVector $ V.map (.hash) fields + -- Generate endpoint hash - this uniquely identifies an API endpoint. + !endpointHash = toXXHash $ projectId.toText <> host <> method <> urlPath -- Determine if request is outgoing based on span kind !outgoing = otelSpan.kind == Just "client" - -- Build endpoint if not in cache - -- Only create endpoint entity if: - -- 1. Not already in project cache (prevents duplicate anomalies) - -- 2. Not a 404 response (ignore missing routes) - -- When inserted, will trigger endpoints_created_anomaly in DB !endpoint = if endpointHash `elem` pjc.endpointHashes || statusCode == 404 then Nothing @@ -331,7 +276,7 @@ processSpanToEntities canonicalTemplates pjc otelSpan dumpId = , id = UUIDId dumpId , projectId = projectId , urlPath = urlPath - , urlParams = AE.emptyObject -- TODO: Should this use pathParams? + , urlParams = AE.emptyObject , method = method , host = host , hash = endpointHash @@ -341,50 +286,137 @@ processSpanToEntities canonicalTemplates pjc otelSpan dumpId = , environment = environment } - -- Build shape if not in cache - -- Shape represents the structure of request/response for a specific endpoint+status - -- Only create if not cached and not 404 - -- When inserted, will trigger shapes_created_anomaly in DB - !shape = - if shapeHash `elem` pjc.shapeHashes || statusCode == 404 - then Nothing - else - Just - $ Fields.Shape - { id = UUIDId dumpId - , createdAt = otelSpan.timestamp - , updatedAt = otelSpan.timestamp - , approvedOn = Nothing - , projectId = projectId - , endpointHash = endpointHash - , queryParamsKeypaths = queryParamsKP - , requestBodyKeypaths = requestBodyKP - , responseBodyKeypaths = responseBodyKP - , requestHeadersKeypaths = requestHeadersKP - , responseHeadersKeypaths = responseHeadersKP - , fieldHashes = fieldHashes - , hash = shapeHash - , statusCode = statusCode - , 
responseDescription = "" - , requestDescription = "" - } - - !fields' = if statusCode == 404 then V.empty else fields - !formats' = if statusCode == 404 then V.empty else formats - - -- Collect hashes to update span with - !hashes = - V.cons endpointHash (if isJust shape then V.cons shapeHash fieldHashes else fieldHashes) + -- Span gets one stamped hash (the endpoint hash). Field/shape hashes + -- used to be stamped here for anomaly cascades; the schema-learning + -- catalog now owns that lookup, so a single hash per span is enough. + !hashes = V.singleton endpointHash -- Normalized path written into both attributes.http.route and attributes.url.path -- so new-endpoint notification links and the catalog UI can filter by the -- same template stored in apis.endpoints. !normalizedPathForSpan = if isHttpSpan then Just urlPath else Nothing - in (endpoint, shape, fields', formats', hashes, normalizedPathForSpan) + in (endpoint, hashes, normalizedPathForSpan) + + +-- | Build a 'SchemaHot.ObservationInput' for the schema-learning catalog. +-- Covers HTTP and non-HTTP spans uniformly; the keying tuple distinguishes +-- them. Walks @attributes ∪ resource ∪ body ∪ events@ with redaction +-- applied so PII never enters the catalog. +extractObservation + :: Projects.ProjectCache + -> OtelLogsAndSpans + -> SchemaHot.ObservationInput +extractObservation pjc otelSpan = + let !projectIdText = otelSpan.project_id + !attrMap = maybeToMonoid (unAesonTextMaybe otelSpan.attributes) + !resMap = maybeToMonoid (unAesonTextMaybe otelSpan.resource) + !attrValue = AE.Object $ AEKM.fromMapText attrMap + !isHttpSpan = isJust $ attrValue ^? key "http" . key "request" . key "method" . _String + !redactList = pjc.redactFieldslist V.++ V.fromList [".set-cookie", ".password"] + !redacted = redactJSON redactList + + -- HTTP keying parity with processSpanToEntities. + !method = T.toUpper $ fromMaybe "GET" $ attrValue ^? key "http" . key "request" . key "method" . 
_String + !routePath = fromMaybe "/" $ asum [attrValue ^? key "http" . key "route" . _String, attrValue ^? key "url" . key "path" . _String] + !host = + fromMaybe "" + $ asum + [ attrValue ^? key "net" . key "host" . key "name" . _String + , attrValue ^? key "server" . key "address" . _String + , attrValue ^? key "http" . key "host" . _String + ] + !statusCode = + fromMaybe 0 + $ asum + [ attrValue ^? key "http" . key "response" . key "status_code" . _String >>= readMaybe @Int . toString + , truncate <$> attrValue ^? key "http" . key "response" . key "status_code" . _Number + ] + + !service = Telemetry.atMapText "service.name" (Just resMap) <|> Telemetry.atMapText "service.name" (Just attrMap) + !spanName = otelSpan.name + !spanKind = otelSpan.kind + + !(keyKind, keyHash, scope) = + if isHttpSpan + then + let !endpointHash = toXXHash $ projectIdText <> host <> method <> routePath + in ( Catalog.HttpEndpoint + , endpointHash + , Catalog.Scope + { Catalog.service = service + , Catalog.spanName = spanName + , Catalog.kind = spanKind + , Catalog.host = if T.null host then Nothing else Just host + , Catalog.method = Just method + , Catalog.urlPath = Just routePath + , Catalog.statusCodes = if statusCode > 0 then V.singleton statusCode else V.empty + } + ) + else + let !ident = + toXXHash + $ projectIdText + <> "|" + <> fromMaybe "" service + <> "|" + <> fromMaybe "" spanName + <> "|" + <> fromMaybe "" spanKind + in ( Catalog.SpanIdentity + , ident + , Catalog.Scope + { Catalog.service = service + , Catalog.spanName = spanName + , Catalog.kind = spanKind + , Catalog.host = Nothing + , Catalog.method = Nothing + , Catalog.urlPath = Nothing + , Catalog.statusCodes = V.empty + } + ) + + -- Walk every section of the span. For HTTP we keep the legacy + -- categorisation (header/body/etc.); for non-HTTP we tag attributes + -- and resource bag fields with FCAttribute / FCResource. 
+ !bodyValue = fromMaybe AE.Null (unAesonTextMaybe otelSpan.body) + !eventsValue = fromMaybe AE.Null (unAesonTextMaybe otelSpan.events) + + !walk + | isHttpSpan = + let !pathParams = fromMaybe AE.emptyObject $ attrValue ^? key "http" . key "request" . key "path_params" + !queryParams = fromMaybe AE.emptyObject $ attrValue ^? key "http" . key "request" . key "query_params" + !reqHeaders = fromMaybe AE.emptyObject $ extractHeadersV "http.request.headers" attrValue + !respHeaders = fromMaybe AE.emptyObject $ extractHeadersV "http.response.headers" attrValue + !reqBody = redacted $ fromMaybe AE.Null $ bodyValue ^? key "request_body" + !respBody = redacted $ fromMaybe AE.Null $ bodyValue ^? key "response_body" + in concat + [ tagWalk Fields.FCPathParam (valueToFields $ redacted pathParams) + , tagWalk Fields.FCQueryParam (valueToFields $ redacted queryParams) + , tagWalk Fields.FCRequestHeader (valueToFields $ redacted reqHeaders) + , tagWalk Fields.FCResponseHeader (valueToFields $ redacted respHeaders) + , tagWalk Fields.FCRequestBody (valueToFields reqBody) + , tagWalk Fields.FCResponseBody (valueToFields respBody) + ] + | otherwise = + concat + [ tagWalk Fields.FCAttribute (valueToFields $ redacted attrValue) + , tagWalk Fields.FCResource (valueToFields $ redacted (AE.Object $ AEKM.fromMapText resMap)) + , tagWalk Fields.FCRequestBody (valueToFields $ redacted bodyValue) + , tagWalk Fields.FCEvent (valueToFields $ redacted eventsValue) + ] + in SchemaHot.ObservationInput + { keyKind = keyKind + , keyHash = keyHash + , scope = scope + , walk = walk + , timestamp = otelSpan.timestamp + } where - -- Helper function to extract headers from nested attribute structure - extractHeaders :: Text -> AE.Value -> Maybe AE.Value - extractHeaders prefix obj = case obj of + -- Reuse-friendly local of the inline header extractor in + -- processSpanToEntities. Kept private to avoid a cycle with the + -- where-clause version above. 
+ extractHeadersV :: Text -> AE.Value -> Maybe AE.Value + extractHeadersV prefix obj = case obj of AE.Object keyMap -> let !prefixDot = prefix <> "." !prefixDotLen = T.length prefixDot @@ -392,6 +424,22 @@ processSpanToEntities canonicalTemplates pjc otelSpan dumpId = in if null headerPairs then Nothing else Just $ AE.Object $ AEKM.fromList headerPairs _ -> Nothing + -- Pair each value with its format hint and tag the whole walk with a + -- field category. Format hint computation here is what costs us the + -- regex sweep — only invoked on the slow-path full walk. + tagWalk + :: Fields.FieldCategoryEnum + -> V.Vector (Text, V.Vector AE.Value) + -> [(Text, V.Vector (AE.Value, Maybe Text), Fields.FieldCategoryEnum)] + tagWalk cat fields0 = + [ (path, V.map (\v -> (v, formatHint v)) vs, cat) + | (path, vs) <- V.toList fields0 + ] + + formatHint :: AE.Value -> Maybe Text + formatHint (AE.String s) = valueToFormatStr s + formatHint _ = Nothing + convertRequestMessageToSpan :: RequestMessage -> Int64 -> (UUID.UUID, Text) -> Telemetry.OtelLogsAndSpans convertRequestMessageToSpan rm msgSize (spanId, trId) = @@ -700,26 +748,6 @@ removeBlacklistedFields = V.map \(k, val) -> else (k, val) --- >>> valueToFormat (AET.String "22") --- "integer" --- --- >>> valueToFormat (AET.String "22.33") --- "float" --- --- >>> valueToFormat (AET.String "22/02/2022") --- "mm/dd/yyyy" --- -valueToFormat :: AE.Value -> Text -valueToFormat (AET.String val) = case valueToFormatStr val of - Just fmt -> T.drop 1 $ T.dropEnd 1 fmt -- Remove the curly braces - Nothing -> "text" -valueToFormat (AET.Number val) = valueToFormatNum val -valueToFormat (AET.Bool _) = "bool" -valueToFormat AET.Null = "null" -valueToFormat (AET.Object _) = "object" -valueToFormat (AET.Array _) = "array" - - -- | Common format patterns used by both replaceAllFormats and valueToFormatStr -- The order matters: more specific patterns should come before more general ones commonFormatPatterns :: [(RE, Text)] @@ -1011,83 +1039,6 @@ 
tokenizeUrlPath = V.fromList . map normalize . T.splitOn "/" normalize seg = fromMaybe (bool seg "<*>" $ isUrlIdLike seg) (valueToFormatStr seg) --- >>> valueToFormatNum 22.3 --- "float" --- >>> valueToFormatNum 22 --- "integer" -valueToFormatNum :: Scientific.Scientific -> Text -valueToFormatNum val - | Scientific.isFloating val = "float" - | Scientific.isInteger val = "integer" - | otherwise = "unknown" - - --- fieldsToFieldDTO processes a field from monoscope clients into a field and format record, --- which can then be converted into separate sql insert queries. -fieldsToFieldDTO :: Fields.FieldCategoryEnum -> Projects.ProjectId -> Text -> (Text, V.Vector AE.Value) -> (Fields.Field, Fields.Format) -fieldsToFieldDTO fieldCategory projectID endpointHash (keyPath, val) = - ( Fields.Field - { createdAt = Unsafe.read "2019-08-31 05:14:37.537084021 UTC" - , updatedAt = Unsafe.read "2019-08-31 05:14:37.537084021 UTC" - , id = Fields.FieldId UUID.nil - , endpointHash = endpointHash - , projectId = projectID - , key = snd $ T.breakOnEnd "." keyPath - , -- FIXME: We're discarding the field values of the others, if theer was more than 1 value. - -- FIXME: We should instead take all the fields into consideration - -- FIXME: when generating the field types and formats - fieldType = fieldType - , fieldTypeOverride = Nothing - , format = format - , formatOverride = Nothing - , description = "" - , keyPath = keyPath - , fieldCategory = fieldCategory - , hash = fieldHash - , isEnum = False - , isRequired = False - } - , Fields.Format - { id = UUIDId UUID.nil - , createdAt = Unsafe.read "2019-08-31 05:14:37.537084021 UTC" - , updatedAt = Unsafe.read "2019-08-31 05:14:37.537084021 UTC" - , projectId = projectID - , fieldHash = fieldHash - , fieldType = fieldType - , fieldFormat = format - , -- NOTE: A trailing question, is whether to store examples into a separate table. - -- It requires some more of a cost benefit analysis. 
- examples = boundedVal - , hash = formatHash - } - ) - where - aeValueToFieldType :: AE.Value -> Fields.FieldTypes - aeValueToFieldType (AET.String _) = Fields.FTString - aeValueToFieldType (AET.Number _) = Fields.FTNumber - aeValueToFieldType AET.Null = Fields.FTNull - aeValueToFieldType (AET.Bool _) = Fields.FTBool - aeValueToFieldType (AET.Object _) = Fields.FTObject - aeValueToFieldType (AET.Array _) = Fields.FTList - - -- Cap stored example/format-source values to avoid persisting large - -- payloads (e.g. HTML response bodies) into apis.fields/apis.formats. - -- Oversized strings are replaced with a sentinel; non-strings are kept as-is. - maxFieldValueSize :: Int - maxFieldValueSize = 256 - boundVal :: AE.Value -> AE.Value - boundVal (AET.String s) - | T.length s > maxFieldValueSize = AET.String (T.take maxFieldValueSize s <> "…") - boundVal v = v - boundedVal = V.map boundVal val - - fieldType :: Fields.FieldTypes - fieldType = fromMaybe Fields.FTUnknown $ V.map aeValueToFieldType boundedVal V.!? 0 - - -- field hash is + > (No space or comma between data) - !fieldHash = endpointHash <> toXXHash (display fieldCategory <> keyPath) - -- FIXME: We should rethink this value to format logic. - -- FIXME: Maybe it actually needs machine learning, - -- FIXME: or maybe it should operate on the entire list, and not just one value. - format = fromMaybe "" $ V.map valueToFormat boundedVal V.!? 0 - !formatHash = fieldHash <> toXXHash format + +-- fieldsToFieldDTO removed: schema learning now flows through the +-- in-memory catalog (see 'extractObservation' + Pkg.SchemaLearning). diff --git a/src/System/Config.hs b/src/System/Config.hs index 789628717..9a88ba761 100644 --- a/src/System/Config.hs +++ b/src/System/Config.hs @@ -157,6 +157,13 @@ data EnvConfig = EnvConfig , drainRehydrateIntervalSecs :: Int , maxBufferedSpans :: Int , maxDrainTrees :: Int + , -- Schema-learning knobs (see "Pkg.SchemaLearning.Hot"). 
+ schemaFlushIntervalSecs :: Int + , schemaCatalogExamples :: Int + , schemaCatalogMaxKeysPerProject :: Int + , schemaCatalogMaxBytesPerShard :: Int + , schemaLearnFullThreshold :: Int + , schemaLearnSampleEveryN :: Int , processedAtCutoff :: UTCTime -- ^ Must match the `timestamp >=` literal in migration 0064's partial index. -- The safety-net query and the partial-index WHERE clause both filter @@ -192,6 +199,12 @@ instance DefConfig EnvConfig where , drainRehydrateIntervalSecs = 300 , maxBufferedSpans = 100000 , maxDrainTrees = 200 + , schemaFlushIntervalSecs = 60 + , schemaCatalogExamples = 20 + , schemaCatalogMaxKeysPerProject = 5000 + , schemaCatalogMaxBytesPerShard = 67108864 + , schemaLearnFullThreshold = 200 + , schemaLearnSampleEveryN = 200 , -- MUST match the literal in static/migrations/0064_processed_at_safety_net.sql. -- The partial index `idx_otel_unprocessed` filters on `timestamp >= this`, -- so a mismatched default would silently orphan post-cutoff rows from the diff --git a/src/System/Server.hs b/src/System/Server.hs index 181bcd8d4..77647afdf 100644 --- a/src/System/Server.hs +++ b/src/System/Server.hs @@ -130,6 +130,7 @@ runServer appLogger env tp = do <> [ fiber "drain-age-flush" $ BackgroundJobs.runDrainAgeFlushTimer appLogger env , fiber "error-decay" $ BackgroundJobs.runErrorDecayFiber appLogger env tp , fiber "session-backfill" $ BackgroundJobs.runSessionBackfillTimer appLogger env tp + , fiber "schema-flusher" $ void $ BackgroundJobs.runSchemaFlusherFiber appLogger env tp ] liftIO $ atomically $ writeTVar env.extractionWorker.acceptingBatches True asyncs <- diff --git a/src/Web/ApiHandlers.hs b/src/Web/ApiHandlers.hs index 5fe034030..12660b29c 100644 --- a/src/Web/ApiHandlers.hs +++ b/src/Web/ApiHandlers.hs @@ -110,8 +110,9 @@ import Effectful.Reader.Static (ask) import Effectful.Time qualified as Time import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns qualified as ErrorPatterns -import Models.Apis.Fields 
qualified as Fields import Models.Apis.Issues qualified as Issues +import Models.Apis.SchemaCatalog qualified as SchemaCatalog +import Pkg.SchemaLearning.Catalog qualified as Fields import Models.Apis.LogPatterns qualified as LogPatterns import Models.Apis.Monitors qualified as Monitors import Models.Apis.ShareEvents qualified as ShareEvents @@ -1141,7 +1142,7 @@ apiFacets pid sinceM fromM toM fieldM = do let (fromT, toT, _) = TP.parseTimeRange now (TP.TimePicker sinceM fromM toM) defaultFrom = fromMaybe (addUTCTime (negate nominalDay) now) fromT defaultTo = fromMaybe now toT - summaryM <- Fields.getFacetSummary pid "otel_logs_and_spans" defaultFrom defaultTo + summaryM <- SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" defaultFrom defaultTo let Fields.FacetData facetMap = maybe (Fields.FacetData mempty) (.facetJson) summaryM -- Storage uses `___` as the path separator (raw column names like -- `resource___service___name`); the public API contract is dotted. diff --git a/src/Web/MCP.hs b/src/Web/MCP.hs index 977198c1d..42df6edf2 100644 --- a/src/Web/MCP.hs +++ b/src/Web/MCP.hs @@ -29,7 +29,7 @@ import Data.UUID qualified as UUID import Effectful.Error.Static qualified as Error import Effectful.Reader.Static qualified as Reader import Effectful.Time qualified as Time -import Models.Apis.Fields qualified as Fields +import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Apis.LogPatterns qualified as LogPatterns import Models.Projects.Projects qualified as Projects import NeatInterpolation (text) @@ -540,7 +540,7 @@ searchEventsNL = | otherwise -> do authCtx <- Reader.ask @AuthContext now <- Time.currentTime - facets <- Fields.getFacetSummary pid "otel_logs_and_spans" (addUTCTime (-86400) now) now + facets <- SchemaCatalog.getFacetSummary pid "otel_logs_and_spans" (addUTCTime (-86400) now) now let cfg = (AI.defaultAgenticConfig pid){AI.facetContext = facets, AI.timezone = sanitizeTimezone =<< textArg "timezone" args, AI.maxIterations = 2} 
AI.runAgenticQuery cfg inputT authCtx.env.openaiModel authCtx.env.openaiApiKey >>= \case Left err -> pure $ toolError ("AI translation failed: " <> err) diff --git a/static/migrations/0089_schema_catalog.sql b/static/migrations/0089_schema_catalog.sql new file mode 100644 index 000000000..700a6ffd1 --- /dev/null +++ b/static/migrations/0089_schema_catalog.sql @@ -0,0 +1,55 @@ +-- In-memory schema-learning pipeline persistence layer. +-- +-- Replaces apis.shapes / apis.fields / apis.formats / apis.facet_summaries. +-- Two tiers plus a materialized summary: +-- * apis.schema_template — instance-wide, structure-only, dedup'd by hash +-- (autoinstrumentation spans collapse to a few templates shared by every +-- tenant — no examples or values, so safe to share). +-- * apis.schema_catalog — per-project, references a template + holds the +-- tenant-private bits (values, counts, first-seen, anomaly state). +-- * apis.schema_summary — materialized AI/query-editor doc per project. + +CREATE TYPE apis.schema_key_kind AS ENUM ('http_endpoint', 'span_identity'); + +CREATE TABLE apis.schema_template ( + template_hash text PRIMARY KEY, + key_kind apis.schema_key_kind NOT NULL, + fields jsonb NOT NULL, + ref_count bigint NOT NULL DEFAULT 0, + created_at timestamptz NOT NULL DEFAULT now(), + last_seen_at timestamptz NOT NULL DEFAULT now() +); + +CREATE INDEX schema_template_kind_lastseen_idx + ON apis.schema_template (key_kind, last_seen_at DESC); + +CREATE TABLE apis.schema_catalog ( + project_id uuid NOT NULL REFERENCES projects.projects(id) ON DELETE CASCADE, + key_kind apis.schema_key_kind NOT NULL, + key_hash text NOT NULL, + template_hash text NOT NULL REFERENCES apis.schema_template(template_hash), + scope jsonb NOT NULL, + values_delta jsonb NOT NULL DEFAULT '{}'::jsonb, + counts jsonb NOT NULL DEFAULT '{}'::jsonb, + sample_count bigint NOT NULL DEFAULT 0, + first_seen timestamptz NOT NULL, + last_seen timestamptz NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (project_id, 
key_hash) +); + +CREATE INDEX schema_catalog_project_lastseen_idx + ON apis.schema_catalog (project_id, last_seen DESC); +CREATE INDEX schema_catalog_project_kind_idx + ON apis.schema_catalog (project_id, key_kind); +CREATE INDEX schema_catalog_template_idx + ON apis.schema_catalog (template_hash); +CREATE INDEX schema_catalog_host_idx + ON apis.schema_catalog ((scope->>'host')) + WHERE key_kind = 'http_endpoint'; + +CREATE TABLE apis.schema_summary ( + project_id uuid PRIMARY KEY REFERENCES projects.projects(id) ON DELETE CASCADE, + doc jsonb NOT NULL, + generated_at timestamptz NOT NULL DEFAULT now() +); diff --git a/static/migrations/0090_drop_legacy_schema_tables.sql b/static/migrations/0090_drop_legacy_schema_tables.sql new file mode 100644 index 000000000..7588394f6 --- /dev/null +++ b/static/migrations/0090_drop_legacy_schema_tables.sql @@ -0,0 +1,24 @@ +-- Drop the legacy schema-derivation tables, replaced by the in-memory +-- schema-learning catalog (apis.schema_template / apis.schema_catalog / +-- apis.schema_summary). The new pipeline (Pkg.SchemaLearning) writes only +-- to those tables; nothing remaining inserts into the dropped ones. +-- +-- Triggers fired by these tables fan out into apis.anomalies + background +-- jobs; their replacement (per-flush diffing in +-- Pkg.SchemaLearning.Worker.flushDirty) is wired in the same branch. + +-- Triggers first (dependency order); no procedure is dropped in this migration. +DROP TRIGGER IF EXISTS endpoint_created_anomaly ON apis.endpoints; +DROP TRIGGER IF EXISTS shapes_created_anomaly ON apis.shapes; +DROP TRIGGER IF EXISTS fields_created_anomaly ON apis.fields; +DROP TRIGGER IF EXISTS format_created_anomaly ON apis.formats; + +-- Tables. +DROP TABLE IF EXISTS apis.facet_summaries CASCADE; +DROP TABLE IF EXISTS apis.formats CASCADE; +DROP TABLE IF EXISTS apis.fields CASCADE; +DROP TABLE IF EXISTS apis.shapes CASCADE; + +-- Enums (only safe to drop after their consumer tables are gone). 
+DROP TYPE IF EXISTS apis.field_category CASCADE; +DROP TYPE IF EXISTS apis.field_type CASCADE; From 752546ad8b2abdad25dc53b4c9a97ede1caea5ad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 10 May 2026 03:45:21 +0000 Subject: [PATCH 2/6] Auto-format code with fourmolu --- src/Models/Apis/Anomalies.hs | 4 ++-- src/Models/Apis/SchemaCatalog.hs | 3 ++- src/Pages/Anomalies.hs | 4 ++-- src/Pages/Bots/Utils.hs | 2 +- src/Pages/LogExplorer/Log.hs | 2 +- src/Pkg/AI.hs | 2 +- src/Pkg/Components/LogQueryBox.hs | 2 +- src/Pkg/ExtractionWorker.hs | 8 ++++---- src/Pkg/SchemaLearning/Catalog.hs | 16 ++++++++-------- src/Pkg/SchemaLearning/Worker.hs | 2 +- src/ProcessMessage.hs | 4 +--- src/Web/ApiHandlers.hs | 4 ++-- src/Web/MCP.hs | 2 +- 13 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/Models/Apis/Anomalies.hs b/src/Models/Apis/Anomalies.hs index de92bcb15..59b1881f5 100644 --- a/src/Models/Apis/Anomalies.hs +++ b/src/Models/Apis/Anomalies.hs @@ -47,6 +47,8 @@ import Effectful.Time qualified as Time import Hasql.Interpolate qualified as HI import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns qualified as ErrorPatterns +import Models.Projects.Projects qualified as Projects +import Pkg.DeriveUtils (UUIDId (..), WrappedEnumSC (..)) import Pkg.SchemaLearning.Catalog qualified as Fields ( FieldCategoryEnum, FieldId, @@ -54,8 +56,6 @@ import Pkg.SchemaLearning.Catalog qualified as Fields ( FormatId, ShapeId, ) -import Models.Projects.Projects qualified as Projects -import Pkg.DeriveUtils (UUIDId (..), WrappedEnumSC (..)) import Relude hiding (id, many, some) import Servant (FromHttpApiData (..)) import System.Types (DB) diff --git a/src/Models/Apis/SchemaCatalog.hs b/src/Models/Apis/SchemaCatalog.hs index ebb7e4f3c..7c51fab0e 100644 --- a/src/Models/Apis/SchemaCatalog.hs +++ b/src/Models/Apis/SchemaCatalog.hs @@ -286,7 +286,8 @@ toFacetSummary pid tableName doc = where topKToFacetValues :: Catalog.TopK -> 
[Catalog.FacetValue] topKToFacetValues tk = - sortOn (negate . (.count)) + sortOn + (negate . (.count)) [ Catalog.FacetValue v (fromIntegral n) | (v, n) <- HM.toList tk.top ] diff --git a/src/Pages/Anomalies.hs b/src/Pages/Anomalies.hs index 58416a90c..d2699ff53 100644 --- a/src/Pages/Anomalies.hs +++ b/src/Pages/Anomalies.hs @@ -70,11 +70,10 @@ import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns (ErrorPatternId (..)) import Models.Apis.ErrorPatterns qualified as ErrorPatterns import Models.Apis.Issues qualified as Issues -import Models.Apis.SchemaCatalog qualified as SchemaCatalog -import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Apis.LogPatterns (sourceFieldLabel) import Models.Apis.Monitors qualified as Monitors import Models.Apis.PatternMerge qualified as PatternMerge +import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Projects.ProjectMembers qualified as ProjectMembers import Models.Projects.Projects (User (id)) import Models.Projects.Projects qualified as Projects @@ -91,6 +90,7 @@ import Pkg.Components.Table (BulkAction (..), Column (..), Config (..), Features import Pkg.Components.TimePicker qualified as TimePicker import Pkg.Components.Widget qualified as Widget import Pkg.DeriveUtils (UUIDId (..), hashAssetFile) +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import PyF (fmt) import Relude hiding (ask) import Relude.Unsafe qualified as Unsafe diff --git a/src/Pages/Bots/Utils.hs b/src/Pages/Bots/Utils.hs index 07e11b41e..04a0fdb2c 100644 --- a/src/Pages/Bots/Utils.hs +++ b/src/Pages/Bots/Utils.hs @@ -20,8 +20,8 @@ import Effectful.Time qualified as Time import Langchain.LLM.Core qualified as LLM import Lucid import Models.Apis.Issues qualified as Reports -import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Apis.LogQueries qualified as LogQueries +import Models.Apis.SchemaCatalog qualified 
as SchemaCatalog import Models.Projects.Projects qualified as Projects import Network.HTTP.Types (urlEncode) import Pages.BodyWrapper (PageCtx (..)) diff --git a/src/Pages/LogExplorer/Log.hs b/src/Pages/LogExplorer/Log.hs index cb07e2b9f..89e9932d0 100644 --- a/src/Pages/LogExplorer/Log.hs +++ b/src/Pages/LogExplorer/Log.hs @@ -39,7 +39,6 @@ import Lucid.Htmx import Lucid.Hyperscript (__) import Models.Apis.LogQueries qualified as LogQueries import Models.Apis.SchemaCatalog qualified as SchemaCatalog -import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Projects.Projects qualified as Projects import NeatInterpolation (text) import Numeric (showFFloat) @@ -49,6 +48,7 @@ import Pkg.Components.TimePicker qualified as Components import Pkg.Components.Widget (WidgetAxis (..), WidgetType (WTTimeseries, WTTimeseriesLine)) import Pkg.Components.Widget qualified as Widget import Pkg.Parser (pSource, parseQueryToAST, toQText) +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Relude hiding (ask) import Servant qualified import System.Config (AuthContext (..), EnvConfig (..)) diff --git a/src/Pkg/AI.hs b/src/Pkg/AI.hs index 2c0ba35b7..e80a6f34d 100644 --- a/src/Pkg/AI.hs +++ b/src/Pkg/AI.hs @@ -59,7 +59,6 @@ import Effectful.Time qualified as Time import Langchain.LLM.Core qualified as LLM import Langchain.Memory.Core (BaseMemory (..)) import Langchain.Memory.TokenBufferMemory (TokenBufferMemory (..)) -import Pkg.SchemaLearning.Catalog (FacetData (..), FacetSummary (..), FacetValue (..)) import Models.Apis.Issues qualified as Issues import Models.Apis.LogQueries (executeSecuredQuery, selectLogTable) import Models.Projects.Projects qualified as Projects @@ -72,6 +71,7 @@ import Pkg.Components.TimePicker (TimePicker) import Pkg.Components.Widget qualified as Widget import Pkg.DeriveUtils (UUIDId (..)) import Pkg.Parser (parseQueryToAST) +import Pkg.SchemaLearning.Catalog (FacetData (..), 
FacetSummary (..), FacetValue (..)) import Relude import System.Tracing (Tracing) import System.Types (DB) diff --git a/src/Pkg/Components/LogQueryBox.hs b/src/Pkg/Components/LogQueryBox.hs index 010743151..53297212b 100644 --- a/src/Pkg/Components/LogQueryBox.hs +++ b/src/Pkg/Components/LogQueryBox.hs @@ -13,12 +13,12 @@ import Lucid.Aria qualified as Aria import Lucid.Base (TermRaw (termRaw)) import Lucid.Htmx import Lucid.Hyperscript (__) -import Pkg.SchemaLearning.Catalog (FacetData (..), FacetValue (..)) import Models.Apis.LogPatterns (knownPatternFields) import Models.Projects.Projects qualified as Projects import Models.Telemetry.Schema qualified as Schema import NeatInterpolation (text) import Pages.Components (modal_) +import Pkg.SchemaLearning.Catalog (FacetData (..), FacetValue (..)) import Relude import Utils (displayTimestamp, faSprite_, formatUTC, onpointerdown_) diff --git a/src/Pkg/ExtractionWorker.hs b/src/Pkg/ExtractionWorker.hs index 151808081..9bb83e806 100644 --- a/src/Pkg/ExtractionWorker.hs +++ b/src/Pkg/ExtractionWorker.hs @@ -91,10 +91,10 @@ data ShardState s = ShardState , drainBuffers :: !(IORef (HashMap (Projects.ProjectId, Text) ServiceBuffer)) , drainTrees :: !(IORef (HashMap (Projects.ProjectId, Text) ServiceDrainTree)) , pendingRehydrations :: !(IORef (HashSet (Projects.ProjectId, Text))) - , -- | Schema-learning catalog state, owned by this shard. Single-writer - -- (the shard fiber); the flush worker swaps the dirty subset via - -- 'atomicModifyIORef''. - schemaState :: !(IORef SchemaLearning.SchemaShardState) + , schemaState :: !(IORef SchemaLearning.SchemaShardState) + -- ^ Schema-learning catalog state, owned by this shard. Single-writer + -- (the shard fiber); the flush worker swaps the dirty subset via + -- 'atomicModifyIORef''. 
} diff --git a/src/Pkg/SchemaLearning/Catalog.hs b/src/Pkg/SchemaLearning/Catalog.hs index 6cbc94aac..76de53676 100644 --- a/src/Pkg/SchemaLearning/Catalog.hs +++ b/src/Pkg/SchemaLearning/Catalog.hs @@ -48,22 +48,22 @@ where import Data.Aeson qualified as AE import Data.Aeson.Types qualified as AET +import Data.Char (toLower) +import Data.Default (Default) import Data.HashMap.Strict qualified as HM import Data.HashSet qualified as HS import Data.Scientific qualified as Scientific import Data.Text qualified as T +import Data.Text.Display (Display) import Data.Time (UTCTime) +import Data.UUID qualified as UUID import Data.Vector qualified as V -import Database.PostgreSQL.Simple.FromField (FromField) -import Database.PostgreSQL.Simple.ToField (ToField) -import Deriving.Aeson qualified as DAE import Database.PostgreSQL.Entity.Types (CamelToSnake, Entity, FieldModifiers, GenericEntity, PrimaryKey, Schema, TableName) import Database.PostgreSQL.Simple (FromRow, ToRow) +import Database.PostgreSQL.Simple.FromField (FromField) import Database.PostgreSQL.Simple.Newtypes (Aeson (..)) -import Data.Char (toLower) -import Data.Default (Default) -import Data.Text.Display (Display) -import Data.UUID qualified as UUID +import Database.PostgreSQL.Simple.ToField (ToField) +import Deriving.Aeson qualified as DAE import GHC.Records (HasField (getField)) import Hasql.Interpolate qualified as HI import Pkg.DeriveUtils (UUIDId (..), WrappedEnumSC (..)) @@ -139,8 +139,8 @@ data Template = Template -- full, merging is a no-op. newtype Examples = Examples {values :: V.Vector AE.Value} deriving stock (Eq, Generic, Show) - deriving anyclass (NFData) deriving newtype (AE.FromJSON, AE.ToJSON) + deriving anyclass (NFData) -- | Per-field top-K cardinality / value counts. 
Summary input replacement for diff --git a/src/Pkg/SchemaLearning/Worker.hs b/src/Pkg/SchemaLearning/Worker.hs index e31651bab..4b9a925e3 100644 --- a/src/Pkg/SchemaLearning/Worker.hs +++ b/src/Pkg/SchemaLearning/Worker.hs @@ -22,11 +22,11 @@ module Pkg.SchemaLearning.Worker ( ) where +import Control.Concurrent (threadDelay) import Data.HashMap.Strict qualified as HM import Data.HashSet qualified as HS import Data.Time (UTCTime, getCurrentTime) import Data.Vector qualified as V -import Control.Concurrent (threadDelay) import Effectful (Eff) import Models.Apis.SchemaCatalog qualified as SC import Models.Projects.Projects qualified as Projects diff --git a/src/ProcessMessage.hs b/src/ProcessMessage.hs index 35cb74f16..c6b51d48c 100644 --- a/src/ProcessMessage.hs +++ b/src/ProcessMessage.hs @@ -56,12 +56,12 @@ import Effectful.Log (Log) import Effectful.Reader.Static qualified as Eff import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.LogQueries qualified as LogQueries -import Pkg.SchemaLearning.Catalog qualified as Fields import Models.Projects.Projects qualified as Projects import Models.Telemetry.Telemetry (Context (trace_state), OtelLogsAndSpans (..), generateSummary) import Models.Telemetry.Telemetry qualified as Telemetry import Pkg.DeriveUtils (AesonText (..), UUIDId (..), unAesonTextMaybe) import Pkg.SchemaLearning.Catalog qualified as Catalog +import Pkg.SchemaLearning.Catalog qualified as Fields import Pkg.SchemaLearning.Hot qualified as SchemaHot import Relude hiding (ask) import Relude.Unsafe qualified as Unsafe @@ -1038,7 +1038,5 @@ tokenizeUrlPath = V.fromList . map normalize . T.splitOn "/" where normalize seg = fromMaybe (bool seg "<*>" $ isUrlIdLike seg) (valueToFormatStr seg) - - -- fieldsToFieldDTO removed: schema learning now flows through the -- in-memory catalog (see 'extractObservation' + Pkg.SchemaLearning). 
diff --git a/src/Web/ApiHandlers.hs b/src/Web/ApiHandlers.hs index 12660b29c..662ae20be 100644 --- a/src/Web/ApiHandlers.hs +++ b/src/Web/ApiHandlers.hs @@ -111,10 +111,9 @@ import Effectful.Time qualified as Time import Models.Apis.Endpoints qualified as Endpoints import Models.Apis.ErrorPatterns qualified as ErrorPatterns import Models.Apis.Issues qualified as Issues -import Models.Apis.SchemaCatalog qualified as SchemaCatalog -import Pkg.SchemaLearning.Catalog qualified as Fields import Models.Apis.LogPatterns qualified as LogPatterns import Models.Apis.Monitors qualified as Monitors +import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Apis.ShareEvents qualified as ShareEvents import Models.Projects.Dashboards qualified as Dashboards import Models.Projects.ProjectApiKeys qualified as ProjectApiKeys @@ -126,6 +125,7 @@ import Pkg.Components.TimePicker qualified as TP import Pkg.Components.Widget qualified as Widget import Pkg.DeriveUtils (SnakeSchema (..), UUIDId (..)) import Pkg.Parser qualified as Parser +import Pkg.SchemaLearning.Catalog qualified as Fields import Relude hiding (ask, id) import Servant (NoContent (..), ServerError (..), err400, err404) import System.Config (AuthContext (..), EnvConfig (..)) diff --git a/src/Web/MCP.hs b/src/Web/MCP.hs index 42df6edf2..f37a91a1f 100644 --- a/src/Web/MCP.hs +++ b/src/Web/MCP.hs @@ -29,8 +29,8 @@ import Data.UUID qualified as UUID import Effectful.Error.Static qualified as Error import Effectful.Reader.Static qualified as Reader import Effectful.Time qualified as Time -import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Apis.LogPatterns qualified as LogPatterns +import Models.Apis.SchemaCatalog qualified as SchemaCatalog import Models.Projects.Projects qualified as Projects import NeatInterpolation (text) import Network.HTTP.Types qualified as H From afa7b275e70836a69b3d8c598c64d3b44867a9bd Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Sun, 10 May 2026 05:58:25 
+0200 Subject: [PATCH 3/6] fix(schema): drop hot-path reads/writes against deleted apis.shapes/fields/formats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit projectCacheById joined apis.shapes on every cache rebuild and filtered on sh.hash IS NOT NULL, so once 0090 dropped the table the cache for every project came back empty and the request path 500'd. Removes the join (and the now-unused shapeHashes field on ProjectCache) and strips the legacy shapes/fields/formats migration + delete steps from migrateAndDeleteMergedEndpoints — the schema-learning catalog re-derives structure per canonical key, so no row migration is needed. --- src/Models/Apis/Endpoints.hs | 58 ++++----------------------------- src/Models/Projects/Projects.hs | 9 ++--- 2 files changed, 9 insertions(+), 58 deletions(-) diff --git a/src/Models/Apis/Endpoints.hs b/src/Models/Apis/Endpoints.hs index 45dc2daa0..9dc44cca0 100644 --- a/src/Models/Apis/Endpoints.hs +++ b/src/Models/Apis/Endpoints.hs @@ -553,58 +553,17 @@ getMergedEndpointPairs pid = LIMIT 10000 |] --- | Migrate shapes/fields/formats from old endpoints to canonical ones, remap anomalies/issues, then delete old data. +-- | Remap anomalies/issues from merged endpoints to their canonical hashes, then delete old endpoints. +-- Legacy apis.shapes/fields/formats migration steps removed (tables dropped in 0090); +-- the schema-learning catalog (apis.schema_catalog) re-derives structure on the fly per +-- canonical key, so no explicit row migration is needed for the new model. 
migrateAndDeleteMergedEndpoints :: DB es => [(Text, Text)] -> Eff es () migrateAndDeleteMergedEndpoints [] = pass migrateAndDeleteMergedEndpoints pairs = do let (oldHashes, canonHashes) = unzip pairs oldArr = V.fromList oldHashes canonArr = V.fromList canonHashes - -- Step 1: Migrate shapes (prefix-replace endpoint_hash and hash, remap field_hashes array) - Hasql.interpExecute_ - [HI.sql| - INSERT INTO apis.shapes (id, created_at, updated_at, project_id, endpoint_hash, hash, - field_hashes, query_params_keypaths, request_body_keypaths, response_body_keypaths, - request_headers_keypaths, response_headers_keypaths, status_code, - response_description, request_description, new_unique_fields, deleted_fields, updated_field_formats) - SELECT gen_random_uuid(), s.created_at, s.updated_at, s.project_id, - m.canonical, m.canonical || substring(s.hash FROM 9), - array(SELECT m2.canonical || substring(fh FROM 9) - FROM unnest(s.field_hashes) fh - LEFT JOIN unnest(#{oldArr}::text[], #{canonArr}::text[]) m2(old, canonical) ON LEFT(fh, 8) = m2.old), - s.query_params_keypaths, s.request_body_keypaths, s.response_body_keypaths, - s.request_headers_keypaths, s.response_headers_keypaths, s.status_code, - s.response_description, s.request_description, s.new_unique_fields, s.deleted_fields, s.updated_field_formats - FROM apis.shapes s - JOIN unnest(#{oldArr}::text[], #{canonArr}::text[]) m(old, canonical) ON s.endpoint_hash = m.old - ON CONFLICT (hash) DO NOTHING |] - -- Step 2: Migrate fields - Hasql.interpExecute_ - [HI.sql| - INSERT INTO apis.fields (id, created_at, updated_at, project_id, endpoint_hash, key, - field_type, field_type_override, format, format_override, description, key_path, - field_category, hash, is_enum, is_required) - SELECT gen_random_uuid(), f.created_at, f.updated_at, f.project_id, - m.canonical, f.key, f.field_type, f.field_type_override, f.format, f.format_override, - f.description, f.key_path, f.field_category, - m.canonical || substring(f.hash FROM 9), - 
f.is_enum, f.is_required - FROM apis.fields f - JOIN unnest(#{oldArr}::text[], #{canonArr}::text[]) m(old, canonical) ON f.endpoint_hash = m.old - ON CONFLICT (hash) DO NOTHING |] - -- Step 3: Migrate formats - Hasql.interpExecute_ - [HI.sql| - INSERT INTO apis.formats (id, created_at, updated_at, project_id, field_hash, field_type, - field_format, examples, hash) - SELECT gen_random_uuid(), fmt.created_at, fmt.updated_at, fmt.project_id, - m.canonical || substring(fmt.field_hash FROM 9), - fmt.field_type, fmt.field_format, fmt.examples, - m.canonical || substring(fmt.hash FROM 9) - FROM apis.formats fmt - JOIN unnest(#{oldArr}::text[], #{canonArr}::text[]) m(old, canonical) ON LEFT(fmt.field_hash, 8) = m.old - ON CONFLICT (hash) DO NOTHING |] - -- Step 4: Remap anomalies (skip if canonical target already exists for same project) + -- Remap anomalies (skip if canonical target already exists for same project) Hasql.interpExecute_ [HI.sql| UPDATE apis.anomalies a @@ -614,7 +573,7 @@ migrateAndDeleteMergedEndpoints pairs = do AND NOT EXISTS (SELECT 1 FROM apis.anomalies a2 WHERE a2.project_id = a.project_id AND a2.target_hash = m.canonical || substring(a.target_hash FROM 9)) |] - -- Step 5: Remap issues (skip if canonical target already exists for same project+type) + -- Remap issues (skip if canonical target already exists for same project+type) Hasql.interpExecute_ [HI.sql| UPDATE apis.issues i @@ -627,10 +586,7 @@ migrateAndDeleteMergedEndpoints pairs = do AND i2.target_hash = m.canonical || substring(i.target_hash FROM 9) AND i2.issue_type = i.issue_type AND i2.acknowledged_at IS NULL AND i2.archived_at IS NULL) |] - -- Step 6: Delete old data (reverse dependency order) - Hasql.interpExecute_ [HI.sql| DELETE FROM apis.formats WHERE LEFT(field_hash, 8) = ANY(#{oldArr}) |] - Hasql.interpExecute_ [HI.sql| DELETE FROM apis.fields WHERE endpoint_hash = ANY(#{oldArr}) |] - Hasql.interpExecute_ [HI.sql| DELETE FROM apis.shapes WHERE endpoint_hash = ANY(#{oldArr}) |] + -- 
Delete leftover anomalies/issues for the merged-out endpoints, then the endpoints themselves. Hasql.interpExecute_ [HI.sql| DELETE FROM apis.anomalies WHERE LEFT(target_hash, 8) = ANY(#{oldArr}) |] Hasql.interpExecute_ [HI.sql| DELETE FROM apis.issues WHERE LEFT(target_hash, 8) = ANY(#{oldArr}) |] Hasql.interpExecute_ [HI.sql| DELETE FROM apis.endpoints WHERE hash = ANY(#{oldArr}) |] diff --git a/src/Models/Projects/Projects.hs b/src/Models/Projects/Projects.hs index 0384f959f..d0753ab77 100644 --- a/src/Models/Projects/Projects.hs +++ b/src/Models/Projects/Projects.hs @@ -329,9 +329,6 @@ data ProjectCache = ProjectCache hosts :: V.Vector Text , -- maybe we don't need this? See the next point. endpointHashes :: V.Vector Text - , -- Since shapes always have the endpoints hash prepended to them, maybe we don't need to store the hash of endpoints, - -- since we can derive that from the shapes. - shapeHashes :: V.Vector Text , -- We check if every request is part of the redact list, so it's better if we don't need to hit the db for them with each request. -- Since we have a need to redact fields by endpoint, we can simply have the fields paths be prepended by the endpoint hash. 
-- [endpointHash]<>[field_category eg requestBody]<>[field_key_path] @@ -383,7 +380,6 @@ projectCacheById pid = do [HI.sql| select coalesce(ARRAY_AGG(DISTINCT hosts ORDER BY hosts ASC),'{}') hosts, coalesce(ARRAY_AGG(DISTINCT endpoint_hashes ORDER BY endpoint_hashes ASC),'{}') endpoint_hashes, - coalesce(ARRAY_AGG(DISTINCT shape_hashes ORDER BY shape_hashes ASC),'{}'::text[]) shape_hashes, coalesce(ARRAY_AGG(DISTINCT paths ORDER BY paths ASC),'{}') redacted_fields, ( SELECT count(*)::int FROM otel_logs_and_spans WHERE project_id=#{pidText} AND timestamp > #{now}::timestamptz - INTERVAL '1' DAY @@ -396,11 +392,10 @@ projectCacheById pid = do FROM apis.endpoints WHERE project_id = #{pid} AND canonical_path IS NOT NULL ) canonical_paths from - (select e.host hosts, e.hash endpoint_hashes, sh.hash shape_hashes, concat(rf.endpoint_hash,'<>', rf.field_category,'<>', rf.path) paths + (select e.host hosts, e.hash endpoint_hashes, concat(rf.endpoint_hash,'<>', rf.field_category,'<>', rf.path) paths from apis.endpoints e - left join apis.shapes sh ON sh.endpoint_hash = e.hash left join projects.redacted_fields rf ON rf.project_id = e.project_id - where e.project_id = #{pid} AND sh.hash IS NOT null + where e.project_id = #{pid} ) enp; |] From 93595e12afcab37f49769d5689d13685676a13dd Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Sun, 10 May 2026 12:51:21 +0200 Subject: [PATCH 4/6] add tests --- monoscope.cabal | 2 + package.yaml | 1 + src/BackgroundJobs.hs | 18 +- src/Models/Apis/Anomalies.hs | 31 ++-- src/Models/Apis/SchemaCatalog.hs | 137 ++++++++++++++- src/Pkg/SchemaLearning/Catalog.hs | 97 +++++++++++ src/Pkg/SchemaLearning/Worker.hs | 51 ++++-- test/integration/SchemaLearningSpec.hs | 222 +++++++++++++++++++++++++ 8 files changed, 530 insertions(+), 29 deletions(-) create mode 100644 test/integration/SchemaLearningSpec.hs diff --git a/monoscope.cabal b/monoscope.cabal index 597bc79ee..c3a708ed1 100644 --- a/monoscope.cabal +++ b/monoscope.cabal @@ -622,6 +622,7 @@ 
test-suite integration-tests Pages.ShareSpec ProcessMessageSpec ReplaySpec + SchemaLearningSpec Spec Web.ApiV1Spec Web.ClientMetadataSpec @@ -701,6 +702,7 @@ test-suite integration-tests , text , time , unliftio + , unordered-containers , uuid , uuid-quasi , vector diff --git a/package.yaml b/package.yaml index a2a5b803e..1e3271580 100644 --- a/package.yaml +++ b/package.yaml @@ -431,6 +431,7 @@ tests: - http-types - wai - wai-extra + - unordered-containers unit-tests: main: Main.hs source-dirs: test/unit diff --git a/src/BackgroundJobs.hs b/src/BackgroundJobs.hs index dade160e8..4324ac861 100644 --- a/src/BackgroundJobs.hs +++ b/src/BackgroundJobs.hs @@ -1730,9 +1730,21 @@ processEagerBatch batch shard runSchemaFlusherFiber :: Logger -> Config.AuthContext -> TracerProvider -> IO Void runSchemaFlusherFiber logger ctx tp = do let refs = V.toList $ V.map (.schemaState) ctx.extractionWorker.shards - flushOne ref = - runBackground logger ctx tp (SchemaWorker.flushDirty ref) - >>= \r -> pure r + flushOne ref = do + r <- runBackground logger ctx tp (SchemaWorker.flushDirty ref) + when (r.dirtyKeys > 0) + $ runBackground logger ctx tp + $ Log.logTrace + "schema-flush" + ( AE.object + [ "dirty_keys" AE..= r.dirtyKeys + , "templates_written" AE..= r.templatesWritten + , "catalog_rows_written" AE..= r.catalogRowsWritten + , "summaries_updated" AE..= r.summariesUpdated + , "anomalies_emitted" AE..= r.anomaliesEmitted + ] + ) + pure r SchemaWorker.runSchemaFlusher ctx.config.schemaFlushIntervalSecs refs flushOne diff --git a/src/Models/Apis/Anomalies.hs b/src/Models/Apis/Anomalies.hs index 59b1881f5..961ca4a63 100644 --- a/src/Models/Apis/Anomalies.hs +++ b/src/Models/Apis/Anomalies.hs @@ -149,19 +149,23 @@ data AnomalyVM = AnomalyVM via (GenericEntity '[Schema "apis", TableName "anomalies_vm", PrimaryKey "id", FieldModifiers '[CamelToSnake]] AnomalyVM) +-- | Read VM rows for the anomalies UI. 
+-- +-- target_hash conventions (set by @Pkg.SchemaLearning.Worker@): +-- * endpoint → @keyHash@ (== endpoints.hash for HTTP) +-- * shape → @keyHash:s:@ followed by the first 8 chars of the template hash +-- * field → @keyHash:f:@ followed by the 8-char field-path hash ('fieldHashSuffix') +-- * format → @keyHash:fmt:@ followed by the 8-char field-path hash ('fieldHashSuffix') +-- +-- Endpoint metadata joins via @endpoints.hash = split_part(target_hash,':',1)@. +-- Per-field detail (key path, format) is fetched lazily by the caller via +-- 'getAnomalyFieldDetail' — encoding the full path into target_hash would +-- blow the unique-index size budget, and Postgres has no built-in xxhash. getAnomaliesVM :: (DB es, Time :> es) => Projects.ProjectId -> V.Vector Text -> Eff es [AnomalyVM] getAnomaliesVM pid hash | V.null hash = pure [] | otherwise = do now <- Time.currentTime - -- Legacy apis.shapes / fields / formats joins removed (those tables - -- were dropped in 0090). The schema-learning catalog - -- (apis.schema_catalog) replaces them; per-field VM details - -- (key_path, format examples, etc.) currently surface as NULL on - -- legacy anomaly_type values and need a fresh query against the new - -- table — TODO once the anomaly producer in - -- @Pkg.SchemaLearning.Worker.flushDirty@ stamps target_hash - -- accordingly. 
Hasql.interp [HI.sql| SELECT @@ -181,10 +185,10 @@ SELECT NULL::uuid field_id, NULL::text field_key, NULL::text field_key_path, - NULL::text field_category, -- placeholder; legacy field_category enum is dropped + NULL::text field_category, NULL::text field_format, NULL::uuid format_id, - NULL::text format_type, -- placeholder; legacy field_type enum is dropped + NULL::text format_type, '{}'::jsonb[] format_examples, endpoints.id endpoint_id, endpoints.method endpoint_method, @@ -197,11 +201,10 @@ SELECT from apis.anomalies an LEFT JOIN apis.issues iss ON iss.target_hash = an.target_hash AND iss.project_id = an.project_id - LEFT JOIN apis.endpoints ON (starts_with(an.target_hash, endpoints.hash) AND an.project_id = endpoints.project_id) + LEFT JOIN apis.endpoints ON an.project_id = endpoints.project_id + AND endpoints.hash = split_part(an.target_hash, ':', 1) where - ((an.anomaly_type = 'endpoint') - OR NOT (an.anomaly_type = ANY('{"endpoint","shape","field","format"}'::apis.anomaly_type[])) - ) AND an.project_id=#{pid} AND an.target_hash=ANY(#{hash}) + an.project_id=#{pid} AND an.target_hash=ANY(#{hash}) |] diff --git a/src/Models/Apis/SchemaCatalog.hs b/src/Models/Apis/SchemaCatalog.hs index 7c51fab0e..aa51f4efa 100644 --- a/src/Models/Apis/SchemaCatalog.hs +++ b/src/Models/Apis/SchemaCatalog.hs @@ -20,11 +20,17 @@ module Models.Apis.SchemaCatalog ( getByProject, getByHost, getByKey, + getByKeysBatch, getSummary, upsertSummary, vacuumUnreferencedTemplates, toFacetSummary, getFacetSummary, + -- Anomaly producer support. + AnomalyInsertRow (..), + insertAnomalies, + enqueueAnomalyJobs, + getCatalogFieldAt, -- Re-exports for reader migration. 
Catalog.FacetData (..), Catalog.FacetValue (..), @@ -40,7 +46,7 @@ import Data.Vector qualified as V import Effectful import Hasql.Interpolate qualified as HI import Models.Projects.Projects qualified as Projects -import Pkg.DeriveUtils (DB) +import Pkg.DeriveUtils (DB, UUIDId (..)) import Pkg.SchemaLearning.Catalog qualified as Catalog import Relude @@ -228,6 +234,55 @@ getByKey pid keyHash = WHERE c.project_id = #{pid} AND c.key_hash = #{keyHash} |] +-- | Bulk variant: fetches catalog rows for a heterogeneous (project, key_hash) +-- set in one round-trip. Used by the anomaly producer to load priors for an +-- entire dirty batch before diffing. +getByKeysBatch + :: DB es + => V.Vector (Projects.ProjectId, Text) + -> Eff es (HM.HashMap (Projects.ProjectId, Text) Catalog.CatalogEntry) +getByKeysBatch pairs + | V.null pairs = pure HM.empty + | otherwise = do + let pids = V.map fst pairs + khs = V.map snd pairs + rows :: [CatalogReadRow] <- + Hasql.interp + [HI.sql| SELECT c.project_id, c.key_kind, c.key_hash, c.template_hash, + c.scope, t.fields, c.values_delta, c.counts, + c.sample_count, c.first_seen, c.last_seen + FROM apis.schema_catalog c + JOIN apis.schema_template t ON c.template_hash = t.template_hash + JOIN unnest(#{pids}::uuid[], #{khs}::text[]) m(pid, kh) + ON c.project_id = m.pid AND c.key_hash = m.kh |] + pure $ HM.fromList [((UUIDId r.projectId, r.keyHash), readRowToEntry r) | r <- rows] + + +-- | Fetch a single field's structure from a catalog row. Used by 'getAnomaliesVM' +-- so anomaly readers can surface per-field detail (key_path, format) without a +-- second join in SQL. 
+getCatalogFieldAt + :: DB es + => Projects.ProjectId + -> Text + -- ^ key_hash + -> Text + -- ^ field path + -> Eff es (Maybe Catalog.FieldStruct) +getCatalogFieldAt pid keyHash path = do + rowM :: Maybe FieldsRow <- + Hasql.interpOne + [HI.sql| SELECT t.fields FROM apis.schema_catalog c + JOIN apis.schema_template t ON c.template_hash = t.template_hash + WHERE c.project_id = #{pid} AND c.key_hash = #{keyHash} |] + pure $ rowM >>= \(FieldsRow (HI.AsJsonb m)) -> HM.lookup path m + + +newtype FieldsRow = FieldsRow (HI.AsJsonb (HM.HashMap Text Catalog.FieldStruct)) + deriving stock (Generic) + deriving anyclass (HI.DecodeRow) + + -- --------------------------------------------------------------------------- -- Summary doc. @@ -306,3 +361,83 @@ vacuumUnreferencedTemplates = AND NOT EXISTS ( SELECT 1 FROM apis.schema_catalog c WHERE c.template_hash = t.template_hash) |] + + +-- --------------------------------------------------------------------------- +-- Anomaly producer. + +-- | Row shape for bulk inserts into @apis.anomalies@. +data AnomalyInsertRow = AnomalyInsertRow + { projectId :: !Projects.ProjectId + , anomalyType :: !Text + -- ^ matches @apis.anomaly_type@: "endpoint" | "shape" | "field" | "format" + , targetHash :: !Text + } + deriving stock (Eq, Generic, Show) + deriving anyclass (NFData) + + +-- | Bulk-insert anomalies. The unique @(project_id, target_hash)@ index +-- de-duplicates across flush passes — we rely on it to avoid maintaining a +-- separate "already-emitted" set on the hot path. Returns rows actually +-- inserted (excludes ON CONFLICT collisions). 
+insertAnomalies :: DB es => V.Vector AnomalyInsertRow -> Eff es Int64 +insertAnomalies rows | V.null rows = pure 0 +insertAnomalies rows = + Hasql.interpExecute + [HI.sql| INSERT INTO apis.anomalies (project_id, anomaly_type, action, target_hash) + SELECT pid, atype::apis.anomaly_type, 'created'::apis.anomaly_action, th + FROM unnest(#{pids}::uuid[], #{atypes}::text[], #{ths}::text[]) AS m(pid, atype, th) + ON CONFLICT (project_id, target_hash) DO NOTHING |] + where + pids = V.map (.projectId) rows + atypes = V.map (.anomalyType) rows + ths = V.map (.targetHash) rows + + +-- | Enqueue one @NewAnomaly@ background job per (project, anomalyType) group +-- so the existing notification fan-out (legacy @new_anomaly_proc@'s job +-- emitter) keeps firing. Coalesces with an already-queued job for the same +-- group, mirroring the legacy behaviour. +enqueueAnomalyJobs :: DB es => V.Vector AnomalyInsertRow -> Eff es () +enqueueAnomalyJobs rows | V.null rows = pass +enqueueAnomalyJobs rows = do + let groups :: HM.HashMap (Projects.ProjectId, Text) [Text] + groups = HM.fromListWith (<>) [((r.projectId, r.anomalyType), [r.targetHash]) | r <- V.toList rows] + forM_ (HM.toList groups) \((pid, atype), ths) -> do + let payload :: V.Vector Text + payload = V.fromList ths + Hasql.interpExecute_ + [HI.sql| + WITH existing AS ( + SELECT id, payload->'targetHashes' AS ths + FROM background_jobs + WHERE payload->>'tag' = 'NewAnomaly' + AND payload->>'projectId' = #{pid}::text + AND payload->>'anomalyType' = #{atype} + AND status = 'queued' + ORDER BY run_at ASC LIMIT 1 + ), + upd AS ( + UPDATE background_jobs SET payload = jsonb_build_object( + 'tag', 'NewAnomaly', + 'projectId', #{pid}::text, + 'createdAt', to_jsonb(now()), + 'anomalyType', #{atype}::text, + 'anomalyAction', 'created'::text, + 'targetHashes', COALESCE((SELECT ths FROM existing), '[]'::jsonb) || to_jsonb(#{payload}::text[]) + ) + WHERE id = (SELECT id FROM existing) + RETURNING id + ) + INSERT INTO background_jobs 
(run_at, status, payload) + SELECT now(), 'queued', jsonb_build_object( + 'tag', 'NewAnomaly', + 'projectId', #{pid}::text, + 'createdAt', to_jsonb(now()), + 'anomalyType', #{atype}::text, + 'anomalyAction', 'created'::text, + 'targetHashes', to_jsonb(#{payload}::text[]) + ) + WHERE NOT EXISTS (SELECT 1 FROM upd) + |] diff --git a/src/Pkg/SchemaLearning/Catalog.hs b/src/Pkg/SchemaLearning/Catalog.hs index 76de53676..715e9841b 100644 --- a/src/Pkg/SchemaLearning/Catalog.hs +++ b/src/Pkg/SchemaLearning/Catalog.hs @@ -30,6 +30,11 @@ module Pkg.SchemaLearning.Catalog ( mergeFullWalk, bumpSeen, classifyFormat, + -- Anomaly diffing. + AnomalyKind (..), + ProducedAnomaly (..), + diffAnomalies, + fieldHashSuffix, examplesCap, topKCap, exampleStringCap, @@ -397,6 +402,98 @@ emptySummaryDoc :: SummaryDoc emptySummaryDoc = SummaryDoc HM.empty V.empty HM.empty +-- --------------------------------------------------------------------------- +-- Anomaly diffing. +-- +-- Replaces the legacy DB-trigger fan-out (@new_anomaly_proc@). The flush +-- worker calls 'diffAnomalies' for each dirty entry against its prior +-- catalog row; emitted 'ProducedAnomaly's are inserted into +-- @apis.anomalies@ and a @NewAnomaly@ background job is enqueued so the +-- existing notification pipeline keeps firing. + +-- | Anomaly buckets matching @apis.anomaly_type@ minus runtime-exception +-- (which the error-pattern path produces directly). +data AnomalyKind = AKEndpoint | AKShape | AKField | AKFormat + deriving stock (Eq, Generic, Ord, Show) + deriving anyclass (NFData) + + +-- | One emitted anomaly. 'targetHash' is shaped so 'getAnomaliesVM' can +-- recover the endpoint via @endpoints.hash = split_part(target_hash, ':', 1)@. 
+data ProducedAnomaly = ProducedAnomaly + { kind :: !AnomalyKind + , targetHash :: !Text + , keyHash :: !Text + -- ^ owning catalog key — for joining back to apis.schema_catalog + , fieldPath :: !(Maybe Text) + -- ^ populated for AKField / AKFormat + } + deriving stock (Eq, Generic, Ord, Show) + deriving anyclass (NFData) + + +-- | Stable 8-char suffix for a field path. Keeps target_hash bounded +-- and gives the read path a deterministic way back to the field. +fieldHashSuffix :: Text -> Text +fieldHashSuffix path = T.take 8 (toXXHash path) + + +-- | Diff a (possibly absent) prior entry against the current one. +-- Order: endpoint > shape > field > format. Endpoint anomalies fire +-- only on the @HttpEndpoint@ key kind — non-HTTP keys still get shape / +-- field / format diffs, just not the "new endpoint" headline. +-- +-- >>> let path = "request.body.user.id" +-- >>> let fs1 = FieldStruct (HS.fromList [FTString]) (HS.fromList ["{uuid}"]) FCRequestBody False +-- >>> let fs2 = FieldStruct (HS.fromList [FTString, FTNumber]) (HS.fromList ["{uuid}"]) FCRequestBody False +-- >>> let mk fs = CatalogEntry emptyScope (Template HttpEndpoint (HM.fromList fs)) HM.empty HM.empty 1 t0 t0 True +-- >>> let new = mk [(path, fs1)] +-- >>> map (.kind) (diffAnomalies "kh" Nothing new) +-- [AKEndpoint,AKShape,AKField,AKFormat] +-- >>> map (.kind) (diffAnomalies "kh" (Just new) new) +-- [] +-- >>> -- type widened on existing field → AKShape (template hash changed) + AKFormat +-- >>> map (.kind) (diffAnomalies "kh" (Just new) (mk [(path, fs2)])) +-- [AKShape,AKFormat] +-- >>> -- new field added → AKShape + AKField + AKFormat +-- >>> let new2 = mk [(path, fs1), ("request.body.email", fs1)] +-- >>> sort (map (.kind) (diffAnomalies "kh" (Just new) new2)) +-- [AKShape,AKField,AKFormat] +diffAnomalies :: Text -> Maybe CatalogEntry -> CatalogEntry -> [ProducedAnomaly] +diffAnomalies kh priorM cur = + let priorFields = maybe HM.empty (.template.fields) priorM + curFields = cur.template.fields 
+ isNew = isNothing priorM + isHttp = cur.template.keyKind == HttpEndpoint + headline = + [ ProducedAnomaly AKEndpoint kh kh Nothing + | isNew, isHttp + ] + shapeChanged = case priorM of + Nothing -> True + Just p -> templateHash p.template /= templateHash cur.template + shape = + [ ProducedAnomaly AKShape (kh <> ":s:" <> T.take 8 (templateHash cur.template)) kh Nothing + | shapeChanged + ] + newPaths = HS.toList $ HS.difference (HS.fromList (HM.keys curFields)) (HS.fromList (HM.keys priorFields)) + fields = + [ ProducedAnomaly AKField (kh <> ":f:" <> fieldHashSuffix p) kh (Just p) + | p <- sort newPaths + ] + formatChanged path = + case (HM.lookup path priorFields, HM.lookup path curFields) of + (Just p, Just c) -> p.types /= c.types || p.formats /= c.formats + (Nothing, Just _) -> True + _ -> False + changedFormatPaths = filter formatChanged (HM.keys curFields) + formats = + [ ProducedAnomaly AKFormat (kh <> ":fmt:" <> fieldHashSuffix p) kh (Just p) + | p <- sort changedFormatPaths + ] + in headline <> shape <> fields <> formats + + -- --------------------------------------------------------------------------- -- Re-homed from the deleted "Models.Apis.Fields". Kept name-compatible so -- existing readers (Anomalies VM, SchemaCatalog adapter) work unchanged. diff --git a/src/Pkg/SchemaLearning/Worker.hs b/src/Pkg/SchemaLearning/Worker.hs index 4b9a925e3..0ccb3a87b 100644 --- a/src/Pkg/SchemaLearning/Worker.hs +++ b/src/Pkg/SchemaLearning/Worker.hs @@ -9,16 +9,17 @@ -- 'Catalog.templateHash') and 'CatalogRow's (per-project pointers). -- 3. Upserts templates first, then catalog rows, then re-derives the -- per-project summary doc. --- 4. Hands the newly-acknowledged template hashes back to the shard so +-- 4. Diffs each dirty entry against its prior catalog row (one batched +-- lookup) and inserts endpoint/shape/field/format anomalies into +-- @apis.anomalies@ + a deduped @NewAnomaly@ background job. Replaces +-- the legacy @new_anomaly_proc@ trigger fan-out. +-- 5. 
Hands the newly-acknowledged template hashes back to the shard so -- subsequent flushes can short-circuit unchanged-template upserts. --- --- Anomaly diff/produce is not yet wired here — the legacy --- @new_anomaly_proc@ trigger is being deprecated and the replacement --- belongs in a follow-up (see TODO in 'flushDirty'). module Pkg.SchemaLearning.Worker ( FlushResult (..), flushDirty, runSchemaFlusher, + buildAnomalyRows, ) where @@ -44,6 +45,8 @@ data FlushResult = FlushResult , catalogRowsWritten :: !Int , summariesUpdated :: !Int , dirtyKeys :: !Int + , anomaliesEmitted :: !Int + -- ^ rows actually inserted into apis.anomalies (post-dedup) } deriving stock (Eq, Generic, Show) deriving anyclass (NFData) @@ -51,6 +54,10 @@ data FlushResult = FlushResult -- | One flush pass over a single shard. Pure-Eff except for the -- 'atomicModifyIORef'' inside 'Hot.takeDirty'. +-- +-- Order matters: we diff against the prior catalog row /before/ upserting +-- so we don't see our own write back as the "prior". Anomaly inserts race- +-- safely on the @(project_id, target_hash)@ unique index. flushDirty :: DB es => IORef SchemaShardState @@ -58,9 +65,15 @@ flushDirty flushDirty ref = do dirty <- liftIO $ Hot.takeDirty ref if V.null dirty - then pure FlushResult{templatesWritten = 0, catalogRowsWritten = 0, summariesUpdated = 0, dirtyKeys = 0} + then pure FlushResult{templatesWritten = 0, catalogRowsWritten = 0, summariesUpdated = 0, dirtyKeys = 0, anomaliesEmitted = 0} else do now <- liftIO getCurrentTime + -- 1. Anomaly diff (must precede upserts). + priors <- SC.getByKeysBatch (V.map (\(k, _) -> (k.projectId, k.keyHash)) dirty) + let anomalyRows = buildAnomalyRows priors dirty + anomaliesN <- SC.insertAnomalies anomalyRows + SC.enqueueAnomalyJobs anomalyRows + -- 2. Upserts. let templateRows = dedupTemplates $ V.map (templateRowOf now . 
snd) dirty catalogRows = V.map (uncurry catalogRowOf) dirty touchedProjects = HS.fromList [k.projectId | (k, _) <- V.toList dirty] @@ -69,20 +82,36 @@ flushDirty ref = do summariesN <- regenerateSummaries touchedProjects let newHashes = HS.fromList $ V.toList $ V.map (.templateHash) templateRows liftIO $ Hot.pruneEvicted ref HS.empty newHashes - -- TODO(schema-anomalies): diff dirty entries vs prior catalog rows - -- (stale @apis.shapes@/@apis.fields@ triggers no longer fire). Emit - -- per-(project, key_hash) endpoint/shape/field/format anomalies into - -- @apis.anomalies@ + @background_jobs@ so the legacy notification - -- pipeline keeps working. pure FlushResult { templatesWritten = V.length templateRows , catalogRowsWritten = V.length catalogRows , summariesUpdated = summariesN , dirtyKeys = V.length dirty + , anomaliesEmitted = fromIntegral anomaliesN } +-- | Build the anomaly insert rows for a dirty batch. Pure so it can be +-- doctested independently of the DB. +buildAnomalyRows + :: HM.HashMap (Projects.ProjectId, Text) CatalogEntry + -> V.Vector (SchemaKey, CatalogEntry) + -> V.Vector SC.AnomalyInsertRow +buildAnomalyRows priors dirty = + V.fromList + [ SC.AnomalyInsertRow{projectId = k.projectId, anomalyType = kindLabel pa.kind, targetHash = pa.targetHash} + | (k, e) <- V.toList dirty + , pa <- Catalog.diffAnomalies k.keyHash (HM.lookup (k.projectId, k.keyHash) priors) e + ] + where + kindLabel = \case + Catalog.AKEndpoint -> "endpoint" + Catalog.AKShape -> "shape" + Catalog.AKField -> "field" + Catalog.AKFormat -> "format" + + templateRowOf :: UTCTime -> CatalogEntry -> SC.TemplateRow templateRowOf now e = SC.TemplateRow diff --git a/test/integration/SchemaLearningSpec.hs b/test/integration/SchemaLearningSpec.hs new file mode 100644 index 000000000..13062f67c --- /dev/null +++ b/test/integration/SchemaLearningSpec.hs @@ -0,0 +1,222 @@ +-- | End-to-end coverage for the in-process schema-learning pipeline: +-- * 'observeSpans' populates a shard +-- * 
'flushDirty' writes templates / catalog rows / summary +-- * The anomaly producer emits per-(project, key_hash) anomalies + +-- dedupes across flushes + enqueues a single coalesced @NewAnomaly@ job +-- +-- Mirrors the production fiber wiring from 'BackgroundJobs.runSchemaFlusherFiber' +-- but calls 'flushDirty' directly so we can assert on each pass. +module SchemaLearningSpec (spec) where + +import Data.Aeson qualified as AE +import Data.HashMap.Strict qualified as HM +import Data.HashSet qualified as HS +import Data.Time (UTCTime) +import Data.UUID qualified as UUID +import Data.Vector qualified as V +import Database.PostgreSQL.Entity.DBT (withPool) +import Database.PostgreSQL.Entity.DBT qualified as DBT +import Database.PostgreSQL.Simple (Only (..)) +import Database.PostgreSQL.Simple.SqlQQ (sql) +import Database.PostgreSQL.Simple.ToRow qualified +import Database.PostgreSQL.Simple.Types qualified +import Models.Projects.Projects qualified as Projects +import Pkg.DeriveUtils (UUIDId (..)) +import Pkg.SchemaLearning.Catalog qualified as Catalog +import Pkg.SchemaLearning.Hot qualified as Hot +import Pkg.SchemaLearning.Worker qualified as Worker +import Pkg.TestUtils (TestResources (..), frozenTime, runHasqlEffect, withTestResources) +import Relude +import Test.Hspec (Spec, aroundAll, describe, it, shouldBe, shouldReturn, shouldSatisfy) +import Utils (toXXHash) + + +pid :: Projects.ProjectId +pid = UUIDId UUID.nil + + +-- | Per-test reset: schema-learning + anomaly + queued-job state for this project. +-- Order matters: catalog → summary → template (FK), then anomalies + queued jobs. +clearAll :: TestResources -> IO () +clearAll tr = do + let exec :: forall ps. (Database.PostgreSQL.Simple.ToRow.ToRow ps) => Database.PostgreSQL.Simple.Types.Query -> ps -> IO () + exec q ps = void $ withPool tr.trPool $ DBT.execute q ps + exec [sql| DELETE FROM apis.schema_catalog WHERE project_id = ? |] (Only pid) + exec [sql| DELETE FROM apis.schema_summary WHERE project_id = ? 
|] (Only pid) + exec + [sql| DELETE FROM apis.schema_template + WHERE NOT EXISTS (SELECT 1 FROM apis.schema_catalog + WHERE template_hash = apis.schema_template.template_hash) |] + () + exec [sql| DELETE FROM apis.anomalies WHERE project_id = ? |] (Only pid) + exec + [sql| DELETE FROM background_jobs + WHERE payload->>'tag' = 'NewAnomaly' AND payload->>'projectId' = ?::text |] + (Only pid) + + +-- | Stable HTTP key. Mirrors the keying in 'ProcessMessage.extractObservation'. +keyHashFor :: Text -> Text -> Text -> Text +keyHashFor host method path = toXXHash (pid.toText <> host <> method <> path) + + +httpScope :: Text -> Text -> Text -> Catalog.Scope +httpScope host method path = + Catalog.Scope + { Catalog.service = Just "test-svc" + , Catalog.spanName = Just "GET /" + , Catalog.kind = Just "server" + , Catalog.host = Just host + , Catalog.method = Just method + , Catalog.urlPath = Just path + , Catalog.statusCodes = V.singleton 200 + } + + +-- | Build one ObservationInput for a synthetic HTTP span. 'walk' is the +-- (path, values, category) triples the leaf walker would produce. +mkObs + :: UTCTime + -> Text + -> Text + -> Text + -> [(Text, AE.Value, Catalog.FieldCategoryEnum)] + -> Hot.ObservationInput +mkObs ts host method path leaves = + Hot.ObservationInput + { keyKind = Catalog.HttpEndpoint + , keyHash = keyHashFor host method path + , scope = httpScope host method path + , walk = [(p, V.singleton (v, Nothing), c) | (p, v, c) <- leaves] + , timestamp = ts + } + + +-- | Convenience: observe + flush + return the result. 
+observeAndFlush :: TestResources -> IORef Hot.SchemaShardState -> [Hot.ObservationInput] -> IO Worker.FlushResult +observeAndFlush tr ref obs = do + Hot.observeSpans ref Hot.defaultPolicy pid (V.fromList obs) + runHasqlEffect tr (Worker.flushDirty ref) + + +countAnomalies :: TestResources -> Text -> IO Int +countAnomalies tr atype = do + rows :: V.Vector (Only Int) <- + withPool tr.trPool $ DBT.query + [sql| SELECT COUNT(*)::int FROM apis.anomalies + WHERE project_id = ? AND anomaly_type = ?::apis.anomaly_type |] + (pid, atype) + pure $ maybe 0 (\(Only n) -> n) (rows V.!? 0) + + +countAllAnomalies :: TestResources -> IO Int +countAllAnomalies tr = do + rows :: V.Vector (Only Int) <- + withPool tr.trPool $ DBT.query + [sql| SELECT COUNT(*)::int FROM apis.anomalies WHERE project_id = ? |] + (Only pid) + pure $ maybe 0 (\(Only n) -> n) (rows V.!? 0) + + +-- | Number of queued NewAnomaly jobs for this project. +countAnomalyJobs :: TestResources -> IO Int +countAnomalyJobs tr = do + rows :: V.Vector (Only Int) <- + withPool tr.trPool $ DBT.query + [sql| SELECT COUNT(*)::int FROM background_jobs + WHERE payload->>'tag' = 'NewAnomaly' + AND payload->>'projectId' = ?::text |] + (Only pid) + pure $ maybe 0 (\(Only n) -> n) (rows V.!? 
0) + + +spec :: Spec +spec = aroundAll withTestResources $ + describe "Schema Learning – flush + anomaly producer" $ do + it "first flush emits endpoint+shape+field+format anomalies and enqueues one job per type" $ \tr -> do + clearAll tr + ref <- newIORef Hot.emptySchemaShardState + let obs = mkObs frozenTime "api.example.com" "GET" "/users" + [ ("request.body.user.id", AE.String "abc", Catalog.FCRequestBody) + , ("request.body.user.email", AE.String "x@y", Catalog.FCRequestBody) + ] + r <- observeAndFlush tr ref [obs] + r.dirtyKeys `shouldBe` 1 + r.catalogRowsWritten `shouldBe` 1 + -- 1 endpoint + 1 shape + 2 field (user.id, user.email) + 2 format = 6 + r.anomaliesEmitted `shouldBe` 6 + countAllAnomalies tr `shouldReturn` 6 + countAnomalies tr "endpoint" `shouldReturn` 1 + countAnomalies tr "shape" `shouldReturn` 1 + countAnomalies tr "field" `shouldReturn` 2 + countAnomalies tr "format" `shouldReturn` 2 + -- One NewAnomaly job per (project, anomaly_type): endpoint, shape, field, format — 4 jobs total. + countAnomalyJobs tr `shouldReturn` 4 + + it "second flush of identical observations dedupes — no new anomalies, no extra jobs" $ \tr -> do + clearAll tr + ref <- newIORef Hot.emptySchemaShardState + let obs = mkObs frozenTime "api.example.com" "GET" "/users" + [("request.body.id", AE.String "abc", Catalog.FCRequestBody)] + _ <- observeAndFlush tr ref [obs] + before <- countAllAnomalies tr + jobsBefore <- countAnomalyJobs tr + _ <- observeAndFlush tr ref [obs] + after <- countAllAnomalies tr + jobsAfter <- countAnomalyJobs tr + after `shouldBe` before -- ON CONFLICT DO NOTHING on (project_id, target_hash) + jobsAfter `shouldBe` jobsBefore -- queued job coalesces in place + + it "adding a new field path emits one shape + one field + one format anomaly (no new endpoint)" $ \tr -> do + clearAll tr + ref <- newIORef Hot.emptySchemaShardState + let host = "api.example.com"; method = "POST"; path = "/orders" + _ <- observeAndFlush tr ref + [mkObs frozenTime host method path + [("request.body.id",
AE.String "x", Catalog.FCRequestBody)]] + before <- countAllAnomalies tr + _ <- observeAndFlush tr ref + [mkObs frozenTime host method path + [ ("request.body.id", AE.String "x", Catalog.FCRequestBody) + , ("request.body.email", AE.String "a@b", Catalog.FCRequestBody) + ]] + countAnomalies tr "endpoint" `shouldReturn` 1 -- unchanged: both flushes use the same host/method/path + shapeN <- countAnomalies tr "shape" + fieldN <- countAnomalies tr "field" + formatN <- countAnomalies tr "format" + shapeN `shouldBe` 2 -- one from initial flush + one from added field + fieldN `shouldBe` 2 -- id + email + formatN `shouldBe` 2 -- id + email + after <- countAllAnomalies tr + after `shouldSatisfy` (> before) + + it "widening a field type emits a shape + format anomaly only" $ \tr -> do + clearAll tr + ref <- newIORef Hot.emptySchemaShardState + let host = "api.example.com"; method = "GET"; path = "/widen" + -- First: id is string-only. + _ <- observeAndFlush tr ref + [mkObs frozenTime host method path + [("response.body.id", AE.String "abc", Catalog.FCResponseBody)]] + before <- countAllAnomalies tr + -- Now widen with a number occurrence — flushDirty re-walks past learnFullThreshold + -- but for the very first batch the walk runs unconditionally, so a single + -- widened observation suffices. + _ <- observeAndFlush tr ref + [mkObs frozenTime host method path + [("response.body.id", AE.Number 42, Catalog.FCResponseBody)]] + after <- countAllAnomalies tr + -- Endpoint already exists and no new field path appears, so only shape/format anomalies can be added; the check below accepts 1 or 2 new rows.
+ (after - before) `shouldSatisfy` (`elem` [1, 2 :: Int]) + countAnomalies tr "endpoint" `shouldReturn` 1 + countAnomalies tr "field" `shouldReturn` 1 -- only the original response.body.id path + + it "buildAnomalyRows produces no rows when prior == current" $ \_tr -> do + let kh = "khTest" + fs = Catalog.FieldStruct (HS.fromList [Catalog.FTString]) (HS.fromList ["text"]) Catalog.FCRequestBody False + tmpl = Catalog.Template Catalog.HttpEndpoint (HM.fromList [("user.id", fs)]) + entry = Catalog.CatalogEntry (httpScope "h" "GET" "/p") tmpl HM.empty HM.empty 1 frozenTime frozenTime True + k = Hot.SchemaKey pid kh + priors = HM.fromList [((pid, kh), entry)] + rows = Worker.buildAnomalyRows priors (V.singleton (k, entry)) + V.length rows `shouldBe` 0 From fcff1d4617d7d8165fd189fe61186e5d222464bc Mon Sep 17 00:00:00 2001 From: Anthony Alaribe Date: Sun, 10 May 2026 14:03:36 +0200 Subject: [PATCH 5/6] fix(schema): drop rf.field_category from projectCacheById projects.redacted_fields.field_category was dropped via CASCADE in 0090 (the apis.field_category enum it depended on was removed). The hot-path projectCache rebuild query still referenced it, throwing "column rf.field_category does not exist" on every ingestion batch and killing prod again. Replaces it with an empty middle segment so the '<>'-separated string format stays compatible with any downstream split.
--- src/Models/Projects/Projects.hs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Models/Projects/Projects.hs b/src/Models/Projects/Projects.hs index d0753ab77..7477c85b3 100644 --- a/src/Models/Projects/Projects.hs +++ b/src/Models/Projects/Projects.hs @@ -392,7 +392,10 @@ projectCacheById pid = do FROM apis.endpoints WHERE project_id = #{pid} AND canonical_path IS NOT NULL ) canonical_paths from - (select e.host hosts, e.hash endpoint_hashes, concat(rf.endpoint_hash,'<>', rf.field_category,'<>', rf.path) paths + -- field_category column was dropped by the 0090 cascade (apis.field_category + -- enum was the type). The redact-list format keeps the '<>' separators so + -- consumers that split on them still see three segments. + (select e.host hosts, e.hash endpoint_hashes, concat(rf.endpoint_hash,'<>','<>', rf.path) paths from apis.endpoints e left join projects.redacted_fields rf ON rf.project_id = e.project_id where e.project_id = #{pid} From 1a290c772b38fc60c84100124b26d09ae15a0ba9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 10 May 2026 12:04:01 +0000 Subject: [PATCH 6/6] Auto-format code with fourmolu --- src/Pkg/SchemaLearning/Catalog.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Pkg/SchemaLearning/Catalog.hs b/src/Pkg/SchemaLearning/Catalog.hs index 715e9841b..f389fc147 100644 --- a/src/Pkg/SchemaLearning/Catalog.hs +++ b/src/Pkg/SchemaLearning/Catalog.hs @@ -467,7 +467,8 @@ diffAnomalies kh priorM cur = isHttp = cur.template.keyKind == HttpEndpoint headline = [ ProducedAnomaly AKEndpoint kh kh Nothing - | isNew, isHttp + | isNew + , isHttp ] shapeChanged = case priorM of Nothing -> True