feat(analyze): add --reanalyze flag for scoped re-analysis

leifericf · claude · leifericf · commit 748ce1ba8e17 · 2026-03-27T23:52:25.000+01:00
Allow users to explicitly re-analyze files that already have semantic
metadata, without manually retracting Datomic attributes. Supports four
scopes: all, prompt-changed, model-changed, and stale (files modified
by commits since their last analysis). Available in both CLI and MCP.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -22,7 +22,7 @@ This project has its own MCP server (`noumenon`) that provides a knowledge graph
 | `noumenon_ask` | Ask a natural-language question — AI-powered iterative querying |
 | `noumenon_import` | Import git history and file structure (idempotent — safe to re-run) |
 | `noumenon_update` | Sync knowledge graph with latest git state (import + enrich; pass `analyze=true` for LLM analysis) |
-| `noumenon_analyze` | Run LLM analysis on files not yet analyzed — enriches the graph with semantic metadata |
+| `noumenon_analyze` | Run LLM analysis on files — by default only unanalyzed; pass `reanalyze` for re-analysis (all, prompt-changed, model-changed, stale) |
 | `noumenon_enrich` | Extract cross-file import/dependency graph deterministically (no LLM calls) |
 | `noumenon_list_databases` | List all noumenon databases with entity counts, pipeline stages, and cost |
 | `noumenon_digest` | Run the full pipeline: import, enrich, analyze, benchmark (each step idempotent, skippable) |
diff --git a/README.md b/README.md
@@ -146,7 +146,7 @@ The CLI and [MCP](https://modelcontextprotocol.io) server expose the same capabi
 | Command | CLI | MCP tool | Description |
 |---|---|---|---|
 | Import | `import <path>` | `noumenon_import` | Import git history and file structure |
-| Analyze | `analyze <path>` | `noumenon_analyze` | Enrich files with LLM semantic metadata |
+| Analyze | `analyze <path>` | `noumenon_analyze` | Enrich files with LLM semantic metadata (`--reanalyze` for re-analysis) |
 | Enrich | `enrich <path>` | `noumenon_enrich` | Extract cross-file import graph (no LLM) |
 | Update | `update <path>` | `noumenon_update` | Sync knowledge graph with latest git state |
 | Digest | `digest <path>` | `noumenon_digest` | Run full pipeline: import, enrich, analyze, benchmark |
diff --git a/src/noumenon/analyze.clj b/src/noumenon/analyze.clj
@@ -306,6 +306,64 @@
       (log! (str "Skipping " (count sensitive) " sensitive file(s) from analysis")))
     (sort-by :file/path safe)))
 
+(def ^:private valid-reanalyze-scopes
+  #{:all :prompt-changed :model-changed :stale}) ; scope keywords accepted by `files-for-reanalysis` (checked in its :pre)
+
+(defn files-for-reanalysis
+  "Return already-analyzed files (those with a :sem/summary) that match `scope`, sorted by path.
+   `opts` may include :prompt-hash (for :prompt-changed) and :model-id (for :model-changed).
+   Sensitive paths are logged and dropped. Returns [{:file/path ... :file/lang ...}], same shape as `files-needing-analysis`."
+  [db scope opts]
+  {:pre [(valid-reanalyze-scopes scope)]}
+  (let [raw (case scope
+              :all
+              (d/q '[:find ?path ?lang
+                     :where
+                     [?e :file/path ?path]
+                     [?e :file/lang ?lang]
+                     [?e :sem/summary _]]
+                   db)
+
+              :prompt-changed
+              (d/q '[:find ?path ?lang
+                     :in $ ?current-hash
+                     :where
+                     [?e :file/path ?path]
+                     [?e :file/lang ?lang]
+                     [?e :sem/summary _ ?tx] ; ?tx: transaction that asserted the summary
+                     [?tx :prov/prompt-hash ?h]
+                     [(not= ?h ?current-hash)]] ; keep files whose recorded hash differs from the current one
+                   db (:prompt-hash opts))
+
+              :model-changed
+              (d/q '[:find ?path ?lang
+                     :in $ ?current-model
+                     :where
+                     [?e :file/path ?path]
+                     [?e :file/lang ?lang]
+                     [?e :sem/summary _ ?tx] ; ?tx: transaction that asserted the summary
+                     [?tx :prov/model-version ?m]
+                     [(not= ?m ?current-model)]] ; keep files whose recorded model differs from the current one
+                   db (:model-id opts))
+
+              :stale
+              (d/q '[:find ?path ?lang
+                     :where
+                     [?e :file/path ?path]
+                     [?e :file/lang ?lang]
+                     [?e :sem/summary _ ?tx]
+                     [?tx :prov/analyzed-at ?at]
+                     [?c :commit/changed-files ?e]
+                     [?c :commit/committed-at ?ct]
+                     [(> ?ct ?at)]] ; some commit touched the file after it was analyzed
+                   db))
+        candidates (mapv (fn [[path lang]] {:file/path path :file/lang lang}) raw)
+        {sensitive true safe false} (group-by #(boolean (files/sensitive-path? (:file/path %)))
+                                              candidates)] ; `boolean`: the destructuring only matches the exact keys true/false
+    (when (seq sensitive)
+      (log! (str "Skipping " (count sensitive) " sensitive file(s) from re-analysis")))
+    (sort-by :file/path safe)))
+
 ;; --- Orchestration ---
 
 (defn repo-name
diff --git a/src/noumenon/cli.clj b/src/noumenon/cli.clj
@@ -111,9 +111,14 @@
   [specs valid-set]
   (mapv #(if (= "--provider" (:flag %)) (assoc % :valid valid-set) %) specs))
 
+(def ^:private reanalyze-flag
+  {:flag "--reanalyze" :key :reanalyze :parse :string ; parsed as a plain string; no :valid set is attached here
+   :desc "Re-analyze files: all, prompt-changed, model-changed, stale"
+   :error-missing :missing-reanalyze-value})
+
 (def ^:private analyze-flags
   (vec (concat [model-flag (assoc provider-flag :valid all-valid-providers)
-                max-files-flag db-dir-flag]
+                max-files-flag reanalyze-flag db-dir-flag]
                verbose-flags concurrency-flags)))
 
 ;; --- Declarative command specs ---
@@ -234,7 +239,7 @@
    "analyze"      {:spec analyze-command-spec
                    :summary "Enrich imported files with LLM-driven semantic analysis"
                    :usage "analyze [options] <repo-path>"
-                   :epilog "Sensitive files (.env, *.pem, credentials, SSH keys, etc.) are\nautomatically excluded — their contents are never sent to the LLM."}
+                   :epilog "Sensitive files (.env, *.pem, credentials, SSH keys, etc.) are\nautomatically excluded — their contents are never sent to the LLM.\n\nRe-analysis scopes (--reanalyze):\n  all              Re-analyze every file\n  prompt-changed   Files analyzed with a different prompt template\n  model-changed    Files analyzed with a different model\n  stale            Files modified by commits since their last analysis"}
    "enrich"       {:spec enrich-command-spec
                    :summary "Extract cross-file import graph deterministically"
                    :usage "enrich [options] <repo-path>"
diff --git a/src/noumenon/main.clj b/src/noumenon/main.clj
@@ -103,9 +103,29 @@
                         {:db-path    (db-path ctx)
                          :next-step  (str cli/program-name " enrich " repo-path)})}))))
 
+(def ^:private valid-reanalyze-scopes
+  #{"all" "prompt-changed" "model-changed" "stale"}) ; string forms, compared against the raw --reanalyze value before it is keywordized
+
+(defn- prepare-reanalysis!
+  "When `reanalyze` names a scope, retract the analysis attrs of every file
+   matching that scope and log how many were marked.
+   Returns the number of files marked, or nil when no scope is given."
+  [conn db reanalyze {:keys [prompt-hash model-id]}]
+  (when reanalyze
+    (let [opts  {:prompt-hash prompt-hash :model-id model-id}
+          paths (mapv :file/path
+                      (analyze/files-for-reanalysis db (keyword reanalyze) opts))
+          n     (if (empty? paths) 0 (sync/retract-analysis! conn paths))]
+      (log! (str "Marked " n " file(s) for re-analysis (scope: " reanalyze ")"))
+      n)))
+
 (defn do-analyze
   "Run the analyze subcommand. Returns {:exit n :result map-or-nil}."
-  [{:keys [repo-path model provider concurrency min-delay max-files] :as opts}]
+  [{:keys [repo-path model provider concurrency min-delay max-files reanalyze] :as opts}]
+  (when (and reanalyze (not (valid-reanalyze-scopes reanalyze)))
+    (print-error! (str "Invalid --reanalyze scope: " reanalyze
+                       ". Must be one of: all, prompt-changed, model-changed, stale"))
+    (System/exit 1))
   (with-valid-repo
     opts
     (fn [ctx]
@@ -115,15 +135,18 @@
           (fn [{:keys [conn]}]
             (let [{:keys [prompt-fn model-id]}
                   (llm/wrap-as-prompt-fn-from-opts {:provider provider :model model})
-                  result (analyze/analyze-repo! conn repo-path prompt-fn
-                                                (cond-> {:model-id     model-id
-                                                         :concurrency  (or concurrency 3)
-                                                         :min-delay-ms (or min-delay 0)}
-                                                  max-files (assoc :max-files max-files)))]
-              (log! (str "Next: run '" cli/program-name " query <query-name> " repo-path
-                         "' or '" cli/program-name " ask -q \"...\" " repo-path
-                         "' to explore the knowledge graph."))
-              {:exit 0 :result result})))
+                  prompt-hash (analyze/prompt-hash (:template (analyze/load-prompt-template)))]
+              (prepare-reanalysis! conn (d/db conn) reanalyze
+                                   {:prompt-hash prompt-hash :model-id model-id})
+              (let [result (analyze/analyze-repo! conn repo-path prompt-fn
+                                                  (cond-> {:model-id     model-id
+                                                           :concurrency  (or concurrency 3)
+                                                           :min-delay-ms (or min-delay 0)}
+                                                    max-files (assoc :max-files max-files)))]
+                (log! (str "Next: run '" cli/program-name " query <query-name> " repo-path
+                           "' or '" cli/program-name " ask -q \"...\" " repo-path
+                           "' to explore the knowledge graph."))
+                {:exit 0 :result result}))))
         (catch clojure.lang.ExceptionInfo e
           (print-error! (.getMessage e))
           (when-let [help (cli/format-subcommand-help "analyze")]
diff --git a/src/noumenon/mcp.clj b/src/noumenon/mcp.clj
@@ -148,7 +148,7 @@
                                       "continue_from" {:type "string" :description "Session ID from a budget-exhausted run — resumes the agent from where it left off"}})
                   :required ["question" "repo_path"]}}
    {:name "noumenon_analyze"
-    :description "Run LLM analysis on repository files to enrich the knowledge graph with semantic metadata. Only analyzes files not yet analyzed. Requires a prior import."
+    :description "Run LLM analysis on repository files to enrich the knowledge graph with semantic metadata. By default only analyzes files not yet analyzed. Pass reanalyze to re-analyze files: all, prompt-changed, model-changed, or stale. Requires a prior import."
     :inputSchema {:type "object"
                   :properties (merge repo-path-prop
                                      {"provider" {:type "string"
@@ -158,7 +158,9 @@
                                       "concurrency" {:type "integer"
                                                      :description "Number of concurrent LLM calls (default: 3, max: 20)"}
                                       "max_files" {:type "integer"
-                                                   :description "Stop after analyzing N files (useful for sampling)"}})
+                                                   :description "Stop after analyzing N files (useful for sampling)"}
+                                      "reanalyze" {:type "string"
+                                                   :description "Re-analyze scope: all, prompt-changed, model-changed, stale (default: only unanalyzed files)"}})
                   :required ["repo_path"]}}
    {:name "noumenon_enrich"
     :description "Extract cross-file import graph deterministically. No LLM calls — uses language-specific parsers. Requires a prior import."
@@ -358,29 +360,53 @@
           (tool-result (or answer
                            (str "No answer found (status: " (name (:status result)) ")"))))))))
 
+(def ^:private valid-reanalyze-scopes
+  #{"all" "prompt-changed" "model-changed" "stale"}) ; string forms, checked against the raw "reanalyze" tool argument
+
+(defn- prepare-reanalysis!
+  "Retract analysis attrs of the files selected by the `reanalyze` scope string and
+   log how many were marked. Returns that count, or nil when no scope was supplied."
+  [conn db reanalyze {:keys [prompt-hash model-id]}]
+  (when reanalyze
+    (let [selected (analyze/files-for-reanalysis db (keyword reanalyze)
+                                                 {:prompt-hash prompt-hash
+                                                  :model-id    model-id})
+          paths    (mapv :file/path selected)
+          marked   (if (empty? paths) 0 (sync/retract-analysis! conn paths))]
+      (log! (str "Marked " marked " file(s) for re-analysis (scope: " reanalyze ")"))
+      marked)))
+
 (defn- handle-analyze [args defaults]
   (validate-llm-inputs! args)
-  (with-conn args defaults
-    (fn [{:keys [conn repo-path]}]
-      (let [{:keys [prompt-fn model-id]}
-            (llm/wrap-as-prompt-fn-from-opts {:provider (or (args "provider") (:provider defaults))
-                                              :model    (or (args "model") (:model defaults))})
-            concurrency (min (or (args "concurrency") 3) 20)
-            max-files   (args "max_files")
-            result      (analyze/analyze-repo! conn repo-path prompt-fn
-                                               (cond-> {:model-id    model-id
-                                                        :concurrency concurrency}
-                                                 max-files (assoc :max-files max-files)))]
-        (tool-result (str "Analysis complete. "
-                          (:files-analyzed result 0) " files analyzed"
-                          (when (pos? (:files-parse-errored result 0))
-                            (str ", " (:files-parse-errored result 0) " parse errors"))
-                          (when (pos? (:files-errored result 0))
-                            (str ", " (:files-errored result 0) " errors"))
-                          ". " (get-in result [:total-usage :input-tokens] 0)
-                          " in / " (get-in result [:total-usage :output-tokens] 0) " out tokens"
-                          (when-let [c (get-in result [:total-usage :cost-usd])]
-                            (when (pos? c) (str " ($" (format "%.2f" c) ")")))))))))
+  (let [reanalyze (args "reanalyze")] ; optional scope string from the tool call; nil when absent
+    (when (and reanalyze (not (valid-reanalyze-scopes reanalyze))) ; reject unknown scopes before `with-conn` runs
+      (throw (ex-info (str "Invalid reanalyze scope: " reanalyze
+                           ". Must be one of: all, prompt-changed, model-changed, stale")
+                      {:scope reanalyze})))
+    (with-conn args defaults
+      (fn [{:keys [conn repo-path]}]
+        (let [{:keys [prompt-fn model-id]}
+              (llm/wrap-as-prompt-fn-from-opts {:provider (or (args "provider") (:provider defaults))
+                                                :model    (or (args "model") (:model defaults))})
+              prompt-hash (analyze/prompt-hash (:template (analyze/load-prompt-template)))] ; hash of the current prompt template
+          (prepare-reanalysis! conn (d/db conn) reanalyze ; returns nil without retracting when reanalyze is nil
+                               {:prompt-hash prompt-hash :model-id model-id})
+          (let [concurrency (min (or (args "concurrency") 3) 20) ; default 3, capped at 20
+                max-files   (args "max_files")
+                result      (analyze/analyze-repo! conn repo-path prompt-fn
+                                                   (cond-> {:model-id    model-id
+                                                            :concurrency concurrency}
+                                                     max-files (assoc :max-files max-files)))]
+            (tool-result (str "Analysis complete. "
+                              (:files-analyzed result 0) " files analyzed"
+                              (when (pos? (:files-parse-errored result 0))
+                                (str ", " (:files-parse-errored result 0) " parse errors"))
+                              (when (pos? (:files-errored result 0))
+                                (str ", " (:files-errored result 0) " errors"))
+                              ". " (get-in result [:total-usage :input-tokens] 0)
+                              " in / " (get-in result [:total-usage :output-tokens] 0) " out tokens"
+                              (when-let [c (get-in result [:total-usage :cost-usd])]
+                                (when (pos? c) (str " ($" (format "%.2f" c) ")")))))))))))
 
 (defn- handle-enrich [args defaults]
   (with-conn args defaults
diff --git a/src/noumenon/sync.clj b/src/noumenon/sync.clj
@@ -70,14 +70,17 @@
 
 ;; --- Retraction ---
 
-(def ^:private mutable-file-attrs
-  "Attributes to retract on modified/deleted files so the pipeline re-processes them."
-  [:file/size :file/lines :file/imports
-   :sem/summary :sem/purpose :sem/tags :sem/complexity
+(def ^:private analysis-file-attrs
+  "Analysis attributes (:sem/*, :arch/*, :prov/confidence) to retract when re-analyzing; excludes the import/enrich attrs :file/size, :file/lines and :file/imports."
+  [:sem/summary :sem/purpose :sem/tags :sem/complexity
    :sem/patterns :sem/category :sem/dependencies
    :arch/layer :arch/subsystem
    :prov/confidence])
 
+(def ^:private mutable-file-attrs
+  "Attributes to retract on modified/deleted files so the pipeline re-processes them."
+  (into [:file/size :file/lines :file/imports] analysis-file-attrs)) ; the three :file/* attrs followed by every analysis attr
+
 (defn- find-file-eid
   "Look up a file entity ID by path. Returns nil if not found."
   [db path]
@@ -88,11 +91,11 @@
   [v]
   (if (map? v) (:db/id v) v))
 
-(defn- retract-file-attrs
-  "Build retraction tx-data for mutable attributes on a file entity."
-  [db eid]
-  (let [entity (d/pull db mutable-file-attrs eid)]
-    (->> mutable-file-attrs
+(defn- retract-attrs
+  "Build retraction tx-data for the given attributes on a file entity."
+  [db eid attrs]
+  (let [entity (d/pull db attrs eid)]
+    (->> attrs
          (mapcat (fn [attr]
                    (let [v (get entity attr)]
                      (cond
@@ -116,7 +119,26 @@
           results (->> paths
                        (keep (fn [path]
                                (when-let [eid (find-file-eid db path)]
-                                 (let [tx (into (retract-file-attrs db eid)
+                                 (let [tx (into (retract-attrs db eid mutable-file-attrs)
+                                                (retract-code-segments db eid))]
+                                   (when (seq tx) tx)))))
+                       vec)
+          tx-data (into [] cat results)]
+      (when (seq tx-data)
+        (d/transact conn {:tx-data tx-data}))
+      (count results))))
+
+(defn retract-analysis!
+  "Retract analysis attributes and code segments for the given file paths.
+   Does not retract import/enrich attrs (:file/size, :file/lines, :file/imports).
+   Returns count of files actually retracted."
+  [conn paths]
+  (when (seq paths)
+    (let [db      (d/db conn)
+          results (->> paths
+                       (keep (fn [path]
+                               (when-let [eid (find-file-eid db path)]
+                                 (let [tx (into (retract-attrs db eid analysis-file-attrs)
                                                 (retract-code-segments db eid))]
                                    (when (seq tx) tx)))))
                        vec)