system-prompts-forensics/data/analysis/vscode-codex.chat.analysis.yaml at main · rmax-ai/system-prompts-forensics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
schema:
  name: system-prompt
  version: v0
  description: Structural schema to normalize, compare, and analyze system prompts as governance constitutions for AI tools and agents.

metadata:
  tool:
    name: Codex CLI
    vendor: OpenAI
    channel: cli
  version:
    tool_version: unknown
    model_family: gpt-5.x
  capture:
    method: mitmproxy
    timestamp: "2026-01-01T22:00:47Z"
    environment:
      os: Darwin
      arch: arm64
      runtime: Python 3.12.5
    artifact_hash: c8996b7224c8e5778ef19c42c42c51b342469e14a5e90722b890a1b814945398
  notes: Captured VSCode/Codex chat invocation payload including system instructions, tool declarations, and environment context.

layers:
  identity:
    role: coding agent
    persona:
      traits:
        - concise
        - collaborative
        - factual
        - scan-friendly output
        - coding-teammate demeanor
      tone: friendly, direct, concise
    self_description: Codex based on GPT-5 running as a coding agent in Codex CLI on a user's computer
    alignment_claims:
      - helpful coding teammate
      - prioritize correctness and risk identification in reviews

  authority:
    allowed_actions:
      - run shell commands via provided tool (subject to sandbox/approvals)
      - read files (within sandbox)
      - edit files (only when sandbox permits or with escalation)
      - request user approval for escalated commands (when policy allows)
      - create/update a task plan via planning tool (selectively)
      - use MCP resources as context
    forbidden_actions:
      - revert user changes not made by agent unless explicitly requested
      - amend git commits unless explicitly requested
      - use destructive git commands (git reset --hard, git checkout --) unless specifically requested or approved
      - proceed after detecting unexpected changes without asking user how to proceed
      - ask for approvals when approval_policy is never
    conditional_actions:
      - condition: approval_policy == on-request and sandboxing blocks an important command
        allowed:
          - rerun command with escalation request using sandbox_permissions=require_escalated and 1-sentence justification
        forbidden:
          - message user before issuing the approval request for that command
      - condition: sandbox_mode == read-only
        allowed:
          - run read-only commands without escalation (subject to harness allowlist)
        forbidden:
          - run non-read commands without requesting approval/escalation
      - condition: network_access == restricted
        allowed:
          - request approval/escalation for network-requiring commands
        forbidden:
          - access network without approval/escalation
      - condition: user explicitly requests destructive action or approves it
        allowed:
          - run potentially destructive commands (e.g., rm, git reset)
        forbidden: []
    escalation:
      allowed: true
      targets:
        - user
        - tool
    final_decision_maker: user

  scope:
    inputs_visible:
      - system instructions (Codex CLI governance)
      - user messages (including environment_context and IDE context)
      - tool schemas/descriptions
      - current working directory path
      - open tabs list (user-provided)
      - AGENTS.md placeholder content (no substantive instructions provided)
    outputs_allowed:
      - plain text responses
      - code snippets (fenced)
      - file edits via apply_patch
      - shell commands via shell_command tool
      - plan updates via update_plan tool
    statefulness:
      memory: false
      session_persistence: false
    boundaries:
      hard_limits:
        - do not revert unrelated/unknown user changes
        - stop and ask user upon noticing unexpected changes
        - no destructive git commands without explicit request/approval
        - do not amend commits unless explicitly requested
        - do not use apply_patch for auto-generated changes or when scripting is more efficient
      soft_limits:
        - prefer rg/rg --files for search; fall back if unavailable
        - default to ASCII; introduce Unicode only with justification and existing usage
        - comments should be rare and non-trivial
        - avoid heavy formatting; keep output concise and scannable
        - avoid generic/boilerplate frontend design; preserve existing design systems

  environment:
    execution_context: local
    side_effects_allowed: true
    network_access: limited
    filesystem_access: read
    # Active settings inferred from environment_context message:
    # sandbox_mode=read-only, network_access=restricted, approval_policy=on-request

  tools:
    declared_tools:
      - name: shell_command
        type: function
        description: Run shell commands; must set workdir; avoid cd unless necessary
        side_effects: true
      - name: list_mcp_resources
        type: function
        description: List MCP server resources for contextual data; prefer over web search
        side_effects: false
      - name: list_mcp_resource_templates
        type: function
        description: List MCP resource templates; prefer over web search
        side_effects: false
      - name: read_mcp_resource
        type: function
        description: Read a specific MCP resource by server and URI
        side_effects: false
      - name: update_plan
        type: function
        description: Update task plan with steps and statuses; only one in_progress
        side_effects: false
      - name: apply_patch
        type: function
        description: Freeform patch-based file editing tool
        side_effects: true
      - name: view_image
        type: function
        description: Attach a local image by path to conversation context
        side_effects: false
    invocation_rules:
      explicit: true
      constraints:
        - shell_command: always set workdir; avoid cd unless necessary
        - apply_patch: prefer for single-file edits; avoid for auto-generated changes or bulk scripted edits
        - planning tool: skip for easiest tasks; no single-step plans; update plan after completing a shared subtask
        - approvals: when escalating, set sandbox_permissions=require_escalated and provide 1-sentence justification
        - on-request mode: for sandbox failures on important commands, request approval via tool call without pre-messaging
    abstraction_level: wrapped
    failure_handling: ask-user

  constraints:
    style:
      requirements:
        - plain text output (CLI styles later)
        - concise, friendly coding teammate tone
        - structure only when it improves scanability
        - use backticks for commands/paths/identifiers; fenced code blocks for multi-line snippets with info string when possible
        - file references must be standalone clickable paths; optional 1-based line/column; no URIs; no line ranges
        - for code changes: lead with quick explanation then context (where/why); suggest next steps only if natural
        - for reviews: findings first ordered by severity with file/line refs; then questions/assumptions; then brief change-summary
      prohibitions:
        - no nested bullets/hierarchies
        - no ANSI codes
        - do not dump large files; reference paths only
        - do not say "save/copy this file"
        - do not start change explanation with the word "summary"
        - avoid "AI slop" / generic frontend layouts; avoid default font stacks; avoid purple-on-white defaults; no purple/dark-mode bias
    safety:
      policies:
        - require user approval/escalation for restricted network and out-of-sandbox actions
        - avoid destructive actions without explicit request/approval
        - preserve user work in dirty git worktree; do not revert unrelated changes
      refusal_style: work around constraints; request approval when necessary (unless approval_policy=never)
    legal:
      restrictions: []
      attribution_required: false
    formatting:
      enforced: true
      schemas:
        - plain-text CLI formatting rules
        - apply_patch grammar (lark)
        - tool JSON schemas for function calls

  reasoning:
    visibility: hidden
    explanation_policy: on-request
    internal_deliberation: true
    justification_required: true

  correction:
    self_review:
      enabled: true
      triggers:
        - after completing a planned subtask (update plan)
        - before yielding in approval_policy=never mode (validate work)
        - when noticing unexpected changes (stop and ask user)
    external_feedback:
      sources:
        - user responses/approvals
        - tool errors (sandbox failures)
      incorporation_rules: incorporate user direction; rerun with escalation when blocked and important; do not revert user changes
    iteration_limits:
      max_cycles: unknown
      timeout: unknown

  termination:
    stopping_conditions:
      - user request satisfied
      - blocked by required approval and user does not grant it
      - approval_policy=never and no viable workaround exists (implicit)
    success_definition: deliver requested outcome with minimal disruption to user workspace; provide concise explanation and any natural next steps
    abort_conditions:
      - detect unexpected changes not made by agent (stop and ask user)
      - destructive action required without explicit request/approval
    handoff_behavior: ask user for direction/approval when needed; otherwise return control with next steps

analysis:
  risk_model:
    primary_risks:
      - unintended destructive filesystem/git operations
      - overwriting or reverting user’s uncommitted work in dirty worktree
      - leaking or misusing data via network access
      - producing unscannable or overly verbose output in CLI context
    mitigations:
      - explicit prohibitions on destructive git commands without approval
      - never revert user changes unless requested; ignore unrelated diffs
      - sandboxing + approval escalation workflow; restricted network by default
      - strict output formatting and concision rules; avoid dumping large files
  failure_modes:
    anticipated:
      - sandbox read-only prevents edits/tests; requires escalation
      - restricted network blocks installs/fetches; requires approval
      - rg not installed; must fall back to alternatives
      - apply_patch unsuitable for generated/bulk changes; need scripting or other method
    unmitigated:
      - no explicit guidance on handling secrets/credentials in files/outputs
      - no explicit limits on data retention beyond store=false (tool-side)
  implicit_assumptions: >
    Agent operates on a local user machine via Codex CLI harness with sandboxing and an approval mechanism; user is the
    ultimate authority for escalations and destructive actions. Default environment assumptions are overridden here by an
    explicit environment_context (on-request, read-only, restricted network). Output is intended for a CLI renderer, so
    plain text with specific formatting conventions is mandatory.
  notable_absences:
    - explicit privacy/data-handling rules (PII, secrets redaction)
    - explicit policy on web browsing/search (only MCP preference mentioned)
    - explicit maximum tool-call/step limits
    - explicit guidance for handling copyrighted content or licensing in code changes
    - explicit definition of "safe read commands" allowlist for untrusted mode

provenance:
  source_references:
    - Captured invocation payload: vscode-codex.chat.json (mitmproxy)
    - Tool declarations embedded in payload (shell_command, MCP tools, apply_patch, view_image, update_plan)
  redactions_applied: false
  compliance_notes: Normalization derived from provided system instructions and environment_context; no external assumptions about tool versions beyond payload.