diff --git a/.gitignore b/.gitignore index 009dd50215ae..90c0ac51a3f2 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,66 @@ test/screenlog* test/output.tmp CMakeUserPresets.json +.agents/skills/brainstorming/SKILL.md +.agents/skills/dispatching-parallel-agents/SKILL.md +.agents/skills/executing-plans/SKILL.md +.agents/skills/finishing-a-development-branch/SKILL.md +.agents/skills/planning-with-files/examples.md +.agents/skills/planning-with-files/reference.md +.agents/skills/planning-with-files/SKILL.md +.agents/skills/planning-with-files/scripts/check-complete.ps1 +.agents/skills/planning-with-files/scripts/check-complete.sh +.agents/skills/planning-with-files/scripts/init-session.ps1 +.agents/skills/planning-with-files/scripts/init-session.sh +.agents/skills/planning-with-files/scripts/session-catchup.py +.agents/skills/planning-with-files/templates/findings.md +.agents/skills/planning-with-files/templates/progress.md +.agents/skills/planning-with-files/templates/task_plan.md +.agents/skills/receiving-code-review/SKILL.md +.agents/skills/requesting-code-review/code-reviewer.md +.agents/skills/requesting-code-review/SKILL.md +.agents/skills/subagent-driven-development/code-quality-reviewer-prompt.md +.agents/skills/subagent-driven-development/implementer-prompt.md +.agents/skills/subagent-driven-development/SKILL.md +.agents/skills/subagent-driven-development/spec-reviewer-prompt.md +.agents/skills/systematic-debugging/condition-based-waiting-example.ts +.agents/skills/systematic-debugging/condition-based-waiting.md +.agents/skills/systematic-debugging/CREATION-LOG.md +.agents/skills/systematic-debugging/defense-in-depth.md +.agents/skills/systematic-debugging/find-polluter.sh +.agents/skills/systematic-debugging/root-cause-tracing.md +.agents/skills/systematic-debugging/SKILL.md +.agents/skills/systematic-debugging/test-academic.md +.agents/skills/systematic-debugging/test-pressure-1.md +.agents/skills/systematic-debugging/test-pressure-2.md 
+.agents/skills/systematic-debugging/test-pressure-3.md +.agents/skills/test-driven-development/SKILL.md +.agents/skills/test-driven-development/testing-anti-patterns.md +.agents/skills/using-git-worktrees/SKILL.md +.agents/skills/using-superpowers/SKILL.md +.agents/skills/verification-before-completion/SKILL.md +.agents/skills/writing-plans/SKILL.md +.agents/skills/writing-skills/anthropic-best-practices.md +.agents/skills/writing-skills/graphviz-conventions.dot +.agents/skills/writing-skills/persuasion-principles.md +.agents/skills/writing-skills/render-graphs.js +.agents/skills/writing-skills/SKILL.md +.agents/skills/writing-skills/testing-skills-with-subagents.md +.agents/skills/writing-skills/examples/CLAUDE_MD_TESTING.md +.claude/settings.local.json +.claude/skills/brainstorming +.claude/skills/dispatching-parallel-agents +.claude/skills/executing-plans +.claude/skills/finishing-a-development-branch +.claude/skills/planning-with-files +.claude/skills/receiving-code-review +.claude/skills/requesting-code-review +.claude/skills/subagent-driven-development +.claude/skills/systematic-debugging +.claude/skills/test-driven-development +.claude/skills/using-git-worktrees +.claude/skills/using-superpowers +.claude/skills/verification-before-completion +.claude/skills/writing-plans +.claude/skills/writing-skills +skills-lock.json diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000000..d7d1929c1df3 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,21 @@ +# TDengine Session Conventions + +## Progress Reporting (Persistent Rule) +- Every progress report must include a visual progress bar. +- Use the task table in `task_plan.md` as the source of truth. +- Show at least: + - overall percentage + - completed/total tasks + - bar visualization + +## Required Progress Bar Format +- Use this format in each report: + - `进度: <percent>% [<bar>] <done>/<total>` +- Bar width: 20 characters. +- Filled: `#` +- Empty: `-` + +## Calculation Rule +- `done`: number of tasks with status `completed`. 
+- `total`: number of tasks with status in `{completed, in_progress, pending}`. +- `percent = done / total * 100` (keep one decimal place). diff --git a/docs/en/08-operation/04-maintenance.md b/docs/en/08-operation/04-maintenance.md index 7e5e03131d12..98cc0b7b54a7 100644 --- a/docs/en/08-operation/04-maintenance.md +++ b/docs/en/08-operation/04-maintenance.md @@ -98,6 +98,61 @@ restore qnode on dnode ; # Restore qnode on dnode - This feature is based on the recovery of existing replication capabilities, not disaster recovery or backup recovery. Therefore, for the mnode and vnode to be recovered, the prerequisite for using this command is that the other two replicas of the mnode or vnode can still function normally. - This command cannot repair individual files in the data directory that are damaged or lost. For example, if individual files or data in an mnode or vnode are damaged, it is not possible to recover a specific file or block of data individually. In this case, you can choose to completely clear the data of that mnode/vnode and then perform recovery. +## File-Level Repair (`taosd -r`) + +For file-level corruption under a vnode directory (`wal/tsdb/meta`), you can use `taosd -r` for offline repair. This workflow complements `restore dnode`, which is focused on node-level recovery. 
+ +### Supported Scope + +- `--node-type`: currently `vnode` +- `--file-type`: `wal`, `tsdb`, `meta` +- `--mode`: `force`, `replica`, `copy` + +### Common Command Examples + +```bash +# 1) force: run local file repair for the target vnode (example: WAL) +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode force \ + --backup-path /var/lib/taos/repair-backup + +# 2) replica: degrade the local bad replica and trigger replication recovery +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode replica \ + --backup-path /var/lib/taos/repair-backup + +# 3) copy: recover by copying files from a specified replica node (requires ssh/scp) +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode copy \ + --replica-node 192.168.1.24:/var/lib/taos \ + --backup-path /var/lib/taos/repair-backup +``` + +### Operations Validation and Troubleshooting + +- During execution, the process prints `repair progress` and a final `repair summary`. +- Each run writes session artifacts under the backup path: + - `repair.log`: human-readable detail log + - `repair.state.json`: machine-readable checkpoint state (used for resume) +- Recommended checks: + - Ensure `repair.log` includes expected step details, such as `copy replica detail` or `replica restore detail`. + - Verify `step/status/doneVnodes/totalVnodes` in `repair.state.json`. + +### Notes + +- `copy` mode requires `--replica-node=<ip>:<dataDir>` and reachable `ssh/scp`. +- The repair flow creates backup first and rolls back on failures. It is recommended to always set `--backup-path`. +- For full node/logical-node failures, prefer the `restore dnode` workflow above. + ## Splitting Virtual Groups When a vgroup is overloaded with CPU or Disk resource usage due to too many subtables, after adding a dnode, you can split the vgroup into two virtual groups using the `split vgroup` command. 
After the split, the newly created two vgroups will undertake the read and write services originally provided by one vgroup. This command was first released in version 3.0.6.0, and it is recommended to use the latest version whenever possible. diff --git a/docs/plans/2026-03-03-data-repair-tool-design.md b/docs/plans/2026-03-03-data-repair-tool-design.md new file mode 100644 index 000000000000..4dc32e6d3251 --- /dev/null +++ b/docs/plans/2026-03-03-data-repair-tool-design.md @@ -0,0 +1,144 @@ +# TDengine 数据修复工具设计文档(`taosd -r` 扩展) + +## 1. 背景与目标 +- 需求来源:`/Projects/work/TDengine/.vscode/dev/数据修复工具 - RS.md` +- 目标:在不新增独立程序的前提下,把 `taosd -r` 扩展为可控、可追踪、可恢复的数据修复工具。 +- 首期范围:`--node-type=vnode`,`--file-type=wal|tsdb|meta`,支持 `force/replica/copy` 三模式编排。 +- 术语约定:本文中的 `META` 即“时序数据元数据”(历史文档中的 `TDB`)。 + +## 2. 方案比较(2-3 种) + +### 方案 A:增量扩展 `taosd -r`(推荐) +- 做法: + - 在 `dmMain.c` 增加新参数解析与校验; + - 新增 repair session 编排层; + - WAL 修复复用现有 `walCheckAndRepair*`; + - TSDB/META 逐步补齐 repair handler。 +- 优点: + - 改动路径短,复用现有启动链与 vnode 生命周期; + - 适合分阶段交付(先 WAL,再 TSDB/META); + - 运维入口统一(符合需求)。 +- 缺点: + - 启动流程与修复流程耦合,需要谨慎处理正常启动路径回归风险。 + +### 方案 B:新增独立 repair 子程序 +- 做法:新增 `taosrepair` 风格工具,绕开 `taosd` 启动路径。 +- 优点: + - 模块边界清晰,便于封闭测试。 +- 缺点: + - 与需求“基于 taosd -r 扩展”冲突; + - 需要重复接入大量现有内部模块和配置解析逻辑。 + +### 方案 C:按 SQL 管理命令驱动修复(类似 restore dnode) +- 做法:走 mnode 事务,远程驱动 dnode 执行文件修复。 +- 优点: + - 统一集群控制平面。 +- 缺点: + - 文件级修复语义不适合纯远程事务; + - 社区版/企业版分叉明显,落地周期长。 + +## 3. 推荐方案 +- 采用方案 A。 +- 原因:最贴近需求、复用度最高、可按 1 小时任务切片渐进落地。 + +## 4. 目标架构 + +### 4.1 总体模块 +- `CLI Parser`:解析 `--node-type/--file-type/--vnode-id/--backup-path/--mode/--replica-node`。 +- `Validator`:参数合法性和组合规则校验。 +- `Repair Session`:会话上下文、任务分解、并发控制、状态持久化。 +- `Preflight`:空间检查、文件存在性检查、权限检查。 +- `Backup Manager`:修复前备份原始文件。 +- `Mode Handler`: + - `force` -> `wal/tsdb/meta` 子处理器; + - `replica` -> 副本恢复触发流程; + - `copy` -> 远端文件拷贝流程。 +- `Reporter`:过程进度、repair.log、摘要输出。 + +### 4.2 数据流(简化) +1. `taosd -r ...` 启动。 +2. 解析参数 -> 校验。 +3. 构建 `repair session`,定位目标 vnode 列表。 +4. 
执行 preflight,创建备份目录与状态文件。 +5. 按 `mode + file-type` 调度处理器。 +6. 持续写入 `repair.log` 和 `repair.state.json`。 +7. 输出汇总:成功/失败 vnode、恢复条目、损坏条目、耗时。 + +## 5. 模式级设计 + +### 5.1 force 模式 +- `wal`: + - 优先复用 `walCheckAndRepairMeta/Idx`; + - 增加“修复前备份 + 结构化日志”; + - 增加可重放性检查结果归档。 +- `tsdb`: + - 枚举 `data/head/sma/stt`; + - 校验块级完整性; + - 保留可恢复块、剔除不可恢复块; + - 重建最小可用结构。 +- `meta`: + - 解析可读元数据; + - 联合 WAL/TSDB 推导缺失元数据; + - 对无法推导项打标并告警。 + +### 5.2 replica 模式 +- 目标:触发当前损坏 vnode 从健康副本进行全量同步。 +- 设计方向: + - 将本地损坏副本置为不可读写状态; + - 通过版本/任期策略触发同步; + - 复用现有 restore/vgroup 事务动作(需评估社区版路径)。 + +### 5.3 copy 模式 +- 目标:当数据体量大时,用“离线副本文件拷贝”快速恢复。 +- 核心步骤: + - 解析 `--replica-node`; + - 建立远端连接; + - 全量拷贝目标 vnode 目录文件; + - 同步权限与 owner; + - 完成后一致性校验。 + +## 6. 安全与一致性设计 +- 任何写操作前必须完成备份(`--backup-path=none` 例外时需告警)。 +- preflight 失败即停止修复,不进入破坏性步骤。 +- 关键步骤写状态检查点,异常退出后可恢复续跑。 +- 默认“先保守后激进”:优先保留可确认正确的数据。 + +## 7. 会话中断恢复机制(开发与运行双层) + +### 7.1 开发过程恢复 +- 以仓库根目录 `task_plan.md/findings.md/progress.md` 作为持久化工作记忆。 +- 每完成 1 个任务立即更新状态与日志。 +- 恢复时直接定位 `in_progress` 任务继续。 + +### 7.2 运行时修复恢复 +- 每次修复会生成: + - `repair.log`:人类可读日志; + - `repair.state.json`:机器可读状态检查点。 +- 下次执行同一任务时可读取状态文件,跳过已完成步骤,继续未完成步骤。 + +## 8. 测试策略 +- 单元测试: + - 参数解析与校验; + - 备份路径生成与状态文件读写; + - mode dispatch 路由。 +- 组件测试: + - WAL 修复样例(损坏 idx、截断 log)。 + - TSDB 块损坏样例。 + - META 元数据缺失样例。 +- 系统测试: + - 单副本 force 场景。 + - 三副本 replica/copy 场景。 + - 故障注入:磁盘不足、文件缺失、副本不可达。 + +## 9. 风险与缓解 +- 风险:TSDB/META 修复复杂度高,首版难以一次做到“全恢复”。 + - 缓解:先交付 WAL MVP,分阶段扩展恢复深度。 +- 风险:社区版/企业版恢复能力分叉。 + - 缓解:将 `replica/copy` 路径做能力探测和清晰报错。 +- 风险:修复逻辑影响正常启动路径。 + - 缓解:修复逻辑只在 `-r` 显式开启,默认路径零影响。 + +## 10. 
设计确认点 +- 是否同意按优先级 `force+wal -> force+tsdb -> force+meta -> replica -> copy` 推进。 +- 是否同意首版 `--node-type` 只支持 `vnode`,其他值先返回 `not supported`。 +- 是否同意把“会话恢复”作为第一批基础设施(而不是后补)。 diff --git a/docs/plans/2026-03-03-data-repair-tool-implementation.md b/docs/plans/2026-03-03-data-repair-tool-implementation.md new file mode 100644 index 000000000000..03d3f640e862 --- /dev/null +++ b/docs/plans/2026-03-03-data-repair-tool-implementation.md @@ -0,0 +1,414 @@ +# TDengine Data Repair Tool Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Extend `taosd -r` into a vnode-level repair tool with resumable execution, covering `force` (WAL first), then TSDB/META, and finally replica/copy modes. +> Terminology note: `META` means time-series metadata (renamed from historical `TDB` wording in this project). + +**Architecture:** Keep `taosd` as the single entrypoint. Add a repair options parser and validator in mgmt startup, then pass a repair session context to vnode-side handlers. Reuse existing WAL auto-repair internals while adding preflight, backup, progress reporting, and state checkpointing (`repair.state.json`) for resumability. + +**Tech Stack:** C (existing TDengine runtime), CMake, GoogleTest (`source/common/test`, `source/libs/wal/test`, `source/dnode/vnode/test`), existing vnode/wal/meta modules. + +--- + +## Execution Notes +- Single task target duration: `30-60 minutes`. +- Each task ends with: targeted build + targeted test. +- Commit frequency: one commit per completed task. +- Use these base commands from repo root: +```bash +cmake -S . 
-B debug -DBUILD_TEST=ON +cmake --build debug -j8 --target <target> +ctest --test-dir debug -R <test-regex> --output-on-failure +``` + +## Task 1: Repair Option Model + +**Files:** +- Create: `include/common/trepair.h` +- Create: `source/common/src/trepair.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add parser-model unit tests in `commonTests.cpp` for: + - valid enum mapping (`vnode/wal/force`); + - invalid values return error. + +**Step 2: Run test to verify it fails** +- Run: +```bash +cmake --build debug -j8 --target commonTest +ctest --test-dir debug -R commonTest --output-on-failure +``` +- Expected: fail due to missing repair model APIs. + +**Step 3: Write minimal implementation** +- Add enum parsers in `trepair.h/.c` (common layer first, mgmt wiring in later tasks). + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. + +**Step 5: Commit** +```bash +git add include/common/trepair.h source/common/src/trepair.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add repair option enum parsers in common layer" +``` + +## Task 2: CLI Parse Extension in `dmMain.c` + +**Files:** +- Modify: `source/dnode/mgmt/exe/dmMain.c` +- Modify: `include/common/trepair.h` +- Modify: `source/common/src/trepair.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add tests for parsing: + - `taosd -r --node-type vnode --file-type wal --vnode-id 2,3 --mode force` + - missing required arg combinations should fail. + +**Step 2: Run test to verify it fails** +- Build and run `commonTest`. + +**Step 3: Write minimal implementation** +- Extend `dmParseArgs()` to parse long options. +- Reuse `tRepairParseCliOption()` to parse option values. +- Keep backward compatibility: plain `-r` still legal. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/mgmt/exe/dmMain.c source/common/test/commonTests.cpp +git commit -m "feat(repair): parse new taosd -r repair options" +``` + +## Task 3: Option Validation Rules + +**Files:** +- Modify: `source/common/src/trepair.c` +- Modify: `include/common/trepair.h` +- Modify: `source/dnode/mgmt/exe/dmMain.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add rule tests: + - `--mode=copy` requires `--replica-node`. + - `--node-type=vnode` requires `--vnode-id`. + - unsupported node/file combinations rejected. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Implement `tRepairValidateCliArgs(...)` with deterministic error codes/messages. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. + +**Step 5: Commit** +```bash +git add source/common/src/trepair.c include/common/trepair.h source/dnode/mgmt/exe/dmMain.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add repair option validation rules" +``` + +## Task 4: Repair Session + State File Skeleton + +**Files:** +- Create: `source/dnode/mgmt/exe/dmRepairSession.h` +- Create: `source/dnode/mgmt/exe/dmRepairSession.c` +- Modify: `source/dnode/mgmt/exe/dmMain.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add state serialization tests for `repair.state.json`: + - write/read roundtrip; + - missing file should create new state. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Add `SRepairSession` and JSON persistence helpers. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/mgmt/exe/dmRepairSession.h source/dnode/mgmt/exe/dmRepairSession.c source/dnode/mgmt/exe/dmMain.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add repair session state persistence skeleton" +``` + +## Task 5: Target Vnode Selection and Preflight + +**Files:** +- Modify: `source/dnode/mgmt/mgmt_vnode/src/vmFile.c` +- Modify: `source/dnode/mgmt/mgmt_vnode/src/vmInt.c` +- Create: `source/dnode/mgmt/exe/dmRepairPreflight.c` +- Test: `source/dnode/vnode/test/tqTest.cpp` (or new vnode-specific test file) + +**Step 1: Write failing test** +- Add vnode filter test by `vnode-id` list. + +**Step 2: Run test to verify it fails** +- Build target: +```bash +cmake --build debug -j8 --target tqTest +ctest --test-dir debug -R tq_test --output-on-failure +``` + +**Step 3: Write minimal implementation** +- Select only requested vnode IDs. +- Add preflight checks for path existence and free space. + +**Step 4: Run test to verify it passes** +- Re-run `tq_test`. + +**Step 5: Commit** +```bash +git add source/dnode/mgmt/mgmt_vnode/src/vmFile.c source/dnode/mgmt/mgmt_vnode/src/vmInt.c source/dnode/mgmt/exe/dmRepairPreflight.c source/dnode/vnode/test/tqTest.cpp +git commit -m "feat(repair): add target vnode filtering and preflight checks" +``` + +## Task 6: Backup Manager + Progress Reporter + +**Files:** +- Create: `source/dnode/mgmt/exe/dmRepairBackup.c` +- Create: `source/dnode/mgmt/exe/dmRepairReport.c` +- Modify: `source/dnode/mgmt/exe/dmRepairSession.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Test backup directory naming and `repair.log` append behavior. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Backup path builder (`--backup-path` + timestamp). +- Periodic progress line and final summary. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/mgmt/exe/dmRepairBackup.c source/dnode/mgmt/exe/dmRepairReport.c source/dnode/mgmt/exe/dmRepairSession.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add backup manager and progress reporting" +``` + +## Task 7: WAL Force-Mode Handler (MVP Core) + +**Files:** +- Create: `source/dnode/vnode/src/vnd/vnodeRepairWal.c` +- Modify: `source/dnode/vnode/CMakeLists.txt` +- Modify: `source/dnode/vnode/src/vnd/vnodeOpen.c` (repair dispatch hook) +- Test: `source/libs/wal/test/walMetaTest.cpp` + +**Step 1: Write failing test** +- Add WAL corruption scenario expecting repair success and idx rebuild. + +**Step 2: Run test to verify it fails** +- Run: +```bash +cmake --build debug -j8 --target walTest +ctest --test-dir debug -R wal_test --output-on-failure +``` + +**Step 3: Write minimal implementation** +- Hook force-mode WAL handler to call existing `walCheckAndRepair*` in controlled session flow. + +**Step 4: Run test to verify it passes** +- Re-run `wal_test`. + +**Step 5: Commit** +```bash +git add source/dnode/vnode/src/vnd/vnodeRepairWal.c source/dnode/vnode/CMakeLists.txt source/dnode/vnode/src/vnd/vnodeOpen.c source/libs/wal/test/walMetaTest.cpp +git commit -m "feat(repair): implement force mode WAL repair handler" +``` + +## Task 8: WAL Recovery Summary + Resume Checkpoint + +**Files:** +- Modify: `source/dnode/vnode/src/vnd/vnodeRepairWal.c` +- Modify: `source/dnode/mgmt/exe/dmRepairSession.c` +- Test: `source/libs/wal/test/walMetaTest.cpp` + +**Step 1: Write failing test** +- Add test for resumable WAL repair: already-completed file should be skipped. + +**Step 2: Run test to verify it fails** +- Run `wal_test`. + +**Step 3: Write minimal implementation** +- Persist per-vnode per-step completion status in `repair.state.json`. + +**Step 4: Run test to verify it passes** +- Re-run `wal_test`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/vnode/src/vnd/vnodeRepairWal.c source/dnode/mgmt/exe/dmRepairSession.c source/libs/wal/test/walMetaTest.cpp +git commit -m "feat(repair): support resumable WAL repair checkpoints" +``` + +## Task 9: TSDB Force-Mode Scanner (Phase-1) + +**Files:** +- Create: `source/dnode/vnode/src/tsdb/tsdbRepairScan.c` +- Modify: `source/dnode/vnode/CMakeLists.txt` +- Test: `source/dnode/vnode/test/tqTest.cpp` (or dedicated new test) + +**Step 1: Write failing test** +- Add synthetic TSDB corruption detection case. + +**Step 2: Run test to verify it fails** +- Run `tq_test`. + +**Step 3: Write minimal implementation** +- Build scanner that reports valid/corrupt blocks without full rewrite. + +**Step 4: Run test to verify it passes** +- Re-run `tq_test`. + +**Step 5: Commit** +```bash +git add source/dnode/vnode/src/tsdb/tsdbRepairScan.c source/dnode/vnode/CMakeLists.txt source/dnode/vnode/test/tqTest.cpp +git commit -m "feat(repair): add TSDB corruption scanner for force mode" +``` + +## Task 10: META Force-Mode Metadata Recovery Skeleton + +**Files:** +- Create: `source/dnode/vnode/src/meta/metaRepair.c` +- Modify: `source/dnode/vnode/src/meta/metaOpen.c` +- Test: `source/common/test/commonTests.cpp` (metadata reconstruction utility tests) + +**Step 1: Write failing test** +- Add minimal metadata inference test from partial inputs. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Add first-pass metadata recovery skeleton and missing-item markers. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/vnode/src/meta/metaRepair.c source/dnode/vnode/src/meta/metaOpen.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add META recovery skeleton with missing metadata markers" +``` + +## Task 11: Replica Mode Dispatch Stub + +**Files:** +- Create: `source/dnode/mgmt/exe/dmRepairReplica.c` +- Modify: `source/dnode/mgmt/exe/dmRepair.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add mode dispatch tests asserting `replica` path selection and preconditions. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Implement dispatch stub + clear `not supported yet` boundary in community path. + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. + +**Step 5: Commit** +```bash +git add source/dnode/mgmt/exe/dmRepairReplica.c source/dnode/mgmt/exe/dmRepair.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add replica mode dispatch stub with preconditions" +``` + +## Task 12: Copy Mode Dispatch Stub + +**Files:** +- Create: `source/dnode/mgmt/exe/dmRepairCopy.c` +- Modify: `source/dnode/mgmt/exe/dmRepair.c` +- Test: `source/common/test/commonTests.cpp` + +**Step 1: Write failing test** +- Add mode dispatch tests asserting `copy` requires `--replica-node`. + +**Step 2: Run test to verify it fails** +- Run `commonTest`. + +**Step 3: Write minimal implementation** +- Add copy mode skeleton (interface + structured error + TODO markers). + +**Step 4: Run test to verify it passes** +- Re-run `commonTest`. 
+ +**Step 5: Commit** +```bash +git add source/dnode/mgmt/exe/dmRepairCopy.c source/dnode/mgmt/exe/dmRepair.c source/common/test/commonTests.cpp +git commit -m "feat(repair): add copy mode dispatch stub and validation" +``` + +## Task 13: End-to-End Smoke for `force+wal` + +**Files:** +- Create: `tests/system-test/3-enterprise/restore/repair_force_wal.py` (or equivalent existing suite path) +- Modify: `tests/parallel_test/cases.task` + +**Step 1: Write failing test** +- Add e2e case with controlled WAL damage and expected successful repair. + +**Step 2: Run test to verify it fails** +- Execute selected system test command. + +**Step 3: Write minimal implementation** +- Fix integration gaps between CLI parser, session, and WAL handler. + +**Step 4: Run test to verify it passes** +- Re-run target case to pass. + +**Step 5: Commit** +```bash +git add tests/system-test/3-enterprise/restore/repair_force_wal.py tests/parallel_test/cases.task +git commit -m "test(repair): add e2e smoke for force mode WAL repair" +``` + +## Task 14: Documentation and Operational Guide + +**Files:** +- Modify: `docs/zh/08-operation/05-maintenance.md` +- Modify: `docs/en/08-operation/04-maintenance.md` +- Create: `docs/zh/examples/repair-tool.md` +- Create: `docs/en/examples/repair-tool.md` + +**Step 1: Write doc test checklist** +- Define command examples and expected output fragments. + +**Step 2: Run doc lint/check if available** +- Run repository doc checks (if configured). + +**Step 3: Write minimal implementation** +- Add new command usage, parameter explanation, and caution notes. + +**Step 4: Verify examples manually** +- Ensure all examples match implemented CLI behavior. 
+ +**Step 5: Commit** +```bash +git add docs/zh/08-operation/05-maintenance.md docs/en/08-operation/04-maintenance.md docs/zh/examples/repair-tool.md docs/en/examples/repair-tool.md +git commit -m "docs(repair): document taosd -r data repair workflows" +``` + +## Validation Gate (Before Claiming Completion) +- Build and run at least: +```bash +cmake --build debug -j8 --target commonTest walTest tqTest +ctest --test-dir debug -R "commonTest|wal_test|tq_test" --output-on-failure +``` +- Run one targeted system test for `force+wal`. +- Confirm `--help` output matches docs examples. diff --git a/docs/plans/2026-03-04-data-repair-release-checklist.md b/docs/plans/2026-03-04-data-repair-release-checklist.md new file mode 100644 index 000000000000..7818cfa0f2f8 --- /dev/null +++ b/docs/plans/2026-03-04-data-repair-release-checklist.md @@ -0,0 +1,43 @@ +# TDengine Data Repair Release Checklist (2026-03-04) + +## 1. Scope + +- `P7`: copy mode hardening (`T7.4`, `T7.5`) +- `P8`: fixture generation, mode matrix, docs, release gate (`T8.1`~`T8.4`) + +## 2. Verification Commands + +All commands were executed in `/Projects/work/TDengine` on `2026-03-04`. + +```bash +bash tests/ci/repair_mode_matrix.sh +ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure +cmake --build debug --target taosd +``` + +## 3. Verification Results + +1. `repair_mode_matrix.sh`: passed + - `force(tsdb)` passed + - `force(meta)` passed + - `replica` passed + - `copy` passed +2. `ctest -R commonTest`: passed (`100% tests passed, 0 failed`) +3. `cmake --build debug --target taosd`: passed + +## 4. Release Risks + +1. Environment dependency risk: + - External dependency fetch (`ext_pcre2`) may intermittently fail in restricted network environments. + - Mitigation: retry `cmake --build` with approved elevated permissions when needed. +2. Runtime dependency risk for `copy` mode: + - Requires reachable `ssh/scp` and valid `--replica-node=<ip>:<dataDir>`. 
+ - Mitigation: pre-check endpoint format; inspect `repair.log` for `copy dispatch detail` and `consistency=verified`. +3. Operational misuse risk: + - Wrong `vnode-id` or wrong mode selection may target unintended repair path. + - Mitigation: explicit parameter validation and mandatory backup path recommendation. + +## 5. Sign-off + +- Release gate result: `PASS` +- Recommendation: proceed to packaging/review and merge workflow. diff --git a/docs/zh/08-operation/05-maintenance.md b/docs/zh/08-operation/05-maintenance.md index 4711896e79f0..bb87b02524f6 100644 --- a/docs/zh/08-operation/05-maintenance.md +++ b/docs/zh/08-operation/05-maintenance.md @@ -100,6 +100,61 @@ restore qnode on dnode ;# 恢复dnode上的qnode - 该功能是基于已有的复制功能的恢复,不是灾难恢复或者备份恢复,所以对于要恢复的 mnode 和 vnode 来说,使用该命令的前提是还存在该 mnode 或 vnode 的其它两个副本仍然能够正常工作。 - 该命令不能修复数据目录中的个别文件的损坏或者丢失。例如,如果某个 mnode 或者 vnode 中的个别文件或数据损坏,无法单独恢复损坏的某个文件或者某块数据。此时,可以选择将该 mnode/vnode 的数据全部清空再进行恢复。 +## 文件级修复(`taosd -r`) + +对于 vnode 目录下 `wal/tsdb/meta` 的文件级损坏,可以使用 `taosd -r` 执行离线修复。该能力用于“文件级问题”的快速处置,与 `restore dnode` 的“节点级恢复”互补。 + +### 支持范围 + +- `--node-type`:当前支持 `vnode` +- `--file-type`:`wal`、`tsdb`、`meta` +- `--mode`:`force`、`replica`、`copy` + +### 常用命令示例 + +```bash +# 1) force:在本机对目标 vnode 执行文件修复(示例:WAL) +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode force \ + --backup-path /var/lib/taos/repair-backup + +# 2) replica:把本地坏副本降级并触发副本同步恢复 +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode replica \ + --backup-path /var/lib/taos/repair-backup + +# 3) copy:从指定副本节点拷贝目录恢复(需可用 ssh/scp) +taosd -r \ + --node-type vnode \ + --file-type wal \ + --vnode-id 2 \ + --mode copy \ + --replica-node 192.168.1.24:/var/lib/taos \ + --backup-path /var/lib/taos/repair-backup +``` + +### 运维排查与验收 + +- 执行过程中会输出进度与最终摘要:`repair progress`、`repair summary` +- 每次修复会在备份目录写入会话产物: + - `repair.log`:可读日志明细 + - `repair.state.json`:状态检查点(用于中断续跑) +- 建议优先检查: + - `repair.log` 是否包含目标步骤明细(如 `copy replica 
detail`、`replica restore detail`) + - `repair.state.json` 中 `step/status/doneVnodes/totalVnodes` 是否符合预期 + +### 注意事项 + +- `copy` 模式必须提供 `--replica-node=:`,且目标节点可通过 `ssh/scp` 访问。 +- 修复会先做备份并在失败时回滚,建议始终显式配置 `--backup-path`。 +- 若是整节点、整逻辑节点级故障,优先使用上文 `restore dnode` 能力。 + ## 分裂虚拟组 当一个 vgroup 因为子表数过多而导致 CPU 或 Disk 资源使用量负载过高时,增加 dnode 节点后,可通过 `split vgroup` 命令把该 vgroup 分裂为两个虚拟组。分裂完成后,新产生的两个 vgroup 承担原来由一个 vgroup 提供的读写服务。该命令在 3.0.6.0 版本第一次发布,建议尽可能使用最新版本。 diff --git a/findings.md b/findings.md new file mode 100644 index 000000000000..4073e2b5e905 --- /dev/null +++ b/findings.md @@ -0,0 +1,498 @@ +# TDengine 数据修复工具需求调研结论 + +## 0. 术语澄清(2026-03-03,用户确认) +- 修复对象统一为:`WAL`、`META`、`TSDB`。 +- `META` 定义:时序数据的元数据(此前文档中提到的 `TDB` 在本项目中改称 `META`)。 +- 计划、设计、实现文档全部按 `META` 术语推进。 +- CLI 对外文案使用 `meta`;解析层暂保留 `tdb` 兼容映射,避免历史脚本立即失效。 +- 兼容性已验证:`--file-type=meta` 与 `--file-type=tdb` 当前都可被解析并进入同一 `META` 分支。 + +## 1. 需求关键点(来自 `数据修复工具 - RS.md`) +- 入口保持为 `taosd -r`,不新增独立二进制。 +- 命令参数核心:`--node-type --file-type --vnode-id --backup-path --mode --replica-node`。 +- 只要求实现 vnode 的 `wal/tsdb/meta` 文件修复。 +- 三种模式: + - `force`:单副本自救。 + - `replica`:多副本触发恢复。 + - `copy`:从副本节点直接拷贝文件。 +- 运行要求:备份原始损坏文件、持续进度输出、最终摘要、异常即停止。 + +## 2. 
现有代码事实(按模块) + +### 2.1 `taosd -r` 当前行为 +- 文件:`source/dnode/mgmt/exe/dmMain.c` +- 事实:`-r` 只设置 `generateNewMeta = true`,没有任何 `--node-type` 等扩展参数。 +- 影响:当前实现只触发“元数据重建”分支,不是完整修复工具。 + +### 2.2 元数据重建能力 +- 文件:`source/dnode/vnode/src/meta/metaOpen.c` +- 事实: + - `generateNewMeta` 在 `metaOpen()` 中触发 `metaGenerateNewMeta()`。 + - 逻辑主要是遍历现有 `uidIdx/tbDb` 重建 meta 目录并切换目录。 +- 影响:它依赖现存元数据记录,不等同于“从 WAL/TSDB 反推并修复 META”。 + +### 2.3 WAL 已有自动修复能力 +- 文件:`source/libs/wal/src/walMgmt.c`, `source/libs/wal/src/walMeta.c` +- 事实: + - `walOpen()` 会自动调用 `walCheckAndRepairMeta()` 和 `walCheckAndRepairIdx()`。 + - 支持截断损坏段、重扫 lastVer、重建 idx、可选损坏目录备份删除(`tsWalDeleteOnCorruption`)。 +- 影响:`force+wal` 可以以“编排 + 备份 + 日志增强”为主,复用现有能力。 + +### 2.4 TSDB 当前以“检测/容错”为主 +- 文件:`source/dnode/vnode/src/tsdb/tsdbFS2.c`, `tsdbReaderWriter.c` +- 事实: + - 有扫描和修补入口(`tsdbFSDoSanAndFix`),发现损坏时标记 `TSDB_FS_STATE_INCOMPLETE`。 + - 读取页有 checksum 校验,不通过报 `TSDB_CODE_FILE_CORRUPTED`。 +- 影响:缺少“需求文档定义的块级提取 + 重建文件结构”完整工具链,需要新增。 + +### 2.5 现有 `restore dnode` 与本需求边界 +- 代码:`mndDnode.c`, `mndVgroup.c` +- 文档:`docs/zh/08-operation/05-maintenance.md` +- 事实: + - `restore dnode` 是“整节点/整 vnode”基于副本恢复,不是文件级修复。 + - 社区版分支里 `mndProcessRestoreDnodeReqImpl()` 为 stub(`#ifndef TD_ENTERPRISE`)。 +- 影响:不能直接复用为文件修复工具,但其“副本恢复语义”可参考 `replica` 模式。 + +## 3. 可复用能力清单 +- WAL 修复:`walCheckAndRepairMeta/Idx`。 +- vnode 生命周期与批量并发打开/启动:`vmOpenVnodes`, `vmStartVnodes`。 +- 启动过程进度上报:`tmsgReportStartup`。 +- 配置与全局参数机制:`tglobal.h/c`(适合承载修复运行参数)。 + +## 4. 主要缺口 +- CLI 缺口:未实现需求中的扩展参数。 +- 编排缺口:无“按 vnode + file-type + mode”执行框架。 +- 状态缺口:无 repair journal/state 持久化,不支持中断续跑。 +- 备份缺口:无统一的修复前备份与回滚策略。 +- 测试缺口:缺少针对“损坏样本 -> 修复 -> 可启动/可查询”的自动验收流水线。 + +## 5. 架构建议(结论) +- 采取“增量演进”而非重写: + - 保留 `taosd -r` 单入口; + - 新增 repair session 编排层; + - 模式处理器复用现有 WAL/restore 能力,逐步补齐 TSDB/META。 +- 先做可交付 MVP:`force + wal`,再扩展 TSDB、META、replica、copy。 + +## 6. 
待确认项 +- `--node-type` 在首版是否仅允许 `vnode`,其余值先报 `not supported`(建议是)。 +- `copy` 模式 SSH 实现范围:首版是否允许仅 Linux + ssh/scp 命令依赖。 +- `replica` 模式是否允许复用现有 restore 事务动作,还是仅 vnode 局部触发。 +- META 反推规则首批覆盖深度(优先保证“可启动 + 查询未损坏数据”)。 + +## 7. 执行期新增发现(2026-03-03) +- `T1.1` 参数解析模型落在 `source/common` 更合适: + - 可直接复用 `commonTest` 做 TDD; + - 降低 `mgmt/exe` 层早期改动复杂度。 +- 当前测试运行环境下,直接执行带 ASan 的 `commonTest` 会触发 `LeakSanitizer` 的 ptrace 限制; + - `ctest` 验证时建议统一加 `ASAN_OPTIONS=detect_leaks=0`。 +- 在本仓库中并行执行“build + test”会造成竞态(测试可能使用旧二进制); + - 关键验证步骤应保持顺序:`build -> test`。 +- `T1.2` 已在 `dmMain.c` 接入 `--node-type/--file-type/--vnode-id`: + - 同时支持 `--opt value` 与 `--opt=value` 两种格式; + - 非法值在参数解析阶段即返回 `TSDB_CODE_INVALID_CFG`(运行验证退出码为 `25`)。 +- `T1.3` 已接入 `--backup-path/--mode/--replica-node`: + - 解析能力继续复用 `tRepairParseCliOption()`; + - `mode` 走枚举解析,`backup-path/replica-node` 走字符串长度校验; + - 非法值同样在参数解析阶段失败(例如 `--mode bad-mode`)。 +- `T1.4` 参数组合校验已落地: + - repair 参数出现时必须带 `-r`; + - 必选项:`node-type/file-type/mode`; + - `node-type` 与 `file-type` 需满足兼容矩阵; + - `vnode-id` 仅允许且要求在 `node-type=vnode` 场景; + - `mode=copy` 必须提供 `replica-node`,其他模式禁止提供 `replica-node`。 +- `T1.5` 帮助文案已更新,`taosd --help` 已显示 repair 相关选项说明(`-r` 及 6 个长参数)。 +- `T1.6` 已由 `RepairOptionParseTest` 覆盖: + - 包含枚举解析、CLI 参数解析、参数组合校验三层测试; + - 当前共 10 条用例,`commonTest` 回归通过。 +- `T2.1` 已落地 repair 运行时上下文: + - 新增 `SRepairCtx`(会话标识、启动时间、运行参数快照); + - 新增 `tRepairInitCtx()`,在 `dmMain.c` 参数校验后立即初始化上下文; + - `sessionId` 采用 `repair-` 规则,便于后续日志/状态文件命名。 +- `T2.2` 已落地 vnode 过滤基础能力: + - 在 `SRepairCtx` 内把 `vnode-id` 字符串解析为 `int32_t` 数组缓存(去重、非法值拒绝); + - 新增 `tRepairShouldRepairVnode()` 作为后续 vnode 遍历过滤入口; + - 非法 `--vnode-id`(如 `2,a`)现在会在上下文初始化阶段失败并中止执行。 +- TDengine 代码约束补充: + - `strtol/strtoll` 在仓库中被禁止(宏重定义为 forbid); + - 数值解析必须使用 `taosStr2Int32/taosStr2Int64` 等封装函数,避免触发编译错误。 +- `T2.3` 预检能力已落地在 `source/common`: + - 新增 `tRepairPrecheck()`,检查项包括:`dataDir` 存在、`backup-path`(若配置)存在、磁盘可用空间阈值、`vnode//` 目标路径存在性; + - 磁盘空间检查复用 `taosGetDiskSize()`,当可用空间低于阈值时返回 
`TSDB_CODE_NO_ENOUGH_DISKSPACE`; + - 预检函数目前对 `nodeType=vnode` 做目标路径校验,其他 node type 暂放行(为后续阶段保留扩展空间)。 +- `dmMain.c` 已在配置加载完成后、`dmInit()` 前接入预检: + - 最小可用空间阈值使用 `tsDataSpace.reserved`(若为 0 则不做空间下限); + - 失败时立即退出并输出 `failed repair precheck: `,保持 fail-fast。 +- 运行验证注意事项: + - 在当前 ASan 构建里,命令行使用 `-o /tmp` 会触发 `osDir.c:taosMulModeMkDir` 的已有栈越界问题; + - 使用 `-o /tmp/taoslog` 可绕开该环境问题并完成本任务的预检路径验证。 +- `T2.4` 备份管理器新增目录约定并已接入主流程: + - 新增 `tRepairPrepareBackupDir()`,对每个目标 vnode 预创建目录; + - 路径规则: + - 显式 `--backup-path`:`//vnode/`; + - 未配置时默认:`/backup//vnode/`; + - `dmMain.c` 在 precheck 后、`dmInit()` 前对全部目标 vnode 执行目录创建,任一失败直接中止。 +- `T2.5` 修复会话追踪文件已落地: + - 新增 `tRepairPrepareSessionFiles()`、`tRepairAppendSessionLog()`、`tRepairWriteSessionState()`; + - 会话目录固定为 `//`,其中写入: + - `repair.log`(带毫秒时间戳的 append 日志); + - `repair.state.json`(包含 `sessionId/startTimeMs/nodeType/fileType/mode/step/status/doneVnodes/totalVnodes/updatedAtMs` 等字段)。 + - `repair.state.json` 使用 `*.tmp` 临时文件 + `rename` 的原子落盘方式,降低中断时状态文件损坏风险。 + - `dmMain.c` 已接入 session 文件初始化、precheck/备份步骤日志写入与 preflight 完成态更新;初始化失败会 fail-fast 返回。 +- `T2.6` 进度输出与摘要已落地: + - 新增 `tRepairNeedReportProgress()`:按时间间隔节流进度上报(支持首次上报与时钟回拨场景)。 + - 新增 `tRepairBuildProgressLine()`:统一构造进度输出(`session/step/vnode done/total/progress%`)。 + - 新增 `tRepairBuildSummaryLine()`:统一构造最终摘要(`status/successVnodes/failedVnodes/elapsedMs`)。 + - `dmMain.c` 已接入: + - precheck 通过后立即输出并落盘首条进度; + - 备份循环中按间隔或收尾条件输出进度; + - 完成 preflight 后输出最终摘要并落盘; + - 同步更新 `repair.state.json` 的 `precheck -> backup -> preflight(ready)` 步骤状态。 +- `T2.7` 会话恢复能力已落地: + - 新增 `tRepairTryResumeSession()`: + - 扫描备份根目录(`--backup-path` 或 `/backup`)下的 `repair-*` 会话目录; + - 读取并校验 `repair.state.json` 的上下文字段(`nodeType/fileType/mode/vnodeIdList/backupPath/replicaNode`); + - 仅接受 `status=initialized|running` 的未完成会话; + - 选择 `startTimeMs` 最新的候选会话作为续跑目标,并回填 `sessionId/startTimeMs/doneVnodes/totalVnodes`。 + - `dmMain.c` 已接入恢复逻辑: + - precheck 后先尝试恢复旧会话; + - 命中恢复时复用原 `repair.log`/`repair.state.json`; + - 备份循环从 `doneVnodes` 
对应下标继续,跳过已完成 vnode。 +- JSON 解析注意事项(本次新增): + - `tjsonGetStringValue2()` 在字段缺失时返回 `TSDB_CODE_SUCCESS`(仅输出空字符串),不能用返回码判断字段存在性; + - 需要配合 `tjsonGetObjectItem()` 显式判断字段是否存在,避免把“缺字段”误判为“空字符串字段”。 +- `T3.1`(`force+wal` 调度器)已完成最小闭环: + - `trepair.h/.c` 新增 `tRepairNeedRunWalForceRepair()`(判定 `nodeType=vnode && fileType=wal && mode=force`); + - `trepair.h/.c` 新增 `tRepairBuildVnodeTargetPath()`,统一构造 `vnode//` 路径; + - `tRepairPrecheck()` 改为复用 `tRepairBuildVnodeTargetPath()` 做目标文件存在性检查,减少路径构造重复逻辑。 +- `dmMain.c` 新增 `force+wal` 调度入口(位于 backup 阶段之后): + - 仅在 `tRepairNeedRunWalForceRepair()==true` 时触发; + - 先调用 `walInit(dmStopDaemon)`,再按目标 vnode 循环执行 `walOpen(walPath, &cfg)` + `walClose()`; + - 复用现有 fail-fast:任一 vnode 打开失败即记录 `repair.state.json(step=wal,status=failed)` 并中止。 +- 本阶段会话恢复策略保持“步骤级最小实现”: + - 继续沿用 `T2.7` 的 `doneVnodes` 语义用于 precheck/backup 续跑; + - `wal` 步骤暂不做细粒度续跑索引恢复(后续可在 `T3.2/T3.3` 扩展)。 +- `T3.2`(WAL 修复前备份与失败回滚保护)已完成: + - `trepair.h/.c` 新增: + - `tRepairBackupVnodeTarget()`:按 `vnode//` 递归备份到 `//vnode/`; + - `tRepairRollbackVnodeTarget()`:从备份目录递归恢复目标目录。 + - 备份/回滚内部实现要点: + - 新增目录递归复制 helper(基于 `taosOpenDir/taosReadDir/taosCopyFile`); + - 复制前会重建空目录,避免 `taosCopyFile` 的 `TD_FILE_EXCL` 导致覆盖失败; + - 对源/目标目录存在性与类型做显式校验(必须是目录)。 + - `dmMain.c` 的 `dmRunForceWalRepair()` 已接入保护语义: + - 每个 vnode 执行 `walOpen()` 前先调用 `tRepairBackupVnodeTarget()`; + - `walOpen()` 失败时立即触发 `tRepairRollbackVnodeTarget()` 并记录回滚日志; + - 仍保持 fail-fast:任一 vnode 失败即退出流程。 +- `T3.3`(WAL 修复明细记录)已完成: + - `SWalCkHead` 通过 `SWalCont` 间接包含柔性数组成员(`body[]`),因此 `SWal` 中 `writeHead` 必须保持尾字段;否则会触发 `flexible array member not at end of struct` 编译错误。 + - 新增 `SWalRepairStats` 与 `walGetRepairStats()`,用于导出一次 `walOpen()` 生命周期内的修复明细统计。 + - `walCheckAndRepairMeta()` 在进入损坏段修复分支时累计 `corruptedSegments`。 + - `walCheckAndRepairIdxFile()` 在重建索引成功后累计 `rebuiltIdxEntries` 并记录重建条数日志。 + - `dmRunForceWalRepair()` 在每个 vnode 执行 `walOpen()` 后调用 `walGetRepairStats()`,把 `corruptedSegments/rebuiltIdxEntries` 写入 `repair.log` 明细行,提升可审计性。 +- `T3.4`(`wal_test` 
扩展:损坏样例自动化验证)已完成: + - 新增 `walRepairStatsTrackIdxOnlyCorruption` 用例,覆盖“log 正常但 idx 截断损坏”场景,验证自动修复后的统计结果。 + - 新增样例暴露统计口径缺口:此前仅 idx 损坏时 `rebuiltIdxEntries` 会增长,但 `corruptedSegments` 不会增长。 + - 修复方式:在 `walCheckAndRepairIdxFile()` 进入 idx 修复路径时同步累计 `corruptedSegments`,使“损坏区段”语义覆盖 log/idx 两类损坏。 + - 回归结果:`wal_test` 全量通过,`taosd` 构建通过,`force+wal` 阶段测试覆盖提升到“log 损坏 + idx 损坏 + idx-only 损坏”三类样例。 +- `T4.1`(TSDB 文件枚举与完整性扫描器封装)已完成: + - TSDB 目录结构存在多层子目录(例如 `tsdb/f100/...`),扫描器必须递归遍历,不能只看 `tsdb` 根目录。 + - 通过扩展名后缀归类可稳定覆盖当前需求:`.head/.data/.sma/.stt` 归入对应计数,其余文件计入 `unknownFiles`,便于后续定位杂项文件。 + - 新增 `SRepairTsdbScanResult` 与 `tRepairScanTsdbFiles()` 后,可在 `tRepairPrecheck()` 的 `fileType=tsdb` 分支直接复用扫描结果做 fail-fast。 + - 当前完整性基线定义为“至少包含一份 `.head` 与 `.data`”;缺失任一关键文件即返回 `TSDB_CODE_INVALID_PARA`,与新单测期望一致。 +- `T4.2`(TSDB 可恢复块提取与损坏块定位输出)已完成: + - 块级定位采用“目录聚合”策略:对包含 TSDB 识别文件的每个子目录产出一条块记录,避免早期阶段引入复杂的文件名语义解析。 + - 新增 `SRepairTsdbBlockReport` 与 `tRepairAnalyzeTsdbBlocks()`,输出 `totalBlocks/recoverableBlocks/corruptedBlocks/unknownFiles` 以及损坏块路径列表,满足结构化报告诉求。 + - 当前“可恢复块”判定规则为 `head>0 && data>0`;仅有 `sma/stt` 或仅有 `head/data` 的块会被归类为 `corrupted`。 + - 损坏块路径列表采用上限保护(`REPAIR_TSDB_MAX_REPORTED_BLOCKS`),在大规模损坏场景下保留汇总计数准确性并避免报告结构无限增长。 +- `T4.3`(TSDB 文件重建流程 MVP:保留有效块)已完成: + - 新增 `tRepairRebuildTsdbBlocks()`,基于目录级块判定把“可恢复块”(`head+data`)复制到重建输出目录,损坏块仅记录不复制。 + - 重建流程在开始时会重置输出目录,确保重复执行结果可预测且不会混入历史残留文件。 + - 若扫描后没有任何可恢复块,函数返回 `TSDB_CODE_INVALID_PARA`,避免生成“看似成功但不可用”的空重建结果。 + - 当前实现将“包含已识别 TSDB 文件的目录”视为块边界,适合作为 MVP;后续可在 `T4.4/T4.5` 基于真实 TSDB 样本迭代更细粒度块语义。 +- `T4.4`(TSDB 修复结果验证)已完成: + - 新增 `tRepairNeedRunTsdbForceRepair()`,把 `force+tsdb` 调度判定从 `dmMain` 解耦到 `trepair`,并补齐单测 `NeedRunTsdbForceRepair`。 + - `dmMain.c` 新增 `dmRunForceTsdbRepair()`,在 `dmRunRepairWorkflow()` 中接入: + - 每 vnode 执行 `tRepairAnalyzeTsdbBlocks()` 产出块级报告; + - 执行 `tRepairRebuildTsdbBlocks()` 到临时目录(`.rebuild`); + - 删除原 `tsdb` 并 `rename` 切换为重建结果; + - 切换失败时执行 `tRepairRollbackVnodeTarget()` 回滚,保证 fail-fast 与“不破坏已有目录”语义。 + - 运行态日志与状态文件已覆盖 `tsdb` 步骤(`repair.log` + 
`repair.state.json(step=tsdb)`),与 `wal` 流程保持一致。 +- `T4.5`(TSDB 场景系统测试脚本补齐)已完成: + - 新增脚本 `tests/ci/repair_tsdb_force.sh`,自动构造 `vnode2/tsdb` 的“可恢复块 + 损坏块”混合样本。 + - 脚本执行 `taosd -r --node-type vnode --file-type tsdb --mode force`,并校验: + - 输出包含 `step=tsdb` 的进度行与 `status=success` 摘要行; + - 修复后目标目录仅保留可恢复块,损坏块被剔除; + - 备份目录保留原始损坏块,`repair.log/repair.state.json` 均存在。 + - 在当前无完整 dnode 运行环境下,`taosd` 退出码为 `47` 仍可接受(修复流程已完成并产出成功摘要),脚本据此做流程级验收而非进程码等值断言。 +- `T5.1`(META 元数据解析器稳定化)已完成: + - 新增 `SRepairMetaScanResult` 与 `tRepairScanMetaFiles()`,对 `meta` 目录做“结构/标签/索引”最小稳定性检查: + - 必需文件:`table.db`、`schema.db`、`uid.idx`、`name.idx`; + - 可选索引:`ctb.idx/suid.idx/tag.idx/sma.idx/ctime.idx/ncol.idx/stream.task.db`(仅统计,不作为失败条件); + - 缺失必需文件会返回 `TSDB_CODE_INVALID_PARA`,并在结果中记录缺失文件名。 + - 新增 `tRepairNeedRunMetaForceRepair()`,补齐 `force+meta` 调度判定,与 `wal/tsdb` 调度接口保持一致。 + - `tRepairPrecheck()` 已接入 `fileType=meta` 分支,确保 repair workflow 前置阶段即可 fail-fast 拦截元数据结构不完整场景。 + - `dmMain.c` 新增 `dmRunForceMetaRepair()` 并接入 `dmRunRepairWorkflow()`: + - 每 vnode 执行 `meta` 目录备份; + - 记录 `meta scan detail`(required/present/optional/missing)到 `repair.log`; + - 更新 `repair.state.json(step=meta)` 与 `step=meta` 进度输出。 + - 运行验证结果:`taosd -r --file-type meta --mode force` 在最小样本下输出 `step=meta` 与成功摘要,且备份目录正确落盘元数据文件。 +- `T5.2`(WAL/TSDB 反向推导元数据规则:第一批)已完成: + - 新增 `SRepairMetaInferenceReport` 与 `tRepairInferMetaFromWalTsdb()`,第一批推导规则为: + - `wal` 证据:`vnode//wal` 下存在文件; + - `tsdb` 证据:`tRepairAnalyzeTsdbBlocks()` 的 `recoverableBlocks > 0`。 + - 推导判定策略: + - 任一证据命中即 `recoverable=true` 并返回成功; + - 两类证据均未命中则返回 `TSDB_CODE_INVALID_PARA`(保持不可恢复语义)。 + - `tRepairPrecheck()` 的 `fileType=meta` 分支已改为“先扫描、后推导”: + - `tRepairScanMetaFiles()` 失败时回退 `tRepairInferMetaFromWalTsdb()`; + - 仅当扫描与推导都失败时才返回失败,避免缺文件场景 precheck 直接 fail-fast。 + - `dmRunForceMetaRepair()` 已接入运行时兜底: + - 扫描成功时写 `meta scan detail`; + - 扫描失败但推导成功时写 `meta infer detail` 并继续流程。 + - 验证结果: + - 
定向单测通过(`ScanMetaFiles*`/`NeedRunMetaForceRepair`/`InferMetaFromWalTsdb*`/`PrecheckMetaFallbackToInferenceSuccess` 共 8 条); + - `ctest -R commonTest` 与 `cmake --build debug --target taosd` 通过; + - smoke 样本验证通过:`meta` 完整场景和“缺文件+wal 证据”场景均命中 `step=meta` 与成功摘要,且后者 `repair.log` 出现 `meta infer detail`。 +- `T5.3`(缺失元数据标记与“不可推导”日志输出)已完成: + - 新增 `tRepairBuildMetaMissingFileMark()`: + - 输入 `SRepairMetaScanResult`,输出逗号分隔的缺失文件标记(例如 `schema.db,uid.idx,name.idx`); + - 无缺失时输出 `none`,用于统一日志口径。 + - `dmMain.c` 在 `force+meta` 路径新增日志增强: + - 扫描失败时先写 `meta missing marker`; + - 推导成功写 `meta infer detail`(包含 missing + wal/tsdb 证据); + - 推导失败写 `meta unrecoverable detail`(包含 missing + 证据统计)。 + - 为避免再次放大单个函数,`dmRunForceMetaRepair()` 新增并复用 helper: + - `dmAppendMetaMissingMarkerLog()`; + - `dmAppendMetaInferenceDetailLog()`; + - `dmReportMetaPrecheckInferenceDetail()`(precheck 失败时输出不可推导明细)。 + - 验证结果: + - 新增单测 `BuildMetaMissingFileMark` / `BuildMetaMissingFileMarkNoneOrInvalidArgs` 通过; + - 定向用例(10 条)与 `ctest -R commonTest` 通过; + - `taosd` 构建通过; + - smoke 三场景通过:完整、可推导、不可推导(不可推导场景输出 `meta unrecoverable detail` 且 precheck 按预期失败)。 +- `T5.4`(重建 META 并切换生效,含备份目录)已完成: + - `trepair.h/.c` 新增 `tRepairRebuildMetaFiles()`: + - 重建流程为 `reset outputDir -> copy 原 meta 目录 -> 补齐缺失必需文件`; + - 必需文件始终对齐到 `table.db/schema.db/uid.idx/name.idx`; + - 返回 `SRepairMetaScanResult`,用于上层记录重建明细。 + - `dmMain.c` 的 `force+meta` 已接入重建闭环: + - 在 `scan/infer` 通过后执行 `rebuild`; + - 使用 `meta.rebuild -> rename` 完成目录切换; + - 切换失败时调用 `tRepairRollbackVnodeTarget()` 回滚并写 `meta repair rollback` 日志。 + - 本轮继续控制函数体量: + - 新增并复用 `dmAppendMetaRebuildDetailLog()`、`dmHandleMetaRepairRollback()`、`dmRebuildAndActivateMeta()`; + - `dmRunForceMetaRepair()` 保持“编排 + 错误分支”职责,不再直接堆叠文件操作细节。 + - 运行期日志新增: + - `meta rebuild detail`(required/present/optional/missing); + - 与既有 `meta missing marker`、`meta infer detail`、`meta unrecoverable detail` 组合,形成完整诊断链路。 + - 验证结果: + - Red 证据:`commonTest` 构建报错 `tRepairRebuildMetaFiles` 未声明; + - Green 后定向测试通过(含 `RebuildMetaFiles*` 
在内共 11 条); + - `ctest -R commonTest` 与 `cmake --build debug --target taosd` 通过; + - smoke 两场景通过:完整与“缺文件+wal 证据”均命中 `step=meta` + 成功摘要(退出码 `47`),`repair.log` 命中 `meta rebuild detail`,且目标 META 目录补齐必需文件。 +- `T5.5`(META 修复测试:部分损坏/完全损坏双场景)已完成: + - 新增系统级脚本 `tests/ci/repair_meta_force.sh`,对 `taosd -r --file-type meta --mode force` 做可复现验收。 + - 脚本覆盖两类损坏样本(均带 WAL 证据): + - `meta-partial`:保留 `table.db/tag.idx`,缺失其余必需文件; + - `meta-complete`:META 目录为空(完全损坏)。 + - 验收断言: + - 控制台输出必须包含 `step=meta` 的 100% 进度与 `status=success` 摘要; + - `repair.log` 必须包含 `meta missing marker`、`meta infer detail`、`meta rebuild detail`; + - 修复后目标目录必须补齐 `table.db/schema.db/uid.idx/name.idx`,且部分损坏场景保留 `tag.idx`。 + - 运行结果:脚本一次通过,双场景均返回流程级成功(`taosd` 退出码 `47`),满足“部分损坏/完全损坏双场景”可复现测试目标。 +- `T6.1`(`mode=replica` 指令接入与分支调度)已完成: + - `trepair.h/.c` 新增 `tRepairNeedRunReplicaRepair()`,统一 replica 分支调度判定: + - 规则:`nodeType=vnode && mode=replica` 时返回 `needRun=true`; + - 其余模式返回 `false`,并保持与 `NeedRun*ForceRepair` 接口风格一致。 + - `dmMain.c` 新增 `dmRunReplicaRepair()` 并接入 `dmRunRepairWorkflow()`: + - 在 backup 阶段后、force 分支前执行 replica 调度; + - 写入 `repair.state.json(step=replica,status=running)`; + - 输出并落盘 `replica dispatch detail` 与 `step=replica` 进度行(当前为 stub,不含 T6.2 的降级动作)。 + - 当前行为边界: + - `mode=replica` 已显式进入独立分支,不再依赖“force 分支全部跳过”的隐式空跑; + - 真实坏副本降级/版本任期策略仍留待 `T6.2` 实现。 + - 验证结果: + - Red 证据:`commonTest` 构建报错 `tRepairNeedRunReplicaRepair` 未声明; + - Green 后定向单测通过(5/5,含 `NeedRunReplicaRepair` 与 invalid-args); + - `ctest -R commonTest` 与 `cmake --build debug --target taosd` 通过; + - `mode=replica` smoke 样本通过:输出命中 `step=replica` + 成功摘要,`repair.log` 命中 `replica dispatch detail`,退出码 `47`。 +- `T6.2`(本地坏副本降级动作)已完成: + - `trepair.h/.c` 新增 `tRepairDegradeReplicaVnode()`: + - 仅允许 `nodeType=vnode && mode=replica`,并要求 `vnode-id` 命中 `tRepairShouldRepairVnode()`; + - 在 `vnode//` 下原子写入 `replica.degrade.marker.json`; + - marker 
包含策略字段:`action=degrade-local-replica`、`availability=offline`、`syncPolicy=full-sync`、`versionPolicy=reset-local-version`、`termPolicy=bump-local-term`,并记录 `sessionId/vnodeId/updatedAtMs`。 + - `dmMain.c` 的 `dmRunReplicaRepair()` 已从 stub 升级为逐 vnode 执行: + - 每个 vnode 调用 `tRepairDegradeReplicaVnode()` 触发降级落盘; + - 每个 vnode 写 `replica degrade detail`(含 marker 路径与策略字段); + - 保持 `repair.state.json(step=replica,status=running)` 与 `step=replica` 进度输出一致性。 + - 验证结果: + - Red 证据:`commonTest` 构建报错 `tRepairDegradeReplicaVnode was not declared in this scope`; + - Green 后定向单测通过(`NeedRunReplicaRepair* + DegradeReplicaVnode*` 共 4 条); + - `ctest -R commonTest` 与 `cmake --build debug --target taosd` 通过; + - `mode=replica` smoke 样本通过:`repair.log` 命中 `replica dispatch detail` 与 `replica degrade detail`,且 marker 文件存在,`taosd` 退出码 `47`。 +- `T6.3`(与现有 restore/vgroup 逻辑联动验证)已完成: + - `replica` 路径新增 restore hint 产物: + - `tRepairWriteReplicaRestoreHint()` 在会话目录原子写入 `replica.restore.hint.json`; + - hint 字段包含 `mnodeMsgType=TDMT_MND_RESTORE_DNODE`、`restoreType=RESTORE_TYPE__VNODE`、 + `vgroupAction=mndBuildRestoreAlterVgroupAction`、`restoreSqlHint=RESTORE VNODE ON DNODE `、 + `sessionId/vnodeIds/updatedAtMs`。 + - `dmRunReplicaRepair()` 在降级循环完成后写 `replica restore detail` 日志: + - 明确记录 hint 路径与 restore/vgroup 语义字段; + - 社区版场景记录 `restoreDnodeImpl=community-stub`,与现有 `#ifndef TD_ENTERPRISE` 分支一致。 + - 验证结果: + - 定向单测通过(`NeedRunReplicaRepair* + DegradeReplicaVnode* + WriteReplicaRestoreHint* + RollbackReplicaVnode*` 共 8 条); + - `ctest -R commonTest` 通过; + - smoke 样本通过:`taosd` 退出码 `47`,`repair.log` 命中 `replica dispatch/degrade/restore detail`,且 marker 与 hint 文件均存在。 +- `T6.4`(replica 模式失败保护与回滚语义)已完成: + - 失败路径统一回滚 helper: + - `dmRollbackReplicaArtifacts()` 负责失败时删除 hint(若存在)并逆序回滚已降级 vnode marker; + - `tRepairRollbackReplicaVnode()` 负责单 vnode marker 删除,输入校验与 `replica` 上下文一致。 + - `dmRunReplicaRepair()` 已在各关键失败分支接入统一回滚: + - `session state` 写失败; + - `degrade` 失败; + - 进度/日志追加失败; + - restore hint 写失败; + - restore detail 
日志写失败。 + - 验证结果: + - 失败 smoke(确定性注入)通过:构造第 2 个 vnode 降级失败后,`taosd` 退出码 `25`,`repair.log` 命中 `replica rollback detail`,第 1 个 vnode marker 已删除; + - 在该失败路径下,hint 未生成(符合“失败前未进入 restore hint 阶段”的预期)。 + - 已知边界: + - 尝试 20 轮“hint 生成后再注入失败”未稳定命中,当前对“hint 已生成后删除”以代码路径和现有回滚逻辑为主,后续可考虑引入可控故障注入点提升可测性。 +- `T7.1`(`--replica-node` 解析与目标合法性校验)已完成: + - 现状缺口: + - 之前 `tRepairValidateCliArgs()` 对 `mode=copy` 仅校验“是否提供 `replica-node`”,未校验 endpoint 格式; + - 导致形如 `192.168.1.24:var/lib/taos`(相对路径)也会被当作合法参数。 + - TDD 过程: + - Red:新增 `ValidateCliArgsReplicaNodeEndpointFormat`,覆盖合法 endpoint 与 7 类非法 endpoint(缺冒号、空 host、空 path、相对路径、空白字符、多冒号等),先验证失败; + - Green:新增 `tRepairValidateReplicaNodeEndpoint()` 并接入 `mode=copy` 校验分支; + - Refactor:保持最小变更,不改 CLI 解析层,仅增强 validator。 + - 当前校验规则(copy 模式): + - 必须是 `<host>:<path>`; + - host/path 均不能为空; + - 仅允许一个分隔冒号; + - path 必须以 `/` 开头; + - endpoint 中不允许空白字符。 + - 验证结果: + - 定向单测通过:`ValidateCliArgsReplicaNodeRule + ValidateCliArgsReplicaNodeEndpointFormat`(2/2); + - `ctest -R commonTest` 通过; + - 运行态验证通过: + - 非法 endpoint `192.168.1.24:var/lib/taos` 退出码 `25`; + - 合法 endpoint `192.168.1.24:/var/lib/taos` 通过参数校验并继续流程(退出码 `47`)。 +- `T7.2`(远端拷贝抽象层,先本地 mock)已完成: + - 新增对外可测试接口: + - `tRepairParseReplicaNodeEndpoint()`:从 `<host>:<path>` 解析出 `host` 与 `remoteDataDir`; + - `tRepairMockCopyReplicaVnodeTarget()`:基于本地目录模拟“远端副本目录拷贝”。 + - `tRepairMockCopyReplicaVnodeTarget()` 语义: + - 仅允许 `nodeType=vnode` 且 `mode=copy`; + - 目标 vnode 必须命中 `tRepairShouldRepairVnode()`; + - 源目录按 `replicaDataDir + vnodeId + fileType` 计算; + - 目标目录按 `localDataDir + vnodeId + fileType` 计算; + - 复制前重置目标目录,再递归复制,返回 `srcPath/dstPath` 便于上层日志与调试。 + - TDD 结果: + - Red:新增测试先报未声明接口,编译失败符合预期; + - Green:补齐头文件声明与实现后,定向测试通过(3/3)。 + - 验证结果: + - `cmake --build debug --target commonTest` 通过; + - 定向 gtest 通过:`ParseReplicaNodeEndpoint`、`MockCopyReplicaVnodeTarget`、`MockCopyReplicaVnodeTargetInvalidArgs`; + - `ctest -R commonTest` 通过; + - `cmake --build debug --target taosd` 通过(中途受 `ext_pcre2` 外网拉取波动影响,升权重重试后成功)。 +- `T7.3`(SSH/SCP 实现并接入 copy 模式)已完成: + - 
新增 copy 调度与执行接口: + - `tRepairNeedRunCopyRepair()`:判定 `nodeType=vnode && mode=copy`; + - `tRepairBuildCopySshProbeCmd()`:构造远端目录探测命令(默认 `ssh`,支持环境变量 `TAOS_REPAIR_SSH_BIN` 覆盖); + - `tRepairBuildCopyScpCmd()`:构造递归拷贝命令(默认 `scp`,支持环境变量 `TAOS_REPAIR_SCP_BIN` 覆盖); + - `tRepairSshScpCopyReplicaVnodeTarget()`:按 vnode/fileType 计算远端与本地目标目录,执行 `ssh test -d` + `scp -r` 完成 copy。 + - 命令执行策略: + - 新增内部 helper `tRepairRunShellCommand()`,统一执行 shell 命令并通过 `__TD_REPAIR_COPY_EXIT__=` 标记解析真实退出码; + - 命令返回非 0 时统一转为失败,避免“只读输出不看退出码”的假阳性。 + - `dmMain.c` 接入: + - 新增 `dmRunCopyRepair()` 并接入 `dmRunRepairWorkflow()`; + - 运行过程落盘 `repair.state.json(step=copy)`; + - `repair.log` 新增 `copy dispatch detail` 与 `copy replica detail` 明细; + - 控制台输出 `step=copy` 进度行,保持与 wal/tsdb/meta/replica 流程一致。 + - TDD 与验证结果: + - Red:新增 `NeedRunCopyRepair*` 与 `BuildCopySshScpCommands*` 测试后编译失败(缺接口声明); + - Green:补齐头文件声明与实现后,定向用例通过(4/4); + - 回归:`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug --target taosd` 通过; + - Smoke:通过环境变量注入本地 mock `ssh/scp` 完成端到端验证,`taosd` 退出码 `47`,本地 `wal/meta` 被远端内容覆盖、陈旧文件被清理,`repair.log` 命中 copy 相关明细日志。 +- `T7.4`(覆盖写入后的权限/owner 修复逻辑)已完成: + - 失败事实(Red): + - 新增用例 `SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission` 先失败,复现 `scp` 覆盖后目录权限漂移问题; + - 失败现象:remote 目录权限为 `700`,local 在 mock `scp` 后变为 `755`,一致性断言失败。 + - Green 实现(仅改 `trepair.c`,避免扩大 `dmMain.c`): + - 新增内部 helper `tRepairBuildCopySshStatCmd()`:构造 `ssh stat -c '%u %g %a'` 命令获取远端 vnode 目标目录的 `uid/gid/mode`; + - 新增内部 helper `tRepairParseCopyOwnershipMeta()`:解析并校验 `uid gid mode` 三元组; + - 新增内部 helper `tRepairBuildCopyFixOwnerPermCmd()`:构造本地 `chown -R` + `chmod` 修复命令; + - `tRepairSshScpCopyReplicaVnodeTarget()` 在 `scp` 成功后执行 owner/权限修复,任一步失败即返回错误码以保持 fail-fast。 + - 验证结果: + - 定向 Red->Green:`RepairOptionParseTest.SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission` 由失败转通过; + - copy 相关定向回归:`NeedRunCopyRepair* + BuildCopySshScpCommands* + 
SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission`(5/5)通过; + - 全量 `commonTest` 回归通过,`taosd` 构建通过。 +- `T7.5`(copy 模式一致性校验与异常中断处理)已完成: + - 失败事实(Red): + - 新增用例 `SshScpCopyReplicaVnodeTargetDetectsConsistencyMismatch`,构造“`scp` 成功返回但少拷文件”场景; + - 初始实现错误返回 success,未检测到 copy 后目录不一致,符合先测后码预期。 + - Green 实现(分层落地): + - `trepair.c` 新增一致性校验链路: + - `tRepairBuildCopySshDigestCmd()` / `tRepairBuildCopyLocalDigestCmd()`:构造远端/本地目录摘要命令(基于 `find + sort + md5sum`); + - `tRepairVerifyCopyConsistency()`:比较远端与本地摘要,不一致返回 `TSDB_CODE_FAILED`; + - 在 `tRepairSshScpCopyReplicaVnodeTarget()` 的 `scp + owner/perm 修复` 后执行一致性校验。 + - `dmMain.c` 新增 copy 异常中断处理: + - `dmRunCopyRepair()` 对每个 vnode 在 copy 前执行 `tRepairBackupVnodeTarget()`; + - copy 失败(含一致性校验失败)时调用 `tRepairRollbackVnodeTarget()` 回滚,并写入 `copy rollback detail` 日志; + - copy 成功日志增加 `consistency=verified` 标识。 + - 兼容性修复: + - 远端摘要命令最初使用 `awk '{print $1}'`,在 ssh 双层引号下存在 `$1` 被本地 shell 提前展开风险; + - 调整为 `cut -d ' ' -f1` 后消除展开副作用,修复误判。 + - 验证结果: + - 定向 copy 用例通过(6/6):`NeedRunCopyRepair*`、`BuildCopySshScpCommands*`、`SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission`、`SshScpCopyReplicaVnodeTargetDetectsConsistencyMismatch`; + - `ctest -R commonTest` 通过,`cmake --build debug --target taosd` 通过; + - smoke:mock `scp` 在部分复制后返回非 0,`taosd` 退出码 `25`,`repair.log` 命中 `copy rollback detail`,本地旧文件成功回滚保留。 +- `T8.1`(损坏数据生成器自动脚本化)已完成: + - Red 事实: + - 以 `bash tests/ci/repair_fixture_generator.sh` 作为先验入口执行,脚本不存在(退出码 `127`),满足“先失败再实现”。 + - Green 实现: + - 新增脚本 `tests/ci/repair_fixture_generator.sh`,统一生成可复现损坏样本并输出 `manifest.txt`; + - CLI 参数: + - `--output-dir`(必填) + - `--type wal|tsdb|meta|all`(默认 `all`) + - `--vnode-id`(默认 `2`) + - `--clean` + - 产出场景: + - `wal-force-corrupted`:包含截断日志与损坏 idx 样本; + - `tsdb-force-mixed`:包含可恢复块与损坏块混合样本; + - `meta-force-partial` 与 `meta-force-complete`:覆盖部分/完全缺失元数据并保留 WAL 证据。 + - 验证结果: + - `--type all` 场景下,WAL/META/TSDB 样本关键文件与 manifest 条目校验通过; + - `--type wal --vnode-id 9` 场景下,确认仅输出 wal 样本且目录隔离正确。 +- `T8.2`(三模式系统测试矩阵与验收脚本)已完成: + - Red 事实: + 
- 以 `bash tests/ci/repair_mode_matrix.sh` 作为入口执行,脚本不存在(退出码 `127`),满足先失败验证。 + - Green 实现: + - 新增 `tests/ci/repair_mode_matrix.sh`,统一编排三模式验收: + - `force`:复用 `repair_tsdb_force.sh` 与 `repair_meta_force.sh`; + - `replica`:构造最小 wal 证据样本,校验 `step=replica` 进度/成功摘要与 `replica dispatch/restore detail` 日志; + - `copy`:注入 mock `ssh/scp`,校验 `step=copy`、文件覆盖结果、`copy replica detail` 与 `consistency=verified` 日志。 + - 验证结果: + - 运行 `bash tests/ci/repair_mode_matrix.sh` 一次通过; + - 输出依次确认 `force(tsdb)`、`force(meta)`、`replica`、`copy` 均通过,最终输出 `repair mode matrix script passed`。 +- `T8.3`(文档更新:中英 + 运维示例)已完成: + - 文档更新范围: + - `docs/zh/08-operation/05-maintenance.md` + - `docs/en/08-operation/04-maintenance.md` + - 新增内容: + - 新增 “`taosd -r` 文件级修复”章节; + - 统一给出 `force`、`replica`、`copy` 三种模式命令示例; + - 增补运维验收项(`repair progress`、`repair summary`、`repair.log`、`repair.state.json`); + - 明确 `copy` 模式 endpoint 格式与 ssh/scp 依赖、备份与回滚建议。 + - 验证结果: + - 中英两份文档均可检索到新增章节与关键字段,结构完整且与当前实现一致。 +- `T8.4`(发布前回归与风险清单签出)已完成: + - 发布前回归(gate): + - `bash tests/ci/repair_mode_matrix.sh` 通过; + - `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过; + - `cmake --build debug --target taosd` 通过。 + - 风险清单签出: + - 新增 `docs/plans/2026-03-04-data-repair-release-checklist.md`; + - 记录了 gate 命令、执行结果、残余风险与缓解建议; + - 签出结论为 `PASS`,可进入后续合入/发布流程。 diff --git a/include/common/trepair.h b/include/common/trepair.h new file mode 100644 index 000000000000..f9758ed961a2 --- /dev/null +++ b/include/common/trepair.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _TD_COMMON_REPAIR_H_ +#define _TD_COMMON_REPAIR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "os.h" +#include "taoserror.h" + +typedef enum { + REPAIR_NODE_TYPE_INVALID = 0, + REPAIR_NODE_TYPE_VNODE = 1, + REPAIR_NODE_TYPE_MNODE, + REPAIR_NODE_TYPE_DNODE, + REPAIR_NODE_TYPE_SNODE, +} ERepairNodeType; + +typedef enum { + REPAIR_FILE_TYPE_INVALID = 0, + REPAIR_FILE_TYPE_WAL = 1, + REPAIR_FILE_TYPE_TSDB, + REPAIR_FILE_TYPE_META, + REPAIR_FILE_TYPE_DATA, + REPAIR_FILE_TYPE_CONFIG, + REPAIR_FILE_TYPE_CHECKPOINT, +} ERepairFileType; + +// Backward-compatible alias for previous name. +#define REPAIR_FILE_TYPE_TDB REPAIR_FILE_TYPE_META + +typedef enum { + REPAIR_MODE_INVALID = 0, + REPAIR_MODE_FORCE = 1, + REPAIR_MODE_REPLICA, + REPAIR_MODE_COPY, +} ERepairMode; + +typedef struct { + bool hasNodeType; + ERepairNodeType nodeType; + bool hasFileType; + ERepairFileType fileType; + bool hasVnodeIdList; + char vnodeIdList[PATH_MAX]; + bool hasBackupPath; + char backupPath[PATH_MAX]; + bool hasMode; + ERepairMode mode; + bool hasReplicaNode; + char replicaNode[PATH_MAX]; +} SRepairCliArgs; + +#define REPAIR_SESSION_ID_LEN 64 +#define REPAIR_MAX_VNODE_IDS 128 + +typedef struct { + bool enabled; + int64_t startTimeMs; + char sessionId[REPAIR_SESSION_ID_LEN]; + ERepairNodeType nodeType; + ERepairFileType fileType; + ERepairMode mode; + bool hasVnodeIdList; + char vnodeIdList[PATH_MAX]; + int32_t vnodeIdNum; + int32_t vnodeIds[REPAIR_MAX_VNODE_IDS]; + bool hasBackupPath; + char backupPath[PATH_MAX]; + bool hasReplicaNode; + char replicaNode[PATH_MAX]; +} SRepairCtx; + +typedef struct { + int32_t headFiles; + int32_t dataFiles; + int32_t smaFiles; + int32_t sttFiles; + int32_t unknownFiles; +} SRepairTsdbScanResult; + +#define REPAIR_META_MAX_MISSING_FILES 16 +#define REPAIR_META_FILE_NAME_LEN 64 + +typedef struct { + int32_t 
requiredFiles; + int32_t presentRequiredFiles; + int32_t optionalIndexFiles; + int32_t missingRequiredFiles; + char missingRequiredFileNames[REPAIR_META_MAX_MISSING_FILES][REPAIR_META_FILE_NAME_LEN]; +} SRepairMetaScanResult; + +typedef struct { + int32_t walEvidenceFiles; + int32_t tsdbRecoverableBlocks; + int32_t inferredRules; + bool recoverable; +} SRepairMetaInferenceReport; + +#define REPAIR_TSDB_MAX_REPORTED_BLOCKS 64 + +typedef struct { + int32_t totalBlocks; + int32_t recoverableBlocks; + int32_t corruptedBlocks; + int32_t unknownFiles; + int32_t reportedCorruptedBlocks; + char corruptedBlockPaths[REPAIR_TSDB_MAX_REPORTED_BLOCKS][PATH_MAX]; +} SRepairTsdbBlockReport; + +typedef struct { + bool skipBackupPreparation; + bool resumeAtModeStep; + int32_t backupStartVnodeIndex; + int32_t replicaStartVnodeIndex; + int32_t copyStartVnodeIndex; + int32_t walStartVnodeIndex; + int32_t tsdbStartVnodeIndex; + int32_t metaStartVnodeIndex; +} SRepairResumePlan; + +int32_t tRepairParseNodeType(const char *pNodeType, ERepairNodeType *pParsedNodeType); +int32_t tRepairParseFileType(const char *pFileType, ERepairFileType *pParsedFileType); +int32_t tRepairParseMode(const char *pMode, ERepairMode *pParsedMode); +int32_t tRepairExtractLongOptionValue(int32_t argc, char const *argv[], int32_t *pIndex, const char *optionName, + const char **pOptionValue, bool *pMatched); +int32_t tRepairParseReplicaNodeEndpoint(const char *endpoint, char *host, int32_t hostSize, char *remoteDataDir, + int32_t remoteDataDirSize); +int32_t tRepairParseCliOption(SRepairCliArgs *pCliArgs, const char *pOptionName, const char *pOptionValue); +int32_t tRepairValidateCliArgs(const SRepairCliArgs *pCliArgs); +int32_t tRepairInitCtx(const SRepairCliArgs *pCliArgs, int64_t startTimeMs, SRepairCtx *pCtx); +int32_t tRepairShouldRepairVnode(const SRepairCtx *pCtx, int32_t vnodeId, bool *pShouldRepair); +int32_t tRepairNeedRunWalForceRepair(const SRepairCtx *pCtx, bool *pNeedRun); +int32_t 
tRepairNeedRunTsdbForceRepair(const SRepairCtx *pCtx, bool *pNeedRun); +int32_t tRepairNeedRunMetaForceRepair(const SRepairCtx *pCtx, bool *pNeedRun); +int32_t tRepairNeedRunReplicaRepair(const SRepairCtx *pCtx, bool *pNeedRun); +int32_t tRepairNeedRunCopyRepair(const SRepairCtx *pCtx, bool *pNeedRun); +int32_t tRepairBuildCopySshProbeCmd(const char *replicaHost, const char *remoteTargetPath, char *cmd, int32_t cmdSize); +int32_t tRepairBuildCopyScpCmd(const char *replicaHost, const char *remoteTargetPath, const char *localTargetPath, + char *cmd, int32_t cmdSize); +int32_t tRepairDegradeReplicaVnode(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *markerPath, + int32_t markerPathSize); +int32_t tRepairRollbackReplicaVnode(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId); +int32_t tRepairWriteReplicaRestoreHint(const SRepairCtx *pCtx, const char *dataDir, char *hintPath, + int32_t hintPathSize); +int32_t tRepairBuildVnodeTargetPath(const char *dataDir, int32_t vnodeId, ERepairFileType fileType, + char *targetPath, int32_t targetPathSize); +int32_t tRepairScanTsdbFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + SRepairTsdbScanResult *pResult); +int32_t tRepairScanMetaFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, SRepairMetaScanResult *pResult); +int32_t tRepairBuildMetaMissingFileMark(const SRepairMetaScanResult *pResult, char *mark, int32_t markSize); +int32_t tRepairInferMetaFromWalTsdb(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + SRepairMetaInferenceReport *pReport); +int32_t tRepairRebuildMetaFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, const char *outputDir, + SRepairMetaScanResult *pResult); +int32_t tRepairAnalyzeTsdbBlocks(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + SRepairTsdbBlockReport *pReport); +int32_t tRepairRebuildTsdbBlocks(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, const char *outputDir, + 
SRepairTsdbBlockReport *pReport); +int32_t tRepairPrecheck(const SRepairCtx *pCtx, const char *dataDir, int64_t minDiskAvailBytes); +int32_t tRepairPrepareBackupDir(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *backupDir, + int32_t backupDirSize); +int32_t tRepairBackupVnodeTarget(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *backupDir, + int32_t backupDirSize); +int32_t tRepairRollbackVnodeTarget(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId); +int32_t tRepairMockCopyReplicaVnodeTarget(const SRepairCtx *pCtx, const char *replicaDataDir, const char *localDataDir, + int32_t vnodeId, char *srcPath, int32_t srcPathSize, char *dstPath, + int32_t dstPathSize); +int32_t tRepairSshScpCopyReplicaVnodeTarget(const SRepairCtx *pCtx, const char *replicaHost, + const char *replicaDataDir, const char *localDataDir, int32_t vnodeId, + char *srcPath, int32_t srcPathSize, char *dstPath, int32_t dstPathSize); +int32_t tRepairPrepareSessionFiles(const SRepairCtx *pCtx, const char *dataDir, char *sessionDir, + int32_t sessionDirSize, char *logPath, int32_t logPathSize, char *statePath, + int32_t statePathSize); +int32_t tRepairAppendSessionLog(const char *logPath, const char *message); +int32_t tRepairWriteSessionState(const SRepairCtx *pCtx, const char *statePath, const char *step, const char *status, + int32_t doneVnodes, int32_t totalVnodes); +int32_t tRepairTryResumeSession(SRepairCtx *pCtx, const char *dataDir, char *sessionDir, int32_t sessionDirSize, + char *logPath, int32_t logPathSize, char *statePath, int32_t statePathSize, + int32_t *pDoneVnodes, int32_t *pTotalVnodes, bool *pResumed, char *resumeStep, + int32_t resumeStepSize); +int32_t tRepairResolveResumePlan(ERepairNodeType nodeType, const char *resumeStep, int32_t doneVnodes, + int32_t vnodeIdNum, SRepairResumePlan *pPlan); +int32_t tRepairNeedReportProgress(int64_t nowMs, int64_t intervalMs, int64_t *pLastReportMs, bool *pNeedReport); +int32_t 
tRepairBuildProgressLine(const SRepairCtx *pCtx, const char *step, int32_t doneVnodes, int32_t totalVnodes, + char *line, int32_t lineSize); +int32_t tRepairBuildSummaryLine(const SRepairCtx *pCtx, int32_t successVnodes, int32_t failedVnodes, int64_t elapsedMs, + char *line, int32_t lineSize); + +#ifdef __cplusplus +} +#endif + +#endif /* _TD_COMMON_REPAIR_H_ */ diff --git a/include/libs/wal/wal.h b/include/libs/wal/wal.h index 89ddc7c12ecb..3fd5c2ca82ab 100644 --- a/include/libs/wal/wal.h +++ b/include/libs/wal/wal.h @@ -69,6 +69,11 @@ typedef struct { int64_t logRetention; } SWalVer; +typedef struct { + int32_t corruptedSegments; + int64_t rebuiltIdxEntries; +} SWalRepairStats; + #pragma pack(push, 1) // used by sync module typedef struct { @@ -126,6 +131,9 @@ typedef struct SWal { stopDnodeFn stopDnode; + // repair statistics (valid for current wal open lifecycle) + SWalRepairStats repairStats; + // reusable write head SWalCkHead writeHead; } SWal; @@ -210,6 +218,7 @@ int64_t walGetVerRetention(SWal *pWal, int64_t bytes); int64_t walGetCommittedVer(SWal *); int64_t walGetAppliedVer(SWal *); int32_t walSetKeepVersion(SWal *pWal, int64_t ver); +int32_t walGetRepairStats(SWal *pWal, SWalRepairStats *pStats); #ifdef __cplusplus } diff --git a/progress.md b/progress.md new file mode 100644 index 000000000000..d36e44cc279f --- /dev/null +++ b/progress.md @@ -0,0 +1,270 @@ +# 数据修复工具开发进度日志 + +## 当前检查点 +- 日期:`2026-03-04` +- 当前完成:`P1`~`P8` 已全部完成(`T1.1`~`T8.4` 全部完成)。 +- 下一任务:`无`(等待新需求)。 +- 恢复入口:先读 `task_plan.md`,再读 `findings.md`,最后读本文件。 + +## 会话日志 +| 时间(UTC) | 动作 | 结果 | +|---|---|---| +| 2026-03-03 17:55 | 读取需求文档 `数据修复工具 - RS.md` | 明确三模式、参数与范围边界 | +| 2026-03-03 18:00 | 定位 `taosd -r` 入口 (`dmMain.c`) | 发现当前仅设置 `generateNewMeta` | +| 2026-03-03 18:06 | 阅读 `metaOpen.c` | 发现 `-r` 当前本质是元数据重建 | +| 2026-03-03 18:12 | 阅读 `walMgmt.c/walMeta.c` | 确认 WAL 已有自动修复能力可复用 | +| 2026-03-03 18:18 | 阅读 `tsdbFS2.c/tsdbReaderWriter.c` | 确认 TSDB 现状偏检测/容错,缺少完整修复编排 | +| 2026-03-03 18:24 | 阅读 
`mndDnode.c/mndVgroup.c` 与运维文档 | 确认 `restore dnode` 是整节点恢复,不是文件级修复 | +| 2026-03-03 18:33 | 建立规划文件 | 新增 `task_plan.md/findings.md/progress.md` | +| 2026-03-03 18:40 | 输出设计与实施计划文档 | 新增 `docs/plans/*data-repair*.md` | +| 2026-03-03 18:48 | T1.1 Red 阶段 | 在 `commonTests.cpp` 增加 `RepairOptionParseTest` 三组用例,构建失败(缺少 `trepair.h`)符合预期 | +| 2026-03-03 18:58 | T1.1 Green 实现 | 新增 `include/common/trepair.h`、`source/common/src/trepair.c`,实现 node/file/mode 解析 | +| 2026-03-03 19:05 | T1.1 缺陷修复 | 修复 `tRepairParse*` 对 `NULL` 输出指针的崩溃问题(ASan 报告) | +| 2026-03-03 19:08 | T1.1 验证通过 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 19:16 | T1.2 Red 阶段 | 在 `commonTests.cpp` 新增 `ParseCliOption/ParseCliOptionInvalid`,构建失败(缺少 `SRepairCliArgs/tRepairParseCliOption`) | +| 2026-03-03 19:21 | T1.2 Green 实现 | 扩展 `trepair.h/.c` 增加 CLI 选项键值解析;`dmMain.c` 接入 `--node-type/--file-type/--vnode-id`(支持 `--opt val` 与 `--opt=val`) | +| 2026-03-03 19:24 | T1.2 测试回归 | `ASAN_OPTIONS=detect_leaks=0 /Projects/work/TDengine/debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.*` 通过 | +| 2026-03-03 19:24 | T1.2 测试回归 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir /Projects/work/TDengine/debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 19:25 | T1.2 编译验证 | `cmake --build /Projects/work/TDengine/debug -j8 --target taosd` 通过,`dmMain.c` 变更成功编入 `taosd` | +| 2026-03-03 19:26 | T1.2 运行验证 | `ASAN_OPTIONS=detect_leaks=0 taosd -r --node-type vnode --file-type wal --vnode-id 2 --help` 退出码 `0`;非法 `--node-type bad` 退出码 `25` | +| 2026-03-03 19:32 | T1.3 Red 阶段 | 扩展 `ParseCliOption` 测试覆盖 `backup-path/mode/replica-node`,构建失败(`SRepairCliArgs` 缺字段) | +| 2026-03-03 19:38 | T1.3 Green 实现 | 扩展 `SRepairCliArgs` 与 `tRepairParseCliOption()`;`dmMain.c` 新增 `--backup-path/--mode/--replica-node` 解析 | +| 2026-03-03 19:40 | T1.3 测试回归 | `ASAN_OPTIONS=detect_leaks=0 /Projects/work/TDengine/debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.*` 通过 | +| 
2026-03-03 19:40 | T1.3 测试回归 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir /Projects/work/TDengine/debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 19:42 | T1.3 编译验证 | `cmake --build /Projects/work/TDengine/debug -j8 --target taosd` 通过 | +| 2026-03-03 19:42 | T1.3 运行验证 | `ASAN_OPTIONS=detect_leaks=0 taosd -r --node-type vnode --file-type wal --vnode-id 2,3 --backup-path /tmp/backup --mode force --replica-node 192.168.1.24:/root/dataDir --help` 退出码 `0`;非法 `--mode bad-mode` 退出码 `25` | +| 2026-03-03 19:49 | T1.4 Red 阶段 | 新增 `ValidateCliArgs*` 规则测试,构建失败(缺少 `tRepairValidateCliArgs`) | +| 2026-03-03 19:54 | T1.4 Green 实现 | `trepair.c` 增加组合校验(必选项、node/file 兼容、vnode-id 规则、copy/replica-node 规则),`dmMain.c` 接入校验并要求 repair 选项必须搭配 `-r` | +| 2026-03-03 19:56 | T1.4 测试回归 | `ASAN_OPTIONS=detect_leaks=0 /Projects/work/TDengine/debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.*` 通过(10/10) | +| 2026-03-03 19:56 | T1.4 测试回归 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir /Projects/work/TDengine/debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 19:58 | T1.4 编译验证 | `cmake --build /Projects/work/TDengine/debug -j8 --target taosd` 通过 | +| 2026-03-03 19:58 | T1.4 运行验证 | `ASAN_OPTIONS=detect_leaks=0 taosd --node-type ...`(无 `-r`)退出码 `25`,提示 `repair options require '-r'`;`mode=copy` 无 `replica-node` 退出码 `25`,提示 `invalid repair option combination`;`mode=force` + 必选项退出码 `0` | +| 2026-03-03 20:03 | T1.5 实现 | 更新 `dmMain.c` 的 `--help` 文案,新增 `-r` 与 `--node-type/--file-type/--vnode-id/--backup-path/--mode/--replica-node` 说明 | +| 2026-03-03 20:04 | T1.5 验证 | `ASAN_OPTIONS=detect_leaks=0 taosd --help | rg ...` 命中全部新增参数说明 | +| 2026-03-03 20:04 | T1.6 完成确认 | `commonTests.cpp` 已覆盖 parser + validator(`RepairOptionParseTest` 共 10 条),`commonTest` 与 `taosd` 构建均通过 | +| 2026-03-03 20:10 | 术语统一修正 | 根据用户澄清将项目术语从 `TDB` 统一为 `META`;代码中保留 `tdb -> META` 兼容解析映射,并同步更新计划/设计/实施文档 | +| 2026-03-03 20:13 | 术语修正回归 | `commonTest` 构建与 `RepairOptionParseTest.*`(10/10)通过,`ctest -R 
commonTest` 通过 | +| 2026-03-03 20:14 | 术语修正运行验证 | `taosd -r --file-type meta ... --help` 退出码 `0`;兼容 `--file-type tdb ... --help` 退出码 `0`;`taosd --help` 文案仅展示 `meta` | +| 2026-03-03 20:30 | T2.1 Red 阶段 | 在 `commonTests.cpp` 新增 `InitRepairCtxSuccess/InvalidArgs`,构建失败(缺少 `SRepairCtx/tRepairInitCtx`)符合预期 | +| 2026-03-03 20:36 | T2.1 Green 实现 | 扩展 `trepair.h/.c` 新增 `SRepairCtx` 与 `tRepairInitCtx()`;`dmMain.c` 在 repair 参数校验后初始化运行时上下文 | +| 2026-03-03 20:40 | T2.1 验证通过 | `commonTest --gtest_filter=RepairOptionParseTest.InitRepairCtx*` 通过;`ctest -R commonTest` 通过;`cmake --build debug --target taosd` 通过 | +| 2026-03-03 20:46 | T2.2 Red 阶段 | 扩展 `InitRepairCtx` 测试覆盖 vnode 过滤(解析 `vnode-id` 到数组 + 匹配判断),构建失败(缺少 `vnodeIdNum/vnodeIds/tRepairShouldRepairVnode`)符合预期 | +| 2026-03-03 20:52 | T2.2 Green 实现 | `SRepairCtx` 新增 `vnodeIds` 缓存;`tRepairInitCtx()` 增加 `vnode-id` 解析;新增 `tRepairShouldRepairVnode()` 进行目标 vnode 过滤 | +| 2026-03-03 20:57 | T2.2 缺陷修复 | 修复 `strtok_r` 改写原始 `vnodeIdList` 的问题,改为临时缓冲区解析,保留原始字符串 | +| 2026-03-03 20:59 | T2.2 验证通过 | `commonTest --gtest_filter=RepairOptionParseTest.InitRepairCtx*` 通过;`ctest -R commonTest` 通过;`taosd -r ... --vnode-id 2,a --mode force` 退出码 `25` 并提示 `failed to initialize repair context` | +| 2026-03-03 21:24 | T2.3 Red 阶段开始 | 已将 `task_plan.md` 中 `T2.3` 置为 `in_progress`,准备先新增预检单测(路径/磁盘/目标文件)并验证失败 | +| 2026-03-03 21:26 | T2.3 Red 验证 | `cmake --build debug --target commonTest` 失败,报错 `tRepairPrecheck was not declared in this scope`,符合“先测后码”预期 | +| 2026-03-03 21:33 | T2.3 Green 实现 | 在 `trepair.h/.c` 新增 `tRepairPrecheck()`,覆盖数据目录、备份目录、磁盘可用空间、`vnode//` 目标路径检查;`dmMain.c` 接入启动前预检 | +| 2026-03-03 21:35 | T2.3 单测验证 | `commonTest --gtest_filter=RepairOptionParseTest.Precheck*` 通过(5/5) | +| 2026-03-03 21:35 | T2.3 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 21:43 | T2.3 编译验证 | `cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 21:44 | T2.3 运行验证 | `taosd -o /tmp/taoslog -r ... 
--backup-path /tmp/td-repair-nonexistent-backup` 退出码 `25`,输出 `failed repair precheck: Invalid parameters`(预检失败路径生效) | +| 2026-03-03 21:35 | T2.4 Red 阶段开始 | 已将 `task_plan.md` 中 `T2.4` 置为 `in_progress`,准备先新增备份目录命名与创建测试并验证失败 | +| 2026-03-03 21:37 | T2.4 Red 验证 | `cmake --build debug --target commonTest` 失败,报错 `tRepairPrepareBackupDir was not declared in this scope`,符合预期 | +| 2026-03-03 21:39 | T2.4 Green 实现 | 在 `trepair.h/.c` 新增 `tRepairPrepareBackupDir()`,输出并创建 `backup//vnode/`;`dmMain.c` 接入启动时按目标 vnode 预创建备份目录 | +| 2026-03-03 21:40 | T2.4 单测验证 | `commonTest --gtest_filter=RepairOptionParseTest.PrepareBackupDir*` 通过(3/3) | +| 2026-03-03 21:40 | T2.4 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 21:41 | T2.4 编译验证 | `cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 21:41 | T2.4 运行验证 | `taosd -o /tmp/taoslog -r ... --backup-path /tmp/td-repair-backup-test` 退出码 `25`,仍可在预检阶段 fail-fast(流程未回归) | +| 2026-03-03 12:47 | T2.5 Red 阶段开始 | 已将 `task_plan.md` 中 `T2.5` 置为 `in_progress`,准备先新增 `repair.log`/`repair.state.json` 的单测并验证失败 | +| 2026-03-03 12:49 | T2.5 Red 验证 | `cmake --build debug --target commonTest` 失败,报错 `tRepairPrepareSessionFiles/tRepairAppendSessionLog/tRepairWriteSessionState was not declared in this scope`,符合预期 | +| 2026-03-03 12:51 | T2.5 Green 实现 | `trepair.h/.c` 新增会话文件准备、日志追加、状态文件写入(JSON 原子落盘);`dmMain.c` 接入 precheck 后的 session 初始化与状态更新 | +| 2026-03-03 12:52 | T2.5 单测验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.*Session*` 通过(3/3) | +| 2026-03-03 12:52 | T2.5 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 12:53 | T2.5 编译验证 | `cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 12:53 | T2.5 运行验证 | `taosd -o /tmp/taoslog -r ...` 退出码 `25`,仍在 precheck 阶段 fail-fast(未引入启动流程回归) | +| 2026-03-03 13:05 | 汇报规范持久化 | 新增仓库根 `AGENTS.md`,并在 `task_plan.md` 恢复机制追加“每次汇报必须包含进度条”规则 | 
+| 2026-03-03 13:07 | T2.6 Red 阶段开始 | 已将 `task_plan.md` 中 `T2.6` 置为 `in_progress`,准备先补进度行/摘要行构造与节流函数单测并验证失败 | +| 2026-03-03 13:09 | T2.6 Red 验证 | `cmake --build debug --target commonTest` 失败,报错 `tRepairBuildProgressLine/tRepairBuildSummaryLine/tRepairNeedReportProgress was not declared in this scope`,符合预期 | +| 2026-03-03 13:12 | T2.6 Green 实现 | `trepair.h/.c` 新增进度节流判定、进度行/摘要行构造 API;`dmMain.c` 接入 precheck/backup 阶段进度输出与最终摘要写入 | +| 2026-03-03 13:13 | T2.6 单测验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.*Progress*` 通过(3/3) | +| 2026-03-03 13:13 | T2.6 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-03 13:14 | T2.6 编译验证 | `cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 13:14 | T2.6 运行验证 | `taosd -o /tmp/taoslog -r ...` 退出码 `25`,仍在 precheck 阶段 fail-fast(启动流程未回归) | +| 2026-03-03 13:27 | T2.7 Red 阶段开始 | 已将 `task_plan.md` 中 `T2.7` 置为 `in_progress`,准备先新增会话恢复/续跑失败用例并验证失败 | +| 2026-03-03 13:29 | T2.7 Red 验证 | `cmake --build debug --target commonTest` 失败,报错 `tRepairTryResumeSession was not declared in this scope`,符合预期 | +| 2026-03-03 13:35 | T2.7 Green 实现 | `trepair.h/.c` 新增 `tRepairTryResumeSession()`(扫描 `repair-*` 会话、校验 `repair.state.json` 并回填续跑进度);`dmMain.c` 接入恢复入口与 `doneVnodes` 跳过续跑 | +| 2026-03-03 13:36 | T2.7 单测验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.TryResumeSession*` 通过(3/3) | +| 2026-03-03 13:38 | T2.7 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 13:38 | T2.7 运行验证 | `taosd -o /tmp/taoslog -r ... 
--backup-path /tmp/td-repair-nonexistent-backup-test` 退出码 `25`,仍在 precheck 阶段 fail-fast(启动流程未回归) | +| 2026-03-03 13:39 | T2.7 收尾 | 已将 `task_plan.md` 中 `T2.7` 更新为 `completed`,下一入口切换为 `T3.1` | +| 2026-03-03 13:51 | T3.1 Red 阶段开始 | 已将 `task_plan.md` 中 `T3.1` 置为 `in_progress`,准备先新增 WAL 调度判定与目标路径失败用例并验证失败 | +| 2026-03-03 21:58 | T3.1 Green 实现 | `trepair.h/.c` 新增 `tRepairNeedRunWalForceRepair()/tRepairBuildVnodeTargetPath()`;`dmMain.c` 接入 `force+wal` 调度(`walInit` + 每 vnode `walOpen/walClose`)与状态/日志/进度更新 | +| 2026-03-03 22:01 | T3.1 单测验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.NeedRunWalForceRepair:RepairOptionParseTest.BuildVnodeTargetPath` 通过(2/2) | +| 2026-03-03 22:02 | T3.1 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 22:02 | T3.1 运行验证 | `taosd -o /tmp/taoslog -r --node-type vnode --file-type wal --vnode-id 2 --mode force --backup-path /tmp/td-repair-nonexistent-backup` 退出码 `25`,仍按 precheck fail-fast(未引入回归) | +| 2026-03-03 22:03 | T3.1 收尾 | 已将 `task_plan.md` 中 `T3.1` 更新为 `completed`,下一入口切换为 `T3.2` | +| 2026-03-03 14:28 | T3.2 Red 阶段开始 | 已将 `task_plan.md` 中 `T3.2` 置为 `in_progress`,准备新增 WAL 备份与失败回滚保护单测并先验证失败 | +| 2026-03-03 14:30 | T3.2 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairBackupVnodeTarget/tRepairRollbackVnodeTarget was not declared`,符合先测后码预期 | +| 2026-03-03 14:32 | T3.2 Green 实现 | `trepair.h/.c` 新增 `tRepairBackupVnodeTarget()` 与 `tRepairRollbackVnodeTarget()`(目录递归备份/回滚);`dmMain.c` 在 `force+wal` 循环接入“先备份、失败回滚”与日志记录 | +| 2026-03-03 14:33 | T3.2 单测验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.BackupAndRollbackVnodeTarget:RepairOptionParseTest.BackupAndRollbackVnodeTargetInvalidArgs` 通过(2/2) | +| 2026-03-03 14:33 | T3.2 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake 
--build debug -j8 --target taosd` 通过 | +| 2026-03-03 14:34 | T3.2 运行验证 | `taosd -o /tmp/taoslog -r --node-type vnode --file-type wal --vnode-id 2 --mode force --backup-path /tmp/td-repair-nonexistent-backup` 输出 `failed repair precheck: Invalid parameters`,退出码 `25`,保持 precheck fail-fast | +| 2026-03-03 14:34 | T3.2 收尾 | 已将 `task_plan.md` 中 `T3.2` 更新为 `completed`,下一入口切换为 `T3.3` | +| 2026-03-03 14:47 | T3.3 Red 阶段开始 | 已将 `task_plan.md` 中 `T3.3` 置为 `in_progress`,准备新增 WAL 修复明细统计/输出单测并先验证失败 | +| 2026-03-03 14:50 | T3.3 Red 验证 | `cmake --build debug -j8 --target walTest` 失败,报错 `flexible array member 'SWalCont::body' not at end of 'struct SWal'`,符合“先失败再修复”预期 | +| 2026-03-03 14:53 | T3.3 Green 实现 | 修复 `SWal` 结构体字段顺序(保持 `writeHead` 为末尾字段);完成 `walGetRepairStats` + WAL 修复统计累计与 `dmMain.c` 的 `repair.log` 明细输出接入 | +| 2026-03-03 14:55 | T3.3 定向验证 | `cmake --build debug -j8 --target walTest` 通过;`./debug/build/bin/walTest --gtest_filter=WalKeepEnv.walGetRepairStatsInvalidArgs:WalKeepEnv.walRepairStatsTrackCorruptedSegmentAndIdxRebuild` 通过(2/2) | +| 2026-03-03 14:56 | T3.3 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R wal_test --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 14:56 | T3.3 收尾 | 已将 `task_plan.md` 中 `T3.3` 更新为 `completed`,下一入口切换为 `T3.4` | +| 2026-03-03 14:58 | T3.4 Red 阶段开始 | 已将 `task_plan.md` 中 `T3.4` 置为 `in_progress`,准备补充“仅 idx 损坏”自动化样例并先验证失败 | +| 2026-03-03 15:00 | T3.4 Red 验证 | 新增 `walRepairStatsTrackIdxOnlyCorruption` 用例失败:`stats.corruptedSegments` 实际为 `0`,未记录“仅 idx 损坏”区段 | +| 2026-03-03 15:00 | T3.4 构建环境处理 | 首次 Red 构建遇到 `ext_pcre2` update 外网失败;通过本地依赖与 stamp 方式消除非业务阻塞后继续测试 | +| 2026-03-03 15:02 | T3.4 Green 实现 | 在 `walCheckAndRepairIdxFile()` 进入 idx 修复路径时累计 `repairStats.corruptedSegments`,统一“损坏区段”统计口径 | +| 2026-03-03 15:03 | T3.4 定向验证 | `walTest` 定向用例通过:`walRepairStatsTrackIdxOnlyCorruption`、`walRepairStatsTrackCorruptedSegmentAndIdxRebuild`(2/2) | +| 2026-03-03 15:04 | T3.4 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest 
--test-dir debug -R wal_test --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 15:04 | T3.4 收尾 | 已将 `task_plan.md` 中 `T3.4` 更新为 `completed`,下一入口切换为 `T4.1` | +| 2026-03-03 15:05 | T4.1 Red 阶段开始 | 已将 `task_plan.md` 中 `T4.1` 置为 `in_progress`,准备先补 TSDB 扫描器失败用例并验证失败 | +| 2026-03-03 15:06 | T4.1 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `SRepairTsdbScanResult/tRepairScanTsdbFiles` 未声明,符合先测后码预期 | +| 2026-03-03 15:09 | T4.1 Green 实现 | `trepair.h/.c` 新增 `SRepairTsdbScanResult` 与 `tRepairScanTsdbFiles()`;实现递归扫描 `.head/.data/.sma/.stt` 统计,并在 `tRepairPrecheck()` 的 `fileType=tsdb` 分支接入完整性校验 | +| 2026-03-03 15:10 | T4.1 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.ScanTsdbFiles*` 通过(3/3) | +| 2026-03-03 15:11 | T4.1 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 15:20 | T4.1 收尾 | 已将 `task_plan.md` 中 `T4.1` 更新为 `completed`,下一入口切换为 `T4.2` | +| 2026-03-03 15:20 | T4.2 Red 阶段开始 | 已将 `task_plan.md` 中 `T4.2` 置为 `in_progress`,准备先补“可恢复块提取/损坏块定位”失败用例并验证失败 | +| 2026-03-03 15:22 | T4.2 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `SRepairTsdbBlockReport/tRepairAnalyzeTsdbBlocks` 未声明,符合先测后码预期 | +| 2026-03-03 15:27 | T4.2 Green 实现 | `trepair.h/.c` 新增 `SRepairTsdbBlockReport` 与 `tRepairAnalyzeTsdbBlocks()`;按 TSDB 子目录聚合块级统计(`total/recoverable/corrupted/unknown`)并输出损坏块路径列表 | +| 2026-03-03 15:28 | T4.2 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.AnalyzeTsdbBlocksReport*` 通过(3/3) | +| 2026-03-03 15:29 | T4.2 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 15:33 | T4.2 收尾 | 已将 `task_plan.md` 中 `T4.2` 更新为 `completed`,下一入口切换为 `T4.3` | +| 2026-03-03 15:33 | T4.3 Red 阶段开始 | 已将 `task_plan.md` 中 `T4.3` 置为 
`in_progress`,准备先补“保留有效块重建输出目录”失败用例并验证失败 | +| 2026-03-03 15:35 | T4.3 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairRebuildTsdbBlocks` 未声明,符合先测后码预期 | +| 2026-03-03 15:41 | T4.3 Green 实现 | `trepair.h/.c` 新增 `tRepairRebuildTsdbBlocks()`:按目录级块判定保留 `head+data` 可恢复块并重建输出目录;同步输出 `SRepairTsdbBlockReport` 汇总与损坏路径 | +| 2026-03-03 15:43 | T4.3 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.ScanTsdbFiles*:RepairOptionParseTest.AnalyzeTsdbBlocksReport*:RepairOptionParseTest.RebuildTsdbBlocks*` 通过(9/9) | +| 2026-03-03 15:44 | T4.3 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 15:46 | T4.3 收尾 | 已将 `task_plan.md` 中 `T4.3` 更新为 `completed`,下一入口切换为 `T4.4` | +| 2026-03-03 15:46 | T4.4 Red 阶段开始 | 已将 `task_plan.md` 中 `T4.4` 置为 `in_progress`,准备先定义“重建后启动/查询可用”最小验收用例并验证失败 | +| 2026-03-03 15:47 | T4.4 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairNeedRunTsdbForceRepair` 未声明,符合先测后码预期 | +| 2026-03-03 15:50 | T4.4 Green 实现 | `trepair.h/.c` 新增 `tRepairNeedRunTsdbForceRepair()`;`dmMain.c` 新增 `dmRunForceTsdbRepair()` 并接入 `dmRunRepairWorkflow()`(`force+tsdb` 分支),实现 `analyze -> rebuild -> 目录切换` 与失败回滚 | +| 2026-03-03 15:51 | T4.4 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.NeedRunWalForceRepair:RepairOptionParseTest.NeedRunTsdbForceRepair` 通过(2/2) | +| 2026-03-03 15:52 | T4.4 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 15:53 | T4.4 运行验证 | `ASAN_OPTIONS=detect_leaks=0 taosd -o /tmp/taoslog -r --node-type vnode --file-type tsdb --vnode-id 2 --mode force --backup-path /tmp/td-repair-nonexistent-backup` 退出码 `25`,输出 `failed repair precheck: Invalid parameters`,保持 precheck fail-fast | +| 2026-03-03 15:54 | T4.4 收尾 | 已将 
`task_plan.md` 中 `T4.4` 更新为 `completed`,下一入口切换为 `T4.5`(`in_progress`) | +| 2026-03-03 15:55 | T4.5 Red 阶段开始 | 已将 `task_plan.md` 中 `T4.5` 置为 `in_progress`,准备补 TSDB 场景系统测试脚本并先验证失败 | +| 2026-03-03 15:55 | T4.5 Red 验证 | `bash tests/ci/repair_tsdb_force.sh` 失败(脚本不存在,退出码 `127`),符合先测后码预期 | +| 2026-03-03 15:57 | T4.5 Green 实现 | 新增 `tests/ci/repair_tsdb_force.sh`:构造 `recoverable + corrupted` TSDB 样本,执行 `taosd -r --file-type tsdb --mode force` 并校验 `repair progress/summary`、目标目录重建结果、备份目录与状态文件 | +| 2026-03-03 15:58 | T4.5 定向验证 | `bash tests/ci/repair_tsdb_force.sh` 通过,输出 `tsdb force repair script passed (taosd exit code: 47)` | +| 2026-03-03 15:58 | T4.5 收尾 | 已将 `task_plan.md` 中 `T4.5` 更新为 `completed`,`P4` 标记为 `completed`,下一入口切换为 `T5.1`(`in_progress`) | +| 2026-03-03 15:59 | T5.1 阶段开始 | 已切换到 `force+meta`,准备先勘察 `meta` 解析与现有测试入口,定义首个 Red 用例 | +| 2026-03-03 16:00 | T5.1 上下文勘察 | 已定位 `metaOpen.c:metaGenerateNewMeta()` 与 `dmMain.c` 的 `generateNewMeta` 触发点,确认下一步应先补 `force+meta` 调度判定测试,再决定是否直接复用/包装 `metaGenerateNewMeta` | +| 2026-03-03 23:37 | T5.1 Red 阶段开始 | 已在 `commonTests.cpp` 新增 `ScanMetaFiles*` 与 `NeedRunMetaForceRepair` 测试,准备先验证接口缺失导致的编译失败 | +| 2026-03-03 23:38 | T5.1 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairScanMetaFiles/tRepairNeedRunMetaForceRepair` 未声明,符合先测后码预期 | +| 2026-03-03 23:42 | T5.1 Green 实现 | `trepair.h/.c` 新增 `SRepairMetaScanResult`、`tRepairScanMetaFiles()`、`tRepairNeedRunMetaForceRepair()`,并在 `tRepairPrecheck()` 接入 `fileType=meta` 校验;`dmMain.c` 新增 `dmRunForceMetaRepair()` 并接入 repair 工作流 | +| 2026-03-03 23:43 | T5.1 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter=RepairOptionParseTest.ScanMetaFiles*:RepairOptionParseTest.NeedRunMetaForceRepair` 通过(4/4) | +| 2026-03-03 23:44 | T5.1 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-03 23:46 | T5.1 运行验证 | 使用临时数据目录执行 `taosd -r --file-type meta --mode 
force`,输出 `step=meta` 进度与成功摘要,且备份目录包含 `table.db/schema.db/uid.idx/name.idx` 等元数据文件 | +| 2026-03-03 23:46 | T5.1 收尾 | 已将 `task_plan.md` 中 `T5.1` 更新为 `completed`,下一入口切换为 `T5.2`(`in_progress`) | +| 2026-03-04 00:05 | T5.2 Red 阶段开始 | 已在 `commonTests.cpp` 新增 `InferMetaFromWalTsdb*` 与 `PrecheckMetaFallbackToInferenceSuccess` 用例,先验证接口缺失导致的编译失败 | +| 2026-03-04 00:07 | T5.2 Green 实现 | `trepair.h/.c` 新增 `SRepairMetaInferenceReport` 与 `tRepairInferMetaFromWalTsdb()`;`tRepairPrecheck()` 在 `meta` 缺失场景回退推导;`dmMain.c` 的 `dmRunForceMetaRepair()` 增加推导兜底并写入 `meta infer detail` | +| 2026-03-04 00:12 | T5.2 定向验证 | `cmake --build debug -j8 --target commonTest` 通过;`ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.ScanMetaFiles*:RepairOptionParseTest.NeedRunMetaForceRepair:RepairOptionParseTest.InferMetaFromWalTsdb*:RepairOptionParseTest.PrecheckMetaFallbackToInferenceSuccess'` 通过(8/8) | +| 2026-03-04 00:13 | T5.2 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-04 00:15 | T5.2 Smoke 验证(meta 完整) | 最小样本验证通过:`step=meta` 进度与成功摘要命中,`repair.log` 包含 `meta scan detail`;`taosd` 退出码 `47`(流程级成功) | +| 2026-03-04 00:16 | T5.2 Smoke 验证(meta 缺文件+证据) | 最小样本验证通过:未出现 precheck fail-fast,`step=meta` 与成功摘要命中,`repair.log` 包含 `meta infer detail`;`taosd` 退出码 `47` | +| 2026-03-04 00:17 | T5.2 收尾 | 已将 `task_plan.md` 中 `T5.2` 更新为 `completed`,下一入口切换为 `T5.3`(`in_progress`) | +| 2026-03-04 00:22 | T5.3 Red 阶段开始 | 新增 `BuildMetaMissingFileMark*` 测试,先通过编译失败验证缺失接口 | +| 2026-03-04 00:24 | T5.3 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairBuildMetaMissingFileMark` 未声明,符合先测后码预期 | +| 2026-03-04 00:30 | T5.3 Green 实现 | `trepair.h/.c` 新增 `tRepairBuildMetaMissingFileMark()`;`dmMain.c` 新增 `meta missing marker` 与 `meta unrecoverable detail` 日志路径,并把新增逻辑拆分为独立 helper 函数,避免继续膨胀主流程函数 | +| 2026-03-04 00:32 | T5.3 定向验证 | `ASAN_OPTIONS=detect_leaks=0 
./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.BuildMetaMissingFileMark*:RepairOptionParseTest.ScanMetaFiles*:RepairOptionParseTest.NeedRunMetaForceRepair:RepairOptionParseTest.InferMetaFromWalTsdb*:RepairOptionParseTest.PrecheckMetaFallbackToInferenceSuccess'` 通过(10/10) | +| 2026-03-04 00:33 | T5.3 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-04 00:37 | T5.3 Smoke 验证(完整/可推导/不可推导) | 三场景通过:`meta` 完整(成功摘要)、`meta` 缺文件+wal 证据(`repair.log` 命中 `meta missing marker` + `meta infer detail`)、`meta` 缺文件无证据(输出 `meta unrecoverable detail` 并 precheck 失败,退出码 `25`) | +| 2026-03-04 00:38 | T5.3 收尾 | 已将 `task_plan.md` 中 `T5.3` 更新为 `completed`,下一入口切换为 `T5.4`(`in_progress`) | +| 2026-03-04 01:02 | T5.4 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairRebuildMetaFiles` 未声明,符合先测后码预期 | +| 2026-03-04 01:10 | T5.4 Green 实现 | `trepair.h/.c` 新增 `tRepairRebuildMetaFiles()`(拷贝现有 META 并补齐必需文件);`dmMain.c` 接入 `force+meta` 的 `rebuild -> rename` 切换、失败回滚与 `meta rebuild detail` 日志,并通过 helper 继续控制主流程函数体量 | +| 2026-03-04 01:18 | T5.4 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.RebuildMetaFiles*:RepairOptionParseTest.ScanMetaFiles*:RepairOptionParseTest.InferMetaFromWalTsdb*:RepairOptionParseTest.BuildMetaMissingFileMark*:RepairOptionParseTest.PrecheckMetaFallbackToInferenceSuccess'` 通过(11/11) | +| 2026-03-04 01:19 | T5.4 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-04 01:23 | T5.4 Smoke 验证(完整/可推导) | 两场景通过:完整场景与缺文件+wal 证据场景均命中 `step=meta` + 成功摘要(退出码 `47`),`repair.log` 命中 `meta rebuild detail`;可推导场景额外命中 `meta missing marker` + `meta infer detail` | +| 2026-03-04 01:24 | T5.4 收尾 | 已将 `task_plan.md` 中 `T5.4` 更新为 `completed`,下一入口切换为 `T5.5`(`in_progress`) | +| 2026-03-04 01:28 | T5.5 Red 验证 | 
`bash tests/ci/repair_meta_force.sh` 失败(脚本不存在,退出码 `127`),符合先测后码预期 | +| 2026-03-04 01:33 | T5.5 Green 实现 | 新增 `tests/ci/repair_meta_force.sh`,覆盖“部分损坏 + 完全损坏(均带 wal 证据)”双场景,校验 `step=meta` 进度/成功摘要、`meta missing marker`/`meta infer detail`/`meta rebuild detail` 日志以及必需文件补齐 | +| 2026-03-04 01:34 | T5.5 定向验证 | `bash tests/ci/repair_meta_force.sh` 通过:`meta-partial` 与 `meta-complete` 场景均成功(`taosd` 退出码 `47`) | +| 2026-03-04 01:35 | T5.5 收尾 | 已将 `task_plan.md` 中 `T5.5` 更新为 `completed`,`P5` 标记为 `completed`,下一入口切换为 `T6.1`(`in_progress`) | +| 2026-03-04 01:39 | T6.1 Red 验证 | 在 `commonTests.cpp` 新增 `NeedRunReplicaRepair*` 后执行 `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairNeedRunReplicaRepair` 未声明,符合先测后码预期 | +| 2026-03-04 01:45 | T6.1 Green 实现 | `trepair.h/.c` 新增 `tRepairNeedRunReplicaRepair()`;`dmMain.c` 新增 `dmRunReplicaRepair()` 并接入 `dmRunRepairWorkflow()`,实现 `mode=replica` 的显式分支调度、状态落盘与 `replica dispatch detail` 日志(stub) | +| 2026-03-04 01:47 | T6.1 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.NeedRunWalForceRepair:RepairOptionParseTest.NeedRunTsdbForceRepair:RepairOptionParseTest.NeedRunMetaForceRepair:RepairOptionParseTest.NeedRunReplicaRepair*'` 通过(5/5) | +| 2026-03-04 01:48 | T6.1 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-04 01:49 | T6.1 Smoke 验证(replica 分支) | `mode=replica` 最小样本验证通过:输出命中 `step=replica` 100% 进度与成功摘要,`repair.log` 命中 `replica dispatch detail`;`taosd` 退出码 `47` | +| 2026-03-04 01:50 | T6.1 收尾 | 已将 `task_plan.md` 中 `T6.1` 更新为 `completed`,下一入口切换为 `T6.2`(`in_progress`) | +| 2026-03-04 01:51 | T6.2 Red 验证 | `cmake --build debug -j8 --target commonTest` 失败,报错 `tRepairDegradeReplicaVnode was not declared in this scope`,符合先测后码预期 | +| 2026-03-04 01:54 | T6.2 Green 实现 | `trepair.h/.c` 新增 `tRepairDegradeReplicaVnode()`(本地坏副本降级 marker 原子落盘,含 
`availability/syncPolicy/versionPolicy/termPolicy`);`dmMain.c` 升级 `dmRunReplicaRepair()` 为逐 vnode 执行降级并写 `replica degrade detail` | +| 2026-03-04 01:55 | T6.2 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.NeedRunReplicaRepair*:RepairOptionParseTest.DegradeReplicaVnode*'` 通过(4/4) | +| 2026-03-04 01:57 | T6.2 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug -j8 --target taosd` 通过 | +| 2026-03-04 01:59 | T6.2 Smoke 验证(replica 降级) | 最小样本验证通过:`TAOS_DATA_DIR=/tmp/td-repair-replica-smoke-data` 场景下输出命中 `step=replica` + 成功摘要,`repair.log` 命中 `replica dispatch detail` 与 `replica degrade detail`,且落盘 `vnode/vnode2/replica.degrade.marker.json`(`taosd` 退出码 `47`) | +| 2026-03-04 01:59 | T6.2 收尾 | 已将 `task_plan.md` 中 `T6.2` 更新为 `completed`,下一入口切换为 `T6.3`(`in_progress`) | +| 2026-03-04 01:00 | T6.4 Smoke 验证(失败回滚) | 构造 `vnode3/replica.degrade.marker.json` 为目录触发第 2 个 vnode 降级失败;验证 `taosd` 退出码 `25`、`repair.log` 命中 `replica rollback detail`、`vnode2` 的 marker 已被回滚删除、hint 未生成 | +| 2026-03-04 01:01 | T6.4 回归验证 | `commonTest` 定向用例(`NeedRunReplicaRepair* + DegradeReplicaVnode* + WriteReplicaRestoreHint* + RollbackReplicaVnode*`)通过(8/8);`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-04 01:01 | T6.4 构建阻塞处理 | `cmake --build debug --target commonTest taosd` 再次触发 `ext_pcre2` 外网更新失败(`Couldn't connect to server`);切换为直接运行已编译二进制完成验证 | +| 2026-03-04 01:02 | T6.3 Smoke 验证(restore/vgroup 联动) | 最小样本验证通过:`taosd` 退出码 `47`,`repair.log` 命中 `replica dispatch/degrade/restore detail`,marker 与 `replica.restore.hint.json` 均存在,`restoreDnodeImpl=community-stub` | +| 2026-03-04 01:03 | T6.4 注入验证(hint 后失败) | 进行 20 轮“hint 生成后再注入失败”尝试未稳定命中,记录为可观测性风险,不阻塞当前功能交付 | +| 2026-03-04 01:03 | T6.3/T6.4 收尾 | 已将 `task_plan.md` 中 `T6.3`、`T6.4` 更新为 `completed`,`P6` 标记为 `completed`,下一入口切换为 `T7.1`(`pending`) | +| 2026-03-04 01:04 | T7.1 阶段开始 | 已将 
`task_plan.md` 中 `T7.1` 置为 `in_progress`,准备先补 `--replica-node` 合法性校验 Red 用例(格式与目标路径约束) | +| 2026-03-04 01:05 | T7.1 Red 验证 | 新增 `ValidateCliArgsReplicaNodeEndpointFormat` 用例后运行定向 gtest,按预期失败:非法 endpoint 被错误判定为合法(`tRepairValidateCliArgs` 返回 success) | +| 2026-03-04 01:06 | T7.1 Green 实现 | `trepair.c` 新增 `tRepairValidateReplicaNodeEndpoint()`,在 `mode=copy` 分支强制校验 `:`、单冒号、无空白 | +| 2026-03-04 01:06 | T7.1 单测回归 | `cmake --build debug --target commonTest` 通过;`commonTest --gtest_filter='RepairOptionParseTest.ValidateCliArgsReplicaNodeRule:RepairOptionParseTest.ValidateCliArgsReplicaNodeEndpointFormat'` 通过(2/2) | +| 2026-03-04 01:08 | T7.1 构建与全回归 | `cmake --build debug --target taosd` 首次因 `ext_pcre2` 外网更新失败,升权重试后通过;`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-04 01:09 | T7.1 运行验证(copy 参数合法性) | `mode=copy` + 非法 `replica-node=192.168.1.24:var/lib/taos` 退出码 `25`(参数组合拒绝);合法 `replica-node=192.168.1.24:/var/lib/taos` 退出码 `47` 并进入修复流程摘要 | +| 2026-03-04 01:09 | T7.1 收尾 | 已将 `task_plan.md` 中 `T7.1` 更新为 `completed`,下一入口切换为 `T7.2`(`pending`) | +| 2026-03-04 01:10 | T7.2 阶段开始 | 已将 `task_plan.md` 中 `T7.2` 置为 `in_progress`,准备先补“replica endpoint 解析 + 本地 mock 拷贝接口”Red 用例 | +| 2026-03-04 01:10 | T7.2 Red 验证 | 新增 `ParseReplicaNodeEndpoint/MockCopyReplicaVnodeTarget*` 测试后编译失败,报错 `tRepairParseReplicaNodeEndpoint/tRepairMockCopyReplicaVnodeTarget` 未声明,符合先测后码预期 | +| 2026-03-04 01:11 | T7.2 Green 实现 | `trepair.h/.c` 新增 endpoint 解析与本地 mock 拷贝接口:`tRepairParseReplicaNodeEndpoint()`、`tRepairMockCopyReplicaVnodeTarget()` | +| 2026-03-04 01:11 | T7.2 定向验证 | `commonTest` 定向用例通过(`ParseReplicaNodeEndpoint + MockCopyReplicaVnodeTarget*` 共 3 条) | +| 2026-03-04 01:12 | T7.2 构建与回归 | `cmake --build debug --target taosd` 两次触发 `ext_pcre2` 外网更新失败,升权重重试后通过;`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过 | +| 2026-03-04 01:12 | T7.2 收尾 | 已将 `task_plan.md` 中 `T7.2` 更新为 `completed`,下一入口切换为 `T7.3`(`pending`) | +| 
2026-03-04 01:24 | T7.3 阶段开始 | 已将 `task_plan.md` 中 `T7.3` 置为 `in_progress`,准备先补 copy 模式调度与 SSH/SCP 命令构造的 Red 用例 | +| 2026-03-04 01:25 | T7.3 Red 验证 | 新增 `NeedRunCopyRepair*` 与 `BuildCopySshScpCommands*` 测试后编译失败,报错 `tRepairNeedRunCopyRepair/tRepairBuildCopySshProbeCmd/tRepairBuildCopyScpCmd` 未声明,符合先测后码预期 | +| 2026-03-04 01:28 | T7.3 Green 实现 | `trepair.h/.c` 新增 copy 调度判定、SSH/SCP 命令构造与执行能力:`tRepairNeedRunCopyRepair()`、`tRepairBuildCopySshProbeCmd()`、`tRepairBuildCopyScpCmd()`、`tRepairSshScpCopyReplicaVnodeTarget()`;`dmMain.c` 新增 `dmRunCopyRepair()` 并接入 `dmRunRepairWorkflow()` | +| 2026-03-04 01:29 | T7.3 定向验证 | `commonTest` 定向用例通过:`NeedRunCopyRepair`、`NeedRunCopyRepairInvalidArgs`、`BuildCopySshScpCommands`、`BuildCopySshScpCommandsInvalidArgs`(4/4) | +| 2026-03-04 01:31 | T7.3 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug --target taosd` 通过 | +| 2026-03-04 01:31 | T7.3 Smoke 验证(copy SSH/SCP) | 使用 `TAOS_REPAIR_SSH_BIN/TAOS_REPAIR_SCP_BIN` 注入本地 mock 命令完成端到端验证:`taosd` 退出码 `47`,本地 `wal/meta` 文件被远端内容覆盖、陈旧文件被清理,`repair.log` 命中 `copy dispatch detail` 与 `copy replica detail` | +| 2026-03-04 01:31 | T7.3 收尾 | 已将 `task_plan.md` 中 `T7.3` 更新为 `completed`,下一入口切换为 `T7.4`(`pending`) | +| 2026-03-04 01:33 | T7.4 阶段开始 | 已将 `task_plan.md` 中 `T7.4` 置为 `in_progress`,准备先补“copy 后 owner/权限修复”Red 用例 | +| 2026-03-04 02:04 | T7.4 Red 验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission'` 失败:`localMeta=0 0 755` 与 `remoteMeta=0 0 700` 不一致,符合先测后码预期 | +| 2026-03-04 02:09 | T7.4 Green 实现 | `trepair.c` 在 `tRepairSshScpCopyReplicaVnodeTarget()` 接入 owner/权限修复:新增远端 `ssh stat` 命令构造、`uid/gid/mode` 解析与本地 `chown -R + chmod` 命令构造,在 `scp` 成功后执行修复 | +| 2026-03-04 02:10 | T7.4 定向验证 | `ASAN_OPTIONS=detect_leaks=0 ./debug/build/bin/commonTest --gtest_filter='RepairOptionParseTest.SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission'` 通过(1/1) | 
+| 2026-03-04 02:12 | T7.4 回归验证 | copy 相关定向测试通过(`NeedRunCopyRepair* + BuildCopySshScpCommands* + SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission` 共 5/5);`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug --target taosd` 通过 | +| 2026-03-04 02:13 | T7.4 收尾 | 已将 `task_plan.md` 中 `T7.4` 更新为 `completed`,下一入口切换为 `T7.5`(`in_progress`) | +| 2026-03-04 02:13 | T7.5 阶段开始 | 已将 `task_plan.md` 中 `T7.5` 置为 `in_progress`,准备先补 copy 一致性校验与异常中断处理的 Red 用例 | +| 2026-03-04 02:20 | T7.5 Red 验证 | 新增 `SshScpCopyReplicaVnodeTargetDetectsConsistencyMismatch` 后定向运行失败:`tRepairSshScpCopyReplicaVnodeTarget()` 返回 success,未识别“scp 成功但少拷文件”的不一致场景,符合先测后码预期 | +| 2026-03-04 02:30 | T7.5 Green 实现 | `trepair.c` 新增 copy 后一致性校验(远端/本地 `find+sort+md5sum` 摘要比对)并接入 `tRepairSshScpCopyReplicaVnodeTarget()`;`dmMain.c` 的 `dmRunCopyRepair()` 新增每 vnode 备份、copy 失败回滚与 `copy rollback detail` 日志 | +| 2026-03-04 02:33 | T7.5 兼容修复 | 修复一致性命令的 shell 展开问题:将摘要提取从 `awk '{print $1}'` 调整为 `cut -d ' ' -f1`,避免远端命令中的 `$1` 被本地 shell 提前展开导致误判 | +| 2026-03-04 02:38 | T7.5 定向验证 | copy 定向测试通过(`NeedRunCopyRepair* + BuildCopySshScpCommands* + SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission + SshScpCopyReplicaVnodeTargetDetectsConsistencyMismatch` 共 6/6) | +| 2026-03-04 02:39 | T7.5 回归验证 | `ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug --target taosd` 通过 | +| 2026-03-04 02:41 | T7.5 Smoke 验证(copy 失败回滚) | 使用 mock `scp`(复制后返回非 0)运行 `taosd -r --mode copy`:进程退出码 `25`,`repair.log` 命中 `copy rollback detail`,本地 `vnode2/wal/stale.log` 内容成功回滚保留 | +| 2026-03-04 02:42 | T7.5 收尾 | 已将 `task_plan.md` 中 `T7.5` 更新为 `completed`,`P7` 标记为 `completed`,下一入口切换为 `T8.1`(`pending`) | +| 2026-03-04 02:48 | T8.1 阶段开始 | 已将 `task_plan.md` 中 `T8.1` 置为 `in_progress`,准备先对“损坏数据生成器脚本”执行 Red 验证(脚本缺失失败) | +| 2026-03-04 02:49 | T8.1 Red 验证 | 执行 `bash tests/ci/repair_fixture_generator.sh` 失败(脚本不存在,退出码 `127`),符合先测后码预期 | +| 2026-03-04 02:53 | T8.1 
Green 实现 | 新增 `tests/ci/repair_fixture_generator.sh`,支持 `--output-dir/--type/--vnode-id/--clean`,自动生成 `wal-force-corrupted`、`tsdb-force-mixed`、`meta-force-partial/meta-force-complete` 样本与 `manifest.txt` | +| 2026-03-04 02:54 | T8.1 定向验证 | 运行 `repair_fixture_generator.sh --type all` 后校验 WAL/META/TSDB 关键样本文件与 manifest 条目全部存在;运行 `--type wal --vnode-id 9` 验证单类型输出隔离通过 | +| 2026-03-04 02:55 | T8.1 收尾 | 已将 `task_plan.md` 中 `T8.1` 更新为 `completed`,`P8` 标记为 `in_progress`,下一入口切换为 `T8.2`(`in_progress`) | +| 2026-03-04 02:55 | T8.2 阶段开始 | 已将 `task_plan.md` 中 `T8.2` 置为 `in_progress`,准备先补“三模式系统测试矩阵脚本”Red 验证 | +| 2026-03-04 02:57 | T8.2 Red 验证 | 执行 `bash tests/ci/repair_mode_matrix.sh` 失败(脚本不存在,退出码 `127`),符合先测后码预期 | +| 2026-03-04 03:02 | T8.2 Green 实现 | 新增 `tests/ci/repair_mode_matrix.sh`,串联 `force(tsdb/meta)`、`replica`、`copy` 三模式验收:检查进度/摘要输出、会话日志关键字与 copy 一致性标识 | +| 2026-03-04 03:03 | T8.2 定向验证 | 执行 `bash tests/ci/repair_mode_matrix.sh` 通过:`tsdb/meta` force 脚本、replica 场景、copy 场景全部通过并输出 `repair mode matrix script passed` | +| 2026-03-04 03:03 | T8.2 收尾 | 已将 `task_plan.md` 中 `T8.2` 更新为 `completed`,下一入口切换为 `T8.3`(`in_progress`) | +| 2026-03-04 03:03 | T8.3 阶段开始 | 已将 `task_plan.md` 中 `T8.3` 置为 `in_progress`,准备更新中英文文档与运维示例 | +| 2026-03-04 03:08 | T8.3 文档实现 | 更新 `docs/zh/08-operation/05-maintenance.md` 与 `docs/en/08-operation/04-maintenance.md`,新增 `taosd -r` 文件级修复章节,覆盖 `force/replica/copy` 示例、日志验收与注意事项 | +| 2026-03-04 03:09 | T8.3 定向验证 | 通过 `rg` 校验中英文章节与关键命令(`taosd -r`、`repair progress`、`repair.state.json`)均已入文,章节结构与示例渲染正常 | +| 2026-03-04 03:10 | T8.3 收尾 | 已将 `task_plan.md` 中 `T8.3` 更新为 `completed`,下一入口切换为 `T8.4`(`in_progress`) | +| 2026-03-04 03:10 | T8.4 阶段开始 | 已将 `task_plan.md` 中 `T8.4` 置为 `in_progress`,准备执行发布前回归并输出风险清单签出 | +| 2026-03-04 03:14 | T8.4 发布前回归 | `bash tests/ci/repair_mode_matrix.sh` 通过(force tsdb/meta + replica + copy 全部通过);`ASAN_OPTIONS=detect_leaks=0 ctest --test-dir debug -R commonTest --output-on-failure` 通过;`cmake --build debug --target taosd` 通过 | +| 2026-03-04 
03:16 | T8.4 风险清单签出 | 新增 `docs/plans/2026-03-04-data-repair-release-checklist.md`,记录发布 gate 结果、执行命令、残余风险与缓解建议 | +| 2026-03-04 03:16 | T8.4 收尾 | 已将 `task_plan.md` 中 `T8.4` 更新为 `completed`,`P8` 标记为 `completed`,当前任务集全部完成 | + +## 已落盘文档 +- `task_plan.md` +- `findings.md` +- `progress.md` +- `docs/plans/2026-03-03-data-repair-tool-design.md` +- `docs/plans/2026-03-03-data-repair-tool-implementation.md` + +## 下一次恢复建议命令 +```bash +git status --short +sed -n '1,220p' task_plan.md +sed -n '1,220p' findings.md +sed -n '1,220p' progress.md +sed -n '1,260p' docs/plans/2026-03-03-data-repair-tool-implementation.md +``` + +## 风险提示 +- `copy` 模式需要远端连接能力,可能涉及平台依赖与安全策略。 +- `replica` 模式若直接复用现有恢复逻辑,社区版与企业版能力差异需尽早收敛。 +- TSDB/META 修复难度显著高于 WAL,建议先交付可运行 MVP(`force+wal`)建立反馈回路。 diff --git a/source/common/src/trepair.c b/source/common/src/trepair.c new file mode 100644 index 000000000000..b5fa6fffb84f --- /dev/null +++ b/source/common/src/trepair.c @@ -0,0 +1,3141 @@ +/* + * Copyright (c) 2019 TAOS Data, Inc. + * + * This program is free software: you can use, redistribute, and/or modify + * it under the terms of the GNU Affero General Public License, version 3 + * or later ("AGPL"), as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +#define _DEFAULT_SOURCE +#include "trepair.h" + +#include +#include +#include +#include + +#include "tjson.h" +#include "tutil.h" + +typedef struct { + const char *name; + int32_t value; +} SRepairOptionPair; + +static const SRepairOptionPair kNodeTypeMap[] = { + {"vnode", REPAIR_NODE_TYPE_VNODE}, + {"mnode", REPAIR_NODE_TYPE_MNODE}, + {"dnode", REPAIR_NODE_TYPE_DNODE}, + {"snode", REPAIR_NODE_TYPE_SNODE}, +}; + +static const SRepairOptionPair kFileTypeMap[] = { + {"wal", REPAIR_FILE_TYPE_WAL}, + {"tsdb", REPAIR_FILE_TYPE_TSDB}, + {"meta", REPAIR_FILE_TYPE_META}, + {"tdb", REPAIR_FILE_TYPE_META}, // backward-compatible alias + {"data", REPAIR_FILE_TYPE_DATA}, + {"config", REPAIR_FILE_TYPE_CONFIG}, + {"checkpoint", REPAIR_FILE_TYPE_CHECKPOINT}, +}; + +static const SRepairOptionPair kModeMap[] = { + {"force", REPAIR_MODE_FORCE}, + {"replica", REPAIR_MODE_REPLICA}, + {"copy", REPAIR_MODE_COPY}, +}; + +static const char *kMetaRequiredFiles[] = { + "table.db", + "schema.db", + "uid.idx", + "name.idx", +}; + +static const char *kMetaOptionalIndexFiles[] = { + "ctb.idx", + "suid.idx", + "tag.idx", + "sma.idx", + "ctime.idx", + "ncol.idx", + "stream.task.db", +}; + +#define REPAIR_SESSION_DIR_PREFIX "repair-" +#define REPAIR_SESSION_LOG_NAME "repair.log" +#define REPAIR_SESSION_STATE_NAME "repair.state.json" +#define REPAIR_MAX_STATE_FILE_SIZE (1024 * 1024) +#define REPAIR_COPY_SSH_BIN_ENV "TAOS_REPAIR_SSH_BIN" +#define REPAIR_COPY_SCP_BIN_ENV "TAOS_REPAIR_SCP_BIN" +#define REPAIR_COPY_DEFAULT_SSH "ssh" +#define REPAIR_COPY_DEFAULT_SCP "scp" +#define REPAIR_COPY_CMD_EXIT_MARKER "__TD_REPAIR_COPY_EXIT__=" +#define REPAIR_RESUME_STEP_LEN 32 + +typedef struct { + bool found; + int64_t startTimeMs; + int32_t doneVnodes; + int32_t totalVnodes; + char step[REPAIR_RESUME_STEP_LEN]; + char sessionId[REPAIR_SESSION_ID_LEN]; + char sessionDir[PATH_MAX]; + char logPath[PATH_MAX]; + char statePath[PATH_MAX]; +} SRepairResumeCandidate; + +static int32_t tRepairParseOption(const 
char *input, const SRepairOptionPair *pMap, int32_t mapSize, int32_t *pValue) { + if (input == NULL || pMap == NULL || pValue == NULL || mapSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + for (int32_t i = 0; i < mapSize; ++i) { + if (taosStrcasecmp(input, pMap[i].name) == 0) { + *pValue = pMap[i].value; + return TSDB_CODE_SUCCESS; + } + } + + return TSDB_CODE_INVALID_PARA; +} + +static int32_t tRepairParseStringOption(const char *input, char *output, int32_t outputSize) { + if (input == NULL || output == NULL || outputSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = strlen(input); + if (len <= 0 || len >= outputSize) { + return TSDB_CODE_INVALID_PARA; + } + + tstrncpy(output, input, outputSize); + return TSDB_CODE_SUCCESS; +} + +static bool tRepairValidateShellHostTokenN(const char *token, int32_t tokenLen) { + if (token == NULL || tokenLen <= 0) { + return false; + } + + if (token[0] == '-') { + return false; + } + + for (int32_t i = 0; i < tokenLen; ++i) { + char c = token[i]; + if (isalnum((uint8_t)c) || c == '.' || c == '-' || c == '_' || c == '@') { + continue; + } + return false; + } + + return true; +} + +static bool tRepairValidateShellPathTokenN(const char *token, int32_t tokenLen) { + if (token == NULL || tokenLen <= 0) { + return false; + } + + for (int32_t i = 0; i < tokenLen; ++i) { + char c = token[i]; + if (isspace((uint8_t)c) || iscntrl((uint8_t)c)) { + return false; + } + + switch (c) { + case '\'': + case '"': + case '`': + case '$': + case ';': + case '|': + case '&': + case '<': + case '>': + case '(': + case ')': + case '\\': + return false; + default: + break; + } + } + + return true; +} + +static bool tRepairValidateShellHostToken(const char *token) { + return token != NULL ? tRepairValidateShellHostTokenN(token, strlen(token)) : false; +} + +static bool tRepairValidateShellPathToken(const char *token) { + return token != NULL ? 
tRepairValidateShellPathTokenN(token, strlen(token)) : false; +} + +static bool tRepairValidateReplicaNodeEndpoint(const char *endpoint) { + if (endpoint == NULL || endpoint[0] == '\0') { + return false; + } + + for (const char *p = endpoint; *p != '\0'; ++p) { + if (isspace((uint8_t)(*p))) { + return false; + } + } + + const char *sep = strchr(endpoint, ':'); + if (sep == NULL || sep == endpoint || sep[1] == '\0') { + return false; + } + + if (strchr(sep + 1, ':') != NULL) { + return false; + } + + if (sep[1] != '/') { + return false; + } + + int32_t hostLen = (int32_t)(sep - endpoint); + int32_t pathLen = strlen(sep + 1); + if (!tRepairValidateShellHostTokenN(endpoint, hostLen) || !tRepairValidateShellPathTokenN(sep + 1, pathLen)) { + return false; + } + + return true; +} + +static bool tRepairIsFileTypeCompatible(ERepairNodeType nodeType, ERepairFileType fileType) { + switch (nodeType) { + case REPAIR_NODE_TYPE_VNODE: + return fileType == REPAIR_FILE_TYPE_WAL || fileType == REPAIR_FILE_TYPE_TSDB || + fileType == REPAIR_FILE_TYPE_META; + case REPAIR_NODE_TYPE_MNODE: + return fileType == REPAIR_FILE_TYPE_WAL || fileType == REPAIR_FILE_TYPE_DATA; + case REPAIR_NODE_TYPE_DNODE: + return fileType == REPAIR_FILE_TYPE_CONFIG; + case REPAIR_NODE_TYPE_SNODE: + return fileType == REPAIR_FILE_TYPE_CHECKPOINT; + default: + return false; + } +} + +static char *tRepairTrimSpace(char *str) { + while (*str != '\0' && isspace((unsigned char)*str)) { + ++str; + } + + char *end = str + strlen(str); + while (end > str && isspace((unsigned char)*(end - 1))) { + --end; + } + *end = '\0'; + + return str; +} + +static int32_t tRepairAppendVnodeId(SRepairCtx *pCtx, int32_t vnodeId) { + if (pCtx->vnodeIdNum >= REPAIR_MAX_VNODE_IDS) { + return TSDB_CODE_INVALID_PARA; + } + + for (int32_t i = 0; i < pCtx->vnodeIdNum; ++i) { + if (pCtx->vnodeIds[i] == vnodeId) { + return TSDB_CODE_INVALID_PARA; + } + } + + pCtx->vnodeIds[pCtx->vnodeIdNum++] = vnodeId; + return TSDB_CODE_SUCCESS; +} + 
+static int32_t tRepairParseVnodeIdList(char *vnodeIdList, SRepairCtx *pCtx) { + if (vnodeIdList == NULL || pCtx == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + char *savePtr = NULL; + for (char *token = strtok_r(vnodeIdList, ",", &savePtr); token != NULL; token = strtok_r(NULL, ",", &savePtr)) { + char *trimmed = tRepairTrimSpace(token); + if (*trimmed == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + errno = 0; + char *endPtr = NULL; + int32_t parsed = taosStr2Int32(trimmed, &endPtr, 10); + if (errno != 0 || endPtr == NULL || endPtr == trimmed || *endPtr != '\0' || parsed < 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t code = tRepairAppendVnodeId(pCtx, parsed); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } + + return pCtx->vnodeIdNum > 0 ? TSDB_CODE_SUCCESS : TSDB_CODE_INVALID_PARA; +} + +static const char *tRepairGetVnodeFileSubDir(ERepairFileType fileType) { + switch (fileType) { + case REPAIR_FILE_TYPE_WAL: + return "wal"; + case REPAIR_FILE_TYPE_TSDB: + return "tsdb"; + case REPAIR_FILE_TYPE_META: + return "meta"; + default: + return NULL; + } +} + +static const char *tRepairGetNodeTypeName(ERepairNodeType nodeType) { + switch (nodeType) { + case REPAIR_NODE_TYPE_VNODE: + return "vnode"; + case REPAIR_NODE_TYPE_MNODE: + return "mnode"; + case REPAIR_NODE_TYPE_DNODE: + return "dnode"; + case REPAIR_NODE_TYPE_SNODE: + return "snode"; + default: + return "invalid"; + } +} + +static const char *tRepairGetFileTypeName(ERepairFileType fileType) { + switch (fileType) { + case REPAIR_FILE_TYPE_WAL: + return "wal"; + case REPAIR_FILE_TYPE_TSDB: + return "tsdb"; + case REPAIR_FILE_TYPE_META: + return "meta"; + case REPAIR_FILE_TYPE_DATA: + return "data"; + case REPAIR_FILE_TYPE_CONFIG: + return "config"; + case REPAIR_FILE_TYPE_CHECKPOINT: + return "checkpoint"; + default: + return "invalid"; + } +} + +static const char *tRepairGetModeName(ERepairMode mode) { + switch (mode) { + case REPAIR_MODE_FORCE: + return "force"; + case 
REPAIR_MODE_REPLICA: + return "replica"; + case REPAIR_MODE_COPY: + return "copy"; + default: + return "invalid"; + } +} + +static const char *tRepairResolveCopyBin(const char *envName, const char *defaultBin) { + if (envName == NULL || envName[0] == '\0' || defaultBin == NULL || defaultBin[0] == '\0') { + return NULL; + } + + const char *fromEnv = getenv(envName); + if (fromEnv != NULL && fromEnv[0] != '\0') { + return fromEnv; + } + + return defaultBin; +} + +static void tRepairTrimLineEnding(char *line) { + if (line == NULL) { + return; + } + + int32_t len = strlen(line); + while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) { + line[len - 1] = '\0'; + --len; + } +} + +static int32_t tRepairRunShellCommand(const char *cmd, char *lastOutput, int32_t lastOutputSize) { + if (cmd == NULL || cmd[0] == '\0' || (lastOutput != NULL && lastOutputSize <= 0) || + (lastOutput == NULL && lastOutputSize > 0)) { + return TSDB_CODE_INVALID_PARA; + } + + if (lastOutput != NULL) { + lastOutput[0] = '\0'; + } + + char wrappedCmd[PATH_MAX * 8] = {0}; + int32_t wrappedLen = + tsnprintf(wrappedCmd, sizeof(wrappedCmd), "%s 2>&1; printf '\\n%s%%d\\n' $?", cmd, REPAIR_COPY_CMD_EXIT_MARKER); + if (wrappedLen <= 0 || wrappedLen >= (int32_t)sizeof(wrappedCmd)) { + return TSDB_CODE_INVALID_PARA; + } + + TdCmdPtr pCmd = taosOpenCmd(wrappedCmd); + if (pCmd == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_FAILED; + } + + int32_t code = TSDB_CODE_SUCCESS; + int32_t exitCode = INT32_MIN; + char line[1024] = {0}; + int32_t markerLen = strlen(REPAIR_COPY_CMD_EXIT_MARKER); + while (true) { + int64_t nread = taosGetsCmd(pCmd, sizeof(line), line); + if (nread == 0) { + break; + } + if (nread < 0) { + code = terrno != 0 ? 
terrno : TSDB_CODE_FAILED; + break; + } + + tRepairTrimLineEnding(line); + if (strncmp(line, REPAIR_COPY_CMD_EXIT_MARKER, markerLen) == 0) { + errno = 0; + char *end = NULL; + int32_t parsed = taosStr2Int32(line + markerLen, &end, 10); + if (errno == 0 && end != NULL && end != line + markerLen && *end == '\0') { + exitCode = parsed; + } else { + code = TSDB_CODE_FAILED; + } + continue; + } + + if (lastOutput != NULL && line[0] != '\0') { + tstrncpy(lastOutput, line, lastOutputSize); + } + } + + taosCloseCmd(&pCmd); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (exitCode != 0) { + return TSDB_CODE_FAILED; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildVnodePath(const char *dataDir, int32_t vnodeId, const char *subDir, char *path, + int32_t pathSize) { + if (dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || path == NULL || pathSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = 0; + if (subDir == NULL || subDir[0] == '\0') { + len = tsnprintf(path, pathSize, "%s%svnode%svnode%d", dataDir, TD_DIRSEP, TD_DIRSEP, vnodeId); + } else { + len = tsnprintf(path, pathSize, "%s%svnode%svnode%d%s%s", dataDir, TD_DIRSEP, TD_DIRSEP, vnodeId, TD_DIRSEP, + subDir); + } + + if (len <= 0 || len >= pathSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildBackupBaseDir(const SRepairCtx *pCtx, const char *dataDir, char *backupBaseDir, + int32_t backupBaseDirSize) { + if (pCtx == NULL || dataDir == NULL || dataDir[0] == '\0' || backupBaseDir == NULL || backupBaseDirSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->hasBackupPath) { + return tRepairParseStringOption(pCtx->backupPath, backupBaseDir, backupBaseDirSize); + } + + int32_t len = tsnprintf(backupBaseDir, backupBaseDirSize, "%s%sbackup", dataDir, TD_DIRSEP); + if (len <= 0 || len >= backupBaseDirSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t 
tRepairBuildBackupDir(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *backupDir, + int32_t backupDirSize) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || backupDir == NULL || + backupDirSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->sessionId[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + const char *fileSubDir = tRepairGetVnodeFileSubDir(pCtx->fileType); + if (fileSubDir == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + char backupBaseDir[PATH_MAX] = {0}; + code = tRepairBuildBackupBaseDir(pCtx, dataDir, backupBaseDir, sizeof(backupBaseDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + int32_t len = tsnprintf(backupDir, backupDirSize, "%s%s%s%svnode%d%s%s", backupBaseDir, TD_DIRSEP, pCtx->sessionId, + TD_DIRSEP, vnodeId, TD_DIRSEP, fileSubDir); + if (len <= 0 || len >= backupDirSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildSessionDir(const SRepairCtx *pCtx, const char *dataDir, char *sessionDir, + int32_t sessionDirSize) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || sessionDir == NULL || + sessionDirSize <= 0 || pCtx->sessionId[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + char backupBaseDir[PATH_MAX] = {0}; + int32_t code = tRepairBuildBackupBaseDir(pCtx, dataDir, backupBaseDir, sizeof(backupBaseDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + int32_t len = tsnprintf(sessionDir, sessionDirSize, "%s%s%s", backupBaseDir, TD_DIRSEP, pCtx->sessionId); + if (len <= 0 || len >= sessionDirSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; 
+} + +static int32_t tRepairBuildSessionFilePath(const char *sessionDir, const char *fileName, char *filePath, + int32_t filePathSize) { + if (sessionDir == NULL || sessionDir[0] == '\0' || fileName == NULL || fileName[0] == '\0' || filePath == NULL || + filePathSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = tsnprintf(filePath, filePathSize, "%s%s%s", sessionDir, TD_DIRSEP, fileName); + if (len <= 0 || len >= filePathSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildPathWithEntry(const char *basePath, const char *entryName, char *outPath, int32_t outPathSize) { + if (basePath == NULL || basePath[0] == '\0' || entryName == NULL || entryName[0] == '\0' || outPath == NULL || + outPathSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = tsnprintf(outPath, outPathSize, "%s%s%s", basePath, TD_DIRSEP, entryName); + if (len <= 0 || len >= outPathSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static bool tRepairStringEndsWithIgnoreCase(const char *str, const char *suffix) { + if (str == NULL || suffix == NULL) { + return false; + } + + size_t strLen = strlen(str); + size_t suffixLen = strlen(suffix); + if (suffixLen <= 0 || suffixLen > strLen) { + return false; + } + + return taosStrcasecmp(str + strLen - suffixLen, suffix) == 0; +} + +typedef enum { + REPAIR_TSDB_FILE_KIND_UNKNOWN = 0, + REPAIR_TSDB_FILE_KIND_HEAD, + REPAIR_TSDB_FILE_KIND_DATA, + REPAIR_TSDB_FILE_KIND_SMA, + REPAIR_TSDB_FILE_KIND_STT, +} ERepairTsdbFileKind; + +static ERepairTsdbFileKind tRepairClassifyTsdbFile(const char *fileName) { + if (fileName == NULL || fileName[0] == '\0') { + return REPAIR_TSDB_FILE_KIND_UNKNOWN; + } + + if (tRepairStringEndsWithIgnoreCase(fileName, ".head")) { + return REPAIR_TSDB_FILE_KIND_HEAD; + } + if (tRepairStringEndsWithIgnoreCase(fileName, ".data")) { + return REPAIR_TSDB_FILE_KIND_DATA; + } + if (tRepairStringEndsWithIgnoreCase(fileName, 
".sma")) { + return REPAIR_TSDB_FILE_KIND_SMA; + } + if (tRepairStringEndsWithIgnoreCase(fileName, ".stt")) { + return REPAIR_TSDB_FILE_KIND_STT; + } + + return REPAIR_TSDB_FILE_KIND_UNKNOWN; +} + +static void tRepairCountTsdbFileBySuffix(const char *fileName, SRepairTsdbScanResult *pResult) { + if (fileName == NULL || fileName[0] == '\0' || pResult == NULL) { + return; + } + + switch (tRepairClassifyTsdbFile(fileName)) { + case REPAIR_TSDB_FILE_KIND_HEAD: + ++pResult->headFiles; + return; + case REPAIR_TSDB_FILE_KIND_DATA: + ++pResult->dataFiles; + return; + case REPAIR_TSDB_FILE_KIND_SMA: + ++pResult->smaFiles; + return; + case REPAIR_TSDB_FILE_KIND_STT: + ++pResult->sttFiles; + return; + case REPAIR_TSDB_FILE_KIND_UNKNOWN: + default: + ++pResult->unknownFiles; + return; + } +} + +static int32_t tRepairScanTsdbDirRecursive(const char *dirPath, SRepairTsdbScanResult *pResult) { + if (dirPath == NULL || dirPath[0] == '\0' || pResult == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(dirPath) || !taosIsDir(dirPath)) { + return TSDB_CODE_INVALID_PARA; + } + + TdDirPtr pDir = taosOpenDir(dirPath); + if (pDir == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + + if (taosDirEntryIsDir(pDirEntry)) { + char entryPath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(dirPath, entryName, entryPath, sizeof(entryPath)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + + code = tRepairScanTsdbDirRecursive(entryPath, pResult); + if (code != TSDB_CODE_SUCCESS) { + break; + } + continue; + } + + tRepairCountTsdbFileBySuffix(entryName, pResult); + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + return code; +} + +static int32_t tRepairCountRegularFilesRecursive(const char *dirPath, int32_t *pFiles) { + if (dirPath == NULL || dirPath[0] == '\0' || pFiles == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(dirPath) || !taosIsDir(dirPath)) { + return TSDB_CODE_INVALID_PARA; + } + + TdDirPtr pDir = taosOpenDir(dirPath); + if (pDir == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + + if (taosDirEntryIsDir(pDirEntry)) { + char entryPath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(dirPath, entryName, entryPath, sizeof(entryPath)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + + code = tRepairCountRegularFilesRecursive(entryPath, pFiles); + if (code != TSDB_CODE_SUCCESS) { + break; + } + continue; + } + + ++(*pFiles); + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + return code; +} + +static void tRepairRecordCorruptedBlockPath(const char *dirPath, SRepairTsdbBlockReport *pReport) { + if (dirPath == NULL || dirPath[0] == '\0' || pReport == NULL || + pReport->reportedCorruptedBlocks >= REPAIR_TSDB_MAX_REPORTED_BLOCKS) { + return; + } + + tstrncpy(pReport->corruptedBlockPaths[pReport->reportedCorruptedBlocks], dirPath, + sizeof(pReport->corruptedBlockPaths[pReport->reportedCorruptedBlocks])); + ++pReport->reportedCorruptedBlocks; +} + +static void tRepairRecordMissingMetaFile(SRepairMetaScanResult *pResult, const char *fileName) { + if (pResult == NULL || fileName == NULL || fileName[0] == '\0') { + return; + } + + if (pResult->missingRequiredFiles < REPAIR_META_MAX_MISSING_FILES) { + tstrncpy(pResult->missingRequiredFileNames[pResult->missingRequiredFiles], fileName, REPAIR_META_FILE_NAME_LEN); + } + ++pResult->missingRequiredFiles; +} + +static int32_t tRepairCreateEmptyFile(const char *filePath) { + if (filePath == NULL || filePath[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + TdFilePtr pFile = taosOpenFile(filePath, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + if (pFile == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + if (taosCloseFile(&pFile) != 0) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairAnalyzeTsdbDirRecursive(const char *dirPath, SRepairTsdbBlockReport *pReport) { + if (dirPath == NULL || dirPath[0] == '\0' || pReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(dirPath) || !taosIsDir(dirPath)) { + return TSDB_CODE_INVALID_PARA; + } + + TdDirPtr pDir = taosOpenDir(dirPath); + if (pDir == NULL) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + int32_t localHeadFiles = 0; + int32_t localDataFiles = 0; + int32_t localKnownFiles = 0; + int32_t localUnknownFiles = 0; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + + if (taosDirEntryIsDir(pDirEntry)) { + char entryPath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(dirPath, entryName, entryPath, sizeof(entryPath)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + + code = tRepairAnalyzeTsdbDirRecursive(entryPath, pReport); + if (code != TSDB_CODE_SUCCESS) { + break; + } + continue; + } + + switch (tRepairClassifyTsdbFile(entryName)) { + case REPAIR_TSDB_FILE_KIND_HEAD: + ++localHeadFiles; + ++localKnownFiles; + break; + case REPAIR_TSDB_FILE_KIND_DATA: + ++localDataFiles; + ++localKnownFiles; + break; + case REPAIR_TSDB_FILE_KIND_SMA: + case REPAIR_TSDB_FILE_KIND_STT: + ++localKnownFiles; + break; + case REPAIR_TSDB_FILE_KIND_UNKNOWN: + default: + ++localUnknownFiles; + break; + } + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + pReport->unknownFiles += localUnknownFiles; + if (localKnownFiles <= 0) { + return TSDB_CODE_SUCCESS; + } + + ++pReport->totalBlocks; + if (localHeadFiles > 0 && localDataFiles > 0) { + ++pReport->recoverableBlocks; + } else { + ++pReport->corruptedBlocks; + tRepairRecordCorruptedBlockPath(dirPath, pReport); + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairCopyDirRecursive(const char *srcDir, const char *dstDir); +static int32_t tRepairResetDir(const char *dirPath); + +static int32_t tRepairBuildPathWithOptionalRelative(const char *basePath, const char *relativePath, char *outPath, + int32_t outPathSize) { + if (basePath == NULL || basePath[0] == '\0' || outPath == NULL || outPathSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (relativePath == NULL || relativePath[0] == '\0') { + return tRepairParseStringOption(basePath, outPath, outPathSize); + } + + int32_t len = tsnprintf(outPath, outPathSize, "%s%s%s", basePath, TD_DIRSEP, relativePath); + if (len <= 0 || len >= outPathSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairCollectTsdbDirLocalStats(const char *dirPath, int32_t *pHeadFiles, int32_t *pDataFiles, + int32_t *pKnownFiles, int32_t *pUnknownFiles) { + if (dirPath == NULL || dirPath[0] == '\0' || pHeadFiles == NULL || pDataFiles == NULL || pKnownFiles == NULL || + pUnknownFiles == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + *pHeadFiles = 0; + *pDataFiles = 0; + *pKnownFiles = 0; + *pUnknownFiles = 0; + + TdDirPtr pDir = taosOpenDir(dirPath); + if (pDir == NULL) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0 || + taosDirEntryIsDir(pDirEntry)) { + continue; + } + + switch (tRepairClassifyTsdbFile(entryName)) { + case REPAIR_TSDB_FILE_KIND_HEAD: + ++(*pHeadFiles); + ++(*pKnownFiles); + break; + case REPAIR_TSDB_FILE_KIND_DATA: + ++(*pDataFiles); + ++(*pKnownFiles); + break; + case REPAIR_TSDB_FILE_KIND_SMA: + case REPAIR_TSDB_FILE_KIND_STT: + ++(*pKnownFiles); + break; + case REPAIR_TSDB_FILE_KIND_UNKNOWN: + default: + ++(*pUnknownFiles); + break; + } + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + return code; +} + +static int32_t tRepairRebuildTsdbDirRecursive(const char *srcDir, const char *dstBaseDir, const char *relativePath, + SRepairTsdbBlockReport *pReport) { + if (srcDir == NULL || srcDir[0] == '\0' || dstBaseDir == NULL || dstBaseDir[0] == '\0' || pReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(srcDir) || !taosIsDir(srcDir)) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t localHeadFiles = 0; + int32_t localDataFiles = 0; + int32_t localKnownFiles = 0; + int32_t localUnknownFiles = 0; + int32_t code = tRepairCollectTsdbDirLocalStats(srcDir, &localHeadFiles, &localDataFiles, &localKnownFiles, + &localUnknownFiles); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + pReport->unknownFiles += localUnknownFiles; + if (localKnownFiles > 0) { + ++pReport->totalBlocks; + if (localHeadFiles > 0 && localDataFiles > 0) { + ++pReport->recoverableBlocks; + + char dstDir[PATH_MAX] = {0}; + code = tRepairBuildPathWithOptionalRelative(dstBaseDir, relativePath, dstDir, sizeof(dstDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = 
tRepairResetDir(dstDir); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + return tRepairCopyDirRecursive(srcDir, dstDir); + } + + ++pReport->corruptedBlocks; + tRepairRecordCorruptedBlockPath(srcDir, pReport); + return TSDB_CODE_SUCCESS; + } + + TdDirPtr pDir = taosOpenDir(srcDir); + if (pDir == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + if (!taosDirEntryIsDir(pDirEntry)) { + continue; + } + + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + + char childSrcDir[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(srcDir, entryName, childSrcDir, sizeof(childSrcDir)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + + char childRelativePath[PATH_MAX] = {0}; + if (relativePath == NULL || relativePath[0] == '\0') { + code = tRepairParseStringOption(entryName, childRelativePath, sizeof(childRelativePath)); + } else { + int32_t len = tsnprintf(childRelativePath, sizeof(childRelativePath), "%s%s%s", relativePath, TD_DIRSEP, + entryName); + code = (len > 0 && len < (int32_t)sizeof(childRelativePath)) ? TSDB_CODE_SUCCESS : TSDB_CODE_INVALID_PARA; + } + if (code != TSDB_CODE_SUCCESS) { + break; + } + + code = tRepairRebuildTsdbDirRecursive(childSrcDir, dstBaseDir, childRelativePath, pReport); + if (code != TSDB_CODE_SUCCESS) { + break; + } + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + return code; +} + +static int32_t tRepairCopyDirRecursive(const char *srcDir, const char *dstDir) { + if (srcDir == NULL || srcDir[0] == '\0' || dstDir == NULL || dstDir[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(srcDir) || !taosIsDir(srcDir)) { + return TSDB_CODE_INVALID_PARA; + } + + if (taosMulMkDir(dstDir) != 0) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + TdDirPtr pDir = taosOpenDir(srcDir); + if (pDir == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + + char srcPath[PATH_MAX] = {0}; + char dstPath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(srcDir, entryName, srcPath, sizeof(srcPath)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + code = tRepairBuildPathWithEntry(dstDir, entryName, dstPath, sizeof(dstPath)); + if (code != TSDB_CODE_SUCCESS) { + break; + } + + if (taosDirEntryIsDir(pDirEntry)) { + code = tRepairCopyDirRecursive(srcPath, dstPath); + } else { + int64_t copied = taosCopyFile(srcPath, dstPath); + if (copied < 0) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + } + + if (code != TSDB_CODE_SUCCESS) { + break; + } + } + + if (taosCloseDir(&pDir) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + return code; +} + +static int32_t tRepairResetDir(const char *dirPath) { + if (dirPath == NULL || dirPath[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + if (taosDirExist(dirPath)) { + taosRemoveDir(dirPath); + } + + if (taosMulMkDir(dirPath) != 0) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildVnodeTargetAndBackupPath(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + char *targetPath, int32_t targetPathSize, char *backupDir, + int32_t backupDirSize) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || targetPath == NULL || + targetPathSize <= 0 || backupDir == NULL || backupDirSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, pCtx->fileType, targetPath, targetPathSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + return tRepairBuildBackupDir(pCtx, dataDir, vnodeId, backupDir, backupDirSize); +} + +static int32_t tRepairReadTextFile(const char *filePath, char **ppContent, int64_t *pContentLen) { + if (filePath == NULL || filePath[0] == '\0' || ppContent == NULL || pContentLen == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + *ppContent = NULL; + *pContentLen = 0; + + int64_t fileSize = 0; + if (taosStatFile(filePath, &fileSize, NULL, NULL) != 0 || fileSize <= 0 || fileSize > REPAIR_MAX_STATE_FILE_SIZE) { + return TSDB_CODE_INVALID_PARA; + } + + TdFilePtr pFile = taosOpenFile(filePath, TD_FILE_READ); + if (pFile == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + char *pContent = taosMemoryMalloc(fileSize + 1); + if (pContent == NULL) { + (void)taosCloseFile(&pFile); + return TSDB_CODE_OUT_OF_MEMORY; + } + + int32_t code = TSDB_CODE_SUCCESS; + int64_t nread = taosReadFile(pFile, pContent, fileSize); + if (nread != fileSize) { + code = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + if (taosCloseFile(&pFile) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + if (code != TSDB_CODE_SUCCESS) { + taosMemoryFree(pContent); + return code; + } + + pContent[fileSize] = '\0'; + *ppContent = pContent; + *pContentLen = fileSize; + return TSDB_CODE_SUCCESS; +} + +static bool tRepairMatchOptionalStateField(const SJson *pJson, const char *fieldName, bool hasExpected, + const char *expectedValue) { + if (pJson == NULL || fieldName == NULL || fieldName[0] == '\0') { + return false; + } + + bool hasField = tjsonGetObjectItem(pJson, fieldName) != NULL; + if (!hasExpected) { + return !hasField; + } + if (!hasField || expectedValue == NULL) { + return false; + } + + char value[PATH_MAX] = {0}; + int32_t code = tjsonGetStringValue2(pJson, fieldName, value, sizeof(value)); + return code == TSDB_CODE_SUCCESS && strcmp(value, expectedValue) == 0; +} + +static bool tRepairMatchStateVnodeList(const SRepairCtx *pCtx, const SJson *pJson, int32_t stateTotalVnodes) { + if (pCtx == NULL || pJson == NULL) { + return false; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE) { + return stateTotalVnodes == 0; + } + + if (!pCtx->hasVnodeIdList || pCtx->vnodeIdNum <= 0 || pCtx->vnodeIdNum != stateTotalVnodes) { + return false; + } + + char stateVnodeIdList[PATH_MAX] = {0}; + if (tjsonGetStringValue2(pJson, "vnodeIdList", stateVnodeIdList, sizeof(stateVnodeIdList)) != TSDB_CODE_SUCCESS) { + return false; + } + + SRepairCtx stateCtx = {0}; + stateCtx.enabled = true; + char vnodeIdBuf[PATH_MAX] = {0}; + tstrncpy(vnodeIdBuf, stateVnodeIdList, sizeof(vnodeIdBuf)); + if (tRepairParseVnodeIdList(vnodeIdBuf, &stateCtx) != TSDB_CODE_SUCCESS || stateCtx.vnodeIdNum != pCtx->vnodeIdNum) { + return false; + } + + for (int32_t i = 0; i < pCtx->vnodeIdNum; ++i) { + if (pCtx->vnodeIds[i] != stateCtx.vnodeIds[i]) { + return false; + } + } + + return true; +} + +static bool tRepairStatusCanResume(const 
    char *status) {
  if (status == NULL || status[0] == '\0') {
    return false;
  }

  return taosStrcasecmp(status, "initialized") == 0 || taosStrcasecmp(status, "running") == 0;
}

/* Parses one session's state file and, when every recorded attribute matches
 * the current repair context (ids, codes, vnode list, optional fields),
 * fills pCandidate with the resumable session's paths and progress.
 * Returns true only when a fully matching candidate was built. */
static bool tRepairBuildResumeCandidate(const SRepairCtx *pCtx, const char *sessionDirName, const char *sessionDir,
                                        const char *statePath, SRepairResumeCandidate *pCandidate) {
  if (pCtx == NULL || sessionDirName == NULL || sessionDirName[0] == '\0' || sessionDir == NULL ||
      sessionDir[0] == '\0' || statePath == NULL || statePath[0] == '\0' || pCandidate == NULL) {
    return false;
  }

  char   *stateContent = NULL;
  int64_t stateContentLen = 0;
  if (tRepairReadTextFile(statePath, &stateContent, &stateContentLen) != TSDB_CODE_SUCCESS || stateContentLen <= 0) {
    return false;
  }

  bool   matched = false;
  SJson *pJson = tjsonParse(stateContent);
  taosMemoryFree(stateContent);  /* JSON tree owns its own copy past this point */
  if (pJson == NULL) {
    return false;
  }

  /* single-pass validation; any mismatch breaks out with matched == false */
  do {
    char    sessionId[REPAIR_SESSION_ID_LEN] = {0};
    char    step[REPAIR_RESUME_STEP_LEN] = {0};
    char    status[32] = {0};
    int64_t startTimeMs = 0;
    int32_t nodeTypeCode = 0;
    int32_t fileTypeCode = 0;
    int32_t modeCode = 0;
    int32_t doneVnodes = 0;
    int32_t totalVnodes = 0;

    if (tjsonGetStringValue2(pJson, "sessionId", sessionId, sizeof(sessionId)) != TSDB_CODE_SUCCESS ||
        tjsonGetBigIntValue(pJson, "startTimeMs", &startTimeMs) != TSDB_CODE_SUCCESS ||
        tjsonGetIntValue(pJson, "nodeTypeCode", &nodeTypeCode) != TSDB_CODE_SUCCESS ||
        tjsonGetIntValue(pJson, "fileTypeCode", &fileTypeCode) != TSDB_CODE_SUCCESS ||
        tjsonGetIntValue(pJson, "modeCode", &modeCode) != TSDB_CODE_SUCCESS ||
        tjsonGetStringValue2(pJson, "step", step, sizeof(step)) != TSDB_CODE_SUCCESS ||
        tjsonGetStringValue2(pJson, "status", status, sizeof(status)) != TSDB_CODE_SUCCESS ||
        tjsonGetIntValue(pJson, "doneVnodes", &doneVnodes) != TSDB_CODE_SUCCESS ||
        tjsonGetIntValue(pJson, "totalVnodes", &totalVnodes) != TSDB_CODE_SUCCESS) {
      break;
    }

    if (startTimeMs <= 0 || step[0] == '\0' || doneVnodes < 0 || totalVnodes < 0 || doneVnodes > totalVnodes) {
      break;
    }
    /* the directory name is expected to equal the recorded session id */
    if (strcmp(sessionId, sessionDirName) != 0) {
      break;
    }
    if (!tRepairStatusCanResume(status)) {
      break;
    }
    if (nodeTypeCode != (int32_t)pCtx->nodeType || fileTypeCode != (int32_t)pCtx->fileType ||
        modeCode != (int32_t)pCtx->mode) {
      break;
    }
    if (!tRepairMatchStateVnodeList(pCtx, pJson, totalVnodes)) {
      break;
    }
    if (!tRepairMatchOptionalStateField(pJson, "backupPath", pCtx->hasBackupPath, pCtx->backupPath)) {
      break;
    }
    if (!tRepairMatchOptionalStateField(pJson, "replicaNode", pCtx->hasReplicaNode, pCtx->replicaNode)) {
      break;
    }

    memset(pCandidate, 0, sizeof(*pCandidate));
    pCandidate->found = true;
    pCandidate->startTimeMs = startTimeMs;
    pCandidate->doneVnodes = doneVnodes;
    pCandidate->totalVnodes = totalVnodes;
    if (tRepairParseStringOption(step, pCandidate->step, sizeof(pCandidate->step)) != TSDB_CODE_SUCCESS ||
        tRepairParseStringOption(sessionId, pCandidate->sessionId, sizeof(pCandidate->sessionId)) !=
            TSDB_CODE_SUCCESS ||
        tRepairParseStringOption(sessionDir, pCandidate->sessionDir, sizeof(pCandidate->sessionDir)) !=
            TSDB_CODE_SUCCESS ||
        tRepairBuildSessionFilePath(sessionDir, REPAIR_SESSION_LOG_NAME, pCandidate->logPath,
                                    sizeof(pCandidate->logPath)) != TSDB_CODE_SUCCESS ||
        tRepairBuildSessionFilePath(sessionDir, REPAIR_SESSION_STATE_NAME, pCandidate->statePath,
                                    sizeof(pCandidate->statePath)) != TSDB_CODE_SUCCESS) {
      memset(pCandidate, 0, sizeof(*pCandidate));  /* never return a half-filled candidate */
      break;
    }

    matched = true;
  } while (0);

  tjsonDelete(pJson);
  return matched;
}

/* Writes content to filePath atomically: write + fsync into "<path>.tmp",
 * then rename over the destination. The temp file is removed on any failure. */
static int32_t tRepairWriteFileAtomically(const char *filePath, const char *content, int64_t contentLen) {
  if (filePath == NULL || filePath[0] == '\0' || content == NULL || contentLen < 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  char    tempPath[PATH_MAX] = {0};
  int32_t len = tsnprintf(tempPath, sizeof(tempPath), "%s.tmp", filePath);
  if (len <= 0 || len >=
      (int32_t)sizeof(tempPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  TdFilePtr pFile = taosOpenFile(tempPath, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  if (pFile == NULL) {
    return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
  }

  int32_t code = TSDB_CODE_SUCCESS;
  if (contentLen > 0) {
    int64_t written = taosWriteFile(pFile, content, contentLen);
    if (written != contentLen) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }

  /* fsync before rename so the rename publishes fully durable content */
  if (code == TSDB_CODE_SUCCESS) {
    int32_t syncCode = taosFsyncFile(pFile);
    if (syncCode != 0) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }

  if (taosCloseFile(&pFile) != 0 && code == TSDB_CODE_SUCCESS) {
    code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
  }

  if (code != TSDB_CODE_SUCCESS) {
    (void)taosRemoveFile(tempPath);  /* best-effort cleanup of the temp file */
    return code;
  }

  if (taosRenameFile(tempPath, filePath) != 0) {
    code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    (void)taosRemoveFile(tempPath);
    return code;
  }

  return TSDB_CODE_SUCCESS;
}

/* Serializes the session progress (ids, type/mode names and codes, step,
 * status, vnode counters, optional fields, update timestamp) to JSON and
 * persists it atomically at statePath. */
static int32_t tRepairWriteSessionStateInternal(const SRepairCtx *pCtx, const char *statePath, const char *step,
                                                const char *status, int32_t doneVnodes, int32_t totalVnodes) {
  SJson *pJson = tjsonCreateObject();
  if (pJson == NULL) {
    return terrno != 0 ?
           terrno : TSDB_CODE_INVALID_PARA;
  }

  /* names and numeric codes are both written: names for humans, codes for
   * exact matching on resume */
  int32_t code = TSDB_CODE_SUCCESS;
  if (tjsonAddStringToObject(pJson, "sessionId", pCtx->sessionId) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "startTimeMs", (uint64_t)pCtx->startTimeMs) != TSDB_CODE_SUCCESS ||
      tjsonAddStringToObject(pJson, "nodeType", tRepairGetNodeTypeName(pCtx->nodeType)) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "nodeTypeCode", (uint64_t)pCtx->nodeType) != TSDB_CODE_SUCCESS ||
      tjsonAddStringToObject(pJson, "fileType", tRepairGetFileTypeName(pCtx->fileType)) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "fileTypeCode", (uint64_t)pCtx->fileType) != TSDB_CODE_SUCCESS ||
      tjsonAddStringToObject(pJson, "mode", tRepairGetModeName(pCtx->mode)) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "modeCode", (uint64_t)pCtx->mode) != TSDB_CODE_SUCCESS ||
      tjsonAddStringToObject(pJson, "step", step) != TSDB_CODE_SUCCESS ||
      tjsonAddStringToObject(pJson, "status", status) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "doneVnodes", (uint64_t)doneVnodes) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "totalVnodes", (uint64_t)totalVnodes) != TSDB_CODE_SUCCESS ||
      tjsonAddIntegerToObject(pJson, "updatedAtMs", (uint64_t)taosGetTimestampMs()) != TSDB_CODE_SUCCESS) {
    code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
  }

  /* optional fields are written only when present in the context */
  if (code == TSDB_CODE_SUCCESS && pCtx->hasVnodeIdList) {
    if (tjsonAddStringToObject(pJson, "vnodeIdList", pCtx->vnodeIdList) != TSDB_CODE_SUCCESS) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }
  if (code == TSDB_CODE_SUCCESS && pCtx->hasBackupPath) {
    if (tjsonAddStringToObject(pJson, "backupPath", pCtx->backupPath) != TSDB_CODE_SUCCESS) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }
  if (code == TSDB_CODE_SUCCESS && pCtx->hasReplicaNode) {
    if (tjsonAddStringToObject(pJson, "replicaNode", pCtx->replicaNode) != TSDB_CODE_SUCCESS) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }

  char *serialized = NULL;
  if (code == TSDB_CODE_SUCCESS) {
    serialized = tjsonToString(pJson);
    if (serialized == NULL) {
      code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
    }
  }

  if (code == TSDB_CODE_SUCCESS) {
    code = tRepairWriteFileAtomically(statePath, serialized, strlen(serialized));
  }

  if (serialized != NULL) {
    taosMemoryFree(serialized);
  }
  tjsonDelete(pJson);
  return code;
}

/* Maps a node-type option string to its ERepairNodeType value via kNodeTypeMap. */
int32_t tRepairParseNodeType(const char *pNodeType, ERepairNodeType *pParsedNodeType) {
  if (pParsedNodeType == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t parsed = REPAIR_NODE_TYPE_INVALID;
  int32_t code = tRepairParseOption(pNodeType, kNodeTypeMap, (int32_t)tListLen(kNodeTypeMap), &parsed);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  *pParsedNodeType = (ERepairNodeType)parsed;
  return TSDB_CODE_SUCCESS;
}

/* Maps a file-type option string to its ERepairFileType value via kFileTypeMap. */
int32_t tRepairParseFileType(const char *pFileType, ERepairFileType *pParsedFileType) {
  if (pParsedFileType == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t parsed = REPAIR_FILE_TYPE_INVALID;
  int32_t code = tRepairParseOption(pFileType, kFileTypeMap, (int32_t)tListLen(kFileTypeMap), &parsed);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  *pParsedFileType = (ERepairFileType)parsed;
  return TSDB_CODE_SUCCESS;
}

/* Maps a mode option string to its ERepairMode value via kModeMap. */
int32_t tRepairParseMode(const char *pMode, ERepairMode *pParsedMode) {
  if (pParsedMode == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t parsed = REPAIR_MODE_INVALID;
  int32_t code = tRepairParseOption(pMode, kModeMap, (int32_t)tListLen(kModeMap), &parsed);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  *pParsedMode = (ERepairMode)parsed;
  return TSDB_CODE_SUCCESS;
}

/* Matches argv[*pIndex] against a long option in either "--name value" or
 * "--name=value" form. On a match sets *pMatched and *pOptionValue, advancing
 * *pIndex by one when the value came from the following argument. A non-match
 * returns success with *pMatched == false. */
int32_t tRepairExtractLongOptionValue(int32_t argc, char const *argv[], int32_t *pIndex, const char *optionName,
                                      const char **pOptionValue, bool *pMatched) {
  if (argc <= 0 || argv == NULL || pIndex == NULL || optionName == NULL || pOptionValue
      == NULL || pMatched == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }
  if (*pIndex < 0 || *pIndex >= argc) {
    return TSDB_CODE_INVALID_PARA;
  }

  const char *arg = argv[*pIndex];
  if (arg == NULL || optionName[0] == '\0') {
    return TSDB_CODE_INVALID_PARA;
  }

  *pMatched = false;
  *pOptionValue = NULL;

  int32_t optionNameLen = strlen(optionName);
  /* exact match: value must be the next, non-empty argv entry */
  if (strcmp(arg, optionName) == 0) {
    *pMatched = true;
    if (*pIndex >= argc - 1) {
      return TSDB_CODE_INVALID_PARA;
    }

    const char *nextValue = argv[*pIndex + 1];
    if (nextValue == NULL || nextValue[0] == '\0') {
      return TSDB_CODE_INVALID_PARA;
    }

    *pOptionValue = nextValue;
    ++(*pIndex);  /* consume the value argument too */
    return TSDB_CODE_SUCCESS;
  }

  /* "--name=value" form: value is the non-empty text after '=' */
  if (strncmp(arg, optionName, optionNameLen) == 0 && arg[optionNameLen] == '=') {
    *pMatched = true;
    const char *inlineValue = arg + optionNameLen + 1;
    if (inlineValue[0] == '\0') {
      return TSDB_CODE_INVALID_PARA;
    }

    *pOptionValue = inlineValue;
    return TSDB_CODE_SUCCESS;
  }

  return TSDB_CODE_SUCCESS;
}

/* Dispatches one named CLI option into pCliArgs, validating its value and
 * setting the corresponding has* flag. Unknown names are rejected. */
int32_t tRepairParseCliOption(SRepairCliArgs *pCliArgs, const char *pOptionName, const char *pOptionValue) {
  if (pCliArgs == NULL || pOptionName == NULL || pOptionValue == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (taosStrcasecmp(pOptionName, "node-type") == 0) {
    int32_t code = tRepairParseNodeType(pOptionValue, &pCliArgs->nodeType);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasNodeType = true;
    return TSDB_CODE_SUCCESS;
  }

  if (taosStrcasecmp(pOptionName, "file-type") == 0) {
    int32_t code = tRepairParseFileType(pOptionValue, &pCliArgs->fileType);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasFileType = true;
    return TSDB_CODE_SUCCESS;
  }

  if (taosStrcasecmp(pOptionName, "vnode-id") == 0) {
    int32_t code = tRepairParseStringOption(pOptionValue, pCliArgs->vnodeIdList, PATH_MAX);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasVnodeIdList = true;
    return
        TSDB_CODE_SUCCESS;
  }

  if (taosStrcasecmp(pOptionName, "backup-path") == 0) {
    int32_t code = tRepairParseStringOption(pOptionValue, pCliArgs->backupPath, PATH_MAX);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasBackupPath = true;
    return TSDB_CODE_SUCCESS;
  }

  if (taosStrcasecmp(pOptionName, "mode") == 0) {
    int32_t code = tRepairParseMode(pOptionValue, &pCliArgs->mode);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasMode = true;
    return TSDB_CODE_SUCCESS;
  }

  if (taosStrcasecmp(pOptionName, "replica-node") == 0) {
    int32_t code = tRepairParseStringOption(pOptionValue, pCliArgs->replicaNode, PATH_MAX);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
    pCliArgs->hasReplicaNode = true;
    return TSDB_CODE_SUCCESS;
  }

  /* unrecognized option name */
  return TSDB_CODE_INVALID_PARA;
}

/* Splits a "host:remoteDataDir" replica endpoint into its two parts, after
 * validating the whole endpoint. Both output buffers must be large enough. */
int32_t tRepairParseReplicaNodeEndpoint(const char *endpoint, char *host, int32_t hostSize, char *remoteDataDir,
                                        int32_t remoteDataDirSize) {
  if (endpoint == NULL || host == NULL || hostSize <= 0 || remoteDataDir == NULL || remoteDataDirSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (!tRepairValidateReplicaNodeEndpoint(endpoint)) {
    return TSDB_CODE_INVALID_PARA;
  }

  const char *sep = strchr(endpoint, ':');
  if (sep == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t hostLen = (int32_t)(sep - endpoint);
  if (hostLen <= 0 || hostLen >= hostSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t pathLen = strlen(sep + 1);
  if (pathLen <= 0 || pathLen >= remoteDataDirSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  memcpy(host, endpoint, hostLen);
  host[hostLen] = '\0';
  tstrncpy(remoteDataDir, sep + 1, remoteDataDirSize);
  return TSDB_CODE_SUCCESS;
}

/* Validates the combination of parsed CLI options: node/file/mode are
 * mandatory; vnode-id is required for (and only allowed with) vnode repairs;
 * replica-node is required for (and only allowed with) copy mode. */
int32_t tRepairValidateCliArgs(const SRepairCliArgs *pCliArgs) {
  if (pCliArgs == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (!pCliArgs->hasNodeType || !pCliArgs->hasFileType || !pCliArgs->hasMode) {
    return TSDB_CODE_INVALID_PARA;
  }

  if
      (!tRepairIsFileTypeCompatible(pCliArgs->nodeType, pCliArgs->fileType)) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCliArgs->nodeType == REPAIR_NODE_TYPE_VNODE) {
    if (!pCliArgs->hasVnodeIdList) {
      return TSDB_CODE_INVALID_PARA;
    }
  } else if (pCliArgs->hasVnodeIdList) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCliArgs->mode == REPAIR_MODE_COPY) {
    if (!pCliArgs->hasReplicaNode) {
      return TSDB_CODE_INVALID_PARA;
    }
    if (!tRepairValidateReplicaNodeEndpoint(pCliArgs->replicaNode)) {
      return TSDB_CODE_INVALID_PARA;
    }
  } else if (pCliArgs->hasReplicaNode) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Builds a repair context from validated CLI options: copies options, parses
 * the vnode-id list, and derives the "repair-<startTimeMs>" session id.
 * On any failure the context is wiped so it can never be used half-built. */
int32_t tRepairInitCtx(const SRepairCliArgs *pCliArgs, int64_t startTimeMs, SRepairCtx *pCtx) {
  if (pCliArgs == NULL || pCtx == NULL || startTimeMs <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t code = tRepairValidateCliArgs(pCliArgs);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  memset(pCtx, 0, sizeof(*pCtx));
  pCtx->enabled = true;
  pCtx->startTimeMs = startTimeMs;
  pCtx->nodeType = pCliArgs->nodeType;
  pCtx->fileType = pCliArgs->fileType;
  pCtx->mode = pCliArgs->mode;

  pCtx->hasVnodeIdList = pCliArgs->hasVnodeIdList;
  if (pCliArgs->hasVnodeIdList) {
    tstrncpy(pCtx->vnodeIdList, pCliArgs->vnodeIdList, sizeof(pCtx->vnodeIdList));
    /* parse from a scratch copy; the parser tokenizes its input in place */
    char vnodeIdBuf[PATH_MAX] = {0};
    tstrncpy(vnodeIdBuf, pCtx->vnodeIdList, sizeof(vnodeIdBuf));
    code = tRepairParseVnodeIdList(vnodeIdBuf, pCtx);
    if (code != TSDB_CODE_SUCCESS) {
      memset(pCtx, 0, sizeof(*pCtx));
      return code;
    }
  }

  pCtx->hasBackupPath = pCliArgs->hasBackupPath;
  if (pCliArgs->hasBackupPath) {
    tstrncpy(pCtx->backupPath, pCliArgs->backupPath, sizeof(pCtx->backupPath));
  }

  pCtx->hasReplicaNode = pCliArgs->hasReplicaNode;
  if (pCliArgs->hasReplicaNode) {
    tstrncpy(pCtx->replicaNode, pCliArgs->replicaNode, sizeof(pCtx->replicaNode));
  }

  int32_t len = tsnprintf(pCtx->sessionId, sizeof(pCtx->sessionId), "repair-%" PRId64, startTimeMs);
  if (len <= 0 || len >= (int32_t)sizeof(pCtx->sessionId)) {
    memset(pCtx, 0, sizeof(*pCtx));
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Decides whether vnodeId is selected by the context's vnode-id list; an
 * absent or empty list selects every vnode. */
int32_t tRepairShouldRepairVnode(const SRepairCtx *pCtx, int32_t vnodeId, bool *pShouldRepair) {
  if (pCtx == NULL || pShouldRepair == NULL || vnodeId < 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (!pCtx->hasVnodeIdList || pCtx->vnodeIdNum <= 0) {
    *pShouldRepair = true;
    return TSDB_CODE_SUCCESS;
  }

  for (int32_t i = 0; i < pCtx->vnodeIdNum; ++i) {
    if (pCtx->vnodeIds[i] == vnodeId) {
      *pShouldRepair = true;
      return TSDB_CODE_SUCCESS;
    }
  }

  *pShouldRepair = false;
  return TSDB_CODE_SUCCESS;
}

/* True only for vnode + WAL + force-mode contexts. */
int32_t tRepairNeedRunWalForceRepair(const SRepairCtx *pCtx, bool *pNeedRun) {
  if (pCtx == NULL || !pCtx->enabled || pNeedRun == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  *pNeedRun = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE && pCtx->fileType == REPAIR_FILE_TYPE_WAL &&
              pCtx->mode == REPAIR_MODE_FORCE;
  return TSDB_CODE_SUCCESS;
}

/* True only for vnode + TSDB + force-mode contexts. */
int32_t tRepairNeedRunTsdbForceRepair(const SRepairCtx *pCtx, bool *pNeedRun) {
  if (pCtx == NULL || !pCtx->enabled || pNeedRun == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  *pNeedRun = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE && pCtx->fileType == REPAIR_FILE_TYPE_TSDB &&
              pCtx->mode == REPAIR_MODE_FORCE;
  return TSDB_CODE_SUCCESS;
}

/* True only for vnode + meta + force-mode contexts. */
int32_t tRepairNeedRunMetaForceRepair(const SRepairCtx *pCtx, bool *pNeedRun) {
  if (pCtx == NULL || !pCtx->enabled || pNeedRun == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  *pNeedRun = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE && pCtx->fileType == REPAIR_FILE_TYPE_META &&
              pCtx->mode == REPAIR_MODE_FORCE;
  return TSDB_CODE_SUCCESS;
}

/* True only for vnode contexts running in replica mode. */
int32_t tRepairNeedRunReplicaRepair(const SRepairCtx *pCtx, bool *pNeedRun) {
  if (pCtx == NULL || !pCtx->enabled || pNeedRun == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  *pNeedRun = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE && pCtx->mode
              == REPAIR_MODE_REPLICA;
  return TSDB_CODE_SUCCESS;
}

/* True only for vnode contexts running in copy mode. */
int32_t tRepairNeedRunCopyRepair(const SRepairCtx *pCtx, bool *pNeedRun) {
  if (pCtx == NULL || !pCtx->enabled || pNeedRun == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  *pNeedRun = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE && pCtx->mode == REPAIR_MODE_COPY;
  return TSDB_CODE_SUCCESS;
}

/* Builds the non-interactive ssh command that probes whether the remote
 * target directory exists. Host and path are whitelist-validated first so
 * they cannot inject shell metacharacters into the command line. */
int32_t tRepairBuildCopySshProbeCmd(const char *replicaHost, const char *remoteTargetPath, char *cmd, int32_t cmdSize) {
  if (replicaHost == NULL || replicaHost[0] == '\0' || remoteTargetPath == NULL || remoteTargetPath[0] == '\0' ||
      cmd == NULL || cmdSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }
  if (!tRepairValidateShellHostToken(replicaHost) || !tRepairValidateShellPathToken(remoteTargetPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  const char *sshBin = tRepairResolveCopyBin(REPAIR_COPY_SSH_BIN_ENV, REPAIR_COPY_DEFAULT_SSH);
  if (sshBin == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t len = tsnprintf(cmd, cmdSize, "%s -o BatchMode=yes -o StrictHostKeyChecking=no %s \"test -d '%s'\"",
                          sshBin, replicaHost, remoteTargetPath);
  if (len <= 0 || len >= cmdSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Builds the scp command that recursively copies the remote target directory
 * contents ("<path>/.") into the local target directory. All tokens are
 * whitelist-validated before being embedded. */
int32_t tRepairBuildCopyScpCmd(const char *replicaHost, const char *remoteTargetPath, const char *localTargetPath,
                               char *cmd, int32_t cmdSize) {
  if (replicaHost == NULL || replicaHost[0] == '\0' || remoteTargetPath == NULL || remoteTargetPath[0] == '\0' ||
      localTargetPath == NULL || localTargetPath[0] == '\0' || cmd == NULL || cmdSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }
  if (!tRepairValidateShellHostToken(replicaHost) || !tRepairValidateShellPathToken(remoteTargetPath) ||
      !tRepairValidateShellPathToken(localTargetPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  const char *scpBin = tRepairResolveCopyBin(REPAIR_COPY_SCP_BIN_ENV, REPAIR_COPY_DEFAULT_SCP);
  if (scpBin == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t len = tsnprintf(
cmd, cmdSize, "%s -q -o BatchMode=yes -o StrictHostKeyChecking=no -r %s:'%s/.' '%s'", scpBin, replicaHost, + remoteTargetPath, localTargetPath); + if (len <= 0 || len >= cmdSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildCopySshStatCmd(const char *replicaHost, const char *remoteTargetPath, char *cmd, + int32_t cmdSize) { + if (replicaHost == NULL || replicaHost[0] == '\0' || remoteTargetPath == NULL || remoteTargetPath[0] == '\0' || + cmd == NULL || cmdSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + if (!tRepairValidateShellHostToken(replicaHost) || !tRepairValidateShellPathToken(remoteTargetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + const char *sshBin = tRepairResolveCopyBin(REPAIR_COPY_SSH_BIN_ENV, REPAIR_COPY_DEFAULT_SSH); + if (sshBin == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = tsnprintf(cmd, cmdSize, "%s -o BatchMode=yes -o StrictHostKeyChecking=no %s \"stat -c '%%u %%g %%a' '%s'\"", + sshBin, replicaHost, remoteTargetPath); + if (len <= 0 || len >= cmdSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairParseCopyOwnershipMeta(const char *metaLine, int32_t *uid, int32_t *gid, int32_t *mode) { + if (metaLine == NULL || metaLine[0] == '\0' || uid == NULL || gid == NULL || mode == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + char metaBuf[128] = {0}; + tstrncpy(metaBuf, metaLine, sizeof(metaBuf)); + + char *savePtr = NULL; + char *uidToken = strtok_r(metaBuf, " \t", &savePtr); + char *gidToken = strtok_r(NULL, " \t", &savePtr); + char *modeToken = strtok_r(NULL, " \t", &savePtr); + char *extraToken = strtok_r(NULL, " \t", &savePtr); + if (uidToken == NULL || gidToken == NULL || modeToken == NULL || extraToken != NULL) { + return TSDB_CODE_INVALID_PARA; + } + + errno = 0; + char *end = NULL; + int32_t parsedUid = taosStr2Int32(uidToken, &end, 10); + if (errno != 0 || end == NULL || *end != '\0' || parsedUid < 0) { + 
return TSDB_CODE_INVALID_PARA; + } + + errno = 0; + end = NULL; + int32_t parsedGid = taosStr2Int32(gidToken, &end, 10); + if (errno != 0 || end == NULL || *end != '\0' || parsedGid < 0) { + return TSDB_CODE_INVALID_PARA; + } + + errno = 0; + end = NULL; + int32_t parsedMode = taosStr2Int32(modeToken, &end, 10); + if (errno != 0 || end == NULL || *end != '\0' || parsedMode < 0 || parsedMode > 7777) { + return TSDB_CODE_INVALID_PARA; + } + + *uid = parsedUid; + *gid = parsedGid; + *mode = parsedMode; + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildCopyFixOwnerPermCmd(int32_t uid, int32_t gid, int32_t mode, const char *localTargetPath, + char *cmd, int32_t cmdSize) { + if (uid < 0 || gid < 0 || mode < 0 || mode > 7777 || localTargetPath == NULL || localTargetPath[0] == '\0' || + cmd == NULL || cmdSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + if (!tRepairValidateShellPathToken(localTargetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = tsnprintf(cmd, cmdSize, "chown -R %d:%d '%s' && chmod %d '%s'", uid, gid, localTargetPath, mode, + localTargetPath); + if (len <= 0 || len >= cmdSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t tRepairBuildCopySshDigestCmd(const char *replicaHost, const char *remoteTargetPath, char *cmd, + int32_t cmdSize) { + if (replicaHost == NULL || replicaHost[0] == '\0' || remoteTargetPath == NULL || remoteTargetPath[0] == '\0' || + cmd == NULL || cmdSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + if (!tRepairValidateShellHostToken(replicaHost) || !tRepairValidateShellPathToken(remoteTargetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + const char *sshBin = tRepairResolveCopyBin(REPAIR_COPY_SSH_BIN_ENV, REPAIR_COPY_DEFAULT_SSH); + if (sshBin == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t len = + tsnprintf(cmd, cmdSize, + "%s -o BatchMode=yes -o StrictHostKeyChecking=no %s " + "\"find '%s' -type f -printf '%%P %%s\\\\n' | LC_ALL=C sort | md5sum | cut 
-d ' ' -f1\"",
                sshBin, replicaHost, remoteTargetPath);
  if (len <= 0 || len >= cmdSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Builds the local counterpart of the digest command: same find/sort/md5sum
 * pipeline run against the local target directory. */
static int32_t tRepairBuildCopyLocalDigestCmd(const char *localTargetPath, char *cmd, int32_t cmdSize) {
  if (localTargetPath == NULL || localTargetPath[0] == '\0' || cmd == NULL || cmdSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }
  if (!tRepairValidateShellPathToken(localTargetPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t len = tsnprintf(
      cmd, cmdSize, "find '%s' -type f -printf '%%P %%s\\n' | LC_ALL=C sort | md5sum | cut -d ' ' -f1", localTargetPath);
  if (len <= 0 || len >= cmdSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Verifies a finished copy by comparing the remote and local directory
 * digests; TSDB_CODE_FAILED indicates either an empty digest or a mismatch. */
static int32_t tRepairVerifyCopyConsistency(const char *replicaHost, const char *remoteTargetPath,
                                            const char *localTargetPath) {
  if (replicaHost == NULL || replicaHost[0] == '\0' || remoteTargetPath == NULL || remoteTargetPath[0] == '\0' ||
      localTargetPath == NULL || localTargetPath[0] == '\0') {
    return TSDB_CODE_INVALID_PARA;
  }

  char    remoteDigestCmd[PATH_MAX * 8] = {0};
  int32_t code = tRepairBuildCopySshDigestCmd(replicaHost, remoteTargetPath, remoteDigestCmd, sizeof(remoteDigestCmd));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  char localDigestCmd[PATH_MAX * 8] = {0};
  code = tRepairBuildCopyLocalDigestCmd(localTargetPath, localDigestCmd, sizeof(localDigestCmd));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  char remoteDigest[128] = {0};
  code = tRepairRunShellCommand(remoteDigestCmd, remoteDigest, sizeof(remoteDigest));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  char localDigest[128] = {0};
  code = tRepairRunShellCommand(localDigestCmd, localDigest, sizeof(localDigest));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  if (remoteDigest[0] == '\0' || localDigest[0] == '\0') {
    return TSDB_CODE_FAILED;
  }

  return strcmp(remoteDigest, localDigest) == 0 ?
         TSDB_CODE_SUCCESS : TSDB_CODE_FAILED;
}

/* Writes a "replica.degrade.marker.json" marker into the vnode directory so a
 * later startup can see the vnode was deliberately degraded for replica
 * repair. Only valid for enabled vnode + replica-mode contexts that select
 * vnodeId; markerPath receives the marker's full path. */
int32_t tRepairDegradeReplicaVnode(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *markerPath,
                                   int32_t markerPathSize) {
  if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || markerPath == NULL ||
      markerPathSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->mode != REPAIR_MODE_REPLICA) {
    return TSDB_CODE_INVALID_PARA;
  }

  bool    shouldRepair = false;
  int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair);
  if (code != TSDB_CODE_SUCCESS || !shouldRepair) {
    return TSDB_CODE_INVALID_PARA;
  }

  char vnodeDir[PATH_MAX] = {0};
  code = tRepairBuildVnodePath(dataDir, vnodeId, NULL, vnodeDir, sizeof(vnodeDir));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
  if (!taosDirExist(vnodeDir) || !taosIsDir(vnodeDir)) {
    return TSDB_CODE_INVALID_PARA;
  }

  int32_t markerPathLen =
      tsnprintf(markerPath, markerPathSize, "%s%s%s", vnodeDir, TD_DIRSEP, "replica.degrade.marker.json");
  if (markerPathLen <= 0 || markerPathLen >= markerPathSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  char    markerContent[1024] = {0};
  int32_t markerContentLen = tsnprintf(
      markerContent, sizeof(markerContent),
      "{\"action\":\"degrade-local-replica\",\"availability\":\"offline\",\"syncPolicy\":\"full-sync\","
      "\"versionPolicy\":\"reset-local-version\",\"termPolicy\":\"bump-local-term\",\"sessionId\":\"%s\","
      "\"vnodeId\":%d,\"updatedAtMs\":%" PRId64 "}",
      pCtx->sessionId, vnodeId, taosGetTimestampMs());
  if (markerContentLen <= 0 || markerContentLen >= (int32_t)sizeof(markerContent)) {
    return TSDB_CODE_INVALID_PARA;
  }

  return tRepairWriteFileAtomically(markerPath, markerContent, markerContentLen);
}

/* Removes the degrade marker written by tRepairDegradeReplicaVnode; a missing
 * marker counts as success (rollback is idempotent). */
int32_t tRepairRollbackReplicaVnode(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId) {
  if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->mode != REPAIR_MODE_REPLICA) {
    return TSDB_CODE_INVALID_PARA;
  }

  bool    shouldRepair = false;
  int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair);
  if (code != TSDB_CODE_SUCCESS || !shouldRepair) {
    return TSDB_CODE_INVALID_PARA;
  }

  char vnodeDir[PATH_MAX] = {0};
  code = tRepairBuildVnodePath(dataDir, vnodeId, NULL, vnodeDir, sizeof(vnodeDir));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
  if (!taosDirExist(vnodeDir) || !taosIsDir(vnodeDir)) {
    return TSDB_CODE_INVALID_PARA;
  }

  char    markerPath[PATH_MAX] = {0};
  int32_t markerPathLen =
      tsnprintf(markerPath, sizeof(markerPath), "%s%s%s", vnodeDir, TD_DIRSEP, "replica.degrade.marker.json");
  if (markerPathLen <= 0 || markerPathLen >= (int32_t)sizeof(markerPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (!taosCheckExistFile(markerPath)) {
    return TSDB_CODE_SUCCESS;
  }

  if (taosRemoveFile(markerPath) != 0) {
    return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Writes a "replica.restore.hint.json" file into the session directory that
 * records how the degraded vnodes should be restored (message type, restore
 * type, vnode id list). hintPath receives the hint file's full path. */
int32_t tRepairWriteReplicaRestoreHint(const SRepairCtx *pCtx, const char *dataDir, char *hintPath,
                                       int32_t hintPathSize) {
  if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || hintPath == NULL ||
      hintPathSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->mode != REPAIR_MODE_REPLICA) {
    return TSDB_CODE_INVALID_PARA;
  }

  if (pCtx->vnodeIdNum <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  char    sessionDir[PATH_MAX] = {0};
  int32_t code = tRepairBuildSessionDir(pCtx, dataDir, sessionDir, sizeof(sessionDir));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  if (taosMulMkDir(sessionDir) != 0 && !taosDirExist(sessionDir)) {
    return terrno != 0 ?
           terrno : TSDB_CODE_INVALID_PARA;
  }

  int32_t hintPathLen = tsnprintf(hintPath, hintPathSize, "%s%s%s", sessionDir, TD_DIRSEP, "replica.restore.hint.json");
  if (hintPathLen <= 0 || hintPathLen >= hintPathSize) {
    return TSDB_CODE_INVALID_PARA;
  }

  /* prefer the raw CLI list; otherwise rebuild a comma-separated list from
   * the parsed vnode ids */
  char vnodeIds[PATH_MAX] = {0};
  if (pCtx->hasVnodeIdList && pCtx->vnodeIdList[0] != '\0') {
    tstrncpy(vnodeIds, pCtx->vnodeIdList, sizeof(vnodeIds));
  } else {
    int32_t offset = 0;
    for (int32_t i = 0; i < pCtx->vnodeIdNum; ++i) {
      int32_t left = (int32_t)sizeof(vnodeIds) - offset;
      if (left <= 0) {
        return TSDB_CODE_INVALID_PARA;
      }
      int32_t append = tsnprintf(vnodeIds + offset, left, i == 0 ? "%d" : ",%d", pCtx->vnodeIds[i]);
      if (append <= 0 || append >= left) {
        return TSDB_CODE_INVALID_PARA;
      }
      offset += append;
    }
  }

  char    hintContent[2048] = {0};
  int32_t hintContentLen = tsnprintf(
      hintContent, sizeof(hintContent),
      "{\"mnodeMsgType\":\"TDMT_MND_RESTORE_DNODE\",\"restoreType\":\"RESTORE_TYPE__VNODE\","
      "\"vgroupAction\":\"mndBuildRestoreAlterVgroupAction\","
      "\"restoreSqlHint\":\"RESTORE VNODE ON DNODE \",\"sessionId\":\"%s\",\"vnodeIds\":\"%s\","
      "\"updatedAtMs\":%" PRId64 "}",
      pCtx->sessionId, vnodeIds, taosGetTimestampMs());
  if (hintContentLen <= 0 || hintContentLen >= (int32_t)sizeof(hintContent)) {
    return TSDB_CODE_INVALID_PARA;
  }

  return tRepairWriteFileAtomically(hintPath, hintContent, hintContentLen);
}

/* Maps a file type to its vnode sub-directory and builds the repair target
 * path for that vnode. */
int32_t tRepairBuildVnodeTargetPath(const char *dataDir, int32_t vnodeId, ERepairFileType fileType, char *targetPath,
                                    int32_t targetPathSize) {
  const char *subDir = tRepairGetVnodeFileSubDir(fileType);
  if (subDir == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  return tRepairBuildVnodePath(dataDir, vnodeId, subDir, targetPath, targetPathSize);
}

/* Scans a vnode's TSDB directory recursively; succeeds only when at least one
 * head file and one data file were found. */
int32_t tRepairScanTsdbFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId,
                             SRepairTsdbScanResult *pResult) {
  if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] ==
      '\0' || vnodeId < 0 || pResult == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  memset(pResult, 0, sizeof(*pResult));

  if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_TSDB) {
    return TSDB_CODE_INVALID_PARA;
  }

  bool    shouldRepair = false;
  int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair);
  if (code != TSDB_CODE_SUCCESS || !shouldRepair) {
    return TSDB_CODE_INVALID_PARA;
  }

  char targetPath[PATH_MAX] = {0};
  code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_TSDB, targetPath, sizeof(targetPath));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = tRepairScanTsdbDirRecursive(targetPath, pResult);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  /* a repairable TSDB needs at least one head file and one data file */
  if (pResult->headFiles <= 0 || pResult->dataFiles <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Scans a vnode's meta directory: counts the required files (recording any
 * that are missing) and the optional index files. Succeeds only when every
 * required file is present. */
int32_t tRepairScanMetaFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId,
                             SRepairMetaScanResult *pResult) {
  if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || pResult == NULL) {
    return TSDB_CODE_INVALID_PARA;
  }

  memset(pResult, 0, sizeof(*pResult));
  pResult->requiredFiles = (int32_t)tListLen(kMetaRequiredFiles);

  if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_META) {
    return TSDB_CODE_INVALID_PARA;
  }

  bool    shouldRepair = false;
  int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair);
  if (code != TSDB_CODE_SUCCESS || !shouldRepair) {
    return TSDB_CODE_INVALID_PARA;
  }

  char targetPath[PATH_MAX] = {0};
  code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_META, targetPath, sizeof(targetPath));
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
  if (!taosDirExist(targetPath) || !taosIsDir(targetPath)) {
    return TSDB_CODE_INVALID_PARA;
  }

  for (int32_t i = 0; i < (int32_t)tListLen(kMetaRequiredFiles); ++i) {
    char filePath[PATH_MAX] = {0};
    code = tRepairBuildPathWithEntry(targetPath, kMetaRequiredFiles[i], filePath, sizeof(filePath));
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (taosCheckExistFile(filePath)) {
      ++pResult->presentRequiredFiles;
    } else {
      tRepairRecordMissingMetaFile(pResult, kMetaRequiredFiles[i]);
    }
  }

  for (int32_t i = 0; i < (int32_t)tListLen(kMetaOptionalIndexFiles); ++i) {
    char filePath[PATH_MAX] = {0};
    code = tRepairBuildPathWithEntry(targetPath, kMetaOptionalIndexFiles[i], filePath, sizeof(filePath));
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (taosCheckExistFile(filePath)) {
      ++pResult->optionalIndexFiles;
    }
  }

  if (pResult->presentRequiredFiles < pResult->requiredFiles) {
    return TSDB_CODE_INVALID_PARA;
  }

  return TSDB_CODE_SUCCESS;
}

/* Renders the scan result's missing-file list as a comma-separated string:
 * "none" when nothing is missing, "unknown" when names were not recorded. */
int32_t tRepairBuildMetaMissingFileMark(const SRepairMetaScanResult *pResult, char *mark, int32_t markSize) {
  if (pResult == NULL || mark == NULL || markSize <= 0) {
    return TSDB_CODE_INVALID_PARA;
  }

  mark[0] = '\0';
  if (pResult->missingRequiredFiles <= 0) {
    tstrncpy(mark, "none", markSize);
    return TSDB_CODE_SUCCESS;
  }

  int32_t offset = 0;
  for (int32_t i = 0; i < pResult->missingRequiredFiles && i < REPAIR_META_MAX_MISSING_FILES; ++i) {
    if (pResult->missingRequiredFileNames[i][0] == '\0') {
      continue;
    }

    int32_t remain = markSize - offset;
    if (remain <= 0) {
      return TSDB_CODE_INVALID_PARA;
    }

    int32_t written = tsnprintf(mark + offset, remain, "%s%s", offset == 0 ?
"" : ",", + pResult->missingRequiredFileNames[i]); + if (written <= 0 || written >= remain) { + return TSDB_CODE_INVALID_PARA; + } + + offset += written; + } + + if (offset == 0) { + tstrncpy(mark, "unknown", markSize); + } + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairInferMetaFromWalTsdb(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + SRepairMetaInferenceReport *pReport) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || pReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + memset(pReport, 0, sizeof(*pReport)); + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_META) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char walPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_WAL, walPath, sizeof(walPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (taosDirExist(walPath) && taosIsDir(walPath)) { + code = tRepairCountRegularFilesRecursive(walPath, &pReport->walEvidenceFiles); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } + + SRepairCtx tsdbCtx = *pCtx; + tsdbCtx.fileType = REPAIR_FILE_TYPE_TSDB; + SRepairTsdbBlockReport tsdbReport = {0}; + code = tRepairAnalyzeTsdbBlocks(&tsdbCtx, dataDir, vnodeId, &tsdbReport); + if (code == TSDB_CODE_SUCCESS) { + pReport->tsdbRecoverableBlocks = tsdbReport.recoverableBlocks; + } + + if (pReport->walEvidenceFiles > 0) { + ++pReport->inferredRules; + } + if (pReport->tsdbRecoverableBlocks > 0) { + ++pReport->inferredRules; + } + + pReport->recoverable = pReport->inferredRules > 0; + return pReport->recoverable ? 
TSDB_CODE_SUCCESS : TSDB_CODE_INVALID_PARA; +} + +int32_t tRepairRebuildMetaFiles(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, const char *outputDir, + SRepairMetaScanResult *pResult) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || outputDir == NULL || + outputDir[0] == '\0' || pResult == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + memset(pResult, 0, sizeof(*pResult)); + pResult->requiredFiles = (int32_t)tListLen(kMetaRequiredFiles); + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_META) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_META, targetPath, sizeof(targetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (!taosDirExist(targetPath) || !taosIsDir(targetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairResetDir(outputDir); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairCopyDirRecursive(targetPath, outputDir); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + for (int32_t i = 0; i < (int32_t)tListLen(kMetaRequiredFiles); ++i) { + char filePath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(outputDir, kMetaRequiredFiles[i], filePath, sizeof(filePath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (taosCheckExistFile(filePath)) { + ++pResult->presentRequiredFiles; + } else { + code = tRepairCreateEmptyFile(filePath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (taosCheckExistFile(filePath)) { + ++pResult->presentRequiredFiles; + } else { + tRepairRecordMissingMetaFile(pResult, kMetaRequiredFiles[i]); + } + } + } + + for (int32_t i = 0; i < 
(int32_t)tListLen(kMetaOptionalIndexFiles); ++i) { + char filePath[PATH_MAX] = {0}; + code = tRepairBuildPathWithEntry(outputDir, kMetaOptionalIndexFiles[i], filePath, sizeof(filePath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (taosCheckExistFile(filePath)) { + ++pResult->optionalIndexFiles; + } + } + + if (pResult->presentRequiredFiles < pResult->requiredFiles) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairAnalyzeTsdbBlocks(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, + SRepairTsdbBlockReport *pReport) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || pReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + memset(pReport, 0, sizeof(*pReport)); + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_TSDB) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_TSDB, targetPath, sizeof(targetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (!taosDirExist(targetPath) || !taosIsDir(targetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairAnalyzeTsdbDirRecursive(targetPath, pReport); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (pReport->totalBlocks <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairRebuildTsdbBlocks(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, const char *outputDir, + SRepairTsdbBlockReport *pReport) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || outputDir == NULL || + outputDir[0] == '\0' || pReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + 
memset(pReport, 0, sizeof(*pReport)); + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->fileType != REPAIR_FILE_TYPE_TSDB) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(dataDir, vnodeId, REPAIR_FILE_TYPE_TSDB, targetPath, sizeof(targetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (!taosDirExist(targetPath) || !taosIsDir(targetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairResetDir(outputDir); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairRebuildTsdbDirRecursive(targetPath, outputDir, "", pReport); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (pReport->recoverableBlocks <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairPrecheck(const SRepairCtx *pCtx, const char *dataDir, int64_t minDiskAvailBytes) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || minDiskAvailBytes < 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (!taosDirExist(dataDir)) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->hasBackupPath && !taosDirExist(pCtx->backupPath)) { + return TSDB_CODE_INVALID_PARA; + } + + char dataDirBuf[PATH_MAX] = {0}; + int32_t code = tRepairParseStringOption(dataDir, dataDirBuf, sizeof(dataDirBuf)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + SDiskSize diskSize = {0}; + code = taosGetDiskSize(dataDirBuf, &diskSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (minDiskAvailBytes > 0 && diskSize.avail < minDiskAvailBytes) { + return TSDB_CODE_NO_ENOUGH_DISKSPACE; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE) { + return TSDB_CODE_SUCCESS; + } + + if (pCtx->vnodeIdNum <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + 
for (int32_t i = 0; i < pCtx->vnodeIdNum; ++i) { + char vnodeDir[PATH_MAX] = {0}; + code = tRepairBuildVnodePath(dataDir, pCtx->vnodeIds[i], NULL, vnodeDir, sizeof(vnodeDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (!taosDirExist(vnodeDir)) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(dataDir, pCtx->vnodeIds[i], pCtx->fileType, targetPath, sizeof(targetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + if (!taosCheckExistFile(targetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->fileType == REPAIR_FILE_TYPE_TSDB) { + SRepairTsdbScanResult scanResult = {0}; + code = tRepairScanTsdbFiles(pCtx, dataDir, pCtx->vnodeIds[i], &scanResult); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (pCtx->fileType == REPAIR_FILE_TYPE_META) { + SRepairMetaScanResult scanResult = {0}; + code = tRepairScanMetaFiles(pCtx, dataDir, pCtx->vnodeIds[i], &scanResult); + if (code != TSDB_CODE_SUCCESS) { + SRepairMetaInferenceReport inferReport = {0}; + int32_t inferCode = tRepairInferMetaFromWalTsdb(pCtx, dataDir, pCtx->vnodeIds[i], &inferReport); + if (inferCode != TSDB_CODE_SUCCESS) { + return code; + } + } + } + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairPrepareBackupDir(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *backupDir, + int32_t backupDirSize) { + int32_t code = tRepairBuildBackupDir(pCtx, dataDir, vnodeId, backupDir, backupDirSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = taosMulMkDir(backupDir); + if (code != 0) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairBackupVnodeTarget(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId, char *backupDir, + int32_t backupDirSize) { + if (backupDir != NULL && backupDirSize > 0) { + backupDir[0] = '\0'; + } + + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0 || backupDir == NULL || + backupDirSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + int32_t code = tRepairBuildVnodeTargetAndBackupPath(pCtx, dataDir, vnodeId, targetPath, sizeof(targetPath), backupDir, + backupDirSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (!taosDirExist(targetPath) || !taosIsDir(targetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairResetDir(backupDir); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + return tRepairCopyDirRecursive(targetPath, backupDir); +} + +int32_t tRepairRollbackVnodeTarget(const SRepairCtx *pCtx, const char *dataDir, int32_t vnodeId) { + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || vnodeId < 0) { + return TSDB_CODE_INVALID_PARA; + } + + char targetPath[PATH_MAX] = {0}; + char backupDir[PATH_MAX] = {0}; + int32_t code = tRepairBuildVnodeTargetAndBackupPath(pCtx, dataDir, vnodeId, targetPath, sizeof(targetPath), backupDir, + sizeof(backupDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (!taosDirExist(backupDir) || !taosIsDir(backupDir)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairResetDir(targetPath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + return tRepairCopyDirRecursive(backupDir, targetPath); +} + +int32_t tRepairMockCopyReplicaVnodeTarget(const SRepairCtx *pCtx, const char *replicaDataDir, const char *localDataDir, + int32_t vnodeId, char *srcPath, int32_t srcPathSize, char *dstPath, + int32_t dstPathSize) { + if (srcPath != NULL && srcPathSize > 0) { + srcPath[0] = '\0'; + } + 
if (dstPath != NULL && dstPathSize > 0) { + dstPath[0] = '\0'; + } + + if (pCtx == NULL || !pCtx->enabled || replicaDataDir == NULL || replicaDataDir[0] == '\0' || localDataDir == NULL || + localDataDir[0] == '\0' || vnodeId < 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->mode != REPAIR_MODE_COPY) { + return TSDB_CODE_INVALID_PARA; + } + + if ((srcPath != NULL && srcPathSize <= 0) || (srcPath == NULL && srcPathSize > 0) || (dstPath != NULL && dstPathSize <= 0) || + (dstPath == NULL && dstPathSize > 0)) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char replicaTargetPath[PATH_MAX] = {0}; + char localTargetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(replicaDataDir, vnodeId, pCtx->fileType, replicaTargetPath, sizeof(replicaTargetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + code = tRepairBuildVnodeTargetPath(localDataDir, vnodeId, pCtx->fileType, localTargetPath, sizeof(localTargetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (!taosDirExist(replicaTargetPath) || !taosIsDir(replicaTargetPath)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairResetDir(localTargetPath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairCopyDirRecursive(replicaTargetPath, localTargetPath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (srcPath != NULL) { + tstrncpy(srcPath, replicaTargetPath, srcPathSize); + } + if (dstPath != NULL) { + tstrncpy(dstPath, localTargetPath, dstPathSize); + } + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairSshScpCopyReplicaVnodeTarget(const SRepairCtx *pCtx, const char *replicaHost, const char *replicaDataDir, + const char *localDataDir, int32_t vnodeId, char *srcPath, int32_t srcPathSize, + char *dstPath, int32_t 
dstPathSize) { + if (srcPath != NULL && srcPathSize > 0) { + srcPath[0] = '\0'; + } + if (dstPath != NULL && dstPathSize > 0) { + dstPath[0] = '\0'; + } + + if (pCtx == NULL || !pCtx->enabled || replicaHost == NULL || replicaHost[0] == '\0' || replicaDataDir == NULL || + replicaDataDir[0] == '\0' || localDataDir == NULL || localDataDir[0] == '\0' || vnodeId < 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (pCtx->nodeType != REPAIR_NODE_TYPE_VNODE || pCtx->mode != REPAIR_MODE_COPY) { + return TSDB_CODE_INVALID_PARA; + } + + if ((srcPath != NULL && srcPathSize <= 0) || (srcPath == NULL && srcPathSize > 0) || (dstPath != NULL && dstPathSize <= 0) || + (dstPath == NULL && dstPathSize > 0)) { + return TSDB_CODE_INVALID_PARA; + } + + bool shouldRepair = false; + int32_t code = tRepairShouldRepairVnode(pCtx, vnodeId, &shouldRepair); + if (code != TSDB_CODE_SUCCESS || !shouldRepair) { + return TSDB_CODE_INVALID_PARA; + } + + char replicaTargetPath[PATH_MAX] = {0}; + char localTargetPath[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(replicaDataDir, vnodeId, pCtx->fileType, replicaTargetPath, sizeof(replicaTargetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + code = tRepairBuildVnodeTargetPath(localDataDir, vnodeId, pCtx->fileType, localTargetPath, sizeof(localTargetPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char sshProbeCmd[PATH_MAX * 6] = {0}; + code = tRepairBuildCopySshProbeCmd(replicaHost, replicaTargetPath, sshProbeCmd, sizeof(sshProbeCmd)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char lastOutput[PATH_MAX] = {0}; + code = tRepairRunShellCommand(sshProbeCmd, lastOutput, sizeof(lastOutput)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char sshStatCmd[PATH_MAX * 6] = {0}; + code = tRepairBuildCopySshStatCmd(replicaHost, replicaTargetPath, sshStatCmd, sizeof(sshStatCmd)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char remoteOwnershipMeta[128] = {0}; + code = 
tRepairRunShellCommand(sshStatCmd, remoteOwnershipMeta, sizeof(remoteOwnershipMeta)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + int32_t remoteUid = 0; + int32_t remoteGid = 0; + int32_t remoteMode = 0; + code = tRepairParseCopyOwnershipMeta(remoteOwnershipMeta, &remoteUid, &remoteGid, &remoteMode); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairResetDir(localTargetPath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char scpCmd[PATH_MAX * 6] = {0}; + code = tRepairBuildCopyScpCmd(replicaHost, replicaTargetPath, localTargetPath, scpCmd, sizeof(scpCmd)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairRunShellCommand(scpCmd, lastOutput, sizeof(lastOutput)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char fixOwnerPermCmd[PATH_MAX * 6] = {0}; + code = tRepairBuildCopyFixOwnerPermCmd(remoteUid, remoteGid, remoteMode, localTargetPath, fixOwnerPermCmd, + sizeof(fixOwnerPermCmd)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairRunShellCommand(fixOwnerPermCmd, lastOutput, sizeof(lastOutput)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = tRepairVerifyCopyConsistency(replicaHost, replicaTargetPath, localTargetPath); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (srcPath != NULL) { + tstrncpy(srcPath, replicaTargetPath, srcPathSize); + } + if (dstPath != NULL) { + tstrncpy(dstPath, localTargetPath, dstPathSize); + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairPrepareSessionFiles(const SRepairCtx *pCtx, const char *dataDir, char *sessionDir, int32_t sessionDirSize, + char *logPath, int32_t logPathSize, char *statePath, int32_t statePathSize) { + if (sessionDir != NULL && sessionDirSize > 0) { + sessionDir[0] = '\0'; + } + if (logPath != NULL && logPathSize > 0) { + logPath[0] = '\0'; + } + if (statePath != NULL && statePathSize > 0) { + statePath[0] = '\0'; + } + + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || 
dataDir[0] == '\0' || sessionDir == NULL || + sessionDirSize <= 0 || logPath == NULL || logPathSize <= 0 || statePath == NULL || statePathSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t code = tRepairBuildSessionDir(pCtx, dataDir, sessionDir, sessionDirSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (taosMulMkDir(sessionDir) != 0) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + code = tRepairBuildSessionFilePath(sessionDir, REPAIR_SESSION_LOG_NAME, logPath, logPathSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + code = tRepairBuildSessionFilePath(sessionDir, REPAIR_SESSION_STATE_NAME, statePath, statePathSize); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + TdFilePtr pLogFile = taosOpenFile(logPath, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); + if (pLogFile == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + if (taosCloseFile(&pLogFile) != 0) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t totalVnodes = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE ? pCtx->vnodeIdNum : 0; + if (totalVnodes < 0) { + totalVnodes = 0; + } + + code = tRepairWriteSessionState(pCtx, statePath, "init", "initialized", 0, totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + return tRepairAppendSessionLog(logPath, "repair session initialized"); +} + +int32_t tRepairAppendSessionLog(const char *logPath, const char *message) { + if (logPath == NULL || logPath[0] == '\0' || message == NULL || message[0] == '\0') { + return TSDB_CODE_INVALID_PARA; + } + + TdFilePtr pFile = taosOpenFile(logPath, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); + if (pFile == NULL) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + int32_t code = TSDB_CODE_SUCCESS; + char prefix[64] = {0}; + int32_t prefixLen = tsnprintf(prefix, sizeof(prefix), "[%" PRId64 "] ", taosGetTimestampMs()); + if (prefixLen <= 0 || prefixLen >= (int32_t)sizeof(prefix)) { + code = TSDB_CODE_INVALID_PARA; + } + + if (code == TSDB_CODE_SUCCESS) { + int64_t written = taosWriteFile(pFile, prefix, prefixLen); + if (written != prefixLen) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + } + + int32_t messageLen = strlen(message); + if (code == TSDB_CODE_SUCCESS) { + int64_t written = taosWriteFile(pFile, message, messageLen); + if (written != messageLen) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + } + + if (code == TSDB_CODE_SUCCESS) { + int64_t written = taosWriteFile(pFile, "\n", 1); + if (written != 1) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + } + + if (code == TSDB_CODE_SUCCESS && taosFsyncFile(pFile) != 0) { + code = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + if (taosCloseFile(&pFile) != 0 && code == TSDB_CODE_SUCCESS) { + code = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + return code; +} + +int32_t tRepairWriteSessionState(const SRepairCtx *pCtx, const char *statePath, const char *step, const char *status, + int32_t doneVnodes, int32_t totalVnodes) { + if (pCtx == NULL || !pCtx->enabled || statePath == NULL || statePath[0] == '\0' || step == NULL || step[0] == '\0' || + status == NULL || status[0] == '\0' || doneVnodes < 0 || totalVnodes < 0 || doneVnodes > totalVnodes) { + return TSDB_CODE_INVALID_PARA; + } + + return tRepairWriteSessionStateInternal(pCtx, statePath, step, status, doneVnodes, totalVnodes); +} + +int32_t tRepairTryResumeSession(SRepairCtx *pCtx, const char *dataDir, char *sessionDir, int32_t sessionDirSize, + char *logPath, int32_t logPathSize, char *statePath, int32_t statePathSize, + int32_t *pDoneVnodes, int32_t *pTotalVnodes, bool *pResumed, char *resumeStep, + int32_t resumeStepSize) { + if (sessionDir != NULL && sessionDirSize > 0) { + sessionDir[0] = '\0'; + } + if (logPath != NULL && logPathSize > 0) { + logPath[0] = '\0'; + } + if (statePath != NULL && statePathSize > 0) { + statePath[0] = '\0'; + } + if (resumeStep != NULL && resumeStepSize > 0) { + resumeStep[0] = '\0'; + } + + if (pCtx == NULL || !pCtx->enabled || dataDir == NULL || dataDir[0] == '\0' || sessionDir == NULL || + sessionDirSize <= 0 || logPath == NULL || logPathSize <= 0 || statePath == NULL || statePathSize <= 0 || + pDoneVnodes == NULL || pTotalVnodes == NULL || pResumed == NULL || + (resumeStep != NULL && resumeStepSize <= 0) || (resumeStep == NULL && resumeStepSize > 0)) { + return TSDB_CODE_INVALID_PARA; + } + + *pDoneVnodes = 0; + *pTotalVnodes = pCtx->nodeType == REPAIR_NODE_TYPE_VNODE ? 
pCtx->vnodeIdNum : 0; + *pResumed = false; + + char backupBaseDir[PATH_MAX] = {0}; + int32_t code = tRepairBuildBackupBaseDir(pCtx, dataDir, backupBaseDir, sizeof(backupBaseDir)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (!taosDirExist(backupBaseDir)) { + return TSDB_CODE_SUCCESS; + } + + TdDirPtr pDir = taosOpenDir(backupBaseDir); + if (pDir == NULL) { + return terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + } + + SRepairResumeCandidate bestCandidate = {0}; + TdDirEntryPtr pDirEntry = NULL; + while ((pDirEntry = taosReadDir(pDir)) != NULL) { + if (!taosDirEntryIsDir(pDirEntry)) { + continue; + } + + char *entryName = taosGetDirEntryName(pDirEntry); + if (entryName == NULL || strcmp(entryName, ".") == 0 || strcmp(entryName, "..") == 0) { + continue; + } + if (strncmp(entryName, REPAIR_SESSION_DIR_PREFIX, strlen(REPAIR_SESSION_DIR_PREFIX)) != 0) { + continue; + } + + char sessionPath[PATH_MAX] = {0}; + int32_t pathLen = tsnprintf(sessionPath, sizeof(sessionPath), "%s%s%s", backupBaseDir, TD_DIRSEP, entryName); + if (pathLen <= 0 || pathLen >= (int32_t)sizeof(sessionPath)) { + continue; + } + + char stateFilePath[PATH_MAX] = {0}; + if (tRepairBuildSessionFilePath(sessionPath, REPAIR_SESSION_STATE_NAME, stateFilePath, sizeof(stateFilePath)) != + TSDB_CODE_SUCCESS) { + continue; + } + if (!taosCheckExistFile(stateFilePath)) { + continue; + } + + SRepairResumeCandidate candidate = {0}; + if (!tRepairBuildResumeCandidate(pCtx, entryName, sessionPath, stateFilePath, &candidate)) { + continue; + } + + if (!bestCandidate.found || candidate.startTimeMs > bestCandidate.startTimeMs) { + bestCandidate = candidate; + } + } + + if (taosCloseDir(&pDir) != 0) { + return terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + } + + if (!bestCandidate.found) { + return TSDB_CODE_SUCCESS; + } + + if (tRepairParseStringOption(bestCandidate.sessionId, pCtx->sessionId, sizeof(pCtx->sessionId)) != + TSDB_CODE_SUCCESS) { + return TSDB_CODE_INVALID_PARA; + } + pCtx->startTimeMs = bestCandidate.startTimeMs; + + tstrncpy(sessionDir, bestCandidate.sessionDir, sessionDirSize); + tstrncpy(logPath, bestCandidate.logPath, logPathSize); + tstrncpy(statePath, bestCandidate.statePath, statePathSize); + if (resumeStep != NULL) { + tstrncpy(resumeStep, bestCandidate.step, resumeStepSize); + } + *pDoneVnodes = bestCandidate.doneVnodes; + *pTotalVnodes = bestCandidate.totalVnodes; + *pResumed = true; + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairResolveResumePlan(ERepairNodeType nodeType, const char *resumeStep, int32_t doneVnodes, + int32_t vnodeIdNum, SRepairResumePlan *pPlan) { + if (resumeStep == NULL || pPlan == NULL || vnodeIdNum < 0 || doneVnodes < 0 || doneVnodes > vnodeIdNum) { + return TSDB_CODE_INVALID_PARA; + } + + memset(pPlan, 0, sizeof(*pPlan)); + if (nodeType != REPAIR_NODE_TYPE_VNODE) { + return TSDB_CODE_SUCCESS; + } + + if (resumeStep[0] == '\0' || taosStrcasecmp(resumeStep, "init") == 0 || taosStrcasecmp(resumeStep, "precheck") == 0 || + taosStrcasecmp(resumeStep, "backup") == 0) { + pPlan->backupStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + + pPlan->skipBackupPreparation = true; + pPlan->resumeAtModeStep = true; + if (taosStrcasecmp(resumeStep, "replica") == 0) { + pPlan->replicaStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + if (taosStrcasecmp(resumeStep, "copy") == 0) { + pPlan->copyStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + if (taosStrcasecmp(resumeStep, "wal") == 0) { + pPlan->walStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + if (taosStrcasecmp(resumeStep, "tsdb") == 0) { + pPlan->tsdbStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + if 
(taosStrcasecmp(resumeStep, "meta") == 0) { + pPlan->metaStartVnodeIndex = doneVnodes; + return TSDB_CODE_SUCCESS; + } + + return TSDB_CODE_INVALID_PARA; +} + +int32_t tRepairNeedReportProgress(int64_t nowMs, int64_t intervalMs, int64_t *pLastReportMs, bool *pNeedReport) { + if (nowMs < 0 || intervalMs <= 0 || pLastReportMs == NULL || pNeedReport == NULL) { + return TSDB_CODE_INVALID_PARA; + } + + if (*pLastReportMs <= 0 || nowMs < *pLastReportMs || (nowMs - *pLastReportMs) >= intervalMs) { + *pNeedReport = true; + *pLastReportMs = nowMs; + return TSDB_CODE_SUCCESS; + } + + *pNeedReport = false; + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairBuildProgressLine(const SRepairCtx *pCtx, const char *step, int32_t doneVnodes, int32_t totalVnodes, + char *line, int32_t lineSize) { + if (pCtx == NULL || !pCtx->enabled || step == NULL || step[0] == '\0' || line == NULL || lineSize <= 0 || + doneVnodes < 0 || totalVnodes < 0) { + return TSDB_CODE_INVALID_PARA; + } + + if (totalVnodes > 0 && doneVnodes > totalVnodes) { + return TSDB_CODE_INVALID_PARA; + } + if (totalVnodes == 0 && doneVnodes != 0) { + return TSDB_CODE_INVALID_PARA; + } + + int32_t progress = totalVnodes > 0 ? 
(doneVnodes * 100) / totalVnodes : 100; + int32_t len = tsnprintf(line, lineSize, "repair progress: session=%s step=%s vnode=%d/%d progress=%d%%", + pCtx->sessionId, step, doneVnodes, totalVnodes, progress); + if (len <= 0 || len >= lineSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} + +int32_t tRepairBuildSummaryLine(const SRepairCtx *pCtx, int32_t successVnodes, int32_t failedVnodes, int64_t elapsedMs, + char *line, int32_t lineSize) { + if (pCtx == NULL || !pCtx->enabled || successVnodes < 0 || failedVnodes < 0 || elapsedMs < 0 || line == NULL || + lineSize <= 0) { + return TSDB_CODE_INVALID_PARA; + } + + const char *status = "success"; + if (failedVnodes > 0 && successVnodes > 0) { + status = "partial"; + } else if (failedVnodes > 0) { + status = "failed"; + } + + int32_t len = + tsnprintf(line, lineSize, "repair summary: session=%s status=%s successVnodes=%d failedVnodes=%d elapsedMs=%" PRId64, + pCtx->sessionId, status, successVnodes, failedVnodes, elapsedMs); + if (len <= 0 || len >= lineSize) { + return TSDB_CODE_INVALID_PARA; + } + + return TSDB_CODE_SUCCESS; +} diff --git a/source/common/test/commonTests.cpp b/source/common/test/commonTests.cpp index c4c74869d755..5ef35c831755 100644 --- a/source/common/test/commonTests.cpp +++ b/source/common/test/commonTests.cpp @@ -1,5 +1,6 @@ #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wwrite-strings" @@ -18,9 +19,116 @@ #include "tvariant.h" #include "tanalytics.h" #include "tglobal.h" +#include "tjson.h" +#include "trepair.h" namespace { -// +std::string buildRepairTempPath(const char *tag) { + static int32_t seq = 0; + return std::string("/tmp/td-repair-") + std::to_string((long long)taosGetTimestampUs()) + "-" + + std::to_string(seq++) + "-" + tag; +} + +class RepairTempDirGuard { + public: + explicit RepairTempDirGuard(const std::string &path) : path_(path) {} + + ~RepairTempDirGuard() { + if (!path_.empty() && taosDirExist(path_.c_str())) 
{ + taosRemoveDir(path_.c_str()); + } + } + + const std::string &path() const { return path_; } + + private: + std::string path_; +}; + +std::string readRepairFileContent(const char *path) { + if (path == nullptr || path[0] == '\0') { + return ""; + } + + int64_t fileSize = 0; + if (taosStatFile(path, &fileSize, nullptr, nullptr) != 0 || fileSize < 0) { + return ""; + } + + TdFilePtr pFile = taosOpenFile(path, TD_FILE_READ); + if (pFile == nullptr) { + return ""; + } + + std::string content((size_t)fileSize, '\0'); + int64_t nread = taosReadFile(pFile, &content[0], fileSize); + (void)taosCloseFile(&pFile); + if (nread < 0) { + return ""; + } + + content.resize((size_t)nread); + return content; +} + +std::string runRepairCommandGetLastLine(const std::string &cmd) { + if (cmd.empty()) { + return ""; + } + + TdCmdPtr pCmd = taosOpenCmd(cmd.c_str()); + if (pCmd == nullptr) { + return ""; + } + + char line[1024] = {0}; + std::string lastLine; + while (true) { + int64_t nread = taosGetsCmd(pCmd, sizeof(line), line); + if (nread <= 0) { + break; + } + + std::string cur(line); + while (!cur.empty() && (cur.back() == '\n' || cur.back() == '\r')) { + cur.pop_back(); + } + if (!cur.empty()) { + lastLine = cur; + } + } + + taosCloseCmd(&pCmd); + return lastLine; +} + +class RepairEnvVarGuard { + public: + explicit RepairEnvVarGuard(const char *key) : key_(key == nullptr ? "" : key) { + const char *val = key_.empty() ? 
nullptr : getenv(key_.c_str()); + if (val != nullptr) { + hasOld_ = true; + oldVal_ = val; + } + } + + ~RepairEnvVarGuard() { + if (key_.empty()) { + return; + } + + if (hasOld_) { + (void)setenv(key_.c_str(), oldVal_.c_str(), 1); + } else { + (void)unsetenv(key_.c_str()); + } + } + + private: + std::string key_; + bool hasOld_ = false; + std::string oldVal_; +}; } // namespace int main(int argc, char** argv) { @@ -1106,4 +1214,2170 @@ TEST(testCase, function_taosTimeTruncate) { ASSERT_LE(res, 1633450000000); } +TEST(RepairOptionParseTest, ParseNodeType) { + ERepairNodeType nodeType = REPAIR_NODE_TYPE_INVALID; + ASSERT_EQ(tRepairParseNodeType("vnode", &nodeType), TSDB_CODE_SUCCESS); + ASSERT_EQ(nodeType, REPAIR_NODE_TYPE_VNODE); + + ASSERT_EQ(tRepairParseNodeType("MNODE", &nodeType), TSDB_CODE_SUCCESS); + ASSERT_EQ(nodeType, REPAIR_NODE_TYPE_MNODE); + + ASSERT_EQ(tRepairParseNodeType("not-a-node", &nodeType), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseNodeType(NULL, &nodeType), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseNodeType("vnode", NULL), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ParseFileType) { + ERepairFileType fileType = REPAIR_FILE_TYPE_INVALID; + ASSERT_EQ(tRepairParseFileType("wal", &fileType), TSDB_CODE_SUCCESS); + ASSERT_EQ(fileType, REPAIR_FILE_TYPE_WAL); + + ASSERT_EQ(tRepairParseFileType("TSDB", &fileType), TSDB_CODE_SUCCESS); + ASSERT_EQ(fileType, REPAIR_FILE_TYPE_TSDB); + + ASSERT_EQ(tRepairParseFileType("meta", &fileType), TSDB_CODE_SUCCESS); + ASSERT_EQ(fileType, REPAIR_FILE_TYPE_META); + + ASSERT_EQ(tRepairParseFileType("TDB", &fileType), TSDB_CODE_SUCCESS); + ASSERT_EQ(fileType, REPAIR_FILE_TYPE_META); + + ASSERT_EQ(tRepairParseFileType("bad-file-type", &fileType), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseFileType(NULL, &fileType), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseFileType("wal", NULL), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ParseMode) { + ERepairMode mode = 
REPAIR_MODE_INVALID; + ASSERT_EQ(tRepairParseMode("force", &mode), TSDB_CODE_SUCCESS); + ASSERT_EQ(mode, REPAIR_MODE_FORCE); + + ASSERT_EQ(tRepairParseMode("CoPy", &mode), TSDB_CODE_SUCCESS); + ASSERT_EQ(mode, REPAIR_MODE_COPY); + + ASSERT_EQ(tRepairParseMode("unknown-mode", &mode), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseMode(NULL, &mode), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseMode("force", NULL), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ParseCliOption) { + SRepairCliArgs cliArgs = {0}; + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasNodeType); + ASSERT_EQ(cliArgs.nodeType, REPAIR_NODE_TYPE_VNODE); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "WAL"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasFileType); + ASSERT_EQ(cliArgs.fileType, REPAIR_FILE_TYPE_WAL); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasVnodeIdList); + ASSERT_STREQ(cliArgs.vnodeIdList, "2,3"); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", "/tmp/backup"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasBackupPath); + ASSERT_STREQ(cliArgs.backupPath, "/tmp/backup"); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasMode); + ASSERT_EQ(cliArgs.mode, REPAIR_MODE_FORCE); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "192.168.1.24:/root/dataDir"), TSDB_CODE_SUCCESS); + ASSERT_TRUE(cliArgs.hasReplicaNode); + ASSERT_STREQ(cliArgs.replicaNode, "192.168.1.24:/root/dataDir"); +} + +TEST(RepairOptionParseTest, ParseCliOptionInvalid) { + SRepairCliArgs cliArgs = {0}; + + ASSERT_EQ(tRepairParseCliOption(NULL, "node-type", "vnode"), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, NULL, "vnode"), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", NULL), TSDB_CODE_INVALID_PARA); + + 
ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "bad-node"), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "unknown-option", "vnode"), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", ""), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "bad-mode"), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", ""), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", ""), TSDB_CODE_INVALID_PARA); + + std::string tooLongVnodeId(PATH_MAX, '1'); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", tooLongVnodeId.c_str()), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", tooLongVnodeId.c_str()), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", tooLongVnodeId.c_str()), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ExtractLongOptionValue) { + { + const char *argv[] = {"taosd", "--node-type", "vnode"}; + int32_t index = 1; + const char *value = NULL; + bool matched = false; + + ASSERT_EQ(tRepairExtractLongOptionValue(3, argv, &index, "--node-type", &value, &matched), TSDB_CODE_SUCCESS); + ASSERT_TRUE(matched); + ASSERT_EQ(index, 2); + ASSERT_STREQ(value, "vnode"); + } + + { + const char *argv[] = {"taosd", "--node-type=vnode"}; + int32_t index = 1; + const char *value = NULL; + bool matched = false; + + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_SUCCESS); + ASSERT_TRUE(matched); + ASSERT_EQ(index, 1); + ASSERT_STREQ(value, "vnode"); + } + + { + const char *argv[] = {"taosd", "--file-type=wal"}; + int32_t index = 1; + const char *value = NULL; + bool matched = true; + + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_SUCCESS); + ASSERT_FALSE(matched); + ASSERT_EQ(index, 1); + ASSERT_EQ(value, nullptr); + } + + { + const char *argv[] = 
{"taosd", "--node-type"}; + int32_t index = 1; + const char *value = NULL; + bool matched = false; + + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_TRUE(matched); + ASSERT_EQ(index, 1); + ASSERT_EQ(value, nullptr); + } + + { + const char *argv[] = {"taosd", "--node-type="}; + int32_t index = 1; + const char *value = NULL; + bool matched = false; + + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_TRUE(matched); + ASSERT_EQ(index, 1); + ASSERT_EQ(value, nullptr); + } +} + +TEST(RepairOptionParseTest, ExtractLongOptionValueInvalidArgs) { + const char *argv[] = {"taosd", "--node-type=vnode"}; + int32_t index = 1; + const char *value = NULL; + bool matched = false; + + ASSERT_EQ(tRepairExtractLongOptionValue(0, argv, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairExtractLongOptionValue(2, NULL, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, NULL, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, NULL, &value, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", NULL, &matched), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, NULL), TSDB_CODE_INVALID_PARA); + + index = -1; + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); + index = 2; + ASSERT_EQ(tRepairExtractLongOptionValue(2, argv, &index, "--node-type", &value, &matched), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ValidateCliArgsSuccess) { + SRepairCliArgs cliArgs = {0}; + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + 
ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ValidateCliArgsMissingRequired) { + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } +} + +TEST(RepairOptionParseTest, ValidateCliArgsNodeFileMismatch) { + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "config"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "dnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "config"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_SUCCESS); + } +} + 
+TEST(RepairOptionParseTest, ValidateCliArgsReplicaNodeRule) { + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "192.168.1.24:/root/dataDir"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_SUCCESS); + } + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "192.168.1.24:/root/dataDir"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } +} + +TEST(RepairOptionParseTest, ValidateCliArgsReplicaNodeEndpointFormat) { + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "tdnode1:/var/lib/taos"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_SUCCESS); + } + + const char *invalidEndpoints[] = { + "192.168.1.24", + ":/var/lib/taos", + "192.168.1.24:", + "192.168.1.24:var/lib/taos", + 
"192.168.1.24:/var/lib/taos data", + "192.168.1.24:/var/lib/taos:bak", + " tdnode1:/var/lib/taos", + }; + + for (const char *endpoint : invalidEndpoints) { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", endpoint), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } +} + +TEST(RepairOptionParseTest, ValidateCliArgsVnodeIdRule) { + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } + { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "mnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairValidateCliArgs(&cliArgs), TSDB_CODE_INVALID_PARA); + } +} + +TEST(RepairOptionParseTest, InitRepairCtxSuccess) { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", "/tmp/backup"), 
TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + const int64_t startTs = 1735689600123LL; + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, startTs, &ctx), TSDB_CODE_SUCCESS); + + ASSERT_TRUE(ctx.enabled); + ASSERT_EQ(ctx.startTimeMs, startTs); + ASSERT_STREQ(ctx.sessionId, "repair-1735689600123"); + ASSERT_EQ(ctx.nodeType, REPAIR_NODE_TYPE_VNODE); + ASSERT_EQ(ctx.fileType, REPAIR_FILE_TYPE_META); + ASSERT_EQ(ctx.mode, REPAIR_MODE_FORCE); + ASSERT_TRUE(ctx.hasVnodeIdList); + ASSERT_STREQ(ctx.vnodeIdList, "2,3"); + ASSERT_EQ(ctx.vnodeIdNum, 2); + ASSERT_EQ(ctx.vnodeIds[0], 2); + ASSERT_EQ(ctx.vnodeIds[1], 3); + ASSERT_TRUE(ctx.hasBackupPath); + ASSERT_STREQ(ctx.backupPath, "/tmp/backup"); + ASSERT_FALSE(ctx.hasReplicaNode); + + bool shouldRepair = false; + ASSERT_EQ(tRepairShouldRepairVnode(&ctx, 2, &shouldRepair), TSDB_CODE_SUCCESS); + ASSERT_TRUE(shouldRepair); + ASSERT_EQ(tRepairShouldRepairVnode(&ctx, 9, &shouldRepair), TSDB_CODE_SUCCESS); + ASSERT_FALSE(shouldRepair); +} + +TEST(RepairOptionParseTest, InitRepairCtxInvalidArgs) { + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(NULL, 1735689600123LL, &ctx), TSDB_CODE_INVALID_PARA); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689600123LL, NULL), TSDB_CODE_INVALID_PARA); + + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,a"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689600123LL, &ctx), TSDB_CODE_INVALID_PARA); + + memset(&cliArgs, 0, sizeof(cliArgs)); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + 
ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689600123LL, &ctx), TSDB_CODE_INVALID_PARA); + + memset(&cliArgs, 0, sizeof(cliArgs)); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "127.0.0.1:/var/lib/taos"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689600999LL, &ctx), TSDB_CODE_SUCCESS); + ASSERT_FALSE(ctx.hasBackupPath); + ASSERT_STREQ(ctx.backupPath, ""); +} + +TEST(RepairOptionParseTest, PrecheckDataDirNotExist) { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601000LL, &ctx), TSDB_CODE_SUCCESS); + + std::string dataDir = buildRepairTempPath("missing-data"); + ASSERT_FALSE(taosDirExist(dataDir.c_str())); + ASSERT_NE(tRepairPrecheck(&ctx, dataDir.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, PrecheckBackupPathNotExist) { + const std::string dataDirPath = buildRepairTempPath("missing-backup-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string walDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(walDir.c_str()), 0); + + std::string backupDir = buildRepairTempPath("missing-backup-dir"); + 
ASSERT_FALSE(taosDirExist(backupDir.c_str())); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupDir.c_str()), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601001LL, &ctx), TSDB_CODE_SUCCESS); + ASSERT_NE(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, PrecheckDiskSpaceNotEnough) { + const std::string dataDirPath = buildRepairTempPath("disk-space-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string walDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(walDir.c_str()), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601002LL, &ctx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairPrecheck(&ctx, dataDirPath.c_str(), INT64_MAX), TSDB_CODE_NO_ENOUGH_DISKSPACE); +} + +TEST(RepairOptionParseTest, PrecheckTargetPathMissing) { + const std::string dataDirPath = buildRepairTempPath("missing-target-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string vnodeDir = dataDirPath + sep + "vnode" + sep + "vnode2"; + ASSERT_EQ(taosMulMkDir(vnodeDir.c_str()), 0); + + 
SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601003LL, &ctx), TSDB_CODE_SUCCESS); + ASSERT_NE(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, PrecheckSuccess) { + const std::string dataDirPath = buildRepairTempPath("precheck-success-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string backupDirPath = buildRepairTempPath("precheck-success-backup"); + RepairTempDirGuard backupDirGuard(backupDirPath); + const std::string sep(TD_DIRSEP); + + const std::string walDir2 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "wal"; + const std::string walDir3 = dataDirPath + sep + "vnode" + sep + "vnode3" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(walDir2.c_str()), 0); + ASSERT_EQ(taosMulMkDir(walDir3.c_str()), 0); + ASSERT_EQ(taosMulMkDir(backupDirPath.c_str()), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupDirPath.c_str()), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601004LL, &ctx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ScanTsdbFilesAndPrecheckSuccess) { + const std::string dataDirPath = 
buildRepairTempPath("scan-tsdb-success-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string tsdbDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f100"; + ASSERT_EQ(taosMulMkDir(tsdbDir.c_str()), 0); + + auto createEmptyFile = [](const std::string &path) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + + createEmptyFile(tsdbDir + sep + "v2f100ver1.head"); + createEmptyFile(tsdbDir + sep + "v2f100ver1.0.data"); + createEmptyFile(tsdbDir + sep + "v2f100ver1.sma"); + createEmptyFile(tsdbDir + sep + "v2f100ver1.m1.stt"); + createEmptyFile(tsdbDir + sep + "README.txt"); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601005LL, &ctx), TSDB_CODE_SUCCESS); + + SRepairTsdbScanResult scanResult = {0}; + ASSERT_EQ(tRepairScanTsdbFiles(&ctx, dataDirPath.c_str(), 2, &scanResult), TSDB_CODE_SUCCESS); + ASSERT_EQ(scanResult.headFiles, 1); + ASSERT_EQ(scanResult.dataFiles, 1); + ASSERT_EQ(scanResult.smaFiles, 1); + ASSERT_EQ(scanResult.sttFiles, 1); + ASSERT_EQ(scanResult.unknownFiles, 1); + + ASSERT_EQ(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ScanTsdbFilesMissingCriticalFiles) { + const std::string dataDirPath = buildRepairTempPath("scan-tsdb-missing-critical"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string tsdbDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + 
"tsdb" + sep + "f101"; + ASSERT_EQ(taosMulMkDir(tsdbDir.c_str()), 0); + + TdFilePtr pFile = taosOpenFile((tsdbDir + sep + "v2f101ver2.sma").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosCloseFile(&pFile), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601006LL, &ctx), TSDB_CODE_SUCCESS); + + SRepairTsdbScanResult scanResult = {0}; + ASSERT_EQ(tRepairScanTsdbFiles(&ctx, dataDirPath.c_str(), 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_NE(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ScanTsdbFilesInvalidArgs) { + SRepairTsdbScanResult scanResult = {0}; + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairScanTsdbFiles(NULL, "/tmp", 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairScanTsdbFiles(&ctx, "/tmp", 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairScanTsdbFiles(&ctx, "/tmp", 2, NULL), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ScanMetaFilesAndPrecheckSuccess) { + const std::string dataDirPath = buildRepairTempPath("scan-meta-success-data"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta"; + ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0); + + auto createEmptyFile = [](const std::string &path) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + + createEmptyFile(metaDir + sep + 
"table.db"); + createEmptyFile(metaDir + sep + "schema.db"); + createEmptyFile(metaDir + sep + "uid.idx"); + createEmptyFile(metaDir + sep + "name.idx"); + createEmptyFile(metaDir + sep + "tag.idx"); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601011LL, &ctx), TSDB_CODE_SUCCESS); + + SRepairMetaScanResult scanResult = {0}; + ASSERT_EQ(tRepairScanMetaFiles(&ctx, dataDirPath.c_str(), 2, &scanResult), TSDB_CODE_SUCCESS); + ASSERT_EQ(scanResult.requiredFiles, 4); + ASSERT_EQ(scanResult.presentRequiredFiles, 4); + ASSERT_EQ(scanResult.optionalIndexFiles, 1); + ASSERT_EQ(scanResult.missingRequiredFiles, 0); + + ASSERT_EQ(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ScanMetaFilesMissingRequiredFiles) { + const std::string dataDirPath = buildRepairTempPath("scan-meta-missing-required"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta"; + ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0); + + auto createEmptyFile = [](const std::string &path) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + + createEmptyFile(metaDir + sep + "table.db"); + createEmptyFile(metaDir + sep + "schema.db"); + createEmptyFile(metaDir + sep + "tag.idx"); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, 
"file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601012LL, &ctx), TSDB_CODE_SUCCESS); + + SRepairMetaScanResult scanResult = {0}; + ASSERT_EQ(tRepairScanMetaFiles(&ctx, dataDirPath.c_str(), 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(scanResult.requiredFiles, 4); + ASSERT_EQ(scanResult.presentRequiredFiles, 2); + ASSERT_EQ(scanResult.missingRequiredFiles, 2); + + bool missingUidIdx = false; + bool missingNameIdx = false; + for (int32_t i = 0; i < scanResult.missingRequiredFiles; ++i) { + if (strcmp(scanResult.missingRequiredFileNames[i], "uid.idx") == 0) { + missingUidIdx = true; + } + if (strcmp(scanResult.missingRequiredFileNames[i], "name.idx") == 0) { + missingNameIdx = true; + } + } + ASSERT_TRUE(missingUidIdx); + ASSERT_TRUE(missingNameIdx); + + ASSERT_NE(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS); +} + +TEST(RepairOptionParseTest, ScanMetaFilesInvalidArgs) { + SRepairMetaScanResult scanResult = {0}; + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairScanMetaFiles(NULL, "/tmp", 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairScanMetaFiles(&ctx, "/tmp", 2, &scanResult), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairScanMetaFiles(&ctx, "/tmp", 2, NULL), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, BuildMetaMissingFileMark) { + SRepairMetaScanResult scanResult = {0}; + scanResult.missingRequiredFiles = 2; + tstrncpy(scanResult.missingRequiredFileNames[0], "uid.idx", REPAIR_META_FILE_NAME_LEN); + tstrncpy(scanResult.missingRequiredFileNames[1], "name.idx", REPAIR_META_FILE_NAME_LEN); + + char missingMark[128] = {0}; + ASSERT_EQ(tRepairBuildMetaMissingFileMark(&scanResult, missingMark, sizeof(missingMark)), TSDB_CODE_SUCCESS); + ASSERT_STREQ(missingMark, "uid.idx,name.idx"); +} + 
+TEST(RepairOptionParseTest, BuildMetaMissingFileMarkNoneOrInvalidArgs) { + SRepairMetaScanResult scanResult = {0}; + char missingMark[64] = {0}; + + ASSERT_EQ(tRepairBuildMetaMissingFileMark(&scanResult, missingMark, sizeof(missingMark)), TSDB_CODE_SUCCESS); + ASSERT_STREQ(missingMark, "none"); + + ASSERT_EQ(tRepairBuildMetaMissingFileMark(NULL, missingMark, sizeof(missingMark)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildMetaMissingFileMark(&scanResult, NULL, sizeof(missingMark)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildMetaMissingFileMark(&scanResult, missingMark, 0), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, InferMetaFromWalTsdbByWalEvidence) { + const std::string dataDirPath = buildRepairTempPath("infer-meta-wal-evidence"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta"; + const std::string walDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(walDir.c_str()), 0); + + TdFilePtr pWalFile = taosOpenFile((walDir + sep + "000001.log").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pWalFile, nullptr); + ASSERT_EQ(taosCloseFile(&pWalFile), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601013LL, &ctx), TSDB_CODE_SUCCESS); + + SRepairMetaInferenceReport report = {0}; + ASSERT_EQ(tRepairInferMetaFromWalTsdb(&ctx, dataDirPath.c_str(), 2, &report), TSDB_CODE_SUCCESS); + ASSERT_GT(report.walEvidenceFiles, 0); + 
// NOTE(review): this region was extracted with newlines collapsed and diff '+'
// prefixes fused into the text; it has been re-laid-out here with the original
// tokens unchanged. The first four lines are the tail of a test case whose
// opening TEST(...) line lies before this chunk.
  ASSERT_EQ(report.tsdbRecoverableBlocks, 0);
  ASSERT_TRUE(report.recoverable);
  ASSERT_EQ(report.inferredRules, 1);
}

// Meta inference succeeds when the only evidence is tsdb block files (no WAL):
// walEvidenceFiles stays 0 while tsdbRecoverableBlocks is positive.
TEST(RepairOptionParseTest, InferMetaFromWalTsdbByTsdbEvidence) {
  const std::string dataDirPath = buildRepairTempPath("infer-meta-tsdb-evidence");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta";
  const std::string tsdbDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f100";
  ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(tsdbDir.c_str()), 0);

  // A .head/.data pair makes block f100 count as tsdb evidence.
  TdFilePtr pHeadFile =
      taosOpenFile((tsdbDir + sep + "v2f100ver1.head").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pHeadFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pHeadFile), 0);
  TdFilePtr pDataFile =
      taosOpenFile((tsdbDir + sep + "v2f100ver1.0.data").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pDataFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pDataFile), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601014LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairMetaInferenceReport report = {0};
  ASSERT_EQ(tRepairInferMetaFromWalTsdb(&ctx, dataDirPath.c_str(), 2, &report), TSDB_CODE_SUCCESS);
  ASSERT_EQ(report.walEvidenceFiles, 0);
  ASSERT_GT(report.tsdbRecoverableBlocks, 0);
  ASSERT_TRUE(report.recoverable);
  ASSERT_EQ(report.inferredRules, 1);
}

// With neither WAL nor tsdb evidence present, inference is rejected with
// TSDB_CODE_INVALID_PARA and the report stays empty.
TEST(RepairOptionParseTest, InferMetaFromWalTsdbNoEvidence) {
  const std::string dataDirPath = buildRepairTempPath("infer-meta-no-evidence");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta";
  ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601015LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairMetaInferenceReport report = {0};
  ASSERT_EQ(tRepairInferMetaFromWalTsdb(&ctx, dataDirPath.c_str(), 2, &report), TSDB_CODE_INVALID_PARA);
  ASSERT_FALSE(report.recoverable);
  ASSERT_EQ(report.inferredRules, 0);
}

// Precheck for the meta file-type passes when a meta db file and a WAL log
// file exist for the target vnode.
TEST(RepairOptionParseTest, PrecheckMetaFallbackToInferenceSuccess) {
  const std::string dataDirPath = buildRepairTempPath("precheck-meta-fallback");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta";
  const std::string walDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "wal";
  ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(walDir.c_str()), 0);

  TdFilePtr pMetaFile = taosOpenFile((metaDir + sep + "table.db").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pMetaFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pMetaFile), 0);
  TdFilePtr pWalFile = taosOpenFile((walDir + sep + "000001.log").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pWalFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pWalFile), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601016LL, &ctx), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairPrecheck(&ctx, dataDirPath.c_str(), 0), TSDB_CODE_SUCCESS);
}

// Rebuild creates the missing required meta files in the output directory:
// with only table.db + tag.idx present, all 4 required files must exist
// afterwards, and tag.idx is counted as an optional index file.
TEST(RepairOptionParseTest, RebuildMetaFilesCreateMissingRequired) {
  const std::string dataDirPath = buildRepairTempPath("rebuild-meta-files");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string metaDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta";
  const std::string outDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "meta.rebuild";
  ASSERT_EQ(taosMulMkDir(metaDir.c_str()), 0);

  // Helper: create an empty file at the given path.
  auto createEmptyFile = [](const std::string &path) {
    TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
    ASSERT_NE(pFile, nullptr);
    ASSERT_EQ(taosCloseFile(&pFile), 0);
  };

  createEmptyFile(metaDir + sep + "table.db");
  createEmptyFile(metaDir + sep + "tag.idx");

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601017LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairMetaScanResult rebuildResult = {0};
  ASSERT_EQ(tRepairRebuildMetaFiles(&ctx, dataDirPath.c_str(), 2, outDir.c_str(), &rebuildResult), TSDB_CODE_SUCCESS);
  ASSERT_EQ(rebuildResult.requiredFiles, 4);
  ASSERT_EQ(rebuildResult.presentRequiredFiles, 4);
  ASSERT_EQ(rebuildResult.missingRequiredFiles, 0);
  ASSERT_EQ(rebuildResult.optionalIndexFiles, 1);

  ASSERT_TRUE(taosCheckExistFile((outDir + sep + "table.db").c_str()));
  ASSERT_TRUE(taosCheckExistFile((outDir + sep + "schema.db").c_str()));
  ASSERT_TRUE(taosCheckExistFile((outDir + sep + "uid.idx").c_str()));
  ASSERT_TRUE(taosCheckExistFile((outDir + sep + "name.idx").c_str()));
}

// Null/uninitialized arguments to tRepairRebuildMetaFiles are all rejected
// with TSDB_CODE_INVALID_PARA (including an un-initialized ctx).
TEST(RepairOptionParseTest, RebuildMetaFilesInvalidArgs) {
  SRepairCtx ctx = {0};
  SRepairMetaScanResult result = {0};

  ASSERT_EQ(tRepairRebuildMetaFiles(NULL, "/tmp", 2, "/tmp/meta.rebuild", &result), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairRebuildMetaFiles(&ctx, "/tmp", 2, "/tmp/meta.rebuild", &result), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairRebuildMetaFiles(&ctx, "/tmp", 2, NULL, &result), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairRebuildMetaFiles(&ctx, "/tmp", 2, "/tmp/meta.rebuild", NULL), TSDB_CODE_INVALID_PARA);
}

// Block analysis over a mixed layout: f100 (head+data+sma) is recoverable,
// f101 (data only) and f102 (head only) are corrupted, and notes.txt is
// counted as an unknown file. Both corrupted blocks must be reported.
TEST(RepairOptionParseTest, AnalyzeTsdbBlocksReportMixedCorruption) {
  const std::string dataDirPath = buildRepairTempPath("analyze-tsdb-blocks-mixed");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string blockDir100 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f100";
  const std::string blockDir101 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f101";
  const std::string blockDir102 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f102";
  ASSERT_EQ(taosMulMkDir(blockDir100.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(blockDir101.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(blockDir102.c_str()), 0);

  // Helper: create an empty file at the given path.
  auto createEmptyFile = [](const std::string &path) {
    TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
    ASSERT_NE(pFile, nullptr);
    ASSERT_EQ(taosCloseFile(&pFile), 0);
  };

  createEmptyFile(blockDir100 + sep + "v2f100ver1.head");
  createEmptyFile(blockDir100 + sep + "v2f100ver1.0.data");
  createEmptyFile(blockDir100 + sep + "v2f100ver1.sma");
  createEmptyFile(blockDir101 + sep + "v2f101ver1.0.data");
  createEmptyFile(blockDir102 + sep + "v2f102ver1.head");
  createEmptyFile(blockDir102 + sep + "notes.txt");

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601007LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairTsdbBlockReport report = {0};
  ASSERT_EQ(tRepairAnalyzeTsdbBlocks(&ctx, dataDirPath.c_str(), 2, &report), TSDB_CODE_SUCCESS);
  ASSERT_EQ(report.totalBlocks, 3);
  ASSERT_EQ(report.recoverableBlocks, 1);
  ASSERT_EQ(report.corruptedBlocks, 2);
  ASSERT_EQ(report.unknownFiles, 1);
  ASSERT_EQ(report.reportedCorruptedBlocks, 2);

  // Both corrupted block directories (f101 and f102) must appear in the
  // reported path list, in any order.
  bool hasF101 = false;
  bool hasF102 = false;
  for (int32_t i = 0; i < report.reportedCorruptedBlocks; ++i) {
    if (strstr(report.corruptedBlockPaths[i], "f101") != nullptr) {
      hasF101 = true;
    }
    if (strstr(report.corruptedBlockPaths[i], "f102") != nullptr) {
      hasF102 = true;
    }
  }
  ASSERT_TRUE(hasF101);
  ASSERT_TRUE(hasF102);
}

// A tsdb directory containing only an unrecognized file yields no analyzable
// blocks, so analysis fails with TSDB_CODE_INVALID_PARA.
TEST(RepairOptionParseTest, AnalyzeTsdbBlocksReportNoRecognizedBlocks) {
  const std::string dataDirPath = buildRepairTempPath("analyze-tsdb-blocks-empty");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string blockDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f200";
  ASSERT_EQ(taosMulMkDir(blockDir.c_str()), 0);

  TdFilePtr pFile = taosOpenFile((blockDir + sep + "README.md").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pFile), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601008LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairTsdbBlockReport report = {0};
  ASSERT_EQ(tRepairAnalyzeTsdbBlocks(&ctx, dataDirPath.c_str(), 2, &report), TSDB_CODE_INVALID_PARA);
}

// Null/uninitialized arguments to tRepairAnalyzeTsdbBlocks are rejected.
TEST(RepairOptionParseTest, AnalyzeTsdbBlocksReportInvalidArgs) {
  SRepairTsdbBlockReport report = {0};
  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairAnalyzeTsdbBlocks(NULL, "/tmp", 2, &report), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairAnalyzeTsdbBlocks(&ctx, "/tmp", 2, &report), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairAnalyzeTsdbBlocks(&ctx, "/tmp", 2, NULL), TSDB_CODE_INVALID_PARA);
}

// Rebuild copies only recoverable block dirs (f100, f102 with head+data) to
// the output; the corrupted f101 (data only) must NOT be copied.
TEST(RepairOptionParseTest, RebuildTsdbBlocksKeepsRecoverableDirs) {
  const std::string dataDirPath = buildRepairTempPath("rebuild-tsdb-data");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string outputDirPath = buildRepairTempPath("rebuild-tsdb-output");
  RepairTempDirGuard outputDirGuard(outputDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string blockDir100 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f100";
  const std::string blockDir101 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f101";
  const std::string blockDir102 = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f102";
  ASSERT_EQ(taosMulMkDir(blockDir100.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(blockDir101.c_str()), 0);
  ASSERT_EQ(taosMulMkDir(blockDir102.c_str()), 0);

  // Helper: create an empty file at the given path.
  auto createEmptyFile = [](const std::string &path) {
    TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
    ASSERT_NE(pFile, nullptr);
    ASSERT_EQ(taosCloseFile(&pFile), 0);
  };

  createEmptyFile(blockDir100 + sep + "v2f100ver1.head");
  createEmptyFile(blockDir100 + sep + "v2f100ver1.0.data");
  createEmptyFile(blockDir101 + sep + "v2f101ver1.0.data");
  createEmptyFile(blockDir102 + sep + "v2f102ver1.head");
  createEmptyFile(blockDir102 + sep + "v2f102ver1.0.data");

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601009LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairTsdbBlockReport report = {0};
  ASSERT_EQ(tRepairRebuildTsdbBlocks(&ctx, dataDirPath.c_str(), 2, outputDirPath.c_str(), &report), TSDB_CODE_SUCCESS);
  ASSERT_EQ(report.totalBlocks, 3);
  ASSERT_EQ(report.recoverableBlocks, 2);
  ASSERT_EQ(report.corruptedBlocks, 1);

  ASSERT_TRUE(taosDirExist((outputDirPath + sep + "f100").c_str()));
  ASSERT_FALSE(taosDirExist((outputDirPath + sep + "f101").c_str()));
  ASSERT_TRUE(taosDirExist((outputDirPath + sep + "f102").c_str()));
  ASSERT_TRUE(taosCheckExistFile((outputDirPath + sep + "f100" + sep + "v2f100ver1.head").c_str()));
  ASSERT_TRUE(taosCheckExistFile((outputDirPath + sep + "f102" + sep + "v2f102ver1.0.data").c_str()));
}

// Rebuild fails with TSDB_CODE_INVALID_PARA when no block is recoverable
// (f300 has a .data file but no .head).
TEST(RepairOptionParseTest, RebuildTsdbBlocksNoRecoverableBlocks) {
  const std::string dataDirPath = buildRepairTempPath("rebuild-tsdb-empty-data");
  RepairTempDirGuard dataDirGuard(dataDirPath);
  const std::string outputDirPath = buildRepairTempPath("rebuild-tsdb-empty-output");
  RepairTempDirGuard outputDirGuard(outputDirPath);
  const std::string sep(TD_DIRSEP);
  const std::string blockDir = dataDirPath + sep + "vnode" + sep + "vnode2" + sep + "tsdb" + sep + "f300";
  ASSERT_EQ(taosMulMkDir(blockDir.c_str()), 0);

  TdFilePtr pFile =
      taosOpenFile((blockDir + sep + "v2f300ver1.0.data").c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  ASSERT_NE(pFile, nullptr);
  ASSERT_EQ(taosCloseFile(&pFile), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "tsdb"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601010LL, &ctx), TSDB_CODE_SUCCESS);

  SRepairTsdbBlockReport report = {0};
  ASSERT_EQ(tRepairRebuildTsdbBlocks(&ctx, dataDirPath.c_str(), 2, outputDirPath.c_str(), &report),
            TSDB_CODE_INVALID_PARA);
}

// Null/uninitialized arguments to tRepairRebuildTsdbBlocks are rejected.
TEST(RepairOptionParseTest, RebuildTsdbBlocksInvalidArgs) {
  SRepairTsdbBlockReport report = {0};
  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairRebuildTsdbBlocks(NULL, "/tmp", 2, "/tmp/out", &report), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairRebuildTsdbBlocks(&ctx, "/tmp", 2, "/tmp/out", &report), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairRebuildTsdbBlocks(&ctx, "/tmp", 2, "/tmp/out", NULL), TSDB_CODE_INVALID_PARA);
}

// When --backup-path is set, the backup dir is rooted there:
// <backupRoot>/repair-<startTimeMs>/vnode<id>/<file-type>, and it is created.
TEST(RepairOptionParseTest, PrepareBackupDirWithConfiguredPath) {
  const std::string backupRoot = buildRepairTempPath("backup-root-configured");
  RepairTempDirGuard backupRootGuard(backupRoot);
  ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601101LL, &ctx), TSDB_CODE_SUCCESS);

  char backupDir[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "/tmp/unused-data-dir", 2, backupDir, sizeof(backupDir)),
            TSDB_CODE_SUCCESS);

  std::string expected =
      backupRoot + std::string(TD_DIRSEP) + "repair-1735689601101" + TD_DIRSEP + "vnode2" + TD_DIRSEP + "wal";
  ASSERT_STREQ(backupDir, expected.c_str());
  ASSERT_TRUE(taosDirExist(backupDir));
}

// Without --backup-path (hasBackupPath == false), the backup dir defaults to
// <dataDir>/backup/repair-<startTimeMs>/vnode<id>/<file-type>.
TEST(RepairOptionParseTest, PrepareBackupDirWithDefaultPath) {
  const std::string dataDir = buildRepairTempPath("backup-default-data");
  RepairTempDirGuard dataDirGuard(dataDir);
  ASSERT_EQ(taosMulMkDir(dataDir.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601102LL, &ctx), TSDB_CODE_SUCCESS);
  ASSERT_FALSE(ctx.hasBackupPath);

  char backupDir[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, dataDir.c_str(), 3, backupDir, sizeof(backupDir)), TSDB_CODE_SUCCESS);

  std::string expected =
      dataDir + std::string(TD_DIRSEP) + "backup" + TD_DIRSEP + "repair-1735689601102" + TD_DIRSEP + "vnode3" +
      TD_DIRSEP + "meta";
  ASSERT_STREQ(backupDir, expected.c_str());
  ASSERT_TRUE(taosDirExist(backupDir));
}

// tRepairPrepareBackupDir rejects NULL ctx, uninitialized ctx, NULL/empty
// data dir, negative vnode id, NULL output buffer, and zero buffer size.
TEST(RepairOptionParseTest, PrepareBackupDirInvalidArgs) {
  SRepairCtx ctx = {0};
  char backupDir[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareBackupDir(NULL, "/tmp", 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "/tmp", 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601103LL, &ctx), TSDB_CODE_SUCCESS);

  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, NULL, 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "", 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "/tmp", -1, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "/tmp", 2, NULL, sizeof(backupDir)), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareBackupDir(&ctx, "/tmp", 2, backupDir, 0), TSDB_CODE_INVALID_PARA);
}

// Session preparation under a configured backup path creates the session
// dir, repair.log and repair.state.json, and the initial state JSON carries
// sessionId, startTimeMs, status "initialized" and totalVnodes.
TEST(RepairOptionParseTest, PrepareSessionFilesWithConfiguredPath) {
  const std::string backupRoot = buildRepairTempPath("session-files-configured-root");
  RepairTempDirGuard backupRootGuard(backupRoot);
  ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601201LL, &ctx), TSDB_CODE_SUCCESS);

  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};
  ASSERT_EQ(
      tRepairPrepareSessionFiles(&ctx, "/tmp/unused-data-dir", sessionDir, sizeof(sessionDir), logPath,
                                 sizeof(logPath), statePath, sizeof(statePath)),
      TSDB_CODE_SUCCESS);

  std::string expectedSessionDir = backupRoot + std::string(TD_DIRSEP) + "repair-1735689601201";
  std::string expectedLogPath = expectedSessionDir + TD_DIRSEP + "repair.log";
  std::string expectedStatePath = expectedSessionDir + TD_DIRSEP + "repair.state.json";
  ASSERT_STREQ(sessionDir, expectedSessionDir.c_str());
  ASSERT_STREQ(logPath, expectedLogPath.c_str());
  ASSERT_STREQ(statePath, expectedStatePath.c_str());

  ASSERT_TRUE(taosDirExist(sessionDir));
  ASSERT_TRUE(taosCheckExistFile(logPath));
  ASSERT_TRUE(taosCheckExistFile(statePath));

  // Validate the initial session-state JSON content.
  std::string stateContent = readRepairFileContent(statePath);
  ASSERT_FALSE(stateContent.empty());
  SJson *pJson = tjsonParse(stateContent.c_str());
  ASSERT_NE(pJson, nullptr);

  char sessionId[REPAIR_SESSION_ID_LEN] = {0};
  ASSERT_EQ(tjsonGetStringValue2(pJson, "sessionId", sessionId, sizeof(sessionId)), TSDB_CODE_SUCCESS);
  ASSERT_STREQ(sessionId, "repair-1735689601201");

  int64_t startTimeMs = 0;
  ASSERT_EQ(tjsonGetBigIntValue(pJson, "startTimeMs", &startTimeMs), TSDB_CODE_SUCCESS);
  ASSERT_EQ(startTimeMs, 1735689601201LL);

  char status[64] = {0};
  ASSERT_EQ(tjsonGetStringValue2(pJson, "status", status, sizeof(status)), TSDB_CODE_SUCCESS);
  ASSERT_STREQ(status, "initialized");

  int32_t totalVnodes = 0;
  ASSERT_EQ(tjsonGetIntValue(pJson, "totalVnodes", &totalVnodes), TSDB_CODE_SUCCESS);
  ASSERT_EQ(totalVnodes, 1);
  tjsonDelete(pJson);
}

// Log lines appended via tRepairAppendSessionLog accumulate in repair.log;
// tRepairWriteSessionState overwrites the state JSON with step/status and
// done/total vnode counters.
TEST(RepairOptionParseTest, AppendSessionLogAndWriteSessionState) {
  const std::string dataDir = buildRepairTempPath("session-files-default-data");
  RepairTempDirGuard dataDirGuard(dataDir);
  ASSERT_EQ(taosMulMkDir(dataDir.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "meta"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);

  SRepairCtx ctx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601202LL, &ctx), TSDB_CODE_SUCCESS);

  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareSessionFiles(&ctx, dataDir.c_str(), sessionDir, sizeof(sessionDir), logPath,
                                       sizeof(logPath), statePath, sizeof(statePath)),
            TSDB_CODE_SUCCESS);

  ASSERT_EQ(tRepairAppendSessionLog(logPath, "precheck passed"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairAppendSessionLog(logPath, "backup directories prepared"), TSDB_CODE_SUCCESS);
  std::string logContent = readRepairFileContent(logPath);
  ASSERT_NE(logContent.find("precheck passed"), std::string::npos);
  ASSERT_NE(logContent.find("backup directories prepared"), std::string::npos);

  ASSERT_EQ(tRepairWriteSessionState(&ctx, statePath, "precheck", "running", 1, 2), TSDB_CODE_SUCCESS);
  std::string stateContent = readRepairFileContent(statePath);
  ASSERT_FALSE(stateContent.empty());

  SJson *pJson = tjsonParse(stateContent.c_str());
  ASSERT_NE(pJson, nullptr);

  char step[64] = {0};
  ASSERT_EQ(tjsonGetStringValue2(pJson, "step", step, sizeof(step)), TSDB_CODE_SUCCESS);
  ASSERT_STREQ(step, "precheck");

  char status[64] = {0};
  ASSERT_EQ(tjsonGetStringValue2(pJson, "status", status, sizeof(status)), TSDB_CODE_SUCCESS);
  ASSERT_STREQ(status, "running");

  int32_t doneVnodes = 0;
  int32_t totalVnodes = 0;
  ASSERT_EQ(tjsonGetIntValue(pJson, "doneVnodes", &doneVnodes), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tjsonGetIntValue(pJson, "totalVnodes", &totalVnodes), TSDB_CODE_SUCCESS);
  ASSERT_EQ(doneVnodes, 1);
  ASSERT_EQ(totalVnodes, 2);
  tjsonDelete(pJson);
}

// Null/empty/uninitialized arguments to the three session-file helpers
// (prepare, append-log, write-state) are all rejected.
TEST(RepairOptionParseTest, SessionFilesInvalidArgs) {
  SRepairCtx ctx = {0};
  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};

  ASSERT_EQ(tRepairPrepareSessionFiles(NULL, "/tmp", sessionDir, sizeof(sessionDir), logPath, sizeof(logPath),
                                       statePath, sizeof(statePath)),
            TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareSessionFiles(&ctx, "/tmp", sessionDir, sizeof(sessionDir), logPath, sizeof(logPath),
                                       statePath, sizeof(statePath)),
            TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairPrepareSessionFiles(&ctx, NULL, sessionDir, sizeof(sessionDir), logPath, sizeof(logPath),
                                       statePath, sizeof(statePath)),
            TSDB_CODE_INVALID_PARA);

  ASSERT_EQ(tRepairAppendSessionLog(NULL, "msg"), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairAppendSessionLog("", "msg"), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairAppendSessionLog("/tmp/x.log", NULL), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairAppendSessionLog("/tmp/x.log", ""), TSDB_CODE_INVALID_PARA);

  ASSERT_EQ(tRepairWriteSessionState(NULL, "/tmp/x.state", "step", "status", 1, 1), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairWriteSessionState(&ctx, "/tmp/x.state", "step", "status", 1, 1), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairWriteSessionState(&ctx, NULL, "step", "status", 1, 1), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairWriteSessionState(&ctx, "/tmp/x.state", NULL, "status", 1, 1), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairWriteSessionState(&ctx, "/tmp/x.state", "step", NULL, 1, 1), TSDB_CODE_INVALID_PARA);
}

// Resume picks the unfinished session (status "running", done < total) and
// skips the finished one; the resume ctx adopts the unfinished session's id,
// start time, and file paths.
TEST(RepairOptionParseTest, TryResumeSessionFindsUnfinishedState) {
  const std::string backupRoot = buildRepairTempPath("resume-root");
  RepairTempDirGuard backupRootGuard(backupRoot);
  ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  // A session that already completed all vnodes ("ready", 2/2) must be skipped.
  SRepairCtx finishedCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601401LL, &finishedCtx), TSDB_CODE_SUCCESS);
  char finishedSessionDir[PATH_MAX] = {0};
  char finishedLogPath[PATH_MAX] = {0};
  char finishedStatePath[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareSessionFiles(&finishedCtx, "/tmp/unused-data-dir", finishedSessionDir,
                                       sizeof(finishedSessionDir), finishedLogPath, sizeof(finishedLogPath),
                                       finishedStatePath, sizeof(finishedStatePath)),
            TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairWriteSessionState(&finishedCtx, finishedStatePath, "preflight", "ready", 2, 2), TSDB_CODE_SUCCESS);

  // A still-running session ("running", 1/2) is the one that must be resumed.
  SRepairCtx runningCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601402LL, &runningCtx), TSDB_CODE_SUCCESS);
  char runningSessionDir[PATH_MAX] = {0};
  char runningLogPath[PATH_MAX] = {0};
  char runningStatePath[PATH_MAX] = {0};
  ASSERT_EQ(
      tRepairPrepareSessionFiles(&runningCtx, "/tmp/unused-data-dir", runningSessionDir, sizeof(runningSessionDir),
                                 runningLogPath, sizeof(runningLogPath), runningStatePath, sizeof(runningStatePath)),
      TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairWriteSessionState(&runningCtx, runningStatePath, "backup", "running", 1, 2), TSDB_CODE_SUCCESS);

  SRepairCtx resumeCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601999LL, &resumeCtx), TSDB_CODE_SUCCESS);

  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};
  char resumeStep[32] = {0};
  int32_t doneVnodes = -1;
  int32_t totalVnodes = -1;
  bool resumed = false;
  ASSERT_EQ(tRepairTryResumeSession(&resumeCtx, "/tmp/unused-data-dir", sessionDir, sizeof(sessionDir), logPath,
                                    sizeof(logPath), statePath, sizeof(statePath), &doneVnodes, &totalVnodes,
                                    &resumed, resumeStep, sizeof(resumeStep)),
            TSDB_CODE_SUCCESS);
  ASSERT_TRUE(resumed);
  ASSERT_EQ(doneVnodes, 1);
  ASSERT_EQ(totalVnodes, 2);
  ASSERT_STREQ(resumeStep, "backup");
  ASSERT_STREQ(resumeCtx.sessionId, "repair-1735689601402");
  ASSERT_EQ(resumeCtx.startTimeMs, 1735689601402LL);
  ASSERT_STREQ(sessionDir, runningSessionDir);
  ASSERT_STREQ(logPath, runningLogPath);
  ASSERT_STREQ(statePath, runningStatePath);
}

// A running session whose CLI arguments (vnode-id "8,9") differ from the
// current invocation ("2,3") must not be resumed: resumed=false, output paths
// and resumeStep stay empty, and the new ctx keeps its own session id.
TEST(RepairOptionParseTest, TryResumeSessionSkipMismatchedState) {
  const std::string backupRoot = buildRepairTempPath("resume-mismatch-root");
  RepairTempDirGuard backupRootGuard(backupRoot);
  ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0);

  SRepairCliArgs oldCliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&oldCliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&oldCliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&oldCliArgs, "vnode-id", "8,9"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&oldCliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&oldCliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  SRepairCtx oldCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&oldCliArgs, 1735689601403LL, &oldCtx), TSDB_CODE_SUCCESS);
  char oldSessionDir[PATH_MAX] = {0};
  char oldLogPath[PATH_MAX] = {0};
  char oldStatePath[PATH_MAX] = {0};
  ASSERT_EQ(tRepairPrepareSessionFiles(&oldCtx, "/tmp/unused-data-dir", oldSessionDir, sizeof(oldSessionDir),
                                       oldLogPath, sizeof(oldLogPath), oldStatePath, sizeof(oldStatePath)),
            TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairWriteSessionState(&oldCtx, oldStatePath, "backup", "running", 1, 2), TSDB_CODE_SUCCESS);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  SRepairCtx resumeCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689602999LL, &resumeCtx), TSDB_CODE_SUCCESS);

  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};
  char resumeStep[32] = {0};
  int32_t doneVnodes = -1;
  int32_t totalVnodes = -1;
  bool resumed = true;
  ASSERT_EQ(tRepairTryResumeSession(&resumeCtx, "/tmp/unused-data-dir", sessionDir, sizeof(sessionDir), logPath,
                                    sizeof(logPath), statePath, sizeof(statePath), &doneVnodes, &totalVnodes,
                                    &resumed, resumeStep, sizeof(resumeStep)),
            TSDB_CODE_SUCCESS);
  ASSERT_FALSE(resumed);
  ASSERT_EQ(doneVnodes, 0);
  ASSERT_EQ(totalVnodes, 2);
  ASSERT_EQ(resumeStep[0], '\0');
  ASSERT_STREQ(resumeCtx.sessionId, "repair-1735689602999");
  ASSERT_EQ(resumeCtx.startTimeMs, 1735689602999LL);
  ASSERT_EQ(sessionDir[0], '\0');
  ASSERT_EQ(logPath[0], '\0');
  ASSERT_EQ(statePath[0], '\0');
}

// tRepairTryResumeSession rejects NULL/uninitialized ctx, NULL resumeStep
// buffer, and a zero-sized resumeStep buffer.
TEST(RepairOptionParseTest, TryResumeSessionInvalidArgs) {
  SRepairCtx ctx = {0};
  char path[PATH_MAX] = {0};
  char step[32] = {0};
  int32_t doneVnodes = 0;
  int32_t totalVnodes = 0;
  bool resumed = false;

  ASSERT_EQ(tRepairTryResumeSession(NULL, "/tmp", path, sizeof(path), path, sizeof(path), path, sizeof(path),
                                    &doneVnodes, &totalVnodes, &resumed, step, sizeof(step)),
            TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairTryResumeSession(&ctx, "/tmp", path, sizeof(path), path, sizeof(path), path, sizeof(path),
                                    &doneVnodes, &totalVnodes, &resumed, step, sizeof(step)),
            TSDB_CODE_INVALID_PARA);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689603000LL, &ctx), TSDB_CODE_SUCCESS);

  ASSERT_EQ(tRepairTryResumeSession(&ctx, "/tmp", path, sizeof(path), path, sizeof(path), path, sizeof(path),
                                    &doneVnodes, &totalVnodes, &resumed, NULL, sizeof(step)),
            TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairTryResumeSession(&ctx, "/tmp", path, sizeof(path), path, sizeof(path), path, sizeof(path),
                                    &doneVnodes, &totalVnodes, &resumed, step, 0),
            TSDB_CODE_INVALID_PARA);
}

// End-to-end: a session interrupted at the "wal" step with 1/2 vnodes done
// resolves into a plan that skips backup preparation and resumes the wal
// step at vnode index 1.
TEST(RepairOptionParseTest, ResolveResumePlanModeStepFromResumeState) {
  const std::string backupRoot = buildRepairTempPath("resume-mode-step-root");
  RepairTempDirGuard backupRootGuard(backupRoot);
  ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0);

  SRepairCliArgs cliArgs = {0};
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS);

  SRepairCtx runningCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689603401LL, &runningCtx), TSDB_CODE_SUCCESS);
  char runningSessionDir[PATH_MAX] = {0};
  char runningLogPath[PATH_MAX] = {0};
  char runningStatePath[PATH_MAX] = {0};
  ASSERT_EQ(
      tRepairPrepareSessionFiles(&runningCtx, "/tmp/unused-data-dir", runningSessionDir, sizeof(runningSessionDir),
                                 runningLogPath, sizeof(runningLogPath), runningStatePath, sizeof(runningStatePath)),
      TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairWriteSessionState(&runningCtx, runningStatePath, "wal", "running", 1, 2), TSDB_CODE_SUCCESS);

  SRepairCtx resumeCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689603999LL, &resumeCtx), TSDB_CODE_SUCCESS);

  char sessionDir[PATH_MAX] = {0};
  char logPath[PATH_MAX] = {0};
  char statePath[PATH_MAX] = {0};
  char resumeStep[32] = {0};
  int32_t doneVnodes = -1;
  int32_t totalVnodes = -1;
  bool resumed = false;
  ASSERT_EQ(tRepairTryResumeSession(&resumeCtx, "/tmp/unused-data-dir", sessionDir, sizeof(sessionDir), logPath,
                                    sizeof(logPath), statePath, sizeof(statePath), &doneVnodes, &totalVnodes,
                                    &resumed, resumeStep, sizeof(resumeStep)),
            TSDB_CODE_SUCCESS);
  ASSERT_TRUE(resumed);
  ASSERT_EQ(doneVnodes, 1);
  ASSERT_EQ(totalVnodes, 2);
  ASSERT_STREQ(resumeStep, "wal");

  SRepairResumePlan plan = {0};
  ASSERT_EQ(tRepairResolveResumePlan(resumeCtx.nodeType, resumeStep, doneVnodes, resumeCtx.vnodeIdNum, &plan),
            TSDB_CODE_SUCCESS);
  ASSERT_TRUE(plan.skipBackupPreparation);
  ASSERT_TRUE(plan.resumeAtModeStep);
  ASSERT_EQ(plan.backupStartVnodeIndex, 0);
  ASSERT_EQ(plan.replicaStartVnodeIndex, 0);
  ASSERT_EQ(plan.copyStartVnodeIndex, 0);
  ASSERT_EQ(plan.walStartVnodeIndex, 1);
  ASSERT_EQ(plan.tsdbStartVnodeIndex, 0);
  ASSERT_EQ(plan.metaStartVnodeIndex, 0);
}

// Pre-mode steps ("", "init", "precheck", "backup") all produce a plan that
// re-runs backup preparation starting at the recorded vnode index.
TEST(RepairOptionParseTest, ResolveResumePlanForBackupLikeSteps) {
  const char *steps[] = {
      "",
      "init",
      "precheck",
      "backup",
  };
  for (const char *step : steps) {
    SRepairResumePlan plan = {0};
    ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, step, 2, 3, &plan), TSDB_CODE_SUCCESS);
    ASSERT_FALSE(plan.skipBackupPreparation);
    ASSERT_FALSE(plan.resumeAtModeStep);
    ASSERT_EQ(plan.backupStartVnodeIndex, 2);
    ASSERT_EQ(plan.replicaStartVnodeIndex, 0);
    ASSERT_EQ(plan.copyStartVnodeIndex, 0);
    ASSERT_EQ(plan.walStartVnodeIndex, 0);
    ASSERT_EQ(plan.tsdbStartVnodeIndex, 0);
    ASSERT_EQ(plan.metaStartVnodeIndex, 0);
  }
}

// Each mode step maps its done-vnode count to the matching start index and
// leaves every other start index at 0.
TEST(RepairOptionParseTest, ResolveResumePlanForModeSteps) {
  struct {
    const char *step;
    int32_t replicaStart;
    int32_t copyStart;
    int32_t walStart;
    int32_t tsdbStart;
    int32_t metaStart;
  } cases[] = {
      {"replica", 1, 0, 0, 0, 0},
      {"copy", 0, 1, 0, 0, 0},
      {"wal", 0, 0, 1, 0, 0},
      {"tsdb", 0, 0, 0, 1, 0},
      {"meta", 0, 0, 0, 0, 1},
  };

  for (const auto &it : cases) {
    SRepairResumePlan plan = {0};
    ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, it.step, 1, 3, &plan), TSDB_CODE_SUCCESS);
    ASSERT_TRUE(plan.skipBackupPreparation);
    ASSERT_TRUE(plan.resumeAtModeStep);
    ASSERT_EQ(plan.backupStartVnodeIndex, 0);
    ASSERT_EQ(plan.replicaStartVnodeIndex, it.replicaStart);
    ASSERT_EQ(plan.copyStartVnodeIndex, it.copyStart);
    ASSERT_EQ(plan.walStartVnodeIndex, it.walStart);
    ASSERT_EQ(plan.tsdbStartVnodeIndex, it.tsdbStart);
    ASSERT_EQ(plan.metaStartVnodeIndex, it.metaStart);
  }
}

// Plan resolution rejects NULL step, negative or out-of-range done counts,
// unknown step names, and a NULL plan; for a non-vnode node type a "replica"
// step degrades to an all-zero, non-resuming plan.
TEST(RepairOptionParseTest, ResolveResumePlanInvalidArgs) {
  SRepairResumePlan plan = {0};
  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, NULL, 0, 1, &plan), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, "backup", -1, 1, &plan), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, "backup", 2, 1, &plan), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, "unknown-step", 0, 1, &plan), TSDB_CODE_INVALID_PARA);
  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_VNODE, "backup", 0, 1, NULL), TSDB_CODE_INVALID_PARA);

  ASSERT_EQ(tRepairResolveResumePlan(REPAIR_NODE_TYPE_MNODE, "replica", 0, 0, &plan), TSDB_CODE_SUCCESS);
  ASSERT_FALSE(plan.skipBackupPreparation);
  ASSERT_FALSE(plan.resumeAtModeStep);
  ASSERT_EQ(plan.backupStartVnodeIndex, 0);
  ASSERT_EQ(plan.replicaStartVnodeIndex, 0);
  ASSERT_EQ(plan.copyStartVnodeIndex, 0);
  ASSERT_EQ(plan.walStartVnodeIndex, 0);
  ASSERT_EQ(plan.tsdbStartVnodeIndex, 0);
  ASSERT_EQ(plan.metaStartVnodeIndex, 0);
}

// WAL force-repair runs only for file-type wal with mode force; wal+replica
// and tsdb+force must both report needRun == false.
TEST(RepairOptionParseTest, NeedRunWalForceRepair) {
  bool needRun = false;

  SRepairCliArgs walForceCli = {0};
  ASSERT_EQ(tRepairParseCliOption(&walForceCli, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walForceCli, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walForceCli, "mode", "force"), TSDB_CODE_SUCCESS);
  SRepairCtx walForceCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&walForceCli, 1735689601501LL, &walForceCtx), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairNeedRunWalForceRepair(&walForceCtx, &needRun), TSDB_CODE_SUCCESS);
  ASSERT_TRUE(needRun);

  SRepairCliArgs walReplicaCli = {0};
  ASSERT_EQ(tRepairParseCliOption(&walReplicaCli, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walReplicaCli, "file-type", "wal"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walReplicaCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&walReplicaCli, "mode", "replica"), TSDB_CODE_SUCCESS);
  SRepairCtx walReplicaCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&walReplicaCli, 1735689601502LL, &walReplicaCtx), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairNeedRunWalForceRepair(&walReplicaCtx, &needRun), TSDB_CODE_SUCCESS);
  ASSERT_FALSE(needRun);

  SRepairCliArgs tsdbForceCli = {0};
  ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "node-type", "vnode"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "file-type", "tsdb"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "mode", "force"), TSDB_CODE_SUCCESS);
  SRepairCtx tsdbForceCtx = {0};
  ASSERT_EQ(tRepairInitCtx(&tsdbForceCli, 1735689601503LL, &tsdbForceCtx), TSDB_CODE_SUCCESS);
  ASSERT_EQ(tRepairNeedRunWalForceRepair(&tsdbForceCtx, &needRun), TSDB_CODE_SUCCESS);
  ASSERT_FALSE(needRun);
}

// NOTE(review): the following test continues past the end of this chunk; its
// body is intentionally left incomplete here.
TEST(RepairOptionParseTest, NeedRunTsdbForceRepair) {
  bool needRun = false;

  SRepairCliArgs tsdbForceCli = {0};
  ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "node-type", "vnode"),
TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "file-type", "tsdb"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "mode", "force"), TSDB_CODE_SUCCESS); + SRepairCtx tsdbForceCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&tsdbForceCli, 1735689601506LL, &tsdbForceCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunTsdbForceRepair(&tsdbForceCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needRun); + + SRepairCliArgs tsdbReplicaCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&tsdbReplicaCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbReplicaCli, "file-type", "tsdb"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbReplicaCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbReplicaCli, "mode", "replica"), TSDB_CODE_SUCCESS); + SRepairCtx tsdbReplicaCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&tsdbReplicaCli, 1735689601507LL, &tsdbReplicaCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunTsdbForceRepair(&tsdbReplicaCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); + + SRepairCliArgs walForceCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&walForceCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&walForceCli, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&walForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&walForceCli, "mode", "force"), TSDB_CODE_SUCCESS); + SRepairCtx walForceCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&walForceCli, 1735689601508LL, &walForceCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunTsdbForceRepair(&walForceCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); +} + +TEST(RepairOptionParseTest, NeedRunMetaForceRepair) { + bool needRun = false; + + SRepairCliArgs metaForceCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&metaForceCli, "node-type", 
"vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaForceCli, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaForceCli, "mode", "force"), TSDB_CODE_SUCCESS); + SRepairCtx metaForceCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&metaForceCli, 1735689601509LL, &metaForceCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunMetaForceRepair(&metaForceCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needRun); + + SRepairCliArgs metaReplicaCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&metaReplicaCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaReplicaCli, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaReplicaCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&metaReplicaCli, "mode", "replica"), TSDB_CODE_SUCCESS); + SRepairCtx metaReplicaCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&metaReplicaCli, 1735689601510LL, &metaReplicaCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunMetaForceRepair(&metaReplicaCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); + + SRepairCliArgs tsdbForceCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "file-type", "tsdb"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&tsdbForceCli, "mode", "force"), TSDB_CODE_SUCCESS); + SRepairCtx tsdbForceCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&tsdbForceCli, 1735689601511LL, &tsdbForceCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunMetaForceRepair(&tsdbForceCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); +} + +TEST(RepairOptionParseTest, NeedRunReplicaRepair) { + bool needRun = false; + + SRepairCliArgs replicaCli = {0}; + 
ASSERT_EQ(tRepairParseCliOption(&replicaCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "mode", "replica"), TSDB_CODE_SUCCESS); + SRepairCtx replicaCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&replicaCli, 1735689601512LL, &replicaCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunReplicaRepair(&replicaCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needRun); + + SRepairCliArgs forceCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&forceCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&forceCli, "file-type", "meta"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&forceCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&forceCli, "mode", "force"), TSDB_CODE_SUCCESS); + SRepairCtx forceCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&forceCli, 1735689601513LL, &forceCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunReplicaRepair(&forceCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); + } + + TEST(RepairOptionParseTest, NeedRunReplicaRepairInvalidArgs) { + bool needRun = true; + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairNeedRunReplicaRepair(NULL, &needRun), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedRunReplicaRepair(&ctx, &needRun), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedRunReplicaRepair(&ctx, NULL), TSDB_CODE_INVALID_PARA); + } + + TEST(RepairOptionParseTest, NeedRunCopyRepair) { + bool needRun = false; + + SRepairCliArgs copyCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&copyCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&copyCli, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&copyCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&copyCli, "mode", "copy"), TSDB_CODE_SUCCESS); + 
ASSERT_EQ(tRepairParseCliOption(&copyCli, "replica-node", "tdnode1:/var/lib/taos"), TSDB_CODE_SUCCESS); + SRepairCtx copyCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&copyCli, 1735689601518LL, &copyCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunCopyRepair(&copyCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needRun); + + SRepairCliArgs replicaCli = {0}; + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&replicaCli, "mode", "replica"), TSDB_CODE_SUCCESS); + SRepairCtx replicaCtx = {0}; + ASSERT_EQ(tRepairInitCtx(&replicaCli, 1735689601519LL, &replicaCtx), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairNeedRunCopyRepair(&replicaCtx, &needRun), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needRun); + } + + TEST(RepairOptionParseTest, NeedRunCopyRepairInvalidArgs) { + bool needRun = true; + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairNeedRunCopyRepair(NULL, &needRun), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedRunCopyRepair(&ctx, &needRun), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedRunCopyRepair(&ctx, NULL), TSDB_CODE_INVALID_PARA); + } + + TEST(RepairOptionParseTest, BuildCopySshScpCommands) { + char sshCmd[PATH_MAX * 2] = {0}; + char scpCmd[PATH_MAX * 2] = {0}; + const char *remoteTarget = "/var/lib/taos/vnode/vnode2/wal"; + const char *localTarget = "/tmp/td-repair-local/vnode/vnode2/wal"; + + ASSERT_EQ(tRepairBuildCopySshProbeCmd("tdnode1", remoteTarget, sshCmd, sizeof(sshCmd)), TSDB_CODE_SUCCESS); + ASSERT_NE(std::string(sshCmd).find("ssh"), std::string::npos); + ASSERT_NE(std::string(sshCmd).find("tdnode1"), std::string::npos); + ASSERT_NE(std::string(sshCmd).find("test -d"), std::string::npos); + ASSERT_NE(std::string(sshCmd).find(remoteTarget), std::string::npos); + + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1", remoteTarget, localTarget, scpCmd, 
sizeof(scpCmd)), TSDB_CODE_SUCCESS); + ASSERT_NE(std::string(scpCmd).find("scp"), std::string::npos); + ASSERT_NE(std::string(scpCmd).find("tdnode1:"), std::string::npos); + ASSERT_NE(std::string(scpCmd).find(remoteTarget), std::string::npos); + ASSERT_NE(std::string(scpCmd).find(localTarget), std::string::npos); +} + +TEST(RepairOptionParseTest, BuildCopySshScpCommandsInvalidArgs) { + char cmd[64] = {0}; + ASSERT_EQ(tRepairBuildCopySshProbeCmd(NULL, "/var/lib/taos/vnode/vnode2/wal", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopySshProbeCmd("tdnode1", NULL, cmd, sizeof(cmd)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopySshProbeCmd("tdnode1", "/var/lib/taos/vnode/vnode2/wal", NULL, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd(NULL, "/var/lib/taos/vnode/vnode2/wal", "/tmp/local", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1", NULL, "/tmp/local", cmd, sizeof(cmd)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1", "/var/lib/taos/vnode/vnode2/wal", NULL, cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1", "/var/lib/taos/vnode/vnode2/wal", "/tmp/local", NULL, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopySshProbeCmd("tdnode1;rm", "/var/lib/taos/vnode/vnode2/wal", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopySshProbeCmd("tdnode1", "/var/lib/taos/vnode/vnode2/wal;rm", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1|cat", "/var/lib/taos/vnode/vnode2/wal", "/tmp/local", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildCopyScpCmd("tdnode1", "/var/lib/taos/vnode/vnode2/wal", "/tmp/local$(id)", cmd, sizeof(cmd)), + TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, DegradeReplicaVnodeWritesMarker) { + const std::string dataDirPath = buildRepairTempPath("replica-degrade-marker"); + 
RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string vnodeDir = dataDirPath + sep + "vnode" + sep + "vnode2"; + ASSERT_EQ(taosMulMkDir(vnodeDir.c_str()), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "replica"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601514LL, &ctx), TSDB_CODE_SUCCESS); + + char markerPath[PATH_MAX] = {0}; + ASSERT_EQ(tRepairDegradeReplicaVnode(&ctx, dataDirPath.c_str(), 2, markerPath, sizeof(markerPath)), TSDB_CODE_SUCCESS); + ASSERT_TRUE(taosCheckExistFile(markerPath)); + + std::string markerContent = readRepairFileContent(markerPath); + ASSERT_FALSE(markerContent.empty()); + ASSERT_NE(markerContent.find("\"action\":\"degrade-local-replica\""), std::string::npos); + ASSERT_NE(markerContent.find("\"availability\":\"offline\""), std::string::npos); + ASSERT_NE(markerContent.find("\"syncPolicy\":\"full-sync\""), std::string::npos); + ASSERT_NE(markerContent.find("\"versionPolicy\":\"reset-local-version\""), std::string::npos); + ASSERT_NE(markerContent.find("\"termPolicy\":\"bump-local-term\""), std::string::npos); + ASSERT_NE(markerContent.find("\"vnodeId\":2"), std::string::npos); +} + +TEST(RepairOptionParseTest, DegradeReplicaVnodeInvalidArgs) { + SRepairCtx ctx = {0}; + char markerPath[PATH_MAX] = {0}; + + ASSERT_EQ(tRepairDegradeReplicaVnode(NULL, "/tmp", 2, markerPath, sizeof(markerPath)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairDegradeReplicaVnode(&ctx, "/tmp", 2, markerPath, sizeof(markerPath)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairDegradeReplicaVnode(&ctx, "/tmp", 2, NULL, sizeof(markerPath)), TSDB_CODE_INVALID_PARA); + 
ASSERT_EQ(tRepairDegradeReplicaVnode(&ctx, "/tmp", 2, markerPath, 0), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, WriteReplicaRestoreHint) { + const std::string dataDirPath = buildRepairTempPath("replica-restore-hint-data"); + const std::string backupDirPath = buildRepairTempPath("replica-restore-hint-backup"); + RepairTempDirGuard dataDirGuard(dataDirPath); + RepairTempDirGuard backupDirGuard(backupDirPath); + ASSERT_EQ(taosMulMkDir(dataDirPath.c_str()), 0); + ASSERT_EQ(taosMulMkDir(backupDirPath.c_str()), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "replica"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupDirPath.c_str()), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601515LL, &ctx), TSDB_CODE_SUCCESS); + + char hintPath[PATH_MAX] = {0}; + ASSERT_EQ(tRepairWriteReplicaRestoreHint(&ctx, dataDirPath.c_str(), hintPath, sizeof(hintPath)), TSDB_CODE_SUCCESS); + ASSERT_TRUE(taosCheckExistFile(hintPath)); + + std::string hintContent = readRepairFileContent(hintPath); + ASSERT_FALSE(hintContent.empty()); + ASSERT_NE(hintContent.find("\"mnodeMsgType\":\"TDMT_MND_RESTORE_DNODE\""), std::string::npos); + ASSERT_NE(hintContent.find("\"restoreType\":\"RESTORE_TYPE__VNODE\""), std::string::npos); + ASSERT_NE(hintContent.find("\"vgroupAction\":\"mndBuildRestoreAlterVgroupAction\""), std::string::npos); + ASSERT_NE(hintContent.find("\"restoreSqlHint\":\"RESTORE VNODE ON DNODE"), std::string::npos); + ASSERT_NE(hintContent.find("\"vnodeIds\":\"2,3\""), std::string::npos); +} + +TEST(RepairOptionParseTest, WriteReplicaRestoreHintInvalidArgs) { + SRepairCtx ctx = {0}; + char 
hintPath[PATH_MAX] = {0}; + + ASSERT_EQ(tRepairWriteReplicaRestoreHint(NULL, "/tmp", hintPath, sizeof(hintPath)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairWriteReplicaRestoreHint(&ctx, "/tmp", hintPath, sizeof(hintPath)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairWriteReplicaRestoreHint(&ctx, "/tmp", NULL, sizeof(hintPath)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairWriteReplicaRestoreHint(&ctx, "/tmp", hintPath, 0), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, RollbackReplicaVnodeRemovesMarker) { + const std::string dataDirPath = buildRepairTempPath("replica-rollback-marker"); + RepairTempDirGuard dataDirGuard(dataDirPath); + const std::string sep(TD_DIRSEP); + const std::string vnodeDir = dataDirPath + sep + "vnode" + sep + "vnode2"; + ASSERT_EQ(taosMulMkDir(vnodeDir.c_str()), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "replica"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601516LL, &ctx), TSDB_CODE_SUCCESS); + + char markerPath[PATH_MAX] = {0}; + ASSERT_EQ(tRepairDegradeReplicaVnode(&ctx, dataDirPath.c_str(), 2, markerPath, sizeof(markerPath)), TSDB_CODE_SUCCESS); + ASSERT_TRUE(taosCheckExistFile(markerPath)); + + ASSERT_EQ(tRepairRollbackReplicaVnode(&ctx, dataDirPath.c_str(), 2), TSDB_CODE_SUCCESS); + ASSERT_FALSE(taosCheckExistFile(markerPath)); +} + +TEST(RepairOptionParseTest, RollbackReplicaVnodeInvalidArgs) { + SRepairCtx ctx = {0}; + + ASSERT_EQ(tRepairRollbackReplicaVnode(NULL, "/tmp", 2), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackReplicaVnode(&ctx, "/tmp", 2), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackReplicaVnode(&ctx, NULL, 2), TSDB_CODE_INVALID_PARA); + 
ASSERT_EQ(tRepairRollbackReplicaVnode(&ctx, "/tmp", -1), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, ParseReplicaNodeEndpoint) { + char host[128] = {0}; + char remoteDataDir[PATH_MAX] = {0}; + + ASSERT_EQ(tRepairParseReplicaNodeEndpoint("tdnode1:/var/lib/taos", host, sizeof(host), remoteDataDir, + sizeof(remoteDataDir)), + TSDB_CODE_SUCCESS); + ASSERT_STREQ(host, "tdnode1"); + ASSERT_STREQ(remoteDataDir, "/var/lib/taos"); + + ASSERT_EQ(tRepairParseReplicaNodeEndpoint(NULL, host, sizeof(host), remoteDataDir, sizeof(remoteDataDir)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseReplicaNodeEndpoint("tdnode1:/var/lib/taos", NULL, sizeof(host), remoteDataDir, + sizeof(remoteDataDir)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseReplicaNodeEndpoint("tdnode1:/var/lib/taos", host, 0, remoteDataDir, sizeof(remoteDataDir)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseReplicaNodeEndpoint("tdnode1:/var/lib/taos", host, sizeof(host), NULL, sizeof(remoteDataDir)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairParseReplicaNodeEndpoint("tdnode1:/var/lib/taos", host, sizeof(host), remoteDataDir, 0), + TSDB_CODE_INVALID_PARA); + + const char *invalidEndpoints[] = { + "tdnode1", + ":/var/lib/taos", + "tdnode1:", + "tdnode1:var/lib/taos", + "tdnode1:/var/lib/taos:bak", + "td node1:/var/lib/taos", + "tdnode1;rm:/var/lib/taos", + "tdnode1:/var/lib/taos;rm", + "tdnode1:/var/lib/taos|cat", + "tdnode1:/var/lib/taos&&id", + "tdnode1:/var/lib/taos$(id)", + "tdnode1:/var/lib/taos'bad'", + }; + for (const char *endpoint : invalidEndpoints) { + ASSERT_EQ( + tRepairParseReplicaNodeEndpoint(endpoint, host, sizeof(host), remoteDataDir, sizeof(remoteDataDir)), + TSDB_CODE_INVALID_PARA); + } +} + +TEST(RepairOptionParseTest, MockCopyReplicaVnodeTarget) { + const std::string localDataDir = buildRepairTempPath("copy-local-data"); + const std::string remoteDataDir = buildRepairTempPath("copy-remote-data"); + RepairTempDirGuard localDataGuard(localDataDir); + 
RepairTempDirGuard remoteDataGuard(remoteDataDir); + ASSERT_EQ(taosMulMkDir(localDataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(remoteDataDir.c_str()), 0); + + const std::string sep(TD_DIRSEP); + const std::string remoteWalDir = remoteDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + const std::string remoteWalMetaDir = remoteWalDir + sep + "meta"; + ASSERT_EQ(taosMulMkDir(remoteWalMetaDir.c_str()), 0); + const std::string localWalDir = localDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(localWalDir.c_str()), 0); + + auto writeRepairFile = [](const std::string &path, const std::string &content) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosWriteFile(pFile, content.c_str(), (int64_t)content.size()), (int64_t)content.size()); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + + const std::string remoteWalFile = remoteWalDir + sep + "000001.log"; + const std::string remoteWalMetaFile = remoteWalMetaDir + sep + "checkpoint"; + const std::string localStaleFile = localWalDir + sep + "stale.log"; + writeRepairFile(remoteWalFile, "remote-wal"); + writeRepairFile(remoteWalMetaFile, "remote-meta"); + writeRepairFile(localStaleFile, "stale"); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "tdnode1:/var/lib/taos"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601517LL, &ctx), TSDB_CODE_SUCCESS); + + char srcPath[PATH_MAX] = {0}; + char dstPath[PATH_MAX] = {0}; + ASSERT_EQ(tRepairMockCopyReplicaVnodeTarget(&ctx, 
remoteDataDir.c_str(), localDataDir.c_str(), 2, srcPath, + sizeof(srcPath), dstPath, sizeof(dstPath)), + TSDB_CODE_SUCCESS); + ASSERT_STREQ(srcPath, remoteWalDir.c_str()); + ASSERT_STREQ(dstPath, localWalDir.c_str()); + ASSERT_FALSE(taosCheckExistFile(localStaleFile.c_str())); + ASSERT_EQ(readRepairFileContent((localWalDir + sep + "000001.log").c_str()), "remote-wal"); + ASSERT_EQ(readRepairFileContent((localWalDir + sep + "meta" + sep + "checkpoint").c_str()), "remote-meta"); +} + +TEST(RepairOptionParseTest, MockCopyReplicaVnodeTargetInvalidArgs) { + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairMockCopyReplicaVnodeTarget(NULL, "/tmp/remote", "/tmp/local", 2, NULL, 0, NULL, 0), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairMockCopyReplicaVnodeTarget(&ctx, "/tmp/remote", "/tmp/local", 2, NULL, 0, NULL, 0), + TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, SshScpCopyReplicaVnodeTargetFixesOwnerAndPermission) { + const std::string localDataDir = buildRepairTempPath("copy-ssh-local-data"); + const std::string remoteDataDir = buildRepairTempPath("copy-ssh-remote-data"); + const std::string binDir = buildRepairTempPath("copy-ssh-mock-bin"); + RepairTempDirGuard localDataGuard(localDataDir); + RepairTempDirGuard remoteDataGuard(remoteDataDir); + RepairTempDirGuard binDirGuard(binDir); + ASSERT_EQ(taosMulMkDir(localDataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(remoteDataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(binDir.c_str()), 0); + + const std::string sep(TD_DIRSEP); + const std::string remoteWalDir = remoteDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + const std::string localWalDir = localDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(remoteWalDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(localWalDir.c_str()), 0); + + auto writeRepairFile = [](const std::string &path, const std::string &content) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, 
nullptr); + ASSERT_EQ(taosWriteFile(pFile, content.c_str(), (int64_t)content.size()), (int64_t)content.size()); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + writeRepairFile(remoteWalDir + sep + "000001.log", "remote-wal"); + + ASSERT_EQ(runRepairCommandGetLastLine("chmod 700 '" + remoteWalDir + "' && echo ok"), "ok"); + + const std::string sshMockPath = binDir + sep + "ssh-mock"; + const std::string scpMockPath = binDir + sep + "scp-mock"; + writeRepairFile(sshMockPath, + "#!/usr/bin/env bash\n" + "set -euo pipefail\n" + "cmd=\"${@: -1}\"\n" + "bash -c \"$cmd\"\n"); + writeRepairFile(scpMockPath, + "#!/usr/bin/env bash\n" + "set -euo pipefail\n" + "src=\"${@: -2:1}\"\n" + "dst=\"${@: -1}\"\n" + "remote=\"${src#*:}\"\n" + "mkdir -p \"$dst\"\n" + "cp -r \"$remote/.\" \"$dst\"\n" + "chmod -R 755 \"$dst\"\n"); + ASSERT_EQ(runRepairCommandGetLastLine("chmod +x '" + sshMockPath + "' '" + scpMockPath + "' && echo ok"), "ok"); + + RepairEnvVarGuard sshGuard("TAOS_REPAIR_SSH_BIN"); + RepairEnvVarGuard scpGuard("TAOS_REPAIR_SCP_BIN"); + ASSERT_EQ(setenv("TAOS_REPAIR_SSH_BIN", sshMockPath.c_str(), 1), 0); + ASSERT_EQ(setenv("TAOS_REPAIR_SCP_BIN", scpMockPath.c_str(), 1), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "tdnode1:/var/lib/taos"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601518LL, &ctx), TSDB_CODE_SUCCESS); + + ASSERT_EQ(tRepairSshScpCopyReplicaVnodeTarget(&ctx, "tdnode1", remoteDataDir.c_str(), localDataDir.c_str(), 2, NULL, 0, + NULL, 0), + TSDB_CODE_SUCCESS); + + std::string remoteMeta = runRepairCommandGetLastLine("stat -c '%u %g 
%a' '" + remoteWalDir + "'"); + std::string localMeta = runRepairCommandGetLastLine("stat -c '%u %g %a' '" + localWalDir + "'"); + ASSERT_FALSE(remoteMeta.empty()); + ASSERT_EQ(localMeta, remoteMeta); +} + +TEST(RepairOptionParseTest, SshScpCopyReplicaVnodeTargetDetectsConsistencyMismatch) { + const std::string localDataDir = buildRepairTempPath("copy-ssh-local-data-mismatch"); + const std::string remoteDataDir = buildRepairTempPath("copy-ssh-remote-data-mismatch"); + const std::string binDir = buildRepairTempPath("copy-ssh-mock-bin-mismatch"); + RepairTempDirGuard localDataGuard(localDataDir); + RepairTempDirGuard remoteDataGuard(remoteDataDir); + RepairTempDirGuard binDirGuard(binDir); + ASSERT_EQ(taosMulMkDir(localDataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(remoteDataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(binDir.c_str()), 0); + + const std::string sep(TD_DIRSEP); + const std::string remoteWalDir = remoteDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + const std::string localWalDir = localDataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + ASSERT_EQ(taosMulMkDir(remoteWalDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(localWalDir.c_str()), 0); + + auto writeRepairFile = [](const std::string &path, const std::string &content) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosWriteFile(pFile, content.c_str(), (int64_t)content.size()), (int64_t)content.size()); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + writeRepairFile(remoteWalDir + sep + "000001.log", "remote-wal-1"); + writeRepairFile(remoteWalDir + sep + "000002.log", "remote-wal-2"); + + const std::string sshMockPath = binDir + sep + "ssh-mock"; + const std::string scpMockPath = binDir + sep + "scp-mock"; + writeRepairFile(sshMockPath, + "#!/usr/bin/env bash\n" + "set -euo pipefail\n" + "cmd=\"${@: -1}\"\n" + "bash -c \"$cmd\"\n"); + writeRepairFile(scpMockPath, + "#!/usr/bin/env bash\n" + "set -euo 
pipefail\n" + "src=\"${@: -2:1}\"\n" + "dst=\"${@: -1}\"\n" + "remote=\"${src#*:}\"\n" + "mkdir -p \"$dst\"\n" + "cp \"$remote/000001.log\" \"$dst/\"\n"); + ASSERT_EQ(runRepairCommandGetLastLine("chmod +x '" + sshMockPath + "' '" + scpMockPath + "' && echo ok"), "ok"); + + RepairEnvVarGuard sshGuard("TAOS_REPAIR_SSH_BIN"); + RepairEnvVarGuard scpGuard("TAOS_REPAIR_SCP_BIN"); + ASSERT_EQ(setenv("TAOS_REPAIR_SSH_BIN", sshMockPath.c_str(), 1), 0); + ASSERT_EQ(setenv("TAOS_REPAIR_SCP_BIN", scpMockPath.c_str(), 1), 0); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "copy"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "replica-node", "tdnode1:/var/lib/taos"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601519LL, &ctx), TSDB_CODE_SUCCESS); + + ASSERT_EQ(tRepairSshScpCopyReplicaVnodeTarget(&ctx, "tdnode1", remoteDataDir.c_str(), localDataDir.c_str(), 2, NULL, 0, + NULL, 0), + TSDB_CODE_FAILED); +} + +TEST(RepairOptionParseTest, BuildVnodeTargetPath) { + char targetPath[PATH_MAX] = {0}; + + ASSERT_EQ(tRepairBuildVnodeTargetPath("/tmp/repair-data", 11, REPAIR_FILE_TYPE_WAL, targetPath, sizeof(targetPath)), + TSDB_CODE_SUCCESS); + std::string expectedWal = + std::string("/tmp/repair-data") + TD_DIRSEP + "vnode" + TD_DIRSEP + "vnode11" + TD_DIRSEP + "wal"; + ASSERT_STREQ(targetPath, expectedWal.c_str()); + + ASSERT_EQ(tRepairBuildVnodeTargetPath("/tmp/repair-data", 11, REPAIR_FILE_TYPE_META, targetPath, sizeof(targetPath)), + TSDB_CODE_SUCCESS); + std::string expectedMeta = + std::string("/tmp/repair-data") + TD_DIRSEP + "vnode" + TD_DIRSEP + "vnode11" + TD_DIRSEP + "meta"; + ASSERT_STREQ(targetPath, expectedMeta.c_str()); + 
+ ASSERT_EQ(tRepairBuildVnodeTargetPath(NULL, 11, REPAIR_FILE_TYPE_WAL, targetPath, sizeof(targetPath)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildVnodeTargetPath("/tmp/repair-data", -1, REPAIR_FILE_TYPE_WAL, targetPath, sizeof(targetPath)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildVnodeTargetPath("/tmp/repair-data", 11, REPAIR_FILE_TYPE_DATA, targetPath, sizeof(targetPath)), + TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildVnodeTargetPath("/tmp/repair-data", 11, REPAIR_FILE_TYPE_WAL, NULL, sizeof(targetPath)), + TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, BackupAndRollbackVnodeTarget) { + const std::string dataDir = buildRepairTempPath("backup-rollback-data"); + const std::string backupRoot = buildRepairTempPath("backup-rollback-root"); + RepairTempDirGuard dataDirGuard(dataDir); + RepairTempDirGuard backupRootGuard(backupRoot); + ASSERT_EQ(taosMulMkDir(dataDir.c_str()), 0); + ASSERT_EQ(taosMulMkDir(backupRoot.c_str()), 0); + + const std::string sep(TD_DIRSEP); + const std::string walDir = dataDir + sep + "vnode" + sep + "vnode2" + sep + "wal"; + const std::string walMetaDir = walDir + sep + "meta"; + ASSERT_EQ(taosMulMkDir(walMetaDir.c_str()), 0); + + auto writeRepairTestFile = [](const std::string &path, const std::string &content) { + TdFilePtr pFile = taosOpenFile(path.c_str(), TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + ASSERT_NE(pFile, nullptr); + ASSERT_EQ(taosWriteFile(pFile, content.c_str(), (int64_t)content.size()), (int64_t)content.size()); + ASSERT_EQ(taosCloseFile(&pFile), 0); + }; + + const std::string walFile = walDir + sep + "000001.log"; + const std::string walMetaFile = walMetaDir + sep + "checkpoint"; + writeRepairTestFile(walFile, "origin-wal"); + writeRepairTestFile(walMetaFile, "origin-meta"); + + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + 
ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "backup-path", backupRoot.c_str()), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601504LL, &ctx), TSDB_CODE_SUCCESS); + + char backupDir[PATH_MAX] = {0}; + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, dataDir.c_str(), 2, backupDir, sizeof(backupDir)), TSDB_CODE_SUCCESS); + + const std::string expectedBackupDir = + backupRoot + sep + "repair-1735689601504" + sep + "vnode2" + sep + "wal"; + ASSERT_STREQ(backupDir, expectedBackupDir.c_str()); + ASSERT_TRUE(taosDirExist(backupDir)); + + const std::string backupWalFile = expectedBackupDir + sep + "000001.log"; + const std::string backupWalMetaFile = expectedBackupDir + sep + "meta" + sep + "checkpoint"; + ASSERT_STREQ(readRepairFileContent(backupWalFile.c_str()).c_str(), "origin-wal"); + ASSERT_STREQ(readRepairFileContent(backupWalMetaFile.c_str()).c_str(), "origin-meta"); + + writeRepairTestFile(walFile, "mutated-wal"); + ASSERT_EQ(taosRemoveFile(walMetaFile.c_str()), 0); + + ASSERT_EQ(tRepairRollbackVnodeTarget(&ctx, dataDir.c_str(), 2), TSDB_CODE_SUCCESS); + ASSERT_STREQ(readRepairFileContent(walFile.c_str()).c_str(), "origin-wal"); + ASSERT_STREQ(readRepairFileContent(walMetaFile.c_str()).c_str(), "origin-meta"); +} + +TEST(RepairOptionParseTest, BackupAndRollbackVnodeTargetInvalidArgs) { + SRepairCtx ctx = {0}; + char backupDir[PATH_MAX] = {0}; + ASSERT_EQ(tRepairBackupVnodeTarget(NULL, "/tmp", 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, "/tmp", 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackVnodeTarget(NULL, "/tmp", 2), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackVnodeTarget(&ctx, "/tmp", 2), TSDB_CODE_INVALID_PARA); + + SRepairCliArgs cliArgs = {0}; + 
ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601505LL, &ctx), TSDB_CODE_SUCCESS); + + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, NULL, 2, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, "/tmp", -1, backupDir, sizeof(backupDir)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, "/tmp", 2, NULL, sizeof(backupDir)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBackupVnodeTarget(&ctx, "/tmp", 2, backupDir, 0), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackVnodeTarget(&ctx, NULL, 2), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairRollbackVnodeTarget(&ctx, "/tmp", -1), TSDB_CODE_INVALID_PARA); +} + +TEST(RepairOptionParseTest, BuildProgressLineAndSummaryLine) { + SRepairCliArgs cliArgs = {0}; + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "node-type", "vnode"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "file-type", "wal"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "vnode-id", "2,3"), TSDB_CODE_SUCCESS); + ASSERT_EQ(tRepairParseCliOption(&cliArgs, "mode", "force"), TSDB_CODE_SUCCESS); + + SRepairCtx ctx = {0}; + ASSERT_EQ(tRepairInitCtx(&cliArgs, 1735689601301LL, &ctx), TSDB_CODE_SUCCESS); + + char progressLine[256] = {0}; + ASSERT_EQ(tRepairBuildProgressLine(&ctx, "backup", 1, 2, progressLine, sizeof(progressLine)), TSDB_CODE_SUCCESS); + ASSERT_STREQ(progressLine, "repair progress: session=repair-1735689601301 step=backup vnode=1/2 progress=50%"); + + char summaryLine[256] = {0}; + ASSERT_EQ(tRepairBuildSummaryLine(&ctx, 2, 0, 4567, summaryLine, sizeof(summaryLine)), TSDB_CODE_SUCCESS); + ASSERT_STREQ(summaryLine, + "repair summary: 
session=repair-1735689601301 status=success successVnodes=2 failedVnodes=0 " + "elapsedMs=4567"); +} + +TEST(RepairOptionParseTest, NeedReportProgress) { + int64_t lastReportMs = 0; + bool needReport = false; + + ASSERT_EQ(tRepairNeedReportProgress(1000, 3000, &lastReportMs, &needReport), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needReport); + ASSERT_EQ(lastReportMs, 1000); + + ASSERT_EQ(tRepairNeedReportProgress(2000, 3000, &lastReportMs, &needReport), TSDB_CODE_SUCCESS); + ASSERT_FALSE(needReport); + ASSERT_EQ(lastReportMs, 1000); + + ASSERT_EQ(tRepairNeedReportProgress(4001, 3000, &lastReportMs, &needReport), TSDB_CODE_SUCCESS); + ASSERT_TRUE(needReport); + ASSERT_EQ(lastReportMs, 4001); +} + +TEST(RepairOptionParseTest, ProgressReporterInvalidArgs) { + SRepairCtx ctx = {0}; + char line[32] = {0}; + int64_t lastReportMs = 0; + bool needReport = false; + ASSERT_EQ(tRepairBuildProgressLine(NULL, "step", 0, 1, line, sizeof(line)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildSummaryLine(NULL, 1, 0, 1, line, sizeof(line)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedReportProgress(1, 1000, NULL, &needReport), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedReportProgress(1, 1000, &lastReportMs, NULL), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedReportProgress(-1, 1000, &lastReportMs, &needReport), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairNeedReportProgress(1, 0, &lastReportMs, &needReport), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildProgressLine(&ctx, "step", 0, 1, line, sizeof(line)), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(tRepairBuildSummaryLine(&ctx, 1, 0, 1, line, sizeof(line)), TSDB_CODE_INVALID_PARA); +} + #pragma GCC diagnostic pop diff --git a/source/dnode/mgmt/exe/dmMain.c b/source/dnode/mgmt/exe/dmMain.c index 6c7843bed351..3d4fe42d50b8 100644 --- a/source/dnode/mgmt/exe/dmMain.c +++ b/source/dnode/mgmt/exe/dmMain.c @@ -23,8 +23,10 @@ #include "tconfig.h" #include "tconv.h" #include "tglobal.h" +#include "trepair.h" #include "tss.h" #include 
"version.h" +#include "wal.h" #ifdef TD_JEMALLOC_ENABLED #define ALLOW_FORBID_FUNC @@ -46,6 +48,19 @@ #define DM_EMAIL "" #define DM_MEM_DBG "Enable memory debug" #define DM_SET_ENCRYPTKEY "Set encrypt key. such as: -y 1234567890abcdef, the length should be less or equal to 16." +#define DM_REPAIR "Enable repair mode. Works with --node-type/--file-type/--mode and other repair options." +#define DM_REPAIR_NODE_TYPE "Repair target node type. Options: vnode, mnode, dnode, snode." +#define DM_REPAIR_FILE_TYPE "Repair target file type. Examples: vnode->wal|meta|tsdb; mnode->wal|data; dnode->config; snode->checkpoint." +#define DM_REPAIR_VNODE_ID "Target vnode id list, separated by comma (required when --node-type=vnode)." +#define DM_REPAIR_BACKUP_PATH "Backup path for corrupted files before repair." +#define DM_REPAIR_MODE "Repair mode. Options: force, replica, copy." +#define DM_REPAIR_REPLICA_NODE "Replica node endpoint for copy mode. Format: :, required when --mode=copy." +#define DM_REPAIR_NODE_TYPE_OPT "--node-type" +#define DM_REPAIR_FILE_TYPE_OPT "--file-type" +#define DM_REPAIR_VNODE_ID_OPT "--vnode-id" +#define DM_REPAIR_BACKUP_PATH_OPT "--backup-path" +#define DM_REPAIR_MODE_OPT "--mode" +#define DM_REPAIR_REPLICA_NODE_OPT "--replica-node" // clang-format on static struct { @@ -67,6 +82,11 @@ static struct { bool printAuth; bool printVersion; bool printHelp; + SRepairCliArgs repairCliArgs; + SRepairCtx repairCtx; + char repairSessionDir[PATH_MAX]; + char repairLogPath[PATH_MAX]; + char repairStatePath[PATH_MAX]; char envFile[PATH_MAX]; char apolloUrl[PATH_MAX]; const char **envCmd; @@ -186,6 +206,60 @@ static void dmSetSignalHandle() { extern bool generateNewMeta; +static bool dmHasRepairCliOption(const SRepairCliArgs *pCliArgs) { + if (pCliArgs == NULL) { + return false; + } + + return pCliArgs->hasNodeType || pCliArgs->hasFileType || pCliArgs->hasVnodeIdList || pCliArgs->hasBackupPath || + pCliArgs->hasMode || pCliArgs->hasReplicaNode; +} + +static bool 
dmMatchLongOption(const char *arg, const char *optionName) { + if (arg == NULL || optionName == NULL || optionName[0] == '\0') { + return false; + } + + int32_t optionLen = strlen(optionName); + if (strcmp(arg, optionName) == 0) { + return true; + } + + return strncmp(arg, optionName, optionLen) == 0 && arg[optionLen] == '='; +} + +static int32_t dmParseRepairCliLongOption(int32_t argc, char const *argv[], int32_t *pIndex, const char *cliOptionName, + const char *repairOptionName) { + if (argc <= 0 || argv == NULL || pIndex == NULL || cliOptionName == NULL || repairOptionName == NULL) { + return TSDB_CODE_INVALID_CFG; + } + + const char *optionValue = NULL; + bool matched = false; + int32_t code = tRepairExtractLongOptionValue(argc, argv, pIndex, cliOptionName, &optionValue, &matched); + if (!matched) { + return TSDB_CODE_NOT_FOUND; + } + if (code != TSDB_CODE_SUCCESS || optionValue == NULL) { + const char *arg = argv[*pIndex]; + bool missingValue = arg != NULL && strcmp(arg, cliOptionName) == 0 && *pIndex >= argc - 1; + if (missingValue) { + printf("'%s' requires a parameter\n", cliOptionName); + } else { + printf("invalid value of '%s'\n", cliOptionName); + } + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairParseCliOption(&global.repairCliArgs, repairOptionName, optionValue); + if (code != TSDB_CODE_SUCCESS) { + printf("invalid value of '%s': %s\n", cliOptionName, optionValue); + return TSDB_CODE_INVALID_CFG; + } + + return TSDB_CODE_SUCCESS; +} + static int32_t dmParseArgs(int32_t argc, char const *argv[]) { global.startTime = taosGetTimestampMs(); @@ -239,6 +313,36 @@ static int32_t dmParseArgs(int32_t argc, char const *argv[]) { } } else if (strcmp(argv[i], "-r") == 0) { generateNewMeta = true; + } else if (dmMatchLongOption(argv[i], DM_REPAIR_NODE_TYPE_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_NODE_TYPE_OPT, "node-type"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (dmMatchLongOption(argv[i], 
DM_REPAIR_FILE_TYPE_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_FILE_TYPE_OPT, "file-type"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (dmMatchLongOption(argv[i], DM_REPAIR_VNODE_ID_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_VNODE_ID_OPT, "vnode-id"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (dmMatchLongOption(argv[i], DM_REPAIR_BACKUP_PATH_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_BACKUP_PATH_OPT, "backup-path"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (dmMatchLongOption(argv[i], DM_REPAIR_MODE_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_MODE_OPT, "mode"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + } else if (dmMatchLongOption(argv[i], DM_REPAIR_REPLICA_NODE_OPT)) { + int32_t code = dmParseRepairCliLongOption(argc, argv, &i, DM_REPAIR_REPLICA_NODE_OPT, "replica-node"); + if (code != TSDB_CODE_SUCCESS) { + return code; + } } else if (strcmp(argv[i], "-E") == 0) { if (i < argc - 1) { if (strlen(argv[++i]) >= PATH_MAX) { @@ -322,6 +426,25 @@ static int32_t dmParseArgs(int32_t argc, char const *argv[]) { } } + if (dmHasRepairCliOption(&global.repairCliArgs)) { + if (!generateNewMeta) { + printf("repair options require '-r'\n"); + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = tRepairValidateCliArgs(&global.repairCliArgs); + if (code != TSDB_CODE_SUCCESS) { + printf("invalid repair option combination\n"); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairInitCtx(&global.repairCliArgs, global.startTime, &global.repairCtx); + if (code != TSDB_CODE_SUCCESS) { + printf("failed to initialize repair context\n"); + return TSDB_CODE_INVALID_CFG; + } + } + return 0; } @@ -368,6 +491,13 @@ static void dmPrintHelp() { printf("%s%s%s%s\n", indent, "-y,", indent, DM_SET_ENCRYPTKEY); printf("%s%s%s%s\n", indent, "-dm,", indent, DM_MEM_DBG); 
printf("%s%s%s%s\n", indent, "-V,", indent, DM_VERSION); + printf("%s%s%s%s\n", indent, "-r,", indent, DM_REPAIR); + printf("%s%s%s%s\n", indent, "--node-type=NODE_TYPE,", indent, DM_REPAIR_NODE_TYPE); + printf("%s%s%s%s\n", indent, "--file-type=FILE_TYPE,", indent, DM_REPAIR_FILE_TYPE); + printf("%s%s%s%s\n", indent, "--vnode-id=VNODE_IDS,", indent, DM_REPAIR_VNODE_ID); + printf("%s%s%s%s\n", indent, "--backup-path=PATH,", indent, DM_REPAIR_BACKUP_PATH); + printf("%s%s%s%s\n", indent, "--mode=MODE,", indent, DM_REPAIR_MODE); + printf("%s%s%s%s\n", indent, "--replica-node=NODE,", indent, DM_REPAIR_REPLICA_NODE); printf("\n\nReport bugs to %s.\n", DM_EMAIL); } @@ -433,6 +563,1313 @@ static void taosCleanupArgs() { if (global.envCmd != NULL) taosMemoryFreeClear(global.envCmd); } +static void dmRollbackReplicaArtifacts(int32_t startVnodeIndex, int32_t degradedVnodes, const char *restoreHintPath) { + if (restoreHintPath != NULL && restoreHintPath[0] != '\0' && taosCheckExistFile(restoreHintPath)) { + if (taosRemoveFile(restoreHintPath) != 0) { + dError("failed to remove replica restore hint:%s since %s", restoreHintPath, + terrno != 0 ? 
tstrerror(terrno) : "unknown reason"); + (void)tRepairAppendSessionLog(global.repairLogPath, "replica rollback detail: failed to remove restore hint"); + } else { + (void)tRepairAppendSessionLog(global.repairLogPath, "replica rollback detail: removed restore hint"); + } + } + + int32_t firstVnodeIndex = startVnodeIndex; + int32_t lastVnodeIndex = startVnodeIndex + degradedVnodes - 1; + for (int32_t i = lastVnodeIndex; i >= firstVnodeIndex && i < global.repairCtx.vnodeIdNum; --i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + int32_t code = tRepairRollbackReplicaVnode(&global.repairCtx, tsDataDir, vnodeId); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to rollback replica marker for vnode:%d since %s", vnodeId, tstrerror(code)); + (void)tRepairAppendSessionLog(global.repairLogPath, "replica rollback detail: failed to rollback vnode marker"); + continue; + } + + char rollbackLog[128] = {0}; + int32_t rollbackLogLen = + tsnprintf(rollbackLog, sizeof(rollbackLog), "replica rollback detail: rolled back vnode:%d", vnodeId); + if (rollbackLogLen <= 0 || rollbackLogLen >= (int32_t)sizeof(rollbackLog)) { + tstrncpy(rollbackLog, "replica rollback detail: rolled back vnode", sizeof(rollbackLog)); + } + (void)tRepairAppendSessionLog(global.repairLogPath, rollbackLog); + } +} + +static int32_t dmRunReplicaRepair(int32_t totalVnodes, int32_t startVnodeIndex, int64_t repairProgressIntervalMs, + int64_t *pLastProgressReportMs, bool *pNeedReport, char *progressLine, + int32_t progressLineSize) { + if (pLastProgressReportMs == NULL || pNeedReport == NULL || progressLine == NULL || progressLineSize <= 0) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = 0; + bool needRunReplicaRepair = false; + code = tRepairNeedRunReplicaRepair(&global.repairCtx, &needRunReplicaRepair); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to determine replica repair schedule since %s", tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + return 
TSDB_CODE_INVALID_CFG; + } + + if (!needRunReplicaRepair) { + return TSDB_CODE_SUCCESS; + } + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + return TSDB_CODE_INVALID_CFG; + } + + char dispatchLog[PATH_MAX] = {0}; + int32_t dispatchLogLen = tsnprintf(dispatchLog, sizeof(dispatchLog), "replica dispatch detail: vnodeTargets=%d " + "replicaNode=%s", + global.repairCtx.vnodeIdNum, + global.repairCtx.hasReplicaNode ? global.repairCtx.replicaNode : "auto"); + if (dispatchLogLen <= 0 || dispatchLogLen >= (int32_t)sizeof(dispatchLog)) { + tstrncpy(dispatchLog, "replica dispatch detail unavailable", sizeof(dispatchLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, dispatchLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append replica dispatch log since %s", tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char restoreHintPath[PATH_MAX] = {0}; + int32_t replicaDoneVnodes = startVnodeIndex; + int32_t degradedVnodes = 0; + for (int32_t i = startVnodeIndex; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + char markerPath[PATH_MAX] = {0}; + + code = + tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "replica", "running", replicaDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write replica repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairDegradeReplicaVnode(&global.repairCtx, tsDataDir, vnodeId, markerPath, sizeof(markerPath)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to degrade local replica vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + 
dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + ++replicaDoneVnodes; + ++degradedVnodes; + + char detailLog[PATH_MAX * 2] = {0}; + int32_t detailLogLen = tsnprintf(detailLog, sizeof(detailLog), + "replica degrade detail: vnode:%d marker:%s action=degrade-local-replica " + "availability=offline syncPolicy=full-sync " + "versionPolicy=reset-local-version termPolicy=bump-local-term", + vnodeId, markerPath); + if (detailLogLen <= 0 || detailLogLen >= (int32_t)sizeof(detailLog)) { + tstrncpy(detailLog, "replica degrade detail unavailable", sizeof(detailLog)); + } + + code = tRepairAppendSessionLog(global.repairLogPath, detailLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append replica degrade log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + code = + tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "replica", "running", replicaDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update replica repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), repairProgressIntervalMs, pLastProgressReportMs, pNeedReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update replica repair progress interval for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + if (*pNeedReport || replicaDoneVnodes == totalVnodes) { + code 
= + tRepairBuildProgressLine(&global.repairCtx, "replica", replicaDoneVnodes, totalVnodes, progressLine, + progressLineSize); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build replica repair progress line for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append replica repair progress log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + } + } + + code = tRepairWriteReplicaRestoreHint(&global.repairCtx, tsDataDir, restoreHintPath, sizeof(restoreHintPath)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write replica restore hint since %s", tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + const char *restoreImpl = "enterprise"; +#ifndef TD_ENTERPRISE + restoreImpl = "community-stub"; +#endif + + char restoreLog[PATH_MAX * 2] = {0}; + int32_t restoreLogLen = tsnprintf(restoreLog, sizeof(restoreLog), + "replica restore detail: hint:%s " + "mnodeMsgType=TDMT_MND_RESTORE_DNODE " + "restoreType=RESTORE_TYPE__VNODE " + "vgroupAction=mndBuildRestoreAlterVgroupAction " + "restoreDnodeImpl=%s", + restoreHintPath, restoreImpl); + if (restoreLogLen <= 0 || restoreLogLen >= (int32_t)sizeof(restoreLog)) { + tstrncpy(restoreLog, "replica restore detail unavailable", sizeof(restoreLog)); + } + + code = tRepairAppendSessionLog(global.repairLogPath, 
restoreLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append replica restore log since %s", tstrerror(code)); + printf("failed repair replica scheduling: %s\n", tstrerror(code)); + dmRollbackReplicaArtifacts(startVnodeIndex, degradedVnodes, restoreHintPath); + return TSDB_CODE_INVALID_CFG; + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t dmRunCopyRepair(int32_t totalVnodes, int32_t startVnodeIndex, int64_t repairProgressIntervalMs, + int64_t *pLastProgressReportMs, bool *pNeedReport, char *progressLine, + int32_t progressLineSize) { + if (pLastProgressReportMs == NULL || pNeedReport == NULL || progressLine == NULL || progressLineSize <= 0) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = 0; + bool needRunCopyRepair = false; + code = tRepairNeedRunCopyRepair(&global.repairCtx, &needRunCopyRepair); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to determine copy repair schedule since %s", tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (!needRunCopyRepair) { + return TSDB_CODE_SUCCESS; + } + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + return TSDB_CODE_INVALID_CFG; + } + + char replicaHost[PATH_MAX] = {0}; + char replicaDataDir[PATH_MAX] = {0}; + code = tRepairParseReplicaNodeEndpoint(global.repairCtx.replicaNode, replicaHost, sizeof(replicaHost), replicaDataDir, + sizeof(replicaDataDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to parse copy replica endpoint:%s since %s", global.repairCtx.replicaNode, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char dispatchLog[PATH_MAX] = {0}; + int32_t dispatchLogLen = tsnprintf(dispatchLog, sizeof(dispatchLog), + "copy dispatch detail: vnodeTargets=%d replicaHost=%s replicaDataDir=%s", + global.repairCtx.vnodeIdNum, replicaHost, replicaDataDir); + if (dispatchLogLen <= 0 || dispatchLogLen >= 
(int32_t)sizeof(dispatchLog)) { + tstrncpy(dispatchLog, "copy dispatch detail unavailable", sizeof(dispatchLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, dispatchLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append copy dispatch log since %s", tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + int32_t copyDoneVnodes = startVnodeIndex; + for (int32_t i = startVnodeIndex; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + + char copyBackupDir[PATH_MAX] = {0}; + code = tRepairBackupVnodeTarget(&global.repairCtx, tsDataDir, vnodeId, copyBackupDir, sizeof(copyBackupDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to backup copy target for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char backupLog[PATH_MAX] = {0}; + int32_t backupLogLen = + tsnprintf(backupLog, sizeof(backupLog), "prepared copy backup for vnode:%d path:%s", vnodeId, copyBackupDir); + if (backupLogLen <= 0 || backupLogLen >= (int32_t)sizeof(backupLog)) { + tstrncpy(backupLog, "prepared copy backup", sizeof(backupLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, backupLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append copy backup log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "copy", "running", copyDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write copy repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char srcPath[PATH_MAX] = {0}; + char dstPath[PATH_MAX] = {0}; + code = 
tRepairSshScpCopyReplicaVnodeTarget(&global.repairCtx, replicaHost, replicaDataDir, tsDataDir, vnodeId, + srcPath, sizeof(srcPath), dstPath, sizeof(dstPath)); + if (code != TSDB_CODE_SUCCESS) { + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "copy", "failed", copyDoneVnodes, + totalVnodes); + dError("failed to copy vnode:%d from replica:%s path:%s since %s", vnodeId, replicaHost, replicaDataDir, + tstrerror(code)); + + int32_t rollbackCode = tRepairRollbackVnodeTarget(&global.repairCtx, tsDataDir, vnodeId); + if (rollbackCode != TSDB_CODE_SUCCESS) { + dError("failed to rollback copy repair for vnode:%d since %s", vnodeId, tstrerror(rollbackCode)); + (void)tRepairAppendSessionLog(global.repairLogPath, "copy rollback detail: rollback failed"); + } else { + char rollbackLog[128] = {0}; + int32_t rollbackLogLen = + tsnprintf(rollbackLog, sizeof(rollbackLog), "copy rollback detail: rolled back vnode:%d", vnodeId); + if (rollbackLogLen <= 0 || rollbackLogLen >= (int32_t)sizeof(rollbackLog)) { + tstrncpy(rollbackLog, "copy rollback detail: rolled back vnode", sizeof(rollbackLog)); + } + (void)tRepairAppendSessionLog(global.repairLogPath, rollbackLog); + } + + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + ++copyDoneVnodes; + + char detailLog[PATH_MAX * 2] = {0}; + int32_t detailLogLen = tsnprintf(detailLog, sizeof(detailLog), + "copy replica detail: vnode=%d src=%s dst=%s transport=ssh-scp consistency=verified", + vnodeId, srcPath, dstPath); + if (detailLogLen <= 0 || detailLogLen >= (int32_t)sizeof(detailLog)) { + tstrncpy(detailLog, "copy replica detail unavailable", sizeof(detailLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, detailLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append copy replica detail log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; 
+ } + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "copy", "running", copyDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update copy repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), repairProgressIntervalMs, pLastProgressReportMs, pNeedReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update copy repair progress interval for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (*pNeedReport || copyDoneVnodes == totalVnodes) { + code = tRepairBuildProgressLine(&global.repairCtx, "copy", copyDoneVnodes, totalVnodes, progressLine, + progressLineSize); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build copy repair progress line for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append copy repair progress log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair copy scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t dmRunForceWalRepair(int32_t totalVnodes, int32_t startVnodeIndex, int64_t repairProgressIntervalMs, + int64_t *pLastProgressReportMs, bool *pNeedReport, char *progressLine, + int32_t progressLineSize) { + if (pLastProgressReportMs == NULL || pNeedReport == NULL || progressLine == NULL || progressLineSize <= 0) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = 0; + bool needRunWalForceRepair = false; + 
code = tRepairNeedRunWalForceRepair(&global.repairCtx, &needRunWalForceRepair); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to determine wal force repair schedule since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (!needRunWalForceRepair) { + return TSDB_CODE_SUCCESS; + } + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + return TSDB_CODE_INVALID_CFG; + } + + code = walInit(dmStopDaemon); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to initialize wal module for repair since %s", tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + int32_t walDoneVnodes = startVnodeIndex; + for (int32_t i = startVnodeIndex; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + char walPath[PATH_MAX] = {0}; + char walBackupDir[PATH_MAX] = {0}; + code = tRepairBuildVnodeTargetPath(tsDataDir, vnodeId, REPAIR_FILE_TYPE_WAL, walPath, sizeof(walPath)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build wal path for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairBackupVnodeTarget(&global.repairCtx, tsDataDir, vnodeId, walBackupDir, sizeof(walBackupDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to backup wal target for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char backupLog[PATH_MAX] = {0}; + int32_t backupLogLen = + tsnprintf(backupLog, sizeof(backupLog), "prepared wal backup for vnode:%d path:%s", vnodeId, walBackupDir); + if (backupLogLen <= 0 || backupLogLen >= (int32_t)sizeof(backupLog)) { + tstrncpy(backupLog, "prepared wal backup", sizeof(backupLog)); + } + code = 
tRepairAppendSessionLog(global.repairLogPath, backupLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append wal backup log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "wal", "running", walDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write wal repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + SWalCfg walCfg = {0}; + walCfg.vgId = vnodeId; + walCfg.fsyncPeriod = 0; + walCfg.retentionPeriod = -1; + walCfg.retentionSize = -1; + walCfg.level = TAOS_WAL_WRITE; + + SWal *pWal = walOpen(walPath, &walCfg); + if (pWal == NULL) { + int32_t walCode = terrno != 0 ? terrno : TSDB_CODE_INVALID_PARA; + dError("failed to repair wal for vnode:%d path:%s since %s", vnodeId, walPath, tstrerror(walCode)); + int32_t rollbackCode = tRepairRollbackVnodeTarget(&global.repairCtx, tsDataDir, vnodeId); + if (rollbackCode != TSDB_CODE_SUCCESS) { + dError("failed to rollback wal repair for vnode:%d since %s", vnodeId, tstrerror(rollbackCode)); + (void)tRepairAppendSessionLog(global.repairLogPath, "wal repair rollback failed"); + } else { + char rollbackLog[128] = {0}; + int32_t rollbackLogLen = tsnprintf(rollbackLog, sizeof(rollbackLog), "rolled back wal for vnode:%d", vnodeId); + if (rollbackLogLen <= 0 || rollbackLogLen >= (int32_t)sizeof(rollbackLog)) { + tstrncpy(rollbackLog, "rolled back wal repair", sizeof(rollbackLog)); + } + (void)tRepairAppendSessionLog(global.repairLogPath, rollbackLog); + } + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "wal", "failed", walDoneVnodes, + totalVnodes); + printf("failed repair wal scheduling: %s\n", tstrerror(walCode)); + return TSDB_CODE_INVALID_CFG; + } + + SWalRepairStats 
repairStats = {0}; + code = walGetRepairStats(pWal, &repairStats); + if (code != TSDB_CODE_SUCCESS) { + walClose(pWal); + dError("failed to query wal repair stats for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + walClose(pWal); + + char walDetailLog[192] = {0}; + int32_t walDetailLen = + tsnprintf(walDetailLog, sizeof(walDetailLog), + "wal repair detail: vnode=%d corruptedSegments=%d rebuiltIdxEntries=%" PRId64, vnodeId, + repairStats.corruptedSegments, repairStats.rebuiltIdxEntries); + if (walDetailLen <= 0 || walDetailLen >= (int32_t)sizeof(walDetailLog)) { + tstrncpy(walDetailLog, "wal repair detail unavailable", sizeof(walDetailLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, walDetailLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append wal repair detail log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + ++walDoneVnodes; + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "wal", "running", walDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update wal repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char walLog[128] = {0}; + int32_t walLogLen = tsnprintf(walLog, sizeof(walLog), "finished force wal repair for vnode:%d", vnodeId); + if (walLogLen <= 0 || walLogLen >= (int32_t)sizeof(walLog)) { + tstrncpy(walLog, "finished force wal repair", sizeof(walLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, walLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append wal repair log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return 
TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), repairProgressIntervalMs, pLastProgressReportMs, pNeedReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update wal repair progress interval for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (*pNeedReport || walDoneVnodes == totalVnodes) { + code = tRepairBuildProgressLine(&global.repairCtx, "wal", walDoneVnodes, totalVnodes, progressLine, + progressLineSize); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build wal repair progress line for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append wal repair progress log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair wal scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + } + + return TSDB_CODE_SUCCESS; +} + +static void dmHandleTsdbRepairRollback(int32_t vnodeId, int32_t doneVnodes, int32_t totalVnodes) { + int32_t rollbackCode = tRepairRollbackVnodeTarget(&global.repairCtx, tsDataDir, vnodeId); + if (rollbackCode != TSDB_CODE_SUCCESS) { + dError("failed to rollback tsdb repair for vnode:%d since %s", vnodeId, tstrerror(rollbackCode)); + (void)tRepairAppendSessionLog(global.repairLogPath, "tsdb repair rollback failed"); + } else { + char rollbackLog[128] = {0}; + int32_t rollbackLogLen = tsnprintf(rollbackLog, sizeof(rollbackLog), "rolled back tsdb for vnode:%d", vnodeId); + if (rollbackLogLen <= 0 || rollbackLogLen >= (int32_t)sizeof(rollbackLog)) { + tstrncpy(rollbackLog, "rolled back tsdb repair", sizeof(rollbackLog)); + } + 
(void)tRepairAppendSessionLog(global.repairLogPath, rollbackLog); + } + + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "tsdb", "failed", doneVnodes, totalVnodes); +} + +static int32_t dmRunForceTsdbRepair(int32_t totalVnodes, int32_t startVnodeIndex, int64_t repairProgressIntervalMs, + int64_t *pLastProgressReportMs, bool *pNeedReport, char *progressLine, + int32_t progressLineSize) { + if (pLastProgressReportMs == NULL || pNeedReport == NULL || progressLine == NULL || progressLineSize <= 0) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = 0; + bool needRunTsdbForceRepair = false; + code = tRepairNeedRunTsdbForceRepair(&global.repairCtx, &needRunTsdbForceRepair); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to determine tsdb force repair schedule since %s", tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (!needRunTsdbForceRepair) { + return TSDB_CODE_SUCCESS; + } + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t tsdbDoneVnodes = startVnodeIndex; + for (int32_t i = startVnodeIndex; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + char tsdbPath[PATH_MAX] = {0}; + char tsdbBackupDir[PATH_MAX] = {0}; + char rebuildDir[PATH_MAX] = {0}; + + code = tRepairBuildVnodeTargetPath(tsDataDir, vnodeId, REPAIR_FILE_TYPE_TSDB, tsdbPath, sizeof(tsdbPath)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build tsdb path for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairBackupVnodeTarget(&global.repairCtx, tsDataDir, vnodeId, tsdbBackupDir, sizeof(tsdbBackupDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to backup tsdb target for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb 
scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char backupLog[PATH_MAX] = {0}; + int32_t backupLogLen = + tsnprintf(backupLog, sizeof(backupLog), "prepared tsdb backup for vnode:%d path:%s", vnodeId, tsdbBackupDir); + if (backupLogLen <= 0 || backupLogLen >= (int32_t)sizeof(backupLog)) { + tstrncpy(backupLog, "prepared tsdb backup", sizeof(backupLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, backupLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append tsdb backup log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "tsdb", "running", tsdbDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write tsdb repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + SRepairTsdbBlockReport analyzeReport = {0}; + code = tRepairAnalyzeTsdbBlocks(&global.repairCtx, tsDataDir, vnodeId, &analyzeReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to analyze tsdb blocks for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char analyzeLog[192] = {0}; + int32_t analyzeLogLen = tsnprintf(analyzeLog, sizeof(analyzeLog), + "tsdb analyze detail: vnode=%d totalBlocks=%d recoverableBlocks=%d " + "corruptedBlocks=%d unknownFiles=%d", + vnodeId, analyzeReport.totalBlocks, analyzeReport.recoverableBlocks, + analyzeReport.corruptedBlocks, analyzeReport.unknownFiles); + if (analyzeLogLen <= 0 || analyzeLogLen >= (int32_t)sizeof(analyzeLog)) { + tstrncpy(analyzeLog, "tsdb analyze detail unavailable", sizeof(analyzeLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, analyzeLog); + if (code != 
TSDB_CODE_SUCCESS) { + dError("failed to append tsdb analyze log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + int32_t rebuildDirLen = tsnprintf(rebuildDir, sizeof(rebuildDir), "%s.rebuild", tsdbPath); + if (rebuildDirLen <= 0 || rebuildDirLen >= (int32_t)sizeof(rebuildDir)) { + dError("failed to build tsdb rebuild path for vnode:%d", vnodeId); + printf("failed repair tsdb scheduling: %s\n", tstrerror(TSDB_CODE_INVALID_PARA)); + return TSDB_CODE_INVALID_CFG; + } + + SRepairTsdbBlockReport rebuildReport = {0}; + code = tRepairRebuildTsdbBlocks(&global.repairCtx, tsDataDir, vnodeId, rebuildDir, &rebuildReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to rebuild tsdb blocks for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (taosDirExist(tsdbPath)) { + taosRemoveDir(tsdbPath); + } + + if (taosRenameFile(rebuildDir, tsdbPath) != 0) { + int32_t renameCode = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + dError("failed to switch tsdb rebuild output for vnode:%d since %s", vnodeId, tstrerror(renameCode)); + dmHandleTsdbRepairRollback(vnodeId, tsdbDoneVnodes, totalVnodes); + printf("failed repair tsdb scheduling: %s\n", tstrerror(renameCode)); + return TSDB_CODE_INVALID_CFG; + } + + char tsdbDetailLog[192] = {0}; + int32_t tsdbDetailLen = tsnprintf(tsdbDetailLog, sizeof(tsdbDetailLog), + "tsdb rebuild detail: vnode=%d recoverableBlocks=%d corruptedBlocks=%d " + "reportedCorrupted=%d", + vnodeId, rebuildReport.recoverableBlocks, rebuildReport.corruptedBlocks, + rebuildReport.reportedCorruptedBlocks); + if (tsdbDetailLen <= 0 || tsdbDetailLen >= (int32_t)sizeof(tsdbDetailLog)) { + tstrncpy(tsdbDetailLog, "tsdb rebuild detail unavailable", sizeof(tsdbDetailLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, tsdbDetailLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append tsdb rebuild detail log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + ++tsdbDoneVnodes; + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "tsdb", "running", tsdbDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update tsdb repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char tsdbLog[128] = {0}; + int32_t tsdbLogLen = tsnprintf(tsdbLog, sizeof(tsdbLog), "finished force tsdb repair for vnode:%d", vnodeId); + if (tsdbLogLen <= 0 || tsdbLogLen >= (int32_t)sizeof(tsdbLog)) { + tstrncpy(tsdbLog, "finished force tsdb repair", sizeof(tsdbLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, tsdbLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append tsdb repair log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed 
repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), repairProgressIntervalMs, pLastProgressReportMs, pNeedReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update tsdb repair progress interval for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (*pNeedReport || tsdbDoneVnodes == totalVnodes) { + code = tRepairBuildProgressLine(&global.repairCtx, "tsdb", tsdbDoneVnodes, totalVnodes, progressLine, + progressLineSize); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build tsdb repair progress line for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append tsdb repair progress log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair tsdb scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + } + + return TSDB_CODE_SUCCESS; +} + +static int32_t dmAppendMetaMissingMarkerLog(int32_t vnodeId, const SRepairMetaScanResult *pScanResult) { + if (pScanResult == NULL) { + return TSDB_CODE_INVALID_CFG; + } + + char missingMark[REPAIR_META_MAX_MISSING_FILES * REPAIR_META_FILE_NAME_LEN] = {0}; + int32_t code = tRepairBuildMetaMissingFileMark(pScanResult, missingMark, sizeof(missingMark)); + if (code != TSDB_CODE_SUCCESS) { + tstrncpy(missingMark, "unknown", sizeof(missingMark)); + } + + char markerLog[640] = {0}; + int32_t markerLogLen = + tsnprintf(markerLog, sizeof(markerLog), "meta missing marker: vnode=%d missing=%s", vnodeId, missingMark); + if (markerLogLen <= 0 || markerLogLen >= (int32_t)sizeof(markerLog)) { + 
tstrncpy(markerLog, "meta missing marker unavailable", sizeof(markerLog)); + } + + return tRepairAppendSessionLog(global.repairLogPath, markerLog); +} + +static int32_t dmAppendMetaInferenceDetailLog(int32_t vnodeId, const SRepairMetaScanResult *pScanResult, + const SRepairMetaInferenceReport *pInferReport, bool recoverable) { + if (pScanResult == NULL || pInferReport == NULL) { + return TSDB_CODE_INVALID_CFG; + } + + char missingMark[REPAIR_META_MAX_MISSING_FILES * REPAIR_META_FILE_NAME_LEN] = {0}; + int32_t code = tRepairBuildMetaMissingFileMark(pScanResult, missingMark, sizeof(missingMark)); + if (code != TSDB_CODE_SUCCESS) { + tstrncpy(missingMark, "unknown", sizeof(missingMark)); + } + + const char *prefix = recoverable ? "meta infer detail" : "meta unrecoverable detail"; + char inferLog[768] = {0}; + int32_t inferLogLen = tsnprintf(inferLog, sizeof(inferLog), + "%s: vnode=%d missing=%s walEvidence=%d tsdbRecoverable=%d rules=%d", prefix, vnodeId, + missingMark, pInferReport->walEvidenceFiles, pInferReport->tsdbRecoverableBlocks, + pInferReport->inferredRules); + if (inferLogLen <= 0 || inferLogLen >= (int32_t)sizeof(inferLog)) { + tstrncpy(inferLog, "meta inference detail unavailable", sizeof(inferLog)); + } + + return tRepairAppendSessionLog(global.repairLogPath, inferLog); +} + +static int32_t dmAppendMetaRebuildDetailLog(int32_t vnodeId, const SRepairMetaScanResult *pRebuildResult) { + if (pRebuildResult == NULL) { + return TSDB_CODE_INVALID_CFG; + } + + char rebuildLog[192] = {0}; + int32_t rebuildLogLen = + tsnprintf(rebuildLog, sizeof(rebuildLog), + "meta rebuild detail: vnode=%d required=%d presentRequired=%d optional=%d missingRequired=%d", vnodeId, + pRebuildResult->requiredFiles, pRebuildResult->presentRequiredFiles, pRebuildResult->optionalIndexFiles, + pRebuildResult->missingRequiredFiles); + if (rebuildLogLen <= 0 || rebuildLogLen >= (int32_t)sizeof(rebuildLog)) { + tstrncpy(rebuildLog, "meta rebuild detail unavailable", sizeof(rebuildLog)); + } 
+ + return tRepairAppendSessionLog(global.repairLogPath, rebuildLog); +} + +static void dmHandleMetaRepairRollback(int32_t vnodeId, int32_t doneVnodes, int32_t totalVnodes) { + int32_t rollbackCode = tRepairRollbackVnodeTarget(&global.repairCtx, tsDataDir, vnodeId); + if (rollbackCode != TSDB_CODE_SUCCESS) { + dError("failed to rollback meta repair for vnode:%d since %s", vnodeId, tstrerror(rollbackCode)); + (void)tRepairAppendSessionLog(global.repairLogPath, "meta repair rollback failed"); + } else { + char rollbackLog[128] = {0}; + int32_t rollbackLogLen = tsnprintf(rollbackLog, sizeof(rollbackLog), "rolled back meta for vnode:%d", vnodeId); + if (rollbackLogLen <= 0 || rollbackLogLen >= (int32_t)sizeof(rollbackLog)) { + tstrncpy(rollbackLog, "rolled back meta repair", sizeof(rollbackLog)); + } + (void)tRepairAppendSessionLog(global.repairLogPath, rollbackLog); + } + + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "meta", "failed", doneVnodes, totalVnodes); +} + +static int32_t dmRebuildAndActivateMeta(int32_t vnodeId, int32_t doneVnodes, int32_t totalVnodes, + SRepairMetaScanResult *pRebuildResult) { + if (pRebuildResult == NULL) { + return TSDB_CODE_INVALID_CFG; + } + + char metaPath[PATH_MAX] = {0}; + int32_t code = tRepairBuildVnodeTargetPath(tsDataDir, vnodeId, REPAIR_FILE_TYPE_META, metaPath, sizeof(metaPath)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + char rebuildDir[PATH_MAX] = {0}; + int32_t rebuildDirLen = tsnprintf(rebuildDir, sizeof(rebuildDir), "%s.rebuild", metaPath); + if (rebuildDirLen <= 0 || rebuildDirLen >= (int32_t)sizeof(rebuildDir)) { + return TSDB_CODE_INVALID_PARA; + } + + code = tRepairRebuildMetaFiles(&global.repairCtx, tsDataDir, vnodeId, rebuildDir, pRebuildResult); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (taosDirExist(metaPath)) { + taosRemoveDir(metaPath); + } + + if (taosRenameFile(rebuildDir, metaPath) != 0) { + int32_t renameCode = terrno != 0 ? 
terrno : TSDB_CODE_INVALID_PARA; + dError("failed to switch meta rebuild output for vnode:%d since %s", vnodeId, tstrerror(renameCode)); + dmHandleMetaRepairRollback(vnodeId, doneVnodes, totalVnodes); + return renameCode; + } + + return dmAppendMetaRebuildDetailLog(vnodeId, pRebuildResult); +} + +static int32_t dmRunForceMetaRepair(int32_t totalVnodes, int32_t startVnodeIndex, int64_t repairProgressIntervalMs, + int64_t *pLastProgressReportMs, bool *pNeedReport, char *progressLine, + int32_t progressLineSize) { + if (pLastProgressReportMs == NULL || pNeedReport == NULL || progressLine == NULL || progressLineSize <= 0) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t code = 0; + bool needRunMetaForceRepair = false; + code = tRepairNeedRunMetaForceRepair(&global.repairCtx, &needRunMetaForceRepair); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to determine meta force repair schedule since %s", tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (!needRunMetaForceRepair) { + return TSDB_CODE_SUCCESS; + } + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + return TSDB_CODE_INVALID_CFG; + } + + int32_t metaDoneVnodes = startVnodeIndex; + for (int32_t i = startVnodeIndex; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = global.repairCtx.vnodeIds[i]; + char metaBackupDir[PATH_MAX] = {0}; + + code = tRepairBackupVnodeTarget(&global.repairCtx, tsDataDir, vnodeId, metaBackupDir, sizeof(metaBackupDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to backup meta target for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char backupLog[PATH_MAX] = {0}; + int32_t backupLogLen = + tsnprintf(backupLog, sizeof(backupLog), "prepared meta backup for vnode:%d path:%s", vnodeId, metaBackupDir); + if (backupLogLen <= 0 || backupLogLen >= 
(int32_t)sizeof(backupLog)) { + tstrncpy(backupLog, "prepared meta backup", sizeof(backupLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, backupLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta backup log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "meta", "running", metaDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write meta repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + SRepairMetaScanResult scanResult = {0}; + SRepairMetaInferenceReport inferReport = {0}; + code = tRepairScanMetaFiles(&global.repairCtx, tsDataDir, vnodeId, &scanResult); + if (code == TSDB_CODE_SUCCESS) { + char metaDetailLog[192] = {0}; + int32_t metaDetailLen = + tsnprintf(metaDetailLog, sizeof(metaDetailLog), + "meta scan detail: vnode=%d required=%d presentRequired=%d optional=%d missingRequired=%d", vnodeId, + scanResult.requiredFiles, scanResult.presentRequiredFiles, scanResult.optionalIndexFiles, + scanResult.missingRequiredFiles); + if (metaDetailLen <= 0 || metaDetailLen >= (int32_t)sizeof(metaDetailLog)) { + tstrncpy(metaDetailLog, "meta scan detail unavailable", sizeof(metaDetailLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, metaDetailLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta scan detail log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } else { + int32_t scanCode = code; + code = dmAppendMetaMissingMarkerLog(vnodeId, &scanResult); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta missing marker log for vnode:%d since %s", vnodeId, 
tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + int32_t inferCode = tRepairInferMetaFromWalTsdb(&global.repairCtx, tsDataDir, vnodeId, &inferReport); + code = dmAppendMetaInferenceDetailLog(vnodeId, &scanResult, &inferReport, inferCode == TSDB_CODE_SUCCESS); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta inference detail log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (inferCode != TSDB_CODE_SUCCESS) { + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "meta", "failed", metaDoneVnodes, + totalVnodes); + dError("failed to infer meta for vnode:%d, scanCode:%s inferCode:%s", vnodeId, tstrerror(scanCode), + tstrerror(inferCode)); + printf("failed repair meta scheduling: %s\n", tstrerror(scanCode)); + return TSDB_CODE_INVALID_CFG; + } + } + + SRepairMetaScanResult rebuildResult = {0}; + code = dmRebuildAndActivateMeta(vnodeId, metaDoneVnodes, totalVnodes, &rebuildResult); + if (code != TSDB_CODE_SUCCESS) { + (void)tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "meta", "failed", metaDoneVnodes, + totalVnodes); + dError("failed to rebuild meta for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + ++metaDoneVnodes; + + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "meta", "running", metaDoneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update meta repair state for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char metaLog[128] = {0}; + int32_t metaLogLen = tsnprintf(metaLog, sizeof(metaLog), "finished force meta repair for vnode:%d", vnodeId); + if 
(metaLogLen <= 0 || metaLogLen >= (int32_t)sizeof(metaLog)) { + tstrncpy(metaLog, "finished force meta repair", sizeof(metaLog)); + } + code = tRepairAppendSessionLog(global.repairLogPath, metaLog); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta repair log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), repairProgressIntervalMs, pLastProgressReportMs, pNeedReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update meta repair progress interval for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (*pNeedReport || metaDoneVnodes == totalVnodes) { + code = tRepairBuildProgressLine(&global.repairCtx, "meta", metaDoneVnodes, totalVnodes, progressLine, + progressLineSize); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build meta repair progress line for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append meta repair progress log for vnode:%d since %s", vnodeId, tstrerror(code)); + printf("failed repair meta scheduling: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + } + + return TSDB_CODE_SUCCESS; +} + +static void dmReportMetaPrecheckInferenceDetail(void) { + bool needRunMetaForceRepair = false; + int32_t code = tRepairNeedRunMetaForceRepair(&global.repairCtx, &needRunMetaForceRepair); + if (code != TSDB_CODE_SUCCESS || !needRunMetaForceRepair) { + return; + } + + for (int32_t i = 0; i < global.repairCtx.vnodeIdNum; ++i) { + int32_t vnodeId = 
global.repairCtx.vnodeIds[i]; + SRepairMetaScanResult scanResult = {0}; + SRepairMetaInferenceReport inferReport = {0}; + + int32_t scanCode = tRepairScanMetaFiles(&global.repairCtx, tsDataDir, vnodeId, &scanResult); + if (scanCode == TSDB_CODE_SUCCESS) { + continue; + } + + int32_t inferCode = tRepairInferMetaFromWalTsdb(&global.repairCtx, tsDataDir, vnodeId, &inferReport); + char missingMark[REPAIR_META_MAX_MISSING_FILES * REPAIR_META_FILE_NAME_LEN] = {0}; + code = tRepairBuildMetaMissingFileMark(&scanResult, missingMark, sizeof(missingMark)); + if (code != TSDB_CODE_SUCCESS) { + tstrncpy(missingMark, "unknown", sizeof(missingMark)); + } + + const char *prefix = inferCode == TSDB_CODE_SUCCESS ? "meta infer detail" : "meta unrecoverable detail"; + char line[768] = {0}; + int32_t lineLen = tsnprintf(line, sizeof(line), + "%s: vnode=%d missing=%s walEvidence=%d tsdbRecoverable=%d rules=%d", prefix, vnodeId, + missingMark, inferReport.walEvidenceFiles, inferReport.tsdbRecoverableBlocks, + inferReport.inferredRules); + if (lineLen <= 0 || lineLen >= (int32_t)sizeof(line)) { + tstrncpy(line, "meta precheck detail unavailable", sizeof(line)); + } + dInfo("%s", line); + printf("%s\n", line); + } +} + +static int32_t dmRunRepairWorkflow(void) { + const int64_t kRepairProgressIntervalMs = 3000; + int64_t lastProgressReportMs = 0; + int32_t totalVnodes = global.repairCtx.nodeType == REPAIR_NODE_TYPE_VNODE ? global.repairCtx.vnodeIdNum : 0; + int32_t doneVnodes = 0; + SRepairResumePlan resumePlan = {0}; + bool resumed = false; + bool needReport = false; + int64_t minDiskAvailBytes = tsDataSpace.reserved > 0 ? 
tsDataSpace.reserved : 0; + char progressLine[PATH_MAX] = {0}; + char resumeStep[32] = {0}; + int32_t code = tRepairPrecheck(&global.repairCtx, tsDataDir, minDiskAvailBytes); + if (code != TSDB_CODE_SUCCESS) { + dmReportMetaPrecheckInferenceDetail(); + dError("failed to run repair precheck since %s", tstrerror(code)); + printf("failed repair precheck: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairTryResumeSession(&global.repairCtx, tsDataDir, global.repairSessionDir, sizeof(global.repairSessionDir), + global.repairLogPath, sizeof(global.repairLogPath), global.repairStatePath, + sizeof(global.repairStatePath), &doneVnodes, &totalVnodes, &resumed, resumeStep, + sizeof(resumeStep)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to load repair resume session since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (global.repairCtx.nodeType == REPAIR_NODE_TYPE_VNODE && + (totalVnodes != global.repairCtx.vnodeIdNum || doneVnodes < 0 || doneVnodes > totalVnodes)) { + dError("invalid repair resume state, doneVnodes:%d totalVnodes:%d vnodeIdNum:%d", doneVnodes, totalVnodes, + global.repairCtx.vnodeIdNum); + printf("failed repair session preparation: invalid resume state\n"); + return TSDB_CODE_INVALID_CFG; + } + + const char *planStep = resumed ? 
resumeStep : "init"; + code = tRepairResolveResumePlan(global.repairCtx.nodeType, planStep, doneVnodes, global.repairCtx.vnodeIdNum, + &resumePlan); + if (code != TSDB_CODE_SUCCESS) { + dError("invalid repair resume plan, step:%s doneVnodes:%d vnodeIdNum:%d", planStep, doneVnodes, + global.repairCtx.vnodeIdNum); + printf("failed repair session preparation: invalid resume step\n"); + return TSDB_CODE_INVALID_CFG; + } + + if (!resumed) { + code = tRepairPrepareSessionFiles(&global.repairCtx, tsDataDir, global.repairSessionDir, sizeof(global.repairSessionDir), + global.repairLogPath, sizeof(global.repairLogPath), global.repairStatePath, + sizeof(global.repairStatePath)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to prepare repair session files since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } else { + char resumeMessage[160] = {0}; + const char *resumeStepName = resumeStep[0] != '\0' ? resumeStep : "unknown"; + int32_t resumeLen = + tsnprintf(resumeMessage, sizeof(resumeMessage), "repair session resumed: session=%s step=%s vnode=%d/%d", + global.repairCtx.sessionId, resumeStepName, doneVnodes, totalVnodes); + if (resumeLen <= 0 || resumeLen >= (int32_t)sizeof(resumeMessage)) { + tstrncpy(resumeMessage, "repair session resumed", sizeof(resumeMessage)); + } + + dInfo("%s", resumeMessage); + printf("%s\n", resumeMessage); + code = tRepairAppendSessionLog(global.repairLogPath, resumeMessage); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair resume log since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + + if (!resumePlan.resumeAtModeStep) { + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "precheck", "running", doneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write repair precheck state since %s", 
tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + + code = tRepairAppendSessionLog(global.repairLogPath, "repair precheck passed"); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair precheck log since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairBuildProgressLine(&global.repairCtx, "precheck", doneVnodes, totalVnodes, progressLine, + sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build repair progress line since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + dInfo("%s", progressLine); + printf("%s\n", progressLine); + + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair progress log since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), kRepairProgressIntervalMs, &lastProgressReportMs, &needReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to initialize repair progress interval since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (global.repairCtx.nodeType == REPAIR_NODE_TYPE_VNODE && !resumePlan.skipBackupPreparation) { + int32_t startVnodeIndex = resumePlan.backupStartVnodeIndex; + if (startVnodeIndex < 0 || startVnodeIndex > global.repairCtx.vnodeIdNum) { + dError("invalid repair resume vnode index:%d vnodeIdNum:%d", startVnodeIndex, global.repairCtx.vnodeIdNum); + printf("failed repair backup preparation: invalid resume state\n"); + return TSDB_CODE_INVALID_CFG; + } + + for (int32_t i = startVnodeIndex; i < 
global.repairCtx.vnodeIdNum; ++i) { + char backupDir[PATH_MAX] = {0}; + code = tRepairPrepareBackupDir(&global.repairCtx, tsDataDir, global.repairCtx.vnodeIds[i], backupDir, + sizeof(backupDir)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to prepare repair backup dir for vnode:%d since %s", global.repairCtx.vnodeIds[i], + tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + ++doneVnodes; + code = tRepairWriteSessionState(&global.repairCtx, global.repairStatePath, "backup", "running", doneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write repair backup state for vnode:%d since %s", global.repairCtx.vnodeIds[i], + tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + char logMessage[128] = {0}; + int32_t logLen = + tsnprintf(logMessage, sizeof(logMessage), "prepared backup dir for vnode:%d", global.repairCtx.vnodeIds[i]); + if (logLen <= 0 || logLen >= (int32_t)sizeof(logMessage)) { + tstrncpy(logMessage, "prepared backup dir", sizeof(logMessage)); + } + + code = tRepairAppendSessionLog(global.repairLogPath, logMessage); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair backup log for vnode:%d since %s", global.repairCtx.vnodeIds[i], tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + code = tRepairNeedReportProgress(taosGetTimestampMs(), kRepairProgressIntervalMs, &lastProgressReportMs, &needReport); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to update repair progress interval for vnode:%d since %s", global.repairCtx.vnodeIds[i], + tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + if (needReport || doneVnodes == totalVnodes) { + code = tRepairBuildProgressLine(&global.repairCtx, "backup", doneVnodes, 
totalVnodes, progressLine, + sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build repair backup progress line for vnode:%d since %s", global.repairCtx.vnodeIds[i], + tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", progressLine); + printf("%s\n", progressLine); + code = tRepairAppendSessionLog(global.repairLogPath, progressLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair backup progress log for vnode:%d since %s", global.repairCtx.vnodeIds[i], + tstrerror(code)); + printf("failed repair backup preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + } + } + } + + code = dmRunReplicaRepair(totalVnodes, resumePlan.replicaStartVnodeIndex, kRepairProgressIntervalMs, + &lastProgressReportMs, &needReport, progressLine, sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = dmRunCopyRepair(totalVnodes, resumePlan.copyStartVnodeIndex, kRepairProgressIntervalMs, &lastProgressReportMs, + &needReport, progressLine, sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = dmRunForceWalRepair(totalVnodes, resumePlan.walStartVnodeIndex, kRepairProgressIntervalMs, + &lastProgressReportMs, &needReport, progressLine, sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = dmRunForceTsdbRepair(totalVnodes, resumePlan.tsdbStartVnodeIndex, kRepairProgressIntervalMs, + &lastProgressReportMs, &needReport, progressLine, sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + code = dmRunForceMetaRepair(totalVnodes, resumePlan.metaStartVnodeIndex, kRepairProgressIntervalMs, + &lastProgressReportMs, + &needReport, progressLine, sizeof(progressLine)); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + doneVnodes = totalVnodes; + code = tRepairWriteSessionState(&global.repairCtx, 
global.repairStatePath, "preflight", "ready", doneVnodes, + totalVnodes); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to write repair session state since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + int32_t failedVnodes = totalVnodes >= doneVnodes ? (totalVnodes - doneVnodes) : 0; + int64_t elapsedMs = taosGetTimestampMs() - global.repairCtx.startTimeMs; + char summaryLine[PATH_MAX] = {0}; + code = + tRepairBuildSummaryLine(&global.repairCtx, doneVnodes, failedVnodes, elapsedMs, summaryLine, sizeof(summaryLine)); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to build repair summary line since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + dInfo("%s", summaryLine); + printf("%s\n", summaryLine); + code = tRepairAppendSessionLog(global.repairLogPath, summaryLine); + if (code != TSDB_CODE_SUCCESS) { + dError("failed to append repair summary log since %s", tstrerror(code)); + printf("failed repair session preparation: %s\n", tstrerror(code)); + return TSDB_CODE_INVALID_CFG; + } + + return TSDB_CODE_SUCCESS; +} + #ifdef TAOSD_INTEGRATED int dmStartDaemon(int argc, char const *argv[]) { #else @@ -633,6 +2070,17 @@ int mainWindows(int argc, char **argv) { return 0; } + if (global.repairCtx.enabled) { + code = dmRunRepairWorkflow(); + if (code != TSDB_CODE_SUCCESS) { + taosCleanupCfg(); + taosCloseLog(); + taosCleanupArgs(); + taosConvDestroy(); + return code; + } + } + osSetProcPath(argc, (char **)argv); taosCleanupArgs(); diff --git a/source/libs/wal/src/walMeta.c b/source/libs/wal/src/walMeta.c index f60c369cb07e..62165a3f65ff 100644 --- a/source/libs/wal/src/walMeta.c +++ b/source/libs/wal/src/walMeta.c @@ -600,6 +600,7 @@ int32_t walCheckAndRepairMeta(SWal* pWal) { pWal->cfg.vgId, fnameStr, fileSize, pFileInfo->fileSize, pFileInfo->lastVer, pFileInfo->firstVer, tsWalForceRepair); 
updateMeta = true; + pWal->repairStats.corruptedSegments += 1; TAOS_CHECK_EXIT(walTrimIdxFile(pWal, fileIdx)); @@ -695,6 +696,8 @@ static int32_t walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) { TAOS_RETURN(TSDB_CODE_SUCCESS); } + pWal->repairStats.corruptedSegments += 1; + // start to repair int64_t offset = fileSize - fileSize % sizeof(SWalIdxEntry); TdFilePtr pLogFile = NULL; @@ -800,6 +803,7 @@ static int32_t walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) { } if (count > 0) { + pWal->repairStats.rebuiltIdxEntries += count; wInfo("vgId:%d, rebuilt %" PRId64 " wal idx entries until last index:%" PRId64, pWal->cfg.vgId, count, pFileInfo->lastVer); } diff --git a/source/libs/wal/src/walMgmt.c b/source/libs/wal/src/walMgmt.c index e1a523aeefce..7f46636f8a80 100644 --- a/source/libs/wal/src/walMgmt.c +++ b/source/libs/wal/src/walMgmt.c @@ -36,6 +36,15 @@ static void walFreeObj(void *pWal); int64_t walGetSeq() { return (int64_t)atomic_load_32((volatile int32_t *)&tsWal.seq); } +int32_t walGetRepairStats(SWal *pWal, SWalRepairStats *pStats) { + if (pWal == NULL || pStats == NULL) { + TAOS_RETURN(TSDB_CODE_INVALID_PARA); + } + + (void)memcpy(pStats, &pWal->repairStats, sizeof(*pStats)); + TAOS_RETURN(TSDB_CODE_SUCCESS); +} + int32_t walInit(stopDnodeFn stopDnode) { int8_t old; while (1) { @@ -199,6 +208,7 @@ SWal *walOpen(const char *path, SWalCfg *pCfg) { (void)memset(&pWal->writeHead, 0, sizeof(SWalCkHead)); pWal->writeHead.head.protoVer = WAL_PROTO_VER; pWal->writeHead.magic = WAL_MAGIC; + (void)memset(&pWal->repairStats, 0, sizeof(pWal->repairStats)); // load meta code = walLoadMeta(pWal); diff --git a/source/libs/wal/test/walMetaTest.cpp b/source/libs/wal/test/walMetaTest.cpp index fb7a0eada836..59b5eb5566fa 100644 --- a/source/libs/wal/test/walMetaTest.cpp +++ b/source/libs/wal/test/walMetaTest.cpp @@ -575,6 +575,81 @@ TEST_F(WalKeepEnv, walCheckAndRepairIdxFile) { ASSERT_EQ(code, 0); } +TEST_F(WalKeepEnv, walGetRepairStatsInvalidArgs) { + 
SWalRepairStats stats = {0}; + ASSERT_EQ(walGetRepairStats(NULL, &stats), TSDB_CODE_INVALID_PARA); + ASSERT_EQ(walGetRepairStats(pWal, NULL), TSDB_CODE_INVALID_PARA); +} + +TEST_F(WalKeepEnv, walRepairStatsTrackCorruptedSegmentAndIdxRebuild) { + walResetEnv(); + int code; + do { + char newStr[100]; + sprintf(newStr, "%s-%d", ranStr, 0); + int len = strlen(newStr); + code = walAppendLog(pWal, 0, 0, syncMeta, newStr, len, NULL); + ASSERT_EQ(code, 0); + } while (0); + + SWalFileInfo* pFileInfo = walGetCurFileInfo(pWal); + for (int i = 1; i < 100; i++) { + char newStr[100]; + sprintf(newStr, "%s-%d", ranStr, i); + int len = strlen(newStr); + pWal->writeHead.head.version = i; + pWal->writeHead.head.bodyLen = len; + pWal->writeHead.head.msgType = 0; + pWal->writeHead.head.ingestTs = taosGetTimestampUs(); + pWal->writeHead.head.syncMeta = syncMeta; + pWal->writeHead.cksumHead = walCalcHeadCksum(&pWal->writeHead); + pWal->writeHead.cksumBody = walCalcBodyCksum(newStr, len); + taosWriteFile(pWal->pLogFile, &pWal->writeHead, sizeof(SWalCkHead)); + taosWriteFile(pWal->pLogFile, newStr, len); + } + + pWal->vers.lastVer = 99; + pFileInfo->lastVer = 99; + + code = walCheckAndRepairMeta(pWal); + ASSERT_EQ(code, 0); + code = walCheckAndRepairIdx(pWal); + ASSERT_EQ(code, 0); + + SWalRepairStats stats = {0}; + ASSERT_EQ(walGetRepairStats(pWal, &stats), TSDB_CODE_SUCCESS); + ASSERT_GE(stats.corruptedSegments, 1); + ASSERT_EQ(stats.rebuiltIdxEntries, 99); +} + +TEST_F(WalKeepEnv, walRepairStatsTrackIdxOnlyCorruption) { + walResetEnv(); + int code; + + for (int i = 0; i < 100; ++i) { + char newStr[100]; + sprintf(newStr, "%s-%d", ranStr, i); + int len = strlen(newStr); + code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL); + ASSERT_EQ(code, 0); + } + + SWalRepairStats stats = {0}; + ASSERT_EQ(walGetRepairStats(pWal, &stats), TSDB_CODE_SUCCESS); + ASSERT_EQ(stats.corruptedSegments, 0); + ASSERT_EQ(stats.rebuiltIdxEntries, 0); + + int64_t truncatedEntries = 20; + 
ASSERT_EQ(taosFtruncateFile(pWal->pIdxFile, truncatedEntries * (int64_t)sizeof(SWalIdxEntry)), 0); + + code = walCheckAndRepairIdx(pWal); + ASSERT_EQ(code, 0); + + ASSERT_EQ(walGetRepairStats(pWal, &stats), TSDB_CODE_SUCCESS); + ASSERT_GE(stats.corruptedSegments, 1); + ASSERT_EQ(stats.rebuiltIdxEntries, 80); +} + TEST_F(WalKeepEnv, walRestoreFromSnapshot1) { walResetEnv(); int code; @@ -1197,4 +1272,4 @@ TEST_F(WalRetentionEnv, corruptedDirDeleteLastFile) { ASSERT_EQ(pWal->vers.lastVer, 199); tsWalDeleteOnCorruption = oldVal; -} \ No newline at end of file +} diff --git a/task_plan.md b/task_plan.md new file mode 100644 index 000000000000..1fe38453e2bc --- /dev/null +++ b/task_plan.md @@ -0,0 +1,118 @@ +# TDengine 数据修复工具任务总计划(`taosd -r` 扩展) + +## 1. 目标 +- 在现有 `taosd -r` 基础上实现“数据修复工具”能力,覆盖需求文档中的 `vnode + (wal|tsdb|meta)` 场景。 +- 支持三种模式:`force`(单副本自救)、`replica`(副本恢复)、`copy`(跨节点文件拷贝)。 +- 所有任务都拆分成单次约 30-60 分钟可完成的小任务,方便中断续跑。 +- 构建会话恢复机制:任何 session 中断后,都能从磁盘计划文件恢复上下文,不需要重新分析。 + +## 1.1 术语统一(跨 Session 关键约定) +- 本项目统一使用:`WAL`、`META`、`TSDB`。 +- 这里的 `META` 指“时序数据的元数据”,之前文档里提到的 `TDB` 在本项目语义上等价于 `META`。 +- 对外帮助文案与计划文档统一写 `meta`,不再使用 `tdb`。 +- 为兼容历史输入,当前 CLI 解析仍接受 `--file-type=tdb`,内部会映射为 `META`。 + +## 2. 范围与非范围 + +### 范围内 +- `--node-type=vnode` 的修复链路。 +- `--file-type=wal|tsdb|meta`。 +- `--vnode-id` 过滤多个 vnode。 +- `--backup-path`、修复日志、进度输出、异常中止。 + +### 非范围(当前需求明确不做) +- `mnode/dnode/snode` 文件级修复实现。 +- `BSE` 等 vnode 其他文件类型修复。 + +## 3. 当前状态(2026-03-04) +- 当前阶段:`P0` 已完成(需求/代码勘察与任务拆解)。 +- 当前执行阶段:`P8` 已完成(验证与发布准备)。 +- 当前可执行入口:`无`(当前任务集已完成)。 +- 当前阻塞:无。 + +## 4. 
阶段里程碑 +| Phase | 名称 | 目标 | 退出条件 | 状态 | +|---|---|---|---|---| +| P0 | 需求与代码基线 | 识别现有能力与缺口,确定方案 | 设计文档 + 任务拆解落盘 | completed | +| P1 | CLI 与参数校验 | 新命令参数可解析/校验/报错 | `taosd -r --help` 显示新参数,校验单测通过 | completed | +| P2 | 修复编排框架 | vnode 级任务调度、预检、备份、日志、状态文件 | 可执行空跑并输出进度/摘要 | completed | +| P3 | `force + wal` | 基于现有 WAL 修复能力交付 MVP | WAL 损坏样例可修复并产出日志 | completed | +| P4 | `force + tsdb` | 交付 TSDB 块级修复编排 | TSDB 损坏样例修复后可启动/查询 | completed | +| P5 | `force + meta` | 交付 META 修复 + 反向推导链路 | 元数据损坏样例可恢复可用子集 | completed | +| P6 | `replica` 模式 | 触发副本全量同步恢复 | 多副本损坏节点可自动拉起恢复 | completed | +| P7 | `copy` 模式 | 从指定副本节点拷贝文件恢复 | 大文件场景可快速恢复并校验权限 | completed | +| P8 | 验证与发布准备 | 系统测试矩阵、文档、回归、发布清单 | 用例通过,文档可交付 | completed | + +## 5. 1 小时任务拆解(执行队列) +| ID | Phase | 任务 | 估时 | 前置 | 产出 | 状态 | +|---|---|---|---|---|---|---| +| T1.1 | P1 | 定义修复参数结构体与枚举(node/file/mode/vnodeList) | 45m | - | 参数模型头文件与解析入口 | completed | +| T1.2 | P1 | 扩展 `dmMain.c` 参数解析支持 `--node-type/--file-type/--vnode-id` | 60m | T1.1 | 新参数可进入模型 | completed | +| T1.3 | P1 | 扩展 `--backup-path/--mode/--replica-node` 解析 | 45m | T1.2 | 全参数解析打通 | completed | +| T1.4 | P1 | 参数组合校验与错误码映射 | 60m | T1.3 | 非法组合拒绝执行 | completed | +| T1.5 | P1 | `--help` 文案更新与示例命令校对 | 30m | T1.4 | 帮助文本与需求一致 | completed | +| T1.6 | P1 | 新增参数解析单测(建议放 `source/common/test/commonTests.cpp`) | 60m | T1.4 | parser/validator 单测 | completed | +| T2.1 | P2 | 设计修复运行时上下文(repair session) | 45m | T1.4 | `SRepairCtx` + 初始化逻辑 | completed | +| T2.2 | P2 | vnode 过滤器:从 vnode list 里选出目标 `vnode-id` | 45m | T2.1 | 精准作用范围 | completed | +| T2.3 | P2 | 预检:参数、路径、磁盘空间、目标文件存在性 | 60m | T2.2 | 失败即中止并记录原因 | completed | +| T2.4 | P2 | 备份管理器(按 vnode+时间戳目录) | 60m | T2.3 | `backup/` 目录结构稳定 | completed | +| T2.5 | P2 | 修复日志与状态文件(`repair.log` + `repair.state.json`) | 60m | T2.4 | 会话可追踪 | completed | +| T2.6 | P2 | 进度输出(每 N 秒)与最终摘要输出 | 45m | T2.5 | 控制台进度 + 结果摘要 | completed | +| T2.7 | P2 | 会话恢复能力:读取 `repair.state.json` 续跑未完成步骤 | 60m | T2.5 | 中断后可继续 | completed | +| T3.1 | P3 | `force+wal` 调度器:接入 
`walCheckAndRepair*` 流程 | 45m | T2.6 | 每 vnode WAL 修复入口 | completed | +| T3.2 | P3 | WAL 修复前备份与失败回滚保护 | 45m | T3.1 | 安全防护 | completed | +| T3.3 | P3 | WAL 修复明细记录(损坏区段、重建 idx 条目数) | 60m | T3.1 | 可审计日志 | completed | +| T3.4 | P3 | `wal_test` 扩展:损坏样例自动化验证 | 60m | T3.1 | 回归测试 | completed | +| T4.1 | P4 | TSDB 文件枚举与完整性扫描器封装 | 60m | T2.6 | `.data/.head/.sma/.stt` 扫描结果 | completed | +| T4.2 | P4 | TSDB 可恢复块提取与损坏块定位输出 | 60m | T4.1 | 结构化损坏报告 | completed | +| T4.3 | P4 | TSDB 文件重建流程(先 MVP:保留有效块) | 60m | T4.2 | 可重建输出目录 | completed | +| T4.4 | P4 | TSDB 修复结果验证(启动 + 查询可用) | 45m | T4.3 | 可用性验收 | completed | +| T4.5 | P4 | TSDB 场景系统测试脚本补齐 | 60m | T4.4 | 自动化脚本 | completed | +| T5.1 | P5 | META 元数据解析器稳定化(结构/标签/索引) | 60m | T2.6 | 可读取元数据快照 | completed | +| T5.2 | P5 | WAL/TSDB 反向推导元数据规则实现(第一批规则) | 60m | T5.1 | 推导器 MVP | completed | +| T5.3 | P5 | 缺失元数据标记与“不可推导”日志输出 | 45m | T5.2 | 风险透明 | completed | +| T5.4 | P5 | 重建 META 并切换生效(含备份目录) | 60m | T5.3 | META 修复闭环 | completed | +| T5.5 | P5 | META 修复测试:部分损坏/完全损坏双场景 | 60m | T5.4 | 可复现测试 | completed | +| T6.1 | P6 | `mode=replica` 指令接入与分支调度 | 30m | T2.6 | replica 模式可选通 | completed | +| T6.2 | P6 | 本地坏副本降级动作(不可用标记 + 版本/任期策略) | 60m | T6.1 | 触发全量同步 | completed | +| T6.3 | P6 | 与现有 restore/vgroup 逻辑联动验证 | 60m | T6.2 | 多副本恢复成功 | completed | +| T6.4 | P6 | replica 模式失败保护与回滚语义 | 45m | T6.3 | 不产生二次破坏 | completed | +| T7.1 | P7 | `--replica-node` 解析与目标合法性校验 | 45m | T1.4 | copy 模式参数完备 | completed | +| T7.2 | P7 | 远端拷贝抽象层(先本地 mock) | 60m | T7.1 | 可测试接口 | completed | +| T7.3 | P7 | SSH/SCP 实现并接入 copy 模式 | 60m | T7.2 | 远端拷贝可执行 | completed | +| T7.4 | P7 | 覆盖写入后的权限/owner 修复逻辑 | 45m | T7.3 | 权限一致性 | completed | +| T7.5 | P7 | copy 模式一致性校验与异常中断处理 | 60m | T7.4 | 可控失败行为 | completed | +| T8.1 | P8 | 损坏数据生成器(WAL/META/TSDB)自动脚本化 | 60m | T3.4,T4.5,T5.5 | 可复现实验数据 | completed | +| T8.2 | P8 | 三模式系统测试矩阵与验收脚本 | 60m | T8.1 | 验收流水线 | completed | +| T8.3 | P8 | 文档更新(中英)与运维手册示例 | 60m | T8.2 | 可发布文档 | completed | +| T8.4 | P8 | 发布前回归与风险清单签出 | 45m | T8.3 | 发布 gate 通过 | 
completed | + +## 6. 中断恢复机制(开发过程) +1. 先执行 `git status --short` 确认工作区状态。 +2. 顺序阅读 `task_plan.md`、`findings.md`、`progress.md`(本目录)。 +3. 在任务表中定位: + - 优先 `status=in_progress` 的任务; + - 若无 in_progress,则取第一个 pending 任务。 +4. 在开始编码前,把该任务状态改为 `in_progress`,并在 `progress.md` 追加一条日志。 +5. 每完成一个任务,必须: + - 更新任务状态为 `completed`; + - 在 `progress.md` 记录变更文件、测试命令、结果; + - 在 `findings.md` 补充任何新发现(尤其是失败原因)。 +6. 若任务失败 3 次,记录失败尝试并升级到“待决策项”,不要盲目重复。 +7. 每次对外汇报必须包含进度条,格式: + - `进度: % [] /` + - 计算口径:`done=completed`,`total=completed+in_progress+pending`(基于本文件任务表)。 + +## 7. 错误记录 +| 时间 | 任务 | 错误 | 尝试次数 | 处理方式 | 结果 | +|---|---|---|---|---|---| +| 2026-03-03 | 文档读取 | 需求文件名与路径不完全匹配(有空格) | 1 | 先枚举目录后精确读取 | resolved | +| 2026-03-03 | T1.1 测试验证 | 并行执行 build 与 test 导致测试先于新二进制完成,出现伪失败 | 1 | 改为顺序执行:先 build 再 test | resolved | +| 2026-03-03 | T1.1 测试验证 | `LeakSanitizer` 在当前 ptrace 环境下运行失败 | 1 | `ctest` 时增加 `ASAN_OPTIONS=detect_leaks=0` | resolved | +| 2026-03-03 | T2.3 运行验证 | 使用 `-o /tmp` 启动时触发 `osDir.c:taosMulModeMkDir` ASan 越界(历史问题) | 1 | 改为 `-o /tmp/taoslog` 继续验证预检路径 | resolved | +| 2026-03-03 | T3.4 Red 构建 | `ext_pcre2` update 阶段触发外网访问失败(构建环境网络受限) | 1 | 复用本地已安装依赖并补全本地 stamp,继续执行业务测试 | resolved | +| 2026-03-04 | T6.4 回归验证 | `cmake --build debug --target commonTest taosd` 触发 `ext_pcre2` 外网更新失败 | 1 | 继续沿用“跳过外网更新,直接运行已编译二进制测试”的策略 | resolved | +| 2026-03-04 | T7.1 回归验证 | `cmake --build debug --target taosd` 触发 `ext_pcre2` 外网更新失败 | 1 | 使用已批准前缀 `cmake --build` 升权重试后构建通过 | resolved | +| 2026-03-04 | T7.2 回归验证 | `cmake --build debug --target taosd` 多次触发 `ext_pcre2` 外网更新失败 | 2 | 使用已批准前缀 `cmake --build` 升权重试后构建通过 | resolved | +| 2026-03-04 | T7.5 Red/回归验证 | `cmake --build debug --target commonTest/taosd` 多次触发 `ext_pcre2` 外网更新失败 | 2 | 使用已批准前缀 `cmake --build` 升权重试后通过 | resolved | +| 2026-03-04 | T8.4 发布前回归 | `cmake --build debug --target taosd` 在弱网环境下可能触发外部依赖拉取失败 | 1 | 复用已批准 `cmake --build` 前缀重试并完成最终 gate 构建 | resolved | diff --git a/tests/ci/repair_fixture_generator.sh b/tests/ci/repair_fixture_generator.sh new 
file mode 100755 index 000000000000..01ff2c9a40f5 --- /dev/null +++ b/tests/ci/repair_fixture_generator.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash + +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: + repair_fixture_generator.sh --output-dir [--type wal|tsdb|meta|all] [--vnode-id ] [--clean] + +Description: + Generate reproducible corruption fixtures for TDengine repair tests. + Fixtures include WAL, TSDB, and META scenarios. + +Options: + --output-dir Required. Root directory to place generated fixtures. + --type Optional. One of wal|tsdb|meta|all. Default: all. + --vnode-id Optional. Target vnode id. Default: 2. + --clean Optional. Remove output-dir before generation. + -h, --help Show this help. +EOF +} + +OUTPUT_DIR="" +FIXTURE_TYPE="all" +VNODE_ID=2 +CLEAN_OUTPUT=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) + [[ $# -ge 2 ]] || { + echo "'--output-dir' requires a value" + exit 1 + } + OUTPUT_DIR="$2" + shift 2 + ;; + --type) + [[ $# -ge 2 ]] || { + echo "'--type' requires a value" + exit 1 + } + FIXTURE_TYPE="$2" + shift 2 + ;; + --vnode-id) + [[ $# -ge 2 ]] || { + echo "'--vnode-id' requires a value" + exit 1 + } + VNODE_ID="$2" + shift 2 + ;; + --clean) + CLEAN_OUTPUT=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +if [[ -z "$OUTPUT_DIR" ]]; then + echo "'--output-dir' is required" + usage + exit 1 +fi + +case "$FIXTURE_TYPE" in + wal|tsdb|meta|all) ;; + *) + echo "Invalid '--type': $FIXTURE_TYPE" + exit 1 + ;; +esac + +if [[ ! 
"$VNODE_ID" =~ ^[0-9]+$ ]]; then + echo "Invalid '--vnode-id': $VNODE_ID" + exit 1 +fi + +if [[ $CLEAN_OUTPUT -eq 1 ]]; then + rm -rf "$OUTPUT_DIR" +fi + +mkdir -p "$OUTPUT_DIR" + +MANIFEST="$OUTPUT_DIR/manifest.txt" +: > "$MANIFEST" + +manifest_append() { + local line="$1" + echo "$line" >> "$MANIFEST" +} + +generate_wal_fixture() { + local case_root="$OUTPUT_DIR/wal-force-corrupted" + local wal_dir="$case_root/vnode/vnode${VNODE_ID}/wal" + mkdir -p "$wal_dir" + + printf 'wal-meta-ok\n' > "$wal_dir/000001.meta" + printf 'wal-idx-ok\n' > "$wal_dir/000001.idx" + printf 'wal-log-segment-000001-valid\n' > "$wal_dir/000001.log" + + printf 'wal-log-segment-000002-corrupted\n' > "$wal_dir/000002.log" + truncate -s 9 "$wal_dir/000002.log" + printf 'corrupted-idx-payload\n' > "$wal_dir/000002.idx" + + manifest_append "wal=$case_root" +} + +generate_tsdb_fixture() { + local case_root="$OUTPUT_DIR/tsdb-force-mixed" + local tsdb_dir="$case_root/vnode/vnode${VNODE_ID}/tsdb" + mkdir -p "$tsdb_dir/f100" "$tsdb_dir/f200" "$tsdb_dir/f300" + + printf 'recoverable-head\n' > "$tsdb_dir/f100/block.head" + printf 'recoverable-data\n' > "$tsdb_dir/f100/block.data" + + printf 'corrupted-head-only\n' > "$tsdb_dir/f200/bad.head" + printf 'orphan-sma-only\n' > "$tsdb_dir/f300/orphan.sma" + + manifest_append "tsdb=$case_root" +} + +generate_meta_fixture() { + local partial_root="$OUTPUT_DIR/meta-force-partial" + local complete_root="$OUTPUT_DIR/meta-force-complete" + + local partial_meta_dir="$partial_root/vnode/vnode${VNODE_ID}/meta" + local partial_wal_dir="$partial_root/vnode/vnode${VNODE_ID}/wal" + mkdir -p "$partial_meta_dir" "$partial_wal_dir" + printf 'table-partial\n' > "$partial_meta_dir/table.db" + printf 'tag-index-preserved\n' > "$partial_meta_dir/tag.idx" + printf 'wal-evidence\n' > "$partial_wal_dir/000001.log" + + local complete_meta_dir="$complete_root/vnode/vnode${VNODE_ID}/meta" + local complete_wal_dir="$complete_root/vnode/vnode${VNODE_ID}/wal" + mkdir -p 
"$complete_meta_dir" "$complete_wal_dir" + printf 'wal-evidence\n' > "$complete_wal_dir/000001.log" + + manifest_append "meta_partial=$partial_root" + manifest_append "meta_complete=$complete_root" +} + +if [[ "$FIXTURE_TYPE" == "wal" || "$FIXTURE_TYPE" == "all" ]]; then + generate_wal_fixture +fi + +if [[ "$FIXTURE_TYPE" == "tsdb" || "$FIXTURE_TYPE" == "all" ]]; then + generate_tsdb_fixture +fi + +if [[ "$FIXTURE_TYPE" == "meta" || "$FIXTURE_TYPE" == "all" ]]; then + generate_meta_fixture +fi + +echo "repair fixture generation complete" +echo "output: $OUTPUT_DIR" +echo "manifest: $MANIFEST" diff --git a/tests/ci/repair_meta_force.sh b/tests/ci/repair_meta_force.sh new file mode 100755 index 000000000000..deea9cd14f7e --- /dev/null +++ b/tests/ci/repair_meta_force.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +TAOSD_BIN="${1:-$REPO_ROOT/debug/build/bin/taosd}" + +if [[ ! 
-x "$TAOSD_BIN" ]]; then + echo "taosd binary not found or not executable: $TAOSD_BIN" + echo "usage: $0 [path/to/taosd]" + exit 1 +fi + +TMP_ROOT="$(mktemp -d /tmp/td-repair-meta-force-XXXXXX)" +cleanup() { + rm -rf "$TMP_ROOT" +} +trap cleanup EXIT + +run_meta_case() { + local case_name="$1" + local case_mode="$2" + + local case_root="$TMP_ROOT/$case_name" + local data_dir="$case_root/data" + local backup_dir="$case_root/backup" + local log_dir="$case_root/taoslog" + local run_log="$case_root/repair-run.log" + + mkdir -p "$data_dir/vnode/vnode2/meta" "$data_dir/vnode/vnode2/wal" "$backup_dir" "$log_dir" + : > "$data_dir/vnode/vnode2/wal/000001.log" + + if [[ "$case_mode" == "partial" ]]; then + : > "$data_dir/vnode/vnode2/meta/table.db" + : > "$data_dir/vnode/vnode2/meta/tag.idx" + fi + + set +e + if command -v timeout >/dev/null 2>&1; then + TAOS_DATA_DIR="$data_dir" ASAN_OPTIONS=detect_leaks=0 timeout 30s \ + "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type meta --vnode-id 2 --mode force \ + --backup-path "$backup_dir" >"$run_log" 2>&1 + rc=$? + else + TAOS_DATA_DIR="$data_dir" ASAN_OPTIONS=detect_leaks=0 \ + "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type meta --vnode-id 2 --mode force \ + --backup-path "$backup_dir" >"$run_log" 2>&1 + rc=$? + fi + set -e + + if [[ $rc -eq 124 ]]; then + echo "[$case_name] repair workflow timed out" + cat "$run_log" + return 1 + fi + + if ! grep -Eq "repair progress: .*step=meta .*vnode=1/1 .*progress=100%" "$run_log"; then + echo "[$case_name] missing meta progress output" + cat "$run_log" + return 1 + fi + + if ! 
grep -Eq "repair summary: .*status=success .*successVnodes=1 .*failedVnodes=0" "$run_log"; then + echo "[$case_name] missing successful summary output" + cat "$run_log" + return 1 + fi + + local session_dir + session_dir="$(find "$backup_dir" -maxdepth 1 -mindepth 1 -type d -name 'repair-*' | head -n 1)" + if [[ -z "$session_dir" ]]; then + echo "[$case_name] repair session directory not found" + find "$backup_dir" -maxdepth 3 -print + return 1 + fi + + if [[ ! -f "$session_dir/repair.log" || ! -f "$session_dir/repair.state.json" ]]; then + echo "[$case_name] repair session artifacts are incomplete" + find "$session_dir" -maxdepth 3 -print + return 1 + fi + + if ! grep -q "meta missing marker" "$session_dir/repair.log"; then + echo "[$case_name] missing marker log not found" + cat "$session_dir/repair.log" + return 1 + fi + + if ! grep -q "meta infer detail" "$session_dir/repair.log"; then + echo "[$case_name] infer detail log not found" + cat "$session_dir/repair.log" + return 1 + fi + + if ! grep -q "meta rebuild detail" "$session_dir/repair.log"; then + echo "[$case_name] rebuild detail log not found" + cat "$session_dir/repair.log" + return 1 + fi + + if [[ ! -f "$data_dir/vnode/vnode2/meta/table.db" || ! -f "$data_dir/vnode/vnode2/meta/schema.db" || \ + ! -f "$data_dir/vnode/vnode2/meta/uid.idx" || ! -f "$data_dir/vnode/vnode2/meta/name.idx" ]]; then + echo "[$case_name] required meta files are not complete after repair" + find "$data_dir/vnode/vnode2/meta" -maxdepth 2 -type f | sort + return 1 + fi + + if [[ "$case_mode" == "partial" && ! 
-f "$data_dir/vnode/vnode2/meta/tag.idx" ]]; then + echo "[$case_name] optional index file should be preserved" + find "$data_dir/vnode/vnode2/meta" -maxdepth 2 -type f | sort + return 1 + fi + + echo "[$case_name] ok (taosd exit code: $rc)" +} + +run_meta_case "meta-partial" "partial" +run_meta_case "meta-complete" "complete" + +echo "meta force repair script passed" diff --git a/tests/ci/repair_mode_matrix.sh b/tests/ci/repair_mode_matrix.sh new file mode 100755 index 000000000000..1e29d598cf2c --- /dev/null +++ b/tests/ci/repair_mode_matrix.sh @@ -0,0 +1,212 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +TAOSD_BIN="${1:-$REPO_ROOT/debug/build/bin/taosd}" +TSDB_FORCE_SCRIPT="$SCRIPT_DIR/repair_tsdb_force.sh" +META_FORCE_SCRIPT="$SCRIPT_DIR/repair_meta_force.sh" + +if [[ ! -x "$TAOSD_BIN" ]]; then + echo "taosd binary not found or not executable: $TAOSD_BIN" + echo "usage: $0 [path/to/taosd]" + exit 1 +fi + +if [[ ! -x "$TSDB_FORCE_SCRIPT" || ! -x "$META_FORCE_SCRIPT" ]]; then + echo "required force-mode scripts are missing or not executable" + exit 1 +fi + +TMP_ROOT="$(mktemp -d /tmp/td-repair-mode-matrix-XXXXXX)" +cleanup() { + rm -rf "$TMP_ROOT" +} +trap cleanup EXIT + +run_replica_case() { + local case_root="$TMP_ROOT/replica" + local data_dir="$case_root/data" + local backup_dir="$case_root/backup" + local log_dir="$case_root/taoslog" + local run_log="$case_root/repair-run.log" + mkdir -p "$data_dir/vnode/vnode2/wal" "$backup_dir" "$log_dir" + printf 'replica-evidence\n' > "$data_dir/vnode/vnode2/wal/000001.log" + + local rc=0 + set +e + if command -v timeout >/dev/null 2>&1; then + TAOS_DATA_DIR="$data_dir" ASAN_OPTIONS=detect_leaks=0 timeout 30s \ + "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type wal --vnode-id 2 --mode replica \ + --backup-path "$backup_dir" >"$run_log" 2>&1 + rc=$? 
+ else + TAOS_DATA_DIR="$data_dir" ASAN_OPTIONS=detect_leaks=0 \ + "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type wal --vnode-id 2 --mode replica \ + --backup-path "$backup_dir" >"$run_log" 2>&1 + rc=$? + fi + set -e + + if [[ $rc -eq 124 ]]; then + echo "[replica] repair workflow timed out" + cat "$run_log" + return 1 + fi + + if ! grep -Eq "repair progress: .*step=replica .*vnode=1/1 .*progress=100%" "$run_log"; then + echo "[replica] missing progress output" + cat "$run_log" + return 1 + fi + + if ! grep -Eq "repair summary: .*status=success .*successVnodes=1 .*failedVnodes=0" "$run_log"; then + echo "[replica] missing successful summary output" + cat "$run_log" + return 1 + fi + + local session_dir + session_dir="$(find "$backup_dir" -maxdepth 1 -mindepth 1 -type d -name 'repair-*' | head -n 1)" + if [[ -z "$session_dir" || ! -f "$session_dir/repair.log" ]]; then + echo "[replica] repair session log not found" + find "$backup_dir" -maxdepth 3 -print + return 1 + fi + + if ! grep -q "replica dispatch detail" "$session_dir/repair.log"; then + echo "[replica] dispatch detail log missing" + cat "$session_dir/repair.log" + return 1 + fi + if ! 
grep -q "replica restore detail" "$session_dir/repair.log"; then + echo "[replica] restore detail log missing" + cat "$session_dir/repair.log" + return 1 + fi + + echo "[replica] ok (taosd exit code: $rc)" +} + +run_copy_case() { + local case_root="$TMP_ROOT/copy" + local local_data_dir="$case_root/local-data" + local remote_data_dir="$case_root/remote-data" + local backup_dir="$case_root/backup" + local log_dir="$case_root/taoslog" + local bin_dir="$case_root/mock-bin" + local run_log="$case_root/repair-run.log" + mkdir -p "$local_data_dir/vnode/vnode2/wal" "$remote_data_dir/vnode/vnode2/wal/meta" \ + "$backup_dir" "$log_dir" "$bin_dir" + printf 'local-stale\n' > "$local_data_dir/vnode/vnode2/wal/stale.log" + printf 'remote-wal\n' > "$remote_data_dir/vnode/vnode2/wal/000001.log" + printf 'remote-meta\n' > "$remote_data_dir/vnode/vnode2/wal/meta/checkpoint" + + local ssh_mock="$bin_dir/ssh-mock" + local scp_mock="$bin_dir/scp-mock" + + cat > "$ssh_mock" <<'EOF' +#!/usr/bin/env bash +set -euo pipefail +cmd="${@: -1}" +bash -c "$cmd" +EOF + + cat > "$scp_mock" <<'EOF' +#!/usr/bin/env bash +set -euo pipefail +src="${@: -2:1}" +dst="${@: -1}" +remote="${src#*:}" +mkdir -p "$dst" +cp -r "$remote/." "$dst" +chmod -R 755 "$dst" +EOF + + chmod +x "$ssh_mock" "$scp_mock" + + local rc=0 + set +e + if command -v timeout >/dev/null 2>&1; then + TAOS_DATA_DIR="$local_data_dir" TAOS_REPAIR_SSH_BIN="$ssh_mock" TAOS_REPAIR_SCP_BIN="$scp_mock" \ + ASAN_OPTIONS=detect_leaks=0 timeout 30s "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type wal \ + --vnode-id 2 --mode copy --replica-node "tdnode1:$remote_data_dir" --backup-path "$backup_dir" \ + >"$run_log" 2>&1 + rc=$? 
+ else + TAOS_DATA_DIR="$local_data_dir" TAOS_REPAIR_SSH_BIN="$ssh_mock" TAOS_REPAIR_SCP_BIN="$scp_mock" \ + ASAN_OPTIONS=detect_leaks=0 "$TAOSD_BIN" -o "$log_dir" -r --node-type vnode --file-type wal \ + --vnode-id 2 --mode copy --replica-node "tdnode1:$remote_data_dir" --backup-path "$backup_dir" \ + >"$run_log" 2>&1 + rc=$? + fi + set -e + + if [[ $rc -eq 124 ]]; then + echo "[copy] repair workflow timed out" + cat "$run_log" + return 1 + fi + + if ! grep -Eq "repair progress: .*step=copy .*vnode=1/1 .*progress=100%" "$run_log"; then + echo "[copy] missing progress output" + cat "$run_log" + return 1 + fi + + if ! grep -Eq "repair summary: .*status=success .*successVnodes=1 .*failedVnodes=0" "$run_log"; then + echo "[copy] missing successful summary output" + cat "$run_log" + return 1 + fi + + if [[ -e "$local_data_dir/vnode/vnode2/wal/stale.log" ]]; then + echo "[copy] stale local file should be removed after successful copy" + find "$local_data_dir/vnode/vnode2/wal" -maxdepth 3 -type f | sort + return 1 + fi + + if [[ ! -f "$local_data_dir/vnode/vnode2/wal/000001.log" || \ + ! -f "$local_data_dir/vnode/vnode2/wal/meta/checkpoint" ]]; then + echo "[copy] copied files are incomplete" + find "$local_data_dir/vnode/vnode2/wal" -maxdepth 3 -type f | sort + return 1 + fi + + local session_dir + session_dir="$(find "$backup_dir" -maxdepth 1 -mindepth 1 -type d -name 'repair-*' | head -n 1)" + if [[ -z "$session_dir" || ! -f "$session_dir/repair.log" ]]; then + echo "[copy] repair session log not found" + find "$backup_dir" -maxdepth 3 -print + return 1 + fi + + if ! grep -q "copy replica detail" "$session_dir/repair.log"; then + echo "[copy] copy detail log missing" + cat "$session_dir/repair.log" + return 1 + fi + + if ! 
grep -q "consistency=verified" "$session_dir/repair.log"; then + echo "[copy] consistency verified marker missing" + cat "$session_dir/repair.log" + return 1 + fi + + echo "[copy] ok (taosd exit code: $rc)" +} + +echo "[matrix] force(tsdb) begin" +bash "$TSDB_FORCE_SCRIPT" "$TAOSD_BIN" + +echo "[matrix] force(meta) begin" +bash "$META_FORCE_SCRIPT" "$TAOSD_BIN" + +echo "[matrix] replica begin" +run_replica_case + +echo "[matrix] copy begin" +run_copy_case + +echo "repair mode matrix script passed" diff --git a/tests/ci/repair_tsdb_force.sh b/tests/ci/repair_tsdb_force.sh new file mode 100755 index 000000000000..e3c0c4065208 --- /dev/null +++ b/tests/ci/repair_tsdb_force.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +TAOSD_BIN="${1:-$REPO_ROOT/debug/build/bin/taosd}" + +if [[ ! -x "$TAOSD_BIN" ]]; then + echo "taosd binary not found or not executable: $TAOSD_BIN" + echo "usage: $0 [path/to/taosd]" + exit 1 +fi + +TMP_ROOT="$(mktemp -d /tmp/td-repair-tsdb-force-XXXXXX)" +DATA_DIR="$TMP_ROOT/data" +BACKUP_DIR="$TMP_ROOT/backup" +LOG_DIR="$TMP_ROOT/taoslog" +RUN_LOG="$TMP_ROOT/repair-run.log" + +cleanup() { + rm -rf "$TMP_ROOT" +} +trap cleanup EXIT + +mkdir -p "$DATA_DIR/vnode/vnode2/tsdb/f100" "$DATA_DIR/vnode/vnode2/tsdb/f200" "$BACKUP_DIR" "$LOG_DIR" +printf 'recoverable-head\n' > "$DATA_DIR/vnode/vnode2/tsdb/f100/block.head" +printf 'recoverable-data\n' > "$DATA_DIR/vnode/vnode2/tsdb/f100/block.data" +printf 'corrupted-head-only\n' > "$DATA_DIR/vnode/vnode2/tsdb/f200/bad.head" + +run_repair() { + if command -v timeout >/dev/null 2>&1; then + TAOS_DATA_DIR="$DATA_DIR" ASAN_OPTIONS=detect_leaks=0 timeout 30s \ + "$TAOSD_BIN" -o "$LOG_DIR" -r --node-type vnode --file-type tsdb --vnode-id 2 --mode force \ + --backup-path "$BACKUP_DIR" >"$RUN_LOG" 2>&1 + else + TAOS_DATA_DIR="$DATA_DIR" ASAN_OPTIONS=detect_leaks=0 \ + "$TAOSD_BIN" -o "$LOG_DIR" 
-r --node-type vnode --file-type tsdb --vnode-id 2 --mode force \ + --backup-path "$BACKUP_DIR" >"$RUN_LOG" 2>&1 + fi +} + +set +e +run_repair +RC=$? +set -e + +if [[ $RC -eq 124 ]]; then + echo "repair workflow timed out" + cat "$RUN_LOG" + exit 1 +fi + +if ! grep -Eq "repair progress: .*step=tsdb .*vnode=1/1 .*progress=100%" "$RUN_LOG"; then + echo "missing tsdb repair progress in output" + cat "$RUN_LOG" + exit 1 +fi + +if ! grep -Eq "repair summary: .*status=success .*successVnodes=1 .*failedVnodes=0" "$RUN_LOG"; then + echo "missing successful repair summary in output" + cat "$RUN_LOG" + exit 1 +fi + +if [[ ! -f "$DATA_DIR/vnode/vnode2/tsdb/f100/block.head" || ! -f "$DATA_DIR/vnode/vnode2/tsdb/f100/block.data" ]]; then + echo "recoverable tsdb block was not kept in target directory" + find "$DATA_DIR/vnode/vnode2/tsdb" -maxdepth 4 -type f | sort + exit 1 +fi + +if [[ -e "$DATA_DIR/vnode/vnode2/tsdb/f200/bad.head" ]]; then + echo "corrupted tsdb block should not exist after rebuild" + find "$DATA_DIR/vnode/vnode2/tsdb" -maxdepth 4 -type f | sort + exit 1 +fi + +SESSION_DIR="$(find "$BACKUP_DIR" -maxdepth 1 -mindepth 1 -type d -name 'repair-*' | head -n 1)" +if [[ -z "$SESSION_DIR" ]]; then + echo "repair session directory not found" + find "$BACKUP_DIR" -maxdepth 3 -print + exit 1 +fi + +if [[ ! -f "$SESSION_DIR/repair.log" || ! -f "$SESSION_DIR/repair.state.json" ]]; then + echo "repair session artifacts are incomplete" + find "$SESSION_DIR" -maxdepth 3 -print + exit 1 +fi + +if [[ ! -f "$SESSION_DIR/vnode2/tsdb/f200/bad.head" ]]; then + echo "backup directory does not contain corrupted source block" + find "$SESSION_DIR/vnode2/tsdb" -maxdepth 4 -type f | sort + exit 1 +fi + +if ! grep -Eq '"step"[[:space:]]*:[[:space:]]*"preflight"' "$SESSION_DIR/repair.state.json"; then + echo "repair state file does not record preflight completion" + cat "$SESSION_DIR/repair.state.json" + exit 1 +fi + +echo "tsdb force repair script passed (taosd exit code: $RC)"