豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit dda9158

Browse files
LawrenceLawrence
authored and committed
Auto-reconcile stale running jobs when pid is gone
1 parent 6a5c2ba commit dda9158

File tree

2 files changed

+150
-2
lines changed

2 files changed

+150
-2
lines changed

plugins/codex/scripts/lib/state.mjs

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { createHash } from "node:crypto";
22
import fs from "node:fs";
33
import os from "node:os";
44
import path from "node:path";
5+
import process from "node:process";
56

67
import { resolveWorkspaceRoot } from "./workspace.mjs";
78

@@ -16,6 +17,94 @@ function nowIso() {
1617
return new Date().toISOString();
1718
}
1819

20+
/**
 * Report whether a process with the given pid currently exists.
 *
 * The probe uses `process.kill(pid, 0)`: signal 0 performs the existence and
 * permission checks without actually delivering a signal.
 *
 * @param {unknown} pidValue - Candidate pid; coerced with Number() and truncated toward zero.
 * @returns {boolean} true when the probe succeeds or is rejected with EPERM,
 *   false for invalid pids or any other probe failure.
 */
function isProcessAlive(pidValue) {
  // Truncate BEFORE the range check. A fractional value such as 0.5 would
  // otherwise pass `pid > 0`, truncate to 0, and probe the caller's own
  // process group (kill(0, 0)), which always reports "alive".
  const pid = Math.trunc(Number(pidValue));
  if (!Number.isFinite(pid) || pid <= 0) {
    return false;
  }

  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    // EPERM means the target exists but we are not allowed to signal it.
    return error?.code === "EPERM";
  }
}
36+
37+
/**
 * Coerce a stored pid value into a positive integer pid.
 *
 * @param {unknown} pidValue - Raw pid value; coerced with Number().
 * @returns {number | null} The pid truncated toward zero, or null when the
 *   value is not a finite number or does not truncate to a positive integer.
 */
function normalizePid(pidValue) {
  // Truncate first so fractional values in (0, 1) are rejected instead of
  // being normalized to 0, which is not a usable pid.
  const pid = Math.trunc(Number(pidValue));
  return Number.isFinite(pid) && pid > 0 ? pid : null;
}
44+
45+
/**
 * Append a timestamped line to a job's log file, if the job has one.
 *
 * @param {{ logFile?: string } | null | undefined} job - Job record; ignored when it has no truthy `logFile`.
 * @param {string} message - Text written after the `[timestamp]` prefix.
 * @returns {void}
 */
function appendStaleJobLog(job, message) {
  const target = job?.logFile;
  if (target) {
    try {
      const line = `[${nowIso()}] ${message}\n`;
      fs.appendFileSync(target, line, "utf8");
    } catch {
      // Logging is best-effort: a failed write must not abort status reconciliation.
    }
  }
}
55+
56+
/**
 * Mark "running" jobs whose recorded process is gone as failed.
 *
 * A job is reconciled only when its status is "running", its pid normalizes
 * to a non-null value, and isProcessAlive() reports that pid as not alive.
 * For each such job this appends a line to the job log, rewrites the per-job
 * file when it already exists, and returns an updated copy of the record.
 * All other entries are returned untouched.
 *
 * @param {string} cwd - Workspace directory passed through to the job-file helpers.
 * @param {Array<object>} jobs - Job records from the state index.
 * @returns {{ changed: boolean, jobs: Array<object> }} Whether any job was
 *   reconciled, plus the resulting job list in the original order.
 */
function reconcileRunningJobs(cwd, jobs) {
  // One timestamp is shared by every job reconciled in this pass.
  const timestamp = nowIso();
  let anyReconciled = false;

  // Merge the failed record over the stored per-job file, if that file exists.
  const persistFailedJob = (jobId, failedJob) => {
    const jobPath = resolveJobFile(cwd, jobId);
    if (!fs.existsSync(jobPath)) {
      return;
    }
    try {
      const onDisk = readJobFile(jobPath);
      writeJobFile(cwd, jobId, { ...onDisk, ...failedJob });
    } catch {
      // A malformed on-disk job file is ignored; the in-memory reconciliation still applies.
    }
  };

  const reconciledJobs = jobs.map((job) => {
    if (job?.status !== "running") {
      return job;
    }

    const pid = normalizePid(job.pid);
    if (pid == null || isProcessAlive(pid)) {
      return job;
    }

    anyReconciled = true;
    const reason = `process ${pid} is not running`;
    const failedJob = {
      ...job,
      status: "failed",
      phase: "failed",
      pid: null,
      completedAt: timestamp,
      errorMessage: `Codex job ended unexpectedly (${reason}); auto-reconciled as failed.`,
      updatedAt: timestamp
    };

    appendStaleJobLog(job, `Detected stale running job (${reason}). Marked as failed automatically.`);
    persistFailedJob(job.id, failedJob);

    return failedJob;
  });

  return { changed: anyReconciled, jobs: reconciledJobs };
}
107+
19108
function defaultState() {
20109
return {
21110
version: STATE_VERSION,
@@ -147,7 +236,15 @@ export function upsertJob(cwd, jobPatch) {
147236
}
148237

149238
/**
 * Return the workspace's job list after reconciling stale "running" jobs.
 *
 * When reconciliation changes at least one job, the updated list is saved
 * back to the state before being returned.
 *
 * @param {string} cwd - Workspace directory used to locate the state.
 * @returns {Array<object>} The reconciled job records.
 */
export function listJobs(cwd) {
  const currentState = loadState(cwd);
  const { changed, jobs } = reconcileRunningJobs(cwd, currentState.jobs ?? []);

  if (!changed) {
    return jobs;
  }

  saveState(cwd, { ...currentState, jobs });
  return jobs;
}
152249

153250
export function setConfig(cwd, key, value) {

tests/state.test.mjs

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,15 @@ import test from "node:test";
55
import assert from "node:assert/strict";
66

77
import { makeTempDir } from "./helpers.mjs";
8-
import { resolveJobFile, resolveJobLogFile, resolveStateDir, resolveStateFile, saveState } from "../plugins/codex/scripts/lib/state.mjs";
8+
import {
9+
listJobs,
10+
resolveJobFile,
11+
resolveJobLogFile,
12+
resolveStateDir,
13+
resolveStateFile,
14+
saveState,
15+
writeJobFile
16+
} from "../plugins/codex/scripts/lib/state.mjs";
917

1018
test("resolveStateDir uses a temp-backed per-workspace directory", () => {
1119
const workspace = makeTempDir();
@@ -103,3 +111,46 @@ test("saveState prunes dropped job artifacts when indexed jobs exceed the cap",
103111
.sort()
104112
);
105113
});
114+
115+
test("listJobs auto-reconciles stale running jobs when pid is no longer alive", async () => {
  // Use the pid of a child process that has already exited instead of a
  // hard-coded number: a fixed pid such as 999999 can belong to a live
  // process on hosts whose pid_max is larger than that value, which would
  // make this test flaky.
  const { spawnSync } = await import("node:child_process");
  const deadPid = spawnSync(process.execPath, ["-e", ""]).pid;
  assert.ok(Number.isInteger(deadPid) && deadPid > 0, "expected a pid from the exited child process");

  const workspace = makeTempDir();
  const jobId = "task-stale-running";
  const logFile = resolveJobLogFile(workspace, jobId);
  const staleJob = {
    id: jobId,
    status: "running",
    phase: "verifying",
    pid: deadPid,
    logFile,
    updatedAt: "2026-01-01T00:00:00.000Z",
    createdAt: "2026-01-01T00:00:00.000Z",
    startedAt: "2026-01-01T00:00:00.000Z"
  };

  // Seed all three artifacts the reconciliation touches: state index, log file, job file.
  saveState(workspace, {
    version: 1,
    config: { stopReviewGate: false },
    jobs: [staleJob]
  });
  fs.writeFileSync(logFile, "[2026-01-01T00:00:01.000Z] Running command: fake\n", "utf8");
  writeJobFile(workspace, jobId, staleJob);

  const [job] = listJobs(workspace);

  // The returned record is marked failed.
  assert.equal(job.status, "failed");
  assert.equal(job.phase, "failed");
  assert.equal(job.pid, null);
  assert.match(job.errorMessage, /auto-reconciled as failed/);
  assert.ok(job.completedAt);

  // The state index on disk reflects the reconciliation.
  const persistedState = JSON.parse(fs.readFileSync(resolveStateFile(workspace), "utf8"));
  assert.equal(persistedState.jobs[0].status, "failed");
  assert.equal(persistedState.jobs[0].pid, null);

  // The per-job file on disk reflects the reconciliation.
  const persistedJob = JSON.parse(fs.readFileSync(resolveJobFile(workspace, jobId), "utf8"));
  assert.equal(persistedJob.status, "failed");
  assert.equal(persistedJob.pid, null);

  // The job log records why the job was marked failed.
  const logTail = fs.readFileSync(logFile, "utf8");
  assert.match(logTail, /Detected stale running job/);
});

0 commit comments

Comments
 (0)