豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 25f602d

Browse files
authored
Merge pull request #477 from githubnext/copilot/fix-dangling-mcp-servers
Add startup timeout to prevent gateway hanging on unresponsive MCP servers
2 parents 55768c0 + 7f1fc67 commit 25f602d

File tree

4 files changed

+229
-63
lines changed

4 files changed

+229
-63
lines changed

internal/config/config.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,15 @@ import (
1313

1414
var logConfig = logger.New("config:config")
1515

16+
const (
17+
// DefaultPort is the default port for the gateway HTTP server
18+
DefaultPort = 3000
19+
// DefaultStartupTimeout is the default timeout for backend server startup (seconds)
20+
DefaultStartupTimeout = 60
21+
// DefaultToolTimeout is the default timeout for tool execution (seconds)
22+
DefaultToolTimeout = 120
23+
)
24+
1625
// Config represents the MCPG configuration
1726
type Config struct {
1827
Servers map[string]*ServerConfig `toml:"servers"`
@@ -94,6 +103,19 @@ func LoadFromFile(path string) (*Config, error) {
94103
// return nil, fmt.Errorf("unknown configuration keys: %v", meta.Undecoded())
95104
}
96105

106+
// Set default gateway config values if gateway section exists but fields are unset
107+
if cfg.Gateway != nil {
108+
if cfg.Gateway.StartupTimeout == 0 {
109+
cfg.Gateway.StartupTimeout = DefaultStartupTimeout
110+
}
111+
if cfg.Gateway.ToolTimeout == 0 {
112+
cfg.Gateway.ToolTimeout = DefaultToolTimeout
113+
}
114+
if cfg.Gateway.Port == 0 {
115+
cfg.Gateway.Port = DefaultPort
116+
}
117+
}
118+
97119
logConfig.Printf("Successfully loaded %d servers from TOML file", len(cfg.Servers))
98120
return &cfg, nil
99121
}
@@ -157,11 +179,11 @@ func LoadFromStdin() (*Config, error) {
157179
// Store gateway config with defaults
158180
if stdinCfg.Gateway != nil {
159181
cfg.Gateway = &GatewayConfig{
160-
Port: 3000,
182+
Port: DefaultPort,
161183
APIKey: stdinCfg.Gateway.APIKey,
162184
Domain: stdinCfg.Gateway.Domain,
163-
StartupTimeout: 60,
164-
ToolTimeout: 120,
185+
StartupTimeout: DefaultStartupTimeout,
186+
ToolTimeout: DefaultToolTimeout,
165187
}
166188
if stdinCfg.Gateway.Port != nil {
167189
cfg.Gateway.Port = *stdinCfg.Gateway.Port

internal/launcher/launcher.go

Lines changed: 124 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"os"
88
"strings"
99
"sync"
10+
"time"
1011

1112
"github.com/githubnext/gh-aw-mcpg/internal/config"
1213
"github.com/githubnext/gh-aw-mcpg/internal/logger"
@@ -17,6 +18,12 @@ import (
1718

1819
var logLauncher = logger.New("launcher:launcher")
1920

21+
// connectionResult is used to return the result of a connection attempt from a goroutine
22+
type connectionResult struct {
23+
conn *mcp.Connection
24+
err error
25+
}
26+
2027
// Launcher manages backend MCP server connections
2128
type Launcher struct {
2229
ctx context.Context
@@ -25,6 +32,7 @@ type Launcher struct {
2532
sessionPool *SessionConnectionPool // Session-aware connections (stateful/stdio)
2633
mu sync.RWMutex
2734
runningInContainer bool
35+
startupTimeout time.Duration // Timeout for backend server startup
2836
}
2937

3038
// New creates a new Launcher
@@ -36,12 +44,22 @@ func New(ctx context.Context, cfg *config.Config) *Launcher {
3644
log.Println("[LAUNCHER] Detected running inside a container")
3745
}
3846

47+
// Get startup timeout from config, default to config.DefaultStartupTimeout seconds
48+
startupTimeout := time.Duration(config.DefaultStartupTimeout) * time.Second
49+
if cfg.Gateway != nil && cfg.Gateway.StartupTimeout > 0 {
50+
startupTimeout = time.Duration(cfg.Gateway.StartupTimeout) * time.Second
51+
logLauncher.Printf("Using configured startup timeout: %v", startupTimeout)
52+
} else {
53+
logLauncher.Printf("Using default startup timeout: %v", startupTimeout)
54+
}
55+
3956
return &Launcher{
4057
ctx: ctx,
4158
config: cfg,
4259
connections: make(map[string]*mcp.Connection),
4360
sessionPool: NewSessionConnectionPool(ctx),
4461
runningInContainer: inContainer,
62+
startupTimeout: startupTimeout,
4563
}
4664
}
4765

@@ -147,41 +165,66 @@ func GetOrLaunch(l *Launcher, serverID string) (*mcp.Connection, error) {
147165
log.Printf("[LAUNCHER] Additional env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
148166
}
149167

150-
// Create connection
151-
conn, err := mcp.NewConnection(l.ctx, serverCfg.Command, serverCfg.Args, serverCfg.Env)
152-
if err != nil {
153-
// Enhanced error logging for command-based servers
154-
logger.LogError("backend", "Failed to launch MCP backend server: %s, error=%v", serverID, err)
155-
log.Printf("[LAUNCHER] ❌ FAILED to launch server '%s'", serverID)
156-
log.Printf("[LAUNCHER] Error: %v", err)
157-
log.Printf("[LAUNCHER] Debug Information:")
158-
log.Printf("[LAUNCHER] - Command: %s", serverCfg.Command)
159-
log.Printf("[LAUNCHER] - Args: %v", serverCfg.Args)
160-
log.Printf("[LAUNCHER] - Env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
161-
log.Printf("[LAUNCHER] - Running in container: %v", l.runningInContainer)
162-
log.Printf("[LAUNCHER] - Is direct command: %v", isDirectCommand)
163-
164-
if isDirectCommand && l.runningInContainer {
165-
log.Printf("[LAUNCHER] ⚠️ Possible causes:")
166-
log.Printf("[LAUNCHER] - Command '%s' may not be installed in the gateway container", serverCfg.Command)
167-
log.Printf("[LAUNCHER] - Consider using 'container' config instead of 'command'")
168-
log.Printf("[LAUNCHER] - Or add '%s' to the gateway's Dockerfile", serverCfg.Command)
169-
} else if isDirectCommand {
170-
log.Printf("[LAUNCHER] ⚠️ Possible causes:")
171-
log.Printf("[LAUNCHER] - Command '%s' may not be in PATH", serverCfg.Command)
172-
log.Printf("[LAUNCHER] - Check if '%s' is installed: which %s", serverCfg.Command, serverCfg.Command)
173-
log.Printf("[LAUNCHER] - Verify file permissions and execute bit")
168+
log.Printf("[LAUNCHER] Starting server with %v timeout", l.startupTimeout)
169+
logLauncher.Printf("Starting server with timeout: serverID=%s, timeout=%v", serverID, l.startupTimeout)
170+
171+
// Create a buffered channel to receive connection result
172+
// Buffer size of 1 prevents goroutine leak if timeout occurs before connection completes
173+
resultChan := make(chan connectionResult, 1)
174+
175+
// Launch connection in a goroutine
176+
go func() {
177+
conn, err := mcp.NewConnection(l.ctx, serverCfg.Command, serverCfg.Args, serverCfg.Env)
178+
resultChan <- connectionResult{conn, err}
179+
}()
180+
181+
// Wait for connection with timeout
182+
select {
183+
case result := <-resultChan:
184+
conn, err := result.conn, result.err
185+
if err != nil {
186+
// Enhanced error logging for command-based servers
187+
logger.LogError("backend", "Failed to launch MCP backend server: %s, error=%v", serverID, err)
188+
log.Printf("[LAUNCHER] ❌ FAILED to launch server '%s'", serverID)
189+
log.Printf("[LAUNCHER] Error: %v", err)
190+
log.Printf("[LAUNCHER] Debug Information:")
191+
log.Printf("[LAUNCHER] - Command: %s", serverCfg.Command)
192+
log.Printf("[LAUNCHER] - Args: %v", serverCfg.Args)
193+
log.Printf("[LAUNCHER] - Env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
194+
log.Printf("[LAUNCHER] - Running in container: %v", l.runningInContainer)
195+
log.Printf("[LAUNCHER] - Is direct command: %v", isDirectCommand)
196+
log.Printf("[LAUNCHER] - Startup timeout: %v", l.startupTimeout)
197+
198+
if isDirectCommand && l.runningInContainer {
199+
log.Printf("[LAUNCHER] ⚠️ Possible causes:")
200+
log.Printf("[LAUNCHER] - Command '%s' may not be installed in the gateway container", serverCfg.Command)
201+
log.Printf("[LAUNCHER] - Consider using 'container' config instead of 'command'")
202+
log.Printf("[LAUNCHER] - Or add '%s' to the gateway's Dockerfile", serverCfg.Command)
203+
} else if isDirectCommand {
204+
log.Printf("[LAUNCHER] ⚠️ Possible causes:")
205+
log.Printf("[LAUNCHER] - Command '%s' may not be in PATH", serverCfg.Command)
206+
log.Printf("[LAUNCHER] - Check if '%s' is installed: which %s", serverCfg.Command, serverCfg.Command)
207+
log.Printf("[LAUNCHER] - Verify file permissions and execute bit")
208+
}
209+
210+
return nil, fmt.Errorf("failed to create connection: %w", err)
174211
}
175212

176-
return nil, fmt.Errorf("failed to create connection: %w", err)
177-
}
213+
logger.LogInfo("backend", "Successfully launched MCP backend server: %s", serverID)
214+
log.Printf("[LAUNCHER] Successfully launched: %s", serverID)
215+
logLauncher.Printf("Connection established: serverID=%s", serverID)
178216

179-
logger.LogInfo("backend", "Successfully launched MCP backend server: %s", serverID)
180-
log.Printf("[LAUNCHER] Successfully launched: %s", serverID)
181-
logLauncher.Printf("Connection established: serverID=%s", serverID)
217+
l.connections[serverID] = conn
218+
return conn, nil
182219

183-
l.connections[serverID] = conn
184-
return conn, nil
220+
case <-time.After(l.startupTimeout):
221+
// Timeout occurred
222+
logger.LogError("backend", "MCP backend server startup timeout: %s, timeout=%v", serverID, l.startupTimeout)
223+
log.Printf("[LAUNCHER] ❌ Server startup timed out after %v", l.startupTimeout)
224+
log.Printf("[LAUNCHER] ⚠️ The server may be hanging or taking too long to initialize")
225+
log.Printf("[LAUNCHER] ⚠️ Consider increasing 'startupTimeout' in gateway config if server needs more time")
226+
return nil, fmt.Errorf("server startup timeout after %v", l.startupTimeout)
227+
}
185228
}
186229

187230
// GetOrLaunchForSession returns a session-aware connection or launches a new one
@@ -267,32 +310,59 @@ func GetOrLaunchForSession(l *Launcher, serverID, sessionID string) (*mcp.Connec
267310
log.Printf("[LAUNCHER] Additional env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
268311
}
269312

270-
// Create connection
271-
conn, err := mcp.NewConnection(l.ctx, serverCfg.Command, serverCfg.Args, serverCfg.Env)
272-
if err != nil {
273-
logger.LogError("backend", "Failed to launch MCP backend server for session: server=%s, session=%s, error=%v", serverID, sessionID, err)
274-
log.Printf("[LAUNCHER] ❌ FAILED to launch server '%s' for session '%s'", serverID, sessionID)
275-
log.Printf("[LAUNCHER] Error: %v", err)
276-
log.Printf("[LAUNCHER] Debug Information:")
277-
log.Printf("[LAUNCHER] - Command: %s", serverCfg.Command)
278-
log.Printf("[LAUNCHER] - Args: %v", serverCfg.Args)
279-
log.Printf("[LAUNCHER] - Env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
280-
log.Printf("[LAUNCHER] - Running in container: %v", l.runningInContainer)
281-
log.Printf("[LAUNCHER] - Is direct command: %v", isDirectCommand)
282-
283-
// Record error in session pool
284-
l.sessionPool.RecordError(serverID, sessionID)
313+
log.Printf("[LAUNCHER] Starting server for session with %v timeout", l.startupTimeout)
314+
logLauncher.Printf("Starting session server with timeout: serverID=%s, sessionID=%s, timeout=%v", serverID, sessionID, l.startupTimeout)
285315

286-
return nil, fmt.Errorf("failed to create connection: %w", err)
287-
}
316+
// Create a buffered channel to receive connection result
317+
// Buffer size of 1 prevents goroutine leak if timeout occurs before connection completes
318+
resultChan := make(chan connectionResult, 1)
288319

289-
logger.LogInfo("backend", "Successfully launched MCP backend server for session: server=%s, session=%s", serverID, sessionID)
290-
log.Printf("[LAUNCHER] Successfully launched: %s (session: %s)", serverID, sessionID)
291-
logLauncher.Printf("Session connection established: serverID=%s, sessionID=%s", serverID, sessionID)
320+
// Launch connection in a goroutine
321+
go func() {
322+
conn, err := mcp.NewConnection(l.ctx, serverCfg.Command, serverCfg.Args, serverCfg.Env)
323+
resultChan <- connectionResult{conn, err}
324+
}()
292325

293-
// Add to session pool
294-
l.sessionPool.Set(serverID, sessionID, conn)
295-
return conn, nil
326+
// Wait for connection with timeout
327+
select {
328+
case result := <-resultChan:
329+
conn, err := result.conn, result.err
330+
if err != nil {
331+
logger.LogError("backend", "Failed to launch MCP backend server for session: server=%s, session=%s, error=%v", serverID, sessionID, err)
332+
log.Printf("[LAUNCHER] ❌ FAILED to launch server '%s' for session '%s'", serverID, sessionID)
333+
log.Printf("[LAUNCHER] Error: %v", err)
334+
log.Printf("[LAUNCHER] Debug Information:")
335+
log.Printf("[LAUNCHER] - Command: %s", serverCfg.Command)
336+
log.Printf("[LAUNCHER] - Args: %v", serverCfg.Args)
337+
log.Printf("[LAUNCHER] - Env vars: %v", sanitize.TruncateSecretMap(serverCfg.Env))
338+
log.Printf("[LAUNCHER] - Running in container: %v", l.runningInContainer)
339+
log.Printf("[LAUNCHER] - Is direct command: %v", isDirectCommand)
340+
log.Printf("[LAUNCHER] - Startup timeout: %v", l.startupTimeout)
341+
342+
// Record error in session pool
343+
l.sessionPool.RecordError(serverID, sessionID)
344+
345+
return nil, fmt.Errorf("failed to create connection: %w", err)
346+
}
347+
348+
logger.LogInfo("backend", "Successfully launched MCP backend server for session: server=%s, session=%s", serverID, sessionID)
349+
log.Printf("[LAUNCHER] Successfully launched: %s (session: %s)", serverID, sessionID)
350+
logLauncher.Printf("Session connection established: serverID=%s, sessionID=%s", serverID, sessionID)
351+
352+
// Add to session pool
353+
l.sessionPool.Set(serverID, sessionID, conn)
354+
return conn, nil
355+
356+
case <-time.After(l.startupTimeout):
357+
// Timeout occurred
358+
logger.LogError("backend", "MCP backend server startup timeout for session: server=%s, session=%s, timeout=%v", serverID, sessionID, l.startupTimeout)
359+
log.Printf("[LAUNCHER] ❌ Server startup timed out after %v", l.startupTimeout)
360+
log.Printf("[LAUNCHER] ⚠️ The server may be hanging or taking too long to initialize")
361+
log.Printf("[LAUNCHER] ⚠️ Consider increasing 'startupTimeout' in gateway config if server needs more time")
362+
// Record error in session pool before returning
363+
l.sessionPool.RecordError(serverID, sessionID)
364+
return nil, fmt.Errorf("server startup timeout after %v", l.startupTimeout)
365+
}
296366
}
297367

298368
// ServerIDs returns all configured server IDs

internal/launcher/launcher_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,3 +549,78 @@ func TestGetOrLaunchForSession_InvalidServer(t *testing.T) {
549549
assert.Nil(t, conn)
550550
assert.Contains(t, err.Error(), "not found in config")
551551
}
552+
553+
func TestLauncher_StartupTimeout(t *testing.T) {
554+
// Test that launcher respects the startup timeout from config
555+
tests := []struct {
556+
name string
557+
configTimeout int
558+
expectedTimeout string
559+
}{
560+
{
561+
name: "default timeout (60 seconds)",
562+
configTimeout: 0, // 0 means use default
563+
expectedTimeout: "1m0s",
564+
},
565+
{
566+
name: "custom timeout (30 seconds)",
567+
configTimeout: 30,
568+
expectedTimeout: "30s",
569+
},
570+
{
571+
name: "custom timeout (120 seconds)",
572+
configTimeout: 120,
573+
expectedTimeout: "2m0s",
574+
},
575+
}
576+
577+
for _, tt := range tests {
578+
t.Run(tt.name, func(t *testing.T) {
579+
ctx := context.Background()
580+
cfg := &config.Config{
581+
Servers: map[string]*config.ServerConfig{
582+
"test-server": {
583+
Type: "http",
584+
URL: "http://example.com",
585+
},
586+
},
587+
Gateway: &config.GatewayConfig{
588+
Port: 3000,
589+
StartupTimeout: tt.configTimeout,
590+
ToolTimeout: 120,
591+
},
592+
}
593+
594+
// If timeout is 0, set it to default to match LoadFromFile behavior
595+
if cfg.Gateway.StartupTimeout == 0 {
596+
cfg.Gateway.StartupTimeout = config.DefaultStartupTimeout
597+
}
598+
599+
l := New(ctx, cfg)
600+
defer l.Close()
601+
602+
// Verify the timeout was set correctly
603+
assert.Equal(t, tt.expectedTimeout, l.startupTimeout.String())
604+
})
605+
}
606+
}
607+
608+
func TestLauncher_TimeoutWithNilGateway(t *testing.T) {
609+
// Test that launcher uses default timeout when Gateway config is nil
610+
ctx := context.Background()
611+
cfg := &config.Config{
612+
Servers: map[string]*config.ServerConfig{
613+
"test-server": {
614+
Type: "http",
615+
URL: "http://example.com",
616+
},
617+
},
618+
Gateway: nil, // No gateway config
619+
}
620+
621+
l := New(ctx, cfg)
622+
defer l.Close()
623+
624+
// Should use default timeout (60 seconds)
625+
assert.Equal(t, "1m0s", l.startupTimeout.String())
626+
}

internal/logger/common_test.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"fmt"
55
"os"
66
"path/filepath"
7-
"strings"
87
"sync"
98
"testing"
109

@@ -607,7 +606,7 @@ func TestInitLogger_MarkdownLoggerFallback(t *testing.T) {
607606
}
608607

609608
func TestInitLogger_SetupError(t *testing.T) {
610-
assert := assert.New(t)
609+
a := assert.New(t)
611610
tmpDir := t.TempDir()
612611
logDir := filepath.Join(tmpDir, "logs")
613612
fileName := "test.log"
@@ -625,14 +624,14 @@ func TestInitLogger_SetupError(t *testing.T) {
625624
},
626625
)
627626

628-
assert.Error(err, "initLogger should return error on setup failure")
629-
assert.Equal(assert.AnError, err, "Error should match setup error")
630-
assert.Nil(logger, "logger should be nil on setup error")
627+
a.Error(err, "initLogger should return error on setup failure")
628+
a.Equal(assert.AnError, err, "Error should match setup error")
629+
a.Nil(logger, "logger should be nil on setup error")
631630

632631
// Verify the log file was created but then closed
633632
logPath := filepath.Join(logDir, fileName)
634633
_, err = os.Stat(logPath)
635-
assert.NoError(err, "Log file should exist even after setup error")
634+
a.NoError(err, "Log file should exist even after setup error")
636635
}
637636

638637
func TestInitLogger_FileFlags(t *testing.T) {

0 commit comments

Comments
 (0)