Skip to content

Commit dcb3457

Browse files
committed
libtailscale: fix multiTUN.Write deadlock during wireguard reconfig
Fixes tailscale/tailscale#18679 Signed-off-by: Nick Khyl <nickk@tailscale.com>
1 parent 5c5030c commit dcb3457

3 files changed

Lines changed: 122 additions & 9 deletions

File tree

libtailscale/backend.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,10 @@ func (a *App) runBackend(ctx context.Context, hardwareAttestation bool) error {
145145
if hardwareAttestation {
146146
a.backend.SetHardwareAttested()
147147
}
148-
defer b.CloseTUNs()
148+
defer func() {
149+
b.devices.Down()
150+
b.CloseTUNs()
151+
}()
149152

150153
hc := localapi.HandlerConfig{
151154
Actor: ipnauth.Self,
@@ -253,6 +256,9 @@ func (a *App) runBackend(ctx context.Context, hardwareAttestation bool) error {
253256
}
254257
case s := <-onDisconnect:
255258
if vpnService.service != nil && vpnService.service.ID() == s.ID() {
259+
if b.devices.Down() {
260+
log.Printf("tunnel brought down on disconnect")
261+
}
256262
b.CloseTUNs()
257263
netns.SetAndroidProtectFunc(nil)
258264
netns.SetAndroidBindToNetworkFunc(nil)
@@ -399,7 +405,9 @@ func (a *App) closeVpnService(err error, b *backend) {
399405
log.Printf("localapi edit prefs error %v", localApiErr)
400406
}
401407

402-
b.lastCfg = nil
408+
if b.devices.Down() {
409+
log.Printf("tunnel brought down on VPN service error: %v", err)
410+
}
403411
b.CloseTUNs()
404412

405413
vpnService.service.DisconnectVPN()

libtailscale/multitun.go

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ import (
77
"log"
88
"os"
99
"runtime/debug"
10+
"sync"
1011

1112
"github.com/tailscale/wireguard-go/tun"
13+
"tailscale.com/syncs"
1214
)
1315

1416
// multiTUN implements a tun.Device that supports multiple
@@ -30,6 +32,19 @@ type multiTUN struct {
3032
names chan chan nameReply
3133
shutdowns chan struct{}
3234
shutdownDone chan struct{}
35+
36+
downMu sync.Mutex
37+
// downCh is closed when the multiTUN is brought down,
38+
// such as when Tailscale transitions to the Stopped state.
39+
// This indicates that all outgoing packets should be dropped,
40+
// and [multiTUN.Write] should return immediately without blocking
41+
// until [multiTUN.Up] is called. See [multiTUN.Down]
42+
// and tailscale/tailscale#18679 for more details.
43+
//
44+
// It can be read without holding downMu, but the mutex
45+
// must be held when writing.
46+
downCh syncs.AtomicValue[chan struct{}]
47+
down bool // whether the downCh is closed
3348
}
3449

3550
// tunDevice wraps and drives a single run.Device.
@@ -76,7 +91,11 @@ func newTUNDevices() *multiTUN {
7691
names: make(chan chan nameReply),
7792
shutdowns: make(chan struct{}),
7893
shutdownDone: make(chan struct{}),
94+
down: true, // The device is initially down.
7995
}
96+
downCh := make(chan struct{})
97+
d.downCh.Store(downCh)
98+
close(downCh)
8099
go d.run()
81100
return d
82101
}
@@ -256,6 +275,47 @@ func (d *multiTUN) add(dev tun.Device) {
256275
d.devices <- dev
257276
}
258277

278+
// Up brings the multiTUN up, allowing it to write packets
279+
// to the underlying tunnel device. If there is no underlying
280+
// device yet, write operations are pended until a new device
281+
// is added with [multiTUN.add].
282+
//
283+
// It reports whether this call brought the device up.
284+
func (d *multiTUN) Up() bool {
285+
d.downMu.Lock()
286+
defer d.downMu.Unlock()
287+
if !d.down {
288+
return false
289+
}
290+
d.downCh.Store(make(chan struct{}))
291+
d.down = false
292+
return true
293+
}
294+
295+
// Down brings the multiTUN down, causing all outgoing packets
296+
// to be dropped without waiting for the underlying tunnel device,
297+
// and makes all [multiTUN.Write] calls return immediately
298+
// until [multiTUN.Up] is called.
299+
//
300+
// It mainly exists to distinguish between cases where the underlying
301+
// device is temporarily unavailable due to VPN reconfiguration,
302+
// in which case write requests should be pended, and cases where
303+
// Tailscale is stopped, where any pending and new requests
304+
// should complete immediately to prevent deadlocks.
305+
// See tailscale/tailscale#18679.
306+
//
307+
// It reports whether this call brought the device down.
308+
func (d *multiTUN) Down() bool {
309+
d.downMu.Lock()
310+
defer d.downMu.Unlock()
311+
if d.down {
312+
return false
313+
}
314+
close(d.downCh.Load())
315+
d.down = true
316+
return true
317+
}
318+
259319
func (d *multiTUN) File() *os.File {
260320
// The underlying file descriptor is not constant on Android.
261321
// Let's hope no-one uses it.
@@ -264,16 +324,39 @@ func (d *multiTUN) File() *os.File {
264324

265325
func (d *multiTUN) Read(data [][]byte, sizes []int, offset int) (int, error) {
266326
r := make(chan ioReply)
267-
d.reads <- ioRequest{data, sizes, offset, r}
268-
rep := <-r
269-
return rep.count, rep.err
327+
select {
328+
// We don't care about d.downCh here, as it's fine
329+
// to continue waiting until the tunnel is up again
330+
// or the multiTUN device is permanently closed.
331+
// This does not block WireGuard reconfiguration.
332+
case d.reads <- ioRequest{data, sizes, offset, r}:
333+
rep := <-r
334+
return rep.count, rep.err
335+
case <-d.close:
336+
// Return immediately if the multiTUN device is closed.
337+
return 0, os.ErrClosed
338+
}
270339
}
271340

272341
func (d *multiTUN) Write(data [][]byte, offset int) (int, error) {
273342
r := make(chan ioReply)
274-
d.writes <- ioRequest{data, nil, offset, r}
275-
rep := <-r
276-
return rep.count, rep.err
343+
select {
344+
case d.writes <- ioRequest{data, nil, offset, r}:
345+
rep := <-r
346+
return rep.count, rep.err
347+
case <-d.downCh.Load():
348+
// Drop the packet silently if the tunnel is down.
349+
// Otherwise, a race may occur during wireguard reconfig
350+
// and result in a deadlock, since a wireguard-go/device.Peer
351+
// cannot be removed until its RoutineSequentialReceiver
352+
// returns, and it will not return if it is blocked in
353+
// (*multiTUN).Write while sending to d.writes without
354+
// a receiver on the other side of the pipe.
355+
return 0, nil
356+
case <-d.close:
357+
// Return immediately if the multiTUN device is closed.
358+
return 0, os.ErrClosed
359+
}
277360
}
278361

279362
func (d *multiTUN) MTU() (int, error) {

libtailscale/net.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ var googleDNSServers = []netip.Addr{
7575
netip.MustParseAddr("2001:4860:4860::8844"),
7676
}
7777

78-
func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error {
78+
func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) (err error) {
7979
b.logger.Logf("updateTUN: changed")
8080
defer b.logger.Logf("updateTUN: finished")
8181

@@ -89,6 +89,23 @@ func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error {
8989
b.CloseTUNs()
9090
b.logger.Logf("updateTUN: closed old TUNs")
9191

92+
// Since the previous tunnel(s) are closed, the [multiTUN] device is
93+
// not operational until a new underlying tunnel is created and added,
94+
// which may never happen in case of an error or an empty [router.Config].
95+
//
96+
// Therefore, to prevent deadlocks where a [multiTUN.Write] would
97+
// block waiting for a new tunnel to be added, we bring the multiTUN
98+
// device down on exit unless a new [tun.Device] is created and added
99+
// successfully. See tailscale/tailscale#18679.
100+
//
101+
// TODO(nickkhyl): revisit and simplify the [multiTUN] implementation?
102+
hasTunnel := false
103+
defer func() {
104+
if !hasTunnel && b.devices.Down() {
105+
b.logger.Logf("updateTUN: tunnel brought down: %v", err)
106+
}
107+
}()
108+
92109
if len(rcfg.LocalAddrs) == 0 {
93110
return nil
94111
}
@@ -182,8 +199,13 @@ func (b *backend) updateTUN(rcfg *router.Config, dcfg *dns.OSConfig) error {
182199
b.logger.Logf("updateTUN: created TUN device")
183200

184201
b.devices.add(tunDev)
202+
hasTunnel = true
185203
b.logger.Logf("updateTUN: added TUN device")
186204

205+
if b.devices.Up() {
206+
b.logger.Logf("tunnel brought up")
207+
}
208+
187209
b.lastCfg = rcfg
188210
b.lastDNSCfg = dcfg
189211
return nil

0 commit comments

Comments
 (0)