From 0d2e03109d933c51ed521366b8f0e90345b48967 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 20:18:59 +0000 Subject: [PATCH 01/19] tinker with async close --- audit.go | 50 +++++++++++++++++++++++++++++++++++++++++++------- netlink.go | 12 +++++++++++- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/audit.go b/audit.go index f37f957..eb63fa6 100644 --- a/audit.go +++ b/audit.go @@ -49,7 +49,7 @@ const ( // Netlink groups. const ( NetlinkGroupNone = iota // Group 0 not used - NetlinkGroupReadLog // "best effort" read only socket + NetlinkGroupReadLog // "best effort" read only socket, defined in the kernel as AUDIT_NLGRP_READLOG ) // WaitMode is a flag to control the behavior of methods that abstract @@ -427,16 +427,12 @@ func (c *AuditClient) Receive(nonBlocking bool) (*RawAuditMessage, error) { // become no-ops. func (c *AuditClient) Close() error { var err error - // Only unregister and close the socket once. c.closeOnce.Do(func() { if c.clearPIDOnClose { // Unregister from the kernel for a clean exit. - status := AuditStatus{ - Mask: AuditStatusPID, - PID: 0, - } - err = c.set(status, NoWait) + c.closeAndUnsetPid() + } err = errors.Join(err, c.Netlink.Close()) @@ -505,6 +501,46 @@ func (c *AuditClient) getReply(seq uint32) (*syscall.NetlinkMessage, error) { return &msg, nil } +// an isolated operation to unset our pid from the audit subsystem and close the socket +func (c *AuditClient) closeAndUnsetPid() { + msg := syscall.NetlinkMessage{ + Header: syscall.NlMsghdr{ + Type: AuditSet, + Flags: syscall.NLM_F_REQUEST | syscall.NLM_F_ACK, + }, + Data: AuditStatus{ + Mask: AuditStatusPID, + PID: 0, + }.toWireFormat(), + } + + noParse := func(bytes []byte) ([]syscall.NetlinkMessage, error) { + return nil, nil + } + + // If our request to unset the PID would block, then try to drain events from + // the netlink socket, resend, try again. + // In netlink, EAGAIN usually indicates our read buffer is full. + // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. + _, err := c.Netlink.SendNoWait(msg) + if err != nil { + // sending may be blocked, retry. If the kernel layer is blocked, we'll usually get ENOBUF, + // but retry on any error. + for { + // throw out events until we can try to send again. We're closing, so we don't really care. + _, err = c.Netlink.Receive(true, noParse) + // if we got some events, or we would be waiting, try again try sending again + if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) { + _, err := c.Netlink.SendNoWait(msg) + if err == nil { + break + } + } + + } + } +} + func (c *AuditClient) set(status AuditStatus, mode WaitMode) error { msg := syscall.NetlinkMessage{ Header: syscall.NlMsghdr{ diff --git a/netlink.go b/netlink.go index 03302c5..5223f7c 100644 --- a/netlink.go +++ b/netlink.go @@ -36,6 +36,7 @@ import ( // in the message and an error if it occurred. type NetlinkSender interface { Send(msg syscall.NetlinkMessage) (uint32, error) + SendNoWait(msg syscall.NetlinkMessage) (uint32, error) } // NetlinkReceiver receives data from the netlink socket and uses the provided @@ -126,17 +127,26 @@ func getPortID(fd int) (uint32, error) { return addr.Pid, nil } +// Send a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send() +func (c *NetlinkClient) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) { + return c.send(msg, syscall.MSG_DONTWAIT) +} + // Send sends a netlink message and returns the sequence number used // in the message and an error if it occurred. If the PID is not set then // the value will be populated automatically (recommended). func (c *NetlinkClient) Send(msg syscall.NetlinkMessage) (uint32, error) { + return c.send(msg, 0) +} + +func (c *NetlinkClient) send(msg syscall.NetlinkMessage, flags int) (uint32, error) { if msg.Header.Pid == 0 { msg.Header.Pid = c.pid } msg.Header.Seq = atomic.AddUint32(&c.seq, 1) to := &syscall.SockaddrNetlink{} - return msg.Header.Seq, syscall.Sendto(c.fd, serialize(msg), 0, to) + return msg.Header.Seq, syscall.Sendto(c.fd, serialize(msg), flags, to) } func serialize(msg syscall.NetlinkMessage) []byte { From 9bcd2693a72980efdc31f00a34eef88ee5af4723 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 20:30:54 +0000 Subject: [PATCH 02/19] docs --- audit.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/audit.go b/audit.go index eb63fa6..7865e04 100644 --- a/audit.go +++ b/audit.go @@ -524,8 +524,7 @@ func (c *AuditClient) closeAndUnsetPid() { // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. _, err := c.Netlink.SendNoWait(msg) if err != nil { - // sending may be blocked, retry. If the kernel layer is blocked, we'll usually get ENOBUF, - // but retry on any error. + // sending may be blocked, retry. for { // throw out events until we can try to send again. We're closing, so we don't really care. _, err = c.Netlink.Receive(true, noParse) From 2b7c7b47f2b808b2e970e4cac0cff7c44d8a82ab Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 20:44:17 +0000 Subject: [PATCH 03/19] docs.. --- audit.go | 1 + 1 file changed, 1 insertion(+) diff --git a/audit.go b/audit.go index 7865e04..7109a8f 100644 --- a/audit.go +++ b/audit.go @@ -529,6 +529,7 @@ func (c *AuditClient) closeAndUnsetPid() { // throw out events until we can try to send again. We're closing, so we don't really care. _, err = c.Netlink.Receive(true, noParse) // if we got some events, or we would be waiting, try again try sending again + // If we get an ENOBUF, should we try to drain the socket, or retry first? This assumes we keep draining the socket. if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) { _, err := c.Netlink.SendNoWait(msg) if err == nil { From 1bf87718f8951410c2d55f6ad87b94df504b95d3 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 20:55:07 +0000 Subject: [PATCH 04/19] add badfd condition --- audit.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/audit.go b/audit.go index 7109a8f..c5f0adf 100644 --- a/audit.go +++ b/audit.go @@ -535,6 +535,9 @@ func (c *AuditClient) closeAndUnsetPid() { if err == nil { break } + // Not sure how we could end up here unless other threads are doing something weird, but best to handle this so we don't loop forever. + } else if errors.Is(err, syscall.EBADFD) { + return } } From cd4b38d2bf1fd48defe7a2632964ddaf3cf9bc60 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 20:57:36 +0000 Subject: [PATCH 05/19] lint --- netlink.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netlink.go b/netlink.go index 5223f7c..6ef4411 100644 --- a/netlink.go +++ b/netlink.go @@ -127,7 +127,7 @@ func getPortID(fd int) (uint32, error) { return addr.Pid, nil } -// Send a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send() +// SendNoWait sends a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send() func (c *NetlinkClient) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) { return c.send(msg, syscall.MSG_DONTWAIT) } From f2c42d4f04d648884053639fa1b41c3641593d49 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Mon, 10 Mar 2025 21:01:01 +0000 Subject: [PATCH 06/19] format --- audit.go | 1 - 1 file changed, 1 deletion(-) diff --git a/audit.go b/audit.go index c5f0adf..ea35186 100644 --- a/audit.go +++ b/audit.go @@ -432,7 +432,6 @@ func (c *AuditClient) Close() error { if c.clearPIDOnClose { // Unregister from the kernel for a clean exit. c.closeAndUnsetPid() - } err = errors.Join(err, c.Netlink.Close()) From 8e5d0745b0ce9ce4ae7d24101f31c9dcdb1fc0ce Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Tue, 11 Mar 2025 17:39:05 +0000 Subject: [PATCH 07/19] rewrite --- audit.go | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/audit.go b/audit.go index ea35186..3ac3955 100644 --- a/audit.go +++ b/audit.go @@ -500,12 +500,13 @@ func (c *AuditClient) getReply(seq uint32) (*syscall.NetlinkMessage, error) { return &msg, nil } -// an isolated operation to unset our pid from the audit subsystem and close the socket -func (c *AuditClient) closeAndUnsetPid() { +// unset our pid from the audit subsystem and close the socket. +// This is a sort of isolated refactor, meant to deal with the deadlocks that can happen when we're not careful with blocking operations throughout a lot of this code. +func (c *AuditClient) closeAndUnsetPid() error { msg := syscall.NetlinkMessage{ Header: syscall.NlMsghdr{ Type: AuditSet, - Flags: syscall.NLM_F_REQUEST | syscall.NLM_F_ACK, + Flags: syscall.NLM_F_REQUEST, }, Data: AuditStatus{ Mask: AuditStatusPID, @@ -517,30 +518,41 @@ func (c *AuditClient) closeAndUnsetPid() { return nil, nil } + // retry the send if the syscall gets an interrupt. + // This may not be totally needed, as non-blocking calls shouldn't really return an EINTR, but there's nothing saying it can't. + retryOnEINTR := func(msg syscall.NetlinkMessage) (uint32, error) { + maxRetry := 10 + for i := 0; i < maxRetry; i++ { + wrote, err := c.Netlink.SendNoWait(msg) + if !errors.Is(err, syscall.EINTR) { + return wrote, err + } + } + return 0, fmt.Errorf("could not send netlink message, got repeated EINTR") + } + // If our request to unset the PID would block, then try to drain events from // the netlink socket, resend, try again. // In netlink, EAGAIN usually indicates our read buffer is full. // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. - _, err := c.Netlink.SendNoWait(msg) + _, err := retryOnEINTR(msg) if err != nil { - // sending may be blocked, retry. - for { - // throw out events until we can try to send again. We're closing, so we don't really care. + maxRetry := 10000 + // sending may be blocked, try to empty the buffer, then send again + for i := 0; i < maxRetry; i++ { + // Attempt to drain the socket before we retry the close _, err = c.Netlink.Receive(true, noParse) - // if we got some events, or we would be waiting, try again try sending again - // If we get an ENOBUF, should we try to drain the socket, or retry first? This assumes we keep draining the socket. - if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) { - _, err := c.Netlink.SendNoWait(msg) - if err == nil { - break - } - // Not sure how we could end up here unless other threads are doing something weird, but best to handle this so we don't loop forever. - } else if errors.Is(err, syscall.EBADFD) { - return + if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { + continue + } else if errors.Is(err, syscall.EAGAIN) { + break } - } } + // One last attempt. + // Looking at the kernel code in kauditd_thread, I think we can survive failing to unset the pid, as connection failures will cause the pid to be unset. + _, err = retryOnEINTR(msg) + return fmt.Errorf("error sending PID unset event: %w", err) } func (c *AuditClient) set(status AuditStatus, mode WaitMode) error { @@ -598,7 +610,7 @@ func parseNetlinkAuditMessage(buf []byte) ([]syscall.NetlinkMessage, error) { // https://github.com/linux-audit/audit-kernel/blob/v4.7/include/uapi/linux/audit.h#L318-L325 type AuditStatusMask uint32 -// Mask types for AuditStatus. +// Mask types for AuditStatus. Originally defined in the kernel at audit.h const ( AuditStatusEnabled AuditStatusMask = 1 << iota AuditStatusFailure From 185bc5e135154909d1427a989c206ea8e518e1ab Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Tue, 11 Mar 2025 17:44:48 +0000 Subject: [PATCH 08/19] error checking --- audit.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audit.go b/audit.go index 3ac3955..fad482e 100644 --- a/audit.go +++ b/audit.go @@ -431,7 +431,8 @@ func (c *AuditClient) Close() error { c.closeOnce.Do(func() { if c.clearPIDOnClose { // Unregister from the kernel for a clean exit. - c.closeAndUnsetPid() + err = c.closeAndUnsetPid() + } err = errors.Join(err, c.Netlink.Close()) From e1259ebcb61e437b51deb79639c8c88ba16a86f6 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Tue, 11 Mar 2025 17:50:10 +0000 Subject: [PATCH 09/19] fmt --- audit.go | 1 - 1 file changed, 1 deletion(-) diff --git a/audit.go b/audit.go index fad482e..2ad4b71 100644 --- a/audit.go +++ b/audit.go @@ -432,7 +432,6 @@ func (c *AuditClient) Close() error { if c.clearPIDOnClose { // Unregister from the kernel for a clean exit. err = c.closeAndUnsetPid() - } err = errors.Join(err, c.Netlink.Close()) From 7867c2f6801d43496c6ca382b6c81a77afeeabd3 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Tue, 11 Mar 2025 20:00:45 +0000 Subject: [PATCH 10/19] rewrite --- audit.go | 57 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/audit.go b/audit.go index 2ad4b71..312fece 100644 --- a/audit.go +++ b/audit.go @@ -518,41 +518,40 @@ func (c *AuditClient) closeAndUnsetPid() error { return nil, nil } - // retry the send if the syscall gets an interrupt. - // This may not be totally needed, as non-blocking calls shouldn't really return an EINTR, but there's nothing saying it can't. - retryOnEINTR := func(msg syscall.NetlinkMessage) (uint32, error) { - maxRetry := 10 - for i := 0; i < maxRetry; i++ { - wrote, err := c.Netlink.SendNoWait(msg) - if !errors.Is(err, syscall.EINTR) { - return wrote, err - } - } - return 0, fmt.Errorf("could not send netlink message, got repeated EINTR") - } - // If our request to unset the PID would block, then try to drain events from // the netlink socket, resend, try again. // In netlink, EAGAIN usually indicates our read buffer is full. - // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. - _, err := retryOnEINTR(msg) - if err != nil { - maxRetry := 10000 - // sending may be blocked, try to empty the buffer, then send again - for i := 0; i < maxRetry; i++ { - // Attempt to drain the socket before we retry the close - _, err = c.Netlink.Receive(true, noParse) - if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { - continue - } else if errors.Is(err, syscall.EAGAIN) { - break + // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. + maxLoop := 5 + for i := 0; i < maxLoop; i++ { + _, err := c.Netlink.SendNoWait(msg) + // if we get an interrupt, retry the send + if err == nil { + return nil + } else if errors.Is(err, syscall.EINTR) { + // got interrupt, try again + continue + } else if errors.Is(err, syscall.EAGAIN) { + maxRecv := 10000 + // send would block, try to drain the receive socket + for i := 0; i < maxRecv; i++ { + _, err = c.Netlink.Receive(true, noParse) + if errors.Is(err, syscall.EAGAIN) { + // receive would block, try to send again + break + } else if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { + // retry the receive + continue + } else { + // if we have another kind of error, just bail and return that error. + return err + } } } + } - // One last attempt. - // Looking at the kernel code in kauditd_thread, I think we can survive failing to unset the pid, as connection failures will cause the pid to be unset. - _, err = retryOnEINTR(msg) - return fmt.Errorf("error sending PID unset event: %w", err) + // we may not want to treat this as a hard error? + return fmt.Errorf("could not unset pid from audit after retries") } func (c *AuditClient) set(status AuditStatus, mode WaitMode) error { From fdc60e408a0c1b539ced6d2c4e26054f061f23f9 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Wed, 12 Mar 2025 16:25:11 +0000 Subject: [PATCH 11/19] add else block --- audit.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/audit.go b/audit.go index 312fece..4dd8542 100644 --- a/audit.go +++ b/audit.go @@ -547,6 +547,9 @@ func (c *AuditClient) closeAndUnsetPid() error { return err } } + } else { + // if we get another error from the send, return that up + return err } } From d84de7cee10fe2802d030a64d8890e5adba83296 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Wed, 12 Mar 2025 21:17:50 +0000 Subject: [PATCH 12/19] add tests, dumb linter --- audit.go | 7 +++-- audit_test.go | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/audit.go b/audit.go index 4dd8542..475cc17 100644 --- a/audit.go +++ b/audit.go @@ -547,10 +547,11 @@ func (c *AuditClient) closeAndUnsetPid() error { return err } } - } else { - // if we get another error from the send, return that up - return err + // try again after we flush the recv buffer + continue } + // if we get another error from the send, return that up + return err } // we may not want to treat this as a hard error? diff --git a/audit_test.go b/audit_test.go index 816662c..90fd42e 100644 --- a/audit_test.go +++ b/audit_test.go @@ -30,12 +30,15 @@ import ( "io" "os" "runtime" + "slices" + "sync" "syscall" "testing" "testing/quick" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/elastic/go-libaudit/v2/rule" "github.com/elastic/go-libaudit/v2/rule/flags" @@ -55,6 +58,89 @@ var ( // -a always,exit -S open,truncate -F dir=/etc -F success=0 const testRule = `BAAAAAIAAAACAAAABAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGsAAABoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAvZXRj` +// TestNetlinkIface is a mock interface for testing close behavior +type TestNetlinkIface struct { + recvStack []error + sendStack []error +} + +func (tn *TestNetlinkIface) Close() error { + return nil +} + +func (tn *TestNetlinkIface) Send(msg syscall.NetlinkMessage) (uint32, error) { + top := tn.sendStack[0] + tn.sendStack = slices.Delete(tn.sendStack, 0, 1) + return 0, top +} + +func (tn *TestNetlinkIface) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) { + top := tn.sendStack[0] + tn.sendStack = slices.Delete(tn.sendStack, 0, 1) + return 0, top +} + +func (tn *TestNetlinkIface) Receive(nonBlocking bool, p NetlinkParser) ([]syscall.NetlinkMessage, error) { + top := tn.recvStack[0] + tn.recvStack = slices.Delete(tn.recvStack, 0, 1) + return nil, top + +} + +func TestCloseBehavior(t *testing.T) { + testCases := []struct { + name string + cfg *TestNetlinkIface + err error + }{ + { + name: "retry", + cfg: &TestNetlinkIface{ + // cause the first send to error out + sendStack: []error{syscall.EWOULDBLOCK, nil, nil}, + // force the close logic to drain + recvStack: []error{syscall.ENOBUFS, syscall.ENOBUFS, syscall.EAGAIN}, + }, + err: nil, + }, + { + name: "fail-recv-error", + cfg: &TestNetlinkIface{ + // cause the first send to error out + sendStack: []error{syscall.EWOULDBLOCK, nil, nil}, + // force the close logic to drain + recvStack: []error{syscall.ENOBUFS, syscall.ENOBUFS, syscall.EBADFD}, + }, + err: syscall.EBADFD, + }, + { + name: "fail-send-error", + cfg: &TestNetlinkIface{ + // cause the first send to error out + sendStack: []error{syscall.EWOULDBLOCK, syscall.EBADFD, nil}, + // force the close logic to drain + recvStack: []error{syscall.EAGAIN, syscall.EAGAIN}, + }, + err: syscall.EBADFD, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + testClient := AuditClient{ + Netlink: test.cfg, + pendingAcks: []uint32{}, + clearPIDOnClose: true, + closeOnce: sync.Once{}, + } + + err := testClient.Close() + require.True(t, errors.Is(err, test.err)) + }) + } + +} + func TestAuditClientGetStatus(t *testing.T) { if os.Geteuid() != 0 { t.Skip("must be root to get audit status") From afe7f92aa0f0da6e33c6a6cbeefa3d04042ce2b3 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Wed, 12 Mar 2025 21:23:00 +0000 Subject: [PATCH 13/19] linter... --- audit_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/audit_test.go b/audit_test.go index 90fd42e..da637dd 100644 --- a/audit_test.go +++ b/audit_test.go @@ -64,23 +64,23 @@ type TestNetlinkIface struct { sendStack []error } -func (tn *TestNetlinkIface) Close() error { +func (_ *TestNetlinkIface) Close() error { return nil } -func (tn *TestNetlinkIface) Send(msg syscall.NetlinkMessage) (uint32, error) { +func (tn *TestNetlinkIface) Send(_ syscall.NetlinkMessage) (uint32, error) { top := tn.sendStack[0] tn.sendStack = slices.Delete(tn.sendStack, 0, 1) return 0, top } -func (tn *TestNetlinkIface) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) { +func (tn *TestNetlinkIface) SendNoWait(_ syscall.NetlinkMessage) (uint32, error) { top := tn.sendStack[0] tn.sendStack = slices.Delete(tn.sendStack, 0, 1) return 0, top } -func (tn *TestNetlinkIface) Receive(nonBlocking bool, p NetlinkParser) ([]syscall.NetlinkMessage, error) { +func (tn *TestNetlinkIface) Receive(_ bool, _ NetlinkParser) ([]syscall.NetlinkMessage, error) { top := tn.recvStack[0] tn.recvStack = slices.Delete(tn.recvStack, 0, 1) return nil, top From 9ba5dbbf1d80cc5c45b674e7d10e92795f513316 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Wed, 12 Mar 2025 21:24:24 +0000 Subject: [PATCH 14/19] LINTER --- audit_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/audit_test.go b/audit_test.go index da637dd..06e42b5 100644 --- a/audit_test.go +++ b/audit_test.go @@ -64,7 +64,7 @@ type TestNetlinkIface struct { sendStack []error } -func (_ *TestNetlinkIface) Close() error { +func (*TestNetlinkIface) Close() error { return nil } @@ -84,7 +84,6 @@ func (tn *TestNetlinkIface) Receive(_ bool, _ NetlinkParser) ([]syscall.NetlinkM top := tn.recvStack[0] tn.recvStack = slices.Delete(tn.recvStack, 0, 1) return nil, top - } func TestCloseBehavior(t *testing.T) { @@ -138,7 +137,6 @@ func TestCloseBehavior(t *testing.T) { require.True(t, errors.Is(err, test.err)) }) } - } func TestAuditClientGetStatus(t *testing.T) { From c5852ce893f7165afd239b08c2bcc9499ba09b52 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Fri, 14 Mar 2025 17:29:14 +0000 Subject: [PATCH 15/19] refactor to use switch --- audit.go | 47 +++++++++++++++++++++++++---------------------- audit_test.go | 22 +++++++++++++++++++++- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/audit.go b/audit.go index 475cc17..9f174d6 100644 --- a/audit.go +++ b/audit.go @@ -514,50 +514,53 @@ func (c *AuditClient) closeAndUnsetPid() error { }.toWireFormat(), } - noParse := func(bytes []byte) ([]syscall.NetlinkMessage, error) { - return nil, nil - } - // If our request to unset the PID would block, then try to drain events from // the netlink socket, resend, try again. // In netlink, EAGAIN usually indicates our read buffer is full. // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. - maxLoop := 5 - for i := 0; i < maxLoop; i++ { + // The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries. + retries := 5 +outer: + for i := 0; i < retries; i++ { _, err := c.Netlink.SendNoWait(msg) - // if we get an interrupt, retry the send - if err == nil { + switch err { + case nil: + // send good, return return nil - } else if errors.Is(err, syscall.EINTR) { - // got interrupt, try again + case syscall.EINTR: + // got a transient interrupt, try again continue - } else if errors.Is(err, syscall.EAGAIN) { + case syscall.EAGAIN: + // send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again maxRecv := 10000 - // send would block, try to drain the receive socket for i := 0; i < maxRecv; i++ { _, err = c.Netlink.Receive(true, noParse) - if errors.Is(err, syscall.EAGAIN) { - // receive would block, try to send again - break - } else if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { - // retry the receive + switch err { + case nil, syscall.EINTR, syscall.ENOBUFS: + // continue with receive, try to read more data continue - } else { + case syscall.EAGAIN: + // receive would block, try to send again + continue outer + default: // if we have another kind of error, just bail and return that error. return err } } - // try again after we flush the recv buffer - continue + default: + return err } - // if we get another error from the send, return that up - return err } // we may not want to treat this as a hard error? return fmt.Errorf("could not unset pid from audit after retries") } +// noParse is a no-op parser used by closeAndUnsetPID +func noParse([]byte) ([]syscall.NetlinkMessage, error) { + return nil, nil +} + func (c *AuditClient) set(status AuditStatus, mode WaitMode) error { msg := syscall.NetlinkMessage{ Header: syscall.NlMsghdr{ diff --git a/audit_test.go b/audit_test.go index 06e42b5..c3b1776 100644 --- a/audit_test.go +++ b/audit_test.go @@ -102,6 +102,26 @@ func TestCloseBehavior(t *testing.T) { }, err: nil, }, + { + name: "repeated-send-fail", + cfg: &TestNetlinkIface{ + // cause the first send to error out + sendStack: []error{syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, nil}, + // force the close logic to drain + recvStack: []error{syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, nil, syscall.EWOULDBLOCK, nil, syscall.EWOULDBLOCK, nil}, + }, + err: nil, + }, + { + name: "transient-eintr-send", + cfg: &TestNetlinkIface{ + // cause the first send to error out + sendStack: []error{syscall.EINTR, nil, nil}, + // force the close logic to drain + recvStack: []error{syscall.EAGAIN}, + }, + err: nil, + }, { name: "fail-recv-error", cfg: &TestNetlinkIface{ @@ -134,7 +154,7 @@ func TestCloseBehavior(t *testing.T) { } err := testClient.Close() - require.True(t, errors.Is(err, test.err)) + require.True(t, errors.Is(err, test.err), "expected error %s", test.err) }) } } From 2d13888f9ca3bd69fe71753667a6e44cd8b0c2d6 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Fri, 14 Mar 2025 18:51:18 +0000 Subject: [PATCH 16/19] still tinkering... --- audit.go | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/audit.go b/audit.go index 9f174d6..879e836 100644 --- a/audit.go +++ b/audit.go @@ -520,34 +520,29 @@ func (c *AuditClient) closeAndUnsetPid() error { // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. // The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries. retries := 5 -outer: + // outer: for i := 0; i < retries; i++ { _, err := c.Netlink.SendNoWait(msg) - switch err { - case nil: - // send good, return - return nil - case syscall.EINTR: + if errors.Is(err, syscall.EINTR) { // got a transient interrupt, try again continue - case syscall.EAGAIN: + } else if errors.Is(err, syscall.EAGAIN) { // send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again maxRecv := 10000 for i := 0; i < maxRecv; i++ { _, err = c.Netlink.Receive(true, noParse) - switch err { - case nil, syscall.EINTR, syscall.ENOBUFS: + if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { // continue with receive, try to read more data continue - case syscall.EAGAIN: + } else if errors.Is(err, syscall.EAGAIN) { // receive would block, try to send again - continue outer - default: - // if we have another kind of error, just bail and return that error. + break + } else { return err } } - default: + } else { + // if we have another kind of error, just bail and return that error. return err } From 377e8322f4c1bcbc707755e1dafd1c923888f89a Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Fri, 14 Mar 2025 19:02:26 +0000 Subject: [PATCH 17/19] tinkering... --- audit.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/audit.go b/audit.go index 879e836..8d74d8f 100644 --- a/audit.go +++ b/audit.go @@ -526,8 +526,9 @@ func (c *AuditClient) closeAndUnsetPid() error { if errors.Is(err, syscall.EINTR) { // got a transient interrupt, try again continue - } else if errors.Is(err, syscall.EAGAIN) { - // send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again + } else if errors.Is(err, syscall.EAGAIN) { //nolint:revive // easier to read with the else blocks + // send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again/ + // The number is just here so we ideally have enough of a buffer to attempt the send again. maxRecv := 10000 for i := 0; i < maxRecv; i++ { _, err = c.Netlink.Receive(true, noParse) From bdca1acb321cd0a65598bf5f48e34c43e07eb247 Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Fri, 14 Mar 2025 19:11:59 +0000 Subject: [PATCH 18/19] improve docs --- audit.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/audit.go b/audit.go index 8d74d8f..507b9f4 100644 --- a/audit.go +++ b/audit.go @@ -539,16 +539,19 @@ func (c *AuditClient) closeAndUnsetPid() error { // receive would block, try to send again break } else { + // if receive returns an other error, just return that. return err } } } else { - // if we have another kind of error, just bail and return that error. + // if Send returns and other error, just return that return err } } // we may not want to treat this as a hard error? + // It's not a massive error if this fails, since the kernel will unset the PID if it can't communicate with the process, + // so this is largely for neatness. return fmt.Errorf("could not unset pid from audit after retries") } From a15b92367775e085c5140d60eabc64333e459a5d Mon Sep 17 00:00:00 2001 From: fearful-symmetry Date: Fri, 14 Mar 2025 20:25:11 +0000 Subject: [PATCH 19/19] refactor, again --- audit.go | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/audit.go b/audit.go index 507b9f4..90143d9 100644 --- a/audit.go +++ b/audit.go @@ -520,30 +520,34 @@ func (c *AuditClient) closeAndUnsetPid() error { // The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid. // The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries. retries := 5 - // outer: +outer: for i := 0; i < retries; i++ { _, err := c.Netlink.SendNoWait(msg) - if errors.Is(err, syscall.EINTR) { + switch { + case err == nil: + return nil + case errors.Is(err, syscall.EINTR): // got a transient interrupt, try again continue - } else if errors.Is(err, syscall.EAGAIN) { //nolint:revive // easier to read with the else blocks + case errors.Is(err, syscall.EAGAIN): // send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again/ // The number is just here so we ideally have enough of a buffer to attempt the send again. maxRecv := 10000 for i := 0; i < maxRecv; i++ { _, err = c.Netlink.Receive(true, noParse) - if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) { + switch { + case err == nil, errors.Is(err, syscall.EINTR), errors.Is(err, syscall.ENOBUFS): // continue with receive, try to read more data continue - } else if errors.Is(err, syscall.EAGAIN) { + case errors.Is(err, syscall.EAGAIN): // receive would block, try to send again - break - } else { + continue outer + default: // if receive returns an other error, just return that. return err } } - } else { + default: // if Send returns and other error, just return that return err }