From 0d2e03109d933c51ed521366b8f0e90345b48967 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 20:18:59 +0000
Subject: [PATCH 01/19] tinker with async close

---
 audit.go   | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 netlink.go | 12 +++++++++++-
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/audit.go b/audit.go
index f37f957..eb63fa6 100644
--- a/audit.go
+++ b/audit.go
@@ -49,7 +49,7 @@ const (
 // Netlink groups.
 const (
 	NetlinkGroupNone    = iota // Group 0 not used
-	NetlinkGroupReadLog        // "best effort" read only socket
+	NetlinkGroupReadLog        // "best effort" read only socket, defined in the kernel as AUDIT_NLGRP_READLOG
 )
 
 // WaitMode is a flag to control the behavior of methods that abstract
@@ -427,16 +427,12 @@ func (c *AuditClient) Receive(nonBlocking bool) (*RawAuditMessage, error) {
 // become no-ops.
 func (c *AuditClient) Close() error {
 	var err error
-
 	// Only unregister and close the socket once.
 	c.closeOnce.Do(func() {
 		if c.clearPIDOnClose {
 			// Unregister from the kernel for a clean exit.
-			status := AuditStatus{
-				Mask: AuditStatusPID,
-				PID:  0,
-			}
-			err = c.set(status, NoWait)
+			c.closeAndUnsetPid()
+
 		}
 
 		err = errors.Join(err, c.Netlink.Close())
@@ -505,6 +501,46 @@ func (c *AuditClient) getReply(seq uint32) (*syscall.NetlinkMessage, error) {
 	return &msg, nil
 }
 
+// an isolated operation to unset our pid from the audit subsystem and close the socket
+func (c *AuditClient) closeAndUnsetPid() {
+	msg := syscall.NetlinkMessage{
+		Header: syscall.NlMsghdr{
+			Type:  AuditSet,
+			Flags: syscall.NLM_F_REQUEST | syscall.NLM_F_ACK,
+		},
+		Data: AuditStatus{
+			Mask: AuditStatusPID,
+			PID:  0,
+		}.toWireFormat(),
+	}
+
+	noParse := func(bytes []byte) ([]syscall.NetlinkMessage, error) {
+		return nil, nil
+	}
+
+	// If our request to unset the PID would block, then try to drain events from
+	// the netlink socket, resend, try again.
+	// In netlink, EAGAIN usually indicates our read buffer is full.
+	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the  audit pid.
+	_, err := c.Netlink.SendNoWait(msg)
+	if err != nil {
+		// sending may be blocked, retry. If the kernel layer is blocked, we'll usually get ENOBUF,
+		// but retry on any error.
+		for {
+			// throw out events until we can try to send again. We're closing, so we don't really care.
+			_, err = c.Netlink.Receive(true, noParse)
+			// if we got some events, or we would be waiting, try again try sending again
+			if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) {
+				_, err := c.Netlink.SendNoWait(msg)
+				if err == nil {
+					break
+				}
+			}
+
+		}
+	}
+}
+
 func (c *AuditClient) set(status AuditStatus, mode WaitMode) error {
 	msg := syscall.NetlinkMessage{
 		Header: syscall.NlMsghdr{
diff --git a/netlink.go b/netlink.go
index 03302c5..5223f7c 100644
--- a/netlink.go
+++ b/netlink.go
@@ -36,6 +36,7 @@ import (
 // in the message and an error if it occurred.
 type NetlinkSender interface {
 	Send(msg syscall.NetlinkMessage) (uint32, error)
+	SendNoWait(msg syscall.NetlinkMessage) (uint32, error)
 }
 
 // NetlinkReceiver receives data from the netlink socket and uses the provided
@@ -126,17 +127,26 @@ func getPortID(fd int) (uint32, error) {
 	return addr.Pid, nil
 }
 
+// Send a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send()
+func (c *NetlinkClient) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) {
+	return c.send(msg, syscall.MSG_DONTWAIT)
+}
+
 // Send sends a netlink message and returns the sequence number used
 // in the message and an error if it occurred. If the PID is not set then
 // the value will be populated automatically (recommended).
 func (c *NetlinkClient) Send(msg syscall.NetlinkMessage) (uint32, error) {
+	return c.send(msg, 0)
+}
+
+func (c *NetlinkClient) send(msg syscall.NetlinkMessage, flags int) (uint32, error) {
 	if msg.Header.Pid == 0 {
 		msg.Header.Pid = c.pid
 	}
 
 	msg.Header.Seq = atomic.AddUint32(&c.seq, 1)
 	to := &syscall.SockaddrNetlink{}
-	return msg.Header.Seq, syscall.Sendto(c.fd, serialize(msg), 0, to)
+	return msg.Header.Seq, syscall.Sendto(c.fd, serialize(msg), flags, to)
 }
 
 func serialize(msg syscall.NetlinkMessage) []byte {

From 9bcd2693a72980efdc31f00a34eef88ee5af4723 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 20:30:54 +0000
Subject: [PATCH 02/19] docs

---
 audit.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/audit.go b/audit.go
index eb63fa6..7865e04 100644
--- a/audit.go
+++ b/audit.go
@@ -524,8 +524,7 @@ func (c *AuditClient) closeAndUnsetPid() {
 	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the  audit pid.
 	_, err := c.Netlink.SendNoWait(msg)
 	if err != nil {
-		// sending may be blocked, retry. If the kernel layer is blocked, we'll usually get ENOBUF,
-		// but retry on any error.
+		// sending may be blocked, retry.
 		for {
 			// throw out events until we can try to send again. We're closing, so we don't really care.
 			_, err = c.Netlink.Receive(true, noParse)

From 2b7c7b47f2b808b2e970e4cac0cff7c44d8a82ab Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 20:44:17 +0000
Subject: [PATCH 03/19] docs..

---
 audit.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/audit.go b/audit.go
index 7865e04..7109a8f 100644
--- a/audit.go
+++ b/audit.go
@@ -529,6 +529,7 @@ func (c *AuditClient) closeAndUnsetPid() {
 			// throw out events until we can try to send again. We're closing, so we don't really care.
 			_, err = c.Netlink.Receive(true, noParse)
 			// if we got some events, or we would be waiting, try again try sending again
+			// If we get an ENOBUF, should we try to drain the socket, or retry first? This assumes we keep draining the socket.
 			if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) {
 				_, err := c.Netlink.SendNoWait(msg)
 				if err == nil {

From 1bf87718f8951410c2d55f6ad87b94df504b95d3 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 20:55:07 +0000
Subject: [PATCH 04/19] add badfd condition

---
 audit.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/audit.go b/audit.go
index 7109a8f..c5f0adf 100644
--- a/audit.go
+++ b/audit.go
@@ -535,6 +535,9 @@ func (c *AuditClient) closeAndUnsetPid() {
 				if err == nil {
 					break
 				}
+				// Not sure how we could end up here unless other threads are doing something weird, but best to handle this so we don't loop forever.
+			} else if errors.Is(err, syscall.EBADFD) {
+				return
 			}
 
 		}

From cd4b38d2bf1fd48defe7a2632964ddaf3cf9bc60 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 20:57:36 +0000
Subject: [PATCH 05/19] lint

---
 netlink.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/netlink.go b/netlink.go
index 5223f7c..6ef4411 100644
--- a/netlink.go
+++ b/netlink.go
@@ -127,7 +127,7 @@ func getPortID(fd int) (uint32, error) {
 	return addr.Pid, nil
 }
 
-// Send a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send()
+// SendNoWait sends a message to the netlink client in non-blocking mode. Behavior is otherwise identical to Send()
 func (c *NetlinkClient) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) {
 	return c.send(msg, syscall.MSG_DONTWAIT)
 }

From f2c42d4f04d648884053639fa1b41c3641593d49 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Mon, 10 Mar 2025 21:01:01 +0000
Subject: [PATCH 06/19] format

---
 audit.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/audit.go b/audit.go
index c5f0adf..ea35186 100644
--- a/audit.go
+++ b/audit.go
@@ -432,7 +432,6 @@ func (c *AuditClient) Close() error {
 		if c.clearPIDOnClose {
 			// Unregister from the kernel for a clean exit.
 			c.closeAndUnsetPid()
-
 		}
 
 		err = errors.Join(err, c.Netlink.Close())

From 8e5d0745b0ce9ce4ae7d24101f31c9dcdb1fc0ce Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Tue, 11 Mar 2025 17:39:05 +0000
Subject: [PATCH 07/19] rewrite

---
 audit.go | 50 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/audit.go b/audit.go
index ea35186..3ac3955 100644
--- a/audit.go
+++ b/audit.go
@@ -500,12 +500,13 @@ func (c *AuditClient) getReply(seq uint32) (*syscall.NetlinkMessage, error) {
 	return &msg, nil
 }
 
-// an isolated operation to unset our pid from the audit subsystem and close the socket
-func (c *AuditClient) closeAndUnsetPid() {
+// unset our pid from the audit subsystem and close the socket.
+// This is a sort of isolated refactor, meant to deal with the deadlocks that can happen when we're not careful with blocking operations throughout a lot of this code.
+func (c *AuditClient) closeAndUnsetPid() error {
 	msg := syscall.NetlinkMessage{
 		Header: syscall.NlMsghdr{
 			Type:  AuditSet,
-			Flags: syscall.NLM_F_REQUEST | syscall.NLM_F_ACK,
+			Flags: syscall.NLM_F_REQUEST,
 		},
 		Data: AuditStatus{
 			Mask: AuditStatusPID,
@@ -517,30 +518,41 @@ func (c *AuditClient) closeAndUnsetPid() {
 		return nil, nil
 	}
 
+	// retry the send if the syscall gets an interrupt.
+	// This may not be totally needed, as non-blocking calls shouldn't really return an EINTR, but there's nothing saying it can't.
+	retryOnEINTR := func(msg syscall.NetlinkMessage) (uint32, error) {
+		maxRetry := 10
+		for i := 0; i < maxRetry; i++ {
+			wrote, err := c.Netlink.SendNoWait(msg)
+			if !errors.Is(err, syscall.EINTR) {
+				return wrote, err
+			}
+		}
+		return 0, fmt.Errorf("could not send netlink message, got repeated EINTR")
+	}
+
 	// If our request to unset the PID would block, then try to drain events from
 	// the netlink socket, resend, try again.
 	// In netlink, EAGAIN usually indicates our read buffer is full.
 	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the  audit pid.
-	_, err := c.Netlink.SendNoWait(msg)
+	_, err := retryOnEINTR(msg)
 	if err != nil {
-		// sending may be blocked, retry.
-		for {
-			// throw out events until we can try to send again. We're closing, so we don't really care.
+		maxRetry := 10000
+		// sending may be blocked, try to empty the buffer, then send again
+		for i := 0; i < maxRetry; i++ {
+			// Attempt to drain the socket before we retry the close
 			_, err = c.Netlink.Receive(true, noParse)
-			// if we got some events, or we would be waiting, try again try sending again
-			// If we get an ENOBUF, should we try to drain the socket, or retry first? This assumes we keep draining the socket.
-			if err == nil || errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK) {
-				_, err := c.Netlink.SendNoWait(msg)
-				if err == nil {
-					break
-				}
-				// Not sure how we could end up here unless other threads are doing something weird, but best to handle this so we don't loop forever.
-			} else if errors.Is(err, syscall.EBADFD) {
-				return
+			if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
+				continue
+			} else if errors.Is(err, syscall.EAGAIN) {
+				break
 			}
-
 		}
 	}
+	// One last attempt.
+	// Looking at the kernel code in kauditd_thread, I think we can survive failing to unset the pid, as connection failures will cause the pid to be unset.
+	_, err = retryOnEINTR(msg)
+	return fmt.Errorf("error sending PID unset event: %w", err)
 }
 
 func (c *AuditClient) set(status AuditStatus, mode WaitMode) error {
@@ -598,7 +610,7 @@ func parseNetlinkAuditMessage(buf []byte) ([]syscall.NetlinkMessage, error) {
 // https://github.com/linux-audit/audit-kernel/blob/v4.7/include/uapi/linux/audit.h#L318-L325
 type AuditStatusMask uint32
 
-// Mask types for AuditStatus.
+// Mask types for AuditStatus. Originally defined in the kernel at audit.h
 const (
 	AuditStatusEnabled AuditStatusMask = 1 << iota
 	AuditStatusFailure

From 185bc5e135154909d1427a989c206ea8e518e1ab Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Tue, 11 Mar 2025 17:44:48 +0000
Subject: [PATCH 08/19] error checking

---
 audit.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/audit.go b/audit.go
index 3ac3955..fad482e 100644
--- a/audit.go
+++ b/audit.go
@@ -431,7 +431,8 @@ func (c *AuditClient) Close() error {
 	c.closeOnce.Do(func() {
 		if c.clearPIDOnClose {
 			// Unregister from the kernel for a clean exit.
-			c.closeAndUnsetPid()
+			err = c.closeAndUnsetPid()
+
 		}
 
 		err = errors.Join(err, c.Netlink.Close())

From e1259ebcb61e437b51deb79639c8c88ba16a86f6 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Tue, 11 Mar 2025 17:50:10 +0000
Subject: [PATCH 09/19] fmt

---
 audit.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/audit.go b/audit.go
index fad482e..2ad4b71 100644
--- a/audit.go
+++ b/audit.go
@@ -432,7 +432,6 @@ func (c *AuditClient) Close() error {
 		if c.clearPIDOnClose {
 			// Unregister from the kernel for a clean exit.
 			err = c.closeAndUnsetPid()
-
 		}
 
 		err = errors.Join(err, c.Netlink.Close())

From 7867c2f6801d43496c6ca382b6c81a77afeeabd3 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Tue, 11 Mar 2025 20:00:45 +0000
Subject: [PATCH 10/19] rewrite

---
 audit.go | 57 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/audit.go b/audit.go
index 2ad4b71..312fece 100644
--- a/audit.go
+++ b/audit.go
@@ -518,41 +518,40 @@ func (c *AuditClient) closeAndUnsetPid() error {
 		return nil, nil
 	}
 
-	// retry the send if the syscall gets an interrupt.
-	// This may not be totally needed, as non-blocking calls shouldn't really return an EINTR, but there's nothing saying it can't.
-	retryOnEINTR := func(msg syscall.NetlinkMessage) (uint32, error) {
-		maxRetry := 10
-		for i := 0; i < maxRetry; i++ {
-			wrote, err := c.Netlink.SendNoWait(msg)
-			if !errors.Is(err, syscall.EINTR) {
-				return wrote, err
-			}
-		}
-		return 0, fmt.Errorf("could not send netlink message, got repeated EINTR")
-	}
-
 	// If our request to unset the PID would block, then try to drain events from
 	// the netlink socket, resend, try again.
 	// In netlink, EAGAIN usually indicates our read buffer is full.
-	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the  audit pid.
-	_, err := retryOnEINTR(msg)
-	if err != nil {
-		maxRetry := 10000
-		// sending may be blocked, try to empty the buffer, then send again
-		for i := 0; i < maxRetry; i++ {
-			// Attempt to drain the socket before we retry the close
-			_, err = c.Netlink.Receive(true, noParse)
-			if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
-				continue
-			} else if errors.Is(err, syscall.EAGAIN) {
-				break
+	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid.
+	maxLoop := 5
+	for i := 0; i < maxLoop; i++ {
+		_, err := c.Netlink.SendNoWait(msg)
+		// if we get an interrupt, retry the send
+		if err == nil {
+			return nil
+		} else if errors.Is(err, syscall.EINTR) {
+			// got interrupt, try again
+			continue
+		} else if errors.Is(err, syscall.EAGAIN) {
+			maxRecv := 10000
+			// send would block, try to drain the receive socket
+			for i := 0; i < maxRecv; i++ {
+				_, err = c.Netlink.Receive(true, noParse)
+				if errors.Is(err, syscall.EAGAIN) {
+					// receive would block, try to send again
+					break
+				} else if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
+					// retry the receive
+					continue
+				} else {
+					// if we have another kind of error, just bail and return that error.
+					return err
+				}
 			}
 		}
+
 	}
-	// One last attempt.
-	// Looking at the kernel code in kauditd_thread, I think we can survive failing to unset the pid, as connection failures will cause the pid to be unset.
-	_, err = retryOnEINTR(msg)
-	return fmt.Errorf("error sending PID unset event: %w", err)
+	// we may not want to treat this as a hard error?
+	return fmt.Errorf("could not unset pid from audit after retries")
 }
 
 func (c *AuditClient) set(status AuditStatus, mode WaitMode) error {

From fdc60e408a0c1b539ced6d2c4e26054f061f23f9 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Wed, 12 Mar 2025 16:25:11 +0000
Subject: [PATCH 11/19] add else block

---
 audit.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/audit.go b/audit.go
index 312fece..4dd8542 100644
--- a/audit.go
+++ b/audit.go
@@ -547,6 +547,9 @@ func (c *AuditClient) closeAndUnsetPid() error {
 					return err
 				}
 			}
+		} else {
+			// if we get another error from the send, return that up
+			return err
 		}
 
 	}

From d84de7cee10fe2802d030a64d8890e5adba83296 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Wed, 12 Mar 2025 21:17:50 +0000
Subject: [PATCH 12/19] add tests, dumb linter

---
 audit.go      |  7 +++--
 audit_test.go | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/audit.go b/audit.go
index 4dd8542..475cc17 100644
--- a/audit.go
+++ b/audit.go
@@ -547,10 +547,11 @@ func (c *AuditClient) closeAndUnsetPid() error {
 					return err
 				}
 			}
-		} else {
-			// if we get another error from the send, return that up
-			return err
+			// try again after we flush the recv buffer
+			continue
 		}
+		// if we get another error from the send, return that up
+		return err
 
 	}
 	// we may not want to treat this as a hard error?
diff --git a/audit_test.go b/audit_test.go
index 816662c..90fd42e 100644
--- a/audit_test.go
+++ b/audit_test.go
@@ -30,12 +30,15 @@ import (
 	"io"
 	"os"
 	"runtime"
+	"slices"
+	"sync"
 	"syscall"
 	"testing"
 	"testing/quick"
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/go-libaudit/v2/rule"
 	"github.com/elastic/go-libaudit/v2/rule/flags"
@@ -55,6 +58,89 @@ var (
 // -a always,exit -S open,truncate -F dir=/etc -F success=0
 const testRule = `BAAAAAIAAAACAAAABAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGsAAABoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAvZXRj`
 
+// TestNetlinkIface is a mock interface for testing close behavior
+type TestNetlinkIface struct {
+	recvStack []error
+	sendStack []error
+}
+
+func (tn *TestNetlinkIface) Close() error {
+	return nil
+}
+
+func (tn *TestNetlinkIface) Send(msg syscall.NetlinkMessage) (uint32, error) {
+	top := tn.sendStack[0]
+	tn.sendStack = slices.Delete(tn.sendStack, 0, 1)
+	return 0, top
+}
+
+func (tn *TestNetlinkIface) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) {
+	top := tn.sendStack[0]
+	tn.sendStack = slices.Delete(tn.sendStack, 0, 1)
+	return 0, top
+}
+
+func (tn *TestNetlinkIface) Receive(nonBlocking bool, p NetlinkParser) ([]syscall.NetlinkMessage, error) {
+	top := tn.recvStack[0]
+	tn.recvStack = slices.Delete(tn.recvStack, 0, 1)
+	return nil, top
+
+}
+
+func TestCloseBehavior(t *testing.T) {
+	testCases := []struct {
+		name string
+		cfg  *TestNetlinkIface
+		err  error
+	}{
+		{
+			name: "retry",
+			cfg: &TestNetlinkIface{
+				// cause the first send to error out
+				sendStack: []error{syscall.EWOULDBLOCK, nil, nil},
+				// force the close logic to drain
+				recvStack: []error{syscall.ENOBUFS, syscall.ENOBUFS, syscall.EAGAIN},
+			},
+			err: nil,
+		},
+		{
+			name: "fail-recv-error",
+			cfg: &TestNetlinkIface{
+				// cause the first send to error out
+				sendStack: []error{syscall.EWOULDBLOCK, nil, nil},
+				// force the close logic to drain
+				recvStack: []error{syscall.ENOBUFS, syscall.ENOBUFS, syscall.EBADFD},
+			},
+			err: syscall.EBADFD,
+		},
+		{
+			name: "fail-send-error",
+			cfg: &TestNetlinkIface{
+				// cause the first send to error out
+				sendStack: []error{syscall.EWOULDBLOCK, syscall.EBADFD, nil},
+				// force the close logic to drain
+				recvStack: []error{syscall.EAGAIN, syscall.EAGAIN},
+			},
+			err: syscall.EBADFD,
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			testClient := AuditClient{
+				Netlink:         test.cfg,
+				pendingAcks:     []uint32{},
+				clearPIDOnClose: true,
+				closeOnce:       sync.Once{},
+			}
+
+			err := testClient.Close()
+			require.True(t, errors.Is(err, test.err))
+		})
+	}
+
+}
+
 func TestAuditClientGetStatus(t *testing.T) {
 	if os.Geteuid() != 0 {
 		t.Skip("must be root to get audit status")

From afe7f92aa0f0da6e33c6a6cbeefa3d04042ce2b3 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Wed, 12 Mar 2025 21:23:00 +0000
Subject: [PATCH 13/19] linter...

---
 audit_test.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/audit_test.go b/audit_test.go
index 90fd42e..da637dd 100644
--- a/audit_test.go
+++ b/audit_test.go
@@ -64,23 +64,23 @@ type TestNetlinkIface struct {
 	sendStack []error
 }
 
-func (tn *TestNetlinkIface) Close() error {
+func (_ *TestNetlinkIface) Close() error {
 	return nil
 }
 
-func (tn *TestNetlinkIface) Send(msg syscall.NetlinkMessage) (uint32, error) {
+func (tn *TestNetlinkIface) Send(_ syscall.NetlinkMessage) (uint32, error) {
 	top := tn.sendStack[0]
 	tn.sendStack = slices.Delete(tn.sendStack, 0, 1)
 	return 0, top
 }
 
-func (tn *TestNetlinkIface) SendNoWait(msg syscall.NetlinkMessage) (uint32, error) {
+func (tn *TestNetlinkIface) SendNoWait(_ syscall.NetlinkMessage) (uint32, error) {
 	top := tn.sendStack[0]
 	tn.sendStack = slices.Delete(tn.sendStack, 0, 1)
 	return 0, top
 }
 
-func (tn *TestNetlinkIface) Receive(nonBlocking bool, p NetlinkParser) ([]syscall.NetlinkMessage, error) {
+func (tn *TestNetlinkIface) Receive(_ bool, _ NetlinkParser) ([]syscall.NetlinkMessage, error) {
 	top := tn.recvStack[0]
 	tn.recvStack = slices.Delete(tn.recvStack, 0, 1)
 	return nil, top

From 9ba5dbbf1d80cc5c45b674e7d10e92795f513316 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Wed, 12 Mar 2025 21:24:24 +0000
Subject: [PATCH 14/19] LINTER

---
 audit_test.go | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/audit_test.go b/audit_test.go
index da637dd..06e42b5 100644
--- a/audit_test.go
+++ b/audit_test.go
@@ -64,7 +64,7 @@ type TestNetlinkIface struct {
 	sendStack []error
 }
 
-func (_ *TestNetlinkIface) Close() error {
+func (*TestNetlinkIface) Close() error {
 	return nil
 }
 
@@ -84,7 +84,6 @@ func (tn *TestNetlinkIface) Receive(_ bool, _ NetlinkParser) ([]syscall.NetlinkM
 	top := tn.recvStack[0]
 	tn.recvStack = slices.Delete(tn.recvStack, 0, 1)
 	return nil, top
-
 }
 
 func TestCloseBehavior(t *testing.T) {
@@ -138,7 +137,6 @@ func TestCloseBehavior(t *testing.T) {
 			require.True(t, errors.Is(err, test.err))
 		})
 	}
-
 }
 
 func TestAuditClientGetStatus(t *testing.T) {

From c5852ce893f7165afd239b08c2bcc9499ba09b52 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Fri, 14 Mar 2025 17:29:14 +0000
Subject: [PATCH 15/19] refactor to use switch

---
 audit.go      | 47 +++++++++++++++++++++++++----------------------
 audit_test.go | 22 +++++++++++++++++++++-
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/audit.go b/audit.go
index 475cc17..9f174d6 100644
--- a/audit.go
+++ b/audit.go
@@ -514,50 +514,53 @@ func (c *AuditClient) closeAndUnsetPid() error {
 		}.toWireFormat(),
 	}
 
-	noParse := func(bytes []byte) ([]syscall.NetlinkMessage, error) {
-		return nil, nil
-	}
-
 	// If our request to unset the PID would block, then try to drain events from
 	// the netlink socket, resend, try again.
 	// In netlink, EAGAIN usually indicates our read buffer is full.
 	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid.
-	maxLoop := 5
-	for i := 0; i < maxLoop; i++ {
+	// The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries.
+	retries := 5
+outer:
+	for i := 0; i < retries; i++ {
 		_, err := c.Netlink.SendNoWait(msg)
-		// if we get an interrupt, retry the send
-		if err == nil {
+		switch err {
+		case nil:
+			// send good, return
 			return nil
-		} else if errors.Is(err, syscall.EINTR) {
-			// got interrupt, try again
+		case syscall.EINTR:
+			// got a transient interrupt, try again
 			continue
-		} else if errors.Is(err, syscall.EAGAIN) {
+		case syscall.EAGAIN:
+			// send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again
 			maxRecv := 10000
-			// send would block, try to drain the receive socket
 			for i := 0; i < maxRecv; i++ {
 				_, err = c.Netlink.Receive(true, noParse)
-				if errors.Is(err, syscall.EAGAIN) {
-					// receive would block, try to send again
-					break
-				} else if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
-					// retry the receive
+				switch err {
+				case nil, syscall.EINTR, syscall.ENOBUFS:
+					// continue with receive, try to read more data
 					continue
-				} else {
+				case syscall.EAGAIN:
+					// receive would block, try to send again
+					continue outer
+				default:
 					// if we have another kind of error, just bail and return that error.
 					return err
 				}
 			}
-			// try again after we flush the recv buffer
-			continue
+		default:
+			return err
 		}
-		// if we get another error from the send, return that up
-		return err
 
 	}
 	// we may not want to treat this as a hard error?
 	return fmt.Errorf("could not unset pid from audit after retries")
 }
 
+// noParse is a no-op parser used by closeAndUnsetPID
+func noParse([]byte) ([]syscall.NetlinkMessage, error) {
+	return nil, nil
+}
+
 func (c *AuditClient) set(status AuditStatus, mode WaitMode) error {
 	msg := syscall.NetlinkMessage{
 		Header: syscall.NlMsghdr{
diff --git a/audit_test.go b/audit_test.go
index 06e42b5..c3b1776 100644
--- a/audit_test.go
+++ b/audit_test.go
@@ -102,6 +102,26 @@ func TestCloseBehavior(t *testing.T) {
 			},
 			err: nil,
 		},
+		{
+			name: "repeated-send-fail",
+			cfg: &TestNetlinkIface{
+				// cause the first send to error out
+				sendStack: []error{syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, nil},
+				// force the close logic to drain
+				recvStack: []error{syscall.EWOULDBLOCK, syscall.EWOULDBLOCK, nil, syscall.EWOULDBLOCK, nil, syscall.EWOULDBLOCK, nil},
+			},
+			err: nil,
+		},
+		{
+			name: "transient-eintr-send",
+			cfg: &TestNetlinkIface{
+				// cause the first send to error out
+				sendStack: []error{syscall.EINTR, nil, nil},
+				// force the close logic to drain
+				recvStack: []error{syscall.EAGAIN},
+			},
+			err: nil,
+		},
 		{
 			name: "fail-recv-error",
 			cfg: &TestNetlinkIface{
@@ -134,7 +154,7 @@ func TestCloseBehavior(t *testing.T) {
 			}
 
 			err := testClient.Close()
-			require.True(t, errors.Is(err, test.err))
+			require.True(t, errors.Is(err, test.err), "expected error %s", test.err)
 		})
 	}
 }

From 2d13888f9ca3bd69fe71753667a6e44cd8b0c2d6 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Fri, 14 Mar 2025 18:51:18 +0000
Subject: [PATCH 16/19] still tinkering...

---
 audit.go | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/audit.go b/audit.go
index 9f174d6..879e836 100644
--- a/audit.go
+++ b/audit.go
@@ -520,34 +520,29 @@ func (c *AuditClient) closeAndUnsetPid() error {
 	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid.
 	// The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries.
 	retries := 5
-outer:
+	// outer:
 	for i := 0; i < retries; i++ {
 		_, err := c.Netlink.SendNoWait(msg)
-		switch err {
-		case nil:
-			// send good, return
-			return nil
-		case syscall.EINTR:
+		if errors.Is(err, syscall.EINTR) {
 			// got a transient interrupt, try again
 			continue
-		case syscall.EAGAIN:
+		} else if errors.Is(err, syscall.EAGAIN) {
 			// send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again
 			maxRecv := 10000
 			for i := 0; i < maxRecv; i++ {
 				_, err = c.Netlink.Receive(true, noParse)
-				switch err {
-				case nil, syscall.EINTR, syscall.ENOBUFS:
+				if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
 					// continue with receive, try to read more data
 					continue
-				case syscall.EAGAIN:
+				} else if errors.Is(err, syscall.EAGAIN) {
 					// receive would block, try to send again
-					continue outer
-				default:
-					// if we have another kind of error, just bail and return that error.
+					break
+				} else {
 					return err
 				}
 			}
-		default:
+		} else {
+			// if we have another kind of error, just bail and return that error.
 			return err
 		}
 

From 377e8322f4c1bcbc707755e1dafd1c923888f89a Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Fri, 14 Mar 2025 19:02:26 +0000
Subject: [PATCH 17/19] tinkering...

---
 audit.go | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/audit.go b/audit.go
index 879e836..8d74d8f 100644
--- a/audit.go
+++ b/audit.go
@@ -526,8 +526,9 @@ func (c *AuditClient) closeAndUnsetPid() error {
 		if errors.Is(err, syscall.EINTR) {
 			// got a transient interrupt, try again
 			continue
-		} else if errors.Is(err, syscall.EAGAIN) {
-			// send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again
+		} else if errors.Is(err, syscall.EAGAIN) { //nolint:revive // easier to read with the else blocks
+			// send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again/
+			// The number is just here so we ideally have enough of a buffer to attempt the send again.
 			maxRecv := 10000
 			for i := 0; i < maxRecv; i++ {
 				_, err = c.Netlink.Receive(true, noParse)

From bdca1acb321cd0a65598bf5f48e34c43e07eb247 Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Fri, 14 Mar 2025 19:11:59 +0000
Subject: [PATCH 18/19] improve docs

---
 audit.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/audit.go b/audit.go
index 8d74d8f..507b9f4 100644
--- a/audit.go
+++ b/audit.go
@@ -539,16 +539,19 @@ func (c *AuditClient) closeAndUnsetPid() error {
 					// receive would block, try to send again
 					break
 				} else {
+					// if receive returns an other error, just return that.
 					return err
 				}
 			}
 		} else {
-			// if we have another kind of error, just bail and return that error.
+			// if Send returns and other error, just return that
 			return err
 		}
 
 	}
 	// we may not want to treat this as a hard error?
+	// It's not a massive error if this fails, since the kernel will unset the PID if it can't communicate with the process,
+	// so this is largely for neatness.
 	return fmt.Errorf("could not unset pid from audit after retries")
 }
 

From a15b92367775e085c5140d60eabc64333e459a5d Mon Sep 17 00:00:00 2001
From: fearful-symmetry <alexpkristiansen@icloud.com>
Date: Fri, 14 Mar 2025 20:25:11 +0000
Subject: [PATCH 19/19] refactor, again

---
 audit.go | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/audit.go b/audit.go
index 507b9f4..90143d9 100644
--- a/audit.go
+++ b/audit.go
@@ -520,30 +520,34 @@ func (c *AuditClient) closeAndUnsetPid() error {
 	// The auditd code (which I'm using as a reference implementation) doesn't wait for a response when unsetting the audit pid.
 	// The retry count here is largely arbitrary, and provides a buffer for either transient errors (EINTR) or retries.
 	retries := 5
-	// outer:
+outer:
 	for i := 0; i < retries; i++ {
 		_, err := c.Netlink.SendNoWait(msg)
-		if errors.Is(err, syscall.EINTR) {
+		switch {
+		case err == nil:
+			return nil
+		case errors.Is(err, syscall.EINTR):
 			// got a transient interrupt, try again
 			continue
-		} else if errors.Is(err, syscall.EAGAIN) { //nolint:revive // easier to read with the else blocks
+		case errors.Is(err, syscall.EAGAIN):
 			// send would block, try to drain the receive socket. The recv count here is just so we have enough of a buffer to attempt a send again/
 			// The number is just here so we ideally have enough of a buffer to attempt the send again.
 			maxRecv := 10000
 			for i := 0; i < maxRecv; i++ {
 				_, err = c.Netlink.Receive(true, noParse)
-				if err == nil || errors.Is(err, syscall.EINTR) || errors.Is(err, syscall.ENOBUFS) {
+				switch {
+				case err == nil, errors.Is(err, syscall.EINTR), errors.Is(err, syscall.ENOBUFS):
 					// continue with receive, try to read more data
 					continue
-				} else if errors.Is(err, syscall.EAGAIN) {
+				case errors.Is(err, syscall.EAGAIN):
 					// receive would block, try to send again
-					break
-				} else {
+					continue outer
+				default:
 					// if receive returns an other error, just return that.
 					return err
 				}
 			}
-		} else {
+		default:
 			// if Send returns and other error, just return that
 			return err
 		}