Skip to content

[3.5] backport #17176 fix learner metric incorrect issue #17266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: release-3.5
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions server/etcdserver/api/membership/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@ func (c *RaftCluster) Recover(onSet func(*zap.Logger, *semver.Version)) {
onSet(c.lg, c.version)

for _, m := range c.members {
if c.localID == m.ID {
setIsLearnerMetric(m)
}

c.lg.Info(
"recovered/added member from store",
zap.String("cluster-id", c.cid.String()),
Expand Down Expand Up @@ -397,6 +401,11 @@ func (c *RaftCluster) AddMember(m *Member, shouldApplyV3 ShouldApplyV3) {
}
}
}

if m.ID == c.localID {
setIsLearnerMetric(m)
}

if c.be != nil && shouldApplyV3 {
beErr = unsafeSaveMemberToBackend(c.lg, c.be, m)
if beErr != nil && !errors.Is(beErr, errMemberAlreadyExist) {
Expand Down Expand Up @@ -528,6 +537,11 @@ func (c *RaftCluster) PromoteMember(id types.ID, shouldApplyV3 ShouldApplyV3) {
if c.v2store != nil {
mustUpdateMemberInStore(c.lg, c.v2store, c.members[id])
}

if id == c.localID {
isLearner.Set(0)
}

if c.be != nil && shouldApplyV3 {
unsafeSaveMemberToBackend(c.lg, c.be, c.members[id])
}
Expand Down
15 changes: 15 additions & 0 deletions server/etcdserver/api/membership/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,23 @@ var (
Help: "Which version is running. 1 for 'cluster_version' label with current cluster version",
},
[]string{"cluster_version"})
isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "is_learner",
Help: "Whether or not this member is a learner. 1 if is, 0 otherwise.",
})
)

func setIsLearnerMetric(m *Member) {
if m.IsLearner {
isLearner.Set(1)
} else {
isLearner.Set(0)
}
}

func init() {
prometheus.MustRegister(ClusterVersionMetrics)
prometheus.MustRegister(isLearner)
}
7 changes: 0 additions & 7 deletions server/etcdserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,6 @@ var (
Name: "leader_changes_seen_total",
Help: "The number of leader changes seen.",
})
isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "is_learner",
Help: "Whether or not this member is a learner. 1 if is, 0 otherwise.",
})
learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Expand Down Expand Up @@ -195,7 +189,6 @@ func init() {
prometheus.MustRegister(currentVersion)
prometheus.MustRegister(currentGoVersion)
prometheus.MustRegister(serverID)
prometheus.MustRegister(isLearner)
prometheus.MustRegister(learnerPromoteSucceed)
prometheus.MustRegister(learnerPromoteFailed)
prometheus.MustRegister(fdUsed)
Expand Down
9 changes: 0 additions & 9 deletions server/etcdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2340,15 +2340,6 @@ func (s *EtcdServer) applyConfChange(cc raftpb.ConfChange, confState *raftpb.Con
}
}

// update the isLearner metric when this server id is equal to the id in raft member confChange
if confChangeContext.Member.ID == s.id {
if cc.Type == raftpb.ConfChangeAddLearnerNode {
isLearner.Set(1)
} else {
isLearner.Set(0)
}
}

case raftpb.ConfChangeRemoveNode:
id := types.ID(cc.NodeID)
s.cluster.RemoveMember(id, shouldApplyV3)
Expand Down
77 changes: 77 additions & 0 deletions tests/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@
package e2e

import (
"context"
"fmt"
"testing"
"time"

"go.etcd.io/etcd/api/v3/version"
"go.etcd.io/etcd/pkg/v3/expect"
"go.etcd.io/etcd/tests/v3/framework/config"
"go.etcd.io/etcd/tests/v3/framework/e2e"
)

Expand All @@ -36,6 +40,18 @@ func TestV3MetricsInsecure(t *testing.T) {
testCtl(t, metricsTest)
}

func TestV3LearnerMetricRecover(t *testing.T) {
cfg := e2e.NewConfigTLS()
cfg.SnapshotCount = 10
testCtl(t, learnerMetricRecoverTest, withCfg(*cfg))
}

func TestV3LearnerMetricApplyFromSnapshotTest(t *testing.T) {
cfg := e2e.NewConfigTLS()
cfg.SnapshotCount = 10
testCtl(t, learnerMetricApplyFromSnapshotTest, withCfg(*cfg))
}

func metricsTest(cx ctlCtx) {
if err := ctlV3Put(cx, "k", "v", ""); err != nil {
cx.t.Fatal(err)
Expand Down Expand Up @@ -68,3 +84,64 @@ func metricsTest(cx ctlCtx) {
}
}
}

func learnerMetricRecoverTest(cx ctlCtx) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

if _, err := cx.epc.StartNewProc(ctx, nil, cx.t, true /* addAsLearner */); err != nil {
cx.t.Fatal(err)
}
expectLearnerMetrics(cx)

triggerSnapshot(ctx, cx)

// Restart cluster
if err := cx.epc.Restart(ctx); err != nil {
cx.t.Fatal(err)
}
expectLearnerMetrics(cx)
}

func learnerMetricApplyFromSnapshotTest(cx ctlCtx) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

// Add learner but do not start it
_, learnerCfg, err := cx.epc.AddMember(ctx, nil, cx.t, true /* addAsLearner */)
if err != nil {
cx.t.Fatal(err)
}

triggerSnapshot(ctx, cx)

// Start the learner
if err = cx.epc.StartNewProcFromConfig(ctx, cx.t, learnerCfg); err != nil {
cx.t.Fatal(err)
}
expectLearnerMetrics(cx)
}

func triggerSnapshot(ctx context.Context, cx ctlCtx) {
etcdctl := cx.epc.Procs[0].Etcdctl()
for i := 0; i < int(cx.epc.Cfg.SnapshotCount); i++ {
if err := etcdctl.Put(ctx, "k", "v", config.PutOptions{}); err != nil {
cx.t.Fatal(err)
}
}
}

func expectLearnerMetrics(cx ctlCtx) {
expectLearnerMetric(cx, 0, "etcd_server_is_learner 0")
expectLearnerMetric(cx, 1, "etcd_server_is_learner 1")
}

func expectLearnerMetric(cx ctlCtx, procIdx int, expectMetric string) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()

args := e2e.CURLPrefixArgsCluster(cx.epc.Cfg, cx.epc.Procs[procIdx], "GET", e2e.CURLReq{Endpoint: "/metrics"})
if err := e2e.SpawnWithExpectsContext(ctx, args, nil, expect.ExpectedResponse{Value: expectMetric}); err != nil {
cx.t.Fatal(err)
}
}
Loading