Skip to content

Commit e579883

Browse files
authored
Merge pull request #1305 from yp969803/issue1294
feat: added new prometheus metrics for long conn
2 parents 7bfd83d + 6c6e3c4 commit e579883

File tree

10 files changed

+1114
-185
lines changed

10 files changed

+1114
-185
lines changed

ctl/monitoring/monitoring.go

Lines changed: 45 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,18 @@ import (
3333
)
3434

3535
const (
36-
patternAccesslog = "/accesslog"
37-
patternMonitoring = "/monitoring"
38-
patternWorkloadMetrics = "/workload_metrics"
36+
patternAccesslog = "/accesslog"
37+
patternMonitoring = "/monitoring"
38+
patternWorkloadMetrics = "/workload_metrics"
39+
patternConnectionMetrics = "/connection_metrics"
40+
)
41+
42+
// Different types of monitoring
43+
const (
44+
MONITORING = "monitoring"
45+
ACCESSLOG = "accesslog"
46+
WORKLOAD = "workload metrics"
47+
CONNECTION = "connection metrics"
3948
)
4049

4150
var log = logger.NewLoggerScope("kmeshctl/monitoring")
@@ -53,14 +62,20 @@ kmeshctl monitoring <kmesh-daemon-pod> --all enable/disable
5362
# Enable/Disable workload granularity metrics:
5463
kmeshctl monitoring <kmesh-daemon-pod> --workloadMetrics enable/disable
5564
65+
# Enable/Disable connection granularity metrics:
66+
kmeshctl monitoring <kmesh-daemon-pod> --connectionMetrics enable/disable
67+
5668
# If you want to change the monitoring functionality of all kmesh daemons in the cluster
5769
# Enable/Disable Kmesh's accesslog in each node:
5870
kmeshctl monitoring --accesslog enable/disable
5971
6072
# Enable/Disable workload granularity metrics in each node:
6173
kmeshctl monitoring --workloadMetrics enable/disable
6274
63-
#Enable/Disable services' and workloads' metrics and accesslog generated from bpf in each node:
75+
# Enable/Disable connection granularity metrics in each node:
76+
kmeshctl monitoring --connectionMetrics enable/disable
77+
78+
#Enable/Disable services', workloads' and 'connections' metrics and accesslog generated from bpf in each node:
6479
kmeshctl monitoring --all enable/disable`,
6580
Args: cobra.MaximumNArgs(1),
6681
Run: func(cmd *cobra.Command, args []string) {
@@ -69,7 +84,8 @@ kmeshctl monitoring --all enable/disable`,
6984
}
7085
cmd.Flags().String("accesslog", "", "Control accesslog enable or disable")
7186
cmd.Flags().String("all", "", "Control accesslog and services' and workloads' metrics enable or disable together")
72-
cmd.Flags().String("workloadMetrics", "", "Control granularity metrics enable or disable")
87+
cmd.Flags().String("workloadMetrics", "", "Control workload granularity metrics enable or disable")
88+
cmd.Flags().String("connectionMetrics", "", "Control connection granularity metrics enable or disable")
7389
return cmd
7490
}
7591

@@ -82,22 +98,26 @@ func ControlMonitoring(cmd *cobra.Command, args []string) {
8298
accesslogFlag, _ := cmd.Flags().GetString("accesslog")
8399
allFlag, _ := cmd.Flags().GetString("all")
84100
workloadMetricsFlag, _ := cmd.Flags().GetString("workloadMetrics")
85-
if accesslogFlag == "" && allFlag == "" && workloadMetricsFlag == "" {
86-
log.Print("no parameters. Need --accesslog, --workloadMetric or --all")
101+
connectionMetricsFlag, _ := cmd.Flags().GetString("connectionMetrics")
102+
if accesslogFlag == "" && allFlag == "" && workloadMetricsFlag == "" && connectionMetricsFlag == "" {
103+
log.Print("no parameters. Need --accesslog, --workloadMetrics, --connectionMetrics or --all")
87104
return
88105
}
89106

90107
podName, hasKmeshPod := getKmeshDaemonPod(args)
91108
if hasKmeshPod {
92109
// Processes triggers for specified kmesh daemon.
93110
if allFlag != "" {
94-
SetMonitoringPerKmeshDaemon(client, podName, allFlag)
111+
SetObservabilityPerKmeshDaemon(client, podName, allFlag, MONITORING, patternMonitoring)
95112
}
96113
if accesslogFlag != "" {
97-
SetAccesslogPerKmeshDaemon(client, podName, accesslogFlag)
114+
SetObservabilityPerKmeshDaemon(client, podName, accesslogFlag, ACCESSLOG, patternAccesslog)
98115
}
99116
if workloadMetricsFlag != "" {
100-
SetWorkloadMetricsPerKmeshDaemon(client, podName, workloadMetricsFlag)
117+
SetObservabilityPerKmeshDaemon(client, podName, workloadMetricsFlag, WORKLOAD, patternWorkloadMetrics)
118+
}
119+
if connectionMetricsFlag != "" {
120+
SetObservabilityPerKmeshDaemon(client, podName, connectionMetricsFlag, CONNECTION, patternConnectionMetrics)
101121
}
102122
} else {
103123
// Perform operations on all kmesh daemons.
@@ -108,13 +128,16 @@ func ControlMonitoring(cmd *cobra.Command, args []string) {
108128
}
109129
for _, pod := range podList.Items {
110130
if allFlag != "" {
111-
SetMonitoringPerKmeshDaemon(client, pod.GetName(), allFlag)
131+
SetObservabilityPerKmeshDaemon(client, pod.GetName(), allFlag, MONITORING, patternMonitoring)
112132
}
113133
if accesslogFlag != "" {
114-
SetAccesslogPerKmeshDaemon(client, pod.GetName(), accesslogFlag)
134+
SetObservabilityPerKmeshDaemon(client, pod.GetName(), accesslogFlag, ACCESSLOG, patternAccesslog)
115135
}
116136
if workloadMetricsFlag != "" {
117-
SetWorkloadMetricsPerKmeshDaemon(client, pod.GetName(), workloadMetricsFlag)
137+
SetObservabilityPerKmeshDaemon(client, pod.GetName(), workloadMetricsFlag, WORKLOAD, patternWorkloadMetrics)
138+
}
139+
if connectionMetricsFlag != "" {
140+
SetObservabilityPerKmeshDaemon(client, pod.GetName(), connectionMetricsFlag, CONNECTION, patternConnectionMetrics)
118141
}
119142
}
120143
}
@@ -130,12 +153,12 @@ func getKmeshDaemonPod(args []string) (string, bool) {
130153
return args[0], true
131154
}
132155

133-
func SetAccesslogPerKmeshDaemon(cli kube.CLIClient, podName, info string) {
134-
var accesslogInfo string
156+
func SetObservabilityPerKmeshDaemon(cli kube.CLIClient, podName, info string, observablityType string, pattern string) {
157+
var status string
135158
if info == "enable" {
136-
accesslogInfo = "true"
159+
status = "true"
137160
} else if info == "disable" {
138-
accesslogInfo = "false"
161+
status = "false"
139162
} else {
140163
log.Errorf("Error: Argument must be 'enable' or 'disable'")
141164
os.Exit(1)
@@ -152,7 +175,7 @@ func SetAccesslogPerKmeshDaemon(cli kube.CLIClient, podName, info string) {
152175
}
153176
defer fw.Close()
154177

155-
url := fmt.Sprintf("http://%s%s?enable=%s", fw.Address(), patternAccesslog, accesslogInfo)
178+
url := fmt.Sprintf("http://%s%s?enable=%s", fw.Address(), pattern, status)
156179

157180
req, err := http.NewRequest(http.MethodPost, url, nil)
158181
if err != nil {
@@ -170,114 +193,19 @@ func SetAccesslogPerKmeshDaemon(cli kube.CLIClient, podName, info string) {
170193
defer resp.Body.Close()
171194

172195
if resp.StatusCode != http.StatusOK {
173-
bodyBytes, readErr := io.ReadAll(resp.Body)
174-
if readErr != nil {
175-
log.Errorf("Error reading response body: %v", readErr)
196+
if observablityType == MONITORING {
197+
log.Errorf("Error: received status code %d", resp.StatusCode)
176198
return
177199
}
178-
bodyString := string(bodyBytes)
179-
if resp.StatusCode == http.StatusBadRequest && bytes.Contains(bodyBytes, []byte("Kmesh monitoring is disable, cannot enable accesslog")) {
180-
log.Errorf("failed to enable accesslog: %v. Need to start Kmesh's Monitoring. Please run `kmeshctl monitoring -h` for more help.", bodyString)
181-
return
182-
}
183-
log.Errorf("Error: received status code %d", resp.StatusCode)
184-
return
185-
}
186-
}
187-
188-
func SetMonitoringPerKmeshDaemon(cli kube.CLIClient, podName, info string) {
189-
var monitoringInfo string
190-
if info == "enable" {
191-
monitoringInfo = "true"
192-
} else if info == "disable" {
193-
monitoringInfo = "false"
194-
} else {
195-
log.Errorf("Error: Argument must be 'enable' or 'disable'")
196-
os.Exit(1)
197-
}
198-
199-
fw, err := utils.CreateKmeshPortForwarder(cli, podName)
200-
if err != nil {
201-
log.Errorf("failed to create port forwarder for Kmesh daemon pod %s: %v", podName, err)
202-
os.Exit(1)
203-
}
204-
if err := fw.Start(); err != nil {
205-
log.Errorf("failed to start port forwarder for Kmesh daemon pod %s: %v", podName, err)
206-
os.Exit(1)
207-
}
208-
defer fw.Close()
209-
210-
url := fmt.Sprintf("http://%s%s?enable=%s", fw.Address(), patternMonitoring, monitoringInfo)
211-
212-
req, err := http.NewRequest(http.MethodPost, url, nil)
213-
if err != nil {
214-
log.Errorf("Error creating request: %v", err)
215-
return
216-
}
217-
218-
req.Header.Set("Content-Type", "application/json")
219-
client := &http.Client{}
220-
resp, err := client.Do(req)
221-
if err != nil {
222-
log.Errorf("failed to make HTTP request: %v", err)
223-
return
224-
}
225-
defer resp.Body.Close()
226-
227-
if resp.StatusCode != http.StatusOK {
228-
log.Errorf("Error: received status code %d", resp.StatusCode)
229-
return
230-
}
231-
}
232200

233-
func SetWorkloadMetricsPerKmeshDaemon(cli kube.CLIClient, podName, workloadMetricsInfo string) {
234-
var info string
235-
if workloadMetricsInfo == "enable" {
236-
info = "true"
237-
} else if workloadMetricsInfo == "disable" {
238-
info = "false"
239-
} else {
240-
log.Errorf("Error: Argument must be 'enable' or 'disable'")
241-
os.Exit(1)
242-
}
243-
244-
fw, err := utils.CreateKmeshPortForwarder(cli, podName)
245-
if err != nil {
246-
log.Errorf("failed to create port forwarder for Kmesh daemon pod %s: %v", podName, err)
247-
os.Exit(1)
248-
}
249-
if err := fw.Start(); err != nil {
250-
log.Errorf("failed to start port forwarder for Kmesh daemon pod %s: %v", podName, err)
251-
os.Exit(1)
252-
}
253-
defer fw.Close()
254-
255-
url := fmt.Sprintf("http://%s%s?enable=%s", fw.Address(), patternWorkloadMetrics, info)
256-
257-
req, err := http.NewRequest(http.MethodPost, url, nil)
258-
if err != nil {
259-
log.Errorf("Error creating request: %v", err)
260-
return
261-
}
262-
263-
req.Header.Set("Content-Type", "application/json")
264-
client := &http.Client{}
265-
resp, err := client.Do(req)
266-
if err != nil {
267-
log.Errorf("failed to make HTTP request: %v", err)
268-
return
269-
}
270-
defer resp.Body.Close()
271-
272-
if resp.StatusCode != http.StatusOK {
273201
bodyBytes, readErr := io.ReadAll(resp.Body)
274202
if readErr != nil {
275203
log.Errorf("Error reading response body: %v", readErr)
276204
return
277205
}
278206
bodyString := string(bodyBytes)
279-
if resp.StatusCode == http.StatusBadRequest && bytes.Contains(bodyBytes, []byte("Kmesh monitoring is disable, cannot enable accesslog")) {
280-
log.Errorf("failed to enable workload metrics: %v. Need to start Kmesh's Monitoring. Please run `kmeshctl monitoring -h` for more help.", bodyString)
207+
if resp.StatusCode == http.StatusBadRequest && bytes.Contains(bodyBytes, []byte(fmt.Sprintf("Kmesh monitoring is disable, cannot enable %s.", observablityType))) {
208+
log.Errorf("failed to enable %s: %v. Need to start Kmesh's Monitoring. Please run `kmeshctl monitoring -h` for more help.", observablityType, bodyString)
281209
return
282210
}
283211
log.Errorf("Error: received status code %d", resp.StatusCode)

docs/ctl/kmeshctl_monitoring.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,31 @@ kmeshctl monitoring <kmesh-daemon-pod> --all enable/disable
1818
# Enable/Disable workload granularity metrics:
1919
kmeshctl monitoring <kmesh-daemon-pod> --workloadMetrics enable/disable
2020
21+
# Enable/Disable connection granularity metrics:
22+
kmeshctl monitoring <kmesh-daemon-pod> --connectionMetrics enable/disable
23+
2124
# If you want to change the monitoring functionality of all kmesh daemons in the cluster
2225
# Enable/Disable Kmesh's accesslog in each node:
2326
kmeshctl monitoring --accesslog enable/disable
2427
2528
# Enable/Disable workload granularity metrics in each node:
2629
kmeshctl monitoring --workloadMetrics enable/disable
2730
28-
#Enable/Disable services' and workloads' metrics and accesslog generated from bpf in each node:
31+
# Enable/Disable connection granularity metrics in each node:
32+
kmeshctl monitoring --connectionMetrics enable/disable
33+
34+
#Enable/Disable services', workloads' and 'connections' metrics and accesslog generated from bpf in each node:
2935
kmeshctl monitoring --all enable/disable
3036
```
3137

3238
### Options
3339

3440
```
35-
--accesslog string Control accesslog enable or disable
36-
--all string Control accesslog and services' and workloads' metrics enable or disable together
37-
-h, --help help for monitoring
38-
--workloadMetrics string Control granularity metrics enable or disable
41+
--accesslog string Control accesslog enable or disable
42+
--all string Control accesslog and services' and workloads' metrics enable or disable together
43+
--connectionMetrics string Control connection granularity metrics enable or disable
44+
-h, --help help for monitoring
45+
--workloadMetrics string Control workload granularity metrics enable or disable
3946
```
4047

4148
### SEE ALSO

docs/proposal/tcp_long_connection_metrics.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,58 @@ static inline void observe_on_data(struct bpf_sock *sk)
168168
We will update the functions in metric.go to periodically update the workload and service metrics, and we will also create a new metric for long TCP connections.
169169

170170
![design](./pics/tcp_long_conn_design.png)
171+
172+
#### Exposing long connection prometheus metrics
173+
174+
We will expose metrics only for connections whose duration exceeds 30 seconds. Metrics for short-lived connections are not exposed, because they would produce a large number of time series and are poorly suited to Prometheus: Prometheus collects data at a fixed scrape interval (commonly 15s), so short-lived connections may start and end between scrapes, resulting in incomplete or misleading data. By focusing only on longer-lived connections, we ensure the metrics are stable, meaningful, and better aligned with Prometheus’s time-series data model.
175+
176+
In the future, we can add another component that reports real-time information about connections, similar to Cilium Hubble.
177+
178+
Prometheus metrics exposed:
179+
180+
- `kmesh_tcp_connection_sent_bytes_total` : The total number of bytes sent over an established TCP connection
181+
182+
- `kmesh_tcp_connection_received_bytes_total` : The total number of bytes received over an established TCP connection
183+
184+
- `kmesh_tcp_connection_packet_lost_total` : Total number of packets lost during transmission in a TCP connection
185+
186+
- `kmesh_tcp_connection_retrans_total` : The total number of retransmits over an established TCP connection
187+
188+
The above metrics have the following labels:
189+
190+
```
191+
"reporter"
192+
"start_time"
193+
"source_workload"
194+
"source_canonical_service"
195+
"source_canonical_revision"
196+
"source_workload_namespace"
197+
"source_principal"
198+
"source_app"
199+
"source_version"
200+
"source_cluster"
201+
"source_address"
202+
"destination_address"
203+
"destination_pod_address"
204+
"destination_pod_namespace"
205+
"destination_pod_name"
206+
"destination_service"
207+
"destination_service_namespace"
208+
"destination_service_name"
209+
"destination_workload"
210+
"destination_canonical_service"
211+
"destination_canonical_revision"
212+
"destination_workload_namespace"
213+
"destination_principal"
214+
"destination_app"
215+
"destination_version"
216+
"destination_cluster"
217+
"request_protocol"
218+
"response_flags"
219+
"connection_security_policy"
220+
```
221+
222+
171223
#### User Stories (Optional)
172224

173225
<!--

0 commit comments

Comments
 (0)