Add Prometheus metric to count upload errors (#8788)
Add Prometheus metric to count upload errors (#8775): add a SeaweedFS_upload_error_total counter, labeled by HTTP status code, so that operators can alert on write/replication failures. A code of "0" indicates a transport error (no HTTP response was received). Also add an "Upload Errors" panel to the Grafana dashboard.
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
||||
"net/http"
|
||||
"net/textproto"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -415,6 +416,7 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
|
||||
}
|
||||
}
|
||||
if post_err != nil {
|
||||
stats.UploadErrorCounter.WithLabelValues("0").Inc()
|
||||
return nil, fmt.Errorf("upload %s %d bytes to %v: %v", option.Filename, originalDataSize, option.UploadUrl, post_err)
|
||||
}
|
||||
// print("-")
|
||||
@@ -428,15 +430,18 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
|
||||
|
||||
resp_body, ra_err := io.ReadAll(resp.Body)
|
||||
if ra_err != nil {
|
||||
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
|
||||
return nil, fmt.Errorf("read response body %v: %w", option.UploadUrl, ra_err)
|
||||
}
|
||||
|
||||
unmarshal_err := json.Unmarshal(resp_body, &ret)
|
||||
if unmarshal_err != nil {
|
||||
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
|
||||
glog.ErrorfCtx(ctx, "unmarshal %s: %v", option.UploadUrl, string(resp_body))
|
||||
return nil, fmt.Errorf("unmarshal %v: %w", option.UploadUrl, unmarshal_err)
|
||||
}
|
||||
if ret.Error != "" {
|
||||
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
|
||||
return nil, fmt.Errorf("unmarshalled error %v: %v", option.UploadUrl, ret.Error)
|
||||
}
|
||||
ret.ETag = etag
|
||||
|
||||
@@ -458,6 +458,13 @@ var (
|
||||
Name: "bucket_object_count",
|
||||
Help: "Current number of objects in each S3 bucket (logical count, deduplicated across replicas).",
|
||||
}, []string{"bucket"})
|
||||
|
||||
UploadErrorCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: Namespace,
|
||||
Name: "upload_error_total",
|
||||
Help: "Counter of upload errors by HTTP status code. Code 0 means transport error (no response received).",
|
||||
}, []string{"code"})
|
||||
)
|
||||
|
||||
func init() {
|
||||
@@ -519,6 +526,8 @@ func init() {
|
||||
Gather.MustRegister(S3BucketPhysicalSizeBytesGauge)
|
||||
Gather.MustRegister(S3BucketObjectCountGauge)
|
||||
|
||||
Gather.MustRegister(UploadErrorCounter)
|
||||
|
||||
go bucketMetricTTLControl()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user