Add Prometheus metric to count upload errors (#8788)

Add Prometheus metric to count upload errors (#8775)

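Add SeaweedFS_upload_error_total counter labeled by HTTP status code
(label "code"), so operators can alert on write/replication failures.
Code "0" indicates a transport error (no HTTP response received). Any
other code is the status of a response whose body could not be read or
unmarshalled, or whose JSON payload carried a non-empty error field.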

Also add an "Upload Errors" panel to the Grafana dashboard.
This commit is contained in:
Chris Lu
2026-03-26 16:58:05 -07:00
committed by GitHub
parent 17028fbf59
commit 5fa5507234
3 changed files with 143 additions and 27 deletions

View File

@@ -11,6 +11,7 @@ import (
"net/http"
"net/textproto"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
@@ -415,6 +416,7 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
}
}
if post_err != nil {
stats.UploadErrorCounter.WithLabelValues("0").Inc()
return nil, fmt.Errorf("upload %s %d bytes to %v: %v", option.Filename, originalDataSize, option.UploadUrl, post_err)
}
// print("-")
@@ -428,15 +430,18 @@ func (uploader *Uploader) upload_content(ctx context.Context, fillBufferFunction
resp_body, ra_err := io.ReadAll(resp.Body)
if ra_err != nil {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
return nil, fmt.Errorf("read response body %v: %w", option.UploadUrl, ra_err)
}
unmarshal_err := json.Unmarshal(resp_body, &ret)
if unmarshal_err != nil {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
glog.ErrorfCtx(ctx, "unmarshal %s: %v", option.UploadUrl, string(resp_body))
return nil, fmt.Errorf("unmarshal %v: %w", option.UploadUrl, unmarshal_err)
}
if ret.Error != "" {
stats.UploadErrorCounter.WithLabelValues(strconv.Itoa(resp.StatusCode)).Inc()
return nil, fmt.Errorf("unmarshalled error %v: %v", option.UploadUrl, ret.Error)
}
ret.ETag = etag

View File

@@ -458,6 +458,13 @@ var (
Name: "bucket_object_count",
Help: "Current number of objects in each S3 bucket (logical count, deduplicated across replicas).",
}, []string{"bucket"})
// UploadErrorCounter counts upload errors, partitioned by the "code" label.
// Per its Help text, the label holds the HTTP status code of the failed
// upload, and the value "0" marks a transport error where no response was
// received.
UploadErrorCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Name: "upload_error_total",
Help: "Counter of upload errors by HTTP status code. Code 0 means transport error (no response received).",
}, []string{"code"})
)
func init() {
@@ -519,6 +526,8 @@ func init() {
Gather.MustRegister(S3BucketPhysicalSizeBytesGauge)
Gather.MustRegister(S3BucketObjectCountGauge)
Gather.MustRegister(UploadErrorCounter)
go bucketMetricTTLControl()
}