feat(config): update alert configurations and error rate thresholds

- Increased ErrorRate threshold from 0.5 to 0.8 so that error-rate alerts fire only at higher error rates (less sensitive alerting).
- Added AlertInterval setting to config for customizable notification intervals.
- Raised the default maximum-latency thresholds for small, medium, and large files (3s to 5s, 8s to 10s, and 30s to 50s) used for performance monitoring.
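- Updated metrics handling to use the new alert configuration: the monitor now suppresses repeat notifications of the same alert level within AlertInterval, and the metrics handler tolerates a missing avg_latency value.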
This commit is contained in:
wood chen 2024-12-03 17:54:45 +08:00
parent 68c27b544b
commit 3962799980
5 changed files with 29 additions and 8 deletions

View File

@ -45,7 +45,8 @@
"WindowInterval": "5m", "WindowInterval": "5m",
"DedupeWindow": "15m", "DedupeWindow": "15m",
"MinRequests": 10, "MinRequests": 10,
"ErrorRate": 0.5 "ErrorRate": 0.8,
"AlertInterval": "24h"
}, },
"Latency": { "Latency": {
"SmallFileSize": 1048576, "SmallFileSize": 1048576,

View File

@ -47,6 +47,7 @@ type MetricsConfig struct {
DedupeWindow time.Duration `json:"DedupeWindow"` // 告警去重时间窗口 DedupeWindow time.Duration `json:"DedupeWindow"` // 告警去重时间窗口
MinRequests int64 `json:"MinRequests"` // 触发告警的最小请求数 MinRequests int64 `json:"MinRequests"` // 触发告警的最小请求数
ErrorRate float64 `json:"ErrorRate"` // 错误率告警阈值 ErrorRate float64 `json:"ErrorRate"` // 错误率告警阈值
AlertInterval time.Duration `json:"AlertInterval"` // 告警间隔时间
} `json:"Alert"` } `json:"Alert"`
// 延迟告警配置 // 延迟告警配置
Latency struct { Latency struct {

View File

@ -24,17 +24,18 @@ var (
AlertWindowSize = 12 // 监控窗口数量 AlertWindowSize = 12 // 监控窗口数量
AlertWindowInterval = 5 * time.Minute // 每个窗口时间长度 AlertWindowInterval = 5 * time.Minute // 每个窗口时间长度
AlertDedupeWindow = 15 * time.Minute // 告警去重时间窗口 AlertDedupeWindow = 15 * time.Minute // 告警去重时间窗口
AlertNotifyInterval = 24 * time.Hour // 告警通知间隔
MinRequestsForAlert int64 = 10 // 触发告警的最小请求数 MinRequestsForAlert int64 = 10 // 触发告警的最小请求数
ErrorRateThreshold = 0.5 // 错误率告警阈值 (50%) ErrorRateThreshold = 0.8 // 错误率告警阈值
// 延迟告警阈值 // 延迟告警阈值
SmallFileSize int64 = 1 * MB // 小文件阈值 SmallFileSize int64 = 1 * MB // 小文件阈值
MediumFileSize int64 = 10 * MB // 中等文件阈值 MediumFileSize int64 = 10 * MB // 中等文件阈值
LargeFileSize int64 = 100 * MB // 大文件阈值 LargeFileSize int64 = 100 * MB // 大文件阈值
SmallFileLatency = 3 * time.Second // 小文件最大延迟 SmallFileLatency = 5 * time.Second // 小文件最大延迟
MediumFileLatency = 8 * time.Second // 中等文件最大延迟 MediumFileLatency = 10 * time.Second // 中等文件最大延迟
LargeFileLatency = 30 * time.Second // 大文件最大延迟 LargeFileLatency = 50 * time.Second // 大文件最大延迟
HugeFileLatency = 300 * time.Second // 超大文件最大延迟 (5分钟) HugeFileLatency = 300 * time.Second // 超大文件最大延迟 (5分钟)
// 单位常量 // 单位常量
@ -60,6 +61,9 @@ func UpdateFromConfig(cfg *config.Config) {
if cfg.Metrics.Alert.ErrorRate > 0 { if cfg.Metrics.Alert.ErrorRate > 0 {
ErrorRateThreshold = cfg.Metrics.Alert.ErrorRate ErrorRateThreshold = cfg.Metrics.Alert.ErrorRate
} }
if cfg.Metrics.Alert.AlertInterval > 0 {
AlertNotifyInterval = cfg.Metrics.Alert.AlertInterval
}
// 延迟告警配置 // 延迟告警配置
if cfg.Metrics.Latency.SmallFileSize > 0 { if cfg.Metrics.Latency.SmallFileSize > 0 {

View File

@ -47,6 +47,11 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) {
return return
} }
var avgLatency int64
if latency, ok := stats["avg_latency"]; ok && latency != nil {
avgLatency = latency.(int64)
}
metrics := Metrics{ metrics := Metrics{
Uptime: uptime.String(), Uptime: uptime.String(),
ActiveRequests: stats["active_requests"].(int64), ActiveRequests: stats["active_requests"].(int64),
@ -55,7 +60,7 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) {
ErrorRate: float64(stats["total_errors"].(int64)) / float64(stats["total_requests"].(int64)), ErrorRate: float64(stats["total_errors"].(int64)) / float64(stats["total_requests"].(int64)),
NumGoroutine: stats["num_goroutine"].(int), NumGoroutine: stats["num_goroutine"].(int),
MemoryUsage: stats["memory_usage"].(string), MemoryUsage: stats["memory_usage"].(string),
AverageResponseTime: metrics.FormatDuration(time.Duration(stats["avg_latency"].(int64))), AverageResponseTime: metrics.FormatDuration(time.Duration(avgLatency)),
TotalBytes: stats["total_bytes"].(int64), TotalBytes: stats["total_bytes"].(int64),
BytesPerSecond: float64(stats["total_bytes"].(int64)) / metrics.Max(uptime.Seconds(), 1), BytesPerSecond: float64(stats["total_bytes"].(int64)) / metrics.Max(uptime.Seconds(), 1),
RequestsPerSecond: float64(stats["total_requests"].(int64)) / metrics.Max(uptime.Seconds(), 1), RequestsPerSecond: float64(stats["total_requests"].(int64)) / metrics.Max(uptime.Seconds(), 1),

View File

@ -48,9 +48,10 @@ type Monitor struct {
alerts chan Alert alerts chan Alert
handlers []AlertHandler handlers []AlertHandler
dedup sync.Map dedup sync.Map
errorWindow [12]ErrorStats // 5分钟一个窗口保存最近1小时 lastNotify sync.Map
errorWindow [12]ErrorStats
currentWindow atomic.Int32 currentWindow atomic.Int32
transferWindow [12]TransferStats // 5分钟一个窗口保存最近1小时 transferWindow [12]TransferStats
currentTWindow atomic.Int32 currentTWindow atomic.Int32
} }
@ -90,6 +91,15 @@ func (m *Monitor) processAlerts() {
continue continue
} }
// 检查是否在通知间隔内
notifyKey := fmt.Sprintf("notify:%s", alert.Level)
if lastTime, ok := m.lastNotify.Load(notifyKey); ok {
if time.Since(lastTime.(time.Time)) < constants.AlertNotifyInterval {
continue
}
}
m.lastNotify.Store(notifyKey, time.Now())
for _, handler := range m.handlers { for _, handler := range m.handlers {
handler.HandleAlert(alert) handler.HandleAlert(alert)
} }