feat(metrics): enhance configuration and metrics validation

- Added new configuration structures for loading, saving, and validation settings in MetricsConfig.
- Introduced constants for load retry counts, intervals, and validation parameters to improve configurability.
- Enhanced metrics collector with data validation and consistency checks during metrics loading.
- Implemented backup and restore functionality for metrics data, ensuring data integrity and recoverability.
- Improved logging for metrics operations, providing better insights into the metrics collection process.

These changes enhance the flexibility and reliability of the metrics system, ensuring accurate data handling and improved monitoring capabilities.
wood chen 2024-12-05 10:15:20 +08:00
parent 8c266a1ec2
commit e275326d91
6 changed files with 428 additions and 63 deletions
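For reference, the new MetricsConfig sections described above map to a config layout roughly like the sketch below. This is only an illustration: the key names come from the json tags added in this commit and the values mirror the new constant defaults, but where this block sits in the top-level config file is not shown here, and with stock encoding/json the time.Duration fields decode from integer nanoseconds (1s, 30s, 5m, 15m, 10m below) unless the project's config loader accepts duration strings.

"load": {
  "retry_count": 3,
  "retry_interval": 1000000000,
  "timeout": 30000000000
},
"save": {
  "min_interval": 300000000000,
  "max_interval": 900000000000,
  "default_interval": 600000000000
},
"validation": {
  "max_error_rate": 0.8,
  "max_data_deviation": 0.01
}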

View File

@@ -59,6 +59,23 @@ type MetricsConfig struct {
LargeLatency time.Duration `json:"LargeLatency"` // Maximum latency for large files
HugeLatency time.Duration `json:"HugeLatency"` // Maximum latency for huge files
} `json:"Latency"`
// Load settings
Load struct {
RetryCount int `json:"retry_count"`
RetryInterval time.Duration `json:"retry_interval"`
Timeout time.Duration `json:"timeout"`
} `json:"load"`
// Save settings
Save struct {
MinInterval time.Duration `json:"min_interval"`
MaxInterval time.Duration `json:"max_interval"`
DefaultInterval time.Duration `json:"default_interval"`
} `json:"save"`
// Validation settings
Validation struct {
MaxErrorRate float64 `json:"max_error_rate"`
MaxDataDeviation float64 `json:"max_data_deviation"`
} `json:"validation"`
}
// Add a helper method to handle conversion from string to PathConfig

View File

@@ -51,6 +51,20 @@ var (
// Performance monitoring thresholds
MaxRequestsPerMinute = 1000 // Maximum requests per minute
MaxBytesPerMinute = 100 * 1024 * 1024 // Maximum traffic per minute (100MB)
// Data loading
LoadRetryCount = 3 // Number of load retries
LoadRetryInterval = time.Second // Retry interval
LoadTimeout = 30 * time.Second // Load timeout
// Data saving
MinSaveInterval = 5 * time.Minute // Minimum save interval
MaxSaveInterval = 15 * time.Minute // Maximum save interval
DefaultSaveInterval = 10 * time.Minute // Default save interval
// Data validation
MaxErrorRate = 0.8 // Maximum error rate
MaxDataDeviation = 0.01 // Maximum data deviation (1%)
)
// UpdateFromConfig updates the constants from the configuration file
@@ -97,4 +111,34 @@ func UpdateFromConfig(cfg *config.Config) {
if cfg.Metrics.Latency.HugeLatency > 0 {
HugeFileLatency = cfg.Metrics.Latency.HugeLatency
}
// Data loading
if cfg.Metrics.Load.RetryCount > 0 {
LoadRetryCount = cfg.Metrics.Load.RetryCount
}
if cfg.Metrics.Load.RetryInterval > 0 {
LoadRetryInterval = cfg.Metrics.Load.RetryInterval
}
if cfg.Metrics.Load.Timeout > 0 {
LoadTimeout = cfg.Metrics.Load.Timeout
}
// Data saving
if cfg.Metrics.Save.MinInterval > 0 {
MinSaveInterval = cfg.Metrics.Save.MinInterval
}
if cfg.Metrics.Save.MaxInterval > 0 {
MaxSaveInterval = cfg.Metrics.Save.MaxInterval
}
if cfg.Metrics.Save.DefaultInterval > 0 {
DefaultSaveInterval = cfg.Metrics.Save.DefaultInterval
}
// Data validation
if cfg.Metrics.Validation.MaxErrorRate > 0 {
MaxErrorRate = cfg.Metrics.Validation.MaxErrorRate
}
if cfg.Metrics.Validation.MaxDataDeviation > 0 {
MaxDataDeviation = cfg.Metrics.Validation.MaxDataDeviation
}
}

View File

@@ -0,0 +1,11 @@
package interfaces
import (
"time"
)
// MetricsCollector defines the metrics collector interface
type MetricsCollector interface {
CheckDataConsistency() error
GetLastSaveTime() time.Time
}

View File

@@ -2,10 +2,13 @@ package metrics
import (
"database/sql"
"encoding/json"
"fmt"
"log"
"math/rand"
"net/http"
"os"
"path"
"proxy-go/internal/cache"
"proxy-go/internal/config"
"proxy-go/internal/constants"
@@ -81,7 +84,7 @@ func InitCollector(dbPath string, config *config.Config) error {
}
globalCollector.cache = cache.NewCache(constants.CacheTTL)
globalCollector.monitor = monitor.NewMonitor(globalCollector)
// Enable Feishu alerts if a Feishu webhook is configured
if config.Metrics.FeishuWebhook != "" {
@@ -436,12 +439,21 @@ func (c *Collector) SaveMetrics(stats map[string]interface{}) error {
c.persistentStats.totalErrors.Store(stats["total_errors"].(int64))
c.persistentStats.totalBytes.Store(stats["total_bytes"].(int64))
// Record the current values before resetting
oldRequests := atomic.LoadInt64(&c.totalRequests)
oldErrors := atomic.LoadInt64(&c.totalErrors)
oldBytes := c.totalBytes.Load()
// Reset the current session counters
atomic.StoreInt64(&c.totalRequests, 0)
atomic.StoreInt64(&c.totalErrors, 0)
c.totalBytes.Store(0)
c.latencySum.Store(0)
// Log the reset
log.Printf("Reset counters: requests=%d->0, errors=%d->0, bytes=%d->0",
oldRequests, oldErrors, oldBytes)
// Reset status code statistics
for i := range c.statusStats {
c.statusStats[i].Store(0)
@@ -459,20 +471,29 @@ func (c *Collector) SaveMetrics(stats map[string]interface{}) error {
return true
})
// Update the last save time
lastSaveTime = time.Now()
return c.db.SaveMetrics(stats)
}
// LoadRecentStats loads the most recent statistics
func (c *Collector) LoadRecentStats() error {
start := time.Now()
log.Printf("Starting to load recent stats...")
var err error
// Retry mechanism
for retryCount := 0; retryCount < 3; retryCount++ {
if err = c.loadRecentStatsInternal(); err == nil {
// Validate the loaded data
if err = c.validateLoadedData(); err != nil {
log.Printf("Data validation failed: %v", err)
continue
}
break
}
log.Printf("Retry %d/3: Failed to load stats: %v", retryCount+1, err)
time.Sleep(time.Second)
}
@@ -480,69 +501,32 @@
return fmt.Errorf("failed to load stats after retries: %v", err)
}
log.Printf("Successfully loaded all stats in %v", time.Since(start))
return nil
}
// loadRecentStatsInternal is the internal loading routine
func (c *Collector) loadRecentStatsInternal() error {
loadStart := time.Now()
// Load basic metrics first
if err := c.loadBasicMetrics(); err != nil {
return fmt.Errorf("failed to load basic metrics: %v", err)
}
log.Printf("Loaded basic metrics in %v", time.Since(loadStart))
// Then load status code statistics
statusStart := time.Now()
if err := c.loadStatusStats(); err != nil {
return fmt.Errorf("failed to load status stats: %v", err)
}
log.Printf("Loaded status codes in %v", time.Since(statusStart))
// Finally, load path and referer statistics
pathStart := time.Now()
if err := c.loadPathAndRefererStats(); err != nil {
return fmt.Errorf("failed to load path and referer stats: %v", err)
}
log.Printf("Loaded path and referer stats in %v", time.Since(pathStart))
return nil
}
@@ -588,7 +572,7 @@ func (c *Collector) CheckDataConsistency() error {
return fmt.Errorf("invalid total_requests: %d", totalReqs)
}
// Check status code statistics
statusStats := stats["status_code_stats"].(map[string]int64)
var statusTotal int64
for _, count := range statusStats {
@@ -598,10 +582,20 @@ func (c *Collector) CheckDataConsistency() error {
statusTotal += count
}
// Check path statistics
pathStats := stats["top_paths"].([]models.PathMetrics)
var pathTotal int64
for _, p := range pathStats {
if p.RequestCount < 0 {
return fmt.Errorf("invalid path request count: %d", p.RequestCount)
}
pathTotal += p.RequestCount
}
// Allow a small margin of error
if abs(statusTotal-totalReqs) > totalReqs/100 || abs(pathTotal-totalReqs) > totalReqs/100 {
log.Printf("Warning: status code total (%d) and path total (%d) don't match total requests (%d)",
statusTotal, pathTotal, totalReqs)
}
return nil
@@ -613,3 +607,239 @@ func abs(x int64) int64 {
}
return x
}
func (c *Collector) validateLoadedData() error {
// Validate basic metrics
if c.persistentStats.totalRequests.Load() < 0 ||
c.persistentStats.totalErrors.Load() < 0 ||
c.persistentStats.totalBytes.Load() < 0 {
return fmt.Errorf("invalid persistent stats values")
}
// The error count must not exceed the total request count
if c.persistentStats.totalErrors.Load() > c.persistentStats.totalRequests.Load() {
return fmt.Errorf("total errors exceeds total requests")
}
// Validate status code statistics
for i := range c.statusStats {
if c.statusStats[i].Load() < 0 {
return fmt.Errorf("invalid status code count at index %d", i)
}
}
// Validate path statistics
var totalPathRequests int64
c.pathStats.Range(func(_, value interface{}) bool {
stats := value.(*models.PathStats)
if stats.Requests.Load() < 0 || stats.Errors.Load() < 0 {
return false
}
totalPathRequests += stats.Requests.Load()
return true
})
// Verify that the totals are consistent
if totalPathRequests > c.persistentStats.totalRequests.Load() {
return fmt.Errorf("path stats total exceeds total requests")
}
return nil
}
// loadBasicMetrics loads the basic metrics
func (c *Collector) loadBasicMetrics() error {
// Load the most recent record from the last 24 hours
row := c.db.DB.QueryRow(`
SELECT
total_requests, total_errors, total_bytes, avg_latency
FROM metrics_history
WHERE timestamp >= datetime('now', '-24 hours')
ORDER BY timestamp DESC
LIMIT 1
`)
var metrics models.HistoricalMetrics
if err := row.Scan(
&metrics.TotalRequests,
&metrics.TotalErrors,
&metrics.TotalBytes,
&metrics.AvgLatency,
); err != nil && err != sql.ErrNoRows {
return err
}
// Update persistent stats
if metrics.TotalRequests > 0 {
c.persistentStats.totalRequests.Store(metrics.TotalRequests)
c.persistentStats.totalErrors.Store(metrics.TotalErrors)
c.persistentStats.totalBytes.Store(metrics.TotalBytes)
log.Printf("Loaded persistent stats: requests=%d, errors=%d, bytes=%d",
metrics.TotalRequests, metrics.TotalErrors, metrics.TotalBytes)
}
return nil
}
// loadStatusStats loads status code statistics
func (c *Collector) loadStatusStats() error {
rows, err := c.db.DB.Query(`
SELECT status_group, SUM(count) as count
FROM status_code_history
WHERE timestamp >= datetime('now', '-24 hours')
GROUP BY status_group
`)
if err != nil {
return err
}
defer rows.Close()
var totalStatusCodes int64
for rows.Next() {
var group string
var count int64
if err := rows.Scan(&group, &count); err != nil {
return err
}
if len(group) > 0 {
idx := (int(group[0]) - '0') - 1
if idx >= 0 && idx < len(c.statusStats) {
c.statusStats[idx].Store(count)
totalStatusCodes += count
}
}
}
return rows.Err()
}
// loadPathAndRefererStats loads path and referer statistics
func (c *Collector) loadPathAndRefererStats() error {
// Load path statistics
rows, err := c.db.DB.Query(`
SELECT
path,
SUM(request_count) as requests,
SUM(error_count) as errors,
AVG(bytes_transferred) as bytes,
AVG(avg_latency) as latency
FROM popular_paths_history
WHERE timestamp >= datetime('now', '-24 hours')
GROUP BY path
ORDER BY requests DESC
LIMIT 10
`)
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var path string
var requests, errors, bytes int64
var latency float64
if err := rows.Scan(&path, &requests, &errors, &bytes, &latency); err != nil {
return err
}
stats := &models.PathStats{}
stats.Requests.Store(requests)
stats.Errors.Store(errors)
stats.Bytes.Store(bytes)
stats.LatencySum.Store(int64(latency))
c.pathStats.Store(path, stats)
}
if err := rows.Err(); err != nil {
return err
}
// Load referer statistics
rows, err = c.db.DB.Query(`
SELECT
referer,
SUM(request_count) as requests
FROM referer_history
WHERE timestamp >= datetime('now', '-24 hours')
GROUP BY referer
ORDER BY requests DESC
LIMIT 10
`)
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var referer string
var count int64
if err := rows.Scan(&referer, &count); err != nil {
return err
}
stats := &models.PathStats{}
stats.Requests.Store(count)
c.refererStats.Store(referer, stats)
}
return rows.Err()
}
// SaveBackup writes a backup of the metrics data
func (c *Collector) SaveBackup() error {
stats := c.GetStats()
backupFile := fmt.Sprintf("backup_%s.json", time.Now().Format("20060102_150405"))
data, err := json.MarshalIndent(stats, "", " ")
if err != nil {
return err
}
return os.WriteFile(path.Join("data/backup", backupFile), data, 0644)
}
// LoadBackup loads metrics data from a backup file
func (c *Collector) LoadBackup(backupFile string) error {
data, err := os.ReadFile(path.Join("data/backup", backupFile))
if err != nil {
return err
}
var stats map[string]interface{}
if err := json.Unmarshal(data, &stats); err != nil {
return err
}
return c.RestoreFromBackup(stats)
}
// RestoreFromBackup restores metrics data from a backup
func (c *Collector) RestoreFromBackup(stats map[string]interface{}) error {
// Restore basic metrics
if totalReqs, ok := stats["total_requests"].(int64); ok {
c.persistentStats.totalRequests.Store(totalReqs)
}
if totalErrs, ok := stats["total_errors"].(int64); ok {
c.persistentStats.totalErrors.Store(totalErrs)
}
if totalBytes, ok := stats["total_bytes"].(int64); ok {
c.persistentStats.totalBytes.Store(totalBytes)
}
// Restore status code statistics
if statusStats, ok := stats["status_code_stats"].(map[string]int64); ok {
for group, count := range statusStats {
if len(group) > 0 {
idx := (int(group[0]) - '0') - 1
if idx >= 0 && idx < len(c.statusStats) {
c.statusStats[idx].Store(count)
}
}
}
}
return nil
}
var lastSaveTime time.Time
// GetLastSaveTime implements the interfaces.MetricsCollector interface
func (c *Collector) GetLastSaveTime() time.Time {
return lastSaveTime
}

View File

@@ -0,0 +1,35 @@
package metrics
import (
"sync"
"sync/atomic"
"testing"
"time"
)
// NewTestCollector creates a new collector for testing
func NewTestCollector() *Collector {
return &Collector{
startTime: time.Now(),
pathStats: sync.Map{},
statusStats: [6]atomic.Int64{},
latencyBuckets: [10]atomic.Int64{},
}
}
func TestDataConsistency(t *testing.T) {
c := NewTestCollector()
// Test basic metrics
c.RecordRequest("/test", 200, time.Second, 1024, "127.0.0.1", nil)
if err := c.CheckDataConsistency(); err != nil {
t.Errorf("Data consistency check failed: %v", err)
}
// Test inconsistent data
c.persistentStats.totalErrors.Store(100)
c.persistentStats.totalRequests.Store(50)
if err := c.CheckDataConsistency(); err == nil {
t.Error("Expected error for invalid data")
}
}

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"log"
"proxy-go/internal/constants"
"proxy-go/internal/interfaces"
"sync"
"sync/atomic"
"time"
@@ -53,12 +54,14 @@ type Monitor struct {
currentWindow atomic.Int32
transferWindow [12]TransferStats
currentTWindow atomic.Int32
collector interfaces.MetricsCollector
}
func NewMonitor(collector interfaces.MetricsCollector) *Monitor {
m := &Monitor{
alerts: make(chan Alert, 100),
handlers: make([]AlertHandler, 0),
collector: collector,
}
// Initialize the first window
@@ -142,6 +145,23 @@ func (m *Monitor) CheckMetrics(stats map[string]interface{}) {
}
}
}
// Check data consistency
if err := m.collector.CheckDataConsistency(); err != nil {
m.recordAlert("数据一致性告警", err.Error())
}
// Check the error rate
totalReqs := stats["total_requests"].(int64)
totalErrs := stats["total_errors"].(int64)
if totalReqs > 0 && float64(totalErrs)/float64(totalReqs) > constants.MaxErrorRate {
m.recordAlert("错误率告警", fmt.Sprintf("错误率超过阈值: %.2f%%", float64(totalErrs)/float64(totalReqs)*100))
}
// Check that data is being saved regularly
if lastSaveTime := m.collector.GetLastSaveTime(); time.Since(lastSaveTime) > constants.MaxSaveInterval*2 {
m.recordAlert("数据保存告警", "数据保存间隔过长")
}
}
func (m *Monitor) CheckLatency(latency time.Duration, bytes int64) {
@@ -264,3 +284,11 @@ func (m *Monitor) cleanupWindows() {
})
}
}
func (m *Monitor) recordAlert(title, message string) {
m.alerts <- Alert{
Level: AlertLevelError,
Message: fmt.Sprintf("%s: %s", title, message),
Time: time.Now(),
}
}