MySQL 事务性能优化与监控

性能优化与监控

监控关键指标

// 事务性能监控的实际实现
func monitorTransactionPerformance() {
    
    // 1. 长事务监控 - 找出执行时间超过阈值的事务
    longTransactionsQuery := `
        SELECT 
            trx_id,
            trx_started,
            TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) as duration_seconds,
            trx_tables_in_use,
            trx_tables_locked,
            trx_query
        FROM information_schema.innodb_trx 
        WHERE TIME_TO_SEC(TIMEDIFF(NOW(), trx_started)) > 60
        ORDER BY trx_started;
    `
    
    // 2. Undo Log 空间监控 - 检查 undo 表空间使用情况
    undoSpaceQuery := `
        SELECT 
            tablespace_name,
            file_name,
            ROUND(file_size/1024/1024/1024, 2) as size_gb,
            ROUND(allocated_size/1024/1024/1024, 2) as allocated_gb
        FROM information_schema.files 
        WHERE tablespace_name LIKE 'innodb_undo%'
        ORDER BY file_size DESC;
    `
    
    // 3. Redo Log 写入压力监控
    redoLogStatusQuery := `
        SHOW ENGINE INNODB STATUS;
        -- 关注以下指标：
        -- Log sequence number: 当前 LSN
        -- Log flushed up to: 已刷盘 LSN  
        -- Pages flushed up to: 已刷页 LSN
        -- Last checkpoint at: 最后检查点 LSN
    `
    
    // 4. 锁等待监控 - 检查当前锁冲突情况
    lockWaitQuery := `
        SELECT 
            r.trx_id as requesting_trx_id,
            r.trx_mysql_thread_id as requesting_thread,
            r.trx_query as requesting_query,
            b.trx_id as blocking_trx_id, 
            b.trx_mysql_thread_id as blocking_thread,
            b.trx_query as blocking_query,
            l.lock_table,
            l.lock_index,
            l.lock_mode
        FROM information_schema.innodb_lock_waits w
        INNER JOIN information_schema.innodb_trx r 
            ON w.requesting_trx_id = r.trx_id
        INNER JOIN information_schema.innodb_trx b
            ON w.blocking_trx_id = b.trx_id  
        INNER JOIN information_schema.innodb_locks l
            ON w.blocking_lock_id = l.lock_id;
    `
    
    // 5. 主从延迟监控
    replicationDelayQuery := `
        SHOW SLAVE STATUS;
        -- 关注指标：
        -- Seconds_Behind_Master: 主从延迟秒数
        -- Master_Log_File, Read_Master_Log_Pos: 主库位置
        -- Relay_Log_File, Relay_Log_Pos: 中继日志位置  
        -- Exec_Master_Log_Pos: 从库执行位置
    `
    
    // 6. 性能指标计算
    performanceMetrics := struct {
        TransactionThroughput float64 `json:"tps"`          // 每秒事务数
        RedoLogWriteRate     float64 `json:"redo_mb_per_s"` // Redo Log 写入速率  
        UndoLogSize          float64 `json:"undo_size_gb"`  // Undo Log 总大小
        LockWaitTime         float64 `json:"avg_lock_wait"` // 平均锁等待时间
        ReplicationDelay     int     `json:"repl_delay_s"`  // 主从延迟
    }{
        TransactionThroughput: calculateTPS(),
        RedoLogWriteRate:     calculateRedoWriteRate(), 
        UndoLogSize:          calculateUndoSize(),
        LockWaitTime:         calculateAvgLockWait(),
        ReplicationDelay:     getReplicationDelay(),
    }
    
    // 7. 告警阈值设置
    alerts := []Alert{
        {
            Metric:    "long_transaction", 
            Threshold: 300,  // 5分钟
            Action:    "发送告警，可能需要 kill 事务",
        },
        {
            Metric:    "undo_size",
            Threshold: 10,   // 10GB  
            Action:    "检查是否有长事务阻止清理",
        },
        {
            Metric:    "replication_delay",
            Threshold: 60,   // 60秒
            Action:    "检查网络和从库性能",
        },
        {
            Metric:    "lock_wait_time",
            Threshold: 30,   // 30秒
            Action:    "分析锁冲突，优化查询",
        },
    }
}

// 工具函数示例
func calculateTPS() float64 {
    // 通过 SHOW GLOBAL STATUS LIKE 'Com_commit' 计算
    // TPS = (当前提交数 - 上次提交数) / 时间间隔
    return 0.0
}

func calculateRedoWriteRate() float64 {
    // 通过 SHOW ENGINE INNODB STATUS 中的 LSN 增长计算
    // 写入速率 = LSN 增长量 / 时间间隔 / 1024 / 1024
    return 0.0
}
性能优化与监控​

监控关键指标​

性能优化与监控

监控关键指标