zfsmon/main.go

257 lines
6.4 KiB
Go

package main
import (
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"code.cloudfoundry.org/bytefmt"
zfs "github.com/bicomsystems/go-libzfs"
"github.com/davidscholberg/go-durationfmt"
"gopkg.in/yaml.v2"
)
// Defaults and time layouts used throughout zfsmon.
const (
	// LAST_SCAN_FILE is the default name of the file holding the previous pool state.
	LAST_SCAN_FILE = "last_scan.yaml"
	// LOG_FILE is the default name of the log file.
	LOG_FILE = "zfsmon.log"
	// POOL_SPACE_WARN is the default fill level, in percent, at which a warning is raised.
	POOL_SPACE_WARN = 90
	// DATE_FORMAT is the time layout day.month.year.
	DATE_FORMAT = "02.01.2006"
	// TIME_FORMAT is the time layout day.month.year hour:minute:second.
	TIME_FORMAT = "02.01.2006 15:04:05"
)
// ScanStats is the summary of a pool scan that zfsmon keeps and persists
// between runs. It is filled by getScanStats from the pool's scan statistics.
type ScanStats struct {
// Func is the kind of scan as produced by getScanFunc: "Scrub", "Resilver" or "None".
Func string
// State is the raw scan state value copied from the scan statistics.
State uint64
// Errors is the error count copied from the scan statistics.
Errors uint64
// Duration is the scan's end time minus its start time; checkPool formats it
// as a number of seconds.
Duration uint64
// LastFinished is the scan's end time; checkPool interprets it as Unix seconds
// and uses it to detect that a new scan has completed.
LastFinished uint64
}
// DeviceState is the recorded health of a single vdev: its state string and
// the error counters copied from the vdev statistics.
type DeviceState struct {
// State is the vdev state rendered as a string. Any value other than "ONLINE"
// is counted as one error by numDeviceErrors.
State string
// ReadErrors is the vdev's read error counter.
ReadErrors uint64 `yaml:"read_errors"`
// WriteErrors is the vdev's write error counter.
WriteErrors uint64 `yaml:"write_errors"`
// ChecksumErrors is the vdev's checksum error counter.
ChecksumErrors uint64 `yaml:"checksum_errors"`
}
// PoolState is the snapshot of a pool taken on each run. It is serialized to
// the last-scan file and compared against the snapshot of the previous run.
type PoolState struct {
// Devices maps a vdev name to its recorded state; it contains every node of
// the vdev tree, including the root.
Devices map[string]DeviceState
// Scan describes the scan reported by the pool at snapshot time.
Scan *ScanStats
// TotalSpace is the space value of the root vdev statistics.
TotalSpace uint64 `yaml:"total_space"`
// FreeSpace is TotalSpace minus the allocated space.
FreeSpace uint64 `yaml:"free_space"`
// FillLevel is allocated space divided by total space, in percent.
FillLevel float64 `yaml:"fill_level"`
}
// getStateOfDevices stores the state and error counters of tree, and then
// recursively of every vdev below it, into data keyed by vdev name.
//
// data must be non-nil. A vdev whose name is already present overwrites the
// earlier entry.
func getStateOfDevices(tree zfs.VDevTree, data map[string]DeviceState) {
	stat := tree.Stat
	data[tree.Name] = DeviceState{
		State:          stat.State.String(),
		ReadErrors:     stat.ReadErrors,
		WriteErrors:    stat.WriteErrors,
		ChecksumErrors: stat.ChecksumErrors,
	}

	// Ranging over a nil or empty slice does nothing, so leaf vdevs need no
	// special handling.
	for i := range tree.Devices {
		getStateOfDevices(tree.Devices[i], data)
	}
}
// getScanFunc translates a numeric scan function code into a display name.
// It returns "Scrub" or "Resilver" for the matching zfs constants and "None"
// for every other value.
func getScanFunc(scanFunc uint64) string {
	if scanFunc == zfs.PoolScanScrub {
		return "Scrub"
	}
	if scanFunc == zfs.PoolScanResilver {
		return "Resilver"
	}
	return "None"
}
// getScanStats condenses the pool's raw scan statistics into a ScanStats.
//
// Duration is EndTime minus StartTime. Both are unsigned, so when EndTime is
// smaller than StartTime the subtraction would wrap around to a huge value;
// in that case Duration is reported as 0 instead.
// NOTE(review): EndTime < StartTime presumably happens while a scan is still
// running and has no end time yet — confirm against libzfs.
func getScanStats(scanStat zfs.PoolScanStat) *ScanStats {
	var duration uint64
	if scanStat.EndTime >= scanStat.StartTime {
		duration = scanStat.EndTime - scanStat.StartTime
	}

	return &ScanStats{
		Func:         getScanFunc(scanStat.Func),
		State:        scanStat.State,
		Errors:       scanStat.Errors,
		Duration:     duration,
		LastFinished: scanStat.EndTime,
	}
}
// getPoolState opens the named pool and takes a snapshot of it: the state of
// every vdev, the scan statistics, and the space usage of the root vdev.
//
// It returns an error if the pool cannot be opened or its vdev tree cannot be
// read. The pool handle is always released before returning.
func getPoolState(poolName string) (*PoolState, error) {
	pool, err := zfs.PoolOpen(poolName)
	if err != nil {
		return nil, err
	}
	// Release the libzfs handle; everything used below is copied into Go values.
	defer pool.Close()

	tree, err := pool.VDevTree()
	if err != nil {
		return nil, err
	}

	deviceData := map[string]DeviceState{}
	getStateOfDevices(tree, deviceData)

	total, alloc := tree.Stat.Space, tree.Stat.Alloc

	// Avoid a NaN fill level when the pool reports no space at all.
	var level float64
	if total > 0 {
		level = float64(alloc) / float64(total) * 100.0
	}

	// Unsigned subtraction would wrap if alloc ever exceeded total.
	var free uint64
	if total > alloc {
		free = total - alloc
	}

	return &PoolState{
		Devices:    deviceData,
		Scan:       getScanStats(tree.ScanStat),
		TotalSpace: total,
		FreeSpace:  free,
		FillLevel:  level,
	}, nil
}
// numDeviceErrors returns an error score for the pool: the sum of all read,
// write and checksum error counters over every recorded device, plus one for
// each device whose state is not "ONLINE".
func (s *PoolState) numDeviceErrors() uint64 {
	total := uint64(0)
	for name := range s.Devices {
		dev := s.Devices[name]
		if dev.State != "ONLINE" {
			total++
		}
		total += dev.ReadErrors + dev.WriteErrors + dev.ChecksumErrors
	}
	return total
}
// readLastScanFile loads the pool state persisted by a previous run.
//
// It returns nil when no usable state is available: the file does not exist,
// cannot be read, or does not contain valid YAML. A missing file is the normal
// first-run case and is not logged; every other failure is logged so that a
// damaged state file does not go unnoticed.
func readLastScanFile(lastScanFile string) *PoolState {
	raw, err := ioutil.ReadFile(lastScanFile)
	if err != nil {
		if !os.IsNotExist(err) {
			log.Printf("could not read %s: %v", lastScanFile, err)
		}
		return nil
	}

	var poolState *PoolState
	if err := yaml.Unmarshal(raw, &poolState); err != nil {
		// Do not hand a partially decoded state to the caller.
		log.Printf("could not parse %s: %v", lastScanFile, err)
		return nil
	}
	return poolState
}
// writeLastScanFile serializes the pool state as YAML and writes it to
// lastScanFile, preceded by a comment line carrying today's date.
//
// Failures are logged, not returned. If the state cannot be serialized the
// existing file is left untouched rather than being replaced by a file that
// contains only the header.
func (s *PoolState) writeLastScanFile(lastScanFile string) {
	body, err := yaml.Marshal(s)
	if err != nil {
		log.Printf("could not serialize pool state: %v", err)
		return
	}

	header := "# Scan history from " + time.Now().Format(DATE_FORMAT) + "\n\n"
	if err := ioutil.WriteFile(lastScanFile, append([]byte(header), body...), 0644); err != nil {
		log.Println(err)
	}
}
// checkPool takes a snapshot of the pool, compares it with the snapshot stored
// in lastScanFile and returns one message per noteworthy change:
//
//   - the device error score has increased,
//   - a scan has finished since the previous run,
//   - the fill level has crossed spaceWarn (in percent) since the previous run.
//
// If the pool cannot be inspected, the single returned message describes the
// error and the stored snapshot is left alone. Otherwise the new snapshot
// replaces the stored one. On the first run (no stored snapshot) no
// comparison is made. The returned slice is never nil.
func checkPool(poolName string, lastScanFile string, spaceWarn float64) []string {
	poolState, err := getPoolState(poolName)
	if err != nil {
		msg := fmt.Sprintf("Could not open %s. Error: %s", poolName, err.Error())
		log.Println(msg)
		return []string{msg}
	}

	lastState := readLastScanFile(lastScanFile)
	msgs := []string{}

	if lastState == nil {
		log.Println("This is your first scan. Creating last_scan file.")
	} else {
		// Compare scan data to last state
		if poolState.numDeviceErrors() > lastState.numDeviceErrors() {
			log.Println("Device errors occurred")
			overviewBts, err := yaml.Marshal(poolState.Devices)
			if err != nil {
				log.Printf("could not render device overview: %v", err)
			}
			msg := fmt.Sprintf("Oh no! Looks like %s has encountered device errors.\n"+
				"Here is an overview of all devices and their statuses:\n%s",
				poolName, string(overviewBts))
			msgs = append(msgs, msg)
		}

		// Check if new scan has completed. A stored snapshot without a scan
		// section is treated as "no scan finished yet" instead of crashing.
		var lastFinished uint64
		if lastState.Scan != nil {
			lastFinished = lastState.Scan.LastFinished
		}
		if poolState.Scan.LastFinished > lastFinished {
			timeStr := time.Unix(int64(poolState.Scan.LastFinished), 0).Format(TIME_FORMAT)
			duration := time.Second * time.Duration(poolState.Scan.Duration)
			durationStr, err := durationfmt.Format(duration, "%0h:%0m:%0s")
			if err != nil {
				// Fall back to the standard rendering rather than an empty string.
				durationStr = duration.String()
			}
			log.Printf("%s on %s has finished at %s, took %s, %d errors",
				poolState.Scan.Func, poolName, timeStr, durationStr, poolState.Scan.Errors)
			msg := fmt.Sprintf("%s on %s has finished at %s\nDuration: %s\nErrors: %d",
				poolState.Scan.Func, poolName, timeStr, durationStr, poolState.Scan.Errors)
			msgs = append(msgs, msg)
		}

		// Check if fill level has reached the threshold
		if lastState.FillLevel <= spaceWarn && poolState.FillLevel > spaceWarn {
			// Report the space that is free now, not what was free last time.
			byteStr := bytefmt.ByteSize(poolState.FreeSpace) + "B"
			msg := fmt.Sprintf("Disk space warning for %s - only %s left", poolName, byteStr)
			log.Println(msg)
			msgs = append(msgs, msg)
		}
	}

	poolState.writeLastScanFile(lastScanFile)
	return msgs
}
// sendMsgs delivers every message in msgs as a separate plain-text mail.
//
// For each message it runs the sendmail program with msgUri as its only
// argument and writes the mail (headers, an empty line, then the message) to
// the program's standard input. The subject carries the local hostname, or
// "unknown" when the hostname cannot be determined. Failures are logged and do
// not stop the remaining messages from being sent.
func sendMsgs(sendmail string, msgUri string, msgs []string) {
	// A failed lookup yields an empty name, which the fallback below covers.
	hostname, _ := os.Hostname()
	if hostname == "" {
		hostname = "unknown"
	}

	// The trailing empty line terminates the header section; without it the
	// first line of the message would be read as part of the headers.
	mailHeader := fmt.Sprintf("Subject:[%s] zfsmon status update\n"+
		"MIME-Version: 1.0\n"+
		"Content-Type: text/plain; charset=utf-8\n"+
		"\n", hostname)

	for _, msg := range msgs {
		cmd := exec.Command(sendmail, msgUri)
		cmd.Stdin = strings.NewReader(mailHeader + msg)
		if err := cmd.Run(); err != nil {
			log.Println(err)
			continue
		}
		log.Printf("sent mail to %s", msgUri)
	}
}
// main parses the command line, redirects logging to the log file (unless the
// log path is empty), checks the pool once and mails the resulting messages
// when a mail address was given.
func main() {
	// Root keeps its files in /etc/zfsmon; for any other user the default
	// paths are relative to the current directory.
	var workDir string
	if os.Getuid() == 0 {
		workDir = "/etc/zfsmon"
	}
	defaultScanFile := filepath.Join(workDir, LAST_SCAN_FILE)
	defaultLogFile := filepath.Join(workDir, LOG_FILE)

	// Cmdline args
	poolName := flag.String("pool", "tank", "ZFS pool name")
	sendmail := flag.String("sendmail", "sendmail", "Sendmail path")
	mailAddress := flag.String("m", "", "E-Mail address")
	lastScanFile := flag.String("f", defaultScanFile, "File to store last scan results")
	logFile := flag.String("log", defaultLogFile, "Log file")
	spaceWarn := flag.Int("space", POOL_SPACE_WARN, "Pool fill level in percent to warn at")
	flag.Parse()

	// Log file
	if logPath := *logFile; logPath != "" {
		f, err := os.OpenFile(logPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
		if err != nil {
			log.Fatalf("error opening logfile: %v", err)
		}
		defer f.Close()
		log.SetOutput(f)
	}

	msgs := checkPool(*poolName, *lastScanFile, float64(*spaceWarn))

	if *mailAddress == "" {
		log.Println("mail address not set, messaging disabled")
		return
	}
	sendMsgs(*sendmail, *mailAddress, msgs)
}