package main import ( "flag" "fmt" "io/ioutil" "log" "os" "os/exec" "path/filepath" "strings" "time" "code.cloudfoundry.org/bytefmt" zfs "github.com/bicomsystems/go-libzfs" "github.com/davidscholberg/go-durationfmt" "gopkg.in/yaml.v2" ) const LAST_SCAN_FILE = "last_scan.yaml" const LOG_FILE = "zfsmon.log" const POOL_SPACE_WARN = 90 const DATE_FORMAT = "02.01.2006" const TIME_FORMAT = "02.01.2006 15:04:05" type ScanStats struct { Func string State uint64 Errors uint64 Duration uint64 LastFinished uint64 } type DeviceState struct { State string ReadErrors uint64 `yaml:"read_errors"` WriteErrors uint64 `yaml:"write_errors"` ChecksumErrors uint64 `yaml:"checksum_errors"` } type PoolState struct { Devices map[string]DeviceState Scan *ScanStats TotalSpace uint64 `yaml:"total_space"` FreeSpace uint64 `yaml:"free_space"` FillLevel float64 `yaml:"fill_level"` } func getStateOfDevices(tree zfs.VDevTree, data map[string]DeviceState) { data[tree.Name] = DeviceState{ State: tree.Stat.State.String(), ReadErrors: tree.Stat.ReadErrors, WriteErrors: tree.Stat.WriteErrors, ChecksumErrors: tree.Stat.ChecksumErrors, } if tree.Devices != nil { for _, subtree := range tree.Devices { getStateOfDevices(subtree, data) } } } func getScanFunc(scanFunc uint64) string { switch scanFunc { case zfs.PoolScanScrub: return "Scrub" case zfs.PoolScanResilver: return "Resilver" default: return "None" } } func getScanStats(scanStat zfs.PoolScanStat) *ScanStats { return &ScanStats{ Func: getScanFunc(scanStat.Func), State: scanStat.State, Errors: scanStat.Errors, Duration: scanStat.EndTime - scanStat.StartTime, LastFinished: scanStat.EndTime, } } func getPoolState(poolName string) (*PoolState, error) { pool, err := zfs.PoolOpen(poolName) if err != nil { return nil, err } tree, err := pool.VDevTree() if err != nil { return nil, err } deviceData := map[string]DeviceState{} getStateOfDevices(tree, deviceData) scanStats := getScanStats(tree.ScanStat) level := float64(tree.Stat.Alloc) / float64(tree.Stat.Space) * 100.0 return &PoolState{ Devices: deviceData, Scan: scanStats, TotalSpace: tree.Stat.Space, FreeSpace: tree.Stat.Space - tree.Stat.Alloc, FillLevel: level, }, nil } func (s *PoolState) numDeviceErrors() uint64 { var n uint64 for _, dev := range s.Devices { n += (dev.ReadErrors + dev.WriteErrors + dev.ChecksumErrors) if dev.State != "ONLINE" { n++ } } return n } func readLastScanFile(lastScanFile string) *PoolState { historyFile, err := os.Open(lastScanFile) if err != nil { return nil } defer historyFile.Close() byteValue, _ := ioutil.ReadAll(historyFile) var poolState *PoolState _ = yaml.Unmarshal(byteValue, &poolState) return poolState } func (s *PoolState) writeLastScanFile(lastScanFile string) { data, _ := yaml.Marshal(s) data = append([]byte("# Scan history from "+time.Now().Format(DATE_FORMAT)+"\n\n"), data...) err := ioutil.WriteFile(lastScanFile, data, 0644) if err != nil { log.Println(err) } } func checkPool(poolName string, lastScanFile string, spaceWarn float64) []string { poolState, err := getPoolState(poolName) if err != nil { msg := fmt.Sprintf("Could not open %s. Error: %s", poolName, err.Error()) log.Println(msg) return []string{msg} } lastState := readLastScanFile(lastScanFile) msgs := []string{} if lastState != nil { // Compare scan data to last state if poolState.numDeviceErrors() > lastState.numDeviceErrors() { log.Println("Device errors occurred") overviewBts, _ := yaml.Marshal(poolState.Devices) msg := fmt.Sprintf(`Oh no! Looks like %s has encountered device errors. Here is an overview of all devices and their statuses: %s`, poolName, string(overviewBts)) msgs = append(msgs, msg) } // Check if new scan has completed if poolState.Scan.LastFinished > lastState.Scan.LastFinished { timeStr := time.Unix(int64(poolState.Scan.LastFinished), 0).Format(TIME_FORMAT) durationStr, _ := durationfmt.Format(time.Second*time.Duration(poolState.Scan.Duration), "%0h:%0m:%0s") log.Printf("%s on %s has finished at %s, took %s, %d errors", poolState.Scan.Func, poolName, timeStr, durationStr, poolState.Scan.Errors) msg := fmt.Sprintf("%s on %s has finished at %s\nDuration: %s\nErrors: %d", poolState.Scan.Func, poolName, timeStr, durationStr, poolState.Scan.Errors) msgs = append(msgs, msg) } // Check if fill level has reached the threshold if lastState.FillLevel <= spaceWarn && poolState.FillLevel > spaceWarn { byteStr := bytefmt.ByteSize(lastState.FreeSpace) + "B" msg := fmt.Sprintf("Disk space warning for %s - only %s left", poolName, byteStr) log.Println(msg) msgs = append(msgs, msg) } } else { log.Println("This is your first scan. Creating last_scan file.") } poolState.writeLastScanFile(lastScanFile) return msgs } func sendMsgs(sendmail string, msgUri string, msgs []string) { hostname, _ := os.Hostname() if hostname == "" { hostname = "unknown" } mailHeader := fmt.Sprintf(`Subject:[%s] zfsmon status update MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 `, hostname) for _, msg := range msgs { cmd := exec.Command(sendmail, msgUri) cmd.Stdin = strings.NewReader(mailHeader + msg) err := cmd.Run() if err != nil { log.Println(err) } else { log.Printf("sent mail to %s", msgUri) } } } func main() { workDir := "" if os.Getuid() == 0 { workDir = "/etc/zfsmon" } // Cmdline args poolName := flag.String("pool", "tank", "ZFS pool name") sendmail := flag.String("sendmail", "sendmail", "Sendmail path") mailAddress := flag.String("m", "", "E-Mail address") lastScanFile := flag.String("f", filepath.Join(workDir, LAST_SCAN_FILE), "File to store last scan results") logFile := flag.String("log", filepath.Join(workDir, LOG_FILE), "Log file") spaceWarn := flag.Int("space", POOL_SPACE_WARN, "Pool fill level in percent to warn at") flag.Parse() // Log file if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { log.Fatalf("error opening logfile: %v", err) } defer f.Close() log.SetOutput(f) } msgs := checkPool(*poolName, *lastScanFile, float64(*spaceWarn)) if *mailAddress != "" { sendMsgs(*sendmail, *mailAddress, msgs) } else { log.Println("mail address not set, messaging disabled") } }