// Source: stillbox/pkg/gordio/alerting/alerting.go

package alerting
import (
2024-10-31 00:10:53 -04:00
"bytes"
"context"
2024-10-31 00:10:53 -04:00
"fmt"
"net/http"
"sort"
2024-10-31 00:10:53 -04:00
"strconv"
"sync"
2024-10-31 00:10:53 -04:00
"text/template"
"time"
cl "dynatron.me/x/stillbox/pkg/calls"
"dynatron.me/x/stillbox/pkg/gordio/config"
"dynatron.me/x/stillbox/pkg/gordio/database"
2024-10-31 00:10:53 -04:00
"dynatron.me/x/stillbox/pkg/gordio/notify"
"dynatron.me/x/stillbox/pkg/gordio/sinks"
"dynatron.me/x/stillbox/internal/timeseries"
"dynatron.me/x/stillbox/internal/trending"
2024-11-01 09:15:39 -04:00
"github.com/google/uuid"
"github.com/jackc/pgx/v5/pgtype"
"github.com/rs/zerolog/log"
)
const (
	// ScoreThreshold is the score threshold handed to the trending scorer.
	ScoreThreshold = -1
	// CountThreshold is the count threshold handed to the trending scorer.
	CountThreshold = 1.0

	// NotificationSubject is the subject used for every alert notification.
	NotificationSubject = "Stillbox Alert"
	// DefaultRenotify is the renotify interval used when the configuration
	// does not provide one.
	DefaultRenotify = 30 * time.Minute

	// alerterTickInterval is the period of the alerting loop's ticker.
	alerterTickInterval = time.Minute
)
// Alerter is the interface returned by New. It is a call sink that also
// embeds the package's stats interface.
type Alerter interface {
	sinks.Sink
	stats

	// Go runs the alerting loop in the calling goroutine.
	Go(context.Context)
	// Enabled reports whether alerting is active.
	Enabled() bool
}
type alerter struct {
sync.RWMutex
2024-11-01 09:15:39 -04:00
clock timeseries.Clock
cfg config.Alerting
scorer trending.Scorer[cl.Talkgroup]
scores trending.Scores[cl.Talkgroup]
lastScore time.Time
sim *Simulation
alertCache map[cl.Talkgroup]Alert
renotify time.Duration
notifier notify.Notifier
2024-11-02 11:39:02 -04:00
tgCache cl.TalkgroupCache
}
// offsetClock is a clock that runs a fixed duration away from the wall clock.
//
// Its methods use value receivers so that both offsetClock and *offsetClock
// have Now and Duration in their method sets; the value returned by
// OffsetClock can therefore be used directly wherever a Now() method is
// required.
type offsetClock time.Duration

// Now returns the current wall-clock time shifted by the clock's offset.
func (c offsetClock) Now() time.Time {
	return time.Now().Add(c.Duration())
}

// Duration returns the clock's offset from the wall clock.
func (c offsetClock) Duration() time.Duration {
	return time.Duration(c)
}

// OffsetClock returns a clock whose Now() method returns the specified offset from the current time.
func OffsetClock(d time.Duration) offsetClock {
	return offsetClock(d)
}
// AlertOption is a functional option for New; each option receives the
// alerter under construction and may modify it.
type AlertOption func(*alerter)
2024-10-30 09:49:45 -04:00
// WithClock returns an option that replaces the alerter's clock, for example
// with a simulated one.
func WithClock(clock timeseries.Clock) AlertOption {
	return AlertOption(func(a *alerter) { a.clock = clock })
}
2024-10-31 00:10:53 -04:00
// WithNotifier returns an option that sets the notifier used to deliver
// alert notifications.
func WithNotifier(n notify.Notifier) AlertOption {
	return AlertOption(func(a *alerter) { a.notifier = n })
}
2024-10-30 09:49:45 -04:00
// New creates a new Alerter using the provided configuration.
2024-11-02 11:39:02 -04:00
func New(cfg config.Alerting, tgCache cl.TalkgroupCache, opts ...AlertOption) Alerter {
if !cfg.Enable {
return &noopAlerter{}
}
as := &alerter{
2024-11-01 09:15:39 -04:00
cfg: cfg,
alertCache: make(map[cl.Talkgroup]Alert),
clock: timeseries.DefaultClock,
renotify: DefaultRenotify,
2024-11-02 11:39:02 -04:00
tgCache: tgCache,
2024-10-31 00:10:53 -04:00
}
if cfg.Renotify != nil {
as.renotify = cfg.Renotify.Duration()
}
for _, opt := range opts {
opt(as)
}
2024-11-02 09:41:48 -04:00
as.scorer = trending.NewScorer(
trending.WithTimeSeries(as.newTimeSeries),
trending.WithStorageDuration[cl.Talkgroup](time.Hour*24*time.Duration(cfg.LookbackDays)),
2024-10-30 09:49:45 -04:00
trending.WithRecentDuration[cl.Talkgroup](time.Duration(cfg.Recent)),
trending.WithHalfLife[cl.Talkgroup](time.Duration(cfg.HalfLife)),
trending.WithScoreThreshold[cl.Talkgroup](ScoreThreshold),
trending.WithCountThreshold[cl.Talkgroup](CountThreshold),
trending.WithClock[cl.Talkgroup](as.clock),
)
return as
}
2024-10-30 09:49:45 -04:00
// Go is the alerting loop. It does not start a goroutine.
func (as *alerter) Go(ctx context.Context) {
2024-10-31 16:50:08 -04:00
err := as.startBackfill(ctx)
if err != nil {
log.Error().Err(err).Msg("backfill")
}
2024-10-31 16:14:38 -04:00
as.score(time.Now())
ticker := time.NewTicker(alerterTickInterval)
for {
select {
case now := <-ticker.C:
2024-10-31 16:14:38 -04:00
as.score(now)
2024-10-31 00:10:53 -04:00
err := as.notify(ctx)
if err != nil {
log.Error().Err(err).Msg("notify")
}
as.cleanCache()
case <-ctx.Done():
ticker.Stop()
return
}
}
}
2024-10-31 00:10:53 -04:00
const notificationTemplStr = `{{ range . -}}
2024-10-31 16:14:38 -04:00
{{ .TGName }} is active with a score of {{ f .Score.Score 4 }}! ({{ f .Score.RecentCount 0 }}/{{ .Score.Count }} recent calls)
{{ end -}}`
2024-10-31 00:10:53 -04:00
var notificationTemplate = template.Must(template.New("notification").Funcs(funcMap).Parse(notificationTemplStr))
2024-11-02 11:39:02 -04:00
func (as *alerter) eval(ctx context.Context, now time.Time, testMode bool) ([]Alert, error) {
err := as.tgCache.Hint(ctx, as.scoredTGs())
2024-11-02 09:41:48 -04:00
if err != nil {
2024-11-02 11:39:02 -04:00
return nil, fmt.Errorf("prime TG cache: %w", err)
2024-11-02 09:41:48 -04:00
}
db := database.FromCtx(ctx)
var notifications []Alert
for _, s := range as.scores {
2024-11-02 11:39:02 -04:00
origScore := s.Score
tgr, has := as.tgCache.TG(ctx, s.ID)
2024-11-02 09:41:48 -04:00
if has {
if !tgr.Alert {
continue
}
s.Score *= float64(tgr.Weight)
}
2024-11-02 11:39:02 -04:00
if s.Score > as.cfg.AlertThreshold || testMode {
2024-11-02 09:41:48 -04:00
if old, inCache := as.alertCache[s.ID]; !inCache || now.Sub(old.Timestamp) > as.renotify {
2024-11-02 14:43:47 -04:00
s.Score = as.tgCache.ApplyAlertRules(s, now)
2024-11-02 11:39:02 -04:00
a, err := as.makeAlert(ctx, s, origScore)
2024-11-02 09:41:48 -04:00
if err != nil {
return nil, fmt.Errorf("makeAlert: %w", err)
}
2024-11-02 14:26:58 -04:00
if s.Score < as.cfg.AlertThreshold {
a.Suppressed = true
}
2024-11-02 09:41:48 -04:00
as.alertCache[s.ID] = a
2024-11-02 11:39:02 -04:00
if !testMode {
2024-11-02 09:41:48 -04:00
err = db.AddAlert(ctx, a.ToAddAlertParams())
if err != nil {
return nil, fmt.Errorf("addAlert: %w", err)
}
}
2024-11-02 14:26:58 -04:00
if !a.Suppressed {
notifications = append(notifications, a)
}
2024-11-02 09:41:48 -04:00
}
}
}
2024-11-02 11:39:02 -04:00
return notifications, nil
2024-11-02 09:41:48 -04:00
}
2024-10-31 00:10:53 -04:00
func (as *alerter) testNotifyHandler(w http.ResponseWriter, r *http.Request) {
as.RLock()
defer as.RUnlock()
2024-11-01 09:15:39 -04:00
alerts := make([]Alert, 0, len(as.scores))
2024-10-31 00:10:53 -04:00
ctx := r.Context()
2024-11-02 11:39:02 -04:00
alerts, err := as.eval(ctx, time.Now(), true)
2024-10-31 16:14:38 -04:00
if err != nil {
2024-11-02 11:39:02 -04:00
log.Error().Err(err).Msg("test notification eval")
2024-10-31 16:14:38 -04:00
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
2024-11-01 09:15:39 -04:00
err = as.sendNotification(ctx, alerts)
2024-10-31 00:10:53 -04:00
if err != nil {
log.Error().Err(err).Msg("test notification send")
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
2024-10-31 16:50:08 -04:00
_, _ = w.Write([]byte("Sent"))
2024-10-31 00:10:53 -04:00
}
2024-11-02 11:39:02 -04:00
// scoredTGs returns the talkgroup of every entry in as.scores, in order.
// It takes no lock.
func (as *alerter) scoredTGs() []cl.Talkgroup {
	tgs := make([]cl.Talkgroup, len(as.scores))
	for i := range as.scores {
		tgs[i] = as.scores[i].ID
	}
	return tgs
}
// packedScoredTGs gets a list of packed TGIDs.
2024-10-31 16:14:38 -04:00
func (as *alerter) packedScoredTGs() []int64 {
2024-11-02 11:39:02 -04:00
tgs := make([]int64, 0, len(as.scores))
2024-10-31 16:14:38 -04:00
for _, s := range as.scores {
2024-11-02 11:39:02 -04:00
tgs = append(tgs, s.ID.Pack())
2024-10-31 16:14:38 -04:00
}
2024-11-02 11:39:02 -04:00
return tgs
2024-10-31 16:14:38 -04:00
}
2024-10-31 00:10:53 -04:00
// notify iterates the scores and sends out any necessary notifications
func (as *alerter) notify(ctx context.Context) error {
if as.notifier == nil {
return nil
}
as.Lock()
defer as.Unlock()
2024-11-02 11:39:02 -04:00
notifications, err := as.eval(ctx, time.Now(), false)
2024-10-31 16:14:38 -04:00
if err != nil {
return err
}
2024-10-31 00:10:53 -04:00
if len(notifications) > 0 {
return as.sendNotification(ctx, notifications)
}
return nil
}
2024-11-01 09:15:39 -04:00
type Alert struct {
2024-11-02 14:26:58 -04:00
ID uuid.UUID
Timestamp time.Time
TGName string
Score trending.Score[cl.Talkgroup]
OrigScore float64
Weight float32
Suppressed bool
2024-11-01 09:15:39 -04:00
}
func (a *Alert) ToAddAlertParams() database.AddAlertParams {
f32score := float32(a.Score.Score)
2024-11-02 09:41:48 -04:00
f32origscore := float32(a.OrigScore)
var origScore *float32
if a.Score.Score != a.OrigScore {
origScore = &f32origscore
}
2024-11-01 09:15:39 -04:00
return database.AddAlertParams{
2024-11-02 11:39:02 -04:00
ID: a.ID,
Time: pgtype.Timestamptz{Time: a.Timestamp, Valid: true},
PackedTg: a.Score.ID.Pack(),
Weight: &a.Weight,
Score: &f32score,
OrigScore: origScore,
2024-11-02 14:26:58 -04:00
Notified: !a.Suppressed,
2024-11-01 09:15:39 -04:00
}
2024-10-31 00:10:53 -04:00
}
// sendNotification renders and sends the notification.
2024-11-01 09:15:39 -04:00
func (as *alerter) sendNotification(ctx context.Context, n []Alert) error {
2024-10-31 00:10:53 -04:00
msgBuffer := new(bytes.Buffer)
err := notificationTemplate.Execute(msgBuffer, n)
if err != nil {
return fmt.Errorf("notification template render: %w", err)
}
log.Debug().Str("msg", msgBuffer.String()).Msg("notifying")
return as.notifier.Send(ctx, NotificationSubject, msgBuffer.String())
}
2024-11-01 09:15:39 -04:00
// makeAlert creates a notification for later rendering by the template.
2024-10-31 00:10:53 -04:00
// It takes a talkgroup Score as input.
2024-11-02 11:39:02 -04:00
func (as *alerter) makeAlert(ctx context.Context, score trending.Score[cl.Talkgroup], origScore float64) (Alert, error) {
2024-11-01 09:15:39 -04:00
d := Alert{
ID: uuid.New(),
Score: score,
Timestamp: time.Now(),
Weight: 1.0,
2024-11-02 09:41:48 -04:00
OrigScore: origScore,
2024-10-31 00:10:53 -04:00
}
2024-11-02 11:39:02 -04:00
tgRecord, has := as.tgCache.TG(ctx, score.ID)
2024-10-31 16:14:38 -04:00
switch has {
case true:
2024-11-01 09:15:39 -04:00
d.Weight = tgRecord.Weight
2024-10-31 00:10:53 -04:00
if tgRecord.SystemName == "" {
2024-10-31 16:14:38 -04:00
tgRecord.SystemName = strconv.Itoa(int(score.ID.System))
2024-10-31 00:10:53 -04:00
}
if tgRecord.Name != nil {
2024-10-31 16:14:38 -04:00
d.TGName = fmt.Sprintf("%s %s (%d)", tgRecord.SystemName, *tgRecord.Name, score.ID.Talkgroup)
2024-10-31 00:10:53 -04:00
} else {
2024-10-31 16:14:38 -04:00
d.TGName = fmt.Sprintf("%s:%d", tgRecord.SystemName, int(score.ID.Talkgroup))
2024-10-31 00:10:53 -04:00
}
2024-10-31 16:14:38 -04:00
case false:
2024-11-02 11:39:02 -04:00
system, has := as.tgCache.SystemName(ctx, int(score.ID.System))
2024-10-31 16:14:38 -04:00
if has {
d.TGName = fmt.Sprintf("%s:%d", system, int(score.ID.Talkgroup))
} else {
d.TGName = fmt.Sprintf("%d:%d", int(score.ID.System), int(score.ID.Talkgroup))
2024-10-31 00:10:53 -04:00
}
}
return d, nil
}
// cleanCache clears the cache of aged-out entries
func (as *alerter) cleanCache() {
if as.notifier == nil {
return
}
now := time.Now()
as.Lock()
defer as.Unlock()
2024-11-01 09:15:39 -04:00
for k, a := range as.alertCache {
if now.Sub(a.Timestamp) > as.renotify {
delete(as.alertCache, k)
2024-10-31 00:10:53 -04:00
}
}
}
// newTimeSeries builds the time series the scorer uses for a talkgroup. The
// series uses the alerter's clock and keeps second, minute, hour and day
// granularities, the last one sized by cfg.LookbackDays.
//
// The signature has no error return, so a construction failure is logged and
// whatever NewTimeSeries returned is passed through unchanged.
func (as *alerter) newTimeSeries(id cl.Talkgroup) trending.TimeSeries {
	granularities := []timeseries.Granularity{
		{Granularity: time.Second, Count: 60},
		{Granularity: time.Minute, Count: 10},
		{Granularity: time.Hour, Count: 24},
		{Granularity: time.Hour * 24, Count: int(as.cfg.LookbackDays)},
	}

	ts, err := timeseries.NewTimeSeries(
		timeseries.WithGranularities(granularities),
		timeseries.WithClock(as.clock),
	)
	if err != nil {
		log.Error().Err(err).Msg("new time series")
	}

	return ts
}
2024-10-31 00:10:53 -04:00
func (as *alerter) startBackfill(ctx context.Context) error {
now := time.Now()
since := now.Add(-24 * time.Hour * time.Duration(as.cfg.LookbackDays))
log.Debug().Time("since", since).Msg("starting stats backfill")
2024-10-30 09:49:45 -04:00
count, err := as.backfill(ctx, since, now)
if err != nil {
2024-10-31 16:50:08 -04:00
return err
}
2024-10-31 16:50:08 -04:00
log.Debug().Int("callsCount", count).Str("in", time.Since(now).String()).Int("tgCount", as.scorer.Score().Len()).Msg("backfill finished")
2024-10-31 00:10:53 -04:00
return nil
}
2024-10-31 16:14:38 -04:00
// score recomputes the scores from the scorer, sorts them, and records now as
// the time of the last scoring, all under the write lock.
func (as *alerter) score(now time.Time) {
	as.Lock()
	defer as.Unlock()

	scores := as.scorer.Score()
	as.scores, as.lastScore = scores, now
	sort.Sort(as.scores)
}
2024-10-30 09:49:45 -04:00
func (as *alerter) backfill(ctx context.Context, since time.Time, until time.Time) (count int, err error) {
db := database.FromCtx(ctx)
2024-10-30 09:49:45 -04:00
const backfillStatsQuery = `SELECT system, talkgroup, call_date FROM calls WHERE call_date > $1 AND call_date < $2 ORDER BY call_date ASC`
2024-10-30 09:49:45 -04:00
rows, err := db.Query(ctx, backfillStatsQuery, since, until)
if err != nil {
return count, err
}
defer rows.Close()
as.Lock()
defer as.Unlock()
for rows.Next() {
var tg cl.Talkgroup
var callDate time.Time
if err := rows.Scan(&tg.System, &tg.Talkgroup, &callDate); err != nil {
return count, err
}
as.scorer.AddEvent(tg, callDate)
2024-10-30 09:49:45 -04:00
if as.sim != nil { // step the simulator if it is active
as.sim.stepClock(callDate)
}
count++
}
if err := rows.Err(); err != nil {
return count, err
}
return count, nil
}
// SinkType returns the sink type name of the alerter, "alerting".
func (as *alerter) SinkType() string { return "alerting" }
// Call records the call as an event for its talkgroup in the scorer, under
// the write lock. It always returns nil.
func (as *alerter) Call(ctx context.Context, call *cl.Call) error {
	as.Lock()
	defer as.Unlock()

	tg, when := call.TalkgroupTuple(), call.DateTime
	as.scorer.AddEvent(tg, when)

	return nil
}
// Enabled always reports true for the real alerter.
func (*alerter) Enabled() bool {
	return true
}
2024-10-30 09:49:45 -04:00
// noopAlerter is used when alerting is disabled. Every method does nothing.
type noopAlerter struct{}

// SinkType returns the sink type name, "noopAlerter".
func (*noopAlerter) SinkType() string {
	return "noopAlerter"
}

// Call ignores the call and returns nil.
func (*noopAlerter) Call(_ context.Context, _ *cl.Call) error {
	return nil
}

// Go returns immediately.
func (*noopAlerter) Go(_ context.Context) {
}

// Enabled always reports false.
func (*noopAlerter) Enabled() bool {
	return false
}