Creating Watcher Alerts for Machine Learning jobs

This appendix describes how to create watcher alerts for machine learning jobs.

Watcher Alert Workaround

DMF 8.1 uses Elasticsearch 7.2.0, in which the inter-container function calls were HTTP-based. DMF 8.3 uses Elasticsearch 7.13.0, which requires these calls to be HTTPS-based. Supporting this requires extensive changes to the system calls used by the Analytics Node, and engineering is working on this effort. Until those fixes are released, Arista recommends the workaround below.

Workaround Summary: Create a Watcher manually using the provided template. Configure the Watcher to use the job ID of the ML job that needs to send alerts. Use ‘webhook’ as the alerting mechanism within the Watcher to send alerts to third-party tools such as ‘Slack’.

  1. Access the ML job page on the AN and then click Manage Jobs to list the ML jobs.
  2. If the data feed column shows as stopped, skip to Step 3. If it says started, click the 3 dots for a particular ML job and Stop the data feed for the current ML job.
    Figure 1. Stop Data Feed
  3. After the data feed has stopped, click the 3 dots and start the data feed.
    Figure 2. Start Data Feed
  4. Select the options as shown in the diagram below.
    Figure 3. Job Time Options
  5. Confirm that the data feed has started. Note down the job id of this ML job.
    Figure 4. ML Job Characteristics
  6. Access the Watchers page.
    Figure 5. Access Watchers
  7. Create an advanced Watcher.
    Figure 6. Create Advanced Watcher
  8. Configure the name of the watcher (can include whitespace characters), e.g., Latency ML.
  9. Configure the ID of the watcher (can be alphanumeric, but without whitespace characters), e.g., ml_latency.
  10. Delete the code from the Watch JSON section.
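  11. Copy and paste the following code into the watcher. Replace the placeholder values (highlighted in the original guide) according to your environment and your ML job parameters: the empty "job_id" value (use the job ID noted in Step 5), the empty webhook "path" value, and the channel name following "#" in the webhook "body".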
    {
      "trigger": {
        "schedule": {
          "interval": "107s"
        }
      },
      "input": {
        "search": {
          "request": {
            "search_type": "query_then_fetch",
            "indices": [
              ".ml-anomalies-*"
            ],
            "rest_total_hits_as_int": true,
            "body": {
              "size": 0,
              "query": {
                "bool": {
                  "filter": [
                    {
                      "term": {
                        "job_id": ""
                      }
                    },
                    {
                      "range": {
                        "timestamp": {
                          "gte": "now-30m"
                        }
                      }
                    },
                    {
                      "terms": {
                        "result_type": [
                          "bucket",
                          "record",
                          "influencer"
                        ]
                      }
                    }
                  ]
                }
              },
              "aggs": {
                "bucket_results": {
                  "filter": {
                    "range": {
                      "anomaly_score": {
                        "gte": 75
                      }
                    }
                  },
                  "aggs": {
                    "top_bucket_hits": {
                      "top_hits": {
                        "sort": [
                          {
                            "anomaly_score": {
                              "order": "desc"
                            }
                          }
                        ],
                        "_source": {
                          "includes": [
                            "job_id",
                            "result_type",
                            "timestamp",
                            "anomaly_score",
                            "is_interim"
                          ]
                        },
                        "size": 1,
                        "script_fields": {
                          "start": {
                            "script": {
                              "lang": "painless",
                              "source": "LocalDateTime.ofEpochSecond((doc[\"timestamp\"].value.getMillis()-((doc[\"bucket_span\"].value * 1000)\n * params.padding)) / 1000, 0,ZoneOffset.UTC).toString()+\":00.000Z\"",
                              "params": {
                                "padding": 10
                              }
                            }
                          },
                          "end": {
                            "script": {
                              "lang": "painless",
                              "source": "LocalDateTime.ofEpochSecond((doc[\"timestamp\"].value.getMillis()+((doc[\"bucket_span\"].value * 1000)\n * params.padding)) / 1000, 0,ZoneOffset.UTC).toString()+\":00.000Z\"",
                              "params": {
                                "padding": 10
                              }
                            }
                          },
                          "timestamp_epoch": {
                            "script": {
                              "lang": "painless",
                              "source": """doc["timestamp"].value.getMillis()/1000"""
                            }
                          },
                          "timestamp_iso8601": {
                            "script": {
                              "lang": "painless",
                              "source": """doc["timestamp"].value"""
                            }
                          },
                          "score": {
                            "script": {
                              "lang": "painless",
                              "source": """Math.round(doc["anomaly_score"].value)"""
                            }
                          }
                        }
                      }
                    }
                  }
                },
                "influencer_results": {
                  "filter": {
                    "range": {
                      "influencer_score": {
                        "gte": 3
                      }
                    }
                  },
                  "aggs": {
                    "top_influencer_hits": {
                      "top_hits": {
                        "sort": [
                          {
                            "influencer_score": {
                              "order": "desc"
                            }
                          }
                        ],
                        "_source": {
                          "includes": [
                            "result_type",
                            "timestamp",
                            "influencer_field_name",
                            "influencer_field_value",
                            "influencer_score",
                            "is_interim"
                          ]
                        },
                        "size": 3,
                        "script_fields": {
                          "score": {
                            "script": {
                              "lang": "painless",
                              "source": """Math.round(doc["influencer_score"].value)"""
                            }
                          }
                        }
                      }
                    }
                  }
                },
                "record_results": {
                  "filter": {
                    "range": {
                      "record_score": {
                        "gte": 75
                      }
                    }
                  },
                  "aggs": {
                    "top_record_hits": {
                      "top_hits": {
                        "sort": [
                          {
                            "record_score": {
                              "order": "desc"
                            }
                          }
                        ],
                        "_source": {
                          "includes": [
                            "result_type",
                            "timestamp",
                            "record_score",
                            "is_interim",
                            "function",
                            "field_name",
                            "by_field_value",
                            "over_field_value",
                            "partition_field_value"
                          ]
                        },
                        "size": 3,
                        "script_fields": {
                          "score": {
                            "script": {
                              "lang": "painless",
                              "source": """Math.round(doc["record_score"].value)"""
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      },
      "condition": {
        "compare": {
          "ctx.payload.aggregations.bucket_results.doc_count": {
            "gt": 0
          }
        }
      },
      "actions": {
        "log": {
          "logging": {
            "level": "info",
            "text": "Alert for job [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0._source.job_id}}] at [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0.fields.timestamp_iso8601.0}}] score [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0.fields.score.0}}]"
          }
        },
        "my_webhook": {
          "webhook": {
            "scheme": "https",
            "host": "hooks.slack.com",
            "port": 443,
            "method": "post",
            "path": "",
            "params": {},
            "headers": {
              "Content-Type": "application/json"
            },
            "body": """{"channel": "#", "username": "webhookbot", "text":"Alert for job [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0._source.job_id}}] at [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0.fields.timestamp_iso8601.0}}] score [{{ctx.payload.aggregations.bucket_results.top_bucket_hits.hits.hits.0.fields.score.0}}]", "icon_emoji": ":exclamation:"}"""
          }
        }
      }
    }
  12. Click Create Watch to create the watcher.