From 241fb9183f892943ea63853682ba6182cfa870fb Mon Sep 17 00:00:00 2001 From: Nikodemas Tuckus Date: Mon, 12 Jun 2023 09:52:03 +0200 Subject: [PATCH] Decommission cmsmon intelligence service --- .github/workflows/build-go-tools.yml | 27 +- src/go/MONIT/intelligence.go | 389 ------------------ src/go/intelligence/Makefile | 43 -- src/go/intelligence/README.md | 10 - src/go/intelligence/config.md | 265 ------------ src/go/intelligence/go.mod | 3 - src/go/intelligence/installation.md | 110 ----- src/go/intelligence/int_test/Makefile | 43 -- src/go/intelligence/int_test/test.go | 178 -------- src/go/intelligence/int_test/test_cases.json | 91 ---- src/go/intelligence/int_test/test_config.json | 109 ----- src/go/intelligence/int_test/test_wrapper.sh | 172 -------- src/go/intelligence/main.go | 100 ----- src/go/intelligence/models/models.go | 208 ---------- .../intelligence/pipeline/add_annotations.go | 244 ----------- .../pipeline/delete_old_silents.go | 88 ---- src/go/intelligence/pipeline/fetch_alert.go | 45 -- src/go/intelligence/pipeline/filter.go | 60 --- .../intelligence/pipeline/keyword_matching.go | 105 ----- src/go/intelligence/pipeline/ml_box.go | 36 -- src/go/intelligence/pipeline/preprocess.go | 91 ---- src/go/intelligence/pipeline/push_alert.go | 41 -- src/go/intelligence/pipeline/silence_alert.go | 109 ----- src/go/intelligence/testing.md | 27 -- src/go/intelligence/utils/utils.go | 364 ---------------- 25 files changed, 1 insertion(+), 2957 deletions(-) delete mode 100644 src/go/MONIT/intelligence.go delete mode 100644 src/go/intelligence/Makefile delete mode 100644 src/go/intelligence/README.md delete mode 100644 src/go/intelligence/config.md delete mode 100644 src/go/intelligence/go.mod delete mode 100644 src/go/intelligence/installation.md delete mode 100644 src/go/intelligence/int_test/Makefile delete mode 100644 src/go/intelligence/int_test/test.go delete mode 100644 src/go/intelligence/int_test/test_cases.json delete mode 100644 src/go/intelligence/int_test/test_config.json delete mode 100755 src/go/intelligence/int_test/test_wrapper.sh delete mode 100644 src/go/intelligence/main.go delete mode 100644 src/go/intelligence/models/models.go delete mode 100644 src/go/intelligence/pipeline/add_annotations.go delete mode 100644 src/go/intelligence/pipeline/delete_old_silents.go delete mode 100644 src/go/intelligence/pipeline/fetch_alert.go delete mode 100644 src/go/intelligence/pipeline/filter.go delete mode 100644 src/go/intelligence/pipeline/keyword_matching.go delete mode 100644 src/go/intelligence/pipeline/ml_box.go delete mode 100644 src/go/intelligence/pipeline/preprocess.go delete mode 100644 src/go/intelligence/pipeline/push_alert.go delete mode 100644 src/go/intelligence/pipeline/silence_alert.go delete mode 100644 src/go/intelligence/testing.md delete mode 100644 src/go/intelligence/utils/utils.go diff --git a/.github/workflows/build-go-tools.yml b/.github/workflows/build-go-tools.yml index 86860053..6a94fbac 100644 --- a/.github/workflows/build-go-tools.yml +++ b/.github/workflows/build-go-tools.yml @@ -31,11 +31,10 @@ jobs: go build -o datasources datasources.go go build -o ggus_alerting ggus_alerting.go go build -o ggus_parser ggus_parser.go - go build -o intelligence intelligence.go go build -o ssb_alerting ssb_alerting.go go build -o es_exporter es_exporter.go mv monit alert annotationManager datasources ggus_alerting \ - ggus_parser intelligence ssb_alerting es_exporter ../../../cmsmon-tools + ggus_parser ssb_alerting es_exporter ../../../cmsmon-tools cd ../NATS go build -o dbs_vm dbs_vm.go go build -o nats-pub nats-pub.go @@ -71,30 +70,6 @@ jobs: id: get_tag run: echo ::set-output name=tag::${GITHUB_REF/refs\/tags\//} - - name: Build cmsmon-int image - run: | - echo Image tag: ${{ steps.get_tag.outputs.tag }} - curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/cmsmon-intelligence/Dockerfile - sed -i -e "s,ENV CMSMON_TAG=.*,ENV CMSMON_TAG=${{steps.get_tag.outputs.tag}},g" Dockerfile - docker build . --tag docker.pkg.github.com/dmwm/cmsmon-int/cmsmon-int - docker tag docker.pkg.github.com/dmwm/cmsmon-int/cmsmon-int registry.cern.ch/cmsmonitoring/cmsmon-int - - - name: Login to registry.cern.ch - uses: docker/login-action@v1.6.0 - with: - registry: registry.cern.ch - username: ${{ secrets.CERN_LOGIN }} - password: ${{ secrets.CERN_TOKEN }} - - - name: Publish cmsmon-int image to registry.cern.ch - uses: docker/build-push-action@v1 - with: - username: ${{ secrets.CERN_LOGIN }} - password: ${{ secrets.CERN_TOKEN }} - registry: registry.cern.ch - repository: cmsmonitoring/cmsmon-int - tag_with_ref: true - - name: Build cmsmon-alerts image run: | echo Image tag: ${{ steps.get_tag.outputs.tag }} diff --git a/src/go/MONIT/intelligence.go b/src/go/MONIT/intelligence.go deleted file mode 100644 index f405885b..00000000 --- a/src/go/MONIT/intelligence.go +++ /dev/null @@ -1,389 +0,0 @@ -package main - -import ( - "bytes" - "encoding/json" - "errors" - "flag" - "fmt" - "io" - "log" - "net/http" - "net/http/httputil" - "net/url" - "os" - "regexp" - "time" -) - -// File : intelligence.go -// Author : Rahul Indra -// Created : Thu, 18 June 2020 13:02:19 GMT -// Description: CERN MONIT infrastructure Intelligence Module - -// --------MAPS---------- -// Map for storing alertData with instance as key. -var silenceMap map[string][]amJSON - -//--------MAPS---------- - -// -------STRUCTS--------- -// AlertManager API acceptable JSON Data for GGUS Data -type amJSON struct { - Labels map[string]interface{} `json:"labels"` - Annotations map[string]interface{} `json:"annotations"` - StartsAt time.Time `json:"startsAt"` - EndsAt time.Time `json:"endsAt"` -} - -type amData struct { - Data []amJSON -} - -// Alert CLI tool data struct (Tabular) -type alertData struct { - Name string - Service string - Tag string - Severity string - StartsAt time.Time - EndsAt time.Time -} - -// Array of alerts for alert CLI Tool (Tabular) -var allAlertData []alertData - -type matchers struct { - Name string `json:"name"` - Value string `json:"value"` -} - -type silenceData struct { - Matchers []matchers `json:"matchers"` - StartsAt time.Time `json:"startsAt"` - EndsAt time.Time `json:"endsAt"` - CreatedBy string `json:"createdBy"` - Comment string `json:"comment"` -} - -type config struct { - CMSMONURL string `json:"cmsmonURL"` - GetAlertsAPI string `json:"getAlertsAPI"` - PostSilenceAPI string `json:"postSilenceAPI"` - HttpTimeout int `json:"httpTimeout"` - Interval time.Duration `json:"interval"` - CreatedBy string `json:"createdBy"` - SeverityFilter string `json:"severityFilter"` - SearchingLabel string `json:"searchingLabel"` - UniqueLabel string `json:"uniqueLabel"` - Seperator string `json:"seperator"` - Comment string `json:"comment"` - Verbose int `json:"verbose"` -} - -var configJSON config - -//-------STRUCTS--------- - -// function for constructing and validating AM URL -func construct(baseURL, apiURL string) string { - - cmpltURL := baseURL + apiURL - - u, err := url.ParseRequestURI(cmpltURL) - if err != nil { - log.Fatalf("AlertManager API URL is not valid, error:%v", err) - } - - return u.String() -} - -// function for get request on /api/v1/alerts alertmanager endpoint for fetching alerts. -func get(data interface{}) error { - - //GET API for fetching all AM alerts. - apiurl := construct(configJSON.CMSMONURL, configJSON.GetAlertsAPI) - - req, reqErr := http.NewRequest("GET", apiurl, nil) - if reqErr != nil { - return reqErr - } - req.Header.Add("Accept-Encoding", "identity") - req.Header.Add("Accept", "application/json") - - timeout := time.Duration(configJSON.HttpTimeout) * time.Second - client := &http.Client{Timeout: timeout} - - if configJSON.Verbose > 1 { - dump, dumpErr := httputil.DumpRequestOut(req, true) - if dumpErr == nil { - log.Println("Request: ", string(dump)) - } - } - - resp, respErr := client.Do(req) - if respErr != nil { - return respErr - } else { - if resp.StatusCode != http.StatusOK { - log.Printf("Http Response Code Error, status code: %d", resp.StatusCode) - return errors.New("http Response Code Error") - } - } - defer resp.Body.Close() - - byteValue, bvErr := io.ReadAll(resp.Body) - - if bvErr != nil { - log.Printf("Unable to read JSON Data from AlertManager GET API, error: %v\n", bvErr) - return bvErr - } - - jsonErr := json.Unmarshal(byteValue, &data) - if jsonErr != nil { - if configJSON.Verbose > 0 { - log.Println(string(byteValue)) - } - log.Printf("Unable to parse JSON Data from AlertManager GET API, error: %v\n", jsonErr) - return jsonErr - } - - if configJSON.Verbose > 1 { - dump, dumpErr := httputil.DumpResponse(resp, true) - if dumpErr == nil { - log.Println("Response: ", string(dump)) - } - } - - return nil -} - -// Function for post request on /api/v1/silences alertmanager endpoint for creating silences. -func silence(data amJSON, mData amJSON) error { - // POST API for creating silences. - apiurl := construct(configJSON.CMSMONURL, configJSON.PostSilenceAPI) - - var sData silenceData - sData.StartsAt = mData.StartsAt //Start Time equal to maintenance alert //So that when maintenance alerts vanishes, ongoing alerts comes back alive. - sData.EndsAt = mData.EndsAt //End Time equal to maintenance alert //So that when maintenance alerts vanishes, ongoing alerts comes back alive. - sData.CreatedBy = configJSON.CreatedBy - sData.Comment = configJSON.Comment - var m matchers - - m.Name = configJSON.UniqueLabel - for k, v := range data.Labels { - if k == configJSON.UniqueLabel { - if val, ok := v.(string); ok { - m.Value = val - } - } - } - - sData.Matchers = append(sData.Matchers, m) - jsonStr, jsonErr := json.Marshal(sData) - - if jsonErr != nil { - log.Printf("Unable to convert JSON Data, error: %v\n", jsonErr) - return jsonErr - } - - req, reqErr := http.NewRequest("POST", apiurl, bytes.NewBuffer(jsonStr)) - if reqErr != nil { - return reqErr - } - req.Header.Set("Content-Type", "application/json") - - timeout := time.Duration(configJSON.HttpTimeout) * time.Second - client := &http.Client{Timeout: timeout} - - if configJSON.Verbose > 1 { - dump, dumpErr := httputil.DumpRequestOut(req, true) - if dumpErr == nil { - log.Println("Request: ", string(dump)) - } - } - - resp, respErr := client.Do(req) - if respErr != nil { - return respErr - } else { - if resp.StatusCode != http.StatusOK { - log.Printf("Http Response Code Error, status code: %d", resp.StatusCode) - return errors.New("http Response Code Error") - } - } - defer resp.Body.Close() - - if configJSON.Verbose > 1 { - dump, dumpErr := httputil.DumpResponse(resp, true) - if dumpErr == nil { - log.Println("Response: ", string(dump)) - } - } - - if configJSON.Verbose > 1 { - log.Printf("Silence Data:\n%+v\n", string(jsonStr)) - } - - return nil -} - -// Function for silencing maintenance false alerts -func silenceMaintenance(filteredAlerts amData) { - - if configJSON.Verbose > 1 { - if len(filteredAlerts.Data) == 0 { - log.Printf("No Maintenance Alert Found") - return - } else { - log.Printf("Maintenance Alert Data:\n%v\n\n", filteredAlerts) - } - } - - ifAnyAlert := false - - for _, each := range filteredAlerts.Data { - for k, v := range each.Labels { - if k == configJSON.SearchingLabel { - for _, ins := range regexp.MustCompile("["+configJSON.Seperator+"\\,\\s]+").Split(v.(string), -1) { - if ins != "" { - re, err := regexp.Compile(ins + ":\\d*|" + ins + "$") - if err != nil { - log.Fatalf("Regex didn't compile %v", err) - } - for silenceMapKey, silenceMapValues := range silenceMap { - if re.Match([]byte(silenceMapKey)) { - ifAnyAlert = true - for _, silenceMapValue := range silenceMapValues { - silenceErr := silence(silenceMapValue, each) - if silenceErr != nil { - log.Printf("Could not silence, error: %v\n", silenceErr) - if configJSON.Verbose > 1 { - log.Printf("Silence Data: %s\n ", silenceMapValue) - } - } - } - } - } - } - } - } - } - } - - if configJSON.Verbose > 1 { - if ifAnyAlert == false { - log.Printf("No Alert found for Silencing") - } - } -} - -// Function for filtering maintenance alerts -func filterMaintenance(amdata amData) amData { - silenceMap = make(map[string][]amJSON) - var maintenanceData amData - - for _, each := range amdata.Data { - for k, v := range each.Labels { - if k == "severity" && v == configJSON.SeverityFilter { - maintenanceData.Data = append(maintenanceData.Data, each) - } else { - if k == configJSON.SearchingLabel { - if len(regexp.MustCompile("["+configJSON.Seperator+"\\,\\s]+").Split(v.(string), -1)) == 1 { - silenceMap[v.(string)] = append(silenceMap[v.(string)], each) - } - } - } - } - } - return maintenanceData -} - -// Function running all logics -func run() { - - var amdata amData - getErr := get(&amdata) - - if getErr != nil { - log.Fatalf("Could not get alerts. %v", getErr) - } - - filtered := filterMaintenance(amdata) - silenceMaintenance(filtered) - -} - -// Function for parsing the config File -func parseConfig(configFile string, verbose int) { - - //Defaults in case no config file is provided - configJSON.CMSMONURL = "https://cms-monitoring.cern.ch" - configJSON.GetAlertsAPI = "/api/v1/alerts?active=true&silenced=false&inhibited=false&unprocessed=false" - configJSON.PostSilenceAPI = "/api/v1/silences" - configJSON.SeverityFilter = "maintenance" - configJSON.SearchingLabel = "instance" - configJSON.UniqueLabel = "alertname" - configJSON.Seperator = " " - configJSON.Comment = "maintenance" - configJSON.CreatedBy = "admin" - configJSON.HttpTimeout = 3 //3 secs timeout for HTTP requests - configJSON.Interval = 10 // 10 sec interval for the service - configJSON.Verbose = verbose - - if stats, err := os.Stat(configFile); err == nil { - if configJSON.Verbose > 1 { - log.Printf("FileInfo: %s\n", stats) - } - jsonFile, e := os.Open(configFile) - if e != nil { - log.Fatalf("Config File not found, error: %s", e) - } - defer jsonFile.Close() - decoder := json.NewDecoder(jsonFile) - err := decoder.Decode(&configJSON) - if err != nil { - log.Fatalf("Config JSON File can't be loaded, error: %s", err) - } else if configJSON.Verbose > 0 { - log.Printf("Load config from %s\n", configFile) - } - } else { - log.Fatalf("%s: Config File doesn't exist, error: %v", configFile, err) - } - - if configJSON.Verbose > 0 { - log.SetFlags(log.LstdFlags | log.Lshortfile) - } else { - log.SetFlags(log.LstdFlags) - } - - if configJSON.Verbose > 1 { - log.Printf("Configuration:\n%+v\n", configJSON) - } - -} - -// Function for running the logic on a time interval -func runInfinite() { - for true { - run() - time.Sleep(configJSON.Interval * time.Second) - } -} - -func main() { - - var verbose int - var configFile string - flag.StringVar(&configFile, "config", "", "Config File path") - flag.IntVar(&verbose, "verbose", 0, "Verbosity Level, can be overwritten in config") - - flag.Usage = func() { - fmt.Println("Usage: intelligence [options]") - flag.PrintDefaults() - } - - flag.Parse() - parseConfig(configFile, verbose) - runInfinite() -} diff --git a/src/go/intelligence/Makefile b/src/go/intelligence/Makefile deleted file mode 100644 index 2b32f416..00000000 --- a/src/go/intelligence/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -VERSION=`git rev-parse --short HEAD` -flags=-ldflags="-s -w -X main.version=${VERSION}" -# flags=-ldflags="-s -w -extldflags -static" - -all: build - -build: - go clean; rm -rf pkg; go build ${flags} - -build_debug: - go clean; rm -rf pkg; go build ${flags} -gcflags="-m -m" - -build_all: build_osx build_linux build - -build_osx: - go clean; rm -rf pkg intelligence_osx; GOOS=darwin go build ${flags} - mv intelligence intelligence_osx - -build_linux: - go clean; rm -rf pkg intelligence_linux; GOOS=linux go build ${flags} - mv intelligence intelligence_linux - -build_power8: - go clean; rm -rf pkg intelligence_power8; GOARCH=ppc64le GOOS=linux go build ${flags} - mv intelligence intelligence_power8 - -build_arm64: - go clean; rm -rf pkg intelligence_arm64; GOARCH=arm64 GOOS=linux go build ${flags} - mv intelligence intelligence_arm64 - -build_windows: - go clean; rm -rf pkg intelligence.exe; GOARCH=amd64 GOOS=windows go build ${flags} - -install: - go install - -clean: - go clean; rm -rf pkg - -test : test1 - -test1: - cd test; go test diff --git a/src/go/intelligence/README.md b/src/go/intelligence/README.md deleted file mode 100644 index 56185c87..00000000 --- a/src/go/intelligence/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# CMSMonitoring/src/go/intelligence - -## Overview - -This intelligence Module has been developed for the CMS Infra MONIT AlertManagement. It is responsible for assigning relevant severity levels, bundling similar alerts, silencing false alerts from time to time based on it's intelligence. - -- Table of contents - * [Installation](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/installation.md) - Read more about the installation. - * [Configuration](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/config.md) - Detailed information for config file. - * [Testing](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/testing.md) - Detailed information about testing procedure of intelligence module. diff --git a/src/go/intelligence/config.md b/src/go/intelligence/config.md deleted file mode 100644 index df9c076b..00000000 --- a/src/go/intelligence/config.md +++ /dev/null @@ -1,265 +0,0 @@ -# Configuration - -This is the most important file for our intelligence module. The intelligence is controlled from here. Let's see what each field mean and how important they are. - -*The given config file format below should be followed.* - -*Same config file can be used for production run as well as testing purpose. For testing purpose extra config is required which are given a "+" mark beside them.* - -- Table of contents - * [Parts](#parts) - - [Server](#server) - - [Annotation Dashboard](#annotation-dashboard) - - [Alerts](#alerts) - - [Silence](#silence) - - [Services](#services) - * [Config File](#config-file-example) - -# Parts - -The config file consists of mainly four parts :- - -- Server -- Annotation Dashboard -- Alerts -- Silence -- Services - -# Server - -Contains all the values related to servers (Alertmanager, alerting services). - -- **cmsmonURL** - CMS Monitoring Infrastructures URL. -- **getAlertsAPI** - AlertManager API endpoint for fetching alerts (GET Request). -- **getSuppressedAlertsAPI** - AlertManager API endpoint for fetching suppressed alerts (GET Request). -- **getSilencesAPI** - AlertManager API endpoint for fetching silences (GET Request). -- **postAlertsAPI** - AlertManager API endpoint for creating alerts (POST Request). -- **postSilenceAPI** - AlertManager API endpoint for creating silences (POST Request). -- **deleteSilenceAPI** - AlertManager API endpoint for deleting silences (DELETE Request). -- **httpTimeout** - HTTP timeout for all requests (in sec). -- **interval** - Time interval at which intelligence module repeats itself (in sec). -- **verbose** - verbosity level for better debugging - - 0 - no verbosity - - 1 - first level of verbosity - - 2 - second level of verbosity - - 3 - deep level verbosity -- **dryRun** - boolean flag for dry run, if true intelligence module will not make any changes in Alermanager (doesn't create alerts, silences etc.) and doesn't annotate Grafana Dashboard. Use for debbuging purpose. -- **+** **testing** - storing details about test scenario - - **testfile** - test cases file name for testing (Fake alerts). - - **lifetimeOfTestAlerts** - lifetime of test alerts (fake alerts). (in minutes) - - **annotateTestStatus** - boolean value which becomes true if dashboards annotation is successful during testing else remains false. - -**Fields with + mark are for testing purpose.** - -##### Defaults - -- **cmsmonURL** - https://cms-monitoring.cern.ch -- **getAlertsAPI** - /api/v1/alerts?active=true&silenced=false&inhibited=false&unprocessed=false -- **getSuppressedAlertsAPI** - /api/v1/alerts?active=false&silenced=true -- **getSilencesAPI** - /api/v1/silences -- **postAlertsAPI** - /api/v1/alerts -- **postSilenceAPI** - /api/v1/silences -- **deleteSilenceAPI** - /api/v1/silence -- **httpTimeout** - 3 -- **interval** - 1 -- **verbose** - 0 -- **dryRun** - false -- **+** **testing** - storing details about test scenario - - **testfile** - /tmp/test_cases.json - - **lifetimeOfTestAlerts** - 5 - - **annotateTestStatus** - false - -# Annotation Dashboard - -Contains all the values required for annotation of Grafana Dashboards. - -- **url** - Grafana Dashboard base URL. -- **dashboardSearchAPI** - Grafana API endpoint for searching dashboards with given list of tags. -- **annotationAPI** - Grafana API endpoint for creating annotations. -- **\* tags** - List of all tags for the dashboards required for annotations (all possible values required for each services and set of keywords) so that we can maintain a Parent Cache for the dashboards' data. -- **\* token**- Grafana Proxy Token, required for creating a annotation request. -- **dashboardsCacheExpiration** - Dashboard Cache Expiration (in hours). Intelligence module runs infinitely as a service in background. To increase it's latency and make less request to Grafana dashboards, a cache is maintained which contains all dashboards value and upon expiration the cache gets updated. -- **intelligenceModuleTag** - Tag attached to annotations which reflects it is created by the intelligence module. - -**Fields with * mark are required.** - -##### Defaults - -- **url** - https://monit-grafana.cern.ch -- **dashboardSearchAPI** - /api/search -- **annotationAPI** - /api/annotations -- **dashboardsCacheExpiration** - 1 -- **intelligenceModuleTag** - cmsmon-int - -# Alerts - -- **uniqueLabel** - Label which defines an alert uniquely required for processing a specific alert. -- **severityLabel** - Label for severity level of an alert required while assigning proper severity level. -- **serviceLabel** - Label for service of an alert. -- **\* severityLevels** - map for defined severity levels and their priority. -- **defaultSeverityLevel** - Default severity level value in case intelligence module is not able to assign one. -- **durationThreshold** - filter alerts whose time span exceed duration - threshold, default 24 hour -- **filterKeywords** - list of keyword values to filter in alert annotations - attribute values - -**Fields with * mark are required.** - -##### Defaults - -- **uniqueLabel** - alertname -- **severityLabel** - severity -- **serviceLabel** - service -- **defaultSeverityLevel** - info - -# Silence - -- **createdBy** - Name of the creater of silences. -- **comment** - Comment for creating silences -- **\* silenceStatus** - Labels for status of the silence. - - ["active", "expired", "pending"] this should be fixed unless there's change in AlertManager. DO NOT CHANGE THE VALUES OR THEIR ORDER. - -**Fields with * mark are required.** - -##### Defaults - -- **createdBy** - admin -- **comment** - silence by intelligence module -- **\* silenceStatus** - ["active", "expired", "pending"] - -# Services - -- **name** - Name of a service (eg. SSB, GGUS) -- **# keywordLabel** - Field in which the intelligence module tries to match keywords. -- **# defaultLevel** - Default Severity Level assigned to the alert at the time of it's creation by alerting services (ggus_alerting, ssb_alerting). -- **severityMap** - Map for severity levels for a service. -- **annotationMap** - Map for Dashboard annotations' keywords. - - **# label** - Field in which the intelligence module tries to match keywords for the following actions and systems. - - **annotations** - Array of information about annotations i.e. A specific set of keywords & set of dashboards to annotate when the keywords are matched in alerts. - - **actions** - List of actions (eg. outage, maintenance, intervention) - - **systems** - List of services which are involved (eg. Network, Database, rucio etc.) - - **tags** - List of tags for the dashboards where to create annotations based on specific set of keywords and service (SSB, GGUS etc.) - - **urlLabel** - Label for searching URL from the alerts and putting the found URL into text while annotating the dashboards. - -**Fields with * mark are required. Fields with # mark should not be changed unless there's any change in the codebase for the same.** - -- **keywordLabel for SSB should be "shortDescription" and for GGUS "Priority"**. -- **defaultLevels for SSB should be "notification" and for GGUS "ticket"**. -- **label for annotationMap for SSB should be "shortDescription" and for GGUS "Subject"**. -- **urlLabel for annotationMap for SSB should be "URL" and for GGUS "URL"**. - -DO NOT MAKE CHANGES FOR SSB AND GGUS SERVICES UNLESS YOU MAKE CHANGES ACCORDINGLY IN GGUS & SSB ALERTING SERVICES. -YOU ARE FREE TO SET VALUES FOR NEW SERVICES THOUGH. - -# Config File Example - -```json -{ - "server": { - "cmsmonURL": "https://cms-monitoring.cern.ch", - "getAlertsAPI": "/api/v1/alerts?active=true&silenced=false&inhibited=false&unprocessed=false", - "getSuppressedAlertsAPI": "/api/v1/alerts?active=false&silenced=true", - "getSilencesAPI": "/api/v1/silences", - "postAlertsAPI": "/api/v1/alerts", - "postSilenceAPI": "/api/v1/silences", - "deleteSilenceAPI": "/api/v1/silence", - "httpTimeout": 3, - "interval": 1, - "verbose": 0, - "dryRun": false, - "testing": { - + "testfile": "/tmp/test_cases.json", - "lifetimeOfTestAlerts": 5, - "annotateTestStatus" : false - } - }, - - "annotationDashboard": { - "url": "https://monit-grafana.cern.ch", - "dashboardSearchAPI": "/api/search", - "annotationAPI": "/api/annotations", - * "tags": ["cmsweb", "jobs", "prod"], - * "token": "", - "dashboardsCacheExpiration": 1 - }, - - "alerts": { - "uniqueLabel": "alertname", - "severityLabel": "severity", - "serviceLabel": "service", - * "severityLevels": { - "info": 0, - "warning": 1, - "medium": 2, - "high": 3, - "urgent": 4 - }, - "defaultSeverityLevel": "info" - }, - - "silence": { - "createdBy": "admin", - "comment": "maintenance", - #* "silenceStatus": ["active", "expired", "pending"] #DO NOT CHANGE IT - }, - - * "services": [ - { - "name": "SSB", - # "keywordLabel": "shortDescription", #DO NOT CHANGE IT, UNIQUE FOR SSB - # "defaultLevel": "notification", #DO NOT CHANGE IT, UNIQUE FOR SSB - "severityMap": { - "update": "info", - "configuration": "info", - "support": "info", - "patching": "info", - "upgrade": "warning", - "intervention": "warning", - "migration": "warning", - "interruption": "medium", - "risk": "high", - "down": "urgent" - }, - "annotationMap": { - "label": "shortDescription", #DO NOT CHANGE IT, UNIQUE FOR SSB - "annotations": [ - { - "actions": ["intervention", "outage"], - "systems": ["network", "database", "db"], - "tags": ["cmsweb", "prod"] - }, - - { - "actions": ["update", "upgrade"], - "systems": ["network", "database", "db"], - "tags": ["jobs"] - } - ], - "urlLabel": "URL" - } - }, - - * { - "name": "GGUS", - # "keywordLabel": "Priority", #DO NOT CHANGE IT, UNIQUE FOR GGUS - # "defaultLevel": "ticket", #DO NOT CHANGE IT, UNIQUE FOR GGUS - "severityMap": { - "less urgent": "medium", - "urgent": "high", - "very urgent": "urgent" - }, - "annotationMap": { - "label": "Subject", #DO NOT CHANGE IT, UNIQUE FOR GGUS - "annotations": [ - { - "actions": ["transfer", "outage"], - "systems": ["rucio"], - "tags": ["prod"] - } - ], - "urlLabel": "URL" - } - } - ] -} -``` diff --git a/src/go/intelligence/go.mod b/src/go/intelligence/go.mod deleted file mode 100644 index bae3c2b5..00000000 --- a/src/go/intelligence/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/dmwm/CMSMonitoring/src/go/intelligence - -go 1.20 diff --git a/src/go/intelligence/installation.md b/src/go/intelligence/installation.md deleted file mode 100644 index a416d6cc..00000000 --- a/src/go/intelligence/installation.md +++ /dev/null @@ -1,110 +0,0 @@ -# Installation - -- Table of contents - * [Build](#build) - - [Main](#main) - - [Test](#test) - * [Run](#run) - * [Test](#test-1) - - [Manual](#manual) - - [Automation](#automation) - -## Build - -#### Main -To build the intelligence module run the following command - -`make` - -#### Test - -Build the test binary residing [here](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test.go) by running :- - -``` -cd int_test -make -``` - -## Run -``` -Usage: intelligence [options] - -config string - Config File path - -iter int - Custom defined no. of iterations for premature termination - -verbose int - Verbosity Level, can be overwritten in config - -version - Show version -``` -As $GOPATH/bin has already been set in PATH variable, you can run the intelligence module binary by executing the command below. Config file path flag (-config) is mandatory. However, -verbose and -iter flags are optional. - -`intelligence -config ` - -## Test -[/src/go/intelligence/int_test](https://github.com/dmwm/CMSMonitoring/tree/master/src/go/intelligence/int_test) - -For testing purpose of intelligence module, we have provide following :- -- [test.go](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test.go) - testing module -- [test_cases.json](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_cases.json) - Fake alerts mimicing SSB and GGUS alerts for testing purpose. -- [test_config.json](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_config.json) - dedicated config file for testing (similar to the main config file with few changes). -- [test_wrapper.sh](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_wrapper.sh) - Bash script for automating the testing process. - -Testing can be done in two ways :- -- Manual - - You run an instance of Alertmanager manually and then run the test binary with required values in [test_config.json](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_config.json). -- Automation - - You just run the [test_wrapper.sh](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_wrapper.sh) bash script which will automate all process from environment setup to running the test. - -Testing config changes and points to consider :- -- CMSMON url must be changed to the url of Testing instance of AlertManager -- "testfile" can take two values in different scenario. - - Manual -> /CMSMonitoring/src/go/intelligence/int_test/test_cases.json - - Automation -> default value is /tmp/test_cases.json if you are testing in /tmp/ directory. It can be configured though. - -Let's see how we can test in two scenarios. - -##### Manual - -It is expected that a testing instance of Alertmanager is running in the system. On running the command below the testing of the whole pipeline starts. It pushes test alerts to Alertmanager, changes their severity value, annotates the Grafana dashboards, silences unwanted alerts and at the end outputs the test results. - -`int_test -config ` - -##### Automation - -All test scenario required for intelligence module testing has been automated using the [test_wrapper.sh](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_wrapper.sh). - -``` - Script for automation of testing process of the intelligence module. - Usage: test_wrapper.sh - - config test config file path (mandatory) - wdir work directory default: /tmp/$USER - - Options: - help help manual -``` - -Follow the following steps :- -1) Clone the repository at a specific directory - -```$ git clone https://github.com/dmwm/CMSMonitoring.git``` - -2) Set the PATH variable - -```$ export PATH=$PWD/CMSMonitoring/src/go/intelligence/int_test/:$PATH``` - -3) Edit [CMSMonitoring/src/go/intelligence/int_test/test_config.json](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_config.json) and set "testfile" to: /test_cases.json - -where WORKDIR is the directory where you want to run the test (if you use the default WORK_DIR /tmp/$USER, change "testfile" to : /tmp/$USER/test_cases.json. -If different WORK_DIR, then change "testfile" to: /test_cases.json). - -4) Run [test_wrapper.sh](https://github.com/dmwm/CMSMonitoring/blob/master/src/go/intelligence/int_test/test_wrapper.sh) at default directory (i.e. /tmp/$USER) - -```$ test_wrapper.sh ``` - -Run this command for testing at different directory. - -```$ test_wrapper.sh ``` - -##### *For LXPLUS USERS* - If you want to test on lxplus VM, you don't need to deploy alerting services (GGUS & SSB). There are some fake alerts which are similar to GGUS and SSB ticketing services which are pushed into Alertmanager before starting the test. However, you can run alerting services to include realtime alerts in testing process too. Wondering how to run alerting services ? Go [here](https://github.com/dmwm/CMSMonitoring/blob/master/doc/AlertManagement/installation.md). diff --git a/src/go/intelligence/int_test/Makefile b/src/go/intelligence/int_test/Makefile deleted file mode 100644 index d117ea4d..00000000 --- a/src/go/intelligence/int_test/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -VERSION=`git rev-parse --short HEAD` -flags=-ldflags="-s -w -X main.version=${VERSION}" -# flags=-ldflags="-s -w -extldflags -static" - -all: build - -build: - go clean; rm -rf pkg; go build ${flags} - -build_debug: - go clean; rm -rf pkg; go build ${flags} -gcflags="-m -m" - -build_all: build_osx build_linux build - -build_osx: - go clean; rm -rf pkg int_test_osx; GOOS=darwin go build ${flags} - mv int_test int_test_osx - -build_linux: - go clean; rm -rf pkg int_test_linux; GOOS=linux go build ${flags} - mv int_test int_test_linux - -build_power8: - go clean; rm -rf pkg int_test_power8; GOARCH=ppc64le GOOS=linux go build ${flags} - mv int_test int_test_power8 - -build_arm64: - go clean; rm -rf pkg int_test_arm64; GOARCH=arm64 GOOS=linux go build ${flags} - mv int_test int_test_arm64 - -build_windows: - go clean; rm -rf pkg int_test.exe; GOARCH=amd64 GOOS=windows go build ${flags} - -install: - go install - -clean: - go clean; rm -rf pkg - -test : test1 - -test1: - cd test; go test diff --git a/src/go/intelligence/int_test/test.go b/src/go/intelligence/int_test/test.go deleted file mode 100644 index efaea756..00000000 --- a/src/go/intelligence/int_test/test.go +++ /dev/null @@ -1,178 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/pipeline" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "io" - "log" - "os" - "runtime" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// git version of our code -var version string - -func info() string { - goVersion := runtime.Version() - tstamp := time.Now() - return fmt.Sprintf("git=%s go=%s date=%s", version, goVersion, tstamp) -} - -func runPipeline() { - utils.ChangeCounters = models.ChangeCounters{} - var processedData []models.AmJSON - a := pipeline.DeleteSilence(pipeline.Silence( - pipeline.PushAlert( - pipeline.MlBox( - pipeline.AddAnnotation( - pipeline.KeywordMatching( - pipeline.Preprocess( - pipeline.FetchAlert()))))))) - - for d := range a { - processedData = append(processedData, d) - } - - if utils.ConfigJSON.Server.Verbose > 2 { - log.Printf("Processed Alerts Data: %s\n", processedData) - } -} - -func pushTestAlerts() { - log.Printf("Pushing Test alerts into AlertManager \n") - - var testAlertData []models.AmJSON - - file, err := os.Open(utils.ConfigJSON.Server.Testing.TestFile) - if err != nil { - log.Fatalf("Unable to open JSON file. Testing failed! error: %v\n", err) - } - - defer file.Close() - - jsonData, err := io.ReadAll(file) - if err != nil { - log.Fatalf("Unable to read JSON file. Testing failed! error: %v\n", err) - } - - err = json.Unmarshal(jsonData, &testAlertData) - if err != nil { - log.Fatalf("Unable to Unmarshal Data. Testing failed! error: %v\n", err) - } - - timeNow := time.Now() - - for _, each := range testAlertData { - - each.StartsAt = timeNow - each.EndsAt = time.Now().Add(utils.ConfigJSON.Server.Testing.LifetimeOfTestAlerts * time.Minute) - timeNow = timeNow.Add(utils.ConfigJSON.Server.Testing.LifetimeOfTestAlerts * time.Minute * -1) // Adjusting startTime for fake alerts so that they don't overlap in the dashboards - - err := utils.PostAlert(each) - if err != nil { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Alert Data: %s\n ", each) - } - log.Fatalf("Could not push alert. Testing failed! error:%v\n", err) - } - } - - log.Printf("Test alerts has been pushed into AlertManager successfully.\n\n") -} - -func runTest() { - - log.Printf("Starting Intelligence Pipeline Testing!\n\n") - - pushTestAlerts() - log.Printf("Data getting persisted in AlertManager...\n\n") - time.Sleep(10 * time.Second) - - log.Printf("Snapshot of AlertManager before starting Testing... \n") - snapshotBefore := getAMSnapshot() - - runPipeline() - log.Printf("Changes made during the pipeline execution.... \n") - log.Printf("Number of Alerts Pushed : %d \n", utils.ChangeCounters.NoOfPushedAlerts) - log.Printf("Number of Silences Created : %d \n", utils.ChangeCounters.NoOfSilencesCreated) - log.Printf("Number of Silences Deleted : %d \n\n", utils.ChangeCounters.NoOfSilencesDeleted) - - log.Printf("Snapshot of AlertManager after completing Testing... \n") - snapshotAfter := getAMSnapshot() - - if snapshotBefore.NoOfActiveSilences+utils.ChangeCounters.NoOfSilencesCreated != snapshotAfter.NoOfActiveSilences { - log.Fatalf("Number of Active Silences Mismatched... Testing Failed !!") - } - - if snapshotBefore.NoOfExpiredSilences+utils.ChangeCounters.NoOfSilencesDeleted != snapshotAfter.NoOfExpiredSilences { - log.Fatalf("Number of Expired Silences Mismatched... Testing Failed !!") - } - - if utils.ConfigJSON.Server.Testing.AnnotateTestStatus == false { - log.Fatalf("Unable to Annotate Dashboard... Testing Failed !!") - } - - log.Printf("Testing Successful!\n\n") -} - -func getAMSnapshot() models.ChangeCounters { - currentCounters := models.ChangeCounters{} - - data, err := utils.GetAlerts(utils.ConfigJSON.Server.GetAlertsAPI, true) - if err != nil { - log.Fatalf("Could not fetch alerts from AlertManager. Testing Failed! error:%v\n", err) - } - silenceData, err := utils.GetSilences() - if err != nil { - log.Fatalf("Could not fetch silences from AlertManager. Testing Failed! error:%v\n", err) - } - - currentCounters.NoOfAlerts = len(data.Data) - - for _, each := range silenceData.Data { - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[0] { - currentCounters.NoOfActiveSilences++ - } - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[1] { - currentCounters.NoOfExpiredSilences++ - } - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[2] { - currentCounters.NoOfPendingSilences++ - } - } - - log.Printf("Number of Alerts : %d\n", currentCounters.NoOfAlerts) - log.Printf("Number of Active Silences : %d\n", currentCounters.NoOfActiveSilences) - log.Printf("Number of Expired Silences : %d\n", currentCounters.NoOfExpiredSilences) - log.Printf("Number of Pending Silences : %d\n\n", currentCounters.NoOfPendingSilences) - - return currentCounters -} - -func main() { - var verbose int - var configFile string - var version bool - flag.BoolVar(&version, "version", false, "Show version") - flag.StringVar(&configFile, "config", "", "Config File path") - flag.IntVar(&verbose, "verbose", 0, "Verbosity Level, can be overwritten in config") - - flag.Parse() - if version { - fmt.Println("version:", info()) - return - } - utils.ParseConfig(configFile, verbose) - - runTest() -} diff --git a/src/go/intelligence/int_test/test_cases.json b/src/go/intelligence/int_test/test_cases.json deleted file mode 100644 index a03be6aa..00000000 --- a/src/go/intelligence/int_test/test_cases.json +++ /dev/null @@ -1,91 +0,0 @@ -[ - { - "labels": { - "alertname": "ssb-OTG00088-Testing-Alert-Network-Intervention", - "description": "Network Services", - "feName": "Network", - "seName": "Network Service", - "service": "SSB", - "severity": "notification", - "tag": "monitoring", - "type": "Planned Intervention" - }, - "annotations": { - "date": "2020-07-29T06:30:00Z", - "description": "Network Services", - "feName": "Network", - "monitState": "CLOSED", - "monitState1": "CLOSED", - "seName": "Network Service", - "shortDescription": "Network Intervention in building 8888 - TEST ALERT", - "ssbNumber": "OTG00088", - "sysCreatedBy": "jdoe", - "sysModCount": "2", - "sysUpdatedBy": "jdoe", - "type": "Planned Intervention", - "updateTimestamp": "2020-07-03T15:32:36Z", - "URL": "https://cern.service-now.com/service-portal?id=outage&n=OTG0058253" - }, - "startsAt": "2020-07-27T04:30:00Z", - "endsAt": "2020-07-31T20:10:58.537Z" - }, - - { - "labels": { - "alertname": "ssb-OTG00099-Testing-Alert-DB-Intervention", - "description": "Database Services", - "feName": "Database", - "seName": "Database Service", - "service": "SSB", - "severity": "notification", - "tag": "monitoring", - "type": "Planned Intervention" - }, - "annotations": { - "date": "2020-08-28T06:30:00Z", - "description": "Database Services", - "feName": "Database", - "monitState": "CLOSED", - "monitState1": "CLOSED", - "seName": "Database Service", - "shortDescription": "Database update in building 7777 - TEST ALERT", - "ssbNumber": "OTG00099", - "sysCreatedBy": "jdoe2", - "sysModCount": "22", - "sysUpdatedBy": "jdoe2", - "type": "Planned Intervention", - "updateTimestamp": "2020-07-05T15:32:36Z", - "URL": "https://cern.service-now.com/service-portal?id=outage&n=OTG0058253" - }, - "startsAt": "2020-07-27T04:30:00Z", - "endsAt": "2020-07-31T20:10:58.537Z" - }, - - { - "labels": { - "Priority": "urgent", - "Scope": "WLCG", - "Site": "pic", - "Type": "USER", - "VO": "cms", - "alertname": "ggus-89899-Testing-Alert", - "service": "GGUS", - "severity": "ticket", - "tag": "monitoring" - }, - "annotations": { - "Priority": "urgent", - "ResponsibleUnit": "VOSupport", - "Scope": "WLCG", - "Site": "pic", - "Status": "assigned", - "Subject": "Transfers failing to rucio - TEST ALERT", - "TicketID": "89899", - "Type": "USER", - "URL": "https://ggus.eu/?mode=ticket_info&ticket_id=89899", - "VO": "cms" - }, - "startsAt": "2020-07-22T05:54:00Z", - "endsAt": "2020-07-30T22:15:58.537Z" - } -] diff --git a/src/go/intelligence/int_test/test_config.json b/src/go/intelligence/int_test/test_config.json deleted file mode 100644 index 341535f1..00000000 --- a/src/go/intelligence/int_test/test_config.json +++ /dev/null @@ -1,109 +0,0 @@ -{ - "server": { - "cmsmonURL": "http://localhost:9093", - "getAlertsAPI": "/api/v1/alerts?active=true&silenced=false&inhibited=false&unprocessed=false", - "getSuppressedAlertsAPI" : "/api/v1/alerts?active=false&silenced=true", - "getSilencesAPI": "/api/v1/silences", - "postAlertsAPI": "/api/v1/alerts", - "postSilenceAPI": "/api/v1/silences", - "deleteSilenceAPI": "/api/v1/silence", - "httpTimeout": 3, - "interval": 1, - "verbose": 0, - "dryRun" : false, - "testing" : { - "testfile" : "/test_cases.json", - "lifetimeOfTestAlerts" : 5, - "annotateTestStatus" : false - } - }, - - "annotationDashboard" :{ - "url" : "https://monit-grafana.cern.ch", - "dashboardSearchAPI" : "/api/search", - "annotationAPI" : "/api/annotations", - "tags" : ["cmsweb-play","cmsweb-play2"], - "token" : "", - "dashboardsCacheExpiration" : 1, - "intelligenceModuleTag" : "cmsmon-int" - }, - - "alerts": { - "uniqueLabel": "alertname", - "severityLabel": "severity", - "serviceLabel": "service", - "severityLevels": { - "info": 0, - "warning": 1, - "medium": 2, - "high": 3, - "urgent": 4 - }, - "defaultSeverityLevel": "info" - }, - - "silence": { - "createdBy": "admin", - "comment": "maintenance", - "silenceStatus": ["active","expired","pending"] - }, - - "services": [ - { - "name": "SSB", - "keywordLabel": "shortDescription", - "defaultLevel": "notification", - "severityMap": { - "update": "info", - "configuration": "info", - "support": "info", - "patching": "info", - "upgrade": "warning", - "intervention": "warning", - "migration": "warning", - "interruption": "medium", - "risk": "high", - "down": "urgent" - }, - "annotationMap": { - "label": "shortDescription", - "annotations": [ - { - "actions": ["intervention", "outage"], - "systems": ["network", "database", "db"], - "tags": ["cmsweb-play"] - }, - - { - "actions": ["update", "upgrade"], - "systems": ["network", "database", "db"], - "tags": ["cmsweb-play2"] - } - ], - "urlLabel": "URL" - } - }, - - { - "name": "GGUS", - "keywordLabel": "Priority", - "defaultLevel": "ticket", - "severityMap": { - "less urgent": "medium", - "urgent": "high", - "very urgent": "urgent" - }, - "annotationMap": { - "label": "Subject", - "annotations": [ - { - "actions": ["transfer", "outage"], - "systems": ["rucio"], - "tags": ["cmsweb-play2"] - } - ], - "urlLabel": "URL" - } - } - ] -} \ No newline at end of file diff --git a/src/go/intelligence/int_test/test_wrapper.sh b/src/go/intelligence/int_test/test_wrapper.sh deleted file mode 100755 index fa1da349..00000000 --- a/src/go/intelligence/int_test/test_wrapper.sh +++ /dev/null @@ -1,172 +0,0 @@ -#!/bin/sh -##H Script for automation of testing process of the intelligence module. -##H Usage: test_wrapper.sh -##H - -# Function for printing usage. -print_usage() { - cat $0 | grep "^##H" | sed -e "s,##H,,g" - echo " config test config file path (mandatory)" - echo " wdir work directory default: /tmp/${USER}" - echo "" - echo " Options:" - echo " help help manual" - exit 1 -} - -case ${1:-status} in -help) - print_usage - ;; - -esac - -# Check if user is passing least required arguments. -if [ "$#" -lt 1 ]; then - print_usage -fi - -# Check if user has passed test config file path. -TEST_CONFIG=${1} -if [ -z "$TEST_CONFIG" ]; then - echo "Pass the Config File Path. Testing Failed. Exiting.." - exit -fi - -# Setup work directory based on user input -WDIR=${2:-"/tmp/$USER"} -TOP=$(dirname $WDIR) - -if [ ! -d $WDIR ]; then - echo "Provided work directory '$WDIR' does not exists. creating..." - - if [ ! -w $TOP ]; then - echo "can not create $WDIR: $USER has no write permission for '$TOP' directory. exiting." - exit 1 - fi - - if mkdir -p $WDIR; then - echo "$WDIR created !" - else - echo "Unable to create $WDIR" - exit 1 - fi -fi - -#Variables -AM_BIN=$WDIR/am/alertmanager -AM_CONFIG=$WDIR/am/alertmanager.yml -AM_VERSION="alertmanager-0.21.0.linux-amd64" -AM_URL="https://github.com/prometheus/alertmanager/releases/download/v0.21.0/${AM_VERSION}.tar.gz" -CMSMONITORING_REPO_URL="https://github.com/dmwm/CMSMonitoring.git" - -PID=$(ps auxwww | egrep "alertmanager" | grep -v grep | awk 'BEGIN{ORS=" "} {print $2}') - -# function for starting AlertManager -start_am() { - echo "Starting AlertManager in background." - nohup $AM_BIN --config.file=$AM_CONFIG &1 >AM.log & -} - -# function for stopping AlertManager -stop_am() { - local PID=$(ps auxwww | egrep "alertmanager" | grep -v grep | awk 'BEGIN{ORS=" "} {print $2}') - echo "Stopping Alertmanager. PID : ${PID}" - if [ -n "${PID}" ]; then - kill -9 ${PID} - fi -} - -#function for clearing out working directory -delete_wdir() { - echo "Deleting work directory : ${WDIR}" - rm -rf $WDIR -} - -# Logic for deploying the alertmanager -if [ ! -z "$PID" ]; then - echo "AlertManager already running. PID : ${PID}" -else - if [ -f "$AM_BIN" ]; then - start_am - else - echo "AlertManager not found !!" - command -v wget - if [ $? -eq 0 ]; then - echo "Downloading AlertManager using wget" - wget $AM_URL -O $WDIR/am.tar.gz - else - command -v curl - if [ $? -eq 0 ]; then - echo "Downloading AlertManager using curl" - curl $AM_URL >$WDIR/am.tar.gz - else - echo "Install wget or curl to continue and try again. Exiting.." - exit 1 - fi - fi - command -v tar - if [ $? -eq 0 ]; then - echo "Untar AlertManager..." - tar -C $WDIR -xzf $WDIR/am.tar.gz - mv $WDIR/$AM_VERSION $WDIR/am - if [ $? -eq 0 ]; then - echo "Successfully renamed ${WDIR}/${AM_VERSION} to ${WDIR}/am" - else - echo "Could not rename. Exiting.." - delete_wdir - exit 1 - fi - else - echo "Install tar to continue and try again. Exiting.." - exit 1 - fi - start_am - fi -fi - -## git clone the update CMSMonitoring repository in the working directory -command -v git -if [ $? -eq 0 ]; then - echo "Cloning CMSMonitoring at ${WDIR}." - cd $WDIR && git clone $CMSMONITORING_REPO_URL -else - echo "Install git to continue and try again. Exiting.." - exit 1 -fi - -## building the intelligence module for testing -mv $WDIR/CMSMonitoring/src/go/intelligence/int_test/test_cases.json $WDIR -if [ $? -eq 0 ]; then - echo "Successfully moved ${WDIR}/CMSMonitoring/src/go/intelligence/int_test/test_cases.json to $WDIR." -else - echo "Could not move. Exiting.." - delete_wdir - exit 1 -fi - -export GOPATH=$WDIR/CMSMonitoring -export PATH=$WDIR:$WDIR/bin:$PATH - -command -v go -if [ $? -eq 0 ]; then - echo "Building the int module...." - go build -o $WDIR go/intelligence/int_test -else - echo "Install go to continue and try again. Exiting.." - exit 1 -fi - -# Delay for Alertmanager so that it starts completely. -sleep 5 - -command -v int_test -if [ $? -eq 0 ]; then - int_test -config=$TEST_CONFIG - stop_am - delete_wdir -else - echo "Test script not found !! Testing Failed..." - delete_wdir - exit 1 -fi diff --git a/src/go/intelligence/main.go b/src/go/intelligence/main.go deleted file mode 100644 index 1bb62c7d..00000000 --- a/src/go/intelligence/main.go +++ /dev/null @@ -1,100 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/pipeline" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "runtime" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// git version of our code -var version string - -func info() string { - goVersion := runtime.Version() - tstamp := time.Now() - return fmt.Sprintf("git=%s go=%s date=%s", version, goVersion, tstamp) -} - -// Function running all logics -// Processing data pipeline module is based on ideas presented in -// https://towardsdatascience.com/concurrent-data-pipelines-in-golang-85b18c2eecc2 -func run() { - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("### run cmsmon intelligent pipeline") - } - var processedData []models.AmJSON - a := pipeline.DeleteSilence(pipeline.Silence( - pipeline.PushAlert( - pipeline.MlBox( - pipeline.AddAnnotation( - pipeline.KeywordMatching( - pipeline.Filter( - pipeline.Preprocess( - pipeline.FetchAlert())))))))) - - for d := range a { - processedData = append(processedData, d) - } - - if utils.ConfigJSON.Server.Verbose > 2 { - log.Printf("Processed Alerts Data: %s\n", processedData) - } -} - -func runDefinedIterations(iter int) { - for i := 0; i < iter; i++ { - run() - utils.ChangeCounters = models.ChangeCounters{} - utils.FirstRunSinceRestart = false - time.Sleep(utils.ConfigJSON.Server.Interval * time.Second) - } -} - -func runInfinite() { - for true { - utils.ChangeCounters = models.ChangeCounters{} - run() - utils.FirstRunSinceRestart = false - time.Sleep(utils.ConfigJSON.Server.Interval * time.Second) - } -} - -func main() { - var verbose int - var iter int - var configFile string - var version bool - flag.BoolVar(&version, "version", false, "Show version") - flag.StringVar(&configFile, "config", "", "Config File path") - flag.IntVar(&iter, "iter", 0, "Custom defined no. of iterations for premature termination") - flag.IntVar(&verbose, "verbose", 0, "Verbosity Level, can be overwritten in config") - - flag.Usage = func() { - log.Println("Usage: intelligence [options]") - flag.PrintDefaults() - } - - flag.Parse() - if version { - fmt.Println("version:", info()) - return - } - utils.ParseConfig(configFile, verbose) - - utils.FirstRunSinceRestart = true - if iter == 0 { - runInfinite() - } else { - runDefinedIterations(iter) - } -} diff --git a/src/go/intelligence/models/models.go b/src/go/intelligence/models/models.go deleted file mode 100644 index 22d8eecd..00000000 --- a/src/go/intelligence/models/models.go +++ /dev/null @@ -1,208 +0,0 @@ -package models - -import ( - "encoding/json" - "fmt" - "sync" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -var lock sync.RWMutex - -// AmJSON AlertManager API acceptable JSON Data -type AmJSON struct { - Labels map[string]interface{} `json:"labels"` // Map of Labels for each alert - Annotations map[string]interface{} `json:"annotations"` // Map of Annotations for each alert - StartsAt time.Time `json:"startsAt"` // Starting time of an alert - EndsAt time.Time `json:"endsAt"` // Ending time of an alert -} - -// String returns string representation of AmJSON -func (a *AmJSON) String() string { - var s string - var nlabels, nannotations int - lock.RLock() - if a.Labels != nil { - nlabels = len(a.Labels) - } - if a.Annotations != nil { - nannotations = len(a.Annotations) - } - lock.RUnlock() - data, err := json.Marshal(a) - if err == nil { - return string(data) - } - diff := a.EndsAt.Sub(a.StartsAt) - s = fmt.Sprintf("am data has %d labels %d annotations, duration %v", nlabels, nannotations, diff) - return s -} - -// AmData data struct, array of AmJSON -type AmData struct { - Data []AmJSON // Array of struct AmJSON required for GET API call data storage -} - -// Matchers for Silence Data -type Matchers struct { - Name string `json:"name"` // Name of a matcher - Value string `json:"value"` // Value of a matcher -} - -// Status struct for the SilenceData -type status struct { - State string `json:"state"` // Status of the Silence -} - -// SilenceData data struct -type SilenceData struct { - ID string `json:"id"` // ID for each silence - Matchers []Matchers `json:"matchers"` // Array of matchers which helps in finding the alert for silencing - StartsAt time.Time `json:"startsAt"` // Starting time of a silence - EndsAt time.Time `json:"endsAt"` // Ending time of a silence - CreatedBy string `json:"createdBy"` // Name of the creater of the silence - Comment string `json:"comment"` // Comment for the silence - Status status `json:"status"` // Status of the silence -} - -// AllSilences data struct, array of SilenceData -type AllSilences struct { - Data []SilenceData // Array of struct SilenceData required for GET API call data storage -} - -//AllDashboardsFetched is an array of all Dashboards' information having given tags in common -type AllDashboardsFetched struct { - ID float64 `json:"id"` // ID for the dashboard - UID string `json:"uid"` // UID for the dashboard - Title string `json:"title"` // Title for the dashboard - URI string `json:"uri"` // URI of the dashboard - URL string `json:"url"` // URL of the dashboard - Slug string `json:"slug"` // Slug of the dashboard - Type string `json:"type"` // Type of the dashboard - Tags []string `json:"tags"` // All tags for the dashboard (eg. prod, jobs, cmsweb etc.) - IsStarred bool `json:"isStarred"` // if dashboard is starred - FolderID float64 `json:"folderId"` // ID for the folder - FolderUID string `json:"folderUid"` // UID for the folder - FolderTitle string `json:"folderTitle"` // Title of the folder - FolderURL string `json:"folderUrl"` // URL of the folder -} - -// String representation of AllDashbaordsFetched structure -func (a *AllDashboardsFetched) String() string { - s := fmt.Sprintf("dashboard id=%v, uid=%v, title=%s, tags=%v\n", a.ID, a.UID, a.Title, a.Tags) - return s -} - -// GrafanaDashboard data struct for storing Annotation's information to each dashboard -type GrafanaDashboard struct { - DashboardID float64 `json:"dashboardId"` // ID of a dashboard - Time int64 `json:"time"` // Start Time of the annotation - TimeEnd int64 `json:"timeEnd"` // End Time of the annotation - Tags []string `json:"tags"` // Dashboard tags to be annotated (eg. prod, jobs, cmsweb etc.) - Text string `json:"text"` // Annotation Text Field -} - -// String function provides representation of GrafanaDashboard data -func (g *GrafanaDashboard) String() string { - s := fmt.Sprintf("grafana dashboard %v, tags: %v, text %v between [%v-%v]", g.DashboardID, g.Tags, g.Text, g.Time, g.TimeEnd) - return s -} - -// TestingData data struct -type TestingData struct { - TestFile string `json:"testfile"` // Test cases file name for testing - LifetimeOfTestAlerts time.Duration `json:"lifetimeOfTestAlerts"` // Lifetime of test alerts (in minutes) - AnnotateTestStatus bool `json:"annotateTestStatus"` // Check for dashboards annotation during testing -} - -// server data struct -type server struct { - CMSMONURL string `json:"cmsmonURL"` // CMSMON URL for AlertManager API - GetAlertsAPI string `json:"getAlertsAPI"` // API endpoint from fetching alerts - GetSuppressedAlertsAPI string `json:"getSuppressedAlertsAPI"` // API endpoint from fetching suppressed alerts - PostAlertsAPI string `json:"postAlertsAPI"` // API endpoint from creating new alerts - GetSilencesAPI string `json:"getSilencesAPI"` // API endpoint from fetching silences - PostSilenceAPI string `json:"postSilenceAPI"` // API endpoint from silencing alerts - DeleteSilenceAPI string `json:"deleteSilenceAPI"` // API endpoint from deleting silences - HTTPTimeout int `json:"httpTimeout"` // Timeout for HTTP Requests - Interval time.Duration `json:"interval"` // Time Interval at which the intelligence service will repeat - Verbose int `json:"verbose"` // Verbosity Level - DryRun bool `json:"dryRun"` // DryRun boolean flag for dry run - Testing TestingData `json:"testing"` // Testing struct for storing details about test scenario -} - -type annotationDashboard struct { - URL string `json:"URL"` // Dashboards' Base URL for sending annotation - DashboardSearchAPI string `json:"dashboardSearchAPI"` // API endpoint for searching dashboards with tags - AnnotationAPI string `json:"annotationAPI"` // API endpoint for pushing annotations - Tags []string `json:"tags"` // Tags for the dashboards - Token string `json:"token"` // Admin's Token required for sending requests - DashboardsCacheExpiration time.Duration `json:"dashboardsCacheExpiration"` // Dashboard Cache Expiration in terms of hour(s) - IntelligenceModuleTag string `json:"intelligenceModuleTag"` // Tag attached to annotations which reflects it is made by the intelligence module (eg. "cmsmon-int") -} - -// alert data struct -type alert struct { - UniqueLabel string `json:"uniqueLabel"` // Label which defines an alert uniquely - SeverityLabel string `json:"severityLabel"` // Label for severity level of an alert - ServiceLabel string `json:"serviceLabel"` // Label for service of an alert - DefaultSeverityLevel string `json:"defaultSeverityLevel"` // Default severity level value in case service is not able to assign one - SeverityLevels map[string]int `json:"severityLevels"` // Map for defined severity levels and their priority - DurationThreshold float64 `json:"durationThreshold"` // defines max duration threshold for alerts - FilterKeywords []string `json:"filterKeywords"` // list of tags to apply to filter step, e.g. remove alerts with ATLAS keyword in annotation attribute values -} - -// silence data struct -type silence struct { - CreatedBy string `json:"createdBy"` // Name of the creater of the silence (Made configurable) - Comment string `json:"comment"` // Comment for the silence (Made configurable) - SilenceStatus []string `json:"silenceStatus"` // Labels for status of the silence -} - -// annotations struct for storing specific set of keywords and -// set of dashboards to annotate when these keywords are matched in alerts. -type annotationsData struct { - Actions []string `json:"actions"` // A set of keywords of actions taken (eg. outage, intervention, update, upgrade, down etc.) - Systems []string `json:"systems"` // A set of keywords of systems affected (eg. network, database, rucio etc.) - Tags []string `json:"tags"` // A list of tags of dashboards in Grafana -} - -type annotationMap struct { - Label string `json:"label"` // Unique field of the alert Data where descriptive information about it is given so that keywords are matched in here (eg. for SSB --> "shortDescription", for GGUS --> "Subject" etc.) - AnnotationsData []annotationsData `json:"annotations"` // annotationsData struct - URLLabel string `json:"urlLabel"` // Field which identifies URL in the alert data (eg. "URL" has been set for both GGUS & SSB). -} - -// Service data struct -type Service struct { - Name string `json:"name"` // Name of a service (eg. SSB, GGUS) - KeywordLabel string `json:"keywordLabel"` // Field in which the service tries to match keyword - DefaultLevel string `json:"defaultLevel"` // Default Severity Level assigned to the alert at the time of it's creation - SeverityMap map[string]string `json:"severityMap"` // Map for severity levels for a service - AnnotationMap annotationMap `json:"annotationMap"` // Map for Dashboard annotations' keywords -} - -// Config data struct -type Config struct { - Server server `json:"server"` // server struct - AnnotationDashboard annotationDashboard `json:"annotationDashboard"` // annotation Dashboard struct - Alerts alert `json:"alerts"` // Alert struct - Silence silence `json:"silence"` // Silence struct - Services []Service `json:"services"` // Array of Service -} - -// ChangeCounters data struct -type ChangeCounters struct { - NoOfAlerts int // No of alerts in AM - NoOfPushedAlerts int // No of alerts being pushed to AM - NoOfSilencesCreated int // No of new silences created in AM - NoOfSilencesDeleted int // No of new silences deleted from AM - NoOfActiveSilences int // No of active Silences in AM - NoOfExpiredSilences int // No of expired Silences in AM - NoOfPendingSilences int // No of pending Silences in AM -} diff --git a/src/go/intelligence/pipeline/add_annotations.go b/src/go/intelligence/pipeline/add_annotations.go deleted file mode 100644 index 971b23ca..00000000 --- a/src/go/intelligence/pipeline/add_annotations.go +++ /dev/null @@ -1,244 +0,0 @@ -package pipeline - -import ( - "bytes" - "encoding/json" - "fmt" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "net/http" - "strings" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// AddAnnotation - function for adding annotations to dashboards -func AddAnnotation(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("AddAnnotation step") - } - - dataAfterAnnotation := make(chan models.AmJSON) - ptr := &utils.DCache - ptr.UpdateDashboardCache() - - go func() { - defer close(dataAfterAnnotation) - - for each := range data { - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println(each.String()) - } - var srv models.Service - ifServiceFound := false - - for _, service := range utils.ConfigJSON.Services { - slabel, ok := utils.Get(each.Labels, utils.ConfigJSON.Alerts.ServiceLabel) - if ok && slabel == service.Name { - srv = service - ifServiceFound = true - break - } - } - if utils.ConfigJSON.Server.Verbose > 2 { - log.Printf("service found %v %+v\n", ifServiceFound, srv) - } - - if ifServiceFound { - - for _, annotationData := range srv.AnnotationMap.AnnotationsData { - - //If action keywords match in alerts - ifActionFound := checkIfAvailable(annotationData.Actions, each, srv.AnnotationMap.Label) - //If system keywords match in alerts - ifSystemFound := checkIfAvailable(annotationData.Systems, each, srv.AnnotationMap.Label) - - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("annotation data: actions=%+v systems=%+v\n", ifActionFound, ifSystemFound) - } - - if ifActionFound && ifSystemFound { - if len(utils.DCache.Dashboards) == 0 { - log.Println("No annotation dashboards is provided, annotation will be skipped") - } - if utils.ConfigJSON.Server.Verbose > 2 { - log.Println("dashboards", utils.DCache.Dashboards) - } - - for _, dashboard := range utils.DCache.Dashboards { - - /* - ifTagsIntersect(annotationDashboardTags, allDashboardsTags []string) bool {..} is used for checking if there's intersection between two list of strings each having tags for Grafana Dashboards. - - Check is done by finding the length of intersection result. - - If it is greater than 0 --> Means there are some tags after intersection. - Else --> there are no tags after intersection - - Each service has annotationMap field, and annotationMap has a "annotation" field which is a list of action & system keywords and tags for dashboards. - See below in the example. - - "annotationMap": { - . - . - . - - "annotations": [ - { - "actions": ["intervention", "outage"], - "systems": ["network", "database", "db"], ---> # - "tags": ["cmsweb-play"] - }, - - { - "actions": ["update", "upgrads"], - "systems": ["network", "database", "db"], ---> * - "tags": ["das"] - } - ], - - } - - However, in config "annotationDashboard" field also consists of "tags" field. It is a list of dashboard tags which the intelligent module tracks for annotating. - - "annotationDashboard": { - . - . - . - "tags": ["jobs","das", "cmsweb"] - } - - So, when we create a cache for Dashboards data based on these tags (here, "tags": ["jobs","das", "cmsweb"]), all dashboards info are saved. - Thus, we need to find intersection between tags of all these dashboards with those passed with specific services (SSB/GGUS). - - Using example above we can say the intersection result would be, - - ["cmsweb-play"] ---> # since len(["cmsweb-play"]) > 0 returns true - ["das"] ---> * since len(["cmsweb-play"]) > 0 returns true - */ - - ifCommonTagsFound := ifTagsIntersect(annotationData.Tags, dashboard.Tags) - - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("add annotation common tags: %v, annotation tags %v, dashbaord tags %v\n", ifCommonTagsFound, annotationData.Tags, dashboard.Tags) - } - - if ifCommonTagsFound == false { - continue - } - - var dashboardData models.GrafanaDashboard - - // Custom tags which consists of intelligence module - // tag, unique identifier for an alert, and tags of - // all those dashboards where the alert has been annotated. - var customTags []string - - //intelligence module tag (eg. "cmsmon-int") - customTags = append(customTags, utils.ConfigJSON.AnnotationDashboard.IntelligenceModuleTag) - - if val, ok := utils.Get(each.Labels, utils.ConfigJSON.Alerts.UniqueLabel); ok { - // Unique identifier for an alert - // (eg. ssbNumber for SSB alerts, TicketID for GGUS alerts etc.) - customTags = append(customTags, val) - } - - for _, eachTag := range annotationData.Tags { - // Appending all tags of the dashboard where the alert is going to get annotated. - customTags = append(customTags, eachTag) - } - - dashboardData.DashboardID = dashboard.ID - dashboardData.Time = each.StartsAt.Unix() * 1000 - dashboardData.TimeEnd = each.EndsAt.Unix() * 1000 - dashboardData.Tags = customTags - - if val, ok := utils.Get(each.Annotations, srv.AnnotationMap.Label); ok { - dashboardData.Text = srv.Name + ": " + val - if url, urlOk := utils.Get(each.Annotations, srv.AnnotationMap.URLLabel); urlOk { - dashboardData.Text = srv.Name + ": " + val + "\n" + makeHTMLhref(url) - } - } - addAnnotationHelper(dashboardData) - } - } - } - } - dataAfterAnnotation <- each - } - }() - return dataAfterAnnotation -} - -// makeHTMLhref for making url clickable it needs be tagged with html href attribute that's what this function does -func makeHTMLhref(url string) string { - return "URL" -} - -// ifTagsIntersect for checking intersection between two list of dashboard tags. -func ifTagsIntersect(annotationDashboardTags, allDashboardsTags []string) bool { - - var intersectionResult []string - - hashMap := make(map[string]bool) //a hashmap which helps to reduce the intersection operation from O(n^2) to O(n). - - for _, val := range allDashboardsTags { - hashMap[val] = true - } - - for _, each := range annotationDashboardTags { - if _, ok := hashMap[each]; ok { - intersectionResult = append(intersectionResult, each) - } - } - - return len(intersectionResult) > 0 -} - -// checkIfAvailable - function for finding if particular keyword is available or not in the given field of Alerts -func checkIfAvailable(data []string, amData models.AmJSON, label string) bool { - for _, each := range data { - if val, ok := utils.Get(amData.Annotations, label); ok { - if strings.Contains(strings.ToLower(val), strings.ToLower(each)) { - return true - } - } - } - return false -} - -// addAnnotationHelper - helper function -// The following block of code was taken from -// https://github.com/dmwm/CMSMonitoring/blob/master/src/go/MONIT/monit.go#L639 -func addAnnotationHelper(d models.GrafanaDashboard) { - var headers [][]string - bearer := fmt.Sprintf("Bearer %s", utils.ConfigJSON.AnnotationDashboard.Token) - headers = append(headers, []string{"Authorization", bearer}) - headers = append(headers, []string{"Content-Type", "application/json"}) - - apiURL := utils.ValidateURL(utils.ConfigJSON.AnnotationDashboard.URL, utils.ConfigJSON.AnnotationDashboard.AnnotationAPI) - dData, err := json.Marshal(d) - if err != nil { - log.Printf("Unable to convert the data into JSON %v, error: %v\n", d, err) - return - } - resp := utils.HttpCall("POST", apiURL, headers, bytes.NewBuffer(dData)) - defer resp.Body.Close() - - if resp.StatusCode == http.StatusForbidden { - utils.ConfigJSON.Server.Testing.AnnotateTestStatus = false - log.Printf("Unable to annotate the dashboard(s), NO PERMISSION") - return - } - - utils.ConfigJSON.Server.Testing.AnnotateTestStatus = true - tms := time.Unix(d.Time/1000, 0) // dashboard time is in milliseconds - tme := time.Unix(d.TimeEnd/1000, 0) // dashboard time is in milliseconds - log.Printf("add annotation '%s' from %v to %v to dashboard %v tags %v", d.Text, tms, tme, d.DashboardID, d.Tags) -} diff --git a/src/go/intelligence/pipeline/delete_old_silents.go b/src/go/intelligence/pipeline/delete_old_silents.go deleted file mode 100644 index 15bfa9b6..00000000 --- a/src/go/intelligence/pipeline/delete_old_silents.go +++ /dev/null @@ -1,88 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// DeleteSilence - function for deleting expired silences -func DeleteSilence(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("DeleteSilence step") - } - - finalData := make(chan models.AmJSON) - go func() { - defer close(finalData) - if utils.ConfigJSON.Server.DryRun == false { - deleteSilenceHelper() - } - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - finalData <- each - } - }() - return finalData -} - -// deleteSilenceHelper - helper function for deleting a silence -func deleteSilenceHelper() { - utils.DataReadWriteLock.RLock() - defer utils.DataReadWriteLock.RUnlock() - for silencedAlert, val := range utils.IfSilencedMap { - if utils.ExtAlertsMap[silencedAlert] == 1 { - continue - } - deleteSuppressedAlert(silencedAlert) - err := deleteSilenceAPICall(silencedAlert, val.SilenceID) - if err != nil { - log.Printf("Could not delete expired silence for: %s, error:%v\n", silencedAlert, err) - } - - utils.ChangeCounters.NoOfSilencesDeleted++ - } -} - -// deleteSuppressedAlert - helper function for deleting a suppressed alert -func deleteSuppressedAlert(silencedAlert string) { - utils.DataReadWriteLock.RLock() - defer utils.DataReadWriteLock.RUnlock() - if data, ifDataFound := utils.ExtSuppressedAlertsMap[silencedAlert]; ifDataFound { - data.EndsAt = time.Now() - err := utils.PostAlert(data) - if err != nil { - log.Printf("Could not delete suppressed alert, error:%v\n", err) - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Suppressed Alert Data: %s\n ", data) - } - } - } -} - -// deleteSilenceAPICall - helper function for making API call for deleting a silence -func deleteSilenceAPICall(silencedAlert, silenceID string) error { - - apiURL := utils.ValidateURL(utils.ConfigJSON.Server.CMSMONURL, utils.ConfigJSON.Server.DeleteSilenceAPI) - apiURL = utils.ValidateURL(apiURL, "/"+silenceID) - - var headers [][]string - headers = append(headers, []string{"Content-Type", "application/json"}) - resp := utils.HttpCall("DELETE", apiURL, headers, nil) - defer resp.Body.Close() - - if utils.ConfigJSON.Server.Verbose > 2 { - log.Printf("Silence Deleted for:\n%+v\n", silencedAlert) - } - - return nil -} diff --git a/src/go/intelligence/pipeline/fetch_alert.go b/src/go/intelligence/pipeline/fetch_alert.go deleted file mode 100644 index 8acc1bb8..00000000 --- a/src/go/intelligence/pipeline/fetch_alert.go +++ /dev/null @@ -1,45 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module -// Code is based on -// https://towardsdatascience.com/concurrent-data-pipelines-in-golang-85b18c2eecc2 - -// FetchAlert - function for fetching all active alerts from AlertManager -func FetchAlert() <-chan models.AmJSON { - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("FetchAlert step") - } - - fetchedData := make(chan models.AmJSON) - - _, err := utils.GetAlerts(utils.ConfigJSON.Server.GetSuppressedAlertsAPI, false) - if err != nil { - log.Printf("Could not fetch suppressed alerts from AlertManager, error:%v\n", err) - } - - data, err := utils.GetAlerts(utils.ConfigJSON.Server.GetAlertsAPI, true) - if err != nil { - log.Printf("Could not fetch alerts from AlertManager, error:%v\n", err) - } - utils.ChangeCounters.NoOfAlerts = len(utils.ExtAlertsMap) - - go func() { - defer close(fetchedData) - for _, each := range data.Data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - fetchedData <- each - } - }() - return fetchedData -} diff --git a/src/go/intelligence/pipeline/filter.go b/src/go/intelligence/pipeline/filter.go deleted file mode 100644 index 5ece0d9c..00000000 --- a/src/go/intelligence/pipeline/filter.go +++ /dev/null @@ -1,60 +0,0 @@ -package pipeline - -import ( - "fmt" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "strings" -) - -// Module : intelligence -// Author : Valentin Kuznetsov -// Created : Fri Mar 12 09:54:45 EST 2021 -// Description: CMS MONIT infrastructure Intelligence Module -// Code is based on -// https://towardsdatascience.com/concurrent-data-pipelines-in-golang-85b18c2eecc2 - -// Filter provide filtering of incoming data -func Filter(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("Filter pipeline") - } - out := make(chan models.AmJSON) - - go func() { - defer close(out) - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - match := false - // filter out each AM message if it contains filter tag - if len(utils.ConfigJSON.Alerts.FilterKeywords) > 0 { - for _, val := range each.Annotations { - for _, tag := range utils.ConfigJSON.Alerts.FilterKeywords { - if strings.Contains(fmt.Sprintf("%v", val), tag) { - log.Printf("filter alert, matching keyword %v\n%v", tag, each.String()) - match = true - } - } - } - } - if match { - continue - } - // filter out alerts which has large duration time - diff := each.EndsAt.Sub(each.StartsAt) - if diff.Hours() < utils.ConfigJSON.Alerts.DurationThreshold { - // if our alert time range (defined between starts and ends timestamps) - // is less than our threshold we'll keep it, otherwise the alert will be - // rejected (i.e. we'll not pass it to next pipeline level) - out <- each - } else { - log.Printf("filter alert, duration %v\n\n%v", diff, each.String()) - } - } - }() - return out -} diff --git a/src/go/intelligence/pipeline/keyword_matching.go b/src/go/intelligence/pipeline/keyword_matching.go deleted file mode 100644 index 1ae014a3..00000000 --- a/src/go/intelligence/pipeline/keyword_matching.go +++ /dev/null @@ -1,105 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "strings" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module -// Code is based on -// https://towardsdatascience.com/concurrent-data-pipelines-in-golang-85b18c2eecc2 - -// KeywordMatching - function finds defined keywords in the shortDescription of alerts and assign severity level accordingly -func KeywordMatching(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("KeyworkMatching pipeline") - } - dataWithSeverity := make(chan models.AmJSON) - - go func() { - defer close(dataWithSeverity) - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - changedData := each - for _, service := range utils.ConfigJSON.Services { - slabel, ok := utils.Get(each.Labels, utils.ConfigJSON.Alerts.ServiceLabel) - if ok && slabel == service.Name { - keywordMatchingHelper(&changedData, service) - } - } - dataWithSeverity <- changedData - } - }() - return dataWithSeverity -} - -// keywordMatchingHelper - helper function which matches keywords and assigns severity levels -func keywordMatchingHelper(data *models.AmJSON, srv models.Service) { - /* - Common Structure of an alert - - { ".", - ".", - "labels": { - "alertname", - "service", - "tag", - "severity" - ".", - "." - }, - "annotations": { - "shortDescription" or "Priority", - ".", - ".", - "." - } - } here "." represents other fields can be/are introduced as well. - - Each alert will have a field in it's "Annotations" which can be helpful to determine it's severity. - Ex. SSB has "shortDescription", GGUS has "Priority". - - We have defined some set of rules or in other words some set of keywords which defines the severity of an alert and upon finding such keyword - in decided Annotation's field of the alerts, we assign corresponding severity level. - - Ex. When SSB's shortDescription says "Short network interruption in building 182-275" , we can extract relevant keyword like "interruption" - and we will assign mapped severity level to this keyword i.e. "interruption": "medium". - - */ - - assignSeverityLevel := "" - maxSeverityLevel := -1 - - for key, value := range data.Annotations { - if key == srv.KeywordLabel { - for k, v := range srv.SeverityMap { - if val, ok := value.(string); ok { - if strings.Contains(strings.ToLower(val), k) { - if utils.ConfigJSON.Alerts.SeverityLevels[v] > maxSeverityLevel { - maxSeverityLevel = utils.ConfigJSON.Alerts.SeverityLevels[v] - assignSeverityLevel = v - } - } - } - } - } - } - - for key := range data.Labels { - if key == utils.ConfigJSON.Alerts.SeverityLabel { - if assignSeverityLevel != "" { - utils.Set(data.Labels, utils.ConfigJSON.Alerts.SeverityLabel, assignSeverityLevel) - } else { - utils.Set(data.Labels, utils.ConfigJSON.Alerts.SeverityLabel, utils.ConfigJSON.Alerts.DefaultSeverityLevel) - } - } - } -} diff --git a/src/go/intelligence/pipeline/ml_box.go b/src/go/intelligence/pipeline/ml_box.go deleted file mode 100644 index 638e9874..00000000 --- a/src/go/intelligence/pipeline/ml_box.go +++ /dev/null @@ -1,36 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// MlBox - Machine Learning predicted Data -func MlBox(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("MlBox step") - } - /* - IMPLEMENT THE LOGIC OF ML PREDICTIONS - so far we are returning back the data without any predictions. - */ - - predictedData := make(chan models.AmJSON) - go func() { - defer close(predictedData) - for d := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(d.String()) - } - predictedData <- d - } - }() - return predictedData -} diff --git a/src/go/intelligence/pipeline/preprocess.go b/src/go/intelligence/pipeline/preprocess.go deleted file mode 100644 index b4b3c258..00000000 --- a/src/go/intelligence/pipeline/preprocess.go +++ /dev/null @@ -1,91 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// Preprocess - function make required changes to alerts and filter only SSB and GGUS alerts -func Preprocess(data <-chan models.AmJSON) <-chan models.AmJSON { - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("Preprocess step") - } - utils.IfSilencedMap = make(map[string]utils.SilenceMapVals) - - err := updateSilencedMap() - if err != nil { - log.Printf("Unable to update the IfSilenced Map, error: %v\n", err) - } - - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Current IfSilenced Map has %d entries", len(utils.IfSilencedMap)) - if utils.ConfigJSON.Server.Verbose > 2 { - log.Printf("Current IfSilenced Map: %v", utils.IfSilencedMap) - } - } - - preprocessedData := make(chan models.AmJSON) - go func() { - defer close(preprocessedData) - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - for _, service := range utils.ConfigJSON.Services { - - srvLabel, ok := utils.Get(each.Labels, utils.ConfigJSON.Alerts.ServiceLabel) - if ok && srvLabel == service.Name { - if val, ok := utils.Get(each.Labels, utils.ConfigJSON.Alerts.UniqueLabel); ok { - utils.DataReadWriteLock.RLock() - _, alertFoundInSilencedMap := utils.IfSilencedMap[val] - utils.DataReadWriteLock.RUnlock() - if !alertFoundInSilencedMap { - preprocessedData <- each - } - - } - } - } - } - }() - - return preprocessedData -} - -// updateSilencedMap -function for updating the ifSilenced Map to help us not to push redundant silences -func updateSilencedMap() error { - - data, err := utils.GetSilences() - if err != nil { - log.Printf("Unable to Update Silence Map, error: %v", err) - } - - for _, each := range data.Data { - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[0] { - utils.ChangeCounters.NoOfActiveSilences++ - } - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[1] { - utils.ChangeCounters.NoOfExpiredSilences++ - } - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[2] { - utils.ChangeCounters.NoOfPendingSilences++ - } - for _, matcher := range each.Matchers { - if matcher.Name == utils.ConfigJSON.Alerts.UniqueLabel { - if each.Status.State == utils.ConfigJSON.Silence.SilenceStatus[0] { - utils.DataReadWriteLock.Lock() - utils.IfSilencedMap[matcher.Value] = utils.SilenceMapVals{IfAvail: 1, SilenceID: each.ID} - utils.DataReadWriteLock.Unlock() - } - } - } - } - - return nil -} diff --git a/src/go/intelligence/pipeline/push_alert.go b/src/go/intelligence/pipeline/push_alert.go deleted file mode 100644 index 83db22b1..00000000 --- a/src/go/intelligence/pipeline/push_alert.go +++ /dev/null @@ -1,41 +0,0 @@ -package pipeline - -import ( - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// PushAlert - function for pushing modified alerts back to AlertManager -func PushAlert(data <-chan models.AmJSON) <-chan models.AmJSON { - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("PushAlert step") - } - c := make(chan models.AmJSON) - - go func() { - defer close(c) - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - if utils.ConfigJSON.Server.DryRun == false { - err := utils.PostAlert(each) - if err != nil { - log.Printf("Could not push alert, error:%v\n", err) - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Alert Data: %s\n ", each) - } - } - } - utils.ChangeCounters.NoOfPushedAlerts++ - c <- each - } - }() - return c -} diff --git a/src/go/intelligence/pipeline/silence_alert.go b/src/go/intelligence/pipeline/silence_alert.go deleted file mode 100644 index 7bf2d5a4..00000000 --- a/src/go/intelligence/pipeline/silence_alert.go +++ /dev/null @@ -1,109 +0,0 @@ -package pipeline - -import ( - "bytes" - "encoding/json" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/utils" - "log" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// Silence - function silences the old alert -func Silence(data <-chan models.AmJSON) <-chan models.AmJSON { - - if utils.ConfigJSON.Server.Verbose > 0 { - log.Println("Silence step") - } - silencedData := make(chan models.AmJSON) - go func() { - defer close(silencedData) - for each := range data { - if utils.ConfigJSON.Server.Verbose > 1 { - log.Println(each.String()) - } - if utils.ConfigJSON.Server.DryRun == false { - err := silenceAlert(each) - if err != nil { - log.Printf("Could not silence alert, error:%v\n", err) - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Silence Data: %s\n ", each) - } - } - } - utils.ChangeCounters.NoOfSilencesCreated++ - silencedData <- each - } - }() - return silencedData -} - -// silenceAlert - helper function for silencing old alerts -func silenceAlert(data models.AmJSON) error { - - apiURL := utils.ValidateURL(utils.ConfigJSON.Server.CMSMONURL, utils.ConfigJSON.Server.PostSilenceAPI) - - var sData models.SilenceData - var alertnameMatcher models.Matchers - var severityMatcher models.Matchers - - if data.StartsAt.After(time.Now()) { - // wtart Time equal to current time if it's starting in future - // so that the silence remains on old alert till the lifetime of the alert - sData.StartsAt = time.Now() - } else { - // start Time equal to main alert - // so that the silence remains on old alert till the lifetime of the alert - sData.StartsAt = data.StartsAt - } - - // end Time equal to main alert - // so that the silence remains on old alert till the lifetime of the alert - sData.EndsAt = data.EndsAt - sData.CreatedBy = utils.ConfigJSON.Silence.CreatedBy - sData.Comment = utils.ConfigJSON.Silence.Comment - - alertnameMatcher.Name = utils.ConfigJSON.Alerts.UniqueLabel - severityMatcher.Name = utils.ConfigJSON.Alerts.SeverityLabel - - for k, v := range data.Labels { - if k == utils.ConfigJSON.Alerts.UniqueLabel { - if val, ok := v.(string); ok { - alertnameMatcher.Value = val - } - } - - if k == utils.ConfigJSON.Alerts.SeverityLabel { - for _, service := range utils.ConfigJSON.Services { - slabel, ok := utils.Get(data.Labels, utils.ConfigJSON.Alerts.ServiceLabel) - if ok && slabel == service.Name { - severityMatcher.Value = service.DefaultLevel - } - } - } - } - - sData.Matchers = append(sData.Matchers, alertnameMatcher) - sData.Matchers = append(sData.Matchers, severityMatcher) - jsonStr, err := json.Marshal(sData) - if err != nil { - log.Printf("Unable to convert JSON Data, error: %v\n", err) - return err - } - - var headers [][]string - headers = append(headers, []string{"Content-Type", "application/json"}) - resp := utils.HttpCall("POST", apiURL, headers, bytes.NewBuffer(jsonStr)) - defer resp.Body.Close() - - if utils.ConfigJSON.Server.Verbose > 1 { - log.Printf("Silence Data:\n%+v\n", string(jsonStr)) - } - - return nil -} diff --git a/src/go/intelligence/testing.md b/src/go/intelligence/testing.md deleted file mode 100644 index 0716f990..00000000 --- a/src/go/intelligence/testing.md +++ /dev/null @@ -1,27 +0,0 @@ -### Testing intelligence module -We can easily test intelligence module by adding a fake/test alert -through `amtool` and checking if intelligence module will perform -all necessary steps to annotate the dashboards. Please adjust -and use the following command for testing: -``` -#!/bin/bash -aurl="http://cms-monitoring.cern.ch:30093" -startAt="2020-10-30T19:05:00Z" -endAt="2020-10-30T19:15:00Z" -ssbNumber=OTG111112 -stype="Planned Intervention" -severity="notification" -amtool alert add ssb-$ssbNumber tag=monitoring \ - service=SSB severity=$severity type="$stype" \ - ssbNumber="$ssbNumber" \ - --annotation=shortDescription="TEST ALERT : Network Intervention in building 697" \ - --annotation=description="Network Services" \ - --annotation=type="$stype" \ - --annotation=severity="$severity" \ - --annotation=ssbNumber="$ssbNumber" \ - --start $startAt --end $endAt \ - --alertmanager.url $aurl -``` - -The start and end time stamp should be adjusted. The description shoudl contain -intervention to create appropriate annotations on cms monitoring Grafana dashboards. diff --git a/src/go/intelligence/utils/utils.go b/src/go/intelligence/utils/utils.go deleted file mode 100644 index 17de639a..00000000 --- a/src/go/intelligence/utils/utils.go +++ /dev/null @@ -1,364 +0,0 @@ -package utils - -import ( - "bytes" - "encoding/json" - "fmt" - "github.com/dmwm/CMSMonitoring/src/go/intelligence/models" - "io" - "log" - "net/http" - "net/http/httputil" - "net/url" - "os" - "strings" - "sync" - "time" -) - -// Module : intelligence -// Author : Rahul Indra -// Created : Wed, 1 July 2020 11:04:01 GMT -// Description: CMS MONIT infrastructure Intelligence Module - -// ConfigJSON variable -var ConfigJSON models.Config - -// SilenceMapVals struct for storing ifAvail bool and silenceID in IfSilencedMap -type SilenceMapVals struct { - IfAvail int - SilenceID string -} - -// ChangeCounters variable for storing counters for logging before and after running intelligence module -var ChangeCounters models.ChangeCounters - -// IfSilencedMap - variable for storing ongoing silences -var IfSilencedMap map[string]SilenceMapVals - -// ExtAlertsMap - map for storing existing Alerts in AlertManager -var ExtAlertsMap map[string]int - -// ExtSuppressedAlertsMap - map for storing existing suppressed Alerts in AlertManager -var ExtSuppressedAlertsMap map[string]models.AmJSON - -// FirstRunSinceRestart - store information if the it's the first start of the service after restart or not -var FirstRunSinceRestart bool - -// DataReadWriteLock variable for solving Concurrent Read/Write on Map issue. -var DataReadWriteLock sync.RWMutex - -// DCache - variable for DashboardsCache -var DCache DashboardsCache - -// DashboardsCache - a cache for storing dashboards and Expiration time for updating the cache -type DashboardsCache struct { - Dashboards map[float64]models.AllDashboardsFetched - Expiration time.Time -} - -// UpdateDashboardCache - function for updating the dashboards cache on expiration -func (dCache *DashboardsCache) UpdateDashboardCache() { - - if !FirstRunSinceRestart && dCache.Expiration.After(time.Now()) { - return - } - - DataReadWriteLock.Lock() - defer DataReadWriteLock.Unlock() - dCache.Dashboards = make(map[float64]models.AllDashboardsFetched) - - for _, tag := range ConfigJSON.AnnotationDashboard.Tags { - tmp := findDashboards(tag) - - for _, each := range tmp { - dCache.Dashboards[each.ID] = each - } - } - if ConfigJSON.Server.Verbose > 0 { - log.Println("updated dashboard cache with", len(dCache.Dashboards), "maps") - if ConfigJSON.Server.Verbose > 1 { - for _, d := range dCache.Dashboards { - log.Println(d.String()) - } - } - } - - dCache.Expiration = time.Now().Add(ConfigJSON.AnnotationDashboard.DashboardsCacheExpiration * time.Hour) -} - -// ValidateURL - function for constructing and validating AM URL -func ValidateURL(baseURL, apiURL string) string { - - cmpltURL := baseURL + apiURL - - u, err := url.ParseRequestURI(cmpltURL) - if err != nil { - log.Fatalf("AlertManager API URL is not valid, error:%v", err) - } - - return u.String() -} - -// ParseConfig - Function for parsing the config File -func ParseConfig(configFile string, verbose int) { - - //Defaults in case no config file is provided - ConfigJSON.Server.CMSMONURL = "https://cms-monitoring.cern.ch" - ConfigJSON.Server.GetAlertsAPI = "/api/v1/alerts?active=true&silenced=false&inhibited=false&unprocessed=false" - ConfigJSON.Server.GetSuppressedAlertsAPI = "/api/v1/alerts?active=false&silenced=true" - ConfigJSON.Server.PostAlertsAPI = "/api/v1/alerts" - ConfigJSON.Server.GetSilencesAPI = "/api/v1/silences" - ConfigJSON.Server.PostSilenceAPI = "/api/v1/silences" - ConfigJSON.Server.DeleteSilenceAPI = "/api/v1/silence" - ConfigJSON.Server.HTTPTimeout = 3 // timeout in sec for HTTP requests - ConfigJSON.Server.Interval = 10 // interval in sec for the service - ConfigJSON.Server.Verbose = verbose - - ConfigJSON.AnnotationDashboard.URL = "https://monit-grafana.cern.ch" - ConfigJSON.AnnotationDashboard.DashboardSearchAPI = "/api/search" - ConfigJSON.AnnotationDashboard.AnnotationAPI = "/api/annotations" - ConfigJSON.AnnotationDashboard.DashboardsCacheExpiration = 1 - ConfigJSON.AnnotationDashboard.IntelligenceModuleTag = "cmsmon-int" - - ConfigJSON.Alerts.UniqueLabel = "alertname" - ConfigJSON.Alerts.SeverityLabel = "severity" - ConfigJSON.Alerts.ServiceLabel = "service" - ConfigJSON.Alerts.DefaultSeverityLevel = "info" - ConfigJSON.Alerts.DurationThreshold = 24 // duration threshold (in hours) used by filter pipeline - ConfigJSON.Alerts.FilterKeywords = []string{} - - ConfigJSON.Silence.Comment = "maintenance" - ConfigJSON.Silence.CreatedBy = "admin" - - if stats, err := os.Stat(configFile); err == nil { - if ConfigJSON.Server.Verbose > 1 { - log.Printf("FileInfo: %s\n", stats) - } - jsonFile, e := os.Open(configFile) - if e != nil { - log.Fatalf("Config File not found, error: %s", e) - } - defer jsonFile.Close() - decoder := json.NewDecoder(jsonFile) - err := decoder.Decode(&ConfigJSON) - if err != nil { - log.Fatalf("Config JSON File can't be loaded, error: %s", err) - } else if ConfigJSON.Server.Verbose > 0 { - log.Printf("Load config from %s\n", configFile) - } - } else { - log.Fatalf("%s: Config File doesn't exist, error: %v", configFile, err) - } - - // Custom verbose value overriden - if verbose > 0 { - ConfigJSON.Server.Verbose = verbose - } - - if ConfigJSON.Server.Verbose > 0 { - log.SetFlags(log.LstdFlags | log.Lshortfile) - } else { - log.SetFlags(log.LstdFlags) - } - - if ConfigJSON.Server.Verbose > 1 { - log.Printf("Configuration:\n%+v\n", ConfigJSON) - } - -} - -// findDashboard - helper function to find dashboard info -// The following block of code was taken from -// https://github.com/dmwm/CMSMonitoring/blob/master/src/go/MONIT/monit.go#L604 -func findDashboards(tag string) []models.AllDashboardsFetched { - var headers [][]string - bearer := fmt.Sprintf("Bearer %s", ConfigJSON.AnnotationDashboard.Token) - headers = append(headers, []string{"Authorization", bearer}) - headers = append(headers, []string{"Accept", "application/json"}) - // example: /api/search?query=Production%20Overview&starred=true&tag=prod - v := url.Values{} - v.Set("tag", strings.Trim(tag, " ")) - apiURL := fmt.Sprintf("%s%s?%s", ConfigJSON.AnnotationDashboard.URL, ConfigJSON.AnnotationDashboard.DashboardSearchAPI, v.Encode()) - - resp := HttpCall("GET", apiURL, headers, nil) - defer resp.Body.Close() - - // Deserialize the response into a map. - var data []models.AllDashboardsFetched - if err := json.NewDecoder(resp.Body).Decode(&data); err != nil { - d, _ := io.ReadAll(resp.Body) - log.Printf("Error parsing the response body: %s, %+v\n", err, string(d)) - } - return data -} - -// GetSilences - function for get request on /api/v1/silences alertmanager endpoint for fetching all silences. -func GetSilences() (models.AllSilences, error) { - var data models.AllSilences - apiURL := ValidateURL(ConfigJSON.Server.CMSMONURL, ConfigJSON.Server.GetSilencesAPI) - - var headers [][]string - headers = append(headers, []string{"Accept-Encoding", "identify"}) - headers = append(headers, []string{"Accept", "application/json"}) - resp := HttpCall("GET", apiURL, headers, nil) - defer resp.Body.Close() - - byteValue, err := io.ReadAll(resp.Body) - if err != nil { - log.Printf("Unable to read JSON Data from AlertManager Silence GET API, error: %v\n", err) - return data, err - } - - err = json.Unmarshal(byteValue, &data) - if err != nil { - if ConfigJSON.Server.Verbose > 0 { - log.Println(string(byteValue)) - } - log.Printf("Unable to parse JSON Data from AlertManager Silence GET API, error: %v\n", err) - return data, err - } - - return data, nil -} - -// GetAlerts - function for get request on /api/v1/alerts alertmanager endpoint for fetching alerts. -func GetAlerts(getAlertsAPI string, updateMapChoice bool) (models.AmData, error) { - - var data models.AmData - apiURL := ValidateURL(ConfigJSON.Server.CMSMONURL, getAlertsAPI) - - var headers [][]string - headers = append(headers, []string{"Accept-Encoding", "identify"}) - headers = append(headers, []string{"Accept", "application/json"}) - resp := HttpCall("GET", apiURL, headers, nil) - defer resp.Body.Close() - - byteValue, err := io.ReadAll(resp.Body) - if err != nil { - log.Printf("Unable to read JSON Data from AlertManager GET API, error: %v\n", err) - return data, err - } - - err = json.Unmarshal(byteValue, &data) - if err != nil { - if ConfigJSON.Server.Verbose > 0 { - log.Println(string(byteValue)) - } - log.Printf("Unable to parse JSON Data from AlertManager GET API, error: %v\n", err) - return data, err - } - - if updateMapChoice == true { - DataReadWriteLock.Lock() - defer DataReadWriteLock.Unlock() - ExtAlertsMap = make(map[string]int) - for _, eachAlert := range data.Data { - for k, v := range eachAlert.Labels { - if k == ConfigJSON.Alerts.UniqueLabel { - if val, ok := v.(string); ok { - ExtAlertsMap[val] = 1 - } - } - } - } - } else { - DataReadWriteLock.Lock() - defer DataReadWriteLock.Unlock() - ExtSuppressedAlertsMap = make(map[string]models.AmJSON) - for _, eachAlert := range data.Data { - for k, v := range eachAlert.Labels { - if k == ConfigJSON.Alerts.UniqueLabel { - if val, ok := v.(string); ok { - ExtSuppressedAlertsMap[val] = eachAlert - } - } - } - } - } - - return data, nil -} - -// PostAlert - function for making post request on /api/v1/alerts alertmanager endpoint for creating alerts. -func PostAlert(data models.AmJSON) error { - apiURL := ValidateURL(ConfigJSON.Server.CMSMONURL, ConfigJSON.Server.PostAlertsAPI) - var finalData []models.AmJSON - finalData = append(finalData, data) - - jsonStr, err := json.Marshal(finalData) - if err != nil { - log.Printf("Unable to convert JSON Data, error: %v\n", err) - return err - } - var headers [][]string - headers = append(headers, []string{"Content-Type", "application/json"}) - resp := HttpCall("POST", apiURL, headers, bytes.NewBuffer(jsonStr)) - defer resp.Body.Close() - - if ConfigJSON.Server.Verbose > 1 { - log.Println("Pushed Alerts: ", string(jsonStr)) - } - return nil -} - -// Get helper function to safely get value from the dict -func Get(dict map[string]interface{}, key string) (string, bool) { - DataReadWriteLock.RLock() - defer DataReadWriteLock.RUnlock() - val, ok := dict[key] - if ok { - return val.(string), ok - } - return "", false -} - -// Set helper function to safely set value in dict -func Set(dict map[string]interface{}, key string, value string) { - DataReadWriteLock.Lock() - defer DataReadWriteLock.Unlock() - dict[key] = value -} - -// HttpCall helper function to make http call -func HttpCall(method, apiURL string, headers [][]string, buf *bytes.Buffer) *http.Response { - var req *http.Request - var err error - if buf != nil { - // POST request - req, err = http.NewRequest(method, apiURL, buf) - } else { - // GET, DELETE requests - req, err = http.NewRequest(method, apiURL, nil) - } - if err != nil { - log.Printf("Unable to make request to %s, error: %s", apiURL, err) - } - for _, v := range headers { - if len(v) == 2 { - req.Header.Set(v[0], v[1]) - } - } - if ConfigJSON.Server.Verbose > 1 { - dump, err := httputil.DumpRequestOut(req, true) - if err == nil { - log.Println("request: ", string(dump)) - } - } - - timeout := time.Duration(ConfigJSON.Server.HTTPTimeout) * time.Second - client := &http.Client{Timeout: timeout} - - resp, err := client.Do(req) - if err != nil { - log.Printf("Unable to get response from %s, error: %s", apiURL, err) - } - if ConfigJSON.Server.Verbose > 2 { - dump, err := httputil.DumpResponse(resp, true) - if err == nil { - log.Println("response: ", string(dump)) - } - } - log.Println(method, apiURL, resp.Status) - return resp -}