From 52ed87edb78bb0e88cd284ac7fd21b4523676a52 Mon Sep 17 00:00:00 2001 From: Dustin Decker Date: Sun, 8 Oct 2023 22:52:28 -0400 Subject: [PATCH] Add an option to filter unverified results using shannon entropy (#1875) * Add an option to filter unverified results using shannon entropy * lint * add test, update test, and optimize --- main.go | 2 ++ pkg/detectors/detectors_test.go | 6 ++-- pkg/detectors/falsepositives.go | 37 +++++++++++++++++++++++ pkg/detectors/falsepositives_test.go | 45 +++++++++++++++++++++++++++- pkg/engine/engine.go | 18 ++++++++++- 5 files changed, 103 insertions(+), 5 deletions(-) diff --git a/main.go b/main.go index 2d10ae753999..4f3b829136ad 100644 --- a/main.go +++ b/main.go @@ -48,6 +48,7 @@ var ( noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool() onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool() filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool() + filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. 
Start with 3.0.").Float64() configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile() // rules = cli.Flag("rules", "Path to file with custom rules.").String() printAvgDetectorTime = cli.Flag("print-avg-detector-time", "Print the average time spent on each detector.").Bool() @@ -370,6 +371,7 @@ func run(state overseer.State) { engine.WithOnlyVerified(*onlyVerified), engine.WithPrintAvgDetectorTime(*printAvgDetectorTime), engine.WithPrinter(printer), + engine.WithFilterEntropy(*filterEntropy), ) if err != nil { logFatal(err, "error initializing engine") diff --git a/pkg/detectors/detectors_test.go b/pkg/detectors/detectors_test.go index 767544c512d9..0e4d8a9369f7 100644 --- a/pkg/detectors/detectors_test.go +++ b/pkg/detectors/detectors_test.go @@ -12,15 +12,15 @@ func TestPrefixRegex(t *testing.T) { }{ { keywords: []string{"securitytrails"}, - expected: `(?i)(?:securitytrails).|(?:[\n\r]){0,40}`, + expected: `(?i)(?:securitytrails)(?:.|[\n\r]){0,40}`, }, { keywords: []string{"zipbooks"}, - expected: `(?i)(?:zipbooks).|(?:[\n\r]){0,40}`, + expected: `(?i)(?:zipbooks)(?:.|[\n\r]){0,40}`, }, { keywords: []string{"wrike"}, - expected: `(?i)(?:wrike).|(?:[\n\r]){0,40}`, + expected: `(?i)(?:wrike)(?:.|[\n\r]){0,40}`, }, } for _, tt := range tests { diff --git a/pkg/detectors/falsepositives.go b/pkg/detectors/falsepositives.go index e63de9ff15cc..c12cf6b53ce0 100644 --- a/pkg/detectors/falsepositives.go +++ b/pkg/detectors/falsepositives.go @@ -2,6 +2,7 @@ package detectors import ( _ "embed" + "math" "strings" "unicode" ) @@ -90,3 +91,39 @@ func bytesToCleanWordList(data []byte) []string { } return words } + +func StringShannonEntropy(input string) float64 { + chars := make(map[rune]float64) + inverseTotal := 1 / float64(len(input)) // precompute the inverse + + for _, char := range input { + chars[char]++ + } + + entropy := 0.0 + for _, count := range chars { + probability := count * inverseTotal + entropy += probability * math.Log2(probability) + 
} + + return -entropy +} + +// FilterResultsWithEntropy filters out determinately unverified results that have a shannon entropy below the given value. +func FilterResultsWithEntropy(results []Result, entropy float64) []Result { + filteredResults := []Result{} + for _, result := range results { + if !result.Verified && result.VerificationError == nil { + if result.RawV2 != nil { + if StringShannonEntropy(string(result.RawV2)) >= entropy { + filteredResults = append(filteredResults, result) + } + } else { + if StringShannonEntropy(string(result.Raw)) >= entropy { + filteredResults = append(filteredResults, result) + } + } + } + } + return filteredResults +} diff --git a/pkg/detectors/falsepositives_test.go b/pkg/detectors/falsepositives_test.go index 1d7567209bb9..1a5a5b9e57aa 100644 --- a/pkg/detectors/falsepositives_test.go +++ b/pkg/detectors/falsepositives_test.go @@ -3,7 +3,10 @@ package detectors -import "testing" +import ( + _ "embed" + "testing" +) func TestIsFalsePositive(t *testing.T) { type args struct { @@ -40,3 +43,43 @@ func TestIsFalsePositive(t *testing.T) { }) } } + +func TestStringShannonEntropy(t *testing.T) { + type args struct { + input string + } + tests := []struct { + name string + args args + want float64 + }{ + { + name: "entropy 1", + args: args{ + input: "aaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }, + want: 0, + }, + { + name: "entropy 2", + args: args{ + input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab", + }, + want: 0.22228483068568816, + }, + { + name: "entropy 3", + args: args{ + input: "aaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaab", + }, + want: 0.22228483068568816, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := StringShannonEntropy(tt.args.input); got != tt.want { + t.Errorf("StringShannonEntropy() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index aae5e4536d36..61a525048486 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -60,7 
+60,9 @@ type Engine struct { // filterUnverified is used to reduce the number of unverified results. // If there are multiple unverified results for the same chunk for the same detector, // only the first one will be kept. - filterUnverified bool + filterUnverified bool + // filterEntropy is used to filter out unverified results using Shannon entropy. + filterEntropy *float64 onlyVerified bool printAvgDetectorTime bool @@ -128,6 +130,15 @@ func WithFilterUnverified(filter bool) EngineOption { } } +// WithFilterEntropy filters out unverified results using Shannon entropy. +func WithFilterEntropy(entropy float64) EngineOption { + return func(e *Engine) { + if entropy > 0 { + e.filterEntropy = &entropy + } + } +} + // WithOnlyVerified sets the onlyVerified flag on the engine. If set to true, // the engine will only print verified results. func WithOnlyVerified(onlyVerified bool) EngineOption { @@ -513,6 +524,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { if err != nil { ctx.Logger().Error(err, "error scanning chunk") } + if e.printAvgDetectorTime && len(results) > 0 { elapsed := time.Since(start) detectorName := results[0].DetectorType.String() @@ -532,6 +544,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { results = detectors.CleanResults(results) } + if e.filterEntropy != nil { + results = detectors.FilterResultsWithEntropy(results, *e.filterEntropy) + } + for _, res := range results { e.processResult(ctx, data, res) }