Skip to content

Commit

Permalink
Add an option to filter unverified results using shannon entropy (#1875)
Browse files Browse the repository at this point in the history
* Add an option to filter unverified results using shannon entropy

* lint

* add test, update test, and optimize
  • Loading branch information
dustin-decker authored Oct 9, 2023
1 parent f09bce3 commit 52ed87e
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 5 deletions.
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ var (
noVerification = cli.Flag("no-verification", "Don't verify the results.").Bool()
onlyVerified = cli.Flag("only-verified", "Only output verified results.").Bool()
filterUnverified = cli.Flag("filter-unverified", "Only output first unverified result per chunk per detector if there are more than one results.").Bool()
filterEntropy = cli.Flag("filter-entropy", "Filter unverified results with Shannon entropy. Start with 3.0.").Float64()
configFilename = cli.Flag("config", "Path to configuration file.").ExistingFile()
// rules = cli.Flag("rules", "Path to file with custom rules.").String()
printAvgDetectorTime = cli.Flag("print-avg-detector-time", "Print the average time spent on each detector.").Bool()
Expand Down Expand Up @@ -370,6 +371,7 @@ func run(state overseer.State) {
engine.WithOnlyVerified(*onlyVerified),
engine.WithPrintAvgDetectorTime(*printAvgDetectorTime),
engine.WithPrinter(printer),
engine.WithFilterEntropy(*filterEntropy),
)
if err != nil {
logFatal(err, "error initializing engine")
Expand Down
6 changes: 3 additions & 3 deletions pkg/detectors/detectors_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ func TestPrefixRegex(t *testing.T) {
}{
{
keywords: []string{"securitytrails"},
expected: `(?i)(?:securitytrails).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:securitytrails)(?:.|[\n\r]){0,40}`,
},
{
keywords: []string{"zipbooks"},
expected: `(?i)(?:zipbooks).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:zipbooks)(?:.|[\n\r]){0,40}`,
},
{
keywords: []string{"wrike"},
expected: `(?i)(?:wrike).|(?:[\n\r]){0,40}`,
expected: `(?i)(?:wrike)(?:.|[\n\r]){0,40}`,
},
}
for _, tt := range tests {
Expand Down
37 changes: 37 additions & 0 deletions pkg/detectors/falsepositives.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package detectors

import (
_ "embed"
"math"
"strings"
"unicode"
)
Expand Down Expand Up @@ -90,3 +91,39 @@ func bytesToCleanWordList(data []byte) []string {
}
return words
}

// StringShannonEntropy returns the Shannon entropy of input, in bits per
// character, computed over its Unicode code points (runes).
//
// The empty string has an entropy of 0.
func StringShannonEntropy(input string) float64 {
	if input == "" {
		// Nothing to measure; also avoids dividing by zero below.
		return 0
	}

	// Ranging over a string yields runes, so the total must be counted in
	// runes as well. Using len(input) (a byte count) would make the
	// probabilities of multi-byte input sum to less than 1.
	counts := make(map[rune]float64)
	total := 0
	for _, r := range input {
		counts[r]++
		total++
	}

	inverseTotal := 1 / float64(total) // precompute the inverse

	// Each term p*log2(p) is <= 0, so subtracting it accumulates the entropy
	// directly and never produces a negative zero for single-symbol input.
	entropy := 0.0
	for _, count := range counts {
		probability := count * inverseTotal
		entropy -= probability * math.Log2(probability)
	}

	return entropy
}

// FilterResultsWithEntropy removes determinately unverified results whose
// Shannon entropy is below the given value.
//
// A result is treated as determinately unverified when it is not verified and
// has no verification error. For such a result the entropy is computed on
// RawV2 when it is non-nil, and on Raw otherwise; the result is kept only if
// that entropy is at least entropy.
//
// Verified results and results with a verification error are not subject to
// the entropy filter and are returned unchanged. The relative order of the
// kept results is preserved, and the returned slice is never nil.
func FilterResultsWithEntropy(results []Result, entropy float64) []Result {
	filteredResults := make([]Result, 0, len(results))
	for _, result := range results {
		// Only determinately unverified results are filtered; everything else
		// must pass through, otherwise enabling the filter would discard
		// verified secrets.
		if result.Verified || result.VerificationError != nil {
			filteredResults = append(filteredResults, result)
			continue
		}

		var secret string
		if result.RawV2 != nil {
			secret = string(result.RawV2)
		} else {
			secret = string(result.Raw)
		}

		if StringShannonEntropy(secret) >= entropy {
			filteredResults = append(filteredResults, result)
		}
	}
	return filteredResults
}
45 changes: 44 additions & 1 deletion pkg/detectors/falsepositives_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

package detectors

import "testing"
import (
_ "embed"
"testing"
)

func TestIsFalsePositive(t *testing.T) {
type args struct {
Expand Down Expand Up @@ -40,3 +43,43 @@ func TestIsFalsePositive(t *testing.T) {
})
}
}

// TestStringShannonEntropy runs StringShannonEntropy over a table of inputs
// and compares each result with its expected entropy value.
func TestStringShannonEntropy(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  float64
	}{
		{
			// A single repeated symbol.
			name:  "entropy 1",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
			want:  0,
		},
		{
			// One differing symbol at the end.
			name:  "entropy 2",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaab",
			want:  0.22228483068568816,
		},
		{
			// The previous input repeated twice; the expected value is the same.
			name:  "entropy 3",
			input: "aaaaaaaaaaaaaaaaaaaaaaaaaaabaaaaaaaaaaaaaaaaaaaaaaaaaaab",
			want:  0.22228483068568816,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := StringShannonEntropy(tc.input)
			if got != tc.want {
				t.Errorf("StringShannonEntropy() = %v, want %v", got, tc.want)
			}
		})
	}
}
18 changes: 17 additions & 1 deletion pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ type Engine struct {
// filterUnverified is used to reduce the number of unverified results.
// If there are multiple unverified results for the same chunk for the same detector,
// only the first one will be kept.
filterUnverified bool
filterUnverified bool
// filterEntropy is used to filter out unverified results using Shannon entropy.
filterEntropy *float64
onlyVerified bool
printAvgDetectorTime bool

Expand Down Expand Up @@ -128,6 +130,15 @@ func WithFilterUnverified(filter bool) EngineOption {
}
}

// WithFilterEntropy returns an EngineOption that configures Shannon-entropy
// filtering of unverified results. The engine's filterEntropy field is set to
// the given value only when entropy is greater than zero; otherwise the option
// leaves the engine untouched.
func WithFilterEntropy(entropy float64) EngineOption {
	return func(eng *Engine) {
		// Written as a negated "> 0" so that NaN, like zero and negative
		// values, does not set the field.
		if !(entropy > 0) {
			return
		}
		eng.filterEntropy = &entropy
	}
}

// WithOnlyVerified sets the onlyVerified flag on the engine. If set to true,
// the engine will only print verified results.
func WithOnlyVerified(onlyVerified bool) EngineOption {
Expand Down Expand Up @@ -513,6 +524,7 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
if err != nil {
ctx.Logger().Error(err, "error scanning chunk")
}

if e.printAvgDetectorTime && len(results) > 0 {
elapsed := time.Since(start)
detectorName := results[0].DetectorType.String()
Expand All @@ -532,6 +544,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
results = detectors.CleanResults(results)
}

if e.filterEntropy != nil {
results = detectors.FilterResultsWithEntropy(results, *e.filterEntropy)
}

for _, res := range results {
e.processResult(ctx, data, res)
}
Expand Down

0 comments on commit 52ed87e

Please sign in to comment.