From bf77251543fa4efad590ee054767b6af04682ce1 Mon Sep 17 00:00:00 2001 From: ahrav Date: Tue, 11 Jun 2024 09:12:31 -0700 Subject: [PATCH] [feat] - Update span calculation logic to use offset magnitude (#2957) * Add a default start offset * update * use keywordIdx --- pkg/detectors/detectors.go | 6 + pkg/detectors/gcp/gcp.go | 12 +- .../gcpapplicationdefaultcredentials.go | 10 +- pkg/engine/ahocorasick/ahocorasickcore.go | 78 +++++---- .../ahocorasick/ahocorasickcore_test.go | 164 +++++++++++++++++- 5 files changed, 228 insertions(+), 42 deletions(-) diff --git a/pkg/detectors/detectors.go b/pkg/detectors/detectors.go index 0fb10b321fe0..465f9d88ed9a 100644 --- a/pkg/detectors/detectors.go +++ b/pkg/detectors/detectors.go @@ -38,6 +38,12 @@ type MaxSecretSizeProvider interface { MaxSecretSize() int64 } +// StartOffsetProvider is an optional interface that a detector can implement to +// provide a custom start offset for the secret it finds. +type StartOffsetProvider interface { + StartOffset() int64 +} + // MultiPartCredentialProvider is an optional interface that a detector can implement // to indicate its compatibility with multi-part credentials and provide the maximum // secret size for the credential it finds. diff --git a/pkg/detectors/gcp/gcp.go b/pkg/detectors/gcp/gcp.go index 86900c2421dc..e031553e443b 100644 --- a/pkg/detectors/gcp/gcp.go +++ b/pkg/detectors/gcp/gcp.go @@ -20,6 +20,7 @@ type Scanner struct{} var _ detectors.Detector = (*Scanner)(nil) var _ detectors.CustomFalsePositiveChecker = (*Scanner)(nil) var _ detectors.MaxSecretSizeProvider = (*Scanner)(nil) +var _ detectors.StartOffsetProvider = (*Scanner)(nil) var ( keyPat = regexp.MustCompile(`\{[^{]+auth_provider_x509_cert_url[^}]+\}`) @@ -50,10 +51,15 @@ func (s Scanner) Keywords() []string { return []string{"provider_x509"} } -const maxGCPKeySize = 4096 +const maxGCPKeySize = 2048 -// ProvideMaxSecretSize returns the maximum size of a secret that this detector can find. -func (s Scanner) MaxSecretSize() int64 { return maxGCPKeySize } +// MaxSecretSize returns the maximum size of a secret that this detector can find. +func (Scanner) MaxSecretSize() int64 { return maxGCPKeySize } + +const startOffset = 4096 + +// StartOffset returns the start offset for the secret this detector finds. +func (Scanner) StartOffset() int64 { return startOffset } // FromData will find and optionally verify GCP secrets in a given set of bytes. func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { diff --git a/pkg/detectors/gcpapplicationdefaultcredentials/gcpapplicationdefaultcredentials.go b/pkg/detectors/gcpapplicationdefaultcredentials/gcpapplicationdefaultcredentials.go index 142c8995e4c9..bf48780a3d0e 100644 --- a/pkg/detectors/gcpapplicationdefaultcredentials/gcpapplicationdefaultcredentials.go +++ b/pkg/detectors/gcpapplicationdefaultcredentials/gcpapplicationdefaultcredentials.go @@ -26,6 +26,7 @@ type Scanner struct { // Ensure the Scanner satisfies the interface at compile time. var _ detectors.Detector = (*Scanner)(nil) var _ detectors.MaxSecretSizeProvider = (*Scanner)(nil) +var _ detectors.StartOffsetProvider = (*Scanner)(nil) var ( defaultClient = common.SaneHttpClient() @@ -48,8 +49,13 @@ func (s Scanner) Keywords() []string { const maxGCPADCKeySize = 1024 -// ProvideMaxSecretSize returns the maximum size of a secret that this detector can find. -func (s Scanner) MaxSecretSize() int64 { return maxGCPADCKeySize } +// MaxSecretSize returns the maximum size of a secret that this detector can find. +func (Scanner) MaxSecretSize() int64 { return maxGCPADCKeySize } + +const startOffset = maxGCPADCKeySize + +// StartOffset returns the start offset for the secret this detector finds. +func (Scanner) StartOffset() int64 { return startOffset } // FromData will find and optionally verify Gcpapplicationdefaultcredentials secrets in a given set of bytes. func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index 33dc20348de1..d6ce0dd96b18 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -34,11 +34,11 @@ type spanCalculator interface { } // spanCalculationParams provides the necessary context for calculating match spans, -// including the starting index in the chunk, the chunk data itself, and the detector being used. +// including the keyword index in the chunk, the chunk data itself, and the detector being used. type spanCalculationParams struct { - startIdx int64 - chunkData []byte - detector detectors.Detector + keywordIdx int64 // Index of the keyword in the chunk data + chunkData []byte + detector detectors.Detector } // EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data. @@ -51,34 +51,44 @@ func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))} } -// maxMatchLengthSpanCalculator is a strategy that calculates match spans based on a default max -// match length or values provided by detectors. This allows for more granular control over the match span. -type maxMatchLengthSpanCalculator struct{ maxMatchLength int64 } +// adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude +// or values provided by specific detectors to adjust the start and end indices of the span, allowing +// for more granular control over the match. +type adjustableSpanCalculator struct{ offsetMagnitude int64 } -// newMaxMatchLengthSpanCalculator creates a new instance of maxMatchLengthSpanCalculator with the -// specified max match length. -func newMaxMatchLengthSpanCalculator(maxMatchLength int64) *maxMatchLengthSpanCalculator { - return &maxMatchLengthSpanCalculator{maxMatchLength: maxMatchLength} +// newAdjustableSpanCalculator creates a new instance of adjustableSpanCalculator with the +// specified offset magnitude. +func newAdjustableSpanCalculator(offsetRadius int64) *adjustableSpanCalculator { + return &adjustableSpanCalculator{offsetMagnitude: offsetRadius} } -// calculateSpans computes the match spans based on the start index and the max match length. -// If the detector provides an override value, it uses that instead of the default max match length. -func (m *maxMatchLengthSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { - maxSize := m.maxMatchLength - - switch d := params.detector.(type) { - case detectors.MultiPartCredentialProvider: - maxSize = d.MaxCredentialSpan() - case detectors.MaxSecretSizeProvider: - maxSize = d.MaxSecretSize() - default: // Use the default max match length +// calculateSpan computes the match span based on the keyword index and the offset magnitude. +// If the detector provides an override value, it uses that instead of the default offset magnitude to +// calculate the maximum size of the span. +// The start index of the span is also adjusted if the detector provides a start offset. +func (m *adjustableSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { + keywordIdx := params.keywordIdx + + maxSize := keywordIdx + m.offsetMagnitude + startOffset := keywordIdx - m.offsetMagnitude + + // Check if the detector implements each interface and update values accordingly. + // This CAN'T be done in a switch statement because a detector can implement multiple interfaces. + if provider, ok := params.detector.(detectors.MultiPartCredentialProvider); ok { + maxSize = provider.MaxCredentialSpan() + keywordIdx + startOffset = keywordIdx - provider.MaxCredentialSpan() } - endIdx := params.startIdx + maxSize - if endIdx > int64(len(params.chunkData)) { - endIdx = int64(len(params.chunkData)) + if provider, ok := params.detector.(detectors.MaxSecretSizeProvider); ok { + maxSize = provider.MaxSecretSize() + keywordIdx } + if provider, ok := params.detector.(detectors.StartOffsetProvider); ok { + startOffset = keywordIdx - provider.StartOffset() + } + + startIdx := max(startOffset, 0) + endIdx := min(maxSize, int64(len(params.chunkData))) - return matchSpan{startOffset: params.startIdx, endOffset: endIdx} + return matchSpan{startOffset: startIdx, endOffset: endIdx} } // CoreOption is a functional option type for configuring an AhoCorasickCore instance. @@ -123,19 +133,19 @@ func NewAhoCorasickCore(allDetectors []detectors.Detector, opts ...CoreOption) * } } - const maxMatchLength int64 = 512 - ac := &Core{ + const defaultOffsetRadius int64 = 512 + core := &Core{ keywordsToDetectors: keywordsToDetectors, detectorsByKey: detectorsByKey, prefilter: *ahocorasick.NewTrieBuilder().AddStrings(keywords).Build(), - spanCalculator: newMaxMatchLengthSpanCalculator(maxMatchLength), // Default span calculator + spanCalculator: newAdjustableSpanCalculator(defaultOffsetRadius), // Default span calculator } for _, opt := range opts { - opt(ac) + opt(core) } - return ac + return core } // DetectorMatch represents a detected pattern's metadata in a data chunk. @@ -234,9 +244,9 @@ func (ac *Core) FindDetectorMatches(chunkData []byte) []*DetectorMatch { startIdx := m.Pos() span := ac.spanCalculator.calculateSpan( spanCalculationParams{ - startIdx: startIdx, - chunkData: chunkData, - detector: detectorMatch.Detector, + keywordIdx: startIdx, + chunkData: chunkData, + detector: detectorMatch.Detector, }, ) detectorMatch.addMatchSpan(span) diff --git a/pkg/engine/ahocorasick/ahocorasickcore_test.go b/pkg/engine/ahocorasick/ahocorasickcore_test.go index 209ed326e78e..2d88002f5945 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore_test.go +++ b/pkg/engine/ahocorasick/ahocorasickcore_test.go @@ -63,6 +63,64 @@ func (testDetectorV3) Type() detectorspb.DetectorType { func (testDetectorV3) Version() int { return 1 } +var _ detectors.Detector = (*testDetectorV4)(nil) +var _ detectors.MultiPartCredentialProvider = (*testDetectorV4)(nil) +var _ detectors.StartOffsetProvider = (*testDetectorV4)(nil) + +type testDetectorV4 struct{} + +func (testDetectorV4) FromData(context.Context, bool, []byte) ([]detectors.Result, error) { + return make([]detectors.Result, 0), nil +} + +func (testDetectorV4) Keywords() []string { return []string{"password"} } + +func (testDetectorV4) Type() detectorspb.DetectorType { return TestDetectorType } + +func (testDetectorV4) Version() int { return 1 } + +func (testDetectorV4) MaxCredentialSpan() int64 { return 15 } + +func (testDetectorV4) StartOffset() int64 { return 5 } + +var _ detectors.Detector = (*testDetectorV5)(nil) +var _ detectors.MaxSecretSizeProvider = (*testDetectorV5)(nil) +var _ detectors.StartOffsetProvider = (*testDetectorV5)(nil) + +type testDetectorV5 struct{} + +func (testDetectorV5) FromData(context.Context, bool, []byte) ([]detectors.Result, error) { + return make([]detectors.Result, 0), nil +} + +func (testDetectorV5) Keywords() []string { return []string{"password"} } + +func (testDetectorV5) Type() detectorspb.DetectorType { return TestDetectorType } + +func (testDetectorV5) Version() int { return 1 } + +func (testDetectorV5) MaxSecretSize() int64 { return 10 } + +func (testDetectorV5) StartOffset() int64 { return 3 } + +var _ detectors.Detector = (*testDetectorV6)(nil) +var _ detectors.Detector = (*testDetectorV6)(nil) +var _ detectors.StartOffsetProvider = (*testDetectorV6)(nil) + +type testDetectorV6 struct{} + +func (testDetectorV6) FromData(context.Context, bool, []byte) ([]detectors.Result, error) { + return make([]detectors.Result, 0), nil +} + +func (testDetectorV6) Keywords() []string { return []string{"password"} } + +func (testDetectorV6) Type() detectorspb.DetectorType { return TestDetectorType } + +func (testDetectorV6) Version() int { return 1 } + +func (testDetectorV6) StartOffset() int64 { return 1 } + var _ detectors.Detector = (*testDetectorV1)(nil) var _ detectors.Detector = (*testDetectorV2)(nil) var _ detectors.Versioner = (*testDetectorV1)(nil) @@ -141,7 +199,7 @@ func TestFindDetectorMatches(t *testing.T) { }, sampleData: "This is a sample data containing keyword truffle", expectedResult: map[DetectorKey][][]int64{ - CreateDetectorKey(testDetectorV3{}): {{41, 48}}, + CreateDetectorKey(testDetectorV3{}): {{0, 48}}, }, }, { @@ -151,7 +209,7 @@ func TestFindDetectorMatches(t *testing.T) { }, sampleData: "This is a sample data containing keyword a", expectedResult: map[DetectorKey][][]int64{ - CreateDetectorKey(testDetectorV1{}): {{8, 42}}, + CreateDetectorKey(testDetectorV1{}): {{0, 42}}, }, }, { @@ -172,7 +230,7 @@ func TestFindDetectorMatches(t *testing.T) { eget ultricies ugue ugue id ugue. Meens liquet libero c libero molestie, nec mlesud ugue ugue eget. This is the second occurrence of the letter a.`, expectedResult: map[DetectorKey][][]int64{ - CreateDetectorKey(testDetectorV2{}): {{43, 555}, {854, 856}}, + CreateDetectorKey(testDetectorV2{}): {{0, 856}}, }, }, { @@ -219,6 +277,106 @@ func TestFindDetectorMatches(t *testing.T) { CreateDetectorKey(testDetectorV2{}): {{0, 856}}, }, }, + { + name: "keyword in the middle of the credential; MultiPartCredentialProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV4{}, + }, + sampleData: "This is a password in the middle of some data", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV4{}): {{5, 25}}, + }, + }, + { + name: "keyword at the end of the credential; MultiPartCredentialProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV4{}, + }, + sampleData: "This data ends with a password", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV4{}): {{17, 30}}, + }, + }, + { + name: "keyword near the start of the data; MultiPartCredentialProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV4{}, + }, + sampleData: "a password at the start", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV4{}): {{0, 17}}, + }, + }, + { + name: "keyword in the middle of the credential; MaxSecretSizeProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV5{}, + }, + sampleData: "This is a password in the middle of some data", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV5{}): {{7, 20}}, + }, + }, + { + name: "keyword at the end of the credential; MaxSecretSizeProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV5{}, + }, + sampleData: "This data ends with a password", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV5{}): {{19, 30}}, + }, + }, + { + name: "keyword near the start of the data; MaxSecretSizeProvider, StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV5{}, + }, + sampleData: "a password at the start", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV5{}): {{0, 12}}, + }, + }, + { + name: "keyword in the middle of the credential; StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV6{}, + }, + sampleData: "This is a password in the middle of some data", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV6{}): {{9, 45}}, + }, + }, + { + name: "keyword at the end of the credential; StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV6{}, + }, + sampleData: "This data ends with a password", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV6{}): {{21, 30}}, + }, + }, + { + name: "keyword near the start of the data; StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV6{}, + }, + sampleData: "a password at the start", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV6{}): {{1, 23}}, + }, + }, + { + name: "multiple keyword in the middle of the credential; StartOffsetProvider", + detectors: []detectors.Detector{ + testDetectorV6{}, + }, + sampleData: "This is a password in the middle of some data, and another password at the end!", + expectedResult: map[DetectorKey][][]int64{ + CreateDetectorKey(testDetectorV6{}): {{9, 79}}, + }, + }, { name: "No matches", detectors: []detectors.Detector{