Skip to content

Commit

Permalink
[feat] - Make skipping binaries configurable (#2226)
Browse files Browse the repository at this point in the history
* Make skipping binaries configurable

* remove ioutil

* fix

* address comments

* address comments

* use multi-reader

* remove print

* use const

* fix test

* fix my stupidness
  • Loading branch information
ahrav authored Dec 15, 2023
1 parent 78b5a95 commit 5c6ce69
Show file tree
Hide file tree
Showing 16 changed files with 767 additions and 475 deletions.
4 changes: 3 additions & 1 deletion hack/snifftest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,9 @@ func main() {
},
},
}
})
},
true,
)

logger.Info("scanning repo", "repo", r)
err = s.ScanRepo(ctx, repo, path, git.NewScanOptions(), sources.ChanReporter{Ch: chunksChan})
Expand Down
54 changes: 46 additions & 8 deletions pkg/common/vars.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,49 @@ var (
"gif",
"tiff",

"fnt", // Windows font file
"fon", // Generic font file
"ttf", // TrueType font
"otf", // OpenType font
"woff", // Web Open Font Format
"woff2", // Web Open Font Format 2
"eot", // Embedded OpenType font
"svgz", // Compressed Scalable Vector Graphics file
"icns", // Apple icon image file
"ico", // Icon file
}

binaryExtensions = map[string]struct{}{
// binaries
// These can theoretically contain secrets, but need decoding for users to make sense of them, and we don't have
// any such decoders right now.
"class",
"dll",
"xsb",
"jdo",
"jks",
"ser",
"idx",
"hprof",
"class": {}, // Java bytecode class file
"dll": {}, // Dynamic Link Library, Windows
"jdo": {}, // Java Data Object, Java serialization format
"jks": {}, // Java Key Store, Java keystore format
"ser": {}, // Java serialization format
"idx": {}, // Index file, often binary
"hprof": {}, // Java heap dump format
"exe": {}, // Executable, Windows
"bin": {}, // Binary, often used for compiled source code
"so": {}, // Shared object, Unix/Linux
"o": {}, // Object file from compilation/ intermediate object file
"a": {}, // Static library, Unix/Linux
"dylib": {}, // Dynamic library, macOS
"lib": {}, // Library, Unix/Linux
"obj": {}, // Object file, typically from compiled source code
"pdb": {}, // Program Database, Microsoft Visual Studio debugging format
"dat": {}, // Generic data file, often binary but not always
"elf": {}, // Executable and Linkable Format, common in Unix/Linux
"dmg": {}, // Disk Image for macOS
"iso": {}, // ISO image (optical disk image)
"img": {}, // Disk image files
"out": {}, // Common output file from compiled executable in Unix/Linux
"com": {}, // DOS command file, executable
"sys": {}, // Windows system file, often a driver
"vxd": {}, // Virtual device driver in Windows
"sfx": {}, // Self-extracting archive
"bundle": {}, // Mac OS X application bundle
}
)

Expand All @@ -58,3 +90,9 @@ func SkipFile(filename string) bool {
}
return false
}

// IsBinary reports whether filename has an extension present in binaryExtensions.
// The extension is obtained with filepath.Ext, its leading dot is removed, and the
// result is lower-cased before the lookup, so the match is case-insensitive.
func IsBinary(filename string) bool {
	ext := filepath.Ext(filename)
	ext = strings.TrimPrefix(ext, ".")
	if _, found := binaryExtensions[strings.ToLower(ext)]; found {
		return true
	}
	return false
}
33 changes: 33 additions & 0 deletions pkg/common/vars_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,36 @@ func BenchmarkSkipFile(b *testing.B) {
SkipFile("test.mp4")
}
}

// TestIsBinary checks that every extension in binaryExtensions is reported as binary
// and that a plain text file is not.
func TestIsBinary(t *testing.T) {
	type expectation struct {
		name     string
		isBinary bool
	}

	// One positive case per known binary extension, plus a single negative case.
	cases := make([]expectation, 0, len(binaryExtensions)+1)
	for ext := range binaryExtensions {
		cases = append(cases, expectation{name: "test." + ext, isBinary: true})
	}

	// A text file must not be classified as binary.
	cases = append(cases, expectation{name: "test.txt", isBinary: false})

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := IsBinary(c.name)
			if got == c.isBinary {
				return
			}
			t.Errorf("IsBinary(%v) got %v, want %v", c.name, got, c.isBinary)
		})
	}
}

// BenchmarkIsBinary measures a single extension lookup for a file name with a known binary extension.
func BenchmarkIsBinary(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = IsBinary("test.exe")
	}
}
1 change: 1 addition & 0 deletions pkg/engine/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ func (e *Engine) ScanGit(ctx context.Context, c sources.GitConfig) error {
IncludePathsFile: c.IncludePathsFile,
ExcludePathsFile: c.ExcludePathsFile,
MaxDepth: int64(c.MaxDepth),
SkipBinaries: c.SkipBinaries,
}
var conn anypb.Any
if err := anypb.MarshalFrom(&conn, connection, proto.MarshalOptions{}); err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/engine/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ func (e *Engine) ScanGitHub(ctx context.Context, c sources.GithubConfig) error {
IncludeIssueComments: c.IncludeIssueComments,
IncludePullRequestComments: c.IncludePullRequestComments,
IncludeGistComments: c.IncludeGistComments,
SkipBinaries: c.SkipBinaries,
}
if len(c.Token) > 0 {
connection.Credential = &sourcespb.GitHub_Token{
Expand Down
2 changes: 1 addition & 1 deletion pkg/engine/gitlab.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func (e *Engine) ScanGitLab(ctx context.Context, c sources.GitlabConfig) error {
}
scanOptions := git.NewScanOptions(opts...)

connection := &sourcespb.GitLab{}
connection := &sourcespb.GitLab{SkipBinaries: c.SkipBinaries}

switch {
case len(c.Token) > 0:
Expand Down
45 changes: 41 additions & 4 deletions pkg/handlers/archive.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package handlers

import (
"bufio"
"bytes"
"context"
"errors"
Expand All @@ -12,6 +13,7 @@ import (
"strings"
"time"

"github.com/gabriel-vasile/mimetype"
"github.com/h2non/filetype"
"github.com/mholt/archiver/v4"

Expand Down Expand Up @@ -43,11 +45,14 @@ var _ SpecializedHandler = (*Archive)(nil)
type Archive struct {
	// size is the handler's size counter.
	// NOTE(review): where it is incremented and what limit it is compared against is not visible here — confirm.
	size int
	// currentDepth presumably tracks archive nesting depth — TODO confirm against openArchive.
	currentDepth int
	// skipBinaries, when true, makes the handler skip files identified as binaries
	// (by file extension or detected MIME type) instead of emitting their contents.
	skipBinaries bool
}

// New sets a default maximum size and current size counter.
func (a *Archive) New() {
a.size = 0
// New creates a new Archive handler with the provided options.
func (a *Archive) New(opts ...Option) {
for _, opt := range opts {
opt(a)
}
}

// SetArchiveMaxSize sets the maximum size of the archive.
Expand Down Expand Up @@ -114,9 +119,34 @@ func (a *Archive) openArchive(ctx logContext.Context, depth int, reader io.Reade
}
}

// mimeTypeBufferSize is the number of leading bytes peeked from a reader for MIME type detection.
const mimeTypeBufferSize = 512

func (a *Archive) handleNonArchiveContent(ctx logContext.Context, reader io.Reader, archiveChan chan []byte) error {
bufReader := bufio.NewReaderSize(reader, mimeTypeBufferSize)
// A buffer of 512 bytes is used since many file formats store their magic numbers within the first 512 bytes.
// If fewer bytes are read, MIME type detection may still succeed.
buffer, err := bufReader.Peek(mimeTypeBufferSize)
if err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("unable to read file for MIME type detection: %w", err)
}

mime := mimetype.Detect(buffer)
mimeT := mimeType(mime.String())

if common.SkipFile(mime.Extension()) {
ctx.Logger().V(5).Info("skipping file", "ext", mimeT)
return nil
}

if a.skipBinaries {
if common.IsBinary(mime.Extension()) || mimeT == machOType || mimeT == octetStream {
ctx.Logger().V(5).Info("skipping binary file", "ext", mimeT)
return nil
}
}

chunkReader := sources.NewChunkReader()
chunkResChan := chunkReader(ctx, reader)
chunkResChan := chunkReader(ctx, bufReader)
for data := range chunkResChan {
if err := data.Error(); err != nil {
ctx.Logger().Error(err, "error reading chunk")
Expand Down Expand Up @@ -166,6 +196,11 @@ func (a *Archive) extractorHandler(archiveChan chan []byte) func(context.Context
return nil
}

if a.skipBinaries && common.IsBinary(f.Name()) {
lCtx.Logger().V(5).Info("skipping binary file", "filename", f.Name())
return nil
}

fileBytes, err := a.ReadToMax(lCtx, fReader)
if err != nil {
return err
Expand Down Expand Up @@ -222,6 +257,8 @@ type mimeType string
const (
	// arMimeType is the MIME type string "application/x-unix-archive".
	arMimeType mimeType = "application/x-unix-archive"
	// rpmMimeType is the MIME type string "application/x-rpm".
	rpmMimeType mimeType = "application/x-rpm"
	// machOType is the MIME type string for Mach-O binaries; content detected as this type
	// is skipped when binary skipping is enabled.
	machOType mimeType = "application/x-mach-binary"
	// octetStream is the generic binary MIME type; content detected as this type
	// is skipped when binary skipping is enabled.
	octetStream mimeType = "application/octet-stream"
)

// mimeTools maps MIME types to the necessary command-line tools to handle them.
Expand Down
79 changes: 79 additions & 0 deletions pkg/handlers/archive_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package handlers

import (
"archive/tar"
"bytes"
"context"
"encoding/binary"
"io"
"math/rand"
"net/http"
"os"
"regexp"
Expand Down Expand Up @@ -129,6 +132,82 @@ func TestHandleFile(t *testing.T) {
assert.Equal(t, 1, len(reporter.Ch))
}

// TestHandleFileSkipBinaries verifies that HandleFile reports no chunks for an archive whose
// only member is binary data when binary skipping is enabled via WithSkipBinaries(true).
func TestHandleFileSkipBinaries(t *testing.T) {
	filename := createBinaryArchive(t)
	defer os.Remove(filename)

	file, err := os.Open(filename)
	if err != nil {
		// Without the archive there is nothing to hand to HandleFile; stop instead of passing a nil file.
		t.Fatal(err)
	}
	// Release the descriptor before the deferred os.Remove runs (defers execute in LIFO order).
	defer file.Close()

	ctx, cancel := logContext.WithTimeout(logContext.Background(), 5*time.Second)
	defer cancel()
	sourceChan := make(chan *sources.Chunk, 1)

	go func() {
		// Closing the channel ends the range loop below once HandleFile returns.
		defer close(sourceChan)
		HandleFile(ctx, file, &sources.Chunk{}, sources.ChanReporter{Ch: sourceChan}, WithSkipBinaries(true))
	}()

	count := 0
	for range sourceChan {
		count++
	}
	// The binary archive should not be scanned.
	assert.Equal(t, 0, count)
}

// createBinaryArchive writes a tar archive containing a single member of binary data
// (1024 random bytes followed by ten int32/float64 pairs) and returns the archive's path.
// The archive is created in the system temp directory; the caller is responsible for
// removing it. Any failure aborts the calling test via t.Fatal.
func createBinaryArchive(t *testing.T) string {
	t.Helper()

	f, err := os.CreateTemp("", "testbinary")
	if err != nil {
		t.Fatal(err)
	}
	// The payload file is only needed while the archive is built. Defers run LIFO,
	// so the handle is closed before the file is removed.
	defer os.Remove(f.Name())
	defer f.Close()

	r := rand.New(rand.NewSource(time.Now().UnixNano()))

	randomBytes := make([]byte, 1024)
	if _, err := r.Read(randomBytes); err != nil {
		t.Fatal(err)
	}
	if _, err := f.Write(randomBytes); err != nil {
		t.Fatal(err)
	}

	// Create and write some structured binary data (e.g., integers, floats).
	for i := 0; i < 10; i++ {
		if err := binary.Write(f, binary.LittleEndian, int32(r.Intn(1000))); err != nil {
			t.Fatal(err)
		}
		if err := binary.Write(f, binary.LittleEndian, r.Float64()); err != nil {
			t.Fatal(err)
		}
	}

	// Use a unique temp file rather than a fixed name in the working directory so that
	// concurrent or repeated runs cannot collide and the source tree is not polluted.
	tarFile, err := os.CreateTemp("", "example-*.tar")
	if err != nil {
		t.Fatal(err)
	}
	defer tarFile.Close()

	// Create a new tar archive.
	tarWriter := tar.NewWriter(tarFile)

	fileInfo, err := f.Stat()
	if err != nil {
		t.Fatal(err)
	}

	header, err := tar.FileInfoHeader(fileInfo, "")
	if err != nil {
		t.Fatal(err)
	}

	if err := tarWriter.WriteHeader(header); err != nil {
		t.Fatal(err)
	}

	fileContent, err := os.ReadFile(f.Name())
	if err != nil {
		t.Fatal(err)
	}

	if _, err := tarWriter.Write(fileContent); err != nil {
		t.Fatal(err)
	}

	// Close explicitly (not via defer) so the tar footer is flushed and a failure is reported
	// instead of silently producing a truncated archive.
	if err := tarWriter.Close(); err != nil {
		t.Fatal(err)
	}

	return tarFile.Name()
}

func TestReadToMax(t *testing.T) {
tests := []struct {
name string
Expand Down
18 changes: 15 additions & 3 deletions pkg/handlers/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,22 @@ type SpecializedHandler interface {
HandleSpecialized(logContext.Context, io.Reader) (io.Reader, bool, error)
}

// Option mutates a Handler's configuration; options are applied when the handler is initialised.
type Option func(Handler)

// WithSkipBinaries returns an Option that sets whether binary files are skipped.
// The setting only applies to *Archive handlers; any other Handler is left untouched.
func WithSkipBinaries(skip bool) Option {
	return func(h Handler) {
		archive, isArchive := h.(*Archive)
		if !isArchive {
			return
		}
		archive.skipBinaries = skip
	}
}

type Handler interface {
FromFile(logContext.Context, io.Reader) chan []byte
IsFiletype(logContext.Context, io.Reader) (io.Reader, bool)
New()
New(...Option)
}

// HandleFile processes a given file by selecting an appropriate handler from DefaultHandlers.
Expand All @@ -36,9 +48,9 @@ type Handler interface {
// packages them in the provided chunk skeleton, and reports them to the chunk reporter.
// The function returns true if processing was successful and false otherwise.
// Context is used for cancellation, and the caller is responsible for canceling it if needed.
func HandleFile(ctx logContext.Context, file io.Reader, chunkSkel *sources.Chunk, reporter sources.ChunkReporter) bool {
func HandleFile(ctx logContext.Context, file io.Reader, chunkSkel *sources.Chunk, reporter sources.ChunkReporter, opts ...Option) bool {
for _, h := range DefaultHandlers() {
h.New()
h.New(opts...)

// The re-reader is used to reset the file reader after checking if the handler implements SpecializedHandler.
// This is necessary because the archive pkg doesn't correctly determine the file type when using
Expand Down
Loading

0 comments on commit 5c6ce69

Please sign in to comment.