[refactor] - Create separate handler for non-archive data (#2825)
* Remove specialized handler and archive struct and restructure handlers pkg.

* Refactor RPM archive handlers to use a library instead of shelling out

* make rpm handling context aware

* update test

* Refactor AR/deb archive handler to use an existing library instead of shelling out

* Update tests

* Handle non-archive data within the DefaultHandler

* make structs and methods private

* Remove non-archive data handling within sources

* add max size check

* add filename and size to context kvp

* move skip file check and is binary check before opening file

* fix test

* preserve existing functionality of not handling non-archive files in HandleFile

* Adjust check for rpm/deb archive type

* add additional deb mime type

* add gzip

* move DiskBufferReader setup into handler pkg

* remove DiskBufferReader creation logic within sources

* update comment

* move rewind closer

* reduce log verbosity

* add metrics for file handling

* add metrics for errors

* make defaultBufferSize a const

* fix tests

* add metrics for max archive depth and skipped files

* update error

* skip symlinks and dirs

* update err

* Address incompatible reader to openArchive

* remove nil check

* fix err assignment

* Allow git cat-file blob to complete before trying to handle the file

* wrap compReader with DiskBufferReader

* updates

* use buffer writer

* update

* refactor

* update context pkg

* revert stuff

* update test

* fix test

* remove

* use correct reader

* Update write method in contentWriter interface

* Add bufferReadSeekCloser

* update name

* update comment

* fix lint

* go mod tidy

* lint check

* update metric to ms

* update metric

* update comments

* don't use ptr

* update

* fix

* go mod tidy

* update go mod

* Add a buffered file reader

* update comments

* use Buffered File Reader

* return buffer

* update

* fix

* return

* go mod tidy

* merge

* use a shared pool

* use sync.Once

* reorganize

* remove unused code

* fix double init

* fix stuff

* nil check

* reduce allocations

* updates

* update metrics

* updates

* reset buffer instead of putting it back

* skip binaries

* skip

* concurrently process diffs

* close chan

* concurrently enumerate orgs

* increase workers

* ignore pbix and vsdx files

* add metrics for gitparse's Diffchan

* fix metric

* update metrics

* update

* fix checks

* fix

* inc

* update

* reduce

* Create workers to handle binary files

* modify workers

* updates

* add check

* delete code

* use custom reader

* rename struct

* add nonarchive handler

* fix break

* add comments

* add tests

* refactor

* remove log

* do not scan rpm links

* simplify

* rename var

* rename

* fix benchmark

* add buffer

* buffer

* buffer

* handle panic

* merge main

* add recover

* revert stuff

* revert

* revert to using reader

* fixes

* remove

* update

* fixes

* linter

* fix test

* fix comment

* update field name

* fix
ahrav authored May 15, 2024
1 parent 7025b0a commit ead9dd5
Showing 17 changed files with 644 additions and 465 deletions.
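
Before the per-file diffs, the PR's core move is worth stating: every input is routed through a type-specific handler in pkg/handlers instead of each source special-casing archives. The following dispatch sketch is hypothetical; only newARHandler, newArchiveHandler, and newDefaultHandler appear in this commit's diffs, and the MIME strings and RPM constructor name are illustrative guesses (the commit list's "Adjust check for rpm/deb archive type" and "add additional deb mime type" suggest this shape).

package handlers

import (
	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

// fileHandler and selectHandler are hypothetical, sketching the restructured
// pkg/handlers dispatch; constructor names other than newARHandler and
// newArchiveHandler are assumptions, as are the MIME strings.
type fileHandler interface {
	HandleFile(ctx logContext.Context, input fileReader) (chan []byte, error)
}

func selectHandler(mimeType string) fileHandler {
	switch mimeType {
	case "application/x-unix-archive", "application/vnd.debian.binary-package":
		return newARHandler() // AR/deb parsed in-process instead of shelling out to `ar`
	case "application/x-rpm":
		return newRPMHandler() // assumed name; RPM likewise moves to a library
	default:
		// Formats archiver v4 recognizes are unpacked recursively; anything
		// else now falls through to non-archive handling inside this package
		// rather than inside each source.
		return newArchiveHandler()
	}
}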
23 changes: 19 additions & 4 deletions pkg/handlers/ar.go
@@ -11,8 +11,7 @@ import (
 	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
 )
 
-// arHandler specializes defaultHandler to handle AR archive formats. By embedding defaultHandler,
-// arHandler inherits and can further customize the common handling behavior such as skipping binaries.
+// arHandler handles AR archive formats.
 type arHandler struct{ *defaultHandler }
 
 // newARHandler creates an arHandler.
@@ -22,7 +21,7 @@ func newARHandler() *arHandler {
 // HandleFile processes AR formatted files. This function needs to be implemented to extract or
 // manage data from AR files according to specific requirements.
-func (h *arHandler) HandleFile(ctx logContext.Context, input readSeekCloser) (chan []byte, error) {
+func (h *arHandler) HandleFile(ctx logContext.Context, input fileReader) (chan []byte, error) {
 	archiveChan := make(chan []byte, defaultBufferSize)
 
 	go func() {
@@ -33,7 +32,23 @@ func (h *arHandler) HandleFile(ctx logContext.Context, input readSeekCloser) (chan []byte, error) {
 		// Update the metrics for the file processing.
 		start := time.Now()
 		var err error
-		defer h.measureLatencyAndHandleErrors(start, err)
+		defer func() {
+			h.measureLatencyAndHandleErrors(start, err)
+			h.metrics.incFilesProcessed()
+		}()
+
+		// Defer a panic recovery to handle any panics that occur during the AR processing.
+		defer func() {
+			if r := recover(); r != nil {
+				// Return the panic as an error.
+				if e, ok := r.(error); ok {
+					err = e
+				} else {
+					err = fmt.Errorf("panic occurred: %v", r)
+				}
+				ctx.Logger().Error(err, "Panic occurred when reading ar archive")
+			}
+		}()
 
 		var arReader *deb.Ar
 		arReader, err = deb.LoadAr(input)
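
The ar.go diff is collapsed before the read loop, but the deb.LoadAr call above shows the new approach: parse the AR container in-process with pault.ag/go/debian/deb instead of shelling out to ar. A minimal standalone sketch of that pattern follows; the ArEntry usage (Name, Size, Data, and io.EOF termination) is an assumption drawn from that library's API, not code from this commit.

package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	"pault.ag/go/debian/deb"
)

func main() {
	f, err := os.Open("example.deb") // any AR-formatted input (.deb, .ar)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// deb.LoadAr parses the archive in-process, replacing `ar` subprocesses.
	arReader, err := deb.LoadAr(f)
	if err != nil {
		panic(err)
	}

	for {
		entry, err := arReader.Next()
		if errors.Is(err, io.EOF) {
			break // no more members in the archive
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("member: %s (%d bytes)\n", entry.Name, entry.Size)
		// In the handler, entry.Data would be chunked onto archiveChan.
		_, _ = io.Copy(io.Discard, entry.Data)
	}
}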
6 changes: 5 additions & 1 deletion pkg/handlers/ar_test.go
@@ -18,8 +18,12 @@ func TestHandleARFile(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
 	defer cancel()
 
+	rdr, err := newFileReader(file)
+	assert.NoError(t, err)
+	defer rdr.Close()
+
 	handler := newARHandler()
-	archiveChan, err := handler.HandleFile(context.AddLogger(ctx), file)
+	archiveChan, err := handler.HandleFile(context.AddLogger(ctx), rdr)
 	assert.NoError(t, err)
 
 	wantChunkCount := 102
192 changes: 192 additions & 0 deletions pkg/handlers/archive.go
@@ -0,0 +1,192 @@
package handlers

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/mholt/archiver/v4"

	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

type ctxKey int

const (
	depthKey          ctxKey = iota
	defaultBufferSize = 512
)

var (
	maxDepth   = 5
	maxSize    = 250 * 1024 * 1024 // 250 MB
	maxTimeout = time.Duration(30) * time.Second
)

// SetArchiveMaxSize sets the maximum size of the archive.
func SetArchiveMaxSize(size int) { maxSize = size }

// SetArchiveMaxDepth sets the maximum depth of the archive.
func SetArchiveMaxDepth(depth int) { maxDepth = depth }

// SetArchiveMaxTimeout sets the maximum timeout for the archive handler.
func SetArchiveMaxTimeout(timeout time.Duration) { maxTimeout = timeout }

// archiveHandler is a handler for common archive files that are supported by the archiver library.
type archiveHandler struct{ *defaultHandler }

func newArchiveHandler() *archiveHandler {
	return &archiveHandler{defaultHandler: newDefaultHandler(archiveHandlerType)}
}

// HandleFile processes the input as either an archive or non-archive based on its content,
// utilizing a single output channel. It first tries to identify the input as an archive. If it is an archive,
// it processes it accordingly; otherwise, it handles the input as non-archive content.
// The function returns a channel that will receive the extracted data bytes and an error if the initial setup fails.
func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) (chan []byte, error) {
	dataChan := make(chan []byte, defaultBufferSize)

	go func() {
		ctx, cancel := logContext.WithTimeout(ctx, maxTimeout)
		defer cancel()
		defer close(dataChan)

		// Update the metrics for the file processing.
		start := time.Now()
		var err error
		defer func() {
			h.measureLatencyAndHandleErrors(start, err)
			h.metrics.incFilesProcessed()
		}()

		if err = h.openArchive(ctx, 0, input, dataChan); err != nil {
			ctx.Logger().Error(err, "error unarchiving chunk.")
		}
	}()

	return dataChan, nil
}

var ErrMaxDepthReached = errors.New("max archive depth reached")

// openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
// it either decompresses or extracts the contents directly, sending data to the provided channel.
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(ctx logContext.Context, depth int, reader fileReader, archiveChan chan []byte) error {
	if common.IsDone(ctx) {
		return ctx.Err()
	}

	if depth >= maxDepth {
		h.metrics.incMaxArchiveDepthCount()
		return ErrMaxDepthReached
	}

	arReader := reader.BufferedFileReader
	if reader.format == nil && depth > 0 {
		return h.handleNonArchiveContent(ctx, arReader, archiveChan)
	}

	switch archive := reader.format.(type) {
	case archiver.Decompressor:
		// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
		compReader, err := archive.OpenReader(arReader)
		if err != nil {
			return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
		}
		defer compReader.Close()

		rdr, err := newFileReader(compReader)
		if err != nil {
			return fmt.Errorf("error creating custom reader: %w", err)
		}
		defer rdr.Close()

		return h.openArchive(ctx, depth+1, rdr, archiveChan)
	case archiver.Extractor:
		err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), arReader, nil, h.extractorHandler(archiveChan))
		if err != nil {
			return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
		}
		return nil
	default:
		return fmt.Errorf("unknown archive type: %s", reader.mimeType)
	}
}

// extractorHandler creates a closure that handles individual files extracted by an archiver.
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(archiveChan chan []byte) func(context.Context, archiver.File) error {
	return func(ctx context.Context, file archiver.File) error {
		lCtx := logContext.WithValues(
			logContext.AddLogger(ctx),
			"filename", file.Name(),
			"size", file.Size(),
		)
		lCtx.Logger().V(5).Info("Handling extracted file.")

		if file.IsDir() || file.LinkTarget != "" {
			lCtx.Logger().V(5).Info("skipping directory or symlink")
			return nil
		}

		if common.IsDone(ctx) {
			return ctx.Err()
		}

		depth := 0
		if ctxDepth, ok := ctx.Value(depthKey).(int); ok {
			depth = ctxDepth
		}

		fileSize := file.Size()
		if int(fileSize) > maxSize {
			lCtx.Logger().V(3).Info("skipping file due to size")
			return nil
		}

		if common.SkipFile(file.Name()) || common.IsBinary(file.Name()) {
			lCtx.Logger().V(5).Info("skipping file")
			h.metrics.incFilesSkipped()
			return nil
		}

		f, err := file.Open()
		if err != nil {
			return fmt.Errorf("error opening file %s: %w", file.Name(), err)
		}
		defer f.Close()

		// Archiver v4 is in alpha and using an experimental version of
		// rardecode. There is a bug somewhere with rar decoder format 29
		// that can lead to a panic. An issue is open in rardecode repo
		// https://github.com/nwaples/rardecode/issues/30.
		defer func() {
			if r := recover(); r != nil {
				// Return the panic as an error.
				if e, ok := r.(error); ok {
					err = e
				} else {
					err = fmt.Errorf("panic occurred: %v", r)
				}
				lCtx.Logger().Error(err, "Panic occurred when reading archive")
			}
		}()

		rdr, err := newFileReader(f)
		if err != nil {
			return fmt.Errorf("error creating custom reader: %w", err)
		}
		defer rdr.Close()

		h.metrics.incFilesProcessed()
		h.metrics.observeFileSize(fileSize)

		return h.openArchive(lCtx, depth, rdr, archiveChan)
	}
}
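
newFileReader is referenced throughout archive.go but defined elsewhere in this commit, and the reader.format checks above only make sense if format identification happens there. A hedged sketch of what that step presumably looks like with archiver v4's Identify; the ErrNoMatch handling is an assumption about the library's sentinel error, not code from this diff.

package handlers

import (
	"errors"
	"io"

	"github.com/mholt/archiver/v4"
)

// identifyFormat sketches the identification step assumed to live inside
// newFileReader (not shown in this section of the diff).
func identifyFormat(r io.Reader) (archiver.Format, io.Reader, error) {
	// archiver.Identify sniffs the stream header and returns the detected
	// format plus a reader that replays the consumed bytes.
	format, stream, err := archiver.Identify("", r)
	switch {
	case err == nil:
		// Known archive or compression format; openArchive dispatches on it.
		return format, stream, nil
	case errors.Is(err, archiver.ErrNoMatch):
		// Not an archive: a nil format is what openArchive treats as
		// non-archive content once depth > 0.
		return nil, stream, nil
	default:
		return nil, nil, err
	}
}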
128 changes: 128 additions & 0 deletions pkg/handlers/archive_test.go
@@ -0,0 +1,128 @@
package handlers

import (
	"context"
	"io"
	"net/http"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"

	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

func TestArchiveHandler(t *testing.T) {
	tests := map[string]struct {
		archiveURL     string
		expectedChunks int
		matchString    string
		expectErr      bool
	}{
		"gzip-single": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"gzip-nested": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"gzip-too-deep": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
			0,
			"",
			true,
		},
		"tar-single": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"tar-nested": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"tar-too-deep": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
			0,
			"",
			true,
		},
		"targz-single": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"gzip-large": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
			1543,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
		"zip-single": {
			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
			1,
			"AKIAYVP4CIPPH5TNP3SW",
			false,
		},
	}

	for name, testCase := range tests {
		t.Run(name, func(t *testing.T) {
			resp, err := http.Get(testCase.archiveURL)
			assert.NoError(t, err)
			assert.Equal(t, http.StatusOK, resp.StatusCode)
			defer resp.Body.Close()

			handler := newArchiveHandler()

			newReader, err := newFileReader(resp.Body)
			if err != nil {
				t.Errorf("error creating reusable reader: %s", err)
			}
			archiveChan, err := handler.HandleFile(logContext.Background(), newReader)
			if testCase.expectErr {
				assert.NoError(t, err)
				return
			}

			count := 0
			re := regexp.MustCompile(testCase.matchString)
			matched := false
			for chunk := range archiveChan {
				count++
				if re.Match(chunk) {
					matched = true
				}
			}

			assert.True(t, matched)
			assert.Equal(t, testCase.expectedChunks, count)
		})
	}
}

func TestOpenInvalidArchive(t *testing.T) {
	reader := strings.NewReader("invalid archive")

	ctx := logContext.AddLogger(context.Background())
	handler := archiveHandler{}

	rdr, err := newFileReader(io.NopCloser(reader))
	assert.NoError(t, err)
	defer rdr.Close()

	archiveChan := make(chan []byte)

	err = handler.openArchive(ctx, 0, rdr, archiveChan)
	assert.Error(t, err)
}
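
The tests above also show the caller contract: HandleFile returns immediately with a buffered channel, and the producer goroutine closes it when extraction finishes, fails, or hits maxTimeout. A minimal consumer sketch, not part of this commit, with the scanning side elided:

package handlers

import (
	logContext "github.com/trufflesecurity/trufflehog/v3/pkg/context"
)

// consumeChunks is a hypothetical caller illustrating how the handler's
// output channel is meant to be drained.
func consumeChunks(ctx logContext.Context, input fileReader) error {
	handler := newArchiveHandler()

	chunkChan, err := handler.HandleFile(ctx, input)
	if err != nil {
		return err // only initial-setup failures surface here
	}

	// Extraction errors are logged inside the handler rather than returned;
	// channel closure (completion, error, or maxTimeout) ends this loop.
	for chunk := range chunkChan {
		_ = chunk // in TruffleHog proper, each chunk would feed secret detection
	}
	return nil
}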