diff --git a/codesearch/index/write.go b/codesearch/index/write.go index f056e35e..7bf3d8bc 100644 --- a/codesearch/index/write.go +++ b/codesearch/index/write.go @@ -5,6 +5,7 @@ package index import ( + "errors" "fmt" "io" "io/ioutil" @@ -14,6 +15,7 @@ import ( "unsafe" "github.com/hound-search/hound/codesearch/sparse" + "golang.org/x/text/encoding" ) // Index writing. See read.go for details of on-disk format. @@ -123,7 +125,7 @@ func (ix *IndexWriter) AddFile(name string) { func (ix *IndexWriter) Add(name string, f io.Reader) string { ix.trigram.Reset() var ( - c = byte(0) //nolint + c = byte(0) //nolint i = 0 buf = ix.inbuf[:0] tv = uint32(0) @@ -131,9 +133,9 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string { linelen = 0 numLines = 0 longLines = 0 - skipReason = "" //nolint + skipReason = "" //nolint ) - + const invalidUTF8 = "Invalid UTF-8" for { tv = (tv << 8) & (1<<24 - 1) if i >= len(buf) { @@ -144,6 +146,9 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string { break } log.Printf("%s: %v\n", name, err) + if errors.Is(err, encoding.ErrInvalidUTF8) { + return invalidUTF8 + } return "" } log.Printf("%s: 0-length read\n", name) @@ -159,7 +164,7 @@ func (ix *IndexWriter) Add(name string, f io.Reader) string { ix.trigram.Add(tv) } if !validUTF8((tv>>8)&0xFF, tv&0xFF) { - skipReason = "Invalid UTF-8" + skipReason = invalidUTF8 if ix.LogSkip { log.Printf("%s: %s\n", name, skipReason) } @@ -246,7 +251,7 @@ func (ix *IndexWriter) Flush() { os.Remove(ix.nameData.name) for _, d := range ix.postData { - unmmap(d) //nolint + unmmap(d) //nolint } for _, f := range ix.postFile { f.Close() @@ -310,7 +315,7 @@ func (ix *IndexWriter) flushPost() { } ix.post = ix.post[:0] - w.Seek(0, 0) //nolint + w.Seek(0, 0) //nolint ix.postFile = append(ix.postFile, w) } @@ -368,7 +373,7 @@ type postChunk struct { m []postEntry // remaining entries after e } -const postBuf = 4096 //nolint +const postBuf = 4096 //nolint // A postHeap is a heap (priority queue) of 
postChunks. type postHeap struct { @@ -388,7 +393,7 @@ func (h *postHeap) addMem(x []postEntry) { // step reads the next entry from ch and saves it in ch.e. // It returns false if ch is over. -func (h *postHeap) step(ch *postChunk) bool { //nolint +func (h *postHeap) step(ch *postChunk) bool { //nolint old := ch.e m := ch.m if len(m) == 0 { @@ -414,7 +419,7 @@ func (h *postHeap) add(ch *postChunk) { } // empty reports whether the postHeap is empty. -func (h *postHeap) empty() bool { //nolint +func (h *postHeap) empty() bool { //nolint return len(h.ch) == 0 } @@ -492,7 +497,7 @@ type bufWriter struct { name string file *os.File buf []byte - tmp [8]byte //nolint + tmp [8]byte //nolint } // bufCreate creates a new file with the given name and returns a @@ -578,7 +583,7 @@ func (b *bufWriter) flush() { func (b *bufWriter) finish() *os.File { b.flush() f := b.file - f.Seek(0, 0) //nolint + f.Seek(0, 0) //nolint return f } diff --git a/config/config.go b/config/config.go index c978c009..992977c0 100644 --- a/config/config.go +++ b/config/config.go @@ -36,6 +36,7 @@ type Repo struct { EnablePollUpdates *bool `json:"enable-poll-updates"` EnablePushUpdates *bool `json:"enable-push-updates"` AutoGeneratedFiles []string `json:"auto-generated-files"` + FallbackEncoding string `json:"fallback-encoding"` } // Used for interpreting the config value for fields that use *bool. 
If a value diff --git a/go.mod b/go.mod index 3d5151e6..67917f67 100644 --- a/go.mod +++ b/go.mod @@ -4,5 +4,6 @@ go 1.16 require ( github.com/blang/semver/v4 v4.0.0 - golang.org/x/mod v0.10.0 + golang.org/x/mod v0.14.0 + golang.org/x/text v0.14.0 ) diff --git a/go.sum b/go.sum index ab72a812..9b82c458 100644 --- a/go.sum +++ b/go.sum @@ -3,25 +3,46 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= -golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= +golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/sync 
v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/index/index.go b/index/index.go index f84d933b..cf600d1d 100644 --- a/index/index.go +++ b/index/index.go @@ -13,6 +13,8 @@ import ( "github.com/hound-search/hound/codesearch/index" "github.com/hound-search/hound/codesearch/regexp" + "golang.org/x/text/encoding" + "golang.org/x/text/transform" ) const ( @@ -37,6 +39,7 @@ type IndexOptions struct { ExcludeDotFiles bool SpecialFiles []string AutoGeneratedFiles []string + FallbackEnc encoding.Encoding } type SearchOptions struct { @@ -256,7 +259,7 @@ func (n *Index) Search(pat string, opt *SearchOptions) (*SearchResponse, error) }, nil } -func isTextFile(filename string) (bool, error) { +func isTextFile(filename string) (isText bool, err error) { buf := make([]byte, filePeekSize) r, err := os.Open(filename) if err != nil { @@ -271,14 +274,14 @@ func isTextFile(filename string) (bool, error) { buf = buf[:n] - if n < filePeekSize { - // read the whole file, must be valid. - return utf8.Valid(buf), nil + if n < filePeekSize && utf8.Valid(buf) || // read the whole file, must be valid. + n >= filePeekSize && validUTF8IgnoringPartialTrailingRune(buf) { // read a prefix, allow trailing partial runes. 
+ return true, nil } - - // read a prefix, allow trailing partial runes. - return validUTF8IgnoringPartialTrailingRune(buf), nil - + if isBinary(buf) { + return false, nil + } + return true, nil } // Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed @@ -307,17 +310,26 @@ func validUTF8IgnoringPartialTrailingRune(p []byte) bool { return true } -func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) { +func isBinary(p []byte) bool { + for _, c := range p { + if c < 10 { + return true + } + } + return false +} + +func addFileToIndex(ix *index.IndexWriter, dst, src, path string, fallbackEnc encoding.Encoding) (string, error) { rel, err := filepath.Rel(src, path) if err != nil { return "", err } - r, err := os.Open(path) + fh, err := os.Open(path) if err != nil { return "", err } - defer r.Close() + defer fh.Close() dup := filepath.Join(dst, "raw", rel) w, err := os.Create(dup) @@ -325,10 +337,32 @@ func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error return "", err } defer w.Close() - g := gzip.NewWriter(w) defer g.Close() + r := io.Reader(fh) + + // Without fallback encoding, assume UTF-8. + maybeValidated := r + if fallbackEnc != nil { + maybeValidated = transform.NewReader(r, encoding.UTF8Validator) + } + skipReason := ix.Add(rel, io.TeeReader(maybeValidated, g)) + if fallbackEnc == nil || skipReason == "" || skipReason != "Invalid UTF-8" { + return skipReason, nil + } + // Reset, then try the fallback encoding. 
+ if _, err = fh.Seek(0, 0); err != nil { + return skipReason, err + } + if _, err = w.Seek(0, 0); err != nil { + return skipReason, err + } + if err = w.Truncate(0); err != nil { + return skipReason, err + } + g.Reset(w) + r = fallbackEnc.NewDecoder().Reader(r) return ix.Add(rel, io.TeeReader(r, g)), nil } @@ -426,12 +460,12 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error { return nil } - txt, err := isTextFile(path) + isText, err := isTextFile(path) if err != nil { return err } - if !txt { + if !isText { excluded = append(excluded, &ExcludedFile{ rel, reasonNotText, @@ -439,7 +473,7 @@ func indexAllFiles(opt *IndexOptions, dst, src string) error { return nil } - reasonForExclusion, err := addFileToIndex(ix, dst, src, path) + reasonForExclusion, err := addFileToIndex(ix, dst, src, path, opt.FallbackEnc) if err != nil { return err } diff --git a/index/index_test.go b/index/index_test.go index a8de1c34..edf26492 100644 --- a/index/index_test.go +++ b/index/index_test.go @@ -8,6 +8,9 @@ import ( "path/filepath" "runtime" "testing" + + "github.com/hound-search/hound/codesearch/index" + "golang.org/x/text/encoding/charmap" ) const ( @@ -37,7 +40,7 @@ func TestSearch(t *testing.T) { if err != nil { t.Fatal(err) } - defer ref.Remove() //nolint + defer ref.Remove() //nolint // Make sure the metadata in the ref is good. 
if ref.Rev != rev { @@ -116,7 +119,7 @@ func TestRead(t *testing.T) { if err != nil { t.Fatal(err) } - defer ref.Remove() //nolint + defer ref.Remove() //nolint r, err := Read(ref.Dir()) if err != nil { @@ -137,3 +140,33 @@ func TestRead(t *testing.T) { } defer idx.Close() } + +func TestFallbackEnc(t *testing.T) { + dst, err := ioutil.TempDir(os.TempDir(), "hound") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(dst) + os.MkdirAll(filepath.Join(dst, "raw"), 0701) + + ix := index.Create(filepath.Join(dst, "tri")) + defer ix.Close() + + // { for i in $(seq 0 $(( 2048 / 43 ))); do echo '2048 byte of ASCII to fill the peek buffer'; done; echo ''; echo 'árvíztűrő tükörfúrógép' |iconv -f UTF8 -t ISO8859-2; } > testdata/iso8859_2.txt')) + const src = "testdata" + const path = "iso8859_2.txt" + skipReason, err := addFileToIndex(ix, dst, src, filepath.Join(src, path), nil) + if err != nil { + t.Fatal(err) + } + if skipReason == "" { + t.Error("wanted skip, got success without fallback encoding") + } + skipReason, err = addFileToIndex(ix, dst, src, filepath.Join(src, path), charmap.ISO8859_2) + if err != nil { + t.Fatal(err) + } + if skipReason != "" { + t.Errorf("wanted success, got skip %q", skipReason) + } +} diff --git a/index/testdata/iso8859_2.txt b/index/testdata/iso8859_2.txt new file mode 100644 index 00000000..c2a33e82 --- /dev/null +++ b/index/testdata/iso8859_2.txt @@ -0,0 +1,50 @@ +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 
byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer +2048 byte of ASCII to fill the peek buffer + +árvíztûrõ tükörfúrógép diff --git a/searcher/searcher.go b/searcher/searcher.go index 791ce810..ec17c8a4 100644 --- a/searcher/searcher.go +++ b/searcher/searcher.go @@ -16,6 +16,7 @@ import ( "github.com/hound-search/hound/config" "github.com/hound-search/hound/index" "github.com/hound-search/hound/vcs" + "golang.org/x/text/encoding/htmlindex" ) type Searcher struct { @@ -264,7 +265,7 @@ func reportOnMemory() { // Utility function for producing a hex 
encoded sha1 hash for a string. func hashFor(name string) string { h := sha1.New() - h.Write([]byte(name)) //nolint + h.Write([]byte(name)) //nolint return hex.EncodeToString(h.Sum(nil)) } @@ -407,6 +408,15 @@ func newSearcher( return nil, err } + opt := &index.IndexOptions{ + ExcludeDotFiles: repo.ExcludeDotFiles, + SpecialFiles: wd.SpecialFiles(), + } + if repo.FallbackEncoding != "" { + if opt.FallbackEnc, err = htmlindex.Get(repo.FallbackEncoding); err != nil { + return nil, fmt.Errorf("%s.fallback-encoding=%q: %w", name, repo.FallbackEncoding, err) + } + } rev, err := wd.PullOrClone(vcsDir, repo.Url) if err != nil { @@ -420,11 +430,7 @@ func newSearcher( autoFiles = wd.AutoGeneratedFiles(vcsDir) } - opt := &index.IndexOptions{ - ExcludeDotFiles: repo.ExcludeDotFiles, - SpecialFiles: wd.SpecialFiles(), - AutoGeneratedFiles: autoFiles, - } + opt.AutoGeneratedFiles = autoFiles var idxDir string ref := refs.find(repo.Url, rev)