-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfile.go
161 lines (134 loc) · 3.93 KB
/
file.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
package main
import (
"bytes"
"fmt"
"github.com/boyter/gocodewalker"
"github.com/mfonda/simhash"
"os"
"strings"
)
// readFileContent returns the contents of the file at f.Location, capped at
// maxReadSizeBytes bytes.
//
// fi must be the (non-nil) file info for f; its Size decides whether the file
// is read whole or only its leading maxReadSizeBytes bytes are read. The err
// parameter is retained for signature compatibility with existing callers and
// its incoming value is not inspected.
//
// nil is returned when the file cannot be opened or read, so callers cannot
// distinguish a failed read from an empty file.
func readFileContent(fi os.FileInfo, err error, f *gocodewalker.File) []byte {
	// Only read up to maxReadSizeBytes of a file because anything beyond that
	// is probably pointless.
	if fi.Size() < maxReadSizeBytes {
		content, readErr := os.ReadFile(f.Location)
		if readErr != nil {
			// os.ReadFile may hand back partial data alongside an error;
			// treat that the same way as a failed read of a large file.
			return nil
		}
		return content
	}

	file, openErr := os.Open(f.Location)
	if openErr != nil {
		return nil
	}
	defer file.Close()

	buf := make([]byte, maxReadSizeBytes)
	n, readErr := file.Read(buf)
	if readErr != nil {
		return nil
	}

	// Read may fill less than the whole buffer; return only what was actually
	// read so zero padding is not mistaken for file content.
	return buf[:n]
}
// selectFiles walks dirFilePaths[0] and returns the candidate files for
// duplicate detection, grouped by file extension.
//
// For every file delivered by the walker it:
//   - skips files that cannot be stat'ed, symlinks, files for which
//     readFileContent returns no content, files with a NUL byte in their
//     first 10,000 bytes, and files whose average line length exceeds
//     minifiedLineByteLength;
//   - computes a simhash for each line of the remaining files and registers
//     the hashes of lines longer than 3 bytes (after cleaning) through
//     addSimhashToFileExtDatabase and addSimhashToFileExtDatabase2.
//
// Before returning, every entry of hashToFiles is passed through
// removeStringDuplicates. Skip reasons are printed only when verbose is set.
func selectFiles() map[string][]duplicateFile {
	// The walker pushes the files it finds onto this queue from its own
	// goroutine; the loop below consumes them until the channel is closed.
	fileListQueue := make(chan *gocodewalker.File, 100)

	fileWalker := gocodewalker.NewFileWalker(dirFilePaths[0], fileListQueue)
	fileWalker.AllowListExtensions = allowListExtensions
	fileWalker.IgnoreIgnoreFile = ignoreIgnoreFile
	fileWalker.IgnoreGitIgnore = ignoreGitIgnore
	fileWalker.LocationExcludePattern = locationExcludePattern

	go fileWalker.Start()

	extensionFileMap := map[string][]duplicateFile{}

	for f := range fileListQueue {
		// Lstat (not Stat) so that symlinks are reported as symlinks rather
		// than being followed.
		fi, err := os.Lstat(f.Location)
		if err != nil {
			if verbose {
				fmt.Printf("error %s\n", err.Error())
			}
			continue
		}

		if fi.Mode()&os.ModeSymlink == os.ModeSymlink {
			if verbose {
				fmt.Printf("skipping symlink file: %s\n", f.Location)
			}
			continue
		}

		content := readFileContent(fi, err, f)

		// if there is nothing in the file lets not bother with anything
		if len(content) == 0 {
			if verbose {
				fmt.Printf("empty file so moving on %s\n", f.Location)
			}
			continue
		}

		// Check if this file is binary by checking for nul byte and if so bail out
		// this is how GNU Grep, git and ripgrep check for binary files
		binaryCheck := content
		if len(binaryCheck) > 10_000 {
			binaryCheck = content[:10_000]
		}
		if bytes.IndexByte(binaryCheck, 0) != -1 {
			if verbose {
				fmt.Printf("file determined to be binary so moving on %s\n", f.Location)
			}
			continue
		}

		// Check if the file is minified and if so ignore it.
		// Splitting on "\n" yields newlineCount+1 lines whose combined length
		// is len(content)-newlineCount, so the average can be computed
		// without materialising the split.
		newlineCount := bytes.Count(content, []byte("\n"))
		averageLineLength := (len(content) - newlineCount) / (newlineCount + 1)
		if averageLineLength > minifiedLineByteLength {
			if verbose {
				fmt.Printf("file determined to be minified so moving on %s\n", f.Location)
			}
			continue
		}

		// at this point we have a candidate file to work with :)
		// what we want to do now is crunch down the candidate lines to hashes which we can then compare
		ext := gocodewalker.GetExtension(f.Filename)
		lines := strings.Split(string(content), "\n")

		lineHashes := make([]uint64, 0, len(lines))
		for _, line := range lines {
			clean := strings.ToLower(spaceMap(line))
			hash := simhash.Simhash(simhash.NewWordFeatureSet([]byte(clean)))
			lineHashes = append(lineHashes, hash)

			// Every line keeps its hash in lineHashes, but only lines longer
			// than 3 bytes after cleaning are registered in the lookup
			// databases.
			if len(clean) > 3 {
				addSimhashToFileExtDatabase(hash, ext, f.Location)
				addSimhashToFileExtDatabase2(hash, ext, f.Location)
			}
		}

		// append on a missing key starts from a nil slice, so no existence
		// check is needed.
		extensionFileMap[ext] = append(extensionFileMap[ext], duplicateFile{
			Filename:   f.Filename,
			Location:   f.Location,
			Extension:  ext,
			LineHashes: lineHashes,
		})
	}

	// The same location may have been recorded several times under one hash;
	// collapse each list to its unique entries.
	for k := range hashToFiles {
		hashToFiles[k] = removeStringDuplicates(hashToFiles[k])
	}

	return extensionFileMap
}