Skip to content

Commit

Permalink
Caching file
Browse files Browse the repository at this point in the history
An imgcache.json file is used to store intermediate image data, including hashes. This saves a significant amount of time for subsequent runs of the program that would have had to read every single image into memory again to compute their hashes.

Also, the test script was changed to read the target directory from process.argv.
  • Loading branch information
Math committed Jun 10, 2018
1 parent 26b7ad4 commit 0c9b14a
Show file tree
Hide file tree
Showing 7 changed files with 460 additions and 68 deletions.
145 changes: 88 additions & 57 deletions dupe-image-checker.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
const Jimp = require('jimp');
const readAll = require('./utils/read-all');
const Array = require('./utils/Array');
const Format = require('./utils/formatting');
const File = require('./utils/File');
const Logger = require('./utils/Logger');
const Jimp = require('jimp');
const readAll = require('./utils/read-all');
const hex = require('./utils/hex');
const Array = require('./utils/Array');
const Format = require('./utils/formatting');
const File = require('./utils/File');
const FileCache = require('./utils/FileCache');
const Logger = require('./utils/Logger');

var logger = new Logger('Dupe Checker');

// Convert a binary bit string (e.g. Jimp's perceptual hash in base 2)
// to lowercase hex, consuming the input four bits at a time.
// Guarded: `match` returns null for empty or non-binary input, which the
// original turned into a TypeError — that case now yields ''.
// Note: any trailing run shorter than 4 bits is silently dropped
// (harmless for the 64-bit pHash strings used here).
function bin2hex(x) {
  const nibbles = x.match(/[01]{4}/g) || [];
  return nibbles.map(bin => parseInt(bin, 2).toString(16)).join('');
}
// hamming distance function (stolen from pHash)
function distance(h1, h2) {
var sum = 0;
Expand All @@ -20,6 +19,9 @@ function distance(h1, h2) {
}
return sum / h1.length;
}
// Approximate floating-point equality: true when the two values differ
// by less than an absolute tolerance of 1e-4. (Never true for NaN.)
function flcmp(f1, f2) {
  const EPSILON = 0.0001;
  const delta = f1 - f2;
  return delta < EPSILON && delta > -EPSILON;
}

module.exports = function findDuplicates(directory, options = {}) {
options.threshold = options.threshold === undefined ? 0.1 : options.threshold;
Expand All @@ -33,63 +35,65 @@ module.exports = function findDuplicates(directory, options = {}) {
var images = Object.keys(filesObj).map(name => new File(filesObj[name]));
var length = images.length;
var duplicates = [];

function markAsDuplicates(...files) {
for (var dupes of duplicates) {
for (var file of files) {
if (dupes.includes(file)) {
for (var newFile of files.diff(dupes)) {
dupes.push(newFile);
}
return;
}
}
}
duplicates.push(files);
}
function isDuplicate(file) {
for (var dupes of duplicates) {
if (dupes.includes(file)) {
return true;
}
}
return false;
}
function flcmp(f1, f2) {
return Math.abs(f1 - f2) < 0.0001;
}
function sameRatio(i1, i2) {
return flcmp(i1.width/i2.width, i1.height/i2.height);
}

var startTime = Date.now();

logger.log(`Caching data for ${length} images...`);
var imgHashCache = new FileCache(directory, 'imgcache');

logger.indent();

/* Step 1. Cache all the image widths x heights, sizes, and perceptual hashes.
Note: Jimp takes a while to load images... */
return images.forEachAsync((file, i) => {
logger.log(`[${i+1}/${length}]`, file.name);
logger.indent();
return Jimp.read(file.path).then(image => {
var size = file.size();
var hash2 = image.hash(2);
var hash16 = bin2hex(hash2);

file.width = image.bitmap.width;
file.height = image.bitmap.height;
file._size = size;
file._hash = hash2;

logger.cyan('Size: ', file.width, 'x', file.height);
logger.cyan('Bytes:', Format.bytes(size));
logger.cyan('Hash: ', hash16);

delete image;
})
.catch(e => logger.error(e))
.then(() => logger.unindent())
if (imgHashCache.has(file.name)) {
try {
// use the cached data to skip loading the image
Object.assign(file, imgHashCache.get(file.name));

// decompress the hash
var hash16 = file._hash;
file._hash = hex.hex2bin(file._hash);

logger.cyan('Size: ', file.width, 'x', file.height);
logger.cyan('Bytes:', Format.bytes(file._size));
logger.cyan('Hash: ', hash16);
} catch (e) {
logger.error(e);
} finally {
logger.unindent();
return;
}
} else {
return Jimp.read(file.path).then(image => {
var size = file.size();
var hash2 = image.hash(2);
var hash16 = hex.bin2hex(hash2);

file.width = image.bitmap.width;
file.height = image.bitmap.height;
file._size = size;
file._hash = hash2;

// no longer need this
delete image;

logger.cyan('Size: ', file.width, 'x', file.height);
logger.cyan('Bytes:', Format.bytes(file._size));
logger.cyan('Hash: ', hash16);

// cache the file metadata, but with the hash compressed
return imgHashCache.set(file.name, {
width: file.width,
height: file.height,
_size: size,
_hash: hash16
});
})
.catch(e => logger.error(e))
.then(() => logger.unindent())
}
})
/* Step 2. Sort images. This is completely useless to do. */
.then(() => {
Expand Down Expand Up @@ -121,7 +125,7 @@ module.exports = function findDuplicates(directory, options = {}) {
if (!sameRatio(file1, file2)) return;

var dist = distance(file1._hash, file2._hash);
if (dist > 3/64) return;
if (dist > 2/64) return;

logger.log(`[${j+1}/${length}]`, file2.name, `(${Format.bytes(file2._size)})`);
logger.indent();
Expand Down Expand Up @@ -162,6 +166,8 @@ module.exports = function findDuplicates(directory, options = {}) {
var endTime = Date.now();
var timeElapsed = endTime - startTime;

delete imgHashCache;

logger.unindent();
logger.ln();
logger.log(`Finished in ${Format.time(timeElapsed)}.`);
Expand Down Expand Up @@ -191,4 +197,29 @@ module.exports = function findDuplicates(directory, options = {}) {
return duplicates;
});
*/

// Record the given files as duplicates of one another.
// If any of them already belongs to an existing duplicate group, merge
// the not-yet-grouped files into that group and stop; otherwise start a
// new group containing all of them.
// NOTE(review): `files.diff(dupes)` comes from ./utils/Array — presumably
// the set difference (members of `files` not already in `dupes`); confirm
// against that helper.
function markAsDuplicates(...files) {
for (var dupes of duplicates) {
for (var file of files) {
if (dupes.includes(file)) {
// `file` is already grouped: absorb the remaining files here.
for (var newFile of files.diff(dupes)) {
dupes.push(newFile);
}
return;
}
}
}
// No member was previously grouped: these files form a new group.
duplicates.push(files);
}
// True when `file` already belongs to any recorded duplicate group.
function isDuplicate(file) {
  return duplicates.some((group) => group.includes(file));
}
// Whether two images share (approximately) the same aspect ratio: the
// width scale factor between them must match the height scale factor
// within flcmp's tolerance.
function sameRatio(i1, i2) {
  const widthScale = i1.width / i2.width;
  const heightScale = i1.height / i2.height;
  return flcmp(widthScale, heightScale);
}
};
15 changes: 11 additions & 4 deletions dupe-image-remover.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
const File = require('./utils/file');
const FileExplorer = require('./utils/file-explorer');
const Array = require('./utils/Array');
const Format = require('./utils/formatting');
const Array = require('./utils/array');
const File = require('./utils/File');
const FileCache = require('./utils/FileCache');
const FileExplorer = require('./utils/FileExplorer');
const Logger = require('./utils/Logger');
const findDuplicates = require('./dupe-image-checker');

Expand All @@ -28,6 +29,7 @@ module.exports = function removeDuplicates(directory, options = {}) {
}

var startTime = Date.now();
var imgHashCache = new FileCache(directory, 'imgcache');

logger.ln();
logger.log('Determining duplicates to remove...');
Expand Down Expand Up @@ -76,15 +78,20 @@ module.exports = function removeDuplicates(directory, options = {}) {
logger.green('Removed: ', file.name);
removed.push(file);
file.delete();
return imgHashCache.delete(file.name);
})
.then(() => {
logger.green('Retained: ', bestSizeFile.name);
retained.push(bestSizeFile);
if (bestSizeFile.name != bestName && options.rename) {
bestName = bestSizeFile.rename(bestName);
logger.yellow('Renamed As: ', bestName);
renamed++;
var prevName = bestSizeFile.name;
bestName = bestSizeFile.rename(bestName);
return imgHashCache.replace(prevName, bestSizeFile.name);
}
})
.then(() => {
logger.unindent();
});
})
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "dupe-images",
"description": "Node.js package for finding and removing duplicate image files with extreme precision",
"main": "index.js",
"version": "1.01",
"version": "1.02",
"dependencies": {
"jimp": "^0.2.28"
}
Expand Down
19 changes: 13 additions & 6 deletions test.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
function isMD5(x) {
return /^[0-9a-f]{32}/.test(x);
}
require('./dupe-image-remover')('./folder', {
const removeDuplicates = require('./dupe-image-remover');
const directory = process.argv[2] || './folder';

const options = {
recursive: false,
exact: false,
tolerance: 0.005,
rename: true,
namePreference(n1, n2) {
return isMD5(n1) ? n1 : isMD5(n2) ? n2 : n1.length > n2.length ? n1 : n2;
}
}).then(results => {
};

// True when the name begins with a 32-character lowercase-hex run (an
// MD5 digest). Note: deliberately no trailing anchor, so names that
// carry an extension after the digest (e.g. "<hash>.jpg") also match.
function isMD5(x) {
  const md5Prefix = /^[0-9a-f]{32}/;
  return md5Prefix.test(x);
}
// Persist the duplicate-removal results to ./results.json. If the write
// fails, dump the results to the console instead so they are not lost.
function saveResults(results) {
  const fs = require('fs');
  const payload = JSON.stringify(results);
  fs.writeFile('./results.json', payload, (err) => {
    if (err) console.log(results);
    else console.log('Data saved to results.json');
  });
}

removeDuplicates(directory, options).then(saveResults);
38 changes: 38 additions & 0 deletions utils/FileCache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const FilePromise = require('./FilePromise');

// JSON-file-backed key/value cache persisted at <directory>/<name>.json.
// Used to memoize per-image metadata (dimensions, byte size, perceptual
// hash) between runs so images need not be re-read and re-hashed.
class Cache {
	/**
	 * @param {string} directory - folder the cache file lives in
	 * @param {string} name - base name of the cache file (".json" is appended)
	 */
	constructor(directory, name) {
		this.path = FilePromise.join(directory, name + '.json');
		this.data = {};
		// Warm the in-memory table from disk when a cache file already exists.
		if (FilePromise.existsSync(this.path)) {
			this.load();
		}
	}
	// Synchronously (re)read the cache file into memory.
	load() {
		this.data = FilePromise.readSync(this.path);
	}
	// Persist the in-memory table to disk; returns whatever
	// FilePromise.create returns (presumably a Promise — callers chain on it;
	// TODO confirm against ./FilePromise).
	save() {
		return FilePromise.create(this.path, this.data);
	}
	has(filename) {
		return filename in this.data;
	}
	// Store `data` under `filename` and flush to disk.
	set(filename, data) {
		this.data[filename] = data;
		return this.save();
	}
	get(filename) {
		return this.data[filename];
	}
	// Remove an entry and flush to disk.
	delete(filename) {
		delete this.data[filename];
		return this.save();
	}
	// Move an entry to a new key (used when a duplicate file is renamed).
	// Guarded so that (a) replacing a key with itself no longer destroys
	// the entry (the unguarded set-then-delete wiped it), and (b) a missing
	// source key no longer creates a phantom `undefined` entry that would
	// make has() report true.
	replace(oldFilename, newFilename) {
		if (oldFilename !== newFilename && oldFilename in this.data) {
			this.data[newFilename] = this.data[oldFilename];
			delete this.data[oldFilename];
		}
		return this.save();
	}
}

module.exports = Cache;
Loading

0 comments on commit 0c9b14a

Please sign in to comment.