diff --git a/src/igir.ts b/src/igir.ts index 3b9b4be49..d7610915e 100644 --- a/src/igir.ts +++ b/src/igir.ts @@ -35,6 +35,7 @@ import DAT from './types/dats/dat.js'; import Parent from './types/dats/parent.js'; import DATStatus from './types/datStatus.js'; import File from './types/files/file.js'; +import IndexedFiles from './types/indexedFiles.js'; import Options from './types/options.js'; import OutputFactory from './types/outputFactory.js'; import Patch from './types/patches/patch.js'; @@ -73,9 +74,7 @@ export default class Igir { // Scan and process input files let dats = await this.processDATScanner(); const indexedRoms = await this.processROMScanner(); - const roms = [...indexedRoms.values()] - .flat() - .reduce(ArrayPoly.reduceUnique(), []); + const roms = indexedRoms.getFiles(); const patches = await this.processPatchScanner(); // Set up progress bar and input for DAT processing @@ -218,7 +217,7 @@ export default class Igir { return dats; } - private async processROMScanner(): Promise> { + private async processROMScanner(): Promise { const romScannerProgressBarName = 'Scanning for ROMs'; const romProgressBar = await this.logger.addProgressBar(romScannerProgressBarName); @@ -254,7 +253,7 @@ export default class Igir { private async generateCandidates( progressBar: ProgressBar, dat: DAT, - indexedRoms: Map, + indexedRoms: IndexedFiles, patches: Patch[], ): Promise> { const candidates = await new CandidateGenerator(this.options, progressBar) diff --git a/src/modules/candidateGenerator.ts b/src/modules/candidateGenerator.ts index 8211fa280..86f765e1d 100644 --- a/src/modules/candidateGenerator.ts +++ b/src/modules/candidateGenerator.ts @@ -12,6 +12,7 @@ import Archive from '../types/files/archives/archive.js'; import ArchiveEntry from '../types/files/archives/archiveEntry.js'; import Zip from '../types/files/archives/zip.js'; import File from '../types/files/file.js'; +import IndexedFiles from '../types/indexedFiles.js'; import Options from 
'../types/options.js'; import OutputFactory from '../types/outputFactory.js'; import ReleaseCandidate from '../types/releaseCandidate.js'; @@ -37,9 +38,9 @@ export default class CandidateGenerator extends Module { */ async generate( dat: DAT, - hashCodeToInputFiles: Map, + indexedFiles: IndexedFiles, ): Promise> { - if (hashCodeToInputFiles.size === 0) { + if (indexedFiles.getFiles().length === 0) { this.progressBar.logTrace(`${dat.getNameShort()}: no input ROMs to make candidates from`); return new Map(); } @@ -71,7 +72,7 @@ export default class CandidateGenerator extends Module { dat, game, release, - hashCodeToInputFiles, + indexedFiles, ); if (releaseCandidate) { releaseCandidates.push(releaseCandidate); @@ -104,9 +105,9 @@ export default class CandidateGenerator extends Module { dat: DAT, game: Game, release: Release | undefined, - hashCodeToInputFiles: Map, + indexedFiles: IndexedFiles, ): Promise { - const romsToInputFiles = CandidateGenerator.getInputFilesForGame(game, hashCodeToInputFiles); + const romsToInputFiles = this.getInputFilesForGame(game, indexedFiles); // For each Game's ROM, find the matching File const romFiles = await Promise.all( @@ -136,9 +137,12 @@ export default class CandidateGenerator extends Module { ) { // No automatic header removal will be performed when raw-copying an archive, so return no // match if we wanted a headerless ROM but got a headered one. - if (rom.hashCode() !== originalInputFile.hashCodeWithHeader() - && rom.hashCode() === originalInputFile.hashCodeWithoutHeader() + if (originalInputFile.getFileHeader() + && !(rom.getCrc32() === originalInputFile.getCrc32() + || rom.getMd5() === originalInputFile.getMd5() + || rom.getSha1() === originalInputFile.getSha1()) ) { + // TODO(cemmer): is this right? 
return [rom, undefined]; } @@ -189,13 +193,13 @@ export default class CandidateGenerator extends Module { return new ReleaseCandidate(game, release, foundRomsWithFiles); } - private static getInputFilesForGame( + private getInputFilesForGame( game: Game, - hashCodeToInputFiles: Map, + indexedFiles: IndexedFiles, ): Map { let romsAndInputFiles = game.getRoms().map((rom) => ([ rom, - (hashCodeToInputFiles.get(rom.hashCode()) ?? []), + indexedFiles.findFiles(rom) ?? [], ])) satisfies [ROM, File[]][]; // Detect if there is one input archive that contains every ROM, and prefer to use its entries. diff --git a/src/modules/fileIndexer.ts b/src/modules/fileIndexer.ts index 9fdb59929..192747193 100644 --- a/src/modules/fileIndexer.ts +++ b/src/modules/fileIndexer.ts @@ -8,6 +8,7 @@ import SevenZip from '../types/files/archives/sevenZip.js'; import Tar from '../types/files/archives/tar.js'; import Zip from '../types/files/archives/zip.js'; import File from '../types/files/file.js'; +import IndexedFiles, { AllChecksums, ChecksumsToFiles } from '../types/indexedFiles.js'; import Options from '../types/options.js'; import Module from './module.js'; @@ -26,83 +27,60 @@ export default class FileIndexer extends Module { /** * Index files. */ - async index(files: File[]): Promise> { - if (files.length === 0) { - return new Map(); - } - + async index(files: File[]): Promise { this.progressBar.logTrace(`indexing ${files.length.toLocaleString()} file${files.length !== 1 ? 
's' : ''}`); await this.progressBar.setSymbol(ProgressBarSymbol.INDEXING); // await this.progressBar.reset(files.length); - const results = new Map(); + // Index the files + const result = IndexedFiles.fromFiles(files); + // Then apply some sorting preferences + Object.keys(result).forEach((checksum) => this.sortMap(result[checksum as keyof AllChecksums])); - // TODO(cemmer): ability to index files by some other property such as name - files.forEach((file) => { - // Index on full file contents - FileIndexer.setFileInMap(results, file.hashCodeWithHeader(), file); + this.progressBar.logTrace(`found ${result.getSize()} unique file${result.getSize() !== 1 ? 's' : ''}`); - // Optionally index without a header - if (file.getFileHeader()) { - FileIndexer.setFileInMap(results, file.hashCodeWithoutHeader(), file); - } - }); + this.progressBar.logTrace('done indexing files'); + return result; + } + private sortMap(checksumsToFiles: ChecksumsToFiles): void { const outputDir = path.resolve(this.options.getOutputDirRoot()); const outputDirDisk = FsPoly.disksSync().find((mount) => outputDir.startsWith(mount)); - // Sort the file arrays - [...results.entries()] - .forEach(([hashCode, filesForHash]) => filesForHash.sort((fileOne, fileTwo) => { - // First, prefer "raw" files (files with their header) - const fileOneHeadered = fileOne.getFileHeader() - && fileOne.hashCodeWithoutHeader() === hashCode ? 1 : 0; - const fileTwoHeadered = fileTwo.getFileHeader() - && fileTwo.hashCodeWithoutHeader() === hashCode ? 
1 : 0; - if (fileOneHeadered !== fileTwoHeadered) { - return fileOneHeadered - fileTwoHeadered; - } - - // Then, prefer un-archived files - const fileOneArchived = FileIndexer.archiveEntryPriority(fileOne); - const fileTwoArchived = FileIndexer.archiveEntryPriority(fileTwo); - if (fileOneArchived !== fileTwoArchived) { - return fileOneArchived - fileTwoArchived; - } - - // Then, prefer files that are NOT already in the output directory - // This is in case the output file is invalid and we're trying to overwrite it with - // something else. Otherwise, we'll just attempt to overwrite the invalid output file with - // itself, still resulting in an invalid output file. - const fileOneInOutput = path.resolve(fileOne.getFilePath()).startsWith(outputDir) ? 1 : 0; - const fileTwoInOutput = path.resolve(fileTwo.getFilePath()).startsWith(outputDir) ? 1 : 0; - if (fileOneInOutput !== fileTwoInOutput) { - return fileOneInOutput - fileTwoInOutput; - } - - // Then, prefer files that are on the same disk for fs efficiency see {@link FsPoly#mv} - if (outputDirDisk) { - const fileOneInOutputDisk = path.resolve(fileOne.getFilePath()) - .startsWith(outputDirDisk) ? 0 : 1; - const fileTwoInOutputDisk = path.resolve(fileTwo.getFilePath()) - .startsWith(outputDirDisk) ? 0 : 1; - if (fileOneInOutputDisk !== fileTwoInOutputDisk) { - return fileOneInOutputDisk - fileTwoInOutputDisk; + [...checksumsToFiles.values()] + .forEach((files) => files + .sort((fileOne, fileTwo) => { + // Prefer un-archived files + const fileOneArchived = FileIndexer.archiveEntryPriority(fileOne); + const fileTwoArchived = FileIndexer.archiveEntryPriority(fileTwo); + if (fileOneArchived !== fileTwoArchived) { + return fileOneArchived - fileTwoArchived; } - } - - // Otherwise, be deterministic - return fileOne.getFilePath().localeCompare(fileTwo.getFilePath()); - })); - this.progressBar.logTrace(`found ${results.size} unique file${results.size !== 1 ? 
's' : ''}`); + // Then, prefer files that are NOT already in the output directory + // This is in case the output file is invalid and we're trying to overwrite it with + // something else. Otherwise, we'll just attempt to overwrite the invalid output file with + // itself, still resulting in an invalid output file. + const fileOneInOutput = path.resolve(fileOne.getFilePath()).startsWith(outputDir) ? 1 : 0; + const fileTwoInOutput = path.resolve(fileTwo.getFilePath()).startsWith(outputDir) ? 1 : 0; + if (fileOneInOutput !== fileTwoInOutput) { + return fileOneInOutput - fileTwoInOutput; + } - this.progressBar.logTrace('done indexing files'); - return results; - } + // Then, prefer files that are on the same disk for fs efficiency see {@link FsPoly#mv} + if (outputDirDisk) { + const fileOneInOutputDisk = path.resolve(fileOne.getFilePath()) + .startsWith(outputDirDisk) ? 0 : 1; + const fileTwoInOutputDisk = path.resolve(fileTwo.getFilePath()) + .startsWith(outputDirDisk) ? 0 : 1; + if (fileOneInOutputDisk !== fileTwoInOutputDisk) { + return fileOneInOutputDisk - fileTwoInOutputDisk; + } + } - private static setFileInMap(map: Map, key: K, file: File): void { - map.set(key, [...(map.get(key) ?? []), file]); + // Otherwise, be deterministic + return fileOne.getFilePath().localeCompare(fileTwo.getFilePath()); + })); } /** diff --git a/src/modules/movedRomDeleter.ts b/src/modules/movedRomDeleter.ts index 6aedbdf2c..72b61e0fe 100644 --- a/src/modules/movedRomDeleter.ts +++ b/src/modules/movedRomDeleter.ts @@ -78,7 +78,7 @@ export default class MovedROMDeleter extends Module { // the unique set of ArchiveEntry hash codes to know if every ArchiveEntry was "consumed" // during writing. const movedEntryHashCodes = new Set( - movedEntries.flatMap((file) => file.hashCodes()), + movedEntries.flatMap((file) => file.hashCode()), ); const inputEntries = groupedInputRoms.get(filePath) ?? 
[]; @@ -94,7 +94,7 @@ export default class MovedROMDeleter extends Module { } // Otherwise, the entry needs to have been explicitly moved - return entry.hashCodes().some((hashCode) => !movedEntryHashCodes.has(hashCode)); + return !movedEntryHashCodes.has(entry.hashCode()); }); if (unmovedEntries.length > 0) { this.progressBar.logWarn(`${filePath}: not deleting moved file, ${unmovedEntries.length.toLocaleString()} archive entr${unmovedEntries.length !== 1 ? 'ies were' : 'y was'} unmatched:\n${unmovedEntries.sort().map((entry) => ` ${entry}`).join('\n')}`); diff --git a/src/modules/scanner.ts b/src/modules/scanner.ts index b198d1956..14331f728 100644 --- a/src/modules/scanner.ts +++ b/src/modules/scanner.ts @@ -54,7 +54,7 @@ export default abstract class Scanner extends Module { ): Promise { const foundFiles = await this.getFilesFromPaths(filePaths, threads, checksumBitmask); return foundFiles - .filter(ArrayPoly.filterUniqueMapped((file) => file.hashCodes().join(','))); + .filter(ArrayPoly.filterUniqueMapped((file) => file.hashCode())); } private async getFilesFromPath( diff --git a/src/types/dats/rom.ts b/src/types/dats/rom.ts index 3d5dab474..cc57266a3 100644 --- a/src/types/dats/rom.ts +++ b/src/types/dats/rom.ts @@ -95,14 +95,6 @@ export default class ROM implements ROMProps { return this.sha1?.toLowerCase().replace(/^0x/, '').padStart(40, '0'); } - getChecksumProps(): ChecksumProps { - return { - crc32: this.getCrc32(), - md5: this.getMd5(), - sha1: this.getSha1(), - }; - } - getStatus(): ROMStatus | undefined { return this.status; } @@ -119,14 +111,14 @@ export default class ROM implements ROMProps { * Turn this {@link ROM} into a non-existent {@link File}. */ async toFile(): Promise { - return File.fileOf(this.getName(), this.getSize(), this.getChecksumProps()); + return File.fileOf(this.getName(), this.getSize(), this); } /** * Turn this {@link ROM} into a non-existent {@link ArchiveEntry}, given a {@link Archive}. 
*/ async toArchiveEntry(archive: A): Promise> { - return ArchiveEntry.entryOf(archive, this.getName(), this.getSize(), this.getChecksumProps()); + return ArchiveEntry.entryOf(archive, this.getName(), this.getSize(), this); } /** @@ -141,6 +133,8 @@ export default class ROM implements ROMProps { * A string hash code to uniquely identify this {@link ROM}. */ hashCode(): string { - return File.hashCode(this.getCrc32(), this.getSize()); + return this.getSha1() + ?? this.getMd5() + ?? `${this.getCrc32()}|${this.getSize()}`; } } diff --git a/src/types/files/file.ts b/src/types/files/file.ts index 67f6f9d87..49ba20721 100644 --- a/src/types/files/file.ts +++ b/src/types/files/file.ts @@ -437,6 +437,7 @@ export default class File implements FileProps { **************************** */ + // TODO(cemmer): refactor usages of this that should use hashCode() or something else toString(): string { // TODO(cemmer): indicate if there's a patch? if (this.getSymlinkSource()) { @@ -445,23 +446,13 @@ export default class File implements FileProps { return this.getFilePath(); } - static hashCode(crc: string, size: number): string { - return `${crc}|${size}`; - } - - hashCodeWithHeader(): string { - return File.hashCode(this.getCrc32(), this.getSize()); - } - - hashCodeWithoutHeader(): string { - return File.hashCode(this.getCrc32WithoutHeader(), this.getSizeWithoutHeader()); - } - - hashCodes(): string[] { - return [ - this.hashCodeWithHeader(), - this.hashCodeWithoutHeader(), - ].reduce(ArrayPoly.reduceUnique(), []); + /** + * A string hash code to uniquely identify this {@link File}. + */ + hashCode(): string { + return this.getSha1() + ?? this.getMd5() + ?? 
`${this.getCrc32()}|${this.getSize()}`; } equals(other: File): boolean { diff --git a/src/types/indexedFiles.ts b/src/types/indexedFiles.ts new file mode 100644 index 000000000..be7a6a8f9 --- /dev/null +++ b/src/types/indexedFiles.ts @@ -0,0 +1,141 @@ +import { Memoize } from 'typescript-memoize'; + +import ArrayPoly from '../polyfill/arrayPoly.js'; +import File from '../types/files/file.js'; +import ROM from './dats/rom.js'; + +export type ChecksumsToFiles = Map; + +export interface AllChecksums { + crc32: ChecksumsToFiles, + md5: ChecksumsToFiles, + sha1: ChecksumsToFiles, +} + +/** + * TODO + */ +export default class IndexedFiles { + private readonly crc32: ChecksumsToFiles; + + private readonly md5: ChecksumsToFiles; + + private readonly sha1: ChecksumsToFiles; + + private constructor(crc32: ChecksumsToFiles, md5: ChecksumsToFiles, sha1: ChecksumsToFiles) { + this.crc32 = crc32; + this.md5 = md5; + this.sha1 = sha1; + } + + /** + * TODO + */ + static fromFiles(files: File[]): IndexedFiles { + const crc32RawMap = new Map(); + const crc32WithoutHeaderMap = new Map(); + const md5RawMap = new Map(); + const md5WithoutHeaderMap = new Map(); + const sha1RawMap = new Map(); + const sha1WithoutHeaderMap = new Map(); + + // Build the maps + files.forEach((file) => { + const crc32WithSize = `${file.getCrc32()}|${file.getSize()}`; + crc32RawMap.set(crc32WithSize, [file, ...(crc32RawMap.get(crc32WithSize) ?? [])]); + + const md5 = file.getMd5(); + if (md5) { + md5RawMap.set(md5, [file, ...(md5RawMap.get(md5) ?? [])]); + } + + const sha1 = file.getSha1(); + if (sha1) { + sha1RawMap.set(sha1, [file, ...(sha1RawMap.get(sha1) ?? [])]); + } + + if (file.getFileHeader()) { + const crc32WithoutHeader = file.getCrc32WithoutHeader(); + crc32WithoutHeaderMap.set( + crc32WithoutHeader, + [...(crc32WithoutHeaderMap.get(crc32WithoutHeader) ??
[]), file], + ); + + const md5WithoutHeader = file.getMd5WithoutHeader(); + if (md5WithoutHeader) { + md5WithoutHeaderMap.set( + md5WithoutHeader, + [...(md5WithoutHeaderMap.get(md5WithoutHeader) ?? []), file], + ); + } + + const sha1WithoutHeader = file.getSha1WithoutHeader(); + if (sha1WithoutHeader) { + sha1WithoutHeaderMap.set( + sha1WithoutHeader, + [...(sha1WithoutHeaderMap.get(sha1WithoutHeader) ?? []), file], + ); + } + } + }); + + const crc32Map = this.combineMaps(crc32RawMap, crc32WithoutHeaderMap); + const md5Map = this.combineMaps(md5RawMap, md5WithoutHeaderMap); + const sha1Map = this.combineMaps(sha1RawMap, sha1WithoutHeaderMap); + return new IndexedFiles(crc32Map, md5Map, sha1Map); + } + + private static combineMaps( + withHeaders: ChecksumsToFiles, + withoutHeaders: ChecksumsToFiles, + ): ChecksumsToFiles { + const result = new Map(withHeaders); + [...withoutHeaders.entries()] + // Prefer "raw" files as they exist on disk, without any header manipulation + .filter(([checksum]) => !result.has(checksum)) + .forEach(([checksum, files]) => { + result.set(checksum, files); + }); + return result; + } + + @Memoize() + getFiles(): File[] { + return [ + ...this.crc32.values(), + ...this.md5.values(), + ...this.sha1.values(), + ] + .flat() + .filter(ArrayPoly.filterUniqueMapped((file) => file.toString())); + } + + getSize(): number { + return this.getFiles().length; + } + + /** + * TODO + */ + findFiles(file: File | ROM): File[] | undefined { + const sha1 = file.getSha1(); + if (sha1 && this.sha1.has(sha1)) { + return this.sha1.get(sha1); + } + + const md5 = file.getMd5(); + if (md5 && this.md5.has(md5)) { + return this.md5.get(md5); + } + + const crc32 = file.getCrc32(); + if (crc32) { + const crc32WithSize = `${crc32}|${file.getSize()}`; + if (this.crc32.has(crc32WithSize)) { + return this.crc32.get(crc32WithSize); + } + } + + return undefined; + } +}