Skip to content

Commit

Permalink
🩹 back: when indexing documents, call unoconv directly to ignore stderr output. And do not try indexing multiple ways.
Browse files Browse the repository at this point in the history
  • Loading branch information
ericlinagora committed May 15, 2024
1 parent 79fc8f2 commit a8de167
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions tdrive/backend/node/src/services/documents/utils.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { spawnCheckingExitCode } from "../../utils/exec";
import archiver from "archiver";
import { merge } from "lodash";
import PdfParse from "pdf-parse";
import { Readable } from "stream";
import unoconv from "unoconv-promise";
import Repository from "../../core/platform/services/database/services/orm/repository/repository";
import {
cleanFiles,
Expand Down Expand Up @@ -387,11 +387,7 @@ export const officeFileToString = async (file: Readable, extension: string): Pro
const outputPath = getTmpFile(".pdf");

try {
await unoconv.run({
file: officeFilePath,
output: outputPath,
});

await spawnCheckingExitCode("unoconv", [`-o${outputPath}`, officeFilePath]);
cleanFiles([officeFilePath]);

return await pdfFileToString(outputPath);
Expand Down Expand Up @@ -633,11 +629,11 @@ export const getKeywordsOfFile = async (
logger.info(`Processing text file: ${filename}`);
content_strings = await readableToString(file);
}
if (isFileType(mime, filename, pdfExtensions)) {
if ((content_strings ?? "").trim().length == 0 && isFileType(mime, filename, pdfExtensions)) {
logger.info(`Processing PDF file: ${filename}`);
content_strings = await pdfFileToString(file);
}
if (isFileType(mime, filename, officeExtensions)) {
if ((content_strings ?? "").trim().length == 0 && isFileType(mime, filename, officeExtensions)) {
logger.info(`Processing office file: ${filename}`);
content_strings = await officeFileToString(file, extension);
}
Expand Down

0 comments on commit a8de167

Please sign in to comment.