Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Acreom (Knowledge Base Tool) local files document loader #7525

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 6 additions & 11 deletions langchain/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export const config = {
/@langchain\/community/,
"axios", // axios is a dependency of openai
"mysql2/promise",
"notion-to-md/build/utils/notion.js"
"notion-to-md/build/utils/notion.js",
],
entrypoints: {
load: "load/index",
Expand Down Expand Up @@ -67,7 +67,7 @@ export const config = {
// text_splitter
text_splitter: "text_splitter",
// memory
"memory": "memory/index",
memory: "memory/index",
"memory/chat_memory": "memory/chat_memory",
// document
document: "document",
Expand Down Expand Up @@ -146,11 +146,7 @@ export const config = {
"schema/query_constructor": "schema/query_constructor",
"schema/prompt_template": "schema/prompt_template",
},
deprecatedOmitFromImportMap: [
"document",
"load/serializable",
"runnables",
],
deprecatedOmitFromImportMap: ["document", "load/serializable", "runnables"],
requiresOptionalDependency: [
"agents/load",
"agents/toolkits/sql",
Expand Down Expand Up @@ -201,6 +197,7 @@ export const config = {
"document_loaders/fs/csv",
"document_loaders/fs/notion",
"document_loaders/fs/obsidian",
"document_loaders/fs/acreom",
"document_loaders/fs/unstructured",
"document_loaders/fs/openai_whisper_audio",
"document_loaders/fs/pptx",
Expand Down Expand Up @@ -293,9 +290,7 @@ export const config = {
path: "@langchain/core/prompts",
},
{
modules: [
"ImagePromptTemplate",
],
modules: ["ImagePromptTemplate"],
alias: ["prompts", "image"],
path: "@langchain/core/prompts",
},
Expand Down Expand Up @@ -337,7 +332,7 @@ export const config = {
modules: ["ChatGenerationChunk", "GenerationChunk"],
alias: ["schema", "output"],
path: "@langchain/core/outputs",
}
},
],
shouldTestExports: true,
tsConfigPath: resolve("./tsconfig.json"),
Expand Down
4 changes: 3 additions & 1 deletion libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ export const config = {
"embeddings/gradient_ai": "embeddings/gradient_ai",
"embeddings/hf": "embeddings/hf",
"embeddings/hf_transformers": "embeddings/hf_transformers",
"embeddings/huggingface_transformers": "embeddings/huggingface_transformers",
"embeddings/huggingface_transformers":
"embeddings/huggingface_transformers",
"embeddings/ibm": "embeddings/ibm",
"embeddings/jina": "embeddings/jina",
"embeddings/llama_cpp": "embeddings/llama_cpp",
Expand Down Expand Up @@ -317,6 +318,7 @@ export const config = {
"document_loaders/fs/csv": "document_loaders/fs/csv",
"document_loaders/fs/notion": "document_loaders/fs/notion",
"document_loaders/fs/obsidian": "document_loaders/fs/obsidian",
"document_loaders/fs/acreom": "document_loaders/fs/acreom",
"document_loaders/fs/unstructured": "document_loaders/fs/unstructured",
"document_loaders/fs/openai_whisper_audio":
"document_loaders/fs/openai_whisper_audio",
Expand Down
195 changes: 195 additions & 0 deletions libs/langchain-community/src/document_loaders/fs/acreom.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
import type { basename as BasenameT } from "node:path";
import type { readFile as ReadFileT, stat as StatT } from "node:fs/promises";
import yaml from "js-yaml";
import { Document } from "@langchain/core/documents";
import { getEnv } from "@langchain/core/utils/env";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
import {
DirectoryLoader,
UnknownHandling,
} from "langchain/document_loaders/fs/directory";

// Shape of a parsed YAML front matter block. `tags` may be either a YAML
// list or a plain comma-separated string; any additional keys are carried
// through untouched via the index signature.
export type FrontMatter = {
  title?: string;
  description?: string;
  tags?: string[] | string;
  [key: string]: unknown;
};

// Options accepted by AcreomFileLoader (and forwarded unchanged by AcreomLoader).
export interface AcreomFileLoaderOptions {
  // File encoding used when reading from disk; defaults to "utf-8".
  encoding?: BufferEncoding;
  // When false, front matter is neither parsed nor stripped; defaults to true.
  collectMetadata?: boolean;
}

/**
 * Represents a loader for Acreom markdown files. This loader extends the BaseDocumentLoader
 * and provides functionality to parse metadata, tags, and content-specific rules for Acreom files.
 */
export class AcreomFileLoader extends BaseDocumentLoader {
  private filePath: string;

  private encoding: BufferEncoding;

  private collectMetadata: boolean;

  // YAML front matter fenced by "---" lines at the very start of the file.
  // Accepts both LF and CRLF line endings so Windows-authored files match too.
  private static FRONT_MATTER_REGEX = /^---\r?\n(.*?)\r?\n---\r?\n/s;

  // Every "#" character. Note this also strips markdown heading markers.
  private static ACREOM_HASHTAGS_REGEX = /#/g;

  // Unchecked task lines: "- [ ] ..." or "[ ] ...".
  private static ACREOM_TASKS_REGEX = /\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*/g;

  // Double-bracketed doclinks: "[[...]]" (non-greedy, one link at a time).
  private static ACREOM_LINKS_REGEX = /\[\[.*?\]\]/g;

  /**
   * Initializes a new instance of the AcreomFileLoader class.
   * @param filePath The path to the Acreom markdown file.
   * @param options Configuration options for encoding and metadata collection.
   */
  constructor(
    filePath: string,
    { encoding = "utf-8", collectMetadata = true }: AcreomFileLoaderOptions = {}
  ) {
    super();
    this.filePath = filePath;
    this.encoding = encoding;
    this.collectMetadata = collectMetadata;
  }

  /**
   * Parses YAML front matter from the given content string.
   * Returns an empty object when metadata collection is disabled, when no
   * front matter fence is present, or when the fenced text does not parse
   * to a YAML mapping. The mapping check matters: malformed front matter
   * (e.g. "title Correct Title" with the colon missing) is often *valid*
   * YAML that parses to a plain string, and spreading a string into the
   * document metadata would pollute it with character-index keys.
   * @param content The string content of the markdown file.
   * @returns An object representing the parsed front matter.
   */
  private parseFrontMatter(content: string): FrontMatter {
    if (!this.collectMetadata) {
      return {};
    }

    const match = content.match(AcreomFileLoader.FRONT_MATTER_REGEX);
    if (!match) {
      return {};
    }

    try {
      const parsed = yaml.load(match[1]);
      // yaml.load may return strings, numbers, arrays, or null for input
      // that is syntactically valid YAML but not a key/value mapping.
      if (
        typeof parsed === "object" &&
        parsed !== null &&
        !Array.isArray(parsed)
      ) {
        return parsed as FrontMatter;
      }
      console.warn("Encountered non-yaml frontmatter");
      return {};
    } catch (e) {
      console.warn("Encountered non-yaml frontmatter");
      return {};
    }
  }

  /**
   * Removes YAML front matter from the given content string.
   * A no-op when metadata collection is disabled, mirroring parseFrontMatter.
   * @param content The string content of the markdown file.
   * @returns The content string with front matter removed.
   */
  private removeFrontMatter(content: string): string {
    return this.collectMetadata
      ? content.replace(AcreomFileLoader.FRONT_MATTER_REGEX, "")
      : content;
  }

  /**
   * Processes Acreom-specific content rules, such as removing tasks, hashtags, and doclinks.
   * @param content The raw content of the markdown file.
   * @returns Cleaned content.
   */
  private processAcreomContent(content: string): string {
    return content
      .replace(AcreomFileLoader.ACREOM_TASKS_REGEX, "") // Remove tasks
      .replace(AcreomFileLoader.ACREOM_HASHTAGS_REGEX, "") // Remove hashtags
      .replace(AcreomFileLoader.ACREOM_LINKS_REGEX, ""); // Remove double-bracketed links
  }

  /**
   * Converts metadata to a format compatible with LangChain.
   * Strings and numbers pass through unchanged; every other value
   * (arrays, nested objects, booleans, null) is JSON-stringified.
   * @param metadata The metadata object to convert.
   * @returns A record object containing key-value pairs of LangChain-compatible metadata.
   */
  private toLangchainCompatibleMetadata(metadata: Record<string, unknown>) {
    const result: Record<string, unknown> = {};
    for (const [key, value] of Object.entries(metadata)) {
      if (typeof value === "string" || typeof value === "number") {
        result[key] = value;
      } else {
        result[key] = JSON.stringify(value);
      }
    }
    return result;
  }

  /**
   * Loads the Acreom file, parses it, and returns a `Document` instance.
   * File-system timestamps (created / lastModified / lastAccessed, in ms)
   * are always recorded; front matter keys are spread in last and may
   * therefore override the built-in metadata keys.
   * @returns An array of `Document` instances to comply with the BaseDocumentLoader interface.
   */
  public async load(): Promise<Document[]> {
    const { basename, readFile, stat } = await AcreomFileLoader.imports();
    const fileName = basename(this.filePath);
    const stats = await stat(this.filePath);
    let content = await readFile(this.filePath, this.encoding);

    const frontMatter = this.parseFrontMatter(content);
    content = this.removeFrontMatter(content);
    content = this.processAcreomContent(content);

    const metadata: Document["metadata"] = {
      source: fileName,
      path: this.filePath,
      created: stats.birthtimeMs,
      lastModified: stats.mtimeMs,
      lastAccessed: stats.atimeMs,
      ...this.toLangchainCompatibleMetadata(frontMatter),
    };

    return [
      new Document({
        pageContent: content,
        metadata,
      }),
    ];
  }

  /**
   * Dynamically imports required modules. Throws an error if the imports fail,
   * which indicates a non-Node environment (e.g. a browser bundle).
   * @returns An object containing the imported modules.
   */
  static async imports(): Promise<{
    basename: typeof BasenameT;
    readFile: typeof ReadFileT;
    stat: typeof StatT;
  }> {
    try {
      const { basename } = await import("node:path");
      const { readFile, stat } = await import("node:fs/promises");
      return { basename, readFile, stat };
    } catch (e) {
      console.error(e);
      throw new Error(
        `Failed to load fs/promises. AcreomFileLoader available only in 'node' environment. Current environment: '${getEnv()}'.`
      );
    }
  }
}

/**
 * Directory loader for Acreom vaults. Walks a directory tree recursively,
 * handing every `.md` file to an AcreomFileLoader (YAML front matter plus
 * Acreom task/hashtag/doclink stripping) and silently skipping any other
 * file type it encounters.
 */
export class AcreomLoader extends DirectoryLoader {
  /**
   * Initializes a new instance of the AcreomLoader class.
   * @param directoryPath The path to the directory containing Acreom markdown files.
   * @param options Configuration options for encoding and metadata collection.
   */
  constructor(directoryPath: string, options?: AcreomFileLoaderOptions) {
    // Only markdown files are mapped to a loader; everything else falls
    // through to the UnknownHandling.Ignore policy below.
    const markdownLoaders = {
      ".md": (markdownPath: string) =>
        new AcreomFileLoader(markdownPath, options),
    };
    super(
      directoryPath,
      markdownLoaders,
      true, // Recursive directory loading
      UnknownHandling.Ignore // Ignore unknown file types
    );
  }
}
85 changes: 85 additions & 0 deletions libs/langchain-community/src/document_loaders/tests/acreom.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import path from "path";
import { fileURLToPath } from "url";
import { AcreomFileLoader } from "../fs/acreom.js";

// Resolve the test data path relative to this file
// Resolve the test data path relative to this file
const currentDirectory = path.dirname(fileURLToPath(import.meta.url));
const testDataPath = path.resolve(currentDirectory, "example_data/acreom");

// Unit tests for AcreomFileLoader against the fixtures in example_data/acreom.
describe("AcreomFileLoader", () => {
  const encoding = "utf8" as BufferEncoding;

  // Well-formed YAML front matter: keys surface in metadata, fence is stripped.
  it("should parse metadata and content correctly from frontmatter.md", async () => {
    const filePath = path.join(testDataPath, "frontmatter.md");
    const loader = new AcreomFileLoader(filePath, { encoding });
    const documents = await loader.load();

    const document = documents[0];

    expect(document).toBeDefined();
    // toMatchObject is a subset check: fs timestamps (created/lastModified/
    // lastAccessed) are also present but deliberately not pinned here.
    expect(document.metadata).toMatchObject({
      source: "frontmatter.md",
      path: filePath,
      title: "Correct Title",
      tags: "Tag1, Tag2",
      author: "Test Author",
    });

    expect(document.pageContent.trim()).toBe(
      "This is the content of the file with correct front matter."
    );
  });

  // No front matter fence at all: content passes through untouched.
  it("should handle no front matter in no_frontmatter.md gracefully", async () => {
    const filePath = path.join(testDataPath, "no_frontmatter.md");
    const loader = new AcreomFileLoader(filePath, { encoding });
    const documents = await loader.load();

    const document = documents[0];

    expect(document).toBeDefined();
    expect(document.metadata).toMatchObject({
      source: "no_frontmatter.md",
      path: filePath,
    });

    expect(document.pageContent.trim()).toBe(
      "This content does not have front matter. Only plain text."
    );
  });

  // Malformed front matter (missing colons): loader must not throw, and the
  // fence is still stripped from the page content.
  it("should handle bad front matter in bad_frontmatter.md gracefully", async () => {
    const filePath = path.join(testDataPath, "bad_frontmatter.md");
    const loader = new AcreomFileLoader(filePath, { encoding });
    const documents = await loader.load();

    const document = documents[0];

    expect(document).toBeDefined();
    // NOTE(review): toMatchObject only checks the listed keys, so this test
    // would not catch stray metadata keys leaking in from a malformed parse —
    // consider asserting the full metadata key set here.
    expect(document.metadata).toMatchObject({
      source: "bad_frontmatter.md",
      path: filePath,
    });

    expect(document.pageContent.trim()).toBe(
      "This is the content of the file with bad front matter."
    );
  });

  it("should ignore tasks, hashtags, and doclinks in frontmatter.md", async () => {
    const filePath = path.join(testDataPath, "frontmatter.md");
    const loader = new AcreomFileLoader(filePath, { encoding });
    const documents = await loader.load();

    const document = documents[0];

    expect(document).toBeDefined();

    // Ensure tasks, hashtags, and doclinks are removed from the content
    // NOTE(review): the frontmatter.md fixture contains no tasks, hashtags,
    // or doclinks, so these assertions pass vacuously — add a fixture that
    // actually contains "- [ ]", "#tag", and "[[link]]" to exercise removal.
    expect(document.pageContent).not.toContain("[ ]");
    expect(document.pageContent).not.toContain("#");
    expect(document.pageContent).not.toContain("[[");
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
title Correct Title
tags Tag1, Tag2
---

This is the content of the file with bad front matter.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
title: Correct Title
tags: Tag1, Tag2
author: Test Author
---

This is the content of the file with correct front matter.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This content does not have front matter. Only plain text.