diff --git a/langchain/langchain.config.js b/langchain/langchain.config.js
index d2a317e76fb7..17c7f862d365 100644
--- a/langchain/langchain.config.js
+++ b/langchain/langchain.config.js
@@ -18,7 +18,7 @@ export const config = {
     /@langchain\/community/,
     "axios", // axios is a dependency of openai
     "mysql2/promise",
-    "notion-to-md/build/utils/notion.js"
+    "notion-to-md/build/utils/notion.js",
   ],
   entrypoints: {
     load: "load/index",
@@ -67,7 +67,7 @@ export const config = {
     // text_splitter
     text_splitter: "text_splitter",
     // memory
-    "memory": "memory/index",
+    memory: "memory/index",
     "memory/chat_memory": "memory/chat_memory",
     // document
     document: "document",
@@ -146,11 +146,7 @@ export const config = {
     "schema/query_constructor": "schema/query_constructor",
     "schema/prompt_template": "schema/prompt_template",
   },
-  deprecatedOmitFromImportMap: [
-    "document",
-    "load/serializable",
-    "runnables",
-  ],
+  deprecatedOmitFromImportMap: ["document", "load/serializable", "runnables"],
   requiresOptionalDependency: [
     "agents/load",
     "agents/toolkits/sql",
@@ -201,6 +197,7 @@ export const config = {
     "document_loaders/fs/csv",
     "document_loaders/fs/notion",
     "document_loaders/fs/obsidian",
+    "document_loaders/fs/acreom",
     "document_loaders/fs/unstructured",
     "document_loaders/fs/openai_whisper_audio",
     "document_loaders/fs/pptx",
@@ -293,9 +290,7 @@ export const config = {
       path: "@langchain/core/prompts",
     },
     {
-      modules: [
-        "ImagePromptTemplate",
-      ],
+      modules: ["ImagePromptTemplate"],
       alias: ["prompts", "image"],
       path: "@langchain/core/prompts",
     },
@@ -337,7 +332,7 @@ export const config = {
       modules: ["ChatGenerationChunk", "GenerationChunk"],
       alias: ["schema", "output"],
       path: "@langchain/core/outputs",
-    }
+    },
   ],
   shouldTestExports: true,
   tsConfigPath: resolve("./tsconfig.json"),
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
index dc58963eed75..e392dee034b6 100644
--- a/libs/langchain-community/langchain.config.js
+++ b/libs/langchain-community/langchain.config.js
@@ -80,7 +80,8 @@ export const config = {
     "embeddings/gradient_ai": "embeddings/gradient_ai",
     "embeddings/hf": "embeddings/hf",
     "embeddings/hf_transformers": "embeddings/hf_transformers",
-    "embeddings/huggingface_transformers": "embeddings/huggingface_transformers",
+    "embeddings/huggingface_transformers":
+      "embeddings/huggingface_transformers",
     "embeddings/ibm": "embeddings/ibm",
     "embeddings/jina": "embeddings/jina",
     "embeddings/llama_cpp": "embeddings/llama_cpp",
@@ -317,6 +318,7 @@ export const config = {
     "document_loaders/fs/csv": "document_loaders/fs/csv",
     "document_loaders/fs/notion": "document_loaders/fs/notion",
     "document_loaders/fs/obsidian": "document_loaders/fs/obsidian",
+    "document_loaders/fs/acreom": "document_loaders/fs/acreom",
     "document_loaders/fs/unstructured": "document_loaders/fs/unstructured",
     "document_loaders/fs/openai_whisper_audio":
       "document_loaders/fs/openai_whisper_audio",
diff --git a/libs/langchain-community/src/document_loaders/fs/acreom.ts b/libs/langchain-community/src/document_loaders/fs/acreom.ts
new file mode 100644
index 000000000000..e97efdc1e38f
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/fs/acreom.ts
@@ -0,0 +1,212 @@
+import type { basename as BasenameT } from "node:path";
+import type { readFile as ReadFileT, stat as StatT } from "node:fs/promises";
+import yaml from "js-yaml";
+import { Document } from "@langchain/core/documents";
+import { getEnv } from "@langchain/core/utils/env";
+import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
+import {
+  DirectoryLoader,
+  UnknownHandling,
+} from "langchain/document_loaders/fs/directory";
+
+export type FrontMatter = {
+  title?: string;
+  description?: string;
+  tags?: string[] | string;
+  [key: string]: unknown;
+};
+
+export interface AcreomFileLoaderOptions {
+  encoding?: BufferEncoding;
+  collectMetadata?: boolean;
+}
+
+/**
+ * Represents a loader for Acreom markdown files. This loader extends the BaseDocumentLoader
+ * and provides functionality to parse metadata, tags, and content-specific rules for Acreom files.
+ */
+export class AcreomFileLoader extends BaseDocumentLoader {
+  private filePath: string;
+
+  private encoding: BufferEncoding;
+
+  private collectMetadata: boolean;
+
+  // Front matter must start the file; line endings may be LF or CRLF.
+  private static FRONT_MATTER_REGEX = /^---\r?\n(.*?)\r?\n---\r?\n/s;
+
+  // Matches every `#` character, so heading markers are stripped as well as tag markers.
+  private static ACREOM_HASHTAGS_REGEX = /#/g;
+
+  private static ACREOM_TASKS_REGEX = /\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*/g;
+
+  private static ACREOM_LINKS_REGEX = /\[\[.*?\]\]/g;
+
+  /**
+   * Initializes a new instance of the AcreomFileLoader class.
+   * @param filePath The path to the Acreom markdown file.
+   * @param options Configuration options for encoding and metadata collection.
+   */
+  constructor(
+    filePath: string,
+    { encoding = "utf-8", collectMetadata = true }: AcreomFileLoaderOptions = {}
+  ) {
+    super();
+    this.filePath = filePath;
+    this.encoding = encoding;
+    this.collectMetadata = collectMetadata;
+  }
+
+  /**
+   * Parses YAML front matter from the given content string.
+   * @param content The string content of the markdown file.
+   * @returns The parsed front matter, or an empty object when metadata collection is
+   * disabled, no front matter is present, or the front matter is not a YAML mapping.
+   */
+  private parseFrontMatter(content: string): FrontMatter {
+    if (!this.collectMetadata) {
+      return {};
+    }
+
+    const match = content.match(AcreomFileLoader.FRONT_MATTER_REGEX);
+    if (!match) {
+      return {};
+    }
+
+    try {
+      const parsed: unknown = yaml.load(match[1]);
+      // yaml.load yields a scalar, array, null, or undefined when the block is not a
+      // key/value mapping. Passing a string on would add one metadata key per
+      // character, and undefined would throw, so only plain mappings are accepted.
+      if (
+        typeof parsed !== "object" ||
+        parsed === null ||
+        Array.isArray(parsed)
+      ) {
+        console.warn("Encountered non-yaml frontmatter");
+        return {};
+      }
+      return parsed as FrontMatter;
+    } catch (e) {
+      console.warn("Encountered non-yaml frontmatter");
+      return {};
+    }
+  }
+
+  /**
+   * Removes YAML front matter from the given content string.
+   * @param content The string content of the markdown file.
+   * @returns The content without its front matter; returned unchanged when metadata
+   * collection is disabled.
+   */
+  private removeFrontMatter(content: string): string {
+    return this.collectMetadata
+      ? content.replace(AcreomFileLoader.FRONT_MATTER_REGEX, "")
+      : content;
+  }
+
+  /**
+   * Processes Acreom-specific content rules, such as removing tasks, hashtags, and doclinks.
+   * @param content The raw content of the markdown file.
+   * @returns Cleaned content.
+   */
+  private processAcreomContent(content: string): string {
+    return content
+      .replace(AcreomFileLoader.ACREOM_TASKS_REGEX, "") // Remove tasks
+      .replace(AcreomFileLoader.ACREOM_HASHTAGS_REGEX, "") // Remove hashtags
+      .replace(AcreomFileLoader.ACREOM_LINKS_REGEX, ""); // Remove double-bracketed links
+  }
+
+  /**
+   * Converts metadata to a format compatible with LangChain.
+   * @param metadata The metadata object to convert.
+   * @returns A record in which strings and numbers are kept as-is and every other value
+   * is replaced by its JSON serialization.
+   */
+  private toLangchainCompatibleMetadata(metadata: Record<string, unknown>) {
+    const result: Record<string, unknown> = {};
+    for (const [key, value] of Object.entries(metadata)) {
+      if (typeof value === "string" || typeof value === "number") {
+        result[key] = value;
+      } else {
+        result[key] = JSON.stringify(value);
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Loads the Acreom file, parses it, and returns a `Document` instance.
+   * @returns An array of `Document` instances to comply with the BaseDocumentLoader interface.
+   */
+  public async load(): Promise<Document[]> {
+    const { basename, readFile, stat } = await AcreomFileLoader.imports();
+    const fileName = basename(this.filePath);
+    const stats = await stat(this.filePath);
+    let content = await readFile(this.filePath, this.encoding);
+
+    const frontMatter = this.parseFrontMatter(content);
+    content = this.removeFrontMatter(content);
+    content = this.processAcreomContent(content);
+
+    const metadata: Document["metadata"] = {
+      source: fileName,
+      path: this.filePath,
+      created: stats.birthtimeMs,
+      lastModified: stats.mtimeMs,
+      lastAccessed: stats.atimeMs,
+      ...this.toLangchainCompatibleMetadata(frontMatter),
+    };
+
+    return [
+      new Document({
+        pageContent: content,
+        metadata,
+      }),
+    ];
+  }
+
+  /**
+   * Dynamically imports required modules. Throws an error if the imports fail.
+   * @returns An object containing the imported modules.
+   */
+  static async imports(): Promise<{
+    basename: typeof BasenameT;
+    readFile: typeof ReadFileT;
+    stat: typeof StatT;
+  }> {
+    try {
+      const { basename } = await import("node:path");
+      const { readFile, stat } = await import("node:fs/promises");
+      return { basename, readFile, stat };
+    } catch (e) {
+      console.error(e);
+      throw new Error(
+        `Failed to load fs/promises. AcreomFileLoader available only in 'node' environment. Current environment: '${getEnv()}'.`
+      );
+    }
+  }
+}
+
+/**
+ * Represents a loader for directories containing Acreom markdown files. This loader extends
+ * the DirectoryLoader and provides functionality to load and parse `.md` files with YAML frontmatter
+ * and Acreom-specific rules for tasks, hashtags, and links.
+ */
+export class AcreomLoader extends DirectoryLoader {
+  /**
+   * Initializes a new instance of the AcreomLoader class.
+   * @param directoryPath The path to the directory containing Acreom markdown files.
+   * @param options Configuration options for encoding and metadata collection.
+   */
+  constructor(directoryPath: string, options?: AcreomFileLoaderOptions) {
+    super(
+      directoryPath,
+      {
+        ".md": (filePath) => new AcreomFileLoader(filePath, options),
+      },
+      true, // Recursive directory loading
+      UnknownHandling.Ignore // Ignore unknown file types
+    );
+  }
+}
diff --git a/libs/langchain-community/src/document_loaders/tests/acreom.test.ts b/libs/langchain-community/src/document_loaders/tests/acreom.test.ts
new file mode 100644
index 000000000000..e08fdb6c3072
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/acreom.test.ts
@@ -0,0 +1,88 @@
+import path from "path";
+import { fileURLToPath } from "url";
+import { AcreomFileLoader } from "../fs/acreom.js";
+
+// Resolve the test data path relative to this file
+const testDataPath = path.resolve(
+  path.dirname(fileURLToPath(import.meta.url)),
+  "example_data/acreom"
+);
+
+describe("AcreomFileLoader", () => {
+  const encoding = "utf8" as BufferEncoding;
+
+  it("should parse metadata and content correctly from frontmatter.md", async () => {
+    const filePath = path.join(testDataPath, "frontmatter.md");
+    const loader = new AcreomFileLoader(filePath, { encoding });
+    const documents = await loader.load();
+
+    const document = documents[0];
+
+    expect(document).toBeDefined();
+    expect(document.metadata).toMatchObject({
+      source: "frontmatter.md",
+      path: filePath,
+      title: "Correct Title",
+      tags: "Tag1, Tag2",
+      author: "Test Author",
+    });
+
+    expect(document.pageContent.trim()).toBe(
+      "This is the content of the file with correct front matter."
+    );
+  });
+
+  it("should handle no front matter in no_frontmatter.md gracefully", async () => {
+    const filePath = path.join(testDataPath, "no_frontmatter.md");
+    const loader = new AcreomFileLoader(filePath, { encoding });
+    const documents = await loader.load();
+
+    const document = documents[0];
+
+    expect(document).toBeDefined();
+    expect(document.metadata).toMatchObject({
+      source: "no_frontmatter.md",
+      path: filePath,
+    });
+
+    expect(document.pageContent.trim()).toBe(
+      "This content does not have front matter. Only plain text."
+    );
+  });
+
+  it("should handle bad front matter in bad_frontmatter.md gracefully", async () => {
+    const filePath = path.join(testDataPath, "bad_frontmatter.md");
+    const loader = new AcreomFileLoader(filePath, { encoding });
+    const documents = await loader.load();
+
+    const document = documents[0];
+
+    expect(document).toBeDefined();
+    expect(document.metadata).toMatchObject({
+      source: "bad_frontmatter.md",
+      path: filePath,
+    });
+
+    // A non-mapping front matter must not leak per-character keys into metadata
+    expect(document.metadata).not.toHaveProperty("0");
+
+    expect(document.pageContent.trim()).toBe(
+      "This is the content of the file with bad front matter."
+    );
+  });
+
+  it("should ignore tasks, hashtags, and doclinks in frontmatter.md", async () => {
+    const filePath = path.join(testDataPath, "frontmatter.md");
+    const loader = new AcreomFileLoader(filePath, { encoding });
+    const documents = await loader.load();
+
+    const document = documents[0];
+
+    expect(document).toBeDefined();
+
+    // Ensure tasks, hashtags, and doclinks are removed from the content
+    expect(document.pageContent).not.toContain("[ ]");
+    expect(document.pageContent).not.toContain("#");
+    expect(document.pageContent).not.toContain("[[");
+  });
+});
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/acreom/bad_frontmatter.md b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/bad_frontmatter.md
new file mode 100644
index 000000000000..19df08388482
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/bad_frontmatter.md
@@ -0,0 +1,6 @@
+---
+title Correct Title
+tags Tag1, Tag2
+---
+
+This is the content of the file with bad front matter.
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/acreom/frontmatter.md b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/frontmatter.md
new file mode 100644
index 000000000000..031dfd8c8636
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/frontmatter.md
@@ -0,0 +1,7 @@
+---
+title: Correct Title
+tags: Tag1, Tag2
+author: Test Author
+---
+
+This is the content of the file with correct front matter.
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/acreom/no_frontmatter.md b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/no_frontmatter.md
new file mode 100644
index 000000000000..da210c9b5601
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/acreom/no_frontmatter.md
@@ -0,0 +1 @@
+This content does not have front matter. Only plain text.