diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
new file mode 100644
index 000000000..194acd198
--- /dev/null
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -0,0 +1,413 @@
+/******************************************************************************
+ * Copyright 2024 TypeFox GmbH
+ * This program and the accompanying materials are made available under the
+ * terms of the MIT License, which is available in the project root.
+ ******************************************************************************/
+
+import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
+import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
+import type { TokenBuilderOptions } from './token-builder.js';
+import type { LexerResult } from './lexer.js';
+import type { LangiumCoreServices } from '../services.js';
+import { createToken, createTokenInstance, Lexer } from 'chevrotain';
+import { DefaultTokenBuilder } from './token-builder.js';
+import { DefaultLexer, isTokenTypeArray } from './lexer.js';
+
+/**
+ * Options to customize the names of the terminals that the
+ * {@link IndentationAwareTokenBuilder} looks up in the grammar.
+ */
+export interface IndentationTokenBuilderOptions {
+    /**
+     * The name of the token used to denote indentation in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * terminal INDENT: ':synthetic-indent:';
+     * ```
+     *
+     * @default 'INDENT'
+     */
+    indentTokenName: string;
+    /**
+     * The name of the token used to denote deindentation in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * terminal DEDENT: ':synthetic-dedent:';
+     * ```
+     *
+     * @default 'DEDENT'
+     */
+    dedentTokenName: string;
+    /**
+     * The name of the token used to denote whitespace other than indentation and newlines in the grammar.
+     * A possible definition in the grammar could look like this:
+     * ```langium
+     * hidden terminal WS: /[ \t]+/;
+     * ```
+     *
+     * @default 'WS'
+     */
+    whitespaceTokenName: string;
+}
+
+/**
+ * Default terminal names; used for every option the caller does not supply.
+ */
+export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
+    indentTokenName: 'INDENT',
+    dedentTokenName: 'DEDENT',
+    whitespaceTokenName: 'WS',
+};
+
+/**
+ * A token builder that is sensitive to indentation in the input text.
+ * It will generate tokens for indentation and dedentation based on the indentation level.
+ *
+ * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
+ */
+export class IndentationAwareTokenBuilder extends DefaultTokenBuilder {
+    /**
+     * The stack in which all the previously matched indentation levels are stored
+     * to understand how deeply the next tokens are nested.
+     */
+    protected indentationStack: number[] = [0];
+
+    /**
+     * The effective options: the defaults overridden by the constructor argument
+     */
+    readonly options: IndentationTokenBuilderOptions;
+
+    /**
+     * The token type to be used for indentation tokens
+     */
+    readonly indentTokenType: TokenType;
+
+    /**
+     * The token type to be used for dedentation tokens
+     */
+    readonly dedentTokenType: TokenType;
+
+    /**
+     * A regular expression to match a series of tabs and/or spaces.
+     * Override this to customize what the indentation is allowed to consist of.
+     */
+    protected whitespaceRegExp = /[ \t]+/y;
+
+    /**
+     * @param options Token names to use instead of the defaults; omitted entries fall back to
+     * {@link indentationBuilderDefaultOptions}
+     */
+    constructor(options: Partial<IndentationTokenBuilderOptions> = indentationBuilderDefaultOptions) {
+        super();
+        this.options = {
+            ...indentationBuilderDefaultOptions,
+            ...options,
+        };
+
+        this.indentTokenType = createToken({
+            name: this.options.indentTokenName,
+            pattern: this.indentMatcher,
+            line_breaks: false,
+        });
+
+        this.dedentTokenType = createToken({
+            name: this.options.dedentTokenName,
+            pattern: this.dedentMatcher,
+            line_breaks: false,
+        });
+    }
+
+    /**
+     * Builds the token types of the grammar and moves the dedent, indent and whitespace
+     * token types (in that order) to the front of the result.
+     *
+     * @param grammar The grammar to build the token types for
+     * @param options Options passed through to the default token builder
+     * @returns The reordered token types
+     * @throws If the default builder does not return a plain token type array, or if the
+     * grammar lacks the indent, dedent or whitespace terminal
+     */
+    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
+        const tokenTypes = super.buildTokens(grammar, options);
+        if (!isTokenTypeArray(tokenTypes)) {
+            throw new Error('Invalid tokens built by default builder');
+        }
+
+        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+
+        // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
+        // Order should be: dedent, indent, spaces
+        let dedent: TokenType | undefined;
+        let indent: TokenType | undefined;
+        let ws: TokenType | undefined;
+        const otherTokens: TokenType[] = [];
+        for (const tokenType of tokenTypes) {
+            if (tokenType.name === dedentTokenName) {
+                dedent = tokenType;
+            } else if (tokenType.name === indentTokenName) {
+                indent = tokenType;
+            } else if (tokenType.name === whitespaceTokenName) {
+                ws = tokenType;
+            } else {
+                otherTokens.push(tokenType);
+            }
+        }
+        if (!dedent || !indent || !ws) {
+            throw new Error('Some indentation/whitespace tokens not found!');
+        }
+        return [dedent, indent, ws, ...otherTokens];
+    }
+
+    /**
+     * Helper function to check if the current position is the start of a new line.
+     *
+     * @param text The full input string.
+     * @param offset The current position at which to check
+     * @returns Whether the current position is the start of a new line
+     */
+    protected isStartOfLine(text: string, offset: number): boolean {
+        return offset === 0 || '\r\n'.includes(text[offset - 1]);
+    }
+
+    /**
+     * A helper function used in matching both indents and dedents.
+     *
+     * @param text The full input string.
+     * @param offset The current position at which to attempt a match
+     * @returns The current and previous indentation levels and the matched whitespace
+     */
+    protected matchWhitespace(text: string, offset: number) {
+        // The regular expression is sticky, so it only matches directly at `offset`
+        this.whitespaceRegExp.lastIndex = offset;
+        const match = this.whitespaceRegExp.exec(text);
+        return {
+            currIndentLevel: match?.[0].length ?? 0,
+            prevIndentLevel: this.indentationStack.at(-1)!,
+            match,
+        };
+    }
+
+    /**
+     * Helper function to create an instance of an indentation token.
+     *
+     * @param tokenType Indent or dedent token type
+     * @param text Full input string, used to calculate the line number
+     * @param image The original image of the token (tabs or spaces)
+     * @param offset Current position in the input string
+     * @returns The indentation token instance
+     */
+    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
+        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
+        return createTokenInstance(
+            tokenType,
+            image,
+            offset, offset + image.length,
+            lineNumber, lineNumber,
+            // Chevrotain columns are 1-based
+            1, image.length,
+        );
+    }
+
+    /**
+     * A custom pattern for matching indents
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     */
+    protected indentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
+        const { indentTokenName } = this.options;
+
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+
+        if (currIndentLevel <= prevIndentLevel) {
+            // shallower indentation (should be matched by dedent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        this.indentationStack.push(currIndentLevel);
+
+        const indentToken = this.createIndentationTokenInstance(
+            this.indentTokenType,
+            text,
+            match?.[0] ?? indentTokenName,
+            offset,
+        );
+        tokens.push(indentToken);
+
+        // Token already added, let the indentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    /**
+     * A custom pattern for matching dedents
+     *
+     * @param text The full input string.
+     * @param offset The offset at which to attempt a match
+     * @param tokens Previously scanned Tokens
+     * @param groups Token Groups
+     */
+    protected dedentMatcher: CustomPatternMatcherFunc = (text, offset, tokens, _groups) => {
+        const { dedentTokenName } = this.options;
+
+        if (!this.isStartOfLine(text, offset)) {
+            return null;
+        }
+
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+
+        if (currIndentLevel >= prevIndentLevel) {
+            // bigger indentation (should be matched by indent)
+            // or same indentation level (should be matched by whitespace and ignored)
+            return null;
+        }
+
+        const matchIndentIndex = this.indentationStack.lastIndexOf(currIndentLevel);
+
+        // Any dedent must match some previous indentation level.
+        if (matchIndentIndex === -1) {
+            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
+            // throwing an error would crash the language server
+            // TODO: find a way to report error diagnostics message
+            return null;
+        }
+
+        const numberOfDedents = this.indentationStack.length - matchIndentIndex - 1;
+
+        for (let i = 0; i < numberOfDedents; i++) {
+            const token = this.createIndentationTokenInstance(
+                this.dedentTokenType,
+                text,
+                match?.[0] ?? dedentTokenName,
+                offset,
+            );
+            tokens.push(token);
+            this.indentationStack.pop();
+        }
+
+        // Token already added, let the dedentation now be consumed as whitespace and ignored
+        return null;
+    };
+
+    /**
+     * Builds the token type for a terminal rule, substituting the indentation-aware
+     * token types for the indent, dedent and whitespace terminals.
+     *
+     * @param terminal The terminal rule to build a token type for
+     * @returns The token type to use for the terminal
+     */
+    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
+        const tokenType = super.buildTerminalToken(terminal);
+        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+
+        if (tokenType.name === indentTokenName) {
+            return this.indentTokenType;
+        } else if (tokenType.name === dedentTokenName) {
+            return this.dedentTokenType;
+        } else if (tokenType.name === whitespaceTokenName) {
+            return createToken({
+                name: whitespaceTokenName,
+                pattern: this.whitespaceRegExp,
+                group: Lexer.SKIPPED,
+            });
+        }
+
+        return tokenType;
+    }
+
+    /**
+     * Resets the indentation stack between different runs of the lexer
+     *
+     * @param text Full text that was tokenized
+     * @returns Remaining dedent tokens to match all previous indents at the end of the file
+     */
+    popRemainingDedents(text: string): IToken[] {
+        const remainingDedents: IToken[] = [];
+        while (this.indentationStack.length > 1) {
+            remainingDedents.push(
+                this.createIndentationTokenInstance(this.dedentTokenType, text, this.options.dedentTokenName, text.length)
+            );
+            this.indentationStack.pop();
+        }
+
+        this.indentationStack = [0];
+        return remainingDedents;
+    }
+}
+
+/**
+ * A lexer that is aware of indentation in the input text.
+ * After each run it resets the internal state of the {@link IndentationAwareTokenBuilder},
+ * appends the dedents that are still pending at the end of the input
+ * and removes indent-dedent pairs without any token in between.
+ *
+ * In your module, you can override the default lexer with this one as such:
+ * ```ts
+ * parser: {
+ *    TokenBuilder: () => new IndentationAwareTokenBuilder(),
+ *    Lexer: (services) => new IndentationAwareLexer(services),
+ * }
+ * ```
+ */
+export class IndentationAwareLexer extends DefaultLexer {
+
+    protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder;
+
+    /**
+     * @param services The language services; `parser.TokenBuilder` must be an {@link IndentationAwareTokenBuilder}
+     * @throws If a different token builder is registered
+     */
+    constructor(services: LangiumCoreServices) {
+        super(services);
+        if (services.parser.TokenBuilder instanceof IndentationAwareTokenBuilder) {
+            this.indentationTokenBuilder = services.parser.TokenBuilder;
+        } else {
+            throw new Error('IndentationAwareLexer requires an accompanying IndentationAwareTokenBuilder');
+        }
+    }
+
+    /**
+     * Tokenizes the text, appends the dedents still pending at the end of the input
+     * and removes every indent token that is directly followed by a dedent token.
+     *
+     * @param text The text to tokenize
+     * @returns The lexer result with the cleaned-up token list
+     */
+    override tokenize(text: string): LexerResult {
+        const result = super.tokenize(text);
+
+        // reset the indent stack between processing of different text inputs
+        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
+        result.tokens.push(...remainingDedents);
+
+        // remove any "indent-dedent" pair with an empty body as these are typically
+        // added by comments or lines with just whitespace but have no real value
+        const { indentTokenType, dedentTokenType } = this.indentationTokenBuilder;
+        // Use tokenTypeIdx for fast comparison
+        const indentTokenIdx = indentTokenType.tokenTypeIdx;
+        const dedentTokenIdx = dedentTokenType.tokenTypeIdx;
+        const cleanTokens: IToken[] = [];
+        const tokenCount = result.tokens.length;
+        for (let i = 0; i < tokenCount; i++) {
+            const token = result.tokens[i];
+            // The last token has no successor and is therefore always kept
+            const nextToken: IToken | undefined = result.tokens[i + 1];
+            if (token.tokenTypeIdx === indentTokenIdx && nextToken?.tokenTypeIdx === dedentTokenIdx) {
+                // Skip the dedent of the empty pair as well
+                i++;
+                continue;
+            }
+
+            cleanTokens.push(token);
+        }
+        result.tokens = cleanTokens;
+
+        return result;
+    }
+}
diff --git a/packages/langium/src/parser/index.ts b/packages/langium/src/parser/index.ts
index c0a9e8d7f..fab284a9c 100644
--- a/packages/langium/src/parser/index.ts
+++ b/packages/langium/src/parser/index.ts
@@ -7,6 +7,7 @@
 export * from './async-parser.js';
 export * from './completion-parser-builder.js';
 export * from './cst-node-builder.js';
+export * from './indentation-aware.js';
 export * from './langium-parser-builder.js';
 export * from './langium-parser.js';
 export * from './lexer.js';
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
new file mode 100644
index 000000000..269da3c2a
--- /dev/null
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright 2024 TypeFox GmbH
+ * This program and the accompanying materials are made available under the
+ * terms of the MIT License, which is available in the project root.
+ ******************************************************************************/
+
+import type { TokenType } from '@chevrotain/types';
+import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
+import { beforeEach, describe, expect, test } from 'vitest';
+import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
+import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
+import type { LangiumServices, PartialLangiumServices } from 'langium/lsp';
+import { expandToString } from 'langium/generate';
+import { parseHelper } from 'langium/test';
+
+const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar;
+const helper = parseHelper<Grammar>(grammarServices);
+
+const tokenBuilder = new IndentationAwareTokenBuilder();
+
+async function getTokens(grammarString: string): Promise<TokenType[]> {
+    const grammar = (await helper(grammarString)).parseResult.value;
+    return tokenBuilder.buildTokens(grammar) as TokenType[];
+}
+
+async function getLexer(grammar: string): Promise<Lexer> {
+    const services = await createIndentationAwareServices(grammar);
+    return services.parser.Lexer;
+}
+
+async function getParser(grammar: string): Promise<LangiumParser> {
+    const services = await createIndentationAwareServices(grammar);
+    return services.parser.LangiumParser;
+}
+
+async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
+    const services = await createServicesForGrammar({
+        grammar,
+        module: {
+            parser: {
+                TokenBuilder: () => new IndentationAwareTokenBuilder(),
+                Lexer: services => new IndentationAwareLexer(services)
+            }
+        } satisfies Module<LangiumServices, PartialLangiumServices>
+    });
+    return services;
+}
+
+beforeEach(() => {
+    tokenBuilder.popRemainingDedents('');
+});
+
+describe('IndentationAwareTokenBuilder', () => {
+
+    const sampleGrammar = `
+        entry Main:
+            INDENT name=ID DEDENT;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+    `;
+
+    test('Moves indent/dedent token types to the beginning', async () => {
+        const tokenTypes = await getTokens(sampleGrammar);
+
+        expect(tokenTypes).toHaveLength(5);
+
+        const [dedent, indent, ws] = tokenTypes;
+        expect(dedent.name).toBe('DEDENT');
+        expect(indent.name).toBe('INDENT');
+        expect(ws.name).toBe('WS');
+    });
+
+    test('Modifies indent/dedent patterns to be functions', async () => {
+        const tokenTypes = await getTokens(sampleGrammar);
+
+        expect(tokenTypes).toHaveLength(5);
+
+        const [dedent, indent] = tokenTypes;
+        expect(dedent.PATTERN).toBeTypeOf('function');
+        expect(indent.PATTERN).toBeTypeOf('function');
+    });
+
+    test('Rejects grammar without indent', async () => {
+        const indentlessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal WS: /\\s+/;
+        terminal DEDENT: 'synthetic:dedent';
+        `;
+
+        await expect(getTokens(indentlessGrammar)).rejects.toThrowError();
+    });
+
+    test('Rejects grammar without dedent', async () => {
+        const dedentlessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal WS: /\\s+/;
+        terminal INDENT: 'synthetic:indent';
+        `;
+
+        await expect(getTokens(dedentlessGrammar)).rejects.toThrowError();
+    });
+
+    test('Rejects grammar without whitespace', async () => {
+        const spacelessGrammar = `
+        entry Main: name=ID;
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        `;
+
+        await expect(getTokens(spacelessGrammar)).rejects.toThrowError();
+    });
+
+});
+
+describe('IndentationAwareLexer', () => {
+
+    const sampleGrammar = `
+        grammar Test
+
+        entry Block: '{' INDENT names+=ID* DEDENT nested+=Block* '}';
+
+        terminal ID: /[a-zA-Z_]\\w*/;
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal ML_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//;
+        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+    `;
+
+    test('should emit indent/dedent tokens around a block', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens, errors } = lexer.tokenize(expandToString`
+            {
+                name
+                anotherName
+            }`);
+
+        expect(errors).toHaveLength(0);
+        expect(tokens).toHaveLength(6);
+
+        const [/* L_BRAC */, indent, /* id1 */, /* id2 */, dedent, /* _R_BRAC */] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT');
+        expect(dedent.tokenType.name).toBe('DEDENT');
+    });
+
+    test('should ignore indent tokens before comments', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens, errors } = lexer.tokenize(expandToString`
+            // single-line comment
+                // indented comment when not expecting indentation
+            {
+                name
+                    // comment with different indentation inside block
+                anotherName
+            }`);
+
+        expect(errors).toHaveLength(0);
+        expect(tokens).toHaveLength(6);
+    });
+
+    test('should not dedect indentation without a newline', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens } = lexer.tokenize(expandToString`
+            { name
+                // indented comment - to be ignored
+            }`);
+        expect(tokens).toHaveLength(3);
+        expect(tokens[1].tokenType.name).not.toBe('INDENT');
+    });
+
+    test('should add remaining dedents to the end', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens } = lexer.tokenize(expandToString`
+        // single-line comment
+        {
+            name`);
+        expect(tokens).toHaveLength(4);
+
+        const [/* L_BRAC */, indent, /* id */, dedent] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT');
+        expect(dedent.tokenType.name).toBe('DEDENT');
+    });
+
+    test('should remove an empty indent/dedent pair at the end of the input', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens } = lexer.tokenize(expandToString`
+            {
+            }
+                // indented comment - to be ignored`);
+        expect(tokens).toHaveLength(2);
+        expect(tokens.map(token => token.tokenType.name)).not.toContain('DEDENT');
+    });
+
+    test('should not emit any token for an empty input', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens, errors } = lexer.tokenize('');
+        expect(errors).toHaveLength(0);
+        expect(tokens).toHaveLength(0);
+    });
+
+});
+
+describe('IndentationAware parsing', () => {
+
+    const sampleGrammar = `
+        grammar PythonIf
+
+        entry Statement: If | Return;
+
+        If:
+            'if' condition=BOOLEAN ':'
+            INDENT thenBlock+=Statement+ DEDENT
+            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;
+
+        Return: 'return' value=BOOLEAN;
+
+        terminal BOOLEAN returns boolean: /true|false/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+    `;
+
+    test('should parse correctly indented code', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors } = parser.parse(expandToString`
+            if true:
+                return false
+            else:
+                return true
+        `);
+
+        expect(parserErrors).toHaveLength(0);
+    });
+
+    test('should error on non-matching dedent', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors } = parser.parse(expandToString`
+            if true:
+                return false
+              else:
+                return true
+        `);
+
+        expect(parserErrors.length).toBeGreaterThan(0);
+    });
+
+    test('should throw an error on unexpected indent', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors } = parser.parse(expandToString`
+            // Parsing starts here
+                if true:
+                    return false
+        `);
+
+        expect(parserErrors.length).toBeGreaterThan(0);
+    });
+
+    test('should correctly parse nested blocks', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { parserErrors, value } = parser.parse(expandToString`
+            if true:
+                return true
+            else:
+                if false:
+                    return true
+                    return false
+                return true
+        `);
+
+        expect(parserErrors).toHaveLength(0);
+        const ifValue = value as If;
+        expect(ifValue.thenBlock).toHaveLength(1);
+        expect(ifValue.elseBlock).toHaveLength(2);
+        const elseBlock = ifValue.elseBlock[0] as If;
+        expect(elseBlock.thenBlock).toHaveLength(2);
+        const nestedReturn1 = elseBlock.thenBlock[0] as Return;
+        expect(nestedReturn1.value).toBe(true);
+        const nestedReturn2 = elseBlock.thenBlock[1] as Return;
+        expect(nestedReturn2.value).toBe(false);
+        const return2 = ifValue.elseBlock[1] as Return;
+        expect(return2.value).toBe(true);
+    });
+
+});
+
+type Statement = If | Return;
+
+interface If extends AstNode {
+    $type: 'If';
+    condition: boolean;
+    thenBlock: Statement[];
+    elseBlock: Statement[];
+}
+
+interface Return extends AstNode {
+    $type: 'Return';
+    value: boolean;
+}