diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index f996d3674..9921eb454 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -4,7 +4,7 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
@@ -13,6 +13,8 @@ import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { DefaultTokenBuilder } from './token-builder.js';
 import { DefaultLexer, isTokenTypeArray } from './lexer.js';
 
+type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName];
+
 export interface IndentationTokenBuilderOptions<TokenName extends string = string> {
     /**
      * The name of the token used to denote indentation in the grammar.
@@ -44,14 +46,29 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = string> {
 }
 
 export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
     indentTokenName: 'INDENT',
     dedentTokenName: 'DEDENT',
     whitespaceTokenName: 'WS',
+    ignoreIndentationDelimeters: [],
 };
 
+export enum LexingMode {
+    REGULAR = 'indentation-sensitive',
+    IGNORE_INDENTATION = 'ignore-indentation',
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
@@ -108,7 +125,7 @@ export class IndentationAwareTokenBuilder ext
             throw new Error('Invalid tokens built by default builder');
         }
 
-        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;
 
         // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
         // Order should be: dedent, indent, spaces
@@ -117,6 +134,13 @@ export class IndentationAwareTokenBuilder ext
         let ws: TokenType | undefined;
         const otherTokens: TokenType[] = [];
         for (const tokenType of tokenTypes) {
+            for (const [begin, end] of ignoreIndentationDelimeters) {
+                if (tokenType.name === begin) {
+                    tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
+                } else if (tokenType.name === end) {
+                    tokenType.POP_MODE = true;
+                }
+            }
             if (tokenType.name === dedentTokenName) {
                 dedent = tokenType;
             } else if (tokenType.name === indentTokenName) {
@@ -130,7 +154,19 @@ export class IndentationAwareTokenBuilder ext
         if (!dedent || !indent || !ws) {
             throw new Error('Some indentation/whitespace tokens not found!');
         }
-        return [dedent, indent, ws, ...otherTokens];
+
+        if (ignoreIndentationDelimeters.length > 0) {
+            const multiModeLexerDef: IMultiModeLexerDefinition = {
+                modes: {
+                    [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
+                    [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
+                },
+                defaultMode: LexingMode.REGULAR,
+            };
+            return multiModeLexerDef;
+        } else {
+            return [dedent, indent, ws, ...otherTokens];
+        }
     }
 
     /**
@@ -283,7 +319,6 @@ export class IndentationAwareTokenBuilder ext
                 group: Lexer.SKIPPED,
             });
         }
-
         return tokenType;
     }
 
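
The mechanism the patch relies on is ordinary Chevrotain multi-mode lexing: the begin delimiter token gets PUSH_MODE set to a mode whose token set simply omits INDENT and DEDENT, and the end delimiter token gets POP_MODE to return to the default mode. The standalone sketch below (not part of the patch) illustrates that mechanism directly against the Chevrotain API; the token names and the toy INDENT pattern are invented for illustration and are not what the Langium token builder actually produces.

import { createToken, Lexer, type IMultiModeLexerDefinition } from 'chevrotain';

// Hypothetical tokens; the real ones are derived from the grammar's terminals and keywords.
const LBracket = createToken({ name: 'LBracket', pattern: /\[/, push_mode: 'ignore-indentation' });
const RBracket = createToken({ name: 'RBracket', pattern: /\]/, pop_mode: true });
const Indent = createToken({ name: 'INDENT', pattern: /\n[ \t]+/ }); // toy pattern, not Langium's custom matcher
const Id = createToken({ name: 'ID', pattern: /[a-z]+/ });
const Ws = createToken({ name: 'WS', pattern: /[ \t\n]+/, group: Lexer.SKIPPED });

const multiModeDef: IMultiModeLexerDefinition = {
    modes: {
        'indentation-sensitive': [Indent, LBracket, RBracket, Id, Ws],
        'ignore-indentation': [LBracket, RBracket, Id, Ws],
    },
    defaultMode: 'indentation-sensitive',
};

const lexer = new Lexer(multiModeDef);
const { tokens } = lexer.tokenize('[\n  a\n  b\n]');
// Logs ['LBracket', 'ID', 'ID', 'RBracket']: no INDENT is emitted between the brackets,
// because the pushed mode does not contain the INDENT token type at all.
console.log(tokens.map(t => t.tokenType.name));
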
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 9afc89573..b79f25708 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 import type { TokenType } from '@chevrotain/types';
-import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
+import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
 import { beforeEach, describe, expect, test } from 'vitest';
 import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
@@ -20,25 +20,30 @@ const tokenBuilder = new IndentationAwareTokenBuilder();
 
 async function getTokens(grammarString: string): Promise<TokenType[]> {
     const grammar = (await helper(grammarString)).parseResult.value;
-    return tokenBuilder.buildTokens(grammar) as TokenType[];
+    const tokens = tokenBuilder.buildTokens(grammar);
+    if (Array.isArray(tokens)) {
+        return tokens;
+    } else {
+        return tokens.modes[tokens.defaultMode];
+    }
 }
 
-async function getLexer(grammar: string): Promise<Lexer> {
-    const services = await createIndentationAwareServices(grammar);
+async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
+    const services = await createIndentationAwareServices(grammar, options);
     return services.parser.Lexer;
 }
 
-async function getParser(grammar: string): Promise<LangiumParser> {
-    const services = await createIndentationAwareServices(grammar);
+async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
+    const services = await createIndentationAwareServices(grammar, options);
     return services.parser.LangiumParser;
 }
 
-async function createIndentationAwareServices(grammar: string): Promise {
+async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise {
     const services = await createServicesForGrammar({
         grammar,
         module: {
             parser: {
-                TokenBuilder: () => new IndentationAwareTokenBuilder(),
+                TokenBuilder: () => new IndentationAwareTokenBuilder(options),
                 Lexer: services => new IndentationAwareLexer(services)
             }
         } satisfies Module
@@ -68,10 +73,9 @@ describe('IndentationAwareTokenBuilder', () => {
 
         expect(tokenTypes).toHaveLength(5);
 
-        const [dedent, indent, ws] = tokenTypes;
+        const [dedent, indent] = tokenTypes;
         expect(dedent.name).toBe('DEDENT');
         expect(indent.name).toBe('INDENT');
-        expect(ws.name).toBe('WS');
     });
 
     test('Modifies indent/dedent patterns to be functions', async () => {
@@ -200,6 +204,94 @@
 
 });
 
+describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {
+
+    const grammar = `
+        grammar PythonIfWithLists
+
+        entry Statement: (If | Return)*;
+
+        If:
+            'if' condition=BOOLEAN ':'
+            INDENT thenBlock+=Statement+ DEDENT
+            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;
+
+        Return: 'return' value=Expression;
+
+        Expression: List | Tuple | BOOLEAN;
+
+        Tuple: '(' (elements+=Expression (',' elements+=Expression)*)? ')';
+        List: '[' (elements+=Expression (',' elements+=Expression)*)? ']';
+
+        terminal BOOLEAN returns boolean: /true|false/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+    `;
+
+    const lexer = await getLexer(grammar, {
+        ignoreIndentationDelimeters: [
+            ['(', ')'],
+            ['[', ']'],
+        ],
+    });
+
+    test('should behave as usual without the given tokens in the input', async () => {
+        const { errors } = lexer.tokenize(expandToString`
+            if true:
+                return false
+            else:
+                return true
+        `);
+        expect(errors).toHaveLength(0);
+    });
+
+    test('should ignore indentation inside the given delimiters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                false,
+                    true, // including inconsistent indentation
+              true
+            ]
+            return (true,
+                false
+            )
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+    test('should handle nested delimiters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                [
+                    false,
+                    true
+                ],
+                ([true,
+                    true],
+                false)
+                [
+                    true
+                ]
+            ]
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+});
+
 describe('IndentationAware parsing', () => {
 
     const sampleGrammar = `
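
From an adopter's point of view, enabling the new option is just a matter of passing it to the IndentationAwareTokenBuilder constructor when wiring up the parser services, exactly as the createIndentationAwareServices helper in the test above does. Below is a minimal sketch under the same assumptions: the grammar string must declare the synthetic INDENT/DEDENT terminals and a hidden WS terminal like the test grammar, the helper name createBracketAwareServices is made up here, and the option spelling follows the patch.

import { IndentationAwareTokenBuilder, IndentationAwareLexer } from 'langium';
import { createServicesForGrammar } from 'langium/grammar';

// Hypothetical helper: indentation between the '[' ']' and '(' ')' keyword tokens is
// lexed as ordinary whitespace; everywhere else INDENT/DEDENT are emitted as before.
async function createBracketAwareServices(grammarString: string) {
    return createServicesForGrammar({
        grammar: grammarString,
        module: {
            parser: {
                TokenBuilder: () => new IndentationAwareTokenBuilder({
                    ignoreIndentationDelimeters: [
                        ['[', ']'],
                        ['(', ')'],
                    ],
                }),
                Lexer: services => new IndentationAwareLexer(services)
            }
        }
    });
}

// Usage (the grammar must declare INDENT/DEDENT/WS like the test grammar above):
// const services = await createBracketAwareServices(grammar);
// const { tokens } = services.parser.Lexer.tokenize('return [\n    true,\n  false\n]');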