Support ignoring indentation within delimiters #1608

Merged (6 commits) on Aug 21, 2024
Changes from 1 commit
39 changes: 36 additions & 3 deletions packages/langium/src/parser/indentation-aware.ts
@@ -4,7 +4,7 @@
* terms of the MIT License, which is available in the project root.
******************************************************************************/

import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
import type { TokenBuilderOptions } from './token-builder.js';
import type { LexerResult } from './lexer.js';
@@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
* @default 'WS'
*/
whitespaceTokenName: TokenName;
/**
* The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
* For example, Python doesn't treat any whitespace between `'('` and `')'` as significant.
*
* Note that this works only with terminal tokens, not keyword tokens,
* so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.
*
* @default []
*/
ignoreIndentationDelimeters: Array<[begin: TokenName, end: TokenName]>
}

export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
indentTokenName: 'INDENT',
dedentTokenName: 'DEDENT',
whitespaceTokenName: 'WS',
ignoreIndentationDelimeters: [],
};

export enum LexingMode {
REGULAR = 'indentation-sensitive',
IGNORE_INDENTATION = 'ignore-indentation',
}
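
As a usage sketch (not part of this diff, and mirroring the setup used in the tests further down): a grammar author passes the delimiter pairs when binding the token builder, and each pair must name terminal rules such as `L_PAREN`/`R_PAREN` rather than keywords. `myGrammarString` below is a hypothetical placeholder for a grammar that defines those terminals plus the synthetic `INDENT`/`DEDENT` and hidden `WS` terminals.

```ts
import { IndentationAwareTokenBuilder, IndentationAwareLexer } from 'langium';
import { createServicesForGrammar } from 'langium/grammar';

// Hypothetical grammar source; see the test grammar below for a complete example.
declare const myGrammarString: string;

async function createServices() {
    return createServicesForGrammar({
        grammar: myGrammarString,
        module: {
            parser: {
                // Indentation is ignored between each begin/end terminal pair listed here.
                TokenBuilder: () => new IndentationAwareTokenBuilder({
                    ignoreIndentationDelimeters: [
                        ['L_PAREN', 'R_PAREN'],
                        ['L_BRACKET', 'R_BRACKET'],
                    ],
                }),
                Lexer: services => new IndentationAwareLexer(services),
            },
        },
    });
}
```
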

/**
* A token builder that is sensitive to indentation in the input text.
* It will generate tokens for indentation and dedentation based on the indentation level.
@@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
if (!dedent || !indent || !ws) {
throw new Error('Some indentation/whitespace tokens not found!');
}
return [dedent, indent, ws, ...otherTokens];

const multiModeLexerDef: IMultiModeLexerDefinition = {
modes: {
[LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
[LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
},
defaultMode: LexingMode.REGULAR,
};

return multiModeLexerDef;
}

/**
@@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext

protected override buildTerminalToken(terminal: TerminalRule): TokenType {
const tokenType = super.buildTerminalToken(terminal);
const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;

if (tokenType.name === indentTokenName) {
return this.indentTokenType;
@@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
});
}

for (const [begin, end] of ignoreIndentationDelimeters) {
if (tokenType.name === begin) {
tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
} else if (tokenType.name === end) {
tokenType.POP_MODE = true;
}
}

return tokenType;
}
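
For readers unfamiliar with Chevrotain's lexer modes: the `PUSH_MODE`/`POP_MODE` properties assigned above are what make an opening delimiter switch the lexer into the `ignore-indentation` mode (whose token set simply omits `INDENT`/`DEDENT`) and the matching closing delimiter switch back. The following standalone sketch, with assumed token names and patterns, illustrates the underlying Chevrotain mechanism rather than anything added by this PR.

```ts
import { createToken, Lexer } from 'chevrotain';

const LParen = createToken({ name: 'L_PAREN', pattern: /\(/, push_mode: 'ignore-indentation' });
const RParen = createToken({ name: 'R_PAREN', pattern: /\)/, pop_mode: true });
const Id = createToken({ name: 'ID', pattern: /[a-zA-Z]+/ });
const Ws = createToken({ name: 'WS', pattern: /[ \t\r\n]+/, group: Lexer.SKIPPED });

const lexer = new Lexer({
    modes: {
        // In the real token builder, this mode additionally contains the INDENT/DEDENT token types.
        'indentation-sensitive': [LParen, RParen, Id, Ws],
        // Inside delimiters, only the "other" tokens and plain whitespace are matched.
        'ignore-indentation': [LParen, RParen, Id, Ws],
    },
    defaultMode: 'indentation-sensitive',
});

// L_PAREN pushes 'ignore-indentation' onto the mode stack and R_PAREN pops it again,
// so nested delimiters work naturally via the stack.
const result = lexer.tokenize('(foo (bar))');
console.log(result.tokens.map(t => t.tokenType.name));
```
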

112 changes: 102 additions & 10 deletions packages/langium/test/parser/indentation-aware.test.ts
@@ -5,7 +5,7 @@
******************************************************************************/

import type { TokenType } from '@chevrotain/types';
import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
import { beforeEach, describe, expect, test } from 'vitest';
import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
Expand All @@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder();

async function getTokens(grammarString: string): Promise<TokenType[]> {
const grammar = (await helper(grammarString)).parseResult.value;
return tokenBuilder.buildTokens(grammar) as TokenType[];
const { modes, defaultMode } = tokenBuilder.buildTokens(grammar);
return modes[defaultMode] as TokenType[];
}

async function getLexer(grammar: string): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar);
async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.Lexer;
}

async function getParser(grammar: string): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar);
async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.LangiumParser;
}

async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
const services = await createServicesForGrammar({
grammar,
module: {
parser: {
TokenBuilder: () => new IndentationAwareTokenBuilder(),
TokenBuilder: () => new IndentationAwareTokenBuilder(options),
Lexer: services => new IndentationAwareLexer(services)
}
} satisfies Module<LangiumServices, PartialLangiumServices>
@@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => {

expect(tokenTypes).toHaveLength(5);

const [dedent, indent, ws] = tokenTypes;
const [dedent, indent] = tokenTypes;
expect(dedent.name).toBe('DEDENT');
expect(indent.name).toBe('INDENT');
expect(ws.name).toBe('WS');
});

test('Modifies indent/dedent patterns to be functions', async () => {
@@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => {

});

describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {

const grammar = `
grammar PythonIfWithLists

entry Statement: (If | Return)*;

If:
'if' condition=BOOLEAN ':'
INDENT thenBlock+=Statement+ DEDENT
('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;

Return: 'return' value=Expression;

Expression: List | Tuple | BOOLEAN;

Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN;
List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET;

terminal BOOLEAN returns boolean: /true|false/;
terminal INDENT: 'synthetic:indent';
terminal DEDENT: 'synthetic:dedent';
terminal L_PAREN: '(';
terminal R_PAREN: ')';
terminal L_BRACKET: '[';
terminal R_BRACKET: ']';
hidden terminal NL: /[\\r\\n]+/;
hidden terminal WS: /[\\t ]+/;
hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
`;

const lexer = await getLexer(grammar, {
ignoreIndentationDelimeters: [
['L_PAREN', 'R_PAREN'],
['L_BRACKET', 'R_BRACKET'],
],
});

test('should behave as usual without the given tokens in the input', async () => {
const { errors } = lexer.tokenize(expandToString`
if true:
return false
else:
return true
`);
expect(errors).toHaveLength(0);
});

test('should ignore indentation inside the given delimiters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
false,
true, // including inconsistent indentation
true
]
return (true,
false
)
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

test('should handle nested delimiters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
[
false,
true
],
([true,
true],
false)
[
true
]
]
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

});

describe('IndentationAware parsing', () => {

const sampleGrammar = `