Support ignoring indentation within delimiters #1608

Merged (6 commits) on Aug 21, 2024
Changes from 1 commit
39 changes: 36 additions & 3 deletions packages/langium/src/parser/indentation-aware.ts
@@ -4,7 +4,7 @@
* terms of the MIT License, which is available in the project root.
******************************************************************************/

import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
import type { TokenBuilderOptions } from './token-builder.js';
import type { LexerResult } from './lexer.js';
@@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
* @default 'WS'
*/
whitespaceTokenName: TokenName;
/**
* The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
* For example, Python doesn't treat any whitespace between `'('` and `')'` as significant.
*
* Note that this works only with terminal tokens, not keyword tokens,
* so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.
*
* @default []
*/
ignoreIndentationDelimeters: Array<[begin: TokenName, end: TokenName]>
}

export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
indentTokenName: 'INDENT',
dedentTokenName: 'DEDENT',
whitespaceTokenName: 'WS',
ignoreIndentationDelimeters: [],
};

export enum LexingMode {
REGULAR = 'indentation-sensitive',
IGNORE_INDENTATION = 'ignore-indentation',
}
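
As a usage sketch (not part of this diff, and mirroring the setup used in the tests further down): a grammar author passes the delimiter pairs when binding the token builder, and each pair must name terminal rules such as `L_PAREN`/`R_PAREN` rather than keywords. `myGrammarString` below is a hypothetical placeholder for a grammar that defines those terminals plus the synthetic `INDENT`/`DEDENT` and hidden `WS` terminals.

```ts
import { IndentationAwareTokenBuilder, IndentationAwareLexer } from 'langium';
import { createServicesForGrammar } from 'langium/grammar';

// Hypothetical grammar source; see the test grammar below for a complete example.
declare const myGrammarString: string;

async function createServices() {
    return createServicesForGrammar({
        grammar: myGrammarString,
        module: {
            parser: {
                // Indentation is ignored between each begin/end terminal pair listed here.
                TokenBuilder: () => new IndentationAwareTokenBuilder({
                    ignoreIndentationDelimeters: [
                        ['L_PAREN', 'R_PAREN'],
                        ['L_BRACKET', 'R_BRACKET'],
                    ],
                }),
                Lexer: services => new IndentationAwareLexer(services),
            },
        },
    });
}
```
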

/**
* A token builder that is sensitive to indentation in the input text.
* It will generate tokens for indentation and dedentation based on the indentation level.
@@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
if (!dedent || !indent || !ws) {
throw new Error('Some indentation/whitespace tokens not found!');
}
return [dedent, indent, ws, ...otherTokens];

const multiModeLexerDef: IMultiModeLexerDefinition = {
modes: {
[LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
[LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
},
defaultMode: LexingMode.REGULAR,
};

return multiModeLexerDef;
}

/**
@@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext

protected override buildTerminalToken(terminal: TerminalRule): TokenType {
const tokenType = super.buildTerminalToken(terminal);
const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;

if (tokenType.name === indentTokenName) {
return this.indentTokenType;
@@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
});
}

for (const [begin, end] of ignoreIndentationDelimeters) {
if (tokenType.name === begin) {
tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
} else if (tokenType.name === end) {
tokenType.POP_MODE = true;
}
}

return tokenType;
}
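
For readers unfamiliar with Chevrotain's lexer modes: the `PUSH_MODE`/`POP_MODE` properties assigned above are what make an opening delimiter switch the lexer into the `ignore-indentation` mode (whose token set simply omits `INDENT`/`DEDENT`) and the matching closing delimiter switch back. The following standalone sketch, with assumed token names and patterns, illustrates the underlying Chevrotain mechanism rather than anything added by this PR.

```ts
import { createToken, Lexer } from 'chevrotain';

const LParen = createToken({ name: 'L_PAREN', pattern: /\(/, push_mode: 'ignore-indentation' });
const RParen = createToken({ name: 'R_PAREN', pattern: /\)/, pop_mode: true });
const Id = createToken({ name: 'ID', pattern: /[a-zA-Z]+/ });
const Ws = createToken({ name: 'WS', pattern: /[ \t\r\n]+/, group: Lexer.SKIPPED });

const lexer = new Lexer({
    modes: {
        // In the real token builder, this mode additionally contains the INDENT/DEDENT token types.
        'indentation-sensitive': [LParen, RParen, Id, Ws],
        // Inside delimiters, only the "other" tokens and plain whitespace are matched.
        'ignore-indentation': [LParen, RParen, Id, Ws],
    },
    defaultMode: 'indentation-sensitive',
});

// L_PAREN pushes 'ignore-indentation' onto the mode stack and R_PAREN pops it again,
// so nested delimiters work naturally via the stack.
const result = lexer.tokenize('(foo (bar))');
console.log(result.tokens.map(t => t.tokenType.name));
```
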

112 changes: 102 additions & 10 deletions packages/langium/test/parser/indentation-aware.test.ts
@@ -5,7 +5,7 @@
******************************************************************************/

import type { TokenType } from '@chevrotain/types';
import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
import { beforeEach, describe, expect, test } from 'vitest';
import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
Expand All @@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder();

async function getTokens(grammarString: string): Promise<TokenType[]> {
const grammar = (await helper(grammarString)).parseResult.value;
return tokenBuilder.buildTokens(grammar) as TokenType[];
const { modes, defaultMode } = tokenBuilder.buildTokens(grammar);
return modes[defaultMode] as TokenType[];
}

async function getLexer(grammar: string): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar);
async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.Lexer;
}

async function getParser(grammar: string): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar);
async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.LangiumParser;
}

async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
const services = await createServicesForGrammar({
grammar,
module: {
parser: {
TokenBuilder: () => new IndentationAwareTokenBuilder(),
TokenBuilder: () => new IndentationAwareTokenBuilder(options),
Lexer: services => new IndentationAwareLexer(services)
}
} satisfies Module<LangiumServices, PartialLangiumServices>
@@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => {

expect(tokenTypes).toHaveLength(5);

const [dedent, indent, ws] = tokenTypes;
const [dedent, indent] = tokenTypes;
expect(dedent.name).toBe('DEDENT');
expect(indent.name).toBe('INDENT');
expect(ws.name).toBe('WS');
});

test('Modifies indent/dedent patterns to be functions', async () => {
@@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => {

});

describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {

const grammar = `
grammar PythonIfWithLists

entry Statement: (If | Return)*;

If:
'if' condition=BOOLEAN ':'
INDENT thenBlock+=Statement+ DEDENT
('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;

Return: 'return' value=Expression;

Expression: List | Tuple | BOOLEAN;

Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN;
List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET;

terminal BOOLEAN returns boolean: /true|false/;
terminal INDENT: 'synthetic:indent';
terminal DEDENT: 'synthetic:dedent';
terminal L_PAREN: '(';
terminal R_PAREN: ')';
terminal L_BRACKET: '[';
terminal R_BRACKET: ']';
hidden terminal NL: /[\\r\\n]+/;
hidden terminal WS: /[\\t ]+/;
hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
`;

const lexer = await getLexer(grammar, {
ignoreIndentationDelimeters: [
['L_PAREN', 'R_PAREN'],
['L_BRACKET', 'R_BRACKET'],
],
});

test('should behave as usual without the given tokens in the input', async () => {
const { errors } = lexer.tokenize(expandToString`
if true:
return false
else:
return true
`);
expect(errors).toHaveLength(0);
});

test('should ignore indentation inside the given delimiters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
false,
true, // including inconsistent indentation
true
]
return (true,
false
)
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

test('should handle nested delimiters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
[
false,
true
],
([true,
true],
false)
[
true
]
]
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

});

describe('IndentationAware parsing', () => {

const sampleGrammar = `