From cba9d1f2ff798bc039e88e220b765c302be6cf62 Mon Sep 17 00:00:00 2001
From: Angelo Probst
Date: Mon, 4 Nov 2024 16:52:12 -0300
Subject: [PATCH] experiments

---
 hyperon_das/tokenizer.py | 113 ++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 62 deletions(-)

diff --git a/hyperon_das/tokenizer.py b/hyperon_das/tokenizer.py
index 88ea3b13..4e3cc74e 100644
--- a/hyperon_das/tokenizer.py
+++ b/hyperon_das/tokenizer.py
@@ -14,14 +14,12 @@ def to_tokens(self) -> list[str]:
 
     @staticmethod
     def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Node"]:
-        match tokens[cursor]:
-            case "NODE":
-                cursor += 1  # Skip the "NODE" token
-                node = Node(type=tokens[cursor], name=tokens[cursor + 1])
-                cursor += 2  # Skip the type and name tokens
-                return cursor, node
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] == "NODE":
+            cursor += 1  # Skip the "NODE" token
+            node = Node(type=tokens[cursor], name=tokens[cursor + 1])
+            cursor += 2  # Skip the type and name tokens
+            return cursor, node
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")
 
 
 @dataclasses.dataclass
@@ -33,14 +31,12 @@ def to_tokens(self) -> list[str]:
 
     @staticmethod
     def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Variable"]:
-        match tokens[cursor]:
-            case "VARIABLE":
-                cursor += 1  # Skip the "VARIABLE" token
-                variable = Variable(name=tokens[cursor])
-                cursor += 1  # Skip the name token
-                return cursor, variable
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] == "VARIABLE":
+            cursor += 1  # Skip the "VARIABLE" token
+            variable = Variable(name=tokens[cursor])
+            cursor += 1  # Skip the name token
+            return cursor, variable
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")
 
 
 @dataclasses.dataclass
@@ -59,42 +55,38 @@ def to_tokens(self) -> list[str]:
 
     @staticmethod
     def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Link"]:
-        match tokens[cursor]:
-            case "LINK" | "LINK_TEMPLATE":
-                link_tag = tokens[cursor]
-                cursor += 1  # Skip the "LINK" or "LINK_TEMPLATE" token
-                link = Link(type=tokens[cursor])
-                cursor += 1  # Skip the type token
-                target_count = int(tokens[cursor])
-                cursor += 1  # Skip the target count token
-                for _ in range(target_count):
-                    match tokens[cursor]:
-                        case "NODE":
-                            cursor, target = Node.from_tokens(tokens, cursor)
-                        case "VARIABLE":
-                            link.is_template = True
-                            cursor, target = Variable.from_tokens(tokens, cursor)
-                        case "LINK" | "LINK_TEMPLATE":
-                            cursor, target = Link.from_tokens(tokens, cursor)
-                        case _:
-                            raise ValueError(f"Unsupported token: {tokens[cursor:]}")
-                    link.targets.append(target)
-
-                if link_tag == "LINK_TEMPLATE" and not link.is_template:
-                    raise ValueError("Template link without variables")
-                elif link_tag == "LINK" and link.is_template:
-                    raise ValueError("Non-template link with variables")
-
-                return cursor, link
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] in {"LINK", "LINK_TEMPLATE"}:
+            link_tag = tokens[cursor]
+            cursor += 1  # Skip the "LINK" or "LINK_TEMPLATE" token
+            link = Link(type=tokens[cursor])
+            cursor += 1  # Skip the type token
+            target_count = int(tokens[cursor])
+            cursor += 1  # Skip the target count token
+            for _ in range(target_count):
+                match tokens[cursor]:
+                    case "NODE":
+                        cursor, target = Node.from_tokens(tokens, cursor)
+                    case "VARIABLE":
+                        link.is_template = True
+                        cursor, target = Variable.from_tokens(tokens, cursor)
+                    case "LINK" | "LINK_TEMPLATE":
+                        cursor, target = Link.from_tokens(tokens, cursor)
+                    case _:
+                        raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+                link.targets.append(target)
+            if link_tag == "LINK_TEMPLATE" and not link.is_template:
+                raise ValueError("Template link without variables")
+            elif link_tag == "LINK" and link.is_template:
+                raise ValueError("Non-template link with variables")
+            return cursor, link
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")
 
 
 class DictQueryTokenizer:
     Query: TypeAlias = dict[str, Any]
 
     @staticmethod
-    def tokenize(_query: Query) -> str:
+    def tokenize(query: Query) -> str:
         def _tokenize(
             _query: DictQueryTokenizer.Query, _parent: Link | None = None
         ) -> Link | Node | Variable:
@@ -111,27 +103,24 @@ def _tokenize(
             case _:
                 raise ValueError(f"Unsupported query: {_query}")
 
-        return TOKENS_DELIMITER.join(_tokenize(_query).to_tokens())
+        return TOKENS_DELIMITER.join(_tokenize(query).to_tokens())
 
     @staticmethod
     def untokenize(tokens: str) -> Query:
-        _, link = Link.from_tokens(tokens.split())
-
         def _untokenize(_atom: Link | Node | Variable) -> DictQueryTokenizer.Query:
-            match _atom:
-                case Link(type, targets):
-                    return {
-                        "atom_type": "link",
-                        "type": type,
-                        "targets": [_untokenize(target) for target in targets],
-                    }
-                case Node(type, name):
-                    return {"atom_type": "node", "type": type, "name": name}
-                case Variable(name):
-                    return {"atom_type": "variable", "name": name}
-                case _:
-                    raise ValueError(f"Unsupported element: {_atom}")
+            if isinstance(_atom, Link):
+                return {
+                    "atom_type": "link",
+                    "type": _atom.type,
+                    "targets": [_untokenize(target) for target in _atom.targets],
+                }
+            elif isinstance(_atom, Node):
+                return {"atom_type": "node", "type": _atom.type, "name": _atom.name}
+            elif isinstance(_atom, Variable):
+                return {"atom_type": "variable", "name": _atom.name}
+            raise ValueError(f"Unsupported element: {_atom}")
 
+        _, link = Link.from_tokens(tokens.split())
         return _untokenize(link)
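
A minimal round-trip sketch of the refactored tokenize/untokenize pair, not
part of the diff. It assumes TOKENS_DELIMITER is a single space, that
tokenize accepts the same dict shape _untokenize produces, and that
Link.to_tokens mirrors the LINK/LINK_TEMPLATE validation seen in
Link.from_tokens; the "Similarity" query below is hypothetical:

    from hyperon_das.tokenizer import DictQueryTokenizer

    # Hypothetical query for illustration; its shape mirrors what
    # _untokenize emits for a link with one node and one variable target.
    query = {
        "atom_type": "link",
        "type": "Similarity",
        "targets": [
            {"atom_type": "node", "type": "Concept", "name": "human"},
            {"atom_type": "variable", "name": "v1"},
        ],
    }

    tokens = DictQueryTokenizer.tokenize(query)
    # With a whitespace delimiter, and assuming the VARIABLE target makes
    # Link.to_tokens emit the LINK_TEMPLATE tag, this should yield:
    # "LINK_TEMPLATE Similarity 2 NODE Concept human VARIABLE v1"
    assert DictQueryTokenizer.untokenize(tokens) == query  # round trip holds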