experiments
angeloprobst committed Nov 4, 2024
1 parent 499f1dc commit cba9d1f
Showing 1 changed file with 51 additions and 62 deletions.
113 changes: 51 additions & 62 deletions hyperon_das/tokenizer.py
@@ -14,14 +14,12 @@ def to_tokens(self) -> list[str]:

@staticmethod
def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Node"]:
-        match tokens[cursor]:
-            case "NODE":
-                cursor += 1  # Skip the "NODE" token
-                node = Node(type=tokens[cursor], name=tokens[cursor + 1])
-                cursor += 2  # Skip the type and name tokens
-                return cursor, node
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] == "NODE":
+            cursor += 1  # Skip the "NODE" token
+            node = Node(type=tokens[cursor], name=tokens[cursor + 1])
+            cursor += 2  # Skip the type and name tokens
+            return cursor, node
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")


@dataclasses.dataclass
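Note: the token layout Node.from_tokens expects is a flat prefix encoding, a "NODE" tag followed by the type and name tokens, with the updated cursor returned so callers can keep parsing from where it stopped. A minimal usage sketch (the "Symbol"/"human" values are hypothetical, not from this commit):

tokens = ["NODE", "Symbol", "human"]  # hypothetical example stream
cursor, node = Node.from_tokens(tokens)
assert cursor == 3  # tag, type, and name tokens all consumed
assert (node.type, node.name) == ("Symbol", "human")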
@@ -33,14 +31,12 @@ def to_tokens(self) -> list[str]:

@staticmethod
def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Variable"]:
-        match tokens[cursor]:
-            case "VARIABLE":
-                cursor += 1  # Skip the "VARIABLE" token
-                variable = Variable(name=tokens[cursor])
-                cursor += 1  # Skip the name token
-                return cursor, variable
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] == "VARIABLE":
+            cursor += 1  # Skip the "VARIABLE" token
+            variable = Variable(name=tokens[cursor])
+            cursor += 1  # Skip the name token
+            return cursor, variable
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")


@dataclasses.dataclass
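Note: variables follow the same cursor convention but carry only a name. A hedged sketch, with the token layout inferred from the parser above ("v1" is a hypothetical name):

tokens = ["VARIABLE", "v1"]
cursor, variable = Variable.from_tokens(tokens)
assert cursor == 2  # the "VARIABLE" tag plus the name token
assert variable.name == "v1"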
@@ -59,42 +55,38 @@ def to_tokens(self) -> list[str]:

@staticmethod
def from_tokens(tokens: list[str], cursor: int = 0) -> tuple[int, "Link"]:
-        match tokens[cursor]:
-            case "LINK" | "LINK_TEMPLATE":
-                link_tag = tokens[cursor]
-                cursor += 1  # Skip the "LINK" or "LINK_TEMPLATE" token
-                link = Link(type=tokens[cursor])
-                cursor += 1  # Skip the type token
-                target_count = int(tokens[cursor])
-                cursor += 1  # Skip the target count token
-                for _ in range(target_count):
-                    match tokens[cursor]:
-                        case "NODE":
-                            cursor, target = Node.from_tokens(tokens, cursor)
-                        case "VARIABLE":
-                            link.is_template = True
-                            cursor, target = Variable.from_tokens(tokens, cursor)
-                        case "LINK" | "LINK_TEMPLATE":
-                            cursor, target = Link.from_tokens(tokens, cursor)
-                        case _:
-                            raise ValueError(f"Unsupported token: {tokens[cursor:]}")
-                    link.targets.append(target)
-
-                if link_tag == "LINK_TEMPLATE" and not link.is_template:
-                    raise ValueError("Template link without variables")
-                elif link_tag == "LINK" and link.is_template:
-                    raise ValueError("Non-template link with variables")
-
-                return cursor, link
-            case _:
-                raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+        if tokens[cursor] in {"LINK", "LINK_TEMPLATE"}:
+            link_tag = tokens[cursor]
+            cursor += 1  # Skip the "LINK" or "LINK_TEMPLATE" token
+            link = Link(type=tokens[cursor])
+            cursor += 1  # Skip the type token
+            target_count = int(tokens[cursor])
+            cursor += 1  # Skip the target count token
+            for _ in range(target_count):
+                match tokens[cursor]:
+                    case "NODE":
+                        cursor, target = Node.from_tokens(tokens, cursor)
+                    case "VARIABLE":
+                        link.is_template = True
+                        cursor, target = Variable.from_tokens(tokens, cursor)
+                    case "LINK" | "LINK_TEMPLATE":
+                        cursor, target = Link.from_tokens(tokens, cursor)
+                    case _:
+                        raise ValueError(f"Unsupported token: {tokens[cursor:]}")
+                link.targets.append(target)
+            if link_tag == "LINK_TEMPLATE" and not link.is_template:
+                raise ValueError("Template link without variables")
+            elif link_tag == "LINK" and link.is_template:
+                raise ValueError("Non-template link with variables")
+            return cursor, link
+        raise ValueError(f"Unsupported sequence of tokens: {tokens[cursor:]}")


class DictQueryTokenizer:
Query: TypeAlias = dict[str, Any]

@staticmethod
-    def tokenize(_query: Query) -> str:
+    def tokenize(query: Query) -> str:
def _tokenize(
_query: DictQueryTokenizer.Query, _parent: Link | None = None
) -> Link | Node | Variable:
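Note: Link.from_tokens reads a counted list of targets, recursing for nested links, then checks that the "LINK"/"LINK_TEMPLATE" tag agrees with whether any variable was encountered. A sketch under the token layouts above (the "Expression" type and target values are hypothetical):

tokens = "LINK_TEMPLATE Expression 2 NODE Symbol Test VARIABLE v1".split()
cursor, link = Link.from_tokens(tokens)
assert cursor == 8 and link.is_template and len(link.targets) == 2
# A "LINK" tag over these same targets would instead raise
# ValueError("Non-template link with variables").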
@@ -111,27 +103,24 @@ def _tokenize(
case _:
raise ValueError(f"Unsupported query: {_query}")

-        return TOKENS_DELIMITER.join(_tokenize(_query).to_tokens())
+        return TOKENS_DELIMITER.join(_tokenize(query).to_tokens())

@staticmethod
def untokenize(tokens: str) -> Query:
-        _, link = Link.from_tokens(tokens.split())
-
def _untokenize(_atom: Link | Node | Variable) -> DictQueryTokenizer.Query:
-            match _atom:
-                case Link(type, targets):
-                    return {
-                        "atom_type": "link",
-                        "type": type,
-                        "targets": [_untokenize(target) for target in targets],
-                    }
-                case Node(type, name):
-                    return {"atom_type": "node", "type": type, "name": name}
-                case Variable(name):
-                    return {"atom_type": "variable", "name": name}
-                case _:
-                    raise ValueError(f"Unsupported element: {_atom}")
+            if isinstance(_atom, Link):
+                return {
+                    "atom_type": "link",
+                    "type": _atom.type,
+                    "targets": [_untokenize(target) for target in _atom.targets],
+                }
+            elif isinstance(_atom, Node):
+                return {"atom_type": "node", "type": _atom.type, "name": _atom.name}
+            elif isinstance(_atom, Variable):
+                return {"atom_type": "variable", "name": _atom.name}
+            raise ValueError(f"Unsupported element: {_atom}")

+        _, link = Link.from_tokens(tokens.split())
return _untokenize(link)
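Note: together, tokenize and untokenize should round-trip a dict query through the flat token string. An end-to-end sketch, assuming tokenize accepts the same dict shape untokenize emits (the collapsed case arms suggest this) and that TOKENS_DELIMITER is whitespace, since untokenize splits on whitespace; the validation in Link.from_tokens implies a link containing variables must be tagged "LINK_TEMPLATE":

query = {
    "atom_type": "link",
    "type": "Expression",  # hypothetical link type
    "targets": [
        {"atom_type": "node", "type": "Symbol", "name": "Test"},
        {"atom_type": "variable", "name": "v1"},
    ],
}
tokens = DictQueryTokenizer.tokenize(query)
# e.g. "LINK_TEMPLATE Expression 2 NODE Symbol Test VARIABLE v1"
assert DictQueryTokenizer.untokenize(tokens) == query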


