import copy
import random


# ---------------------------------------------------------------------------
# src/ultk/language/grammar.py -- module-level likelihood helper
# ---------------------------------------------------------------------------


def all_or_nothing(data, tree):
    """Return 1 if ``tree`` reproduces every observation in ``data``, else 0.

    Args:
        data: iterable of indexable ``(input, output)`` pairs.
        tree: callable that is applied to each input.

    Returns:
        The integer 1 when ``tree(input) == output`` holds for every pair
        (which includes the case of empty ``data``), otherwise the integer 0.
    """
    return 1 if all(tree(item[0]) == item[1] for item in data) else 0


# ---------------------------------------------------------------------------
# src/ultk/language/grammar.py -- methods of the expression-tree class.
# The class header lies outside the visible patch, so the methods are listed
# here at top level with ``self`` as their first parameter.
# ---------------------------------------------------------------------------


def replace_children(self, children) -> None:
    """Overwrite this node's children with ``children`` (stored as given)."""
    self.children = children


def _linearize(root):
    """Return ``(nodes, parents)`` for the tree rooted at ``root``.

    ``nodes`` lists every node of the tree and ``parents[i]`` is the parent
    node of ``nodes[i]`` (``None`` for the root).  Using node references
    instead of integer indices avoids a ``-1`` sentinel that would silently
    index the last list element.
    """
    nodes, parents = [], []
    stack = [(root, None)]
    while stack:
        node, parent = stack.pop()
        nodes.append(node)
        parents.append(parent)
        stack.extend((child, node) for child in (node.children or ()))
    return nodes, parents


# data: (input, output)
def hm_sample(
    self, grammar: "Grammar", data, likelihood_func=all_or_nothing
) -> "GrammaticalExpression":
    """Draw one accepted Metropolis-Hastings proposal starting from this tree.

    A proposal picks one node of a deep copy of this tree uniformly at random
    and replaces the subtree under it with ``grammar.generate(lhs)``, where
    ``lhs`` is the left-hand side of the rule that produced the picked node.
    Proposals are drawn until one is accepted; ``self`` is never modified.

    Args:
        grammar: grammar used to score trees (``prior``) and to generate
            replacement subtrees.
        data: observations handed unchanged to ``likelihood_func``.
        likelihood_func: callable ``(data, tree) -> number`` giving the
            likelihood of ``data`` under ``tree``.

    Returns:
        The first accepted proposal, a new tree.

    Note:
        The loop only ends on acceptance, so it does not terminate if no
        proposal ever receives a positive acceptance probability (for
        example when no generated tree has a non-zero likelihood).
    """
    old_node_count = self.node_count()
    # ``self`` is never mutated, so the weight of the current state is
    # loop-invariant.  It must be scored on the *unmodified* tree: scoring
    # the copy after it has been edited in place would compare the proposal
    # with itself.
    old_posterior = self.prior(grammar) * likelihood_func(data, self)

    while True:
        new_tree = copy.deepcopy(self)
        nodes, parents = _linearize(new_tree)
        target = random.randrange(len(nodes))
        current_node = nodes[target]
        parent_node = parents[target]
        old_subtree_prior = current_node.prior(grammar)

        new_node = grammar.generate(
            grammar._rules_by_name[current_node.rule_name].lhs
        )
        if parent_node is None:
            # The root was picked: the proposal is the regenerated tree.
            new_tree = new_node
        else:
            parent_node.replace_children(
                tuple(
                    new_node if child is current_node else child
                    for child in (parent_node.children or ())
                )
            )

        new_posterior = new_tree.prior(grammar) * likelihood_func(data, new_tree)
        new_node_count = new_tree.node_count()
        new_subtree_prior = new_node.prior(grammar)

        # Posterior ratio times the reverse/forward proposal ratio, where
        # q(new | old) = new_subtree_prior / old_node_count and
        # q(old | new) = old_subtree_prior / new_node_count.
        numerator = new_posterior * old_subtree_prior * old_node_count
        denominator = old_posterior * new_subtree_prior * new_node_count
        if denominator == 0:
            # The current state carries no posterior mass (or the proposal
            # has zero prior).  Move whenever the proposal has positive
            # mass; otherwise reject and draw again.
            mh_accept = 1.0 if numerator > 0 else 0.0
        else:
            mh_accept = min(1.0, numerator / denominator)

        if random.random() < mh_accept:
            return new_tree


def prior(self, grammar: "Grammar") -> float:
    """Return the probability of this tree under ``grammar``.

    This is the probability of the rule named ``self.rule_name`` multiplied
    by the priors of all child subtrees.
    """
    probability = grammar.probability(grammar._rules_by_name[self.rule_name])
    for child in self.children or ():
        probability = probability * child.prior(grammar)
    return probability


def node_count(self) -> int:
    """Return the number of nodes in this tree, counting this node itself."""
    return 1 + sum(child.node_count() for child in (self.children or ()))


# ---------------------------------------------------------------------------
# src/ultk/language/grammar.py -- method of Grammar
# ---------------------------------------------------------------------------


def probability(self, rule: "Rule") -> float:
    """Return ``rule``'s weight normalised over all rules sharing its LHS.

    Raises:
        ZeroDivisionError: if the weights of the rules for ``rule.lhs`` sum
            to zero.
    """
    total_weight = sum(r.weight for r in self._rules[rule.lhs])
    return float(rule.weight) / total_weight


# ---------------------------------------------------------------------------
# src/ultk/language/semantics.py -- method of the class that owns ``mapping``
# ---------------------------------------------------------------------------


def __str__(self):
    """Render the mapping as ``Mapping:`` followed by one ``ref: value`` per line."""
    # Join outside the f-string: backslashes are not allowed inside f-string
    # expressions before Python 3.12.
    entries = "\n".join(f"{ref}: {self.mapping[ref]}" for ref in self.mapping)
    return f"Mapping:\n\t{entries}"


# ---------------------------------------------------------------------------
# src/examples/indefinites/grammar.py -- example script
# ---------------------------------------------------------------------------


def main() -> None:
    """Build the indefinites universe and grammar, then print one MH sample.

    NOTE(review): if other example modules import ``universe`` or
    ``indefinites_grammar`` from this script, keep those two assignments at
    module level and leave only the sampling/print below behind the guard.
    """
    import pandas as pd
    from ultk.language.grammar import Grammar
    from ultk.language.semantics import Universe

    referents = pd.read_csv("indefinites/referents.csv")
    prior = pd.read_csv("indefinites/data/Beekhuizen_priors.csv")
    # ``assert`` is stripped under ``python -O``; validate explicitly.
    if not (referents["name"] == prior["name"]).all():
        raise ValueError("referents and priors must list the same names in the same order")
    referents["probability"] = prior["probability"]
    universe = Universe.from_dataframe(referents)

    # Alternative loader: Grammar.from_module("indefinites.grammar_functions")
    indefinites_grammar = Grammar.from_yaml("indefinites/grammar.yml")

    expression = indefinites_grammar.parse("and(not(K+), or(N-, not(SE-)))")
    print(expression.hm_sample(indefinites_grammar, [(universe.referents[2], True)]))


if __name__ == "__main__":
    main()