Added vose polyfill, making the vose dependency optional

Theomat · Theomat · commit 9178d4d4bc94 · 2024-08-19T15:04:55.000+02:00
diff --git a/synth/generation/sampler.py b/synth/generation/sampler.py
@@ -14,7 +14,7 @@
 import copy
 
 import numpy as np
-import vose
+from synth.utils.vose_polyfill import Sampler as VoseSampler
 
 from synth.syntax.type_system import List, Type
 
@@ -53,7 +53,7 @@ def __init__(
             filled_probabilities = probabilites
         else:
             filled_probabilities = [1 / len(self.lexicon) for _ in lexicon]
-        self.sampler = vose.Sampler(np.asarray(filled_probabilities), seed=seed)
+        self.sampler = VoseSampler(np.asarray(filled_probabilities), seed=seed)
 
     def sample(self, **kwargs: Any) -> U:
         index: int = self.sampler.sample()
@@ -104,7 +104,7 @@ def __init__(
             if not isinstance(probabilities[0], tuple):
                 correct_prob = [(i + 1, p) for i, p in enumerate(probabilities)]  # type: ignore
             self._length_mapping = [n for n, _ in correct_prob]
-            self.sampler = vose.Sampler(
+            self.sampler = VoseSampler(
                 np.array([p for _, p in correct_prob]), seed=seed
             )
 
diff --git a/synth/syntax/grammars/tagged_det_grammar.py b/synth/syntax/grammars/tagged_det_grammar.py
@@ -14,7 +14,7 @@
 )
 
 import numpy as np
-import vose
+from synth.utils.vose_polyfill import Sampler as VoseSampler
 
 if TYPE_CHECKING:
     from synth.syntax.grammars.cfg import CFG
@@ -165,7 +165,7 @@ def init_sampling(self, seed: Optional[int] = None) -> None:
 
         for i, S in enumerate(self.tags):
             P_list = list(self.tags[S].keys())
-            self.vose_samplers[S] = vose.Sampler(
+            self.vose_samplers[S] = VoseSampler(
                 np.array(
                     [self.tags[S][P] for P in P_list],
                     dtype=float,
diff --git a/synth/syntax/grammars/tagged_u_grammar.py b/synth/syntax/grammars/tagged_u_grammar.py
@@ -11,7 +11,7 @@
 )
 
 import numpy as np
-import vose
+from synth.utils.vose_polyfill import Sampler as VoseSampler
 
 from synth.syntax.grammars.det_grammar import DerivableProgram
 from synth.syntax.grammars.u_grammar import UGrammar
@@ -183,7 +183,7 @@ def init_sampling(self, seed: Optional[int] = None) -> None:
 
         for i, S in enumerate(self.tags):
             P_list = list(self.tags[S].keys())
-            self.vose_samplers[S] = vose.Sampler(
+            self.vose_samplers[S] = VoseSampler(
                 np.array(
                     [sum(p for p in self.tags[S][P].values()) for P in P_list],
                     dtype=float,
@@ -192,7 +192,7 @@ def init_sampling(self, seed: Optional[int] = None) -> None:
             )
             self._vose_samplers_2[S] = {}
             for P in P_list:
-                self._vose_samplers_2[S][P] = vose.Sampler(
+                self._vose_samplers_2[S][P] = VoseSampler(
                     np.array(
                         [p for p in self.tags[S][P].values()],
                         dtype=float,
@@ -202,7 +202,7 @@ def init_sampling(self, seed: Optional[int] = None) -> None:
                 )
             self.sampling_map[S] = P_list
         self._int2start = list(self.starts)
-        self._start_sampler = vose.Sampler(
+        self._start_sampler = VoseSampler(
             np.array(
                 [v for v in self.start_tags.values()],
                 dtype=float,
diff --git a/synth/utils/vose_polyfill.py b/synth/utils/vose_polyfill.py
@@ -0,0 +1,107 @@
+"""Optional-dependency shim for the ``vose`` alias-method sampler.
+
+``Sampler`` is ``vose.Sampler`` when the ``vose`` package can be imported, and the
+pure-Python ``PythonSampler`` defined below otherwise.
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+
+
+class PythonSampler:
+    """Sample indices ``0 .. n - 1`` with Vose's alias method.
+
+    Index ``i`` is drawn with probability ``weights[i] / sum(weights)``.
+    """
+
+    def __init__(self, weights: np.ndarray, seed: Optional[int] = None) -> None:
+        """Build the alias and probability tables.
+
+        Parameters:
+            weights: Non-negative weights, one per index. They do not have to sum to 1 and the
+                caller's array is left untouched.
+            seed: Seed of the random generator. ``None`` falls back to the fixed seed 1.
+
+        Raises:
+            ValueError: If ``weights`` is empty.
+        """
+        # Test against None explicitly: ``seed or 1`` would silently turn seed 0 into seed 1.
+        self.rng = np.random.default_rng(1 if seed is None else seed)
+        n = len(weights)
+        if n == 0:
+            raise ValueError("weights must contain at least one element")
+        # Work on a float copy: building the tables overwrites entries, and integer weights
+        # would otherwise be truncated.
+        scaled = np.array(weights, dtype=float)
+        total = scaled.sum()
+        if total > 0:
+            # Rescale so that the average weight is 1.0, i.e. a full column of the table.
+            scaled *= n / total
+        else:
+            # Degenerate input such as all zeros: every index becomes equally likely.
+            scaled[:] = 1.0
+        alias = np.zeros(n, dtype=int)
+        # Columns that never get paired keep probability 1.0 and thus never use their alias.
+        proba = np.ones(n, dtype=float)
+        # Two stacks acting as worklists: indices below / at or above the average weight.
+        small: List[int] = []
+        large: List[int] = []
+        for i in range(n):
+            if scaled[i] >= 1.0:
+                large.append(i)
+            else:
+                small.append(i)
+        # Mathematically the small stack is always exhausted first, but floating point
+        # inaccuracies can break that, hence the check on both stacks.
+        while small and large:
+            less = small.pop()
+            more = large.pop()
+            proba[less] = scaled[less]
+            alias[less] = more
+            # The large entry fills the rest of the column of the small one.
+            scaled[more] = scaled[more] + scaled[less] - 1.0
+            if scaled[more] >= 1.0:
+                large.append(more)
+            else:
+                small.append(more)
+        self.n = n
+        self.alias = alias
+        self.proba = proba
+
+    def sample_1(self) -> int:
+        """Draw a single index."""
+        # Fair die roll to determine which column to inspect.
+        col = int(self.rng.integers(self.n))
+        # Biased coin toss: keep the column with probability proba[col], else use its alias.
+        if self.rng.random() < self.proba[col]:
+            return col
+        return int(self.alias[col])
+
+    def sample(
+        self, k: int = 1, values: Optional[np.ndarray] = None
+    ) -> Union[int, np.ndarray]:
+        """Sample a random integer or a value from a given array.
+
+        Parameters:
+            k: The number of integers to sample. If `k = 1`, then a single int (or an element of
+                `values` if it is not None) is returned. In any other case, a numpy array is
+                returned.
+            values: The numpy array of values from which to sample from.
+
+        """
+        if k == 1:
+            index = self.sample_1()
+            return index if values is None else values[index]  # type: ignore
+        indices = np.asarray([self.sample_1() for _ in range(k)], dtype=int)
+        return indices if values is None else np.asarray(values)[indices]
+
+
+# Use the ``vose`` package when it is installed, otherwise the pure-Python fallback above, so
+# that the dependency stays optional.
+try:
+    import vose
+
+    Sampler = vose.Sampler
+except ImportError:
+    Sampler = PythonSampler