Commit cad6344

Generalise re_replacement_seq to deal with special symbols
This PR is similar to #90 and generalises the regex to deal with all the previous cases, and hopefully all future ones as well. The new special cases not covered by the previous approach are the `�?` and `�,` tokens, used by Salamandra models. Since all these special tokens (new and old) consist of one or more � symbols, with an optional single-character prefix and/or suffix, we can simplify and generalise the pattern to `r"^.?�+.?$"`.
1 parent 78eb908 commit cad6344
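A quick sketch of what the generalised pattern accepts (the token list below is drawn from the models named in the commit message; it is illustrative, not part of the diff):

```python
import re

# Generalised pattern from this commit: one or more replacement
# characters (�), optionally wrapped by a single-character prefix
# and/or suffix.
re_replacement_seq = re.compile(r"^.?�+.?$")

# Special tokens of the kind described above all match:
tokens = ["�", "▁�", ".�", "�?", "�,", "�s"]
print(all(re_replacement_seq.fullmatch(t) for t in tokens))  # True

# An ordinary token does not:
print(re_replacement_seq.fullmatch("hello") is None)  # True
```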

File tree

2 files changed: +12 -10 lines changed


python/outlines_core/fsm/regex.py

Lines changed: 5 additions & 5 deletions
@@ -342,11 +342,11 @@ def make_deterministic_fsm(fsm: FSM) -> Tuple[BetterFSM, Dict[int, int]]:
 
 re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$")
 
-# The "▁*" prefix is required to handle Gemma and GPT-SW3 tokenizers.
-# The "\.*" suffix is required to handle the NorwAI tokenizer.
-# The "\.*" prefix is required to handle the Salamandra tokenizer.
-# The "s*$" suffix is required to handle the OpenCoder tokenizer.
-re_replacement_seq = re.compile(r"^▁*\.*�+\.*s*$")
+# The ".?" prefix and suffix is to handle special cases in some model vocabularies. This
+# includes Gemma models (which use "▁�" as a token), NorwAI models (which use ".�" as a
+# token), Salamandra models (which use ".�" and "�?" as tokens) and OpenCoder models
+# (which use "�s" as a token).
+re_replacement_seq = re.compile(r"^.?�+.?$")
 
 
 # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode

tests/fsm/test_regex.py

Lines changed: 7 additions & 5 deletions
@@ -542,12 +542,14 @@ def convert_token_to_string(self, token):
         "�",
         "��",
         "�.",
-        "�..",
+        ".�",
+        ".�.",
         "▁�",
-        "▁▁�",
-        "▁�.",
-        "▁�.",
-        "▁▁�..",
+        "�▁",
+        "▁�▁",
+        "?�",
+        "�?",
+        "?�?",
     ],
 )
 def test_reduced_vocabulary_with_rare_tokens(rare_token):
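As a standalone sanity check (not part of the test file), every rare token in the updated parametrisation matches the generalised pattern from `regex.py`:

```python
import re

# Same pattern as re_replacement_seq in python/outlines_core/fsm/regex.py
re_replacement_seq = re.compile(r"^.?�+.?$")

# The full rare-token list from the updated parametrisation above.
rare_tokens = ["�", "��", "�.", ".�", ".�.", "▁�", "�▁", "▁�▁", "?�", "�?", "?�?"]

# Each consists of one or more � with at most one prefix/suffix character,
# so each should be recognised as a replacement sequence.
assert all(re_replacement_seq.fullmatch(t) for t in rare_tokens)
```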
