
Commit a9c570e

Hot fixes (#8)
* improve docker build process
* fix streaming issue
* bug fixes
1 parent 74f8c65 commit a9c570e

File tree

17 files changed: +440, -69 lines

docker/Dockerfile.aarch64-cuda

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ COPY . /scratchpad
 
 RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/flashinfer-0.1.6-cp310-cp310-linux_aarch64.whl
 RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/triteia-0.1.0-cp310-cp310-linux_aarch64.whl
-RUN pip install -r requirements-extra.txt
+RUN pip install -r meta/requirements-extra.txt
 RUN pip install .
 # todo(xiaozhe): figure out why pynvml is installed in the first place. We should use nvidia-ml-py instead.
 RUN pip uninstall pynvml -y

docker/Dockerfile.x86_64-cuda

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:24.09-py3 AS base
+FROM nvcr.io/nvidia/pytorch:24.07-py3 AS base
 
 LABEL org.opencontainers.image.source=https://github.com/xiaozheyao/Scratchpad
 LABEL org.opencontainers.image.description="Scratchpad: Adaptive Serving of LMs"
@@ -15,8 +15,8 @@ WORKDIR /scratchpad
 
 COPY . /scratchpad
 
-RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/flashinfer-0.1.6-cp310-cp310-linux_x86_64.whl
+RUN pip install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/
 RUN pip install https://filedn.eu/lougUsdPvd1uJK2jfOYWogH/pypi/triteia-0.1.0-cp310-cp310-linux_x86_64.whl
-RUN pip install -r requirements-extra.txt
+RUN pip install -r meta/requirements-extra.txt
 RUN pip install .
 RUN pip uninstall pynvml -y

docker/build_image.sh

Lines changed: 2 additions & 2 deletions
@@ -7,5 +7,5 @@ if [ -z "$version" ]; then
   exit 1
 fi
 echo "Building image for $arch, version $version"
-podman build -f docker/Dockerfile.$arch-cuda . -t ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch --build-arg ARCH=$arch
-podman push ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch
+docker build -f docker/Dockerfile.$arch-cuda . -t ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch --build-arg ARCH=$arch
+docker push ghcr.io/xiaozheyao/scratchpad:${version}dev-$arch

docs/examples/mllama_request.py

Lines changed: 8 additions & 1 deletion
@@ -4,6 +4,7 @@
 
 prompt = "What is in this image?"
 img_url = "https://images.unsplash.com/photo-1692350914621-f0ca2d206368?q=80&w=3000&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
+stream = True
 
 
 def image_url_to_base64(url):
@@ -41,5 +42,11 @@ def image_url_to_base64(url):
             ],
         }
     ],
+    stream=stream,
 )
-print(response.choices[0].message.content)
+if stream:
+    for chunk in response:
+        if len(chunk.choices) > 0 and chunk.choices[0].delta.content:
+            print(chunk.choices[0].delta.content, end="", flush=True)
+else:
+    print(response.choices[0].message.content)
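
Note: the `len(chunk.choices) > 0` guard matters because OpenAI-compatible servers can emit chunks whose `choices` list is empty (for example, a final usage-only chunk). A minimal sketch, not part of this commit, of collecting the streamed deltas back into one string from the same `response` iterator:

# Hypothetical helper, not in the commit: reassemble streamed deltas.
def collect_stream(response) -> str:
    parts = []
    for chunk in response:
        # Skip chunks with no choices or an empty delta.
        if chunk.choices and chunk.choices[0].delta.content:
            parts.append(chunk.choices[0].delta.content)
    return "".join(parts)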

scratchpad/constrained/__init__.py

Lines changed: 18 additions & 0 deletions
@@ -34,6 +34,21 @@ def build_regex_from_object(
     return build_regex_from_schema(schema, whitespace_pattern)
 
 
+try:
+    from xgrammar import (
+        GrammarMatcher,
+        GrammarMatcherInitContext,
+        GrammarMatcherInitContextCache,
+    )
+except ImportError as e:
+
+    class Dummy:
+        pass
+
+    GrammarMatcher = Dummy
+    GrammarMatcherInitContext = Dummy
+    GrammarMatcherInitContextCache = Dummy
+
 __all__ = [
     "RegexGuide",
     "FSMInfo",
@@ -43,4 +58,7 @@ def build_regex_from_object(
     "disk_cache",
     "disable_cache",
     "make_byte_level_fsm",
+    "GrammarMatcher",
+    "GrammarMatcherInitContext",
+    "GrammarMatcherInitContextCache",
 ]
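
The try/except above keeps `import scratchpad.constrained` working when xgrammar isn't installed; all three names are then bound to the same `Dummy` placeholder and only fail at the point of use. A minimal sketch, assuming only what the diff shows, of how a caller could detect the fallback (`xgrammar_available` is a hypothetical helper, not part of this commit):

from scratchpad.constrained import GrammarMatcher

def xgrammar_available() -> bool:
    # After a failed import, GrammarMatcher is bound to the Dummy class.
    return GrammarMatcher.__name__ != "Dummy"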
scratchpad/constrained/bnf_cache.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+"""Cache for the compressed finite state machine."""
+
+from typing import Tuple
+
+from transformers import AutoTokenizer
+
+from scratchpad.constrained import (
+    GrammarMatcher,
+    GrammarMatcherInitContext,
+    GrammarMatcherInitContextCache,
+)
+
+MAX_ROLLBACK_TOKENS = 10
+
+
+class BNFCache:
+    grammar_cache: GrammarMatcherInitContextCache
+
+    def __init__(
+        self,
+        tokenizer_path,
+        tokenizer_args_dict,
+        skip_tokenizer_init=False,
+        whitespace_patterns=None,
+    ):
+        # TODO(dark): how to deal with whitespace_patterns and skip_tokenizer_init
+        if skip_tokenizer_init:
+            return
+
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_args_dict)
+        self.grammar_cache = GrammarMatcherInitContextCache(
+            tokenizer_or_vocab=tokenizer
+        )
+
+    def get_context(self, key: Tuple[str, str]) -> GrammarMatcherInitContext:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.grammar_cache.get_init_context_for_json_schema(key_string)
+        elif key_type == "regex":
+            raise ValueError(f"regex hasn't been supported by xgrammar yet")
+        else:
+            raise ValueError(f"Invalid key_type: {key_type}")
+
+    def query(self, key: Tuple[str, str], vocab_size: int) -> GrammarMatcher:
+        ctx = self.get_context(key)
+        return GrammarMatcher(
+            ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS, mask_vocab_size=vocab_size
+        )
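
A usage sketch for the new cache, assuming the file lands as scratchpad/constrained/bnf_cache.py (grammar.py below imports it under that name) and that xgrammar is installed; the model id, schema, and vocabulary size are placeholders:

from scratchpad.constrained.bnf_cache import BNFCache

cache = BNFCache(
    tokenizer_path="meta-llama/Llama-3.2-11B-Vision-Instruct",  # placeholder
    tokenizer_args_dict={},
)
schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'
# query() builds (and caches) an init context for the schema, then wraps
# it in a GrammarMatcher bounded to the model's vocabulary size.
matcher = cache.query(("json", schema), vocab_size=128256)  # placeholder size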

scratchpad/constrained/fsm_cache.py

Lines changed: 26 additions & 4 deletions
@@ -3,6 +3,17 @@
 from .base_tool_cache import BaseToolCache
 from . import RegexGuide, TransformerTokenizer
 
+import logging
+
+from interegular import InvalidSyntax, parse_pattern
+from outlines.fsm.json_schema import build_regex_from_schema
+from transformers import AutoTokenizer
+
+from scratchpad.constrained import RegexGuide, TransformerTokenizer
+from .base_tool_cache import BaseToolCache
+
+logger = logging.getLogger(__name__)
+
 
 
 class FSMCache(BaseToolCache):
@@ -51,12 +62,23 @@ def fset(self, value):
     def init_value(self, key):
         key_type, key_string = key
         if key_type == "json":
-            regex = build_regex_from_schema(
-                key_string, whitespace_pattern=self.constrained_json_whitespace_pattern
-            )
+            try:
+                regex = build_regex_from_schema(
+                    key_string,
+                    whitespace_pattern=self.constrained_json_whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
         elif key_type == "regex":
             regex = key_string
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
-
+        try:
+            parse_pattern(regex)
+        except InvalidSyntax as e:
+            logger.warning(f"skip invalid regex guide: {regex=}, {e=}")
+            return None, regex
         return RegexGuide(regex, self.outlines_tokenizer), regex
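
The contract of `init_value` changes here: instead of raising on a schema that outlines can't translate or a regex that interegular can't parse, it logs a warning and returns `(None, regex)`. Callers therefore need a fallback path. A minimal sketch of the implied calling pattern, assuming `fsm_cache` is an FSMCache instance and that `query` hands back `init_value`'s pair, as `GrammarCache.query` below unpacks it:

guide, regex = fsm_cache.query(("json", schema_str))
if guide is None:
    # The schema/regex was rejected and logged; fall back to
    # unconstrained decoding rather than failing the request.
    pass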

scratchpad/constrained/grammar.py

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
+"""Cache for the compressed finite state machine."""
+import logging
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from scratchpad.constrained import GrammarMatcher, RegexGuide
+from .bnf_cache import BNFCache
+from .fsm_cache import FSMCache
+from .jump_forward import JumpForwardCache, JumpForwardMap
+
+# from sglang.srt.managers.schedule_batch import Req
+
+logger = logging.getLogger(__name__)
+
+INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+
+class XGrammarJump:
+    pass
+
+
+class JumpHelper:
+    data: Union[List, str]
+    state: int
+    suffix_ids: List[int]
+
+    def __init__(
+        self, data: Union[List, str] = "", state: int = -1, suffix_ids=[]
+    ) -> None:
+        self.data = data
+        self.state = state
+        self.suffix_ids = suffix_ids
+
+    def can_jump(self):
+        return len(self.data) > 0
+
+
+class Grammar:
+    grammar: Union[GrammarMatcher, Tuple[RegexGuide, int]]
+    jump_map: Union[XGrammarJump, JumpForwardMap, None]
+
+    def __init__(
+        self,
+        grammar: Union[GrammarMatcher, Tuple[RegexGuide, int]],
+        jump_map: Union[XGrammarJump, JumpForwardMap, None],
+    ) -> None:
+        self.grammar = grammar
+        self.jump_map = jump_map
+
+    def accept_token(self, token: int):
+        if isinstance(self.grammar, GrammarMatcher):
+            assert self.grammar.accept_token(token)
+        else:
+            guide, state = self.grammar
+            self.grammar = guide, guide.get_next_state(state, token)
+
+    def try_jump(self, tokenizer) -> JumpHelper:
+        if isinstance(self.jump_map, XGrammarJump):
+            assert isinstance(self.grammar, GrammarMatcher)
+            return JumpHelper(self.grammar.find_jump_forward_string())
+        elif isinstance(self.jump_map, JumpForwardMap):
+            assert isinstance(self.grammar, Tuple)
+
+            _, state = self.grammar
+            jump_forward_bytes = self.jump_map.jump_forward_byte(state)
+            if jump_forward_bytes is None or len(jump_forward_bytes) == 0:
+                return JumpHelper()  # can't jump
+
+            # preprocess the jump forward string
+            suffix_bytes = []
+            continuation_range = range(0x80, 0xC0)
+            cur_state = state
+            while (
+                len(jump_forward_bytes)
+                and jump_forward_bytes[0][0] in continuation_range
+            ):
+                # continuation bytes
+                byte_edge = jump_forward_bytes.pop(0)
+                suffix_bytes.append(byte_edge[0])
+                cur_state = byte_edge[1]
+
+            suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
+            suffix_ids = tokenizer.convert_tokens_to_ids(suffix_tokens)
+            return JumpHelper(suffix_ids, cur_state, suffix_bytes)
+        else:
+            return JumpHelper()  # can't jump
+
+    def jump_forward_str_state(self, helper: JumpHelper) -> Tuple[str, int]:
+        if isinstance(helper.data, str):
+            return helper.data, -1
+        else:
+            assert isinstance(self.jump_map, JumpForwardMap)
+            return self.jump_map.jump_forward_symbol(helper.state)
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        if isinstance(self.grammar, GrammarMatcher):
+            k = 0
+            for i, old_id in enumerate(old_output_ids):
+                if old_id == new_output_ids[i]:
+                    k = i + 1
+                else:
+                    break
+
+            # rollback to the last token that is the same
+            if k < len(old_output_ids):
+                self.grammar.rollback(len(old_output_ids) - k)
+
+            for i in range(k, len(new_output_ids)):
+                assert self.grammar.accept_token(new_output_ids[i])
+        else:
+            self.grammar = self.grammar[0], next_state
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, vocab_size: int):
+        if isinstance(self.grammar, GrammarMatcher):
+            # Note that this bitmask is a bitset, not bool
+            bitmask = self.grammar.find_next_token_bitmask()
+            # Mask the tokens that are not allowed
+            vocab_mask[
+                self.grammar.get_rejected_tokens_from_bitmask(bitmask, vocab_size)
+            ] = 1
+        else:
+            guide, state = self.grammar
+            vocab_mask.fill_(1)
+            vocab_mask[guide.get_next_instruction(state).tokens] = 0
+
+
+class GrammarCache:
+    grammar_cache: Union[BNFCache, FSMCache]
+    jump_cache: Union[XGrammarJump, JumpForwardCache, None]
+
+    def __init__(
+        self,
+        tokenizer_path,
+        tokenizer_args_dict,
+        skip_tokenizer_init=False,
+        whitespace_patterns=None,
+        backend=None,
+        allow_jump=False,
+    ):
+        if backend == "xgrammar":
+            self.grammar_cache = BNFCache(
+                tokenizer_path=tokenizer_path,
+                tokenizer_args_dict=tokenizer_args_dict,
+                skip_tokenizer_init=skip_tokenizer_init,
+                whitespace_patterns=whitespace_patterns,
+            )
+            self.jump_cache = XGrammarJump() if allow_jump else None
+        else:
+            assert backend == "outlines"
+            self.grammar_cache = FSMCache(
+                tokenizer_path=tokenizer_path,
+                tokenizer_args_dict=tokenizer_args_dict,
+                skip_tokenizer_init=skip_tokenizer_init,
+                constrained_json_whitespace_pattern=whitespace_patterns,
+                enable=True,
+            )
+            self.jump_cache = JumpForwardCache() if allow_jump else None
+
+    def query(self, key: Tuple[str, str], vocab_size: int) -> Grammar:
+        if isinstance(self.grammar_cache, BNFCache):
+            assert not isinstance(self.jump_cache, JumpForwardCache)
+            return Grammar(self.grammar_cache.query(key, vocab_size), self.jump_cache)
+        else:
+            jump_map = None
+            guide, regex = self.grammar_cache.query(key)
+            if isinstance(self.jump_cache, JumpForwardCache):
+                jump_map = self.jump_cache.query(regex)
+            return Grammar((guide, 0), jump_map)
+
+    def reset(self):
+        if isinstance(self.grammar_cache, FSMCache):
+            self.grammar_cache.reset()
+        if isinstance(self.jump_cache, JumpForwardCache):
+            self.jump_cache.reset()
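
To tie the pieces together, a sketch of one constrained decoding step driven by GrammarCache with the outlines backend; the model id, schema, vocabulary size, and `logits` tensor are illustrative stand-ins, not from this commit:

import torch

from scratchpad.constrained.grammar import GrammarCache

VOCAB_SIZE = 128256  # placeholder vocabulary size
logits = torch.randn(VOCAB_SIZE)  # stand-in for the model's next-token logits

grammar_cache = GrammarCache(
    tokenizer_path="meta-llama/Llama-3.2-11B-Vision-Instruct",  # placeholder
    tokenizer_args_dict={},
    backend="outlines",  # "xgrammar" would route through BNFCache instead
)
grammar = grammar_cache.query(("json", '{"type": "object"}'), VOCAB_SIZE)

vocab_mask = torch.zeros(VOCAB_SIZE, dtype=torch.bool)
grammar.fill_vocab_mask(vocab_mask, VOCAB_SIZE)  # 1 marks disallowed tokens
logits[vocab_mask] = float("-inf")
next_token = int(torch.argmax(logits))
grammar.accept_token(next_token)  # advance the guide state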

scratchpad/managers/tokenizer.py

Lines changed: 0 additions & 1 deletion
@@ -440,7 +440,6 @@ def create_handle_loop(self):
 
     async def sigterm_watchdog(self):
         while not self.gracefully_exit:
-            print("sigterm_watchdog")
             await asyncio.sleep(5)
 
         # drain requests
