use gpt-oss tokenizer because it's a great tokenizer

2026-04-19 22:28:09 -04:00 · 2026-04-19 22:28:09 -04:00 · 97bc414977
commit 97bc414977
parent 5cfef742b5
4 changed files with 373 additions and 7 deletions
--- a/open_mythos/__init__.py
+++ b/open_mythos/__init__.py
@@ -15,6 +15,7 @@ from open_mythos.main import (
    apply_rope,
    loop_index_embedding,
 )
+from open_mythos.tokenizer import MythosTokenizer
 from open_mythos.variants import (
    mythos_1b,
    mythos_3b,
@@ -48,4 +49,7 @@ __all__ = [
    "mythos_100b",
    "mythos_500b",
    "mythos_1t",
+    "load_tokenizer",
+    "get_vocab_size",
+    "MythosTokenizer",
 ]
--- a/open_mythos/main.py
+++ b/open_mythos/main.py
@@ -1,9 +1,3 @@
-"""
-OpenMythos v1 — Recurrent-Depth Transformer
-Architecture: Prelude → [Looped Recurrent Block]×T → Coda
-MoE FFN (DeepSeek-style), GQA or MLA, RoPE, RMSNorm, KV cache, LTI-stable injection, ACT halting
-"""
-
 from dataclasses import dataclass
 from typing import Optional

@@ -633,7 +627,9 @@ class TransformerBlock(nn.Module):
        Returns:
            Output tensor of shape (B, T, dim)
        """
-        x = x + self.resid_drop(self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key))
+        x = x + self.resid_drop(
+            self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key)
+        )
        x = x + self.resid_drop(self.ffn(self.ffn_norm(x)))
        return x

--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -0,0 +1,64 @@
+from transformers import AutoTokenizer
+
+DEFAULT_MODEL_ID = "openai/gpt-oss-20b"
+
+
+class MythosTokenizer:
+    """
+    HuggingFace tokenizer wrapper for OpenMythos.
+
+    Args:
+        model_id (str): The HuggingFace model ID or path to use with AutoTokenizer.
+            Defaults to "openai/gpt-oss-20b".
+
+    Attributes:
+        tokenizer: The object returned by AutoTokenizer.from_pretrained(model_id).
+
+    Example:
+        >>> tok = MythosTokenizer()
+        >>> ids = tok.encode("Hello world")
+        >>> s = tok.decode(ids)
+    """
+
+    def __init__(self, model_id: str = DEFAULT_MODEL_ID):
+        """
+        Load the tokenizer identified by model_id.
+
+        Args:
+            model_id (str): HuggingFace model identifier or path to tokenizer files.
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    @property
+    def vocab_size(self) -> int:
+        """
+        Return the full vocabulary size, counting added/special tokens.
+
+        Returns:
+            int: len() of the wrapped tokenizer; its vocab_size attribute omits added tokens.
+        """
+        return len(self.tokenizer)
+
+    def encode(self, text: str) -> list[int]:
+        """
+        Encode input text into a list of token IDs.
+
+        Args:
+            text (str): The input text string to tokenize.
+
+        Returns:
+            list[int]: Token IDs for the text, encoded with add_special_tokens=False.
+        """
+        return self.tokenizer.encode(text, add_special_tokens=False)
+
+    def decode(self, token_ids: list[int]) -> str:
+        """
+        Decode a list of token IDs back into a text string.
+
+        Args:
+            token_ids (list[int]): A list of integer token IDs to decode.
+
+        Returns:
+            str: Decoded text, produced with skip_special_tokens=True.
+        """
+        return self.tokenizer.decode(token_ids, skip_special_tokens=True)