use gpt-oss tokenizer because it's a great tokenizer

This commit is contained in:
Kye Gomez 2026-04-19 22:28:09 -04:00
parent 5cfef742b5
commit 97bc414977
4 changed files with 373 additions and 7 deletions

View file

@ -15,6 +15,7 @@ from open_mythos.main import (
apply_rope,
loop_index_embedding,
)
from open_mythos.tokenizer import MythosTokenizer
from open_mythos.variants import (
mythos_1b,
mythos_3b,
@ -48,4 +49,7 @@ __all__ = [
"mythos_100b",
"mythos_500b",
"mythos_1t",
"load_tokenizer",
"get_vocab_size",
"MythosTokenizer",
]

View file

@ -1,9 +1,3 @@
"""
OpenMythos v1 — Recurrent-Depth Transformer
Architecture: Prelude → [Looped Recurrent Block]×T → Coda
MoE FFN (DeepSeek-style), GQA or MLA, RoPE, RMSNorm, KV cache, LTI-stable injection, ACT halting
"""
from dataclasses import dataclass
from typing import Optional
@ -633,7 +627,9 @@ class TransformerBlock(nn.Module):
Returns:
Output tensor of shape (B, T, dim)
"""
x = x + self.resid_drop(self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key))
x = x + self.resid_drop(
self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key)
)
x = x + self.resid_drop(self.ffn(self.ffn_norm(x)))
return x

64
open_mythos/tokenizer.py Normal file
View file

@ -0,0 +1,64 @@
from transformers import AutoTokenizer
# Tokenizer checkpoint loaded when the caller does not name one.
DEFAULT_MODEL_ID = "openai/gpt-oss-20b"


class MythosTokenizer:
    """
    HuggingFace tokenizer wrapper for OpenMythos.

    Thin adapter around ``AutoTokenizer`` that exposes the vocabulary size and
    plain text <-> token-ID conversion without special-token handling.

    Args:
        model_id (str): The HuggingFace model ID or path to use with AutoTokenizer.
            Defaults to "openai/gpt-oss-20b".

    Attributes:
        tokenizer: The object returned by ``AutoTokenizer.from_pretrained``.

    Example:
        >>> tok = MythosTokenizer()
        >>> ids = tok.encode("Hello world")
        >>> s = tok.decode(ids)
    """

    def __init__(self, model_id: str = DEFAULT_MODEL_ID):
        """
        Initialize the MythosTokenizer.

        Args:
            model_id (str): HuggingFace model identifier or path to tokenizer files.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    @property
    def vocab_size(self) -> int:
        """
        Return the size of the full tokenizer vocabulary.

        Uses ``len(self.tokenizer)`` rather than ``self.tokenizer.vocab_size``:
        HuggingFace documents ``vocab_size`` as the base vocabulary *without*
        added tokens, while ``len()`` includes them. Added special tokens can
        carry IDs at or above the base size, so an embedding table sized from
        the base value could be indexed out of range.

        Returns:
            int: Number of tokens in the vocabulary, including added tokens.
        """
        return len(self.tokenizer)

    def encode(self, text: str) -> list[int]:
        """
        Encode input text into a list of token IDs.

        The wrapped tokenizer is called with ``add_special_tokens=False``, so
        it is asked not to insert special tokens of its own.

        Args:
            text (str): The input text string to tokenize.

        Returns:
            list[int]: Token IDs produced by the wrapped tokenizer.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def decode(self, token_ids: list[int]) -> str:
        """
        Decode a list of token IDs back into a text string.

        The wrapped tokenizer is called with ``skip_special_tokens=True``, so
        special tokens are dropped from the returned text.

        Args:
            token_ids (list[int]): A list of integer token IDs to decode.

        Returns:
            str: Decoded string representation of the token IDs.
        """
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)