use gpt-oss tokenizer because it's a great tokenizer
This commit is contained in:
parent
5cfef742b5
commit
97bc414977
4 changed files with 373 additions and 7 deletions
|
|
@ -15,6 +15,7 @@ from open_mythos.main import (
|
|||
apply_rope,
|
||||
loop_index_embedding,
|
||||
)
|
||||
from open_mythos.tokenizer import MythosTokenizer
|
||||
from open_mythos.variants import (
|
||||
mythos_1b,
|
||||
mythos_3b,
|
||||
|
|
@ -48,4 +49,7 @@ __all__ = [
|
|||
"mythos_100b",
|
||||
"mythos_500b",
|
||||
"mythos_1t",
|
||||
"load_tokenizer",
|
||||
"get_vocab_size",
|
||||
"MythosTokenizer",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,9 +1,3 @@
|
|||
"""
|
||||
OpenMythos v1 — Recurrent-Depth Transformer
|
||||
Architecture: Prelude → [Looped Recurrent Block]×T → Coda
|
||||
MoE FFN (DeepSeek-style), GQA or MLA, RoPE, RMSNorm, KV cache, LTI-stable injection, ACT halting
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
|
@ -633,7 +627,9 @@ class TransformerBlock(nn.Module):
|
|||
Returns:
|
||||
Output tensor of shape (B, T, dim)
|
||||
"""
|
||||
x = x + self.resid_drop(self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key))
|
||||
x = x + self.resid_drop(
|
||||
self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache, cache_key)
|
||||
)
|
||||
x = x + self.resid_drop(self.ffn(self.ffn_norm(x)))
|
||||
return x
|
||||
|
||||
|
|
|
|||
64
open_mythos/tokenizer.py
Normal file
64
open_mythos/tokenizer.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
from transformers import AutoTokenizer
|
||||
|
||||
DEFAULT_MODEL_ID = "openai/gpt-oss-20b"


class MythosTokenizer:
    """
    HuggingFace tokenizer wrapper for OpenMythos.

    Thin adapter around ``AutoTokenizer`` that exposes only what the rest of
    the package needs: the vocabulary size, plain-text encoding without
    automatically inserted special tokens, and decoding with special tokens
    stripped.

    Args:
        model_id (str): The HuggingFace model ID or path to use with AutoTokenizer.
            Defaults to "openai/gpt-oss-20b".

    Attributes:
        tokenizer: The object returned by ``AutoTokenizer.from_pretrained``.

    Example:
        >>> tok = MythosTokenizer()
        >>> ids = tok.encode("Hello world")
        >>> s = tok.decode(ids)
    """

    def __init__(self, model_id: str = DEFAULT_MODEL_ID):
        """
        Initialize the MythosTokenizer.

        Args:
            model_id (str): HuggingFace model identifier or path to tokenizer files.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    @property
    def vocab_size(self) -> int:
        """
        Return the size of the full tokenizer vocabulary.

        Returns:
            int: Number of token IDs the tokenizer can produce, including
            added/special tokens.
        """
        # ``tokenizer.vocab_size`` is documented as the *base* vocabulary only
        # (added tokens excluded), while ``len(tokenizer)`` counts the full
        # vocabulary. ``encode`` can emit added-token IDs when their literal
        # text appears in the input, so an embedding table sized from the base
        # count could be indexed out of range.
        return len(self.tokenizer)

    def encode(self, text: str) -> list[int]:
        """
        Encode input text into a list of token IDs.

        Special tokens (e.g. BOS/EOS) are not added automatically; the call
        passes ``add_special_tokens=False``.

        Args:
            text (str): The input text string to tokenize.

        Returns:
            list[int]: List of integer token IDs representing the input text.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def decode(self, token_ids: list[int]) -> str:
        """
        Decode a list of token IDs back into a text string.

        Special tokens are dropped from the output; the call passes
        ``skip_special_tokens=True``.

        Args:
            token_ids (list[int]): A list of integer token IDs to decode.

        Returns:
            str: Decoded string representation of the token IDs.
        """
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)
|
||||
Loading…
Add table
Add a link
Reference in a new issue