[docs][readme-badges][add pypi twitter github badges to
readme][improvement][init-sort][sort imports alphabetically in init][improvement][version-bump][bump version to 0.3.0][feat][tokenizer-class][add MythosTokenizer to init exports][feat][test-tokenizer][add tokenizer test suite with printed output]
This commit is contained in:
parent
5ffb897dcf
commit
12f6c5b32e
6 changed files with 1249 additions and 20 deletions
75
tests/test_tokenizer.py
Normal file
75
tests/test_tokenizer.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import pytest
|
||||
from open_mythos.tokenizer import MythosTokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def tokenizer():
|
||||
tok = MythosTokenizer()
|
||||
print(f"\nLoaded tokenizer: {tok.tokenizer.name_or_path}")
|
||||
return tok
|
||||
|
||||
|
||||
def test_loads(tokenizer):
|
||||
assert tokenizer is not None
|
||||
print(f"Tokenizer: {tokenizer}")
|
||||
|
||||
|
||||
def test_vocab_size(tokenizer):
|
||||
size = tokenizer.vocab_size
|
||||
print(f"Vocab size: {size:,}")
|
||||
assert size > 0
|
||||
|
||||
|
||||
def test_encode_returns_list_of_ints(tokenizer):
|
||||
ids = tokenizer.encode("Hello, world!")
|
||||
print(f"encode('Hello, world!') → {ids}")
|
||||
assert isinstance(ids, list)
|
||||
assert all(isinstance(i, int) for i in ids)
|
||||
assert len(ids) > 0
|
||||
|
||||
|
||||
def test_encode_empty_string(tokenizer):
|
||||
ids = tokenizer.encode("")
|
||||
print(f"encode('') → {ids}")
|
||||
assert isinstance(ids, list)
|
||||
|
||||
|
||||
def test_decode_returns_string(tokenizer):
|
||||
ids = tokenizer.encode("Hello, world!")
|
||||
text = tokenizer.decode(ids)
|
||||
print(f"decode({ids}) → '{text}'")
|
||||
assert isinstance(text, str)
|
||||
|
||||
|
||||
def test_roundtrip(tokenizer):
|
||||
original = "The quick brown fox jumps over the lazy dog."
|
||||
ids = tokenizer.encode(original)
|
||||
recovered = tokenizer.decode(ids)
|
||||
print(f"original: '{original}'")
|
||||
print(f"token ids: {ids}")
|
||||
print(f"recovered: '{recovered}'")
|
||||
assert original in recovered or recovered in original
|
||||
|
||||
|
||||
def test_encode_long_text(tokenizer):
|
||||
text = "OpenMythos is a recurrent depth transformer. " * 100
|
||||
ids = tokenizer.encode(text)
|
||||
print(f"Long text ({len(text)} chars) → {len(ids)} tokens")
|
||||
assert len(ids) > 100
|
||||
|
||||
|
||||
def test_custom_model_id():
|
||||
tok = MythosTokenizer(model_id="openai/gpt-oss-20b")
|
||||
print(f"Custom model_id vocab size: {tok.vocab_size:,}")
|
||||
assert tok.vocab_size > 0
|
||||
|
||||
|
||||
def test_vocab_size_consistent(tokenizer):
|
||||
outer = tokenizer.vocab_size
|
||||
inner = tokenizer.tokenizer.vocab_size
|
||||
print(f"vocab_size property: {outer:,} | inner tokenizer.vocab_size: {inner:,}")
|
||||
assert outer == inner
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "--verbose", "-s"])
|
||||
Loading…
Add table
Add a link
Reference in a new issue