diff --git a/README.md b/README.md index 5c73d7f..3542f2c 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,41 @@ print( +## Model Variants + +Pre-configured scales from 1B to 1T parameters: + +```python +from open_mythos import ( + mythos_1b, + mythos_3b, + mythos_10b, + mythos_50b, + mythos_100b, + mythos_500b, + mythos_1t, + OpenMythos, +) + +cfg = mythos_1b() # returns a MythosConfig +model = OpenMythos(cfg) + +total = sum(p.numel() for p in model.parameters()) +print(f"Parameters: {total:,}") +``` + +| Variant | `dim` | Experts | `expert_dim` | Loop iters | Context | Max output | +|---|---|---|---|---|---|---| +| `mythos_1b` | 2048 | 64 | 2048 | 16 | 4k | 4k | +| `mythos_3b` | 3072 | 64 | 4096 | 16 | 4k | 4k | +| `mythos_10b` | 4096 | 128 | 5632 | 24 | 8k | 4k | +| `mythos_50b` | 6144 | 256 | 9728 | 32 | 8k | 4k | +| `mythos_100b` | 8192 | 256 | 13568 | 32 | 1M | 128k | +| `mythos_500b` | 12288 | 512 | 23040 | 48 | 1M | 128k | +| `mythos_1t` | 16384 | 512 | 34560 | 64 | 1M | 128k | + +--- + ## Documentation | Page | Description | diff --git a/open_mythos/variants.py b/open_mythos/variants.py index a63f6a5..83f7dd4 100644 --- a/open_mythos/variants.py +++ b/open_mythos/variants.py @@ -1,8 +1,13 @@ from open_mythos.main import MythosConfig +# Parameter budget breakdown per variant: +# total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE +# MoE = 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok) +# expert_dim is solved from the residual budget after all other terms. + def mythos_1b() -> MythosConfig: - """1B parameter config. Small research/fine-tuning model. dim=2048, 16 experts, 16 loop iters, 4k context.""" + """1B parameter config. Small research/fine-tuning model. 
dim=2048, 64 experts, 16 loop iters, 4k context.""" return MythosConfig( vocab_size=32000, dim=2048, @@ -18,10 +23,10 @@ def mythos_1b() -> MythosConfig: qk_rope_head_dim=32, qk_nope_head_dim=64, v_head_dim=64, - n_experts=16, + n_experts=64, n_shared_experts=2, - n_experts_per_tok=2, - expert_dim=256, + n_experts_per_tok=4, + expert_dim=2048, act_threshold=0.99, rope_theta=500000.0, lora_rank=8, @@ -29,7 +34,7 @@ def mythos_1b() -> MythosConfig: def mythos_3b() -> MythosConfig: - """3B parameter config. Compact inference model. dim=3072, 32 experts, 16 loop iters, 4k context.""" + """3B parameter config. Compact inference model. dim=3072, 64 experts, 16 loop iters, 4k context.""" return MythosConfig( vocab_size=32000, dim=3072, @@ -45,10 +50,10 @@ def mythos_3b() -> MythosConfig: qk_rope_head_dim=32, qk_nope_head_dim=96, v_head_dim=96, - n_experts=32, + n_experts=64, n_shared_experts=2, - n_experts_per_tok=2, - expert_dim=384, + n_experts_per_tok=4, + expert_dim=4096, act_threshold=0.99, rope_theta=500000.0, lora_rank=8, @@ -56,7 +61,7 @@ def mythos_3b() -> MythosConfig: def mythos_10b() -> MythosConfig: - """10B parameter config. Mid-scale general model. dim=4096, 64 experts, 24 loop iters, 8k context.""" + """10B parameter config. Mid-scale general model. dim=4096, 128 experts, 24 loop iters, 8k context.""" return MythosConfig( vocab_size=32000, dim=4096, @@ -72,10 +77,10 @@ def mythos_10b() -> MythosConfig: qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128, - n_experts=64, + n_experts=128, n_shared_experts=2, n_experts_per_tok=4, - expert_dim=512, + expert_dim=5632, act_threshold=0.99, rope_theta=500000.0, lora_rank=16, @@ -83,7 +88,7 @@ def mythos_10b() -> MythosConfig: def mythos_50b() -> MythosConfig: - """50B parameter config. Large reasoning model. dim=6144, 128 experts, 32 loop iters, 8k context.""" + """50B parameter config. Large reasoning model. 
dim=6144, 256 experts, 32 loop iters, 8k context.""" return MythosConfig( vocab_size=32000, dim=6144, @@ -99,10 +104,10 @@ def mythos_50b() -> MythosConfig: qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128, - n_experts=128, + n_experts=256, n_shared_experts=4, n_experts_per_tok=4, - expert_dim=768, + expert_dim=9728, act_threshold=0.99, rope_theta=500000.0, lora_rank=32, @@ -110,7 +115,7 @@ def mythos_50b() -> MythosConfig: def mythos_100b() -> MythosConfig: - """100B parameter config. Frontier-class model. dim=8192, 160 experts, 32 loop iters, 1M context, 128k output.""" + """100B parameter config. Frontier-class model. dim=8192, 256 experts, 32 loop iters, 1M context, 128k output.""" return MythosConfig( vocab_size=32000, dim=8192, @@ -126,10 +131,10 @@ def mythos_100b() -> MythosConfig: qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128, - n_experts=160, + n_experts=256, n_shared_experts=4, n_experts_per_tok=8, - expert_dim=1024, + expert_dim=13568, act_threshold=0.99, rope_theta=1000000.0, lora_rank=64, @@ -138,7 +143,7 @@ def mythos_100b() -> MythosConfig: def mythos_500b() -> MythosConfig: - """500B parameter config. Ultra-scale MoE model. dim=12288, 256 experts, 48 loop iters, 1M context, 128k output.""" + """500B parameter config. Ultra-scale MoE model. 
dim=12288, 512 experts, 48 loop iters, 1M context, 128k output.""" return MythosConfig( vocab_size=100000, dim=12288, @@ -154,10 +159,10 @@ def mythos_500b() -> MythosConfig: qk_rope_head_dim=64, qk_nope_head_dim=128, v_head_dim=128, - n_experts=256, + n_experts=512, n_shared_experts=8, n_experts_per_tok=8, - expert_dim=1536, + expert_dim=23040, act_threshold=0.99, rope_theta=1000000.0, lora_rank=128, @@ -185,7 +190,7 @@ def mythos_1t() -> MythosConfig: n_experts=512, n_shared_experts=8, n_experts_per_tok=8, - expert_dim=2048, + expert_dim=34560, act_threshold=0.99, rope_theta=2000000.0, lora_rank=256, diff --git a/pyproject.toml b/pyproject.toml index 9562800..40dca10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "open-mythos" -version = "0.1.0" +version = "0.2.0" description = "OpenMythos — open-source theoretical reconstruction of the Claude Mythos Recurrent-Depth Transformer architecture" license = "MIT" authors = ["Kye Gomez "] diff --git a/variants_example.py b/variants_example.py new file mode 100644 index 0000000..19868bb --- /dev/null +++ b/variants_example.py @@ -0,0 +1,10 @@ +from open_mythos import ( + mythos_1b, + OpenMythos, +) + +cfg = mythos_1b() +model = OpenMythos(cfg) + +total = sum(p.numel() for p in model.parameters()) +print(f"Parameters: {total:,}")