[Update variants with parameter count equation]
This commit is contained in:
parent
f00d10b59f
commit
a825ba217f
4 changed files with 72 additions and 22 deletions
35
README.md
35
README.md
|
|
@ -71,6 +71,41 @@ print(
|
|||
|
||||
|
||||
|
||||
## Model Variants
|
||||
|
||||
Pre-configured scales from 1B to 1T parameters:
|
||||
|
||||
```python
|
||||
from open_mythos import (
|
||||
mythos_1b,
|
||||
mythos_3b,
|
||||
mythos_10b,
|
||||
mythos_50b,
|
||||
mythos_100b,
|
||||
mythos_500b,
|
||||
mythos_1t,
|
||||
OpenMythos,
|
||||
)
|
||||
|
||||
cfg = mythos_1b()  # returns a MythosConfig
|
||||
model = OpenMythos(cfg)
|
||||
|
||||
total = sum(p.numel() for p in model.parameters())
|
||||
print(f"Parameters: {total:,}")
|
||||
```
|
||||
|
||||
| Variant | `dim` | Experts | `expert_dim` | Loop iters | Context | Max output |
|
||||
|---|---|---|---|---|---|---|
|
||||
| `mythos_1b` | 2048 | 64 | 2048 | 16 | 4k | 4k |
|
||||
| `mythos_3b` | 3072 | 64 | 4096 | 16 | 4k | 4k |
|
||||
| `mythos_10b` | 4096 | 128 | 5632 | 24 | 8k | 4k |
|
||||
| `mythos_50b` | 6144 | 256 | 9728 | 32 | 8k | 4k |
|
||||
| `mythos_100b` | 8192 | 256 | 13568 | 32 | 1M | 128k |
|
||||
| `mythos_500b` | 12288 | 512 | 23040 | 48 | 1M | 128k |
|
||||
| `mythos_1t` | 16384 | 512 | 34560 | 64 | 1M | 128k |
|
||||
|
||||
---
|
||||
|
||||
## Documentation
|
||||
|
||||
| Page | Description |
|
||||
|
|
|
|||
|
|
@ -1,8 +1,13 @@
|
|||
from open_mythos.main import MythosConfig
|
||||
|
||||
# Parameter budget breakdown per variant:
|
||||
# total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE
|
||||
# MoE = 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok)
|
||||
# expert_dim is solved from the residual budget after all other terms.
|
||||
|
||||
|
||||
def mythos_1b() -> MythosConfig:
|
||||
"""1B parameter config. Small research/fine-tuning model. dim=2048, 16 experts, 16 loop iters, 4k context."""
|
||||
"""1B parameter config. Small research/fine-tuning model. dim=2048, 64 experts, 16 loop iters, 4k context."""
|
||||
return MythosConfig(
|
||||
vocab_size=32000,
|
||||
dim=2048,
|
||||
|
|
@ -18,10 +23,10 @@ def mythos_1b() -> MythosConfig:
|
|||
qk_rope_head_dim=32,
|
||||
qk_nope_head_dim=64,
|
||||
v_head_dim=64,
|
||||
n_experts=16,
|
||||
n_experts=64,
|
||||
n_shared_experts=2,
|
||||
n_experts_per_tok=2,
|
||||
expert_dim=256,
|
||||
n_experts_per_tok=4,
|
||||
expert_dim=2048,
|
||||
act_threshold=0.99,
|
||||
rope_theta=500000.0,
|
||||
lora_rank=8,
|
||||
|
|
@ -29,7 +34,7 @@ def mythos_1b() -> MythosConfig:
|
|||
|
||||
|
||||
def mythos_3b() -> MythosConfig:
|
||||
"""3B parameter config. Compact inference model. dim=3072, 32 experts, 16 loop iters, 4k context."""
|
||||
"""3B parameter config. Compact inference model. dim=3072, 64 experts, 16 loop iters, 4k context."""
|
||||
return MythosConfig(
|
||||
vocab_size=32000,
|
||||
dim=3072,
|
||||
|
|
@ -45,10 +50,10 @@ def mythos_3b() -> MythosConfig:
|
|||
qk_rope_head_dim=32,
|
||||
qk_nope_head_dim=96,
|
||||
v_head_dim=96,
|
||||
n_experts=32,
|
||||
n_experts=64,
|
||||
n_shared_experts=2,
|
||||
n_experts_per_tok=2,
|
||||
expert_dim=384,
|
||||
n_experts_per_tok=4,
|
||||
expert_dim=4096,
|
||||
act_threshold=0.99,
|
||||
rope_theta=500000.0,
|
||||
lora_rank=8,
|
||||
|
|
@ -56,7 +61,7 @@ def mythos_3b() -> MythosConfig:
|
|||
|
||||
|
||||
def mythos_10b() -> MythosConfig:
|
||||
"""10B parameter config. Mid-scale general model. dim=4096, 64 experts, 24 loop iters, 8k context."""
|
||||
"""10B parameter config. Mid-scale general model. dim=4096, 128 experts, 24 loop iters, 8k context."""
|
||||
return MythosConfig(
|
||||
vocab_size=32000,
|
||||
dim=4096,
|
||||
|
|
@ -72,10 +77,10 @@ def mythos_10b() -> MythosConfig:
|
|||
qk_rope_head_dim=64,
|
||||
qk_nope_head_dim=128,
|
||||
v_head_dim=128,
|
||||
n_experts=64,
|
||||
n_experts=128,
|
||||
n_shared_experts=2,
|
||||
n_experts_per_tok=4,
|
||||
expert_dim=512,
|
||||
expert_dim=5632,
|
||||
act_threshold=0.99,
|
||||
rope_theta=500000.0,
|
||||
lora_rank=16,
|
||||
|
|
@ -83,7 +88,7 @@ def mythos_10b() -> MythosConfig:
|
|||
|
||||
|
||||
def mythos_50b() -> MythosConfig:
|
||||
"""50B parameter config. Large reasoning model. dim=6144, 128 experts, 32 loop iters, 8k context."""
|
||||
"""50B parameter config. Large reasoning model. dim=6144, 256 experts, 32 loop iters, 8k context."""
|
||||
return MythosConfig(
|
||||
vocab_size=32000,
|
||||
dim=6144,
|
||||
|
|
@ -99,10 +104,10 @@ def mythos_50b() -> MythosConfig:
|
|||
qk_rope_head_dim=64,
|
||||
qk_nope_head_dim=128,
|
||||
v_head_dim=128,
|
||||
n_experts=128,
|
||||
n_experts=256,
|
||||
n_shared_experts=4,
|
||||
n_experts_per_tok=4,
|
||||
expert_dim=768,
|
||||
expert_dim=9728,
|
||||
act_threshold=0.99,
|
||||
rope_theta=500000.0,
|
||||
lora_rank=32,
|
||||
|
|
@ -110,7 +115,7 @@ def mythos_50b() -> MythosConfig:
|
|||
|
||||
|
||||
def mythos_100b() -> MythosConfig:
|
||||
"""100B parameter config. Frontier-class model. dim=8192, 160 experts, 32 loop iters, 1M context, 128k output."""
|
||||
"""100B parameter config. Frontier-class model. dim=8192, 256 experts, 32 loop iters, 1M context, 128k output."""
|
||||
return MythosConfig(
|
||||
vocab_size=32000,
|
||||
dim=8192,
|
||||
|
|
@ -126,10 +131,10 @@ def mythos_100b() -> MythosConfig:
|
|||
qk_rope_head_dim=64,
|
||||
qk_nope_head_dim=128,
|
||||
v_head_dim=128,
|
||||
n_experts=160,
|
||||
n_experts=256,
|
||||
n_shared_experts=4,
|
||||
n_experts_per_tok=8,
|
||||
expert_dim=1024,
|
||||
expert_dim=13568,
|
||||
act_threshold=0.99,
|
||||
rope_theta=1000000.0,
|
||||
lora_rank=64,
|
||||
|
|
@ -138,7 +143,7 @@ def mythos_100b() -> MythosConfig:
|
|||
|
||||
|
||||
def mythos_500b() -> MythosConfig:
|
||||
"""500B parameter config. Ultra-scale MoE model. dim=12288, 256 experts, 48 loop iters, 1M context, 128k output."""
|
||||
"""500B parameter config. Ultra-scale MoE model. dim=12288, 512 experts, 48 loop iters, 1M context, 128k output."""
|
||||
return MythosConfig(
|
||||
vocab_size=100000,
|
||||
dim=12288,
|
||||
|
|
@ -154,10 +159,10 @@ def mythos_500b() -> MythosConfig:
|
|||
qk_rope_head_dim=64,
|
||||
qk_nope_head_dim=128,
|
||||
v_head_dim=128,
|
||||
n_experts=256,
|
||||
n_experts=512,
|
||||
n_shared_experts=8,
|
||||
n_experts_per_tok=8,
|
||||
expert_dim=1536,
|
||||
expert_dim=23040,
|
||||
act_threshold=0.99,
|
||||
rope_theta=1000000.0,
|
||||
lora_rank=128,
|
||||
|
|
@ -185,7 +190,7 @@ def mythos_1t() -> MythosConfig:
|
|||
n_experts=512,
|
||||
n_shared_experts=8,
|
||||
n_experts_per_tok=8,
|
||||
expert_dim=2048,
|
||||
expert_dim=34560,
|
||||
act_threshold=0.99,
|
||||
rope_theta=2000000.0,
|
||||
lora_rank=256,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
|
|||
|
||||
[tool.poetry]
|
||||
name = "open-mythos"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
description = "OpenMythos — open-source theoretical reconstruction of the Claude Mythos Recurrent-Depth Transformer architecture"
|
||||
license = "MIT"
|
||||
authors = ["Kye Gomez <kye@swarms.world>"]
|
||||
|
|
|
|||
10
variants_example.py
Normal file
10
variants_example.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
from open_mythos import (
|
||||
mythos_1b,
|
||||
OpenMythos,
|
||||
)
|
||||
|
||||
cfg = mythos_1b()
|
||||
model = OpenMythos(cfg)
|
||||
|
||||
total = sum(p.numel() for p in model.parameters())
|
||||
print(f"Parameters: {total:,}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue