full_attention_no_gqa_bs8_ctx512 — a mobile-optimized MoE model configuration found via architecture search.
Metrics
- Best Training Loss: 3.5330
- Best Eval Loss: 5.4000
- Total Parameters: 255.6M
- Active Parameters: 114.1M
- Steps Completed: 2000
Usage
# Example: load the model, weights, and tokenizer
import json

from safetensors.torch import load_file
from transformers import AutoTokenizer

from architecture.model import Qwen3Model  # Your custom model class

# Read the JSON configuration describing the architecture
with open("config.json") as cfg_file:
    model_config = json.load(cfg_file)

# Build the model and restore its trained weights
model = Qwen3Model(model_config)
model.load_state_dict(load_file("model.safetensors"))

# Fetch the matching tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-255m-114m-12x2-12L-full-attention-no-gqa-bs8-ctx512")
Model Configuration
{
"vocab_size": 151936,
"emb_dim": 512,
"n_heads": 8,
"n_layers": 12,
"n_kv_groups": 2,
"num_experts": 12,
"num_experts_per_tok": 2,
"moe_hidden_dim": 768,
"head_dim": 64,
"max_position_embeddings": 4096,
"rope_base": 1000000.0,
"qk_norm": true
}
Training Configuration
{
"model_config": {
"vocab_size": 151936,
"emb_dim": 512,
"n_heads": 8,
"n_layers": 12,
"n_kv_groups": 2,
"num_experts": 12,
"num_experts_per_tok": 2,
"moe_hidden_dim": 768,
"head_dim": 64,
"max_position_embeddings": 4096,
"rope_base": 1000000.0,
"qk_norm": true
},
"learning_rate": 0.0001,
"batch_size": 8,
"context_length": 512,
"warmup_ratio": 0.1,
"warmup_steps": null,
"weight_decay": 0.1,
"gradient_clip": 1.0,
"gradient_accumulation_steps": 1,
"scheduler_type": "cosine",
"wsd_decay_ratio": 0.1,
"max_steps": 2000,
"eval_steps": 500,
"eval_batches": 20,
"log_steps": 100,
"early_stopping": true,
"early_stopping_patience": 500,
"early_stopping_min_delta": 0.01,
"early_stopping_min_steps": 200,
"track_expert_balance": true,
"expert_balance_log_steps": 100,
"use_wandb": true,
"wandb_project": "moe-architecture-search",
"wandb_entity": null,
"wandb_tags": [
"full_attention_no_gqa_bs8_ctx512",
"architecture-search"
],
"train_data_path": null,
"val_data_path": null,
"output_dir": null,
"experiment_name": "full_attention_no_gqa_bs8_ctx512",
"device": "cuda",
"dtype": "bfloat16",
"gradient_checkpointing": true,
"architecture_name": "full_attention_no_gqa_bs8_ctx512",
"mobile_estimate": {
"tok_per_sec_fp16": 41.061977007248444,
"tok_per_sec_q8": 68.43662834541408,
"tok_per_sec_q4": 95.8112796835797,
"ttft_ms_fp16": 92.9495970909091,
"ttft_ms_q8": 61.96639806060606,
"ttft_ms_q4": 49.57311844848485,
"memory_mb_fp16": 545.8759765625,
"memory_mb_q8": 322.31499023437505,
"memory_mb_q4": 198.34599609375,
"total_params": 255611392,
"active_params": 114053632,
"meets_ttft_target": true,
"meets_throughput_target": true,
"meets_memory_target": true
}
}
Downloads last month: 27
Inference Providers: this model is not currently deployed by any Inference Provider. You can ask for provider support on the model page.