Quick Start Guide
This guide will help you get started with SHC in just a few minutes.
Creating Your First Model
from shc.models import SHCTransformer, get_config
# Load a predefined configuration
config = get_config('500m') # Options: '500m', '1b', '3b', '7b'
# Create the model
model = SHCTransformer(config)
# Print model info
print(f"Parameters: {model.get_num_params():,}")
Forward Pass
import torch
# Create sample input
batch_size, seq_len = 2, 512
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
# Forward pass
logits = model(input_ids)
print(f"Output shape: {logits.shape}") # (2, 512, 32000)
Text Generation
# Generate text from a prompt
prompt = torch.randint(0, config.vocab_size, (1, 10))
output = model.generate(
prompt,
max_new_tokens=50,
temperature=0.7,
top_p=0.9,
do_sample=True,
)
print(f"Generated {output.shape[1] - prompt.shape[1]} new tokens")
Using Individual Components
Cayley Transform
The Cayley transform generates orthogonal matrices with spectral norm exactly 1:
from shc.layers import CayleyTransform
# Create a 4x4 orthogonal matrix generator
cayley = CayleyTransform(n=4)
# Generate orthogonal matrix
Q = cayley()
print(f"Q^T @ Q ≈ I: {torch.allclose(Q.T @ Q, torch.eye(4), atol=1e-5)}")
print(f"Spectral norm: {cayley.get_spectral_norm():.4f}") # Always 1.0
Sparse Orthogonal Mixture
The routing layer guarantees ρ ≤ 1; the example below checks this bound through the spectral norm of the routing matrix:
from shc.layers import SparseOrthogonalMixture
# Create routing layer with k=2 orthogonal matrices
routing = SparseOrthogonalMixture(n=4, k=2, hidden_dim=768)
# Compute routing matrix for input
x = torch.randn(2, 768) # batch_size=2, hidden_dim=768
H_res = routing(x) # (2, 4, 4)
# Verify spectral norm bound
norms = routing.get_spectral_norm(x)
print(f"Max spectral norm: {norms.max():.4f}") # Always ≤ 1.0
Factorized KV Cache
Compress multi-stream hidden states with a low-rank factorization, and decompress them when needed:
from shc.layers import FactorizedKVCache
# Create factorized cache with rank-1 compression
cache = FactorizedKVCache(n=4, d=768, r=1)
# Compress multi-stream hidden state
x_bar = torch.randn(2, 4, 768) # (batch, n_streams, hidden_dim)
compressed = cache.compress(x_bar)
# Decompress when needed
reconstructed = cache.decompress(compressed)
print(f"Compression ratio: {cache.get_compression_ratio():.1f}x")
Custom Model Configuration
from shc.models import SHCTransformer, SHCTransformerConfig
# Create custom configuration
config = SHCTransformerConfig(
vocab_size=32000,
hidden_dim=1024,
n_layers=24,
n_heads=16,
max_seq_len=4096,
n_streams=4, # Number of parallel residual streams
k_mixture=2, # Number of orthogonal matrices in mixture
factorization_rank=1, # Rank for KV cache compression
)
model = SHCTransformer(config)
GPU Training
import torch
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Create data on GPU
input_ids = torch.randint(0, 32000, (2, 512), device=device)
labels = torch.randint(0, 32000, (2, 512), device=device)
# Forward pass with loss
outputs = model(input_ids, labels=labels)
Next Steps
- Read the Theory section to understand the mathematics
- See the Training section to learn about large-scale training
- Explore the API Reference for detailed documentation