Quick Start Guide

This guide will help you get started with SHC in just a few minutes.

Creating Your First Model

from shc.models import SHCTransformer, get_config

# Load a predefined configuration
config = get_config('500m')  # Options: '500m', '1b', '3b', '7b'

# Create the model
model = SHCTransformer(config)

# Print model info
print(f"Parameters: {model.get_num_params():,}")

Forward Pass

import torch

# Create sample input
batch_size, seq_len = 2, 512
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))

# Forward pass
logits = model(input_ids)
print(f"Output shape: {logits.shape}")  # (2, 512, 32000)

Text Generation

# Generate text from a prompt
prompt = torch.randint(0, config.vocab_size, (1, 10))

output = model.generate(
    prompt,
    max_new_tokens=50,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

print(f"Generated {output.shape[1] - prompt.shape[1]} new tokens")

Using Individual Components

Cayley Transform

The Cayley transform generates orthogonal matrices, which always have a spectral norm of exactly 1:

from shc.layers import CayleyTransform

# Create a 4x4 orthogonal matrix generator
cayley = CayleyTransform(n=4)

# Generate orthogonal matrix
Q = cayley()
print(f"Q^T @ Q ≈ I: {torch.allclose(Q.T @ Q, torch.eye(4), atol=1e-5)}")
print(f"Spectral norm: {cayley.get_spectral_norm():.4f}")  # Always 1.0

Sparse Orthogonal Mixture

This routing layer guarantees ρ ≤ 1 for the routing matrix it produces (the example below verifies the spectral norm bound):

from shc.layers import SparseOrthogonalMixture

# Create routing layer with k=2 orthogonal matrices
routing = SparseOrthogonalMixture(n=4, k=2, hidden_dim=768)

# Compute routing matrix for input
x = torch.randn(2, 768)  # batch_size=2, hidden_dim=768
H_res = routing(x)       # (2, 4, 4)

# Verify spectral norm bound
norms = routing.get_spectral_norm(x)
print(f"Max spectral norm: {norms.max():.4f}")  # Always ≤ 1.0

Factorized KV Cache

The factorized KV cache compresses multi-stream representations and decompresses them when needed:

from shc.layers import FactorizedKVCache

# Create factorized cache with rank-1 compression
cache = FactorizedKVCache(n=4, d=768, r=1)

# Compress multi-stream hidden state
x_bar = torch.randn(2, 4, 768)  # (batch, n_streams, hidden_dim)
compressed = cache.compress(x_bar)

# Decompress when needed
reconstructed = cache.decompress(compressed)
print(f"Compression ratio: {cache.get_compression_ratio():.1f}x")

Custom Model Configuration

from shc.models import SHCTransformer, SHCTransformerConfig

# Create custom configuration
config = SHCTransformerConfig(
    vocab_size=32000,
    hidden_dim=1024,
    n_layers=24,
    n_heads=16,
    max_seq_len=4096,
    n_streams=4,         # Number of parallel residual streams
    k_mixture=2,         # Number of orthogonal matrices in mixture
    factorization_rank=1, # Rank for KV cache compression
)

model = SHCTransformer(config)

GPU Training

import torch

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create data on GPU
input_ids = torch.randint(0, 32000, (2, 512), device=device)
labels = torch.randint(0, 32000, (2, 512), device=device)

# Forward pass with loss
outputs = model(input_ids, labels=labels)

Next Steps

- Read the Theory section to understand the mathematics.
- See the Training section to learn about large-scale training.
- Explore the API Reference for detailed documentation.