A PyTorch library for custom floating point quantization with autograd support. It provides efficient implementations of user-defined floating point formats together with full PyTorch automatic differentiation, so quantization can be used directly during training.
- Custom Floating Point Formats: Support for arbitrary floating point configurations (sign bits, exponent bits, mantissa bits, bias)
- Autograd Support: Full PyTorch autograd integration for training with quantized weights
- CUDA Support: GPU acceleration for both forward and backward passes
- Straight-Through Estimator: Gradient-friendly quantization for training (see the sketch after this list)
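The straight-through estimator is what lets gradients pass through the otherwise non-differentiable rounding step. As a rough illustration of the idea (a generic sketch, not this library's actual implementation), a custom `torch.autograd.Function` can round in the forward pass and use the identity gradient in the backward pass:

```python
import torch

class RoundSTE(torch.autograd.Function):
    """Generic straight-through estimator: quantize forward, identity backward."""

    @staticmethod
    def forward(ctx, x):
        # Any non-differentiable quantizer could go here; plain rounding keeps it minimal.
        return torch.round(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Pretend the quantizer was the identity map, so gradients flow unchanged.
        return grad_output

x = torch.randn(4, requires_grad=True)
y = RoundSTE.apply(x)
y.sum().backward()
print(x.grad)  # all ones: gradients pass straight through the rounding step
```

This is the same behaviour the feature list above describes for the library's `Round` function, applied there to custom floating point formats rather than integer rounding.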
Install from PyPI:

```bash
pip install torch-floating-point
```

Or install from source:

```bash
git clone https://github.com/SamirMoustafa/torch-floating-point.git
cd torch-floating-point
pip install -e .
```

Basic usage:

```python
import torch
from floating_point import FloatingPoint, Round
# Define a custom 8-bit floating point format (1 sign, 4 exponent, 3 mantissa bits)
fp8 = FloatingPoint(sign_bits=1, exponent_bits=4, mantissa_bits=3, bias=7, bits=8)
# Create a rounding function
rounder = Round(fp8)
# Create a tensor with gradients
x = torch.randn(10, requires_grad=True)
# Quantize the tensor
quantized = rounder(x)
# Use in training (gradients flow through)
loss = quantized.sum()
loss.backward()
print(f"Original: {x}")
print(f"Quantized: {quantized}")
print(f"Gradients: {x.grad}")import torch
Quantized weights can also be used inside standard `nn.Module` layers for quantization-aware training:

```python
import torch
import torch.nn as nn
from floating_point import FloatingPoint, Round
class FloatPointLinear(nn.Module):
    def __init__(self, in_features, out_features, fp_config):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features))
        self.rounder = Round(fp_config)

    def forward(self, x):
        quantized_weight = self.rounder(self.weight)
        return torch.nn.functional.linear(x, quantized_weight, self.bias)
# Define custom floating point format
fp8 = FloatingPoint(sign_bits=1, exponent_bits=4, mantissa_bits=3, bias=7, bits=8)
# Create model with quantized weights
model = FloatPointLinear(10, 5, fp8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
# Create simple data
x = torch.randn(32, 10)
y = torch.randn(32, 5)
# Training loop
for epoch in range(5):
    optimizer.zero_grad()
    # Forward pass
    output = model(x)
    loss = criterion(output, y)
    # Backward pass
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}: Loss = {loss.item():.6f}")
```
To contribute:

- Fork the repository
- Create a feature branch (`git checkout -b feature/amazing-feature`)
- Install development dependencies (`make setup-dev`)
- Make your changes
- Run tests (`make test`)
- Run linting (`make lint`)
- Commit your changes (`git commit -m 'Add amazing feature'`)
- Push to the branch (`git push origin feature/amazing-feature`)
- Open a Pull Request
This project is licensed under the MIT License - see the LICENSE file for details.
If you use this library in your research, please cite:
```bibtex
@software{moustafa2025torchfloatingpoint,
  title={Torch Floating Point: A PyTorch library for custom floating point quantization},
  author={Samir Moustafa},
  year={2025},
  url={https://github.com/SamirMoustafa/torch-floating-point}
}
```

- Issues: GitHub Issues
- Discussions: GitHub Discussions
- Email: [email protected]
