Commit fd0d63b

Affine quant always in fp32 (#1925)
* do affine quant in fp32
* static cast
1 parent 3835a42 commit fd0d63b
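
In short, this commit makes the per-group affine-quantization arithmetic (min/max, scale, bias, rounding) run in float32 even when the weights are stored in a lower-precision type such as float16 or bfloat16; only the final scales and biases are cast back to the weight dtype. The following standalone sketch illustrates that pattern. It is an illustration only, not the MLX implementation (which also packs the quantized values into uint32 words and handles the 3/6-bit layouts); the helper name and the unpacked uint8 output are assumptions made for clarity.

// Standalone sketch of the fp32-compute, cast-back-at-the-end pattern.
// Not MLX code; mirrors the logic of the CPU kernel in this commit.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

template <typename T>
void quantize_group_fp32(
    const T* w,                   // one group of `group_size` weights
    int group_size,
    int bits,                     // e.g. 4
    std::vector<uint8_t>& q_out,  // one quantized value per weight (unpacked)
    T& scale_out,
    T& bias_out) {
  float n_bins = (1 << bits) - 1;
  float eps = 1e-7f;

  // Group statistics accumulated in float32 regardless of T.
  float w_min = std::numeric_limits<float>::infinity();
  float w_max = -w_min;
  for (int j = 0; j < group_size; ++j) {
    w_max = std::max(w_max, static_cast<float>(w[j]));
    w_min = std::min(w_min, static_cast<float>(w[j]));
  }

  bool mask = std::abs(w_min) > std::abs(w_max);
  float scale = std::max((w_max - w_min) / n_bins, eps);
  scale = mask ? scale : -scale;

  // Snap the scale so the dominant edge quantizes exactly.
  float edge = mask ? w_min : w_max;
  float q0 = std::rint(edge / scale);
  float bias = 0.0f;
  if (q0 != 0.0f) {
    scale = edge / q0;
    bias = edge;
  }

  q_out.resize(group_size);
  for (int j = 0; j < group_size; ++j) {
    float q = std::rint((static_cast<float>(w[j]) - bias) / scale);
    q_out[j] = static_cast<uint8_t>(std::min(std::max(q, 0.0f), n_bins));
  }

  // Only the final results go back to the (possibly low-precision) type T.
  scale_out = static_cast<T>(scale);
  bias_out = static_cast<T>(bias);
}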

3 files changed: +39, -33 lines changed

mlx/backend/cpu/quantized.cpp

Lines changed: 18 additions & 18 deletions
@@ -543,8 +543,8 @@ void quantize(
   T* scales = scales_.data<T>();
   T* biases = biases_.data<T>();
 
-  T n_bins = (1 << bits) - 1;
-  T eps = 1e-7;
+  float n_bins = (1 << bits) - 1;
+  float eps = 1e-7;
   bool power_of_2_bits = is_power_of_2(bits);
   int el_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;
   // For 3/6 bits we read 3 uint8s at a time instead of 1 uint32
@@ -554,32 +554,30 @@ void quantize(
 
   for (size_t i = 0; i < n_groups; ++i) {
     size_t w_idx = i * group_size;
-    T w_min = std::numeric_limits<float>::infinity();
-    T w_max = -w_min;
+    float w_min = std::numeric_limits<float>::infinity();
+    float w_max = -w_min;
     for (int j = 0; j < group_size; ++j) {
-      w_max = std::max(w_max, w[w_idx + j]);
-      w_min = std::min(w_min, w[w_idx + j]);
+      w_max = std::max(w_max, (float)w[w_idx + j]);
+      w_min = std::min(w_min, (float)w[w_idx + j]);
     }
     bool mask = std::abs(w_min) > std::abs(w_max);
-    T scale = std::max(T((w_max - w_min) / n_bins), eps);
+    float scale = std::max((w_max - w_min) / n_bins, eps);
     scale = mask ? scale : -scale;
 
-    auto edge = mask ? w_min : w_max;
-    auto q0 = std::rint(edge / scale);
-    if (q0 == 0) {
-      scales[i] = scale;
-      biases[i] = 0;
-    } else {
-      scales[i] = edge / q0;
-      biases[i] = edge;
+    float edge = mask ? w_min : w_max;
+    float q0 = std::rint(edge / scale);
+    float bias = 0;
+    if (q0 != 0) {
+      scale = edge / q0;
+      bias = edge;
     }
     size_t out_idx = i * int_per_group;
     for (int j = 0; j < int_per_group / bytes_per_pack; ++j) {
       uint32_t out_el = 0;
       for (int k = 0; k < el_per_int; ++k) {
-        T w_el = w[w_idx + j * el_per_int + k];
-        w_el = std::rint((w_el - biases[i]) / scales[i]);
-        w_el = std::min(std::max(w_el, T(0)), n_bins);
+        float w_el = w[w_idx + j * el_per_int + k];
+        w_el = std::rint((w_el - bias) / scale);
+        w_el = std::min(std::max(w_el, 0.0f), n_bins);
         out_el |= static_cast<uint32_t>(w_el) << (k * bits);
       }
       if (power_of_2_bits) {
@@ -590,6 +588,8 @@ void quantize(
         out[out_idx + bytes_per_pack * j + 2] = (out_el & 0xff0000) >> 16;
       }
     }
+    scales[i] = static_cast<T>(scale);
+    biases[i] = static_cast<T>(bias);
   }
 }
 
mlx/backend/metal/kernels/quantized.h

Lines changed: 12 additions & 12 deletions
@@ -2015,9 +2015,9 @@ template <typename T, const int group_size, const int bits>
     device T* biases [[buffer(3)]],
     uint2 index [[thread_position_in_grid]],
     uint2 grid_dim [[threads_per_grid]]) {
-  constexpr T eps = T(1e-7);
+  constexpr float eps = 1e-7;
   constexpr int simd_size = 32;
-  constexpr T n_bins = (1 << bits) - 1;
+  constexpr float n_bins = (1 << bits) - 1;
   constexpr int packs_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;
   constexpr int values_per_reduce = group_size / simd_size;
   constexpr int writes_per_reduce = packs_per_int / values_per_reduce;
@@ -2036,13 +2036,13 @@ template <typename T, const int group_size, const int bits>
       ? offset * writes_per_pack
       : offset * bytes_per_pack / writes_per_reduce;
 
-  T w_thread[values_per_reduce];
-  T w_min = Limits<T>::max;
-  T w_max = 0;
+  float w_thread[values_per_reduce];
+  float w_min = Limits<T>::max;
+  float w_max = 0;
 
 #pragma clang loop unroll(full)
   for (int i = 0; i < values_per_reduce; i++) {
-    T val = w[in_index + i];
+    float val = w[in_index + i];
     w_thread[i] = val;
     w_min = min(w_min, val);
     w_max = max(w_max, val);
@@ -2051,20 +2051,20 @@ template <typename T, const int group_size, const int bits>
   w_min = simd_min(w_min);
   w_max = simd_max(w_max);
 
-  T scale = max((w_max - w_min) / n_bins, eps);
+  float scale = max((w_max - w_min) / n_bins, eps);
   bool side = abs(w_min) > abs(w_max);
   scale = side ? scale : -scale;
-  T edge = side ? w_min : w_max;
-  T q0 = round(edge / scale);
+  float edge = side ? w_min : w_max;
+  float q0 = round(edge / scale);
   bool at_zero = q0 == 0.0f;
   scale = at_zero ? scale : edge / q0;
-  T bias = at_zero ? T(0) : edge;
+  float bias = at_zero ? 0 : edge;
 
   // Write out the scales and biases
   size_t gindex = in_index / group_size;
   if (in_index % group_size == 0) {
-    scales[gindex] = scale;
-    biases[gindex] = bias;
+    scales[gindex] = static_cast<T>(scale);
+    biases[gindex] = static_cast<T>(bias);
   }
 
   // We accumulate 3 bytes worth for 3/6 bit so we need a uint32_t

mlx/fast.cpp

Lines changed: 9 additions & 3 deletions
@@ -827,14 +827,17 @@ affine_quantize(const array& w, int group_size, int bits, StreamOrDevice s_) {
   auto wshape = w.shape();
   wshape.back() = -1;
 
-  array zero(0, w.dtype());
-  array n_bins((1 << bits) - 1, w.dtype()); // 2**bits - 1
-  array eps(1e-7, w.dtype());
+  array zero(0, float32);
+  array n_bins((1 << bits) - 1, float32); // 2**bits - 1
+  array eps(1e-7, float32);
 
   array packed_w = reshape(w, {-1, w.shape(-1) / group_size, group_size}, s);
 
   array w_max = max(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
   array w_min = min(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
+  w_max = astype(w_max, float32, s);
+  w_min = astype(w_min, float32, s);
+
   array mask = greater(abs(w_min, s), abs(w_max, s), s);
   array scales =
       maximum(divide(subtract(w_max, w_min, s), n_bins, s), eps, s);
@@ -845,6 +848,9 @@ affine_quantize(const array& w, int group_size, int bits, StreamOrDevice s_) {
   array biases = where(equal(q0, zero, s), zero, edge, s);
 
   packed_w = pack_and_quantize(packed_w, scales, biases, bits, s);
+
+  scales = astype(scales, w.dtype(), s);
+  biases = astype(biases, w.dtype(), s);
   return {
       reshape(packed_w, wshape, s),
       reshape(scales, wshape, s),

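The mlx/fast.cpp change applies the same pattern to the lazy graph version: the reduction outputs and the scalar constants are promoted to float32, the scale/bias math runs in float32, and the results are cast back to w.dtype() before being returned. Below is a small sketch of that upcast-compute-downcast shape, using only ops that appear in the diff above; the helper name is illustrative and it assumes the mlx::core headers and namespace.

// Sketch: compute a per-group quantization scale in float32 and return it in
// the original weight dtype. Not MLX API; just the pattern from this commit.
array fp32_group_scale(
    const array& packed_w, // shape (..., n_groups, group_size)
    const array& n_bins,   // float32 scalar: 2**bits - 1
    const array& eps,      // float32 scalar: 1e-7
    Dtype out_dtype,
    StreamOrDevice s) {
  array w_max = astype(max(packed_w, /* axis= */ -1, /* keepdims= */ true, s), float32, s);
  array w_min = astype(min(packed_w, /* axis= */ -1, /* keepdims= */ true, s), float32, s);
  array scale = maximum(divide(subtract(w_max, w_min, s), n_bins, s), eps, s);
  return astype(scale, out_dtype, s); // cast back only at the end
}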