From d366809d3833adb6a5f88d087363ce72cdb7e156 Mon Sep 17 00:00:00 2001
From: Nikhil Dev Goyal
Date: Fri, 19 Jun 2026 01:07:06 -0700
Subject: [PATCH] [MatMul] Increase reps while autotuning for SFC traversals in MatMul.

1.06x prefill time speedup over unchanged reps.

PiperOrigin-RevId: 934784015
---
 ops/matmul.cc | 13 +++++++++++--
 ops/matmul.h  |  4 ++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/ops/matmul.cc b/ops/matmul.cc
index c01943d1..2167dc80 100644
--- a/ops/matmul.cc
+++ b/ops/matmul.cc
@@ -274,8 +274,14 @@ class GenerateCandidates {
     }
 
     // We know `order` is multiple MC, where more/smaller values of `mc` are
-    // helpful, especially for two B, hence add iterations.
-    const size_t reps = 2 + num_B_;
+    // helpful, especially for two B. For SFC, smaller tile sizes ensure that
+    // a larger cluster of adjacent tiles along the space-filling curve path
+    // fits concurrently in L2/L3 cache, maximizing boundary data reuse (rows
+    // of A or cols of B) as the curve moves. Hence add more iterations.
+    size_t reps = 2 + num_B_;
+    if (IsSFC(order)) {
+      reps += 2;
+    }
     for (size_t rep = 0; rep < reps; ++rep) {
       prev = PrevDivisor(mr, prev, rounded_M, mr);
       if (prev == 0) break;  // none found
@@ -326,6 +332,9 @@ class GenerateCandidates {
     // Large L3, but its behavior and characteristics varies across platforms,
     // hence autotune a wider range of nc than the other dimensions.
     size_t reps = 9 + num_B_;
+    if (IsSFC(order)) {
+      reps += 2;
+    }
     // For small M, we can afford larger NC, hence allow fewer small options.
     if (max_M_ <= 2 * mr) reps -= 1;
 
diff --git a/ops/matmul.h b/ops/matmul.h
index 66575756..9e0d3f5d 100644
--- a/ops/matmul.h
+++ b/ops/matmul.h
@@ -419,6 +419,10 @@ static inline bool IsOneKC(MMOrder order) {
          order == MMOrder::kSFC;
 }
 
+static inline bool IsSFC(MMOrder order) {
+  return order == MMOrder::kSFC_K || order == MMOrder::kSFC;
+}
+
 static inline const char* StringFromOrder(MMOrder order) {
   switch (order) {
     case MMOrder::kNT_K: