// Patch summary: unified git diff touching ops/matmul.cc and ops/matmul.h.
// NOTE(review): the whole patch sits on one line here, so its original line
// breaks appear to have been lost; presumably it has to be re-split before a
// patch tool can consume it -- confirm against the originating commit.
//
// ops/matmul.cc (both hunks are headed `class GenerateCandidates`):
//  - @@ -274,8 +274,14 @@: `reps` changes from `const size_t reps = 2 + num_B_`
//    to a non-const `size_t` that gains 2 more iterations when `IsSFC(order)`
//    is true. `reps` bounds the loop that repeatedly calls `PrevDivisor` and
//    breaks as soon as it returns 0 ("none found"). The rewritten comment
//    states the rationale: for SFC, smaller tiles let a larger cluster of
//    adjacent tiles along the space-filling curve fit in L2/L3 at once.
//  - @@ -326,6 +332,9 @@: `reps = 9 + num_B_` likewise gains 2 when
//    `IsSFC(order)` is true; the bump is applied before the existing
//    `if (max_M_ <= 2 * mr) reps -= 1;` adjustment.
//
// ops/matmul.h:
//  - @@ -419,6 +419,10 @@: adds `static inline bool IsSFC(MMOrder order)`,
//    which returns true for `MMOrder::kSFC_K` and `MMOrder::kSFC`. It is
//    inserted after the function tail shown as context (the hunk header names
//    `IsOneKC`) and before `StringFromOrder`.
//    NOTE(review): the `MMOrder` enum is not visible in this patch; verify
//    that `kSFC_K` is declared there.
diff --git a/ops/matmul.cc b/ops/matmul.cc index c01943d1..2167dc80 100644 --- a/ops/matmul.cc +++ b/ops/matmul.cc @@ -274,8 +274,14 @@ class GenerateCandidates { } // We know `order` is multiple MC, where more/smaller values of `mc` are - // helpful, especially for two B, hence add iterations. - const size_t reps = 2 + num_B_; + // helpful, especially for two B. For SFC, smaller tile sizes ensure that + // a larger cluster of adjacent tiles along the space-filling curve path + // fits concurrently in L2/L3 cache, maximizing boundary data reuse (rows + // of A or cols of B) as the curve moves. Hence add more iterations. + size_t reps = 2 + num_B_; + if (IsSFC(order)) { + reps += 2; + } for (size_t rep = 0; rep < reps; ++rep) { prev = PrevDivisor(mr, prev, rounded_M, mr); if (prev == 0) break; // none found @@ -326,6 +332,9 @@ class GenerateCandidates { // Large L3, but its behavior and characteristics varies across platforms, // hence autotune a wider range of nc than the other dimensions. size_t reps = 9 + num_B_; + if (IsSFC(order)) { + reps += 2; + } // For small M, we can afford larger NC, hence allow fewer small options. if (max_M_ <= 2 * mr) reps -= 1; diff --git a/ops/matmul.h b/ops/matmul.h index 66575756..9e0d3f5d 100644 --- a/ops/matmul.h +++ b/ops/matmul.h @@ -419,6 +419,10 @@ static inline bool IsOneKC(MMOrder order) { order == MMOrder::kSFC; } +static inline bool IsSFC(MMOrder order) { + return order == MMOrder::kSFC_K || order == MMOrder::kSFC; +} + static inline const char* StringFromOrder(MMOrder order) { switch (order) { case MMOrder::kNT_K: