diff --git a/deps/histogram/include/hdr/hdr_histogram_version.h b/deps/histogram/include/hdr/hdr_histogram_version.h index ad6a118f5a771e..922f8fcfcda18e 100644 --- a/deps/histogram/include/hdr/hdr_histogram_version.h +++ b/deps/histogram/include/hdr/hdr_histogram_version.h @@ -7,6 +7,6 @@ #ifndef HDR_HISTOGRAM_VERSION_H #define HDR_HISTOGRAM_VERSION_H -#define HDR_HISTOGRAM_VERSION "0.11.9" +#define HDR_HISTOGRAM_VERSION "0.11.10" #endif // HDR_HISTOGRAM_VERSION_H diff --git a/deps/histogram/src/hdr_histogram.c b/deps/histogram/src/hdr_histogram.c index c620045cb174b2..b2041f8bf15545 100644 --- a/deps/histogram/src/hdr_histogram.c +++ b/deps/histogram/src/hdr_histogram.c @@ -23,6 +23,25 @@ #include HDR_MALLOC_INCLUDE +/* Prefetch hint for upcoming write access */ +#if defined(__GNUC__) || defined(__clang__) +# define HDR_PREFETCH_WRITE(addr) __builtin_prefetch((addr), 1, 3) +# define HDR_LIKELY(x) __builtin_expect(!!(x), 1) +# define HDR_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define HDR_PREFETCH_WRITE(addr) ((void)(addr)) +# define HDR_LIKELY(x) (x) +# define HDR_UNLIKELY(x) (x) +#endif + +/* Runtime-dispatched AVX2 path: keep the rest of this TU at the project's + baseline ISA so the shipped binary does not silently require AVX2. */ +#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) \ + && (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) +# define HDR_HAS_AVX2_DISPATCH 1 +# include +#endif + /* ###### ####### ## ## ## ## ######## ###### */ /* ## ## ## ## ## ## ### ## ## ## ## */ /* ## ## ## ## ## #### ## ## ## */ @@ -67,24 +86,48 @@ static int64_t counts_get_normalised(const struct hdr_histogram* h, int32_t inde static void counts_inc_normalised( struct hdr_histogram* h, int32_t index, int64_t value) { - int32_t normalised_index = normalize_index(h, index); - h->counts[normalised_index] += value; + if (HDR_LIKELY(h->normalizing_index_offset == 0)) + { + HDR_PREFETCH_WRITE(&h->counts[index]); + h->counts[index] += value; + } + else + { + int32_t normalised_index = normalize_index(h, index); + HDR_PREFETCH_WRITE(&h->counts[normalised_index]); + h->counts[normalised_index] += value; + } h->total_count += value; } static void counts_inc_normalised_atomic( struct hdr_histogram* h, int32_t index, int64_t value) { - int32_t normalised_index = normalize_index(h, index); - - hdr_atomic_add_fetch_64(&h->counts[normalised_index], value); + if (HDR_LIKELY(h->normalizing_index_offset == 0)) + { + HDR_PREFETCH_WRITE(&h->counts[index]); + hdr_atomic_add_fetch_64(&h->counts[index], value); + } + else + { + int32_t normalised_index = normalize_index(h, index); + HDR_PREFETCH_WRITE(&h->counts[normalised_index]); + hdr_atomic_add_fetch_64(&h->counts[normalised_index], value); + } hdr_atomic_add_fetch_64(&h->total_count, value); } static void update_min_max(struct hdr_histogram* h, int64_t value) { - h->min_value = (value < h->min_value && value != 0) ? value : h->min_value; - h->max_value = (value > h->max_value) ? value : h->max_value; + if (HDR_UNLIKELY(value > h->max_value)) + { + h->max_value = value; + } + + if (HDR_UNLIKELY(value != 0 && value < h->min_value)) + { + h->min_value = value; + } } static void update_min_max_atomic(struct hdr_histogram* h, int64_t value) @@ -498,7 +541,7 @@ bool hdr_record_values(struct hdr_histogram* h, int64_t value, int64_t count) } counts_index = counts_index_for(h, value); - if (counts_index < 0 || h->counts_len <= counts_index) + if ((uint32_t)counts_index >= (uint32_t)h->counts_len) { return false; } @@ -520,7 +563,7 @@ bool hdr_record_values_atomic(struct hdr_histogram* h, int64_t value, int64_t co counts_index = counts_index_for(h, value); - if (counts_index < 0 || h->counts_len <= counts_index) + if ((uint32_t)counts_index >= (uint32_t)h->counts_len) { return false; } @@ -665,22 +708,65 @@ int64_t hdr_min(const struct hdr_histogram* h) return non_zero_min(h); } -static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile) +static int64_t get_value_from_idx_up_to_count_scalar( + const struct hdr_histogram* h, int64_t count_at_percentile) { int64_t count_to_idx = 0; - - count_at_percentile = 0 < count_at_percentile ? count_at_percentile : 1; - for (int32_t idx = 0; idx < h->counts_len; idx++) - { + for (int32_t idx = 0; idx < h->counts_len; idx++) { count_to_idx += h->counts[idx]; if (count_to_idx >= count_at_percentile) - { return hdr_value_at_index(h, idx); - } } + return 0; +} +#ifdef HDR_HAS_AVX2_DISPATCH +__attribute__((target("avx2"))) +static int64_t get_value_from_idx_up_to_count_avx2( + const struct hdr_histogram* h, int64_t count_at_percentile) +{ + int64_t running = 0; + int32_t idx = 0; + const int32_t limit = h->counts_len & ~3; + + for (; idx < limit; idx += 4) { + __m256i v = _mm256_loadu_si256((const __m256i*)&h->counts[idx]); + __m128i lo = _mm256_castsi256_si128(v); + __m128i hi = _mm256_extracti128_si256(v, 1); + __m128i s = _mm_add_epi64(lo, hi); + /* Lanes are non-negative counts whose total fits in int64_t (total_count + invariant), so the chunk sum cannot overflow under valid state. Use + unsigned add to avoid signed-overflow UB if invariants are violated. */ + int64_t chunk = (int64_t)((uint64_t)_mm_extract_epi64(s, 0) + + (uint64_t)_mm_extract_epi64(s, 1)); + + if (__builtin_expect(running + chunk >= count_at_percentile, 0)) { + for (int32_t j = idx; j < idx + 4; j++) { + running += h->counts[j]; + if (running >= count_at_percentile) + return hdr_value_at_index(h, j); + } + } + running += chunk; + } + for (; idx < h->counts_len; idx++) { + running += h->counts[idx]; + if (running >= count_at_percentile) + return hdr_value_at_index(h, idx); + } return 0; } +#endif + +static int64_t get_value_from_idx_up_to_count(const struct hdr_histogram* h, int64_t count_at_percentile) +{ + count_at_percentile = count_at_percentile > 0 ? count_at_percentile : 1; +#ifdef HDR_HAS_AVX2_DISPATCH + if (__builtin_cpu_supports("avx2")) + return get_value_from_idx_up_to_count_avx2(h, count_at_percentile); +#endif + return get_value_from_idx_up_to_count_scalar(h, count_at_percentile); +} int64_t hdr_value_at_percentile(const struct hdr_histogram* h, double percentile)