From be394a96c7bd692a66f0a44f1522136e1be52e39 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sun, 23 Jun 2024 18:18:44 +0800 Subject: [PATCH] perf: optimize ShortCode conversion --- src/core/benchmark/codec.cc | 34 +++++++++++++++--- src/core/common_code/common_code.h | 9 +++++ src/core/short_code/internal/convert.cc | 38 +++++++-------------- src/core/short_code/internal/short_code.inl | 31 ++++------------- src/core/short_code/short_code.h | 15 +++++--- src/core/utils/utility.h | 3 ++ 6 files changed, 70 insertions(+), 60 deletions(-) diff --git a/src/core/benchmark/codec.cc b/src/core/benchmark/codec.cc index c19307e..d98f508 100644 --- a/src/core/benchmark/codec.cc +++ b/src/core/benchmark/codec.cc @@ -197,8 +197,8 @@ static void ShortCodeDeserialize(benchmark::State &state) { static void ShortCodeToCommonCode(benchmark::State &state) { - // ShortCode::speed_up(true); - ShortCode::speed_up(false); + ShortCode::speed_up(true); + // ShortCode::speed_up(false); // ShortCode::fast_decode(4091296); @@ -206,7 +206,9 @@ static void ShortCodeToCommonCode(benchmark::State &state) { for (auto _ : state) { - volatile auto kk = short_code.to_common_code(); + // volatile auto kk = short_code.to_common_code(); + + benchmark::DoNotOptimize(short_code.to_common_code()); // if (AllCases::instance().is_available()) { // if (ShortCode::stage_ == ShortCode::Stage::FAST) { @@ -218,6 +220,29 @@ static void ShortCodeToCommonCode(benchmark::State &state) { } +static void CommonCodeToShortCode(benchmark::State &state) { + ShortCode::speed_up(true); + // ShortCode::speed_up(false); + + auto common_code = CommonCode::unsafe_create(0x1A9BF0C00); + + // std::vector samples; + // for (auto code : common_code_samples(256)) { + // samples.emplace_back(CommonCode::unsafe_create(code)); + // } + + for (auto _ : state) { + + // for (auto common_code : samples) { + // volatile auto kk = ShortCode(common_code); + + benchmark::DoNotOptimize(ShortCode(common_code)); + + // } + + } +} + // BENCHMARK(CommonCodeSerialize)->Range(8, 256); // BENCHMARK(CommonCodeDeserialize)->Range(8, 256); // BENCHMARK(CommonCodeSerializeShorten)->Range(8, 256); @@ -226,7 +251,8 @@ static void ShortCodeToCommonCode(benchmark::State &state) { // BENCHMARK(ShortCodeSerialize)->Range(8, 256); // BENCHMARK(ShortCodeDeserialize)->Range(8, 256); -BENCHMARK(ShortCodeToCommonCode); +// BENCHMARK(ShortCodeToCommonCode); +BENCHMARK(CommonCodeToShortCode); // static void CommonCodeDecode(benchmark::State &state) { // const auto tmp = str_common_codes(state.range(0)); diff --git a/src/core/common_code/common_code.h b/src/core/common_code/common_code.h index 43d4e32..fba9aa6 100644 --- a/src/core/common_code/common_code.h +++ b/src/core/common_code/common_code.h @@ -166,6 +166,15 @@ private: // ------------------------------------------------------------------------------------- // }; +// Ref: https://cplusplus.github.io/CWG/issues/1734.html + +// TODO: By definition, the default constructor is deleted, it is not trivial. +// TODO: But in clang and g++, it is legal, but in msvc it fails. +static_assert(std::is_trivial_v); + +static_assert(std::is_standard_layout_v); +static_assert(std::is_trivially_copyable_v); + } // namespace klotski::codec #include "internal/common_code.inl" diff --git a/src/core/short_code/internal/convert.cc b/src/core/short_code/internal/convert.cc index 2e6f4d9..0b87697 100644 --- a/src/core/short_code/internal/convert.cc +++ b/src/core/short_code/internal/convert.cc @@ -50,18 +50,10 @@ static uint32_t check_range(uint32_t head, uint32_t range) noexcept { return 0; // pass check } -std::mutex ShortCode::busy_ {}; - -// ShortCode::Stage ShortCode::stage_ = Stage::UNINIT; - -// const klotski::cases::RangesUnion *ShortCode::cases_ = &AllCases::instance().fetch(); -const klotski::cases::RangesUnion *ShortCode::cases_ = nullptr; - -const klotski::cases::Ranges *ShortCode::ranges_ = nullptr; - uint32_t ShortCode::fast_encode(uint64_t common_code) { auto head = common_code >> 32; - auto &ranges = AllCases::instance().fetch()[head]; // match available ranges + auto &ranges = (*cases_)[head]; // match available ranges + // TODO: try to narrow the scope by prefix auto target = std::lower_bound(ranges.begin(), ranges.end(), (uint32_t)common_code); return ALL_CASES_OFFSET[head] + (target - ranges.begin()); } @@ -79,19 +71,22 @@ uint32_t ShortCode::tiny_encode(uint64_t common_code) { uint32_t offset = 0; auto index = RANGES_GLOBAL_OFFSET[prefix]; - const auto &basic_ranges = BasicRanges::instance().fetch(); + + ranges_ = &cases::BasicRanges::instance().fetch(); + const auto &ranges = *ranges_; + auto target = (uint32_t)common_code; // target range - for (; index < basic_ranges.size(); ++index) { - auto broken_offset = check_range(head, range_reverse(basic_ranges[index])); + for (; index < ranges.size(); ++index) { + auto broken_offset = check_range(head, range_reverse(ranges[index])); if (!broken_offset) { // valid case - if (basic_ranges[index] == target) { + if (ranges[index] == target) { break; // found target range } ++offset; // record sub offset } else { auto delta = (uint32_t)1 << (32 - broken_offset * 2); // delta to next possible range - auto next_min = (basic_ranges[index] & ~(delta - 1)) + delta; - while (basic_ranges[++index] < next_min); // located next range + auto next_min = (ranges[index] & ~(delta - 1)) + delta; + while (ranges[++index] < next_min); // located next range --index; } } @@ -99,13 +94,6 @@ uint32_t ShortCode::tiny_encode(uint64_t common_code) { } uint64_t ShortCode::tiny_decode(uint32_t short_code) { // short code --> common code - // speed_up(false); - - // std::lock_guard guard {busy_}; - - ranges_ = &cases::BasicRanges::instance().fetch(); - // stage_ = Stage::TINY; - auto offset_ = std::upper_bound(ALL_CASES_OFFSET.begin(), ALL_CASES_OFFSET.end(), short_code) - 1; auto head = offset_ - ALL_CASES_OFFSET.begin(); // head index short_code -= *offset_; @@ -116,11 +104,9 @@ uint64_t ShortCode::tiny_decode(uint32_t short_code) { // short code --> common /// search for target range auto index = RANGES_GLOBAL_OFFSET[prefix]; - // auto basic_ranges = ranges_; - // const auto &basic_ranges = BasicRanges::instance().fetch(); + ranges_ = &cases::BasicRanges::instance().fetch(); const auto &ranges = *ranges_; - // const auto &ranges = BasicRanges::instance().fetch(); for (; index < ranges.size(); ++index) { // traverse basic ranges auto broken_offset = check_range(head, range_reverse(ranges[index])); diff --git a/src/core/short_code/internal/short_code.inl b/src/core/short_code/internal/short_code.inl index 5f04047..e3d2714 100644 --- a/src/core/short_code/internal/short_code.inl +++ b/src/core/short_code/internal/short_code.inl @@ -9,8 +9,7 @@ namespace klotski::codec { // ------------------------------------------------------------------------------------- // inline ShortCode::ShortCode(const CommonCode common_code) { - // TODO: test the affect of CPU branch prediction. - if (cases::AllCases::instance().is_available()) { + if (fast_) { code_ = fast_encode(common_code.unwrap()); } else { code_ = tiny_encode(common_code.unwrap()); @@ -39,19 +38,12 @@ inline bool ShortCode::check(const uint32_t short_code) { } inline void ShortCode::speed_up(const bool fast_mode) { - // TODO: keep one way change. + ranges_ = &cases::BasicRanges::instance().fetch(); if (fast_mode) { - // cases::AllCases::instance().build(); std::lock_guard guard {busy_}; cases_ = &cases::AllCases::instance().fetch(); - stage_ = Stage::FAST; - } else { - std::lock_guard guard {busy_}; - - // TODO: skip if stage_ is FAST - - ranges_ = &cases::BasicRanges::instance().fetch(); - stage_ = Stage::TINY; + KLSK_MEM_BARRIER; + fast_ = true; } } @@ -73,21 +65,10 @@ inline std::string ShortCode::to_string() const { } inline CommonCode ShortCode::to_common_code() const { - // TODO: test the affect of CPU branch prediction. - // if (cases::AllCases::instance().is_available()) { - // return CommonCode::unsafe_create(fast_decode(code_)); - // } - // return CommonCode::unsafe_create(tiny_decode(code_)); - - switch (stage_) { - case Stage::UNINIT: - // TODO: do speed up - // speed_up(false); // FIXME: slow about 3% - case Stage::TINY: - return CommonCode::unsafe_create(tiny_decode(code_)); - case Stage::FAST: + if (fast_) { return CommonCode::unsafe_create(fast_decode(code_)); } + return CommonCode::unsafe_create(tiny_decode(code_)); } // ----------------------------------------------------------------------------------------- // diff --git a/src/core/short_code/short_code.h b/src/core/short_code/short_code.h index b39efa5..39834fd 100644 --- a/src/core/short_code/short_code.h +++ b/src/core/short_code/short_code.h @@ -178,14 +178,19 @@ private: // ------------------------------------------------------------------------------------- // - enum class Stage { UNINIT, TINY, FAST }; + /// Whether fast mode is available. + static inline bool fast_ {false}; - static std::mutex busy_; + /// Mutex for protecting critical section. + static inline std::mutex busy_ {}; - static inline auto stage_ {Stage::UNINIT}; + /// Static pointer to klotski AllCases data. + static inline const cases::RangesUnion *cases_ {nullptr}; - static const cases::Ranges *ranges_; - static const cases::RangesUnion *cases_; + /// Static pointer to klotski BasicRanges data. + static inline std::atomic ranges_ {nullptr}; + + // ------------------------------------------------------------------------------------- // }; } // namespace klotski::codec diff --git a/src/core/utils/utility.h b/src/core/utils/utility.h index 9f19b13..67995b2 100644 --- a/src/core/utils/utility.h +++ b/src/core/utils/utility.h @@ -25,6 +25,9 @@ /// Force function declaration to be inline. #define KLSK_INLINE __attribute__ ((always_inline)) +/// Prevent reordering for both compiler and processor. +#define KLSK_MEM_BARRIER std::atomic_thread_fence(std::memory_order_seq_cst) + namespace klotski { /// Calculate the sum of an array of integers.