Browse Source

perf: optimize ShortCode conversion

legacy
Dnomd343 6 months ago
parent
commit
be394a96c7
  1. 34
      src/core/benchmark/codec.cc
  2. 9
      src/core/common_code/common_code.h
  3. 38
      src/core/short_code/internal/convert.cc
  4. 31
      src/core/short_code/internal/short_code.inl
  5. 15
      src/core/short_code/short_code.h
  6. 3
      src/core/utils/utility.h

34
src/core/benchmark/codec.cc

@ -197,8 +197,8 @@ static void ShortCodeDeserialize(benchmark::State &state) {
static void ShortCodeToCommonCode(benchmark::State &state) { static void ShortCodeToCommonCode(benchmark::State &state) {
// ShortCode::speed_up(true); ShortCode::speed_up(true);
ShortCode::speed_up(false); // ShortCode::speed_up(false);
// ShortCode::fast_decode(4091296); // ShortCode::fast_decode(4091296);
@ -206,7 +206,9 @@ static void ShortCodeToCommonCode(benchmark::State &state) {
for (auto _ : state) { for (auto _ : state) {
volatile auto kk = short_code.to_common_code(); // volatile auto kk = short_code.to_common_code();
benchmark::DoNotOptimize(short_code.to_common_code());
// if (AllCases::instance().is_available()) { // if (AllCases::instance().is_available()) {
// if (ShortCode::stage_ == ShortCode::Stage::FAST) { // if (ShortCode::stage_ == ShortCode::Stage::FAST) {
@ -218,6 +220,29 @@ static void ShortCodeToCommonCode(benchmark::State &state) {
} }
/// Benchmark: time the construction of a ShortCode from a CommonCode.
///
/// A single fixed CommonCode value is converted on every iteration. The
/// commented-out lines are an alternative setup that would convert a batch
/// of sampled codes per iteration instead.
static void CommonCodeToShortCode(benchmark::State &state) {
// Request fast mode once, before the timed loop, so its setup is not measured.
ShortCode::speed_up(true);
// ShortCode::speed_up(false);
// Fixed input; built with unsafe_create (presumably skips validation — confirm).
auto common_code = CommonCode::unsafe_create(0x1A9BF0C00);
// std::vector<CommonCode> samples;
// for (auto code : common_code_samples(256)) {
// samples.emplace_back(CommonCode::unsafe_create(code));
// }
for (auto _ : state) {
// for (auto common_code : samples) {
// volatile auto kk = ShortCode(common_code);
// DoNotOptimize keeps the compiler from discarding the otherwise unused result.
benchmark::DoNotOptimize(ShortCode(common_code));
// }
}
}
// BENCHMARK(CommonCodeSerialize)->Range(8, 256); // BENCHMARK(CommonCodeSerialize)->Range(8, 256);
// BENCHMARK(CommonCodeDeserialize)->Range(8, 256); // BENCHMARK(CommonCodeDeserialize)->Range(8, 256);
// BENCHMARK(CommonCodeSerializeShorten)->Range(8, 256); // BENCHMARK(CommonCodeSerializeShorten)->Range(8, 256);
@ -226,7 +251,8 @@ static void ShortCodeToCommonCode(benchmark::State &state) {
// BENCHMARK(ShortCodeSerialize)->Range(8, 256); // BENCHMARK(ShortCodeSerialize)->Range(8, 256);
// BENCHMARK(ShortCodeDeserialize)->Range(8, 256); // BENCHMARK(ShortCodeDeserialize)->Range(8, 256);
BENCHMARK(ShortCodeToCommonCode); // BENCHMARK(ShortCodeToCommonCode);
BENCHMARK(CommonCodeToShortCode);
// static void CommonCodeDecode(benchmark::State &state) { // static void CommonCodeDecode(benchmark::State &state) {
// const auto tmp = str_common_codes(state.range(0)); // const auto tmp = str_common_codes(state.range(0));

9
src/core/common_code/common_code.h

@ -166,6 +166,15 @@ private:
// ------------------------------------------------------------------------------------- // // ------------------------------------------------------------------------------------- //
}; };
// Ref: https://cplusplus.github.io/CWG/issues/1734.html
// TODO: By definition, a class whose default constructor is deleted is not trivial.
// TODO: clang and g++ nevertheless accept the `is_trivial` assertion below, while MSVC rejects it.
static_assert(std::is_trivial_v<CommonCode>);
static_assert(std::is_standard_layout_v<CommonCode>);
static_assert(std::is_trivially_copyable_v<CommonCode>);
} // namespace klotski::codec } // namespace klotski::codec
#include "internal/common_code.inl" #include "internal/common_code.inl"

38
src/core/short_code/internal/convert.cc

@ -50,18 +50,10 @@ static uint32_t check_range(uint32_t head, uint32_t range) noexcept {
return 0; // pass check return 0; // pass check
} }
std::mutex ShortCode::busy_ {};
// ShortCode::Stage ShortCode::stage_ = Stage::UNINIT;
// const klotski::cases::RangesUnion *ShortCode::cases_ = &AllCases::instance().fetch();
const klotski::cases::RangesUnion *ShortCode::cases_ = nullptr;
const klotski::cases::Ranges *ShortCode::ranges_ = nullptr;
uint32_t ShortCode::fast_encode(uint64_t common_code) { uint32_t ShortCode::fast_encode(uint64_t common_code) {
auto head = common_code >> 32; auto head = common_code >> 32;
auto &ranges = AllCases::instance().fetch()[head]; // match available ranges auto &ranges = (*cases_)[head]; // match available ranges
// TODO: try to narrow the scope by prefix
auto target = std::lower_bound(ranges.begin(), ranges.end(), (uint32_t)common_code); auto target = std::lower_bound(ranges.begin(), ranges.end(), (uint32_t)common_code);
return ALL_CASES_OFFSET[head] + (target - ranges.begin()); return ALL_CASES_OFFSET[head] + (target - ranges.begin());
} }
@ -79,19 +71,22 @@ uint32_t ShortCode::tiny_encode(uint64_t common_code) {
uint32_t offset = 0; uint32_t offset = 0;
auto index = RANGES_GLOBAL_OFFSET[prefix]; auto index = RANGES_GLOBAL_OFFSET[prefix];
const auto &basic_ranges = BasicRanges::instance().fetch();
ranges_ = &cases::BasicRanges::instance().fetch();
const auto &ranges = *ranges_;
auto target = (uint32_t)common_code; // target range auto target = (uint32_t)common_code; // target range
for (; index < basic_ranges.size(); ++index) { for (; index < ranges.size(); ++index) {
auto broken_offset = check_range(head, range_reverse(basic_ranges[index])); auto broken_offset = check_range(head, range_reverse(ranges[index]));
if (!broken_offset) { // valid case if (!broken_offset) { // valid case
if (basic_ranges[index] == target) { if (ranges[index] == target) {
break; // found target range break; // found target range
} }
++offset; // record sub offset ++offset; // record sub offset
} else { } else {
auto delta = (uint32_t)1 << (32 - broken_offset * 2); // delta to next possible range auto delta = (uint32_t)1 << (32 - broken_offset * 2); // delta to next possible range
auto next_min = (basic_ranges[index] & ~(delta - 1)) + delta; auto next_min = (ranges[index] & ~(delta - 1)) + delta;
while (basic_ranges[++index] < next_min); // located next range while (ranges[++index] < next_min); // located next range
--index; --index;
} }
} }
@ -99,13 +94,6 @@ uint32_t ShortCode::tiny_encode(uint64_t common_code) {
} }
uint64_t ShortCode::tiny_decode(uint32_t short_code) { // short code --> common code uint64_t ShortCode::tiny_decode(uint32_t short_code) { // short code --> common code
// speed_up(false);
// std::lock_guard guard {busy_};
ranges_ = &cases::BasicRanges::instance().fetch();
// stage_ = Stage::TINY;
auto offset_ = std::upper_bound(ALL_CASES_OFFSET.begin(), ALL_CASES_OFFSET.end(), short_code) - 1; auto offset_ = std::upper_bound(ALL_CASES_OFFSET.begin(), ALL_CASES_OFFSET.end(), short_code) - 1;
auto head = offset_ - ALL_CASES_OFFSET.begin(); // head index auto head = offset_ - ALL_CASES_OFFSET.begin(); // head index
short_code -= *offset_; short_code -= *offset_;
@ -116,11 +104,9 @@ uint64_t ShortCode::tiny_decode(uint32_t short_code) { // short code --> common
/// search for target range /// search for target range
auto index = RANGES_GLOBAL_OFFSET[prefix]; auto index = RANGES_GLOBAL_OFFSET[prefix];
// auto basic_ranges = ranges_;
// const auto &basic_ranges = BasicRanges::instance().fetch();
ranges_ = &cases::BasicRanges::instance().fetch();
const auto &ranges = *ranges_; const auto &ranges = *ranges_;
// const auto &ranges = BasicRanges::instance().fetch();
for (; index < ranges.size(); ++index) { // traverse basic ranges for (; index < ranges.size(); ++index) { // traverse basic ranges
auto broken_offset = check_range(head, range_reverse(ranges[index])); auto broken_offset = check_range(head, range_reverse(ranges[index]));

31
src/core/short_code/internal/short_code.inl

@ -9,8 +9,7 @@ namespace klotski::codec {
// ------------------------------------------------------------------------------------- // // ------------------------------------------------------------------------------------- //
inline ShortCode::ShortCode(const CommonCode common_code) { inline ShortCode::ShortCode(const CommonCode common_code) {
// TODO: test the affect of CPU branch prediction. if (fast_) {
if (cases::AllCases::instance().is_available()) {
code_ = fast_encode(common_code.unwrap()); code_ = fast_encode(common_code.unwrap());
} else { } else {
code_ = tiny_encode(common_code.unwrap()); code_ = tiny_encode(common_code.unwrap());
@ -39,19 +38,12 @@ inline bool ShortCode::check(const uint32_t short_code) {
} }
inline void ShortCode::speed_up(const bool fast_mode) { inline void ShortCode::speed_up(const bool fast_mode) {
// TODO: keep one way change. ranges_ = &cases::BasicRanges::instance().fetch();
if (fast_mode) { if (fast_mode) {
// cases::AllCases::instance().build();
std::lock_guard guard {busy_}; std::lock_guard guard {busy_};
cases_ = &cases::AllCases::instance().fetch(); cases_ = &cases::AllCases::instance().fetch();
stage_ = Stage::FAST; KLSK_MEM_BARRIER;
} else { fast_ = true;
std::lock_guard guard {busy_};
// TODO: skip if stage_ is FAST
ranges_ = &cases::BasicRanges::instance().fetch();
stage_ = Stage::TINY;
} }
} }
@ -73,21 +65,10 @@ inline std::string ShortCode::to_string() const {
} }
inline CommonCode ShortCode::to_common_code() const { inline CommonCode ShortCode::to_common_code() const {
// TODO: test the affect of CPU branch prediction. if (fast_) {
// if (cases::AllCases::instance().is_available()) {
// return CommonCode::unsafe_create(fast_decode(code_));
// }
// return CommonCode::unsafe_create(tiny_decode(code_));
switch (stage_) {
case Stage::UNINIT:
// TODO: do speed up
// speed_up(false); // FIXME: slow about 3%
case Stage::TINY:
return CommonCode::unsafe_create(tiny_decode(code_));
case Stage::FAST:
return CommonCode::unsafe_create(fast_decode(code_)); return CommonCode::unsafe_create(fast_decode(code_));
} }
return CommonCode::unsafe_create(tiny_decode(code_));
} }
// ----------------------------------------------------------------------------------------- // // ----------------------------------------------------------------------------------------- //

15
src/core/short_code/short_code.h

@ -178,14 +178,19 @@ private:
// ------------------------------------------------------------------------------------- // // ------------------------------------------------------------------------------------- //
enum class Stage { UNINIT, TINY, FAST }; /// Whether fast mode is available.
static inline bool fast_ {false};
static std::mutex busy_; /// Mutex for protecting critical section.
static inline std::mutex busy_ {};
static inline auto stage_ {Stage::UNINIT}; /// Static pointer to klotski AllCases data.
static inline const cases::RangesUnion *cases_ {nullptr};
static const cases::Ranges *ranges_; /// Static pointer to klotski BasicRanges data.
static const cases::RangesUnion *cases_; static inline std::atomic<const cases::Ranges*> ranges_ {nullptr};
// ------------------------------------------------------------------------------------- //
}; };
} // namespace klotski::codec } // namespace klotski::codec

3
src/core/utils/utility.h

@ -25,6 +25,9 @@
/// Force function declaration to be inline. /// Force function declaration to be inline.
#define KLSK_INLINE __attribute__ ((always_inline)) #define KLSK_INLINE __attribute__ ((always_inline))
/// Prevent reordering for both compiler and processor.
#define KLSK_MEM_BARRIER std::atomic_thread_fence(std::memory_order_seq_cst)
namespace klotski { namespace klotski {
/// Calculate the sum of an array of integers. /// Calculate the sum of an array of integers.

Loading…
Cancel
Save