From 986bae9f8cacb442379b53493c10bccc5bbabcb2 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sat, 1 Jun 2024 15:12:59 +0800 Subject: [PATCH] perf: optimize BasicRanges calculation --- src/core/all_cases/all_cases.h | 37 +---- src/core/all_cases/internal/all_cases.cc | 2 +- src/core/all_cases/internal/basic_ranges.cc | 143 +++++++++++-------- src/core/all_cases/internal/basic_ranges.inl | 11 -- src/core/all_cases/internal/constant.inl | 57 ++++++++ src/core/benchmark/group.cc | 13 +- src/core/main.cc | 43 ++---- src/core/ranges/ranges.h | 2 + src/core/utils/utility.h | 50 ++++--- src/core/utils/worker.inl | 10 +- 10 files changed, 205 insertions(+), 163 deletions(-) create mode 100644 src/core/all_cases/internal/constant.inl diff --git a/src/core/all_cases/all_cases.h b/src/core/all_cases/all_cases.h index d718267..8b6d804 100644 --- a/src/core/all_cases/all_cases.h +++ b/src/core/all_cases/all_cases.h @@ -37,45 +37,25 @@ #pragma once -#include #include -#include #include "utils/utility.h" #include "ranges/ranges.h" +#include "internal/constant.inl" namespace klotski::cases { // ------------------------------------------------------------------------------------- // -typedef std::array RangesUnion; - -// ------------------------------------------------------------------------------------- // - -constexpr auto BASIC_RANGES_NUM = 7311885; - -constexpr std::array ALL_CASES_NUM { - 2942906, 2260392, 2942906, 0, - 2322050, 1876945, 2322050, 0, - 2322050, 1876945, 2322050, 0, - 2942906, 2260392, 2942906, 0, -}; - -// TODO: move to short_code namespace (also `numeric` header) -constexpr auto ALL_CASES_NUM_ = std::accumulate( - ALL_CASES_NUM.begin(), ALL_CASES_NUM.end(), 0); - -// ------------------------------------------------------------------------------------- // - class BasicRanges { public: - /// Execute the build process and ensure thread safety. + /// Execute the build process. void build(); - /// Execute the build process in parallel without blocking. + /// Execute the build process without blocking. void build_async(Executor &&executor, Notifier &&callback); - /// Get the basic-ranges and make sure the result is available. + /// Get basic-ranges and make sure the result is available. const Ranges& fetch(); /// Determine whether the basic-ranges data is available. @@ -88,9 +68,6 @@ private: /// Get static singleton variable. static Ranges& get_ranges(); - /// Search and sort all possible basic-ranges permutations. - static void build_ranges(Ranges &ranges); - KLSK_INSTANCE(BasicRanges) }; @@ -98,11 +75,11 @@ private: class AllCases { public: - /// Execute the build process and ensure thread safety. + /// Execute the build process. void build(); - /// Execute the build process in parallel without blocking. - void build_parallel_async(Executor &&executor, Notifier &&callback); + /// Execute the build process without blocking. + void build_async(Executor &&executor, Notifier &&callback); /// Get all-cases and make sure the result is available. const RangesUnion& fetch(); diff --git a/src/core/all_cases/internal/all_cases.cc b/src/core/all_cases/internal/all_cases.cc index d9e056e..83f01bc 100644 --- a/src/core/all_cases/internal/all_cases.cc +++ b/src/core/all_cases/internal/all_cases.cc @@ -100,7 +100,7 @@ void AllCases::build() { available_ = true; } -void AllCases::build_parallel_async(Executor &&executor, Notifier &&callback) { +void AllCases::build_async(Executor &&executor, Notifier &&callback) { if (available_) { callback(); return; // reduce consumption of mutex diff --git a/src/core/all_cases/internal/basic_ranges.cc b/src/core/all_cases/internal/basic_ranges.cc index 94e4280..2d719e3 100644 --- a/src/core/all_cases/internal/basic_ranges.cc +++ b/src/core/all_cases/internal/basic_ranges.cc @@ -1,21 +1,23 @@ #include #include -#include "ranges/ranges.h" +#include "group/group.h" #include "all_cases/all_cases.h" using klotski::cases::Ranges; using klotski::cases::BasicRanges; +using klotski::cases::TYPE_ID_LIMIT; +typedef Ranges::iterator RangesIter; typedef std::tuple RangeType; -typedef std::array RangeTypeUnion; +typedef std::array RangeTypeUnion; /// Generate all possible basic-ranges permutations. consteval static RangeTypeUnion range_types() { RangeTypeUnion data; for (int i = 0, n = 0; n <= 7; ++n) { // 1x2 + 2x1 -> 0 ~ 7 for (int n_2x1 = 0; n_2x1 <= n; ++n_2x1) { // 2x1 -> 0 ~ n - if (n == 7 && n_2x1 == 7) { + if (n_2x1 == 7) { break; } for (int n_1x1 = 0; n_1x1 <= (14 - n * 2); ++n_1x1) { // 1x1 -> 0 ~ (14 - 2n) @@ -27,8 +29,8 @@ consteval static RangeTypeUnion range_types() { } /// Combine two consecutive sorted arrays into one sorted arrays. -static void inplace_merge(Ranges::iterator begin, Ranges::iterator mid, const Ranges::iterator end) { - std::vector tmp = {begin, mid}; // left array backup +static void inplace_merge(RangesIter begin, RangesIter mid, const RangesIter end) { + std::vector tmp {begin, mid}; // left array backup for (auto p = tmp.begin();;) { if (*p <= *mid) { *(begin++) = *(p++); // stored in original span @@ -44,88 +46,103 @@ static void inplace_merge(Ranges::iterator begin, Ranges::iterator mid, const Ra } } -void BasicRanges::build_ranges(Ranges &ranges) { - ranges.clear(); - ranges.reserve(BASIC_RANGES_NUM); +void BasicRanges::build() { + if (available_) { + return; // reduce consumption of mutex + } + std::lock_guard guard {building_}; + if (available_) { + return; // data is already available + } - std::list flags {ranges.begin()}; // TODO: flags can be constexpr + auto &ranges = get_ranges(); + ranges.clear(); + ranges.reserve(BASIC_RANGES_NUM_); for (auto [n, n_2x1, n_1x1] : range_types()) { ranges.spawn(n, n_2x1, n_1x1); - flags.emplace_back(ranges.end()); // mark ordered interval } + std::list points; // mark ordered interval + for (const auto offset : to_offset(BASIC_RANGES_NUM, 0)) { + points.emplace_back(ranges.begin() + offset); + } + points.emplace_back(ranges.end()); + do { - decltype(flags.begin()) begin = flags.begin(), mid, end; - while (++(mid = begin) != flags.end() && ++(end = mid) != flags.end()) { + decltype(points)::iterator begin = points.begin(), mid, end; + while (++(mid = begin) != points.end() && ++(end = mid) != points.end()) { inplace_merge(*begin, *mid, *end); // merge two ordered interval - flags.erase(mid); + points.erase(mid); begin = end; } - } while (flags.size() > 2); // merge until only one interval remains -} - -void do_sort(klotski::Executor &&executor, klotski::Notifier notifier, std::shared_ptr> flags) { - - klotski::Worker worker {std::move(executor)}; - - decltype(flags->begin()) begin = flags->begin(), mid, end; - while (++(mid = begin) != flags->end() && ++(end = mid) != flags->end()) { - - worker.post([begin = *begin, mid = *mid, end = *end]() { - inplace_merge(begin, mid, end); // merge two ordered interval - }); - - flags->erase(mid); - begin = end; - } - - worker.then([flags, notifier](klotski::Executor &&executor) { - - if (flags->size() == 2) { - notifier(); - return; - } - - do_sort(std::move(executor), notifier, flags); - - }); + } while (points.size() > 2); // merge until only one interval remains + available_ = true; } void BasicRanges::build_async(Executor &&executor, Notifier &&callback) { + if (available_) { + callback(); + return; // reduce consumption of mutex + } + building_.lock(); + if (available_) { + building_.unlock(); + callback(); + return; // data is already available + } - // TODO: add mutex protect here - - Worker worker {std::move(executor)}; - auto cache = std::make_shared>(); - - for (uint32_t i = 0; i < 203; ++i) { + auto all_done = [this, callback = std::move(callback)] { + available_ = true; + building_.unlock(); + callback(); + }; + + Worker worker {executor}; + auto cache = std::make_shared>(); + for (uint32_t i = 0; i < TYPE_ID_LIMIT; ++i) { + (*cache)[i].reserve(BASIC_RANGES_NUM[i]); worker.post([cache, i] { auto [n, n_2x1, n_1x1] = range_types()[i]; - cache->operator[](i).spawn(n, n_2x1, n_1x1); + (*cache)[i].spawn(n, n_2x1, n_1x1); }); } - // auto all_done = std::make_shared(std::move(callback)); - - worker.then([cache, this, callback](Executor &&executor) { - + worker.then([cache, all_done = std::move(all_done), executor = std::move(executor)] mutable { auto &ranges = get_ranges(); - ranges.clear(); - ranges.reserve(BASIC_RANGES_NUM); - - const auto flags = std::make_shared>(); - flags->emplace_back(ranges.end()); - - for (auto &tmp : *cache) { + ranges.reserve(BASIC_RANGES_NUM_); + for (auto &&tmp : *cache) { ranges.insert(ranges.end(), tmp.begin(), tmp.end()); - flags->emplace_back(ranges.end()); // mark ordered interval } - do_sort(std::move(executor), callback, flags); - - available_ = true; + auto points = std::make_shared>(); // mark ordered interval + for (const auto offset : to_offset(BASIC_RANGES_NUM, 0)) { + points->emplace_back(ranges.begin() + offset); + } + points->emplace_back(ranges.end()); + + auto inner_sort = [points, all_done, executor = std::move(executor)](auto &&self) -> void { + Worker sorter {executor}; + + auto begin = points->begin(); + decltype(begin) mid, end; + while (++(mid = begin) != points->end() && ++(end = mid) != points->end()) { + sorter.post([begin = *begin, mid = *mid, end = *end] { + inplace_merge(begin, mid, end); // merge two ordered interval + }); + points->erase(mid); + begin = end; + } + sorter.then([self, points, all_done] { + if (points->size() == 2) { + all_done(); + return; + } + self(self); // next sort round + }); + }; + inner_sort(inner_sort); // TODO: using `this auto &&self` in new version }); } diff --git a/src/core/all_cases/internal/basic_ranges.inl b/src/core/all_cases/internal/basic_ranges.inl index e7b6df5..d7850a3 100644 --- a/src/core/all_cases/internal/basic_ranges.inl +++ b/src/core/all_cases/internal/basic_ranges.inl @@ -14,17 +14,6 @@ inline const Ranges& BasicRanges::fetch() { return get_ranges(); } -inline void BasicRanges::build() { - if (available_) { - return; // reduce consumption of mutex - } - std::lock_guard guard {building_}; - if (!available_) { - build_ranges(get_ranges()); - available_ = true; - } -} - inline bool BasicRanges::is_available() const { return available_; // no mutex required in one-way state } diff --git a/src/core/all_cases/internal/constant.inl b/src/core/all_cases/internal/constant.inl new file mode 100644 index 0000000..2dc773f --- /dev/null +++ b/src/core/all_cases/internal/constant.inl @@ -0,0 +1,57 @@ +#pragma once + +#include + +namespace klotski::cases { + +// ------------------------------------------------------------------------------------- // + +constexpr auto ALL_CASES_NUM = std::to_array({ + 2942906, 2260392, 2942906, 0, + 2322050, 1876945, 2322050, 0, + 2322050, 1876945, 2322050, 0, + 2942906, 2260392, 2942906, 0, +}); + +static_assert(ALL_CASES_NUM.size() == 16); + +constexpr auto ALL_CASES_NUM_ = array_sum(ALL_CASES_NUM); + +// ------------------------------------------------------------------------------------- // + +constexpr auto BASIC_RANGES_NUM = std::to_array({ + 1 , 16 , 120 , 560 , 1820 , 4368 , 8008 , 11440 , + 12870 , 11440 , 8008 , 4368 , 1820 , 560 , 120 , 15 , + 210 , 1365 , 5460 , 15015 , 30030, 45045, 51480 , 45045 , + 30030 , 15015 , 5460 , 1365 , 15 , 210 , 1365 , 5460 , + 15015 , 30030 , 45045 , 51480 , 45045, 30030, 15015 , 5460 , + 1365 , 91 , 1092 , 6006 , 20020, 45045, 72072 , 84084 , + 72072 , 45045 , 20020 , 6006 , 182 , 2184 , 12012 , 40040 , + 90090 , 144144, 168168, 144144, 90090, 40040, 12012 , 91 , + 1092 , 6006 , 20020 , 45045 , 72072, 84084, 72072 , 45045 , + 20020 , 6006 , 286 , 2860 , 12870, 34320, 60060 , 72072 , + 60060 , 34320 , 12870 , 858 , 8580 , 38610, 102960, 180180, + 216216, 180180, 102960, 38610 , 858 , 8580 , 38610 , 102960, + 180180, 216216, 180180, 102960, 38610, 286 , 2860 , 12870 , + 34320 , 60060 , 72072 , 60060 , 34320, 12870, 495 , 3960 , + 13860 , 27720 , 34650 , 27720 , 13860, 1980 , 15840 , 55440 , + 110880, 138600, 110880, 55440 , 2970 , 23760, 83160 , 166320, + 207900, 166320, 83160 , 1980 , 15840, 55440, 110880, 138600, + 110880, 55440 , 495 , 3960 , 13860, 27720, 34650 , 27720 , + 13860 , 462 , 2772 , 6930 , 9240 , 6930 , 2310 , 13860 , + 34650 , 46200 , 34650 , 4620 , 27720, 69300, 92400 , 69300 , + 4620 , 27720 , 69300 , 92400 , 69300, 2310 , 13860 , 34650 , + 46200 , 34650 , 462 , 2772 , 6930 , 9240 , 6930 , 210 , + 840 , 1260 , 1260 , 5040 , 7560 , 3150 , 12600 , 18900 , + 4200 , 16800 , 25200 , 3150 , 12600, 18900, 1260 , 5040 , + 7560 , 210 , 840 , 1260 , 36 , 252 , 756 , 1260 , + 1260 , 756 , 252 , +}); + +static_assert(BASIC_RANGES_NUM.size() == 203); + +constexpr auto BASIC_RANGES_NUM_ = array_sum(BASIC_RANGES_NUM); + +// ------------------------------------------------------------------------------------- // + +} // namespace klotski::cases diff --git a/src/core/benchmark/group.cc b/src/core/benchmark/group.cc index 3d3d157..1f22f16 100644 --- a/src/core/benchmark/group.cc +++ b/src/core/benchmark/group.cc @@ -172,14 +172,17 @@ static void OriginBasicRanges(benchmark::State &state) { for (auto _ : state) { auto &kk = klotski::cases::BasicRanges::instance(); - kk.build_ranges(kk.get_ranges()); + // kk.build_ranges(kk.get_ranges()); + + kk.available_ = false; + // kk.build(); // kk.build_async([](auto func) {func();}, [](){}); - // kk.build_async([&pool](auto func) { - // pool.submit_task(func); - // }, [] {}); - // pool.wait(); + kk.build_async([&pool](auto func) { + pool.submit_task(func); + }, [] {}); + pool.wait(); } } diff --git a/src/core/main.cc b/src/core/main.cc index 4bc9c3d..ce8c3f3 100644 --- a/src/core/main.cc +++ b/src/core/main.cc @@ -38,45 +38,32 @@ int main() { BS::thread_pool pool {}; + // auto demo = [](auto &&self, int val) { + // std::cout << "val = " << val << std::endl; + // if (val == 0) { + // return; + // } + // self(self, val - 1); + // }; + // + // demo(demo, 5); + + // constexpr std::array kk {1, 2, 3, 4, 5}; + // auto ret = klotski::to_offset(kk, 0); + // std::cout << std::format("{}", ret) << std::endl; + // klotski::cases::BasicRanges::instance().build(); klotski::cases::BasicRanges::instance().build_async([&pool](auto &&func) { pool.submit_task(func); }, [] { - std::cout << "all done" << std::endl; + // std::cout << "all done" << std::endl; }); - // klotski::cases::BasicRanges::instance().build(); - // // klotski::cases::AllCases::instance().build_parallel_async([&pool](auto func) { // pool.submit_task(func); // }, [] {}); - // std::cout << "start call" << std::endl; - // klotski::Notifier kk {}; - // kk(); - // std::cout << "end call" << std::endl; - - // { - // klotski::Worker worker {[&pool](auto &&func) { pool.submit_task(func); }}; - // - // for (int i = 1; i < 3; ++i) { - // worker.post([i] { - // std::cout << std::format("task {} begin\n", i); - // std::this_thread::sleep_for(std::chrono::seconds(i)); - // std::cout << std::format("task {} complete\n", i); - // }); - // } - // - // worker.then([](klotski::Executor &&executor){ - // std::cout << "all tasks done\n"; - // }); - // - // std::cout << "worker start release\n"; - // } - // - // std::cout << "block exit\n"; - pool.wait(); // std::cout << BasicRanges::instance().fetch().size() << std::endl; diff --git a/src/core/ranges/ranges.h b/src/core/ranges/ranges.h index a781731..6b6cfe6 100644 --- a/src/core/ranges/ranges.h +++ b/src/core/ranges/ranges.h @@ -25,6 +25,8 @@ void derive_demo(const std::vector &range, const std::vector void derive_demo_pro(const BidiRanges &bidi_range, std::vector &output, int head); +typedef std::array RangesUnion; + // TODO: add RangesUnion here // TODO: -> spawn from Ranges / export std::vector diff --git a/src/core/utils/utility.h b/src/core/utils/utility.h index e0adb42..c30cc2e 100644 --- a/src/core/utils/utility.h +++ b/src/core/utils/utility.h @@ -2,6 +2,7 @@ #include #include +#include #include /// Mark target class as a singleton. @@ -26,23 +27,33 @@ namespace klotski { -/// Get the number of consecutive `0` in the low bits. -// inline int low_zero_num(const uint32_t bin) { -// return __builtin_ctzl(bin); -// -// // TODO: using (bin ^ (bin - 1)) when non-builtin -// -// // WARN: be aware of serious performance issues -// // return __builtin_popcount(~(bin ^ -bin)) - 1; -// } - -/// Get the number of consecutive `0` in the low bits. -// inline int low_zero_num(const uint64_t bin) { -// return __builtin_ctzll(bin); -// -// // WARN: be aware of serious performance issues -// // return __builtin_popcount(~(bin ^ -bin)) - 1; -// } +template +concept Addable = requires(T a, T b) { a + b; }; + +template +consteval int array_sum(const std::array &arr) { + return std::accumulate(arr.begin(), arr.end(), 0); +} + +template +consteval std::array to_offset(const std::array &arr, T base) { + + static_assert(N > 0); + + std::array offset; + + T val = base; + + offset[0] = 0; + + for (int i = 0; i < N - 1; ++i) { + val += arr[i]; + offset[i + 1] = val; + } + + return offset; + +} /// Flips the input u32 every two bits in low-high symmetry. inline uint32_t range_reverse(uint32_t bin) { @@ -69,16 +80,15 @@ typedef std::function &&)> Executor; class Worker final { public: using Task = std::function; - using After = std::function; /// Construction based on executor. - explicit Worker(Executor &&executor); + explicit Worker(Executor executor); /// Post new task into the queue. void post(Task &&task); /// Setting up callback entry. - void then(After &&after); + void then(Notifier &&after); /// Tasks will be triggered at destruction. ~Worker(); diff --git a/src/core/utils/worker.inl b/src/core/utils/worker.inl index e9e93c2..d7d9baf 100644 --- a/src/core/utils/worker.inl +++ b/src/core/utils/worker.inl @@ -4,16 +4,16 @@ namespace klotski { -inline Worker::Worker(Executor &&executor) - : after_([] {}), executor_(executor) {} +inline Worker::Worker(Executor executor) + : after_([] {}), executor_(std::move(executor)) {} inline void Worker::post(Task &&task) { tasks_.emplace_back(std::move(task)); } -inline void Worker::then(After &&after) { - after_ = [after = std::move(after), executor = executor_]() mutable { - after(std::move(executor)); +inline void Worker::then(Notifier &&after) { + after_ = [after = std::move(after)]() { + after(); }; }