From d163791aa0333a9ced16c4ec10378598dc41c826 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sun, 17 Sep 2023 14:33:50 +0800 Subject: [PATCH] perf: high performance basic ranges build --- src/CMakeLists.txt | 10 +- src/core/CMakeLists.txt | 9 ++ src/core/all_cases/basic_ranges.cc | 249 +++++++++++++++++++++++++++++ src/core/all_cases/basic_ranges.h | 80 +++++++++ src/core/main.cc | 43 +++++ 5 files changed, 387 insertions(+), 4 deletions(-) create mode 100644 src/core/CMakeLists.txt create mode 100644 src/core/all_cases/basic_ranges.cc create mode 100644 src/core/all_cases/basic_ranges.h create mode 100644 src/core/main.cc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ca0dbd3..42c85c2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,10 +2,12 @@ cmake_minimum_required(VERSION 3.0) project(klotski-cli LANGUAGES C) -add_subdirectory(klotski_core) -include_directories(klotski_core) +add_subdirectory(core) -add_executable(cli main.c) -target_link_libraries(cli PRIVATE klotski absl::flat_hash_map) +#add_subdirectory(klotski_core) +#include_directories(klotski_core) + +#add_executable(cli main.c) +#target_link_libraries(cli PRIVATE klotski absl::flat_hash_map) # -labsl_hash -labsl_city -labsl_low_level_hash -labsl_raw_hash_set diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt new file mode 100644 index 0000000..72d924d --- /dev/null +++ b/src/core/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required(VERSION 3.0) + +set(CMAKE_CXX_STANDARD 20) + +project(klotski-core VERSION 0.1.2 LANGUAGES CXX) + +add_compile_options(-fno-exceptions) + +add_executable(${PROJECT_NAME} main.cc all_cases/basic_ranges.cc) diff --git a/src/core/all_cases/basic_ranges.cc b/src/core/all_cases/basic_ranges.cc new file mode 100644 index 0000000..76a4d95 --- /dev/null +++ b/src/core/all_cases/basic_ranges.cc @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include "basic_ranges.h" + +static uint32_t range_reverse(uint32_t bin) noexcept { +// bin = 
((bin << 16) & 0xFFFF0000) | ((bin >> 16) & 0x0000FFFF); +// bin = ((bin << 8) & 0xFF00FF00) | ((bin >> 8) & 0x00FF00FF); + bin = __builtin_bswap32(bin); + bin = ((bin << 4) & 0xF0F0F0F0) | ((bin >> 4) & 0x0F0F0F0F); + return ((bin << 2) & 0xCCCCCCCC) | ((bin >> 2) & 0x33333333); +} + +void build(std::vector &result, int n1, int n2, int n3, int n4) { + + std::vector demo; + demo.reserve(n1 + n2 + n3 + n4); + + int move_num = 32 - (n1 + n2 + n3 + n4) * 2; + + demo.insert(demo.end(), n1, 0b00); + demo.insert(demo.end(), n2, 0b01); + demo.insert(demo.end(), n3, 0b10); + demo.insert(demo.end(), n4, 0b11); + +// for (int i = 0; i < n1; ++i) { +// demo.emplace_back(0b00); +// } +// for (int i = 0; i < n2; ++i) { +// demo.emplace_back(0b01); +// } +// for (int i = 0; i < n3; ++i) { +// demo.emplace_back(0b10); +// } +// for (int i = 0; i < n4; ++i) { +// demo.emplace_back(0b11); +// } + +// for (auto x : demo) { +// std::cout << x << " "; +// } +// std::cout << std::endl; + + do { + + uint32_t tmp = 0; + for (auto x : demo) { + tmp <<= 2; + tmp |= x; + } + tmp <<= move_num; +// volatile auto r = tmp; + result.emplace_back(tmp); + + } while (next_permutation(demo.begin(), demo.end())); +} + +template +void sort(T begin, T end, T kk) { + + std::vector tmp; + tmp.reserve(end - begin); + auto k1 = begin; + auto k2 = kk; + + while (1 == 1) { + if (*k1 < *k2) { + tmp.emplace_back(*k1); + ++k1; + if (k1 == kk) { + tmp.insert(tmp.end(), k2, end); + break; + } + } else { + tmp.emplace_back(*k2); + ++k2; + if (k2 == end) { + tmp.insert(tmp.end(), k1, kk); + break; + } + } + } + + auto p = begin; + for (auto x : tmp) { + *p = x; + ++p; + } + +} + +template +void sort_v2(ITER_T begin, ITER_T end, ITER_T mid) { + + std::vector tmp {begin, mid}; + auto k1 = tmp.begin(); + auto k2 = mid; + auto target = begin; + + while (1 == 1) { + +// std::cout << *k1 << " vs " << *k2 << std::endl; + + if (*k1 < *k2) { + *target = *k1; + ++target; +// tmp.emplace_back(*k1); + ++k1; + if (k1 == 
tmp.end()) { + +// memcpy(&*target, &*k2, end - k2); + +// tmp.insert(tmp.end(), k2, end); + break; + } + } else { + *target = *k2; + ++target; +// tmp.emplace_back(*k2); + ++k2; + if (k2 == end) { + +// std::cout << "get it" << std::endl; + +// std::cout << "size = " << (tmp.end() - k1) << std::endl; +// std::cout << (target - begin) + + memcpy(&*target, &*k1, (tmp.end() - k1) * 4); + +// tmp.insert(tmp.end(), k1, kk); + break; + } + } + } +// +// auto p = begin; +// for (auto x : tmp) { +// *p = x; +// ++p; +// } + +} + +template +void sort_v3(ITER_T begin, ITER_T end, ITER_T mid) { + std::vector tmp = {begin, mid}; + auto p = tmp.begin(); + for (;;) { + if (*p < *mid) { + *(begin++) = *(p++); + if (p == tmp.end()) { + return; + } + } else { + *(begin++) = *(mid++); + if (mid == end) { + std::copy(p, tmp.end(), begin); + return; + } + } + } +} + +void demo() { + +// std::vector demo = {1, 2, 5, 9, 11, 3, 6, 7}; +// auto kk = demo.begin() + 5; +// for (auto p = demo.begin(); p < kk; ++p) { +// std::cout << *p << std::endl; +// } +// sort_v3(demo.begin(), demo.end(), kk); +// +// for (auto x : demo) { +// std::cout << x << std::endl; +// } +// return; + + +// uint32_t tmp = 0x5129B263; // 0xC98E6845 +// +// for (uint64_t i = 0; i < 0xFFFFFFFF; ++i) { +// volatile auto r = range_reverse(tmp); +// } + +// printf("%08X\n", tmp); + + std::vector result; + + result.reserve(7311921); + +// build(result, 4, 2, 1, 6); + + std::list pp; + pp.emplace_back(result.begin()); + + for (int n = 0; n <= 7; ++n) // number of 1x2 and 2x1 block -> 0 ~ 7 + for (int n_2x1 = 0; n_2x1 <= n; ++n_2x1) // number of 2x1 block -> 0 ~ n + for (int n_1x1 = 0; n_1x1 <= (14 - n * 2); ++n_1x1) { // number of 1x1 block -> 0 ~ (14 - 2n) + build(result, 16 - n * 2 - n_1x1, n - n_2x1, n_2x1, n_1x1); + pp.emplace_back(result.end()); + } + +// std::stable_sort(result.begin(), result.end()); +// return; + +// sort(result.begin() + 363149, result.begin() + 459674, result.begin() + 408194); + + while (1 == 
1) { + auto begin = pp.begin(); + while (1 == 1) { + auto mid = begin; + ++mid; + if (mid == pp.end()) { + break; + } + auto end = mid; + ++end; + if (end == pp.end()) { + break; + } +// std::cout << (*begin - result.begin()) << " " << (*mid - result.begin()) << " " << (*end - result.begin()) << std::endl; + sort_v3(*begin, *end, *mid); + ++begin; + ++begin; + pp.erase(mid); + } + if (pp.size() == 2) { + break; + } + } + + +// std::sort(result.begin(), result.end()); +// std::stable_sort(result.begin(), result.end()); + + for (auto &x : result) { + x = range_reverse(x); + } + + for (auto x : result) { + printf("%08X\n", x); + } + +// for (uint32_t i = 0; i < 0xFF; ++i) { +// build(result, 4, 2, 1, 6); +// } + +} diff --git a/src/core/all_cases/basic_ranges.h b/src/core/all_cases/basic_ranges.h new file mode 100644 index 0000000..153c3b6 --- /dev/null +++ b/src/core/all_cases/basic_ranges.h @@ -0,0 +1,80 @@ +#pragma once + +/// Based on the requirements of valid klotski, there must be exactly one `2x2` +/// block, which will occupy 4 empty slots, and the remaining 16 slots +/// will be allocated to space, `1x2`, `2x1` and `1x1`. Then, according to the +/// rules of CommonCode, they are coded as `00` `01` `10` `11` respectively, and +/// the remaining positions are filled with `0` and stored as 32-bit variables. + +/// As we all know, a space or `1x1` block will occupy 1 slot, `1x2` or `2x1` +/// block will occupy 2 slots, and together they fill 16 positions, so all +/// possible combinations can be calculated; this number is 204. Each combination +/// can produce different permutations. After verification, there are a total of +/// 7311921 possible permutations. The goal of BasicRanges is to find these +/// permutations, sort them and store them in a `uint32_t` array. 
+ +/// In terms of algorithms, there are two options: the first is to generate +/// out-of-order data and then sort them; the second is to generate +/// ordered data for 204 combinations, and then merge and sort them. After testing, +/// the former is faster in generation (consuming T time), but it will consume +/// more time in sorting (about 7T), and the latter will cost about 2T in +/// generation due to the loss of the tree structure queue. However, it saves more +/// time in sorting, which takes about 2T, so the second solution gets the result +/// faster. + +/// Finally, due to the performance considerations of AllCases, the resulting data +/// will be flipped every two bits, which will not consume too much time (less than +/// 10% of T), but can almost double the speed of the subsequent `check_range`. + +#include +#include +#include + +void demo(); + +namespace klotski { + +/// basic ranges count + const uint32_t BASIC_RANGES_SIZE = 7311921; + + class BasicRanges { + public: + /// Three basic states, one-way transition. + /// {NOT_INIT} -> {BUILDING} -> {AVAILABLE} + enum Status { + NOT_INIT, + BUILDING, + AVAILABLE, + }; + typedef std::vector basic_ranges_t; + + /// Trigger the build process, from `NOT_INIT` to `BUILDING`. + static void build(); + + /// Get current status of BasicRanges. + static Status status() noexcept; + + /// Blocking access to constructed data. + static const basic_ranges_t& fetch(); + + private: + static bool available_; + static std::mutex building_; + static basic_ranges_t data_; + + static void build_data(); + + public: + /// The number of types of blocks. + struct generate_t { + int n1; // number of `00` -> space + int n2; // number of `01` -> 1x2 block + int n3; // number of `10` -> 2x1 block + int n4; // number of `11` -> 1x1 block + }; + + /// Generate all basic-ranges of the specified type. 
+ static void generate(basic_ranges_t &release, generate_t info); + }; + +} // namespace klotski diff --git a/src/core/main.cc b/src/core/main.cc new file mode 100644 index 0000000..2069348 --- /dev/null +++ b/src/core/main.cc @@ -0,0 +1,43 @@ +#include +#include + +#include "all_cases/basic_ranges.h" + +int main() { + + auto start = clock(); + + demo(); + + +// for (int n = 0; n <= 7; ++n) // number of 1x2 and 2x1 block -> 0 ~ 7 +// for (int n_2x1 = 0; n_2x1 <= n; ++n_2x1) // number of 2x1 block -> 0 ~ n +// for (int n_1x1 = 0; n_1x1 <= (14 - n * 2); ++n_1x1) // number of 1x1 block -> 0 ~ (14 - 2n) +// build(16 - n * 2 - n_1x1, n - n_2x1, n_2x1, n_1x1); + + + +// generate(generate_t { // generate target ranges +// .n1 = 16 - n * 2 - n_1x1, /// space -> 00 +// .n2 = n - n_2x1, /// 1x2 -> 01 +// .n3 = n_2x1, /// 2x1 -> 10 +// .n4 = n_1x1, /// 1x1 -> 11 +// }); + + +// std::stable_sort(result.begin(), result.end()); + +// build(4, 2, 1, 6); + + std::cerr << ((clock() - start) * 1000 / CLOCKS_PER_SEC) << "ms" << std::endl; + +// std::cout << result.size() << std::endl; + +// std::vector demo {0b00, 0b01, 0b10, 0b11}; + +// do { +// std::cout << demo[0] << " " << demo[1] << " " << demo[2] << " " << demo[3] << std::endl; +// } while (next_permutation(demo.begin(), demo.end())); + + return 0; +}