Browse Source

update: interface of ShortCode

master
Dnomd343 1 year ago
parent
commit
7cbfd9badd
  1. 25
      src/main.cc
  2. 2
      src/short_code/CMakeLists.txt
  3. 16
      src/short_code/convert.cc
  4. 70
      src/short_code/data_loader.cc
  5. 44
      src/short_code/serialize.cc
  6. 6
      src/short_code/serialize_chars.h
  7. 61
      src/short_code/short_code.cc
  8. 54
      src/short_code/short_code.h

25
src/main.cc

@ -178,13 +178,13 @@ int main() {
// AllCases::build();
BasicRanges::build();
std::vector<uint64_t> all_cases;
for (uint64_t head = 0; head < 16; ++head) {
for (const auto &range : AllCases::fetch()[head]) {
all_cases.emplace_back(head << 32 | range);
}
}
std::cout << "test data size: " << all_cases.size() << std::endl;
// std::vector<uint64_t> all_cases;
// for (uint64_t head = 0; head < 16; ++head) {
// for (const auto &range : AllCases::fetch()[head]) {
// all_cases.emplace_back(head << 32 | range);
// }
// }
// std::cout << "test data size: " << all_cases.size() << std::endl;
// std::cout << "start benchmark" << std::endl;
auto start_time = clock();
@ -271,14 +271,17 @@ int main() {
// }
// }
for (uint32_t i = 1000000; i < 1010000; ++i) {
ShortCode::tiny_decode(i);
ShortCode::tiny_encode(all_cases[i]);
}
// for (uint32_t i = 1000000; i < 1010000; ++i) {
// ShortCode::tiny_decode(i);
// ShortCode::tiny_encode(all_cases[i]);
// }
// printf("%09lX\n", ShortCode::tiny_decode(14323231));
// std::cout << ShortCode::tiny_encode(0x6EC0F8800) << std::endl;
std::cout << ShortCode(14323231).to_string() << std::endl;
std::cout << ShortCode::from_string("EP4HZ").unwrap() << std::endl;
// printf("%09lX\n", ShortCode::fast_decode(14323231));
// std::cout << ShortCode::fast_encode(0x6EC0F8800) << std::endl;

2
src/short_code/CMakeLists.txt

@ -2,5 +2,5 @@ cmake_minimum_required(VERSION 3.0)
include_directories(offset)
add_library(short_code convert.cc short_code.cc data_loader.cc)
add_library(short_code convert.cc serialize.cc short_code.cc)
target_link_libraries(short_code utils all_cases)

16
src/short_code/convert.cc

@ -6,21 +6,25 @@
#include "basic_ranges_offset.h"
#include "range_prefix_offset.h"
CommonCode ShortCode::to_common_code() const { // convert to common code
if (ShortCode::check_mode() == ShortCode::NORMAL) {
return CommonCode::unsafe_create(tiny_decode(code)); // normal mode
}
return CommonCode::unsafe_create(fast_decode(code)); // fast mode
/// Factory wrapper: build a ShortCode from a CommonCode by delegating to
/// the ShortCode(const CommonCode &) constructor.
ShortCode ShortCode::from_common_code(const CommonCode &common_code) {
    ShortCode converted(common_code);
    return converted;
}
ShortCode::ShortCode(const CommonCode &common_code) { // convert from common code
if (ShortCode::check_mode() == ShortCode::NORMAL) {
if (ShortCode::mode() == ShortCode::NORMAL) {
code = tiny_encode(common_code.unwrap()); // normal mode
} else {
code = fast_encode(common_code.unwrap()); // fast mode
}
}
/// Convert this short code to its CommonCode form.
/// The decoder is picked from the current mode: tiny_decode for NORMAL,
/// fast_decode otherwise. The decoded value is wrapped without re-validation
/// via CommonCode::unsafe_create.
CommonCode ShortCode::to_common_code() const {
    const bool normal_mode = (ShortCode::mode() == ShortCode::NORMAL);
    return CommonCode::unsafe_create(
        normal_mode ? tiny_decode(code)   // normal mode
                    : fast_decode(code)); // fast mode
}
/// NOTE: ensure that input common code is valid !!!
uint32_t ShortCode::fast_encode(uint64_t common_code) { // common code --> short code
auto head = common_code >> 32; // head index

70
src/short_code/data_loader.cc

@ -1,70 +0,0 @@
#include "all_cases.h"
#include "short_code.h"
#include "basic_ranges.h"
//#include "short_code_mark.h"
//std::mutex ShortCode::map_building;
bool ShortCode::fast_mode_available = false;
bool ShortCode::normal_mode_available = false;
//std::vector<uint64_t> ShortCode::all_cases_list;
//std::unordered_map<uint64_t, uint32_t> ShortCode::all_cases_dict;
/// Prepare the data required by the requested mode.
///
/// Does nothing once fast mode is flagged as available. A FAST request
/// triggers AllCases::build(); a NORMAL request runs BasicRanges::build()
/// the first time and then marks normal mode as available.
///
/// NOTE(review): this function never sets `fast_mode_available`; the call
/// that used to do so (build_mappings) is disabled here -- confirm intended.
void ShortCode::speed_up(ShortCode::Mode mode) {
    if (fast_mode_available) {
        return; // nothing left to prepare
    }
    switch (mode) {
        case ShortCode::FAST:
            // TODO: confirm AllCases data available
            AllCases::build();
            break;
        case ShortCode::NORMAL:
            if (!normal_mode_available) {
                BasicRanges::build(); // blocking function
                normal_mode_available = true;
            }
            break;
    }
}
/// Report the mode currently in effect, initializing normal mode on demand.
/// FAST wins when its flag is set; otherwise the result is NORMAL, with
/// speed_up(NORMAL) invoked first if normal mode has not been prepared yet.
ShortCode::Mode ShortCode::check_mode() {
    if (fast_mode_available) {
        return ShortCode::FAST;
    }
    if (!normal_mode_available) {
        speed_up(ShortCode::Mode::NORMAL); // not initialized yet -> enter normal mode
    }
    return ShortCode::Mode::NORMAL;
}
/// ensure that fast_mode_available == false
//void ShortCode::build_mappings() { // build fast search mappings
// if (map_building.try_lock()) { // lock success -> start building
// for (int head = 0; head < 16; ++head) {
// uint64_t prefix = (uint64_t)head << 32;
// for (const auto &range : AllCases::fetch()[head]) { // blocking function
// all_cases_list.emplace_back(prefix | range); // short_code -> common_code
// }
// }
// for (int index = 0; index < all_cases_list.size(); ++index) {
// all_cases_dict[all_cases_list[index]] = index; // common_code -> short_code
// }
// fast_mode_available = true; // set available flag
// } else { // another thread building
// map_building.lock(); // blocking waiting
// }
// map_building.unlock();
//}
//uint32_t ShortCode::fast_encode_legacy(uint64_t common_code) {
// return all_cases_dict[common_code];
//}
#include <algorithm>
#include "basic_ranges_offset.h"
#include "range_prefix_offset.h"
#include <iostream>
#include "common.h"

44
src/short_code/serialize.cc

@ -0,0 +1,44 @@
#include <stdexcept>
#include "short_code.h"
#include "serialize_chars.h"
/// Factory wrapper: parse a short code string by delegating to the
/// ShortCode(const std::string &) constructor.
ShortCode ShortCode::from_string(const std::string &short_code) {
    ShortCode parsed(short_code);
    return parsed;
}
/// Render the short code as its 5-character text form.
/// The value is split into five base-32 digits; each digit is mapped through
/// SHORT_CODE_TABLE, with the least significant digit placed last.
std::string ShortCode::to_string() const {
    std::string text(5, '\0'); // short code length 5
    uint32_t remain = code;
    for (auto slot = text.rbegin(); slot != text.rend(); ++slot) { // fill from the right
        *slot = SHORT_CODE_TABLE[remain & 0x1F]; // lowest 5 bits -> one character
        remain >>= 5;
    }
    return text;
}
/// Decode a 5-character short code string.
///
/// Lowercase letters are folded to uppercase, each character in `1` ~ `Z` is
/// translated through SHORT_CODE_TABLE_REV, and the five values are combined
/// as base-32 digits (first character is the most significant digit).
///
/// @param short_code text form; must be exactly 5 characters long
/// @throws std::invalid_argument "short code format error" when the length is
///         wrong or a character is outside `1` ~ `Z` / marked -1 in the table
/// @throws std::invalid_argument "invalid short code" when ShortCode::check
///         rejects the decoded value
ShortCode::ShortCode(const std::string &short_code) { // 5-bits string decode
    if (short_code.length() != 5) { // check string length
        throw std::invalid_argument("short code format error");
    }
    uint64_t result = 0;
    for (auto bit : short_code) {
        if (bit >= 'a' && bit <= 'z') {
            bit -= 32; // convert to uppercase
        }
        if (bit < '1' || bit > 'Z') {
            throw std::invalid_argument("short code format error"); // unknown characters
        }
        // Keep the table value in a signed type. Writing it back into a plain
        // `char` breaks the -1 sentinel where `char` is unsigned: it becomes
        // 255, never equals -1, and rejected characters would be accepted.
        const auto value = static_cast<int8_t>(SHORT_CODE_TABLE_REV[bit - 49]); // table convert
        if (value < 0) {
            throw std::invalid_argument("short code format error"); // unknown characters
        }
        result = result * 32 + static_cast<uint64_t>(value);
    }
    // Five base-32 digits are at most 32^5 - 1, so the value fits in uint32_t.
    const auto decoded = static_cast<uint32_t>(result);
    if (!ShortCode::check(decoded)) { // check converted short code
        throw std::invalid_argument("invalid short code");
    }
    code = decoded;
}

6
src/short_code/short_code_chars.h → src/short_code/serialize_chars.h

@ -1,6 +1,8 @@
#pragma once
const char SHORT_CODE_TABLE[32] = {
#include <cstdint>
const int8_t SHORT_CODE_TABLE[32] = {
'1', '2', '3', '4', '5', '6', '7', '8', '9', // skip `0`
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', // skip `I`
'J', 'K', // skip `L`
@ -9,7 +11,7 @@ const char SHORT_CODE_TABLE[32] = {
};
/// `1`(49) ~ `Z`(90)
const char SHORT_CODE_TABLE_REV[42] = {
const int8_t SHORT_CODE_TABLE_REV[42] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, // `1`(49) ~ `9`(57)
-1, -1, -1, -1, -1, -1, -1, // `:`(58) ~ `@`(64)
9, 10, 11, 12, 13, 14, 15, 16, -1, 17, // `A`(65) ~ `J`(74)

61
src/short_code/short_code.cc

@ -1,11 +1,17 @@
#include <stdexcept>
#include "all_cases.h"
#include "short_code.h"
#include "short_code_chars.h"
bool ShortCode::fast_mode_available = false;
bool ShortCode::normal_mode_available = false;
uint32_t ShortCode::unwrap() const { // get raw uint32_t code
return code;
}
/// Factory wrapper: build a ShortCode from a raw value by delegating to
/// the ShortCode(uint32_t) constructor.
ShortCode ShortCode::create(uint32_t short_code) {
    ShortCode created(short_code);
    return created;
}
/// Tell whether a raw value lies in the accepted range
/// 0 ~ (SHORT_CODE_LIMIT - 1).
bool ShortCode::check(uint32_t short_code) {
    const bool in_range = short_code < ShortCode::SHORT_CODE_LIMIT;
    return in_range;
}
@ -17,39 +23,34 @@ ShortCode::ShortCode(uint32_t short_code) {
code = short_code;
}
ShortCode::ShortCode(const std::string &short_code_str) { // 5-bits string decode
if (short_code_str.length() != 5) { // check string length
throw std::invalid_argument("short code format error");
}
uint64_t short_code = 0;
for (auto bit : short_code_str) {
short_code *= 32;
if (bit >= 'a' && bit <= 'z') {
bit -= 32; // convert to uppercase
}
if (bit >= '1' && bit <= 'Z') {
short_code += (bit = SHORT_CODE_TABLE_REV[bit - 49]); // table convert
if (bit != -1) {
continue; // pass check
}
}
throw std::invalid_argument("short code format error"); // unknown characters
void ShortCode::speed_up(ShortCode::Mode mode) {
if (fast_mode_available) {
return; // fast mode already available
}
if (mode == ShortCode::FAST) { // build fast mode data
// build_mappings();
if (!ShortCode::check(short_code)) { // check converted short code
throw std::invalid_argument("invalid short code");
// TODO: confirm AllCases data available
AllCases::build();
} else if (mode == ShortCode::NORMAL && !normal_mode_available) { // build normal mode data
BasicRanges::build(); // blocking function
normal_mode_available = true;
}
code = short_code;
}
std::string ShortCode::to_string() const { // encode as 5-bits string
uint32_t short_code = code;
std::string result(5, '\0'); // short code length 5
for (int n = 0; n < 5; ++n) {
uint8_t bit = short_code % 32;
short_code = (short_code - bit) / 32;
result[4 - n] = SHORT_CODE_TABLE[bit];
ShortCode::Mode ShortCode::mode() { // ensure speed up enabled and return current mode
if (fast_mode_available) {
return ShortCode::FAST; // fast mode already enabled
}
if (normal_mode_available) {
return ShortCode::NORMAL; // normal mode already enabled
}
return result;
speed_up(ShortCode::Mode::NORMAL); // without initialized -> enter normal mode
return ShortCode::Mode::NORMAL; // use normal mode
}

54
src/short_code/short_code.h

@ -1,19 +1,13 @@
#pragma once
#include <mutex>
#include <vector>
#include <cstdint>
#include <unordered_map>
#include "common_code.h"
class CommonCode;
class ShortCode {
public:
enum Mode {
NORMAL, FAST
};
static enum Mode check_mode();
enum Mode {NORMAL, FAST};
static void speed_up(enum Mode mode);
uint32_t unwrap() const;
@ -24,49 +18,27 @@ public:
// TODO: std::cout << ShortCode(...) << std::endl;
explicit ShortCode(uint32_t short_code);
explicit ShortCode(const std::string &short_code);
explicit ShortCode(const CommonCode &common_code);
explicit ShortCode(const std::string &short_code_str);
ShortCode(uint32_t short_code, enum Mode mode) : ShortCode(short_code) {
speed_up(mode);
}
ShortCode(const CommonCode &common_code, enum Mode mode) : ShortCode(common_code) {
speed_up(mode);
}
ShortCode(const std::string &short_code_str, enum Mode mode) : ShortCode(short_code_str) {
speed_up(mode);
}
// TODO: mark as private after inner test
static uint64_t fast_decode(uint32_t short_code);
static uint32_t fast_encode(uint64_t common_code);
static uint64_t tiny_decode(uint32_t short_code);
static uint32_t tiny_encode(uint64_t common_code);
// static uint64_t tiny_decode_10b(uint32_t short_code);
// static uint32_t tiny_encode_10b(uint64_t common_code);
// TODO: ShortCode::create() / ShortCode::from_str(...) / ShortCode::from_common_code(...)
static ShortCode create(uint32_t short_code);
static ShortCode from_string(const std::string &short_code);
static ShortCode from_common_code(const CommonCode &common_code);
// static uint32_t fast_encode_legacy(uint64_t common_code);
ShortCode(uint32_t short_code, enum Mode mode) : ShortCode(short_code) { speed_up(mode); }
ShortCode(const std::string &short_code, enum Mode mode) : ShortCode(short_code) { speed_up(mode); }
ShortCode(const CommonCode &common_code, enum Mode mode) : ShortCode(common_code) { speed_up(mode); }
private:
uint32_t code;
// static std::mutex map_building;
static bool fast_mode_available;
static bool normal_mode_available;
static const uint32_t SHORT_CODE_LIMIT = 29334498;
/// NOTE: binary search is used instead of a global vector and unordered_map.
/// In some tests, the new functions use < 170MB of memory, while the legacy ones use > 1.5GB,
/// although the legacy implementation is simpler and slightly faster.
/// BTW, the new one initializes in less than 1.2s, while the legacy one needs about 15s.
// static std::vector<uint64_t> all_cases_list; // short_code -> common_code
// static std::unordered_map<uint64_t, uint32_t> all_cases_dict; // common_code -> short_code
static enum Mode mode();
// static void build_mappings();
// static uint64_t tiny_decode(uint32_t short_code);
// static uint32_t tiny_encode(uint64_t common_code);
static uint64_t fast_decode(uint32_t short_code);
static uint32_t fast_encode(uint64_t common_code);
static uint64_t tiny_decode(uint32_t short_code);
static uint32_t tiny_encode(uint64_t common_code);
};

Loading…
Cancel
Save