主要功能:把一个字符串转成一个整数ID,根据一个整数ID找回字符串
支持增加、删除、查找、保存到文件和从文件读取。
需要改进:
查找操作先使用hash算法确定一个bucket,缩小查找范围,然后在这个bucket里面用StringArray.Search,最简单的遍历方法进行查找,所以Add/Delete会比较慢,考虑到Search(IntegerID)的调用次数要远多于Add(string)/Delete(string)的次数,就没有对这部分做优化。
改进方法1:
在StringArray里面再使用一次Hash算法(需要换一个hash算法)进一步缩小遍历查找的范围。改动比较小,不容易出错,占用内存少。
改进方法2:
采用Trie树,增加一个index,速度快,但是改写稍微复杂,多占一些内存。
#pragma once
#include<unordered_map>
#include<string>
#include<fstream>
#include<vector>
namespace DataStructure {
class StringArray
{
std::vector<char> charVector;
std::vector<int> offsets;
public:
void Append(const char *str, size_t strLen);
void Remove(size_t idx);
__int64 Search(const char *str, size_t strLen);
std::string At(size_t idx);
void Save(std::ofstream& out);
void Load(std::ifstream& in);
};
class StringIntBiMap {
public:
int BitOfBucket = 32;
std::vector<StringArray> keyBuckets;
std::vector <std::vector<__int64>> valueBuckets;
int m_bucketCount;
StringIntBiMap(int buckectCount = 5000011);
uint64_t FNV64aHash(void *buf, size_t len);
__int64 Add(std::string& key, bool &alreadyAdded);
__int64 Delete(std::string& key);
std::string Search(__int64 value);
void Save(const std::string& filename);
void Load(const std::string& filename);
};
}
#include "StringIntBiMap.h"
namespace DataStructure
{
void StringArray::Append(const char *str, size_t strLen)
{
offsets.push_back(charVector.size());
charVector.insert(charVector.end(), str, str + strLen);
charVector.push_back('\0');
}
void StringArray::Remove(size_t idx)
{
int size = offsets.size();
if (idx >= size)
{
return;
}
auto offset = offsets[idx];
int ed = idx + 1 < size ? offsets[idx + 1] : charVector.size();
charVector.erase(charVector.begin() + offset, charVector.begin() + ed);
offsets.erase(offsets.begin() + idx);
int length = ed - offset;
for (int i = idx; i < size; i++)
{
offsets[i] -= length;
}
}
__int64 StringArray::Search(const char *str, size_t strLen)
{
size_t size = offsets.size();
for (size_t i = 0; i < size; i++)
{
if (strcmp(charVector.data() + offsets[i], str) == 0)
{
return i;
}
}
return -1;
}
std::string StringArray::At(size_t idx)
{
if (idx >= offsets.size()) return "";
return charVector.data() + offsets[idx];
}
void StringArray::Save(std::ofstream& out)
{
auto size = charVector.size();
out.write((char *)&size, sizeof(size));
out.write(charVector.data(), size * sizeof(char));
size = offsets.size();
out.write((char *)&size, sizeof(size));
out.write((char *)offsets.data(), size * sizeof(int));
}
void StringArray::Load(std::ifstream& in)
{
size_t size = 0;
in.read((char *)&size, sizeof(size));
charVector.resize(size);
in.read(charVector.data(), size * sizeof(char));
in.read((char *)&size, sizeof(size));
offsets.resize(size);
in.read((char *)offsets.data(), size * sizeof(int));
}
StringIntBiMap::StringIntBiMap(int buckectCount)
:m_bucketCount(buckectCount)
{
keyBuckets.resize(m_bucketCount);
valueBuckets.resize(m_bucketCount);
}
uint64_t StringIntBiMap::FNV64aHash(void *buf, size_t len)
{
uint64_t hval = 0xcbf29ce484222325ULL;
unsigned char *bp = (unsigned char *)buf;
unsigned char *be = bp + len;
while (bp < be) {
hval ^= (uint64_t)*bp++;
hval += (hval << 1) + (hval << 4) + (hval << 5) +
(hval << 7) + (hval << 8) + (hval << 40);
}
return hval % m_bucketCount;
}
__int64 StringIntBiMap::Add(std::string& key, bool &alreadyAdded)
{
alreadyAdded = false;
auto bucketId = FNV64aHash((void *)key.c_str(), key.length());
auto keyBucket = keyBuckets.begin() + bucketId;
auto valueBucket = valueBuckets.begin() + bucketId;
auto IdInBucket = keyBucket->Search(key.c_str(), key.length());
if (IdInBucket >= 0)
{
alreadyAdded = true;
return (bucketId << BitOfBucket) + valueBucket->at(IdInBucket);
}
__int64 lowBitValue = 0;
if (valueBucket->size() > 0)
{
lowBitValue = *(valueBucket->rbegin()) + 1;
}
valueBucket->push_back(lowBitValue);
keyBucket->Append(key.c_str(), key.length());
return (bucketId << BitOfBucket) + lowBitValue;
}
__int64 StringIntBiMap::Delete(std::string& key)
{
auto bucketId = FNV64aHash((void *)key.c_str(), key.length());
auto keyBucket = keyBuckets.begin() + bucketId;
auto valueBucket = valueBuckets.begin() + bucketId;
auto IdInBucket = keyBucket->Search(key.c_str(), key.length());
if (IdInBucket < 0)
{
return -1;
}
__int64 lowBitValue = valueBucket->at(IdInBucket);
keyBucket->Remove(IdInBucket);
valueBucket->erase(valueBucket->begin() + IdInBucket);
return (bucketId << BitOfBucket) + lowBitValue;
}
std::string StringIntBiMap::Search(__int64 value)
{
__int64 bucketId = value >> BitOfBucket;
__int64 lowBitValue = value & ((1LL << BitOfBucket) - 1);
auto valueBucket = valueBuckets.begin() + bucketId;
auto size = valueBucket->size();
if (size > 16)
{
__int64 lef = 0;
__int64 rig = size;
_int64 mid = 0;
while (lef < rig)
{
mid = (lef + rig) >> 1;
__int64 cur = valueBucket->at(mid);
if (cur == lowBitValue)
{
return std::move((keyBuckets.begin() + bucketId)->At(mid));
}
else if (cur > lowBitValue)
{
rig = mid;
}
else
{
lef = mid + 1;
}
}
}
else
{
for (int i = 0; i < size; i++)
{
if (valueBucket->at(i) == lowBitValue)
{
return std::move((keyBuckets.begin() + bucketId)->At(i));
}
}
}
return "";
}
void StringIntBiMap::Save(const std::string& filename)
{
std::ofstream out(filename, std::ios::out | std::ios::binary);
out.write((char *)&BitOfBucket, sizeof(BitOfBucket));
out.write((char *)&m_bucketCount, sizeof(m_bucketCount));
for (int i = 0; i < m_bucketCount; i++)
{
keyBuckets[i].Save(out);
auto size = valueBuckets[i].size();
out.write((char *)&size, sizeof(size));
out.write((char *)valueBuckets[i].data(), size * sizeof(__int64));
}
out.close();
}
void StringIntBiMap::Load(const std::string& filename)
{
std::ifstream in(filename, std::ios::in | std::ios::binary);
in.read((char *)&BitOfBucket, sizeof(BitOfBucket));
in.read((char *)&m_bucketCount, sizeof(m_bucketCount));
keyBuckets.resize(m_bucketCount);
valueBuckets.resize(m_bucketCount);
for (int i = 0; i < m_bucketCount; i++)
{
keyBuckets[i].Load(in);
size_t size = 0;
in.read((char *)&size, sizeof(size));
//LogAsset(size == keyBuckets[i].Size());
valueBuckets[i].resize(size);
in.read((char *)valueBuckets[i].data(), size * sizeof(__int64));
}
in.close();
}
}