主要功能:把一个字符串转成一个整数ID,根据一个整数ID找回字符串
支持增加、删除、查找、保存到文件和从文件读取。
需要改进:
查找操作先使用hash算法确定一个bucket,缩小查找范围,然后在这个bucket里面用StringArray.Search,最简单的遍历方法进行查找,所以Add/Delete会比较慢,考虑到Search(IntegerID)的调用次数要远多于Add(string)/Delete(string)的次数,就没有对这部分做优化。
改进方法1:
在StringArray里面再使用一次Hash算法(需要换一个hash算法)进一步缩小遍历查找的范围。改动比较小,不容易出错,占用内存少。
改进方法2:
采用Trie树,增加一个index,速度快,但是改写稍微复杂,多占一些内存。
#pragma once #include<unordered_map> #include<string> #include<fstream> #include<vector> namespace DataStructure { class StringArray { std::vector<char> charVector; std::vector<int> offsets; public: void Append(const char *str, size_t strLen); void Remove(size_t idx); __int64 Search(const char *str, size_t strLen); std::string At(size_t idx); void Save(std::ofstream& out); void Load(std::ifstream& in); }; class StringIntBiMap { public: int BitOfBucket = 32; std::vector<StringArray> keyBuckets; std::vector <std::vector<__int64>> valueBuckets; int m_bucketCount; StringIntBiMap(int buckectCount = 5000011); uint64_t FNV64aHash(void *buf, size_t len); __int64 Add(std::string& key, bool &alreadyAdded); __int64 Delete(std::string& key); std::string Search(__int64 value); void Save(const std::string& filename); void Load(const std::string& filename); }; } #include "StringIntBiMap.h" namespace DataStructure { void StringArray::Append(const char *str, size_t strLen) { offsets.push_back(charVector.size()); charVector.insert(charVector.end(), str, str + strLen); charVector.push_back('\0'); } void StringArray::Remove(size_t idx) { int size = offsets.size(); if (idx >= size) { return; } auto offset = offsets[idx]; int ed = idx + 1 < size ? offsets[idx + 1] : charVector.size(); charVector.erase(charVector.begin() + offset, charVector.begin() + ed); offsets.erase(offsets.begin() + idx); int length = ed - offset; for (int i = idx; i < size; i++) { offsets[i] -= length; } } __int64 StringArray::Search(const char *str, size_t strLen) { size_t size = offsets.size(); for (size_t i = 0; i < size; i++) { if (strcmp(charVector.data() + offsets[i], str) == 0) { return i; } } return -1; } std::string StringArray::At(size_t idx) { if (idx >= offsets.size()) return ""; return charVector.data() + offsets[idx]; } void StringArray::Save(std::ofstream& out) { auto size = charVector.size(); out.write((char *)&size, sizeof(size)); out.write(charVector.data(), size * sizeof(char)); size = offsets.size(); out.write((char *)&size, sizeof(size)); out.write((char *)offsets.data(), size * sizeof(int)); } void StringArray::Load(std::ifstream& in) { size_t size = 0; in.read((char *)&size, sizeof(size)); charVector.resize(size); in.read(charVector.data(), size * sizeof(char)); in.read((char *)&size, sizeof(size)); offsets.resize(size); in.read((char *)offsets.data(), size * sizeof(int)); } StringIntBiMap::StringIntBiMap(int buckectCount) :m_bucketCount(buckectCount) { keyBuckets.resize(m_bucketCount); valueBuckets.resize(m_bucketCount); } uint64_t StringIntBiMap::FNV64aHash(void *buf, size_t len) { uint64_t hval = 0xcbf29ce484222325ULL; unsigned char *bp = (unsigned char *)buf; unsigned char *be = bp + len; while (bp < be) { hval ^= (uint64_t)*bp++; hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); } return hval % m_bucketCount; } __int64 StringIntBiMap::Add(std::string& key, bool &alreadyAdded) { alreadyAdded = false; auto bucketId = FNV64aHash((void *)key.c_str(), key.length()); auto keyBucket = keyBuckets.begin() + bucketId; auto valueBucket = valueBuckets.begin() + bucketId; auto IdInBucket = keyBucket->Search(key.c_str(), key.length()); if (IdInBucket >= 0) { alreadyAdded = true; return (bucketId << BitOfBucket) + valueBucket->at(IdInBucket); } __int64 lowBitValue = 0; if (valueBucket->size() > 0) { lowBitValue = *(valueBucket->rbegin()) + 1; } valueBucket->push_back(lowBitValue); keyBucket->Append(key.c_str(), key.length()); return (bucketId << BitOfBucket) + lowBitValue; } __int64 StringIntBiMap::Delete(std::string& key) { auto bucketId = FNV64aHash((void *)key.c_str(), key.length()); auto keyBucket = keyBuckets.begin() + bucketId; auto valueBucket = valueBuckets.begin() + bucketId; auto IdInBucket = keyBucket->Search(key.c_str(), key.length()); if (IdInBucket < 0) { return -1; } __int64 lowBitValue = valueBucket->at(IdInBucket); keyBucket->Remove(IdInBucket); valueBucket->erase(valueBucket->begin() + IdInBucket); return (bucketId << BitOfBucket) + lowBitValue; } std::string StringIntBiMap::Search(__int64 value) { __int64 bucketId = value >> BitOfBucket; __int64 lowBitValue = value & ((1LL << BitOfBucket) - 1); auto valueBucket = valueBuckets.begin() + bucketId; auto size = valueBucket->size(); if (size > 16) { __int64 lef = 0; __int64 rig = size; _int64 mid = 0; while (lef < rig) { mid = (lef + rig) >> 1; __int64 cur = valueBucket->at(mid); if (cur == lowBitValue) { return std::move((keyBuckets.begin() + bucketId)->At(mid)); } else if (cur > lowBitValue) { rig = mid; } else { lef = mid + 1; } } } else { for (int i = 0; i < size; i++) { if (valueBucket->at(i) == lowBitValue) { return std::move((keyBuckets.begin() + bucketId)->At(i)); } } } return ""; } void StringIntBiMap::Save(const std::string& filename) { std::ofstream out(filename, std::ios::out | std::ios::binary); out.write((char *)&BitOfBucket, sizeof(BitOfBucket)); out.write((char *)&m_bucketCount, sizeof(m_bucketCount)); for (int i = 0; i < m_bucketCount; i++) { keyBuckets[i].Save(out); auto size = valueBuckets[i].size(); out.write((char *)&size, sizeof(size)); out.write((char *)valueBuckets[i].data(), size * sizeof(__int64)); } out.close(); } void StringIntBiMap::Load(const std::string& filename) { std::ifstream in(filename, std::ios::in | std::ios::binary); in.read((char *)&BitOfBucket, sizeof(BitOfBucket)); in.read((char *)&m_bucketCount, sizeof(m_bucketCount)); keyBuckets.resize(m_bucketCount); valueBuckets.resize(m_bucketCount); for (int i = 0; i < m_bucketCount; i++) { keyBuckets[i].Load(in); size_t size = 0; in.read((char *)&size, sizeof(size)); //LogAsset(size == keyBuckets[i].Size()); valueBuckets[i].resize(size); in.read((char *)valueBuckets[i].data(), size * sizeof(__int64)); } in.close(); } }