  // Get the list of files to search in this level
  FileMetaData* const* files = &files_[level][0];
  if (level == 0) {
    // Level 0 is handled specially: its files may have overlapping key
    // ranges, so every file whose range covers user_key must be searched.
    tmp.reserve(num_files);
    for (uint32_t i = 0; i < num_files; i++) {
      FileMetaData* f = files[i];
      if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
          ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
        tmp.push_back(f);
      }
    }
    if (tmp.empty()) continue;
    std::sort(tmp.begin(), tmp.end(), NewestFirst);
    files = &tmp[0];
    num_files = tmp.size();
  } else {
    // Binary search: on levels > 0 the files are disjoint and sorted, so a
    // key can belong to at most one file.
    uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
    if (index >= num_files) {
      // Not found in this level.
      files = NULL;
      num_files = 0;
    } else {
      tmp2 = files[index];
      if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
        // All of "tmp2" is past any data for user_key
        files = NULL;
        num_files = 0;
      } else {
        files = &tmp2;
        num_files = 1;
      }
    }
  }
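The FindFile() used above is a plain binary search over the level's sorted file list; the sketch below follows db/version_set.cc (treat it as illustrative, the exact code may differ slightly between leveldb versions):

// Return the index of the first file whose largest key is >= "key",
// or files.size() if there is no such file.
int FindFile(const InternalKeyComparator& icmp,
             const std::vector<FileMetaData*>& files,
             const Slice& key) {
  uint32_t left = 0;
  uint32_t right = files.size();
  while (left < right) {
    uint32_t mid = (left + right) / 2;
    const FileMetaData* f = files[mid];
    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
      // "mid" ends before "key": everything at or before "mid" is uninteresting.
      left = mid + 1;
    } else {
      // "mid" may still contain "key": keep it in the search range.
      right = mid;
    }
  }
  return right;
}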
  for (uint32_t i = 0; i < num_files; ++i) {
    // Iterate over the candidate files of this level.
    if (last_file_read != NULL && stats->seek_file == NULL) {
      // stats->seek_file only records the first file that was read
      // without producing the answer.
      stats->seek_file = last_file_read;
      stats->seek_file_level = last_file_read_level;
    }
    FileMetaData* f = files[i];
    last_file_read = f;
    last_file_read_level = level;
  // First look the table up in the LRU cache.
  Cache::Handle* handle = cache_->Lookup(key);
  if (handle == NULL) {
    // Not cached: open the sstable file.
    std::string fname = TableFileName(dbname_, file_number);
    RandomAccessFile* file = NULL;
    Table* table = NULL;
    Status s = env_->NewRandomAccessFile(fname, &file);
    if (s.ok()) {
      s = Table::Open(*options_, file, file_size, &table);
    }
    if (!s.ok()) {
      assert(table == NULL);
      delete file;
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
      return NewErrorIterator(s);
    }
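For completeness, a sketch of the success branch (paraphrased from db/table_cache.cc; TableAndFile and DeleteEntry are the helper struct and deleter defined there), which stores the freshly opened table in the LRU cache so the next lookup for this file number is served from memory:

    // Cache the opened table under "key" (the encoded file number).
    TableAndFile* tf = new TableAndFile;
    tf->file = file;
    tf->table = table;
    handle = cache_->Insert(key, tf, 1, &DeleteEntry);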
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Status status;
  // Acquire the global mutex.
  MutexLock l(&mutex_);
  LoggerId self;
  // Take over the right to write the log.
  AcquireLoggingResponsibility(&self);
  // Check whether the write may proceed (may throttle or switch memtables).
  status = MakeRoomForWrite(false);  // May temporarily release lock and wait
  uint64_t last_sequence = versions_->LastSequence();
  if (status.ok()) {
    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
    last_sequence += WriteBatchInternal::Count(updates);
    // Add to log and apply to memtable.  We can release the lock during
    // this phase since the "logger_" flag protects against concurrent
    // loggers and concurrent writes into mem_.
    {
      assert(logger_ == &self);
      mutex_.Unlock();
      // IO: append the batch to the write-ahead LOG.
      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      if (status.ok() && options.sync) {
        status = logfile_->Sync();
      }
      // Apply the batch to the memtable.
      if (status.ok()) {
        status = WriteBatchInternal::InsertInto(updates, mem_);
      }
      mutex_.Lock();
      assert(logger_ == &self);
    }
    // Publish the new sequence number.
    versions_->SetLastSequence(last_sequence);
  }
  // Release the log-writing responsibility.
  ReleaseLoggingResponsibility(&self);
  return status;
}
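From the client side every write funnels into this function through a WriteBatch. A minimal usage sketch (the database path and keys are made up for illustration):

#include "leveldb/db.h"
#include "leveldb/write_batch.h"

int main() {
  leveldb::DB* db = NULL;
  leveldb::Options options;
  options.create_if_missing = true;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
  if (!s.ok()) return 1;

  leveldb::WriteBatch batch;
  batch.Put("key1", "value1");
  batch.Delete("key2");

  leveldb::WriteOptions wopts;
  wopts.sync = true;             // forces the logfile_->Sync() path above
  s = db->Write(wopts, &batch);  // ends up in DBImpl::Write shown above

  delete db;
  return s.ok() ? 0 : 1;
}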
Write flow control (throttling):
<db/dbimpl.cc>
Status DBImpl::MakeRoomForWrite(bool force) {
  mutex_.AssertHeld();
  assert(logger_ != NULL);
  bool allow_delay = !force;
  Status s;
  while (true) {
    if (!bg_error_.ok()) {
      // Yield previous error
      s = bg_error_;
      break;
    } else if (allow_delay &&
               versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
      // Too many level-0 files: sleep for 1ms so the compaction thread gets
      // more CPU instead of stalling this write completely.
      mutex_.Unlock();
      env_->SleepForMicroseconds(1000);
      allow_delay = false;  // Do not delay a single write more than once
      mutex_.Lock();
    } else if (!force &&
               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
      // There is room in the current memtable: the write may proceed.
      break;
    } else if (imm_ != NULL) {
      // The previous memtable (imm_) has not been compacted yet; wait for it.
      bg_cv_.Wait();
    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
      // Level-0 file count has reached kL0_StopWritesTrigger: stop and wait.
      Log(options_.info_log, "waiting\n");
      bg_cv_.Wait();
    } else {
      // Create a new memtable and log file; the current memtable becomes
      // imm_ and is handed to the compaction thread.
      assert(versions_->PrevLogNumber() == 0);
      uint64_t new_log_number = versions_->NewFileNumber();
      WritableFile* lfile = NULL;
      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
      if (!s.ok()) {
        break;
      }
      delete log_;
      delete logfile_;
      logfile_ = lfile;
      logfile_number_ = new_log_number;
      log_ = new log::Writer(lfile);
      imm_ = mem_;
      has_imm_.Release_Store(imm_);
      mem_ = new MemTable(internal_comparator_);
      mem_->Ref();
      force = false;  // Do not force another compaction if have room
      MaybeScheduleCompaction();
    }
  }
  return s;
}
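For reference, the level-0 trigger constants used above are defined in db/dbformat.h; the values below match the leveldb source of roughly this vintage (they may differ in other releases):

namespace config {
// Level-0 compaction is started when we hit this many files.
static const int kL0_CompactionTrigger = 4;
// Soft limit on number of level-0 files.  We slow down writes at this point.
static const int kL0_SlowdownWritesTrigger = 8;
// Maximum number of level-0 files.  We stop writes at this point.
static const int kL0_StopWritesTrigger = 12;
}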
  // No copying allowed
  MemTable(const MemTable&);
  void operator=(const MemTable&);
};
First, let's look at insertion:
<db/memtable.cc>
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key,
                   const Slice& value) {
  // Entry layout in the arena:
  //   1. internal key size : varint32 (combined length of items 2 + 3)
  //   2. key data          : char[key.size()]
  //   3. tag               : fixed64  (SequenceNumber << 8 | ValueType)
  //   4. value size        : varint32
  //   5. value data        : char[value.size()]
  size_t key_size = key.size();
  size_t val_size = value.size();
  size_t internal_key_size = key_size + 8;
  const size_t encoded_len =
      VarintLength(internal_key_size) + internal_key_size +
      VarintLength(val_size) + val_size;
  char* buf = arena_.Allocate(encoded_len);
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  EncodeFixed64(p, (s << 8) | type);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  assert((p + val_size) - buf == encoded_len);
  table_.Insert(buf);
}
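A concrete, purely illustrative example of this layout: Add(5, kTypeValue, "foo", "bar") produces the following bytes in the arena (kTypeValue is 0x1 and EncodeFixed64 is little-endian):

// Add(5, kTypeValue, "foo", "bar"):
//   internal_key_size = 3 + 8 = 11            -> varint32: 0x0B      (1 byte)
//   key data          = "foo"                                        (3 bytes)
//   tag               = (5 << 8) | 1 = 0x0501 -> 01 05 00 00 00 00 00 00
//   value size        = 3                     -> varint32: 0x03      (1 byte)
//   value data        = "bar"                                        (3 bytes)
// encoded_len = 1 + 11 + 1 + 3 = 16 bytes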
Lookup:
<db/memtable.cc>
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
  Slice memkey = key.memtable_key();
  Table::Iterator iter(&table_);
  iter.Seek(memkey.data());
  if (iter.Valid()) {
    // entry format is:
    //    klength  varint32
    //    userkey  char[klength]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter.key();
    uint32_t key_length;
    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
            Slice(key_ptr, key_length - 8),
            key.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
          value->assign(v.data(), v.size());
          return true;
        }
        case kTypeDeletion:
          *s = Status::NotFound(Slice());
          return true;
      }
    }
  }
  return false;
}
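Get() works in terms of LookupKey, a small helper class from db/dbformat.h; the sketch below is a close paraphrase (details may vary slightly between versions):

// A LookupKey packs "klength | userkey | tag" into one buffer so that the
// memtable key, the internal key and the user key are all just slices of it.
class LookupKey {
 public:
  // Initialize *this for looking up user_key at snapshot "sequence".
  LookupKey(const Slice& user_key, SequenceNumber sequence);
  ~LookupKey();

  // Key suitable for the memtable/skiplist Seek() above.
  Slice memtable_key() const { return Slice(start_, end_ - start_); }
  // Key suitable for internal iterators.
  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
  // The bare user key (no length prefix, no tag).
  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }

 private:
  // start_  -> klength (varint32)
  // kstart_ -> userkey bytes, followed by tag = (sequence << 8) | type
  // end_    -> one past the last byte
  const char* start_;
  const char* kstart_;
  const char* end_;
  char space_[200];  // Avoids allocation for short keys
};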
leveldb study 7 - Version/VersionSet/VersionEdit
http://www.tkk7.com/sandy/archive/2012/03/16/leveldb7.html (2012-03-16)
First, let's look at an important data structure: the META info kept for each sst file.
<db/version_edit.h>
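The struct in question is FileMetaData; the sketch below follows db/version_edit.h (the exact field set can differ slightly between leveldb versions):

struct FileMetaData {
  int refs;
  int allowed_seeks;     // Seeks allowed until compaction
  uint64_t number;       // File number (forms the sstable file name)
  uint64_t file_size;    // File size in bytes
  InternalKey smallest;  // Smallest internal key served by table
  InternalKey largest;   // Largest internal key served by table

  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
};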
  f->allowed_seeks = (f->file_size / 16384);
  if (f->allowed_seeks < 100) f->allowed_seeks = 100;
For the reason, see leveldb's own comment:
// We arrange to automatically compact this file after a certain number of seeks. Let's assume:
// (1) One seek costs 10ms
// (2) Writing or reading 1MB costs 10ms (100MB/s)
// (3) A compaction of 1MB does 25MB of IO:
// 1MB read from this level
// 10-12MB read from next level (boundaries may be misaligned)
// 10-12MB written to next level
// This implies that 25 seeks cost the same as the compaction
// of 1MB of data. I.e., one seek costs approximately the
// same as the compaction of 40KB of data. We are a little
// conservative and allow approximately one seek for every 16KB
// of data before triggering a compaction.
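Plugging a couple of made-up file sizes into the code above shows how the formula behaves:

//   2MB file: allowed_seeks = 2 * 1024 * 1024 / 16384 = 128
//   1MB file: allowed_seeks = 1 * 1024 * 1024 / 16384 =  64 -> clamped to 100
// So every file tolerates at least 100 seeks before a seek-triggered
// compaction is scheduled for it.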
  // Return the files at the given level that overlap the key range [begin, end].
  void GetOverlappingInputs(
      int level,
      const InternalKey* begin,  // NULL means before all keys
      const InternalKey* end,    // NULL means after all keys
      std::vector<FileMetaData*>* inputs);
  // Pick the level that the output of a memtable flush should be placed in.
  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
                                 const Slice& largest_user_key);
  // Number of files at the given level.
  int NumFiles(int level) const { return files_[level].size(); }
  // Return a human readable string that describes this version's contents.
  std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
  VersionSet* vset_;  // VersionSet to which this Version belongs
  Version* next_;     // Next version in linked list
  Version* prev_;     // Previous version in linked list
  int refs_;          // Number of live refs to this version
  WritableFile* descriptor_file_;
  log::Writer* descriptor_log_;
  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;        // == dummy_versions_.prev_
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
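The packing itself is a one-liner; a sketch consistent with the PackSequenceAndType() helper in db/dbformat.cc:

// Pack a sequence number and a value type into a single 64-bit tag.
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
  assert(seq <= kMaxSequenceNumber);
  assert(t <= kValueTypeForSeek);  // kValueTypeForSeek == kTypeValue
  return (seq << 8) | t;
}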
Snapshot (db/snapshot.h): as you can see, a snapshot is really just a sequence number.
class SnapshotImpl : public Snapshot {
 public:
  // Fixed when the snapshot is created and never changed afterwards.
  SequenceNumber number_;
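A minimal usage sketch of the public snapshot API (the key and values are made up): reads issued through the snapshot only see entries whose sequence number is at or below the number_ captured above.

const leveldb::Snapshot* snap = db->GetSnapshot();
db->Put(leveldb::WriteOptions(), "k", "new-value");  // not visible via snap

leveldb::ReadOptions ropts;
ropts.snapshot = snap;
std::string value;
leveldb::Status s = db->Get(ropts, "k", &value);     // sees the pre-Put state
db->ReleaseSnapshot(snap);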
Learn From HBase/Bigtable
http://www.tkk7.com/sandy/archive/2012/03/07/Learn_From_HBase.html (2012-03-07)
1. Build your distributed system out of trusted, proven distributed components. Designing a reliable and robust distributed system is hard. To avoid the SPOF (Single Point Of Failure) problem we spread the risk by placing data on multiple nodes, but this introduces data synchronization and versioning problems, and solving them requires a complex protocol such as Paxos, which drives the system's complexity up. Other problems that have to be solved are distributed locking, event notification, and the sharing of global state; designing all of these takes a great deal of effort and careful study.