Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class HomeObjectConan(ConanFile):
name = "homeobject"
version = "4.1.18"
version = "4.1.19"

homepage = "https://github.com/eBay/HomeObject"
description = "Blob Store built on HomeStore"
Expand All @@ -26,6 +26,7 @@ class HomeObjectConan(ConanFile):
"coverage": ['True', 'False'],
"sanitize": ['True', 'False'],
}

default_options = {
'shared': False,
'fPIC': True,
Expand Down
11 changes: 5 additions & 6 deletions src/lib/homestore_backend/gc_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task(uint8_t priority
const auto pg_id = EXvchunk->m_pg_id.value();
m_hs_home_object->gc_manager()->incr_pg_pending_gc_task(pg_id);

if (!m_hs_home_object->can_chunks_in_pg_be_gc(pg_id)) {
if (!m_hs_home_object->is_pg_alive(pg_id)) {
LOGDEBUGMOD(gcmgr, "chunk_id={} belongs to pg {}, which is not eligible for gc at this moment!",
move_from_chunk, pg_id)
m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id);
Expand Down Expand Up @@ -504,7 +504,7 @@ void GCManager::pdev_gc_actor::handle_recovered_gc_task(
}

// we have no gc_task_guard for recovered gc task, so we need to do this manually to make sure the gc task can be
// marked as completed and the pg can be marked as available for new gc task
// marked as completed
on_gc_task_completed(priority, pg_id, move_from_chunk, move_to_chunk, vchunk_id, true, 0);

GCLOGD(RECOVERD_GC_TASK_ID, pg_id, NO_SHARD_ID,
Expand Down Expand Up @@ -797,9 +797,8 @@ bool GCManager::pdev_gc_actor::copy_valid_data(
move_from_chunk);
}

// check if all the pbas in the valid_blob_indexes are in move_from_chunk, if not, it means the
// shard is being modified during gc, we can not guarantee the data consistency, so we fail this gc
// task and let it be retried later.
// check if all the pbas in the valid_blob_indexes are in move_from_chunk, if not, we cancel this task and retry
// later.
for (const auto& [blob, v] : valid_blob_indexes) {
auto pba = v.pbas();
if (pba.chunk_num() != move_from_chunk) {
Expand Down Expand Up @@ -1100,7 +1099,7 @@ bool GCManager::pdev_gc_actor::purge_reserved_chunk(chunk_id_t chunk, const uint
RELEASE_ASSERT(!vchunk->m_pg_id.has_value(),
"chunk_id={} is expected to be a reserved chunk, and not belong to a pg", chunk);
RELEASE_ASSERT(vchunk->m_state == ChunkState::GC,
"chunk_id={} is a reserved chunk, expected to have a GC state, but actuall state is {} ", chunk,
"chunk_id={} is a reserved chunk, expected to have a GC state, but the actual state is {} ", chunk,
vchunk->m_state);

// Clear all rreqs on the reserved chunk BEFORE reset() resets its allocator.
Expand Down
6 changes: 6 additions & 0 deletions src/lib/homestore_backend/hs_homeobject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,12 @@ void HSHomeObject::on_replica_restart() {
// and log replay can complete successfully.

gc_mgr_->handle_all_recovered_gc_tasks();

// Redo destroy pg to reclaim pg resources for destroyed stale pgs.
for (const auto pg_id : destoryed_stale_pgs_) {
LOGI("Redo destroy pg for stale destroyed pg {}", pg_id);
destroy_pg_resource(pg_id);
}
});
}

Expand Down
7 changes: 5 additions & 2 deletions src/lib/homestore_backend/hs_homeobject.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class HSHomeObject : public HomeObjectImpl {

// Shard migration info: tracks shards that need migration from v1 to v2 format
std::vector< shard_id_t > shards_to_migrate_;
std::vector< pg_id_t > destoryed_stale_pgs_;

public:
// Old version shard_info_superblk (v0.01) - for backward compatibility testing and migration
Expand Down Expand Up @@ -332,7 +333,7 @@ class HSHomeObject : public HomeObjectImpl {

register_me_to_farm();
attach_gather_cb(std::bind(&PGMetrics::on_gather, this));
blk_size = pg_.repl_dev_->get_blk_size();
blk_size = homestore::data_service().get_blk_size();
}
~PGMetrics() { deregister_me_from_farm(); }
PGMetrics(const PGMetrics&) = delete;
Expand Down Expand Up @@ -881,6 +882,8 @@ class HSHomeObject : public HomeObjectImpl {
*/
bool pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine = false);

void destroy_pg_resource(pg_id_t pg_id);

bool pause_pg_state_machine(pg_id_t pg_id);

bool resume_pg_state_machine(pg_id_t pg_id);
Expand Down Expand Up @@ -977,7 +980,7 @@ class HSHomeObject : public HomeObjectImpl {
* @param pg_id The ID of the PG whose shards are to be destroyed.
* @return True if the chunks in the PG can be garbage collected, false otherwise.
*/
bool can_chunks_in_pg_be_gc(pg_id_t pg_id) const;
bool is_pg_alive(pg_id_t pg_id) const;

bool pg_exists(pg_id_t pg_id) const;

Expand Down
123 changes: 67 additions & 56 deletions src/lib/homestore_backend/hs_pg_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -688,29 +688,41 @@ std::optional< pg_id_t > HSHomeObject::get_pg_id_with_group_id(group_id_t group_

// Forwards to pg_destroy() for the given pg, using pg_destroy's default for the second argument.
// The boolean result of pg_destroy() is intentionally not propagated, since this function returns void.
void HSHomeObject::_destroy_pg(pg_id_t pg_id) {
    pg_destroy(pg_id);
}

// Reclaims the resources held by the given pg.
//
// The teardown steps run in a fixed order: shards, HomeStore resources, the pg index table, the pg
// superblk, and finally the pg's chunks are handed back to the dev heap. The process is aborted via
// RELEASE_ASSERT if the chunks cannot be returned.
//
// @param pg_id  id of the pg whose resources are destroyed.
void HSHomeObject::destroy_pg_resource(pg_id_t pg_id) {
destroy_shards(pg_id);
destroy_hs_resources(pg_id);
destroy_pg_index_table(pg_id);
destroy_pg_superblk(pg_id);

// Return the pg's chunks to the dev heap. This must happen only after the pg superblk has been
// destroyed, to avoid multiple pgs using the same chunks.
bool res = chunk_selector_->return_pg_chunks_to_dev_heap(pg_id);
RELEASE_ASSERT(res, "Failed to return pg={} chunks to dev_heap", pg_id);
LOGI("resource of pg={} is destroyed", pg_id);
}

bool HSHomeObject::pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine) {
// Baseline resync concern: if pg_destroy partially completes before a crash (e.g., the index
// table is destroyed but the PG superblk is not), log replay on recovery might attempt to write
// to the destroyed index table and fail.
//
// This is not an issue. Before pg_destroy is triggered during baseline resync,
// m_rd_sb->last_snapshot_lsn is persisted to snapshot.get_last_log_idx(). Any log at or
// before that LSN is skipped on recovery and never replayed.
//
// See raft_repl_dev::need_skip_processing for details.
if (need_to_pause_pg_state_machine && !pause_pg_state_machine(pg_id)) {
LOGI("Failed to pause pg state machine, pg_id={}", pg_id);
return false;
}

LOGI("Destroying pg={}", pg_id);
mark_pg_destroyed(pg_id);

// we have the assumption that after pg is marked as destroyed, it will not be marked as alive again.
// TODO:: if this assumption is broken, we need to handle it.
gc_mgr_->drain_pg_pending_gc_task(pg_id);

destroy_shards(pg_id);
destroy_hs_resources(pg_id);
destroy_pg_index_table(pg_id);
destroy_pg_superblk(pg_id);

// return pg chunks to dev heap
// which must be done after destroying pg super blk to avoid multiple pg use same chunks
bool res = chunk_selector_->return_pg_chunks_to_dev_heap(pg_id);
RELEASE_ASSERT(res, "Failed to return pg={} chunks to dev_heap", pg_id);

LOGI("pg={} is destroyed", pg_id);
destroy_pg_resource(pg_id);
return true;
}

Expand All @@ -724,29 +736,8 @@ PGManager::NullResult HSHomeObject::_exit_pg(uuid_t group_id, peer_id_t peer_id,
LOGI("group_id is nil, nothing to exit, trace_id={}", tid);
return folly::makeUnexpected(PGError::INVALID_ARG);
}
pg_id_t pg_id{0};
{
auto lg = std::shared_lock(_pg_lock);
auto iter = std::find_if(_pg_map.begin(), _pg_map.end(), [group_id](const auto& entry) {
return pg_repl_dev(*entry.second).group_id() == group_id;
});
if (iter != _pg_map.end()) {
pg_id = iter->first;
} else {
// There is a known case during adding member: the new member may think itself already in group but actually
// not, so the pg is not created yet.
LOGI("no pg found, group_id={}, trace_id={}", group_id, tid);
}
}
if (pg_id != 0 && !pg_destroy(pg_id)) {
// don't need to pause state machine here, this api is called during member leaving or the member is not in the
// cluster actually.
LOGE("failed to destroy pg={}, group_id={}, trace_id={}", pg_id, group_id, tid);
return folly::makeUnexpected(PGError::UNKNOWN);
}
LOGI("pg is cleaned, going to destroy repl_dev, group_id={}, trace_id={}", group_id, tid);
// TODO pass peer_id into destroy_repl_dev for peer validation
// destroy_repl_dev will leave raft group

// mark pg as destroyed and then permanent_destroy will call pg_destroy to reclaim pg resource.
auto ret = hs_repl_service().destroy_repl_dev(group_id);
if (ret == ReplServiceError::SERVER_NOT_FOUND) {
LOGW("repl dev not found, ignore, group_id={}, trace_id={}", group_id, tid);
Expand All @@ -756,6 +747,7 @@ PGManager::NullResult HSHomeObject::_exit_pg(uuid_t group_id, peer_id_t peer_id,
LOGE("Failed to destroy repl dev for group_id={}, error={}, trace_id={}", group_id, ret, tid);
return folly::makeUnexpected(toPgError(ret));
}

return folly::Unit();
}

Expand Down Expand Up @@ -800,7 +792,7 @@ void HSHomeObject::mark_pg_destroyed(pg_id_t pg_id) {
LOGD("pg={} is marked as destroyed", pg_id);
}

bool HSHomeObject::can_chunks_in_pg_be_gc(pg_id_t pg_id) const {
bool HSHomeObject::is_pg_alive(pg_id_t pg_id) const {
auto lg = std::scoped_lock(_pg_lock);
auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id));
if (hs_pg == nullptr) {
Expand Down Expand Up @@ -859,7 +851,11 @@ void HSHomeObject::destroy_pg_superblk(pg_id_t pg_id) {
}

hs_pg->pg_sb_.destroy();
destroy_snapshot_sb(hs_pg->repl_dev_->group_id());

// FIXME:: if repl_dev does not exist, how do we get the group_id to destroy the snapshot sb? We should store
// group_id in the pg superblk to avoid this issue.
if (hs_pg->repl_dev_) { destroy_snapshot_sb(hs_pg->repl_dev_->group_id()); }

hs_pg->snp_rcvr_info_sb_.destroy();
hs_pg->snp_rcvr_shard_list_sb_.destroy();

Expand All @@ -871,9 +867,12 @@ void HSHomeObject::destroy_pg_superblk(pg_id_t pg_id) {
}

void HSHomeObject::add_pg_to_map(unique< HS_PG > hs_pg) {
RELEASE_ASSERT(hs_pg->pg_info_.replica_set_uuid == hs_pg->repl_dev_->group_id(),
"PGInfo replica set uuid mismatch with ReplDev instance for {}",
boost::uuids::to_string(hs_pg->pg_info_.replica_set_uuid));
if (hs_pg->repl_dev_) {
RELEASE_ASSERT(hs_pg->pg_info_.replica_set_uuid == hs_pg->repl_dev_->group_id(),
"PGInfo replica set uuid mismatch with ReplDev instance for {}",
boost::uuids::to_string(hs_pg->pg_info_.replica_set_uuid));
}

auto lg = std::scoped_lock(_pg_lock);
auto id = hs_pg->pg_info_.id;
auto [it1, _] = _pg_map.try_emplace(id, std::move(hs_pg));
Expand Down Expand Up @@ -923,19 +922,39 @@ void HSHomeObject::on_pg_meta_blk_found(sisl::byte_view const& buf, void* meta_c
LOGI("on_pg_meta_blk_found is called")
homestore::superblk< pg_info_superblk > pg_sb(_pg_meta_name);
pg_sb.load(buf, meta_cookie);
const auto pg_id = pg_sb->id;
shared< homestore::ReplDev > rdev;

auto v = hs_repl_service().get_repl_dev(pg_sb->replica_set_uuid);
if (v.hasError()) {
// TODO: We need to raise an alert here, since without pg repl_dev all operations on that pg will fail
LOGE("open_repl_dev for group_id={} has failed, pg={}", boost::uuids::to_string(pg_sb->replica_set_uuid),
pg_sb->id);
return;
// We have a pg_super_blk but cannot find the corresponding repl_dev. This happens when repl_dev
// is marked as destroyed (m_rd_sb->destroy_pending = 0x1) in raft_repl_dev::leave(), but a crash
// occurs before pg_destroy is called.
//
// repl_dev is marked as destroyed in three cases:
//
// 1. Forced member exit: exit_pg calls destroy_repl_dev, then raft_repl_dev::leave() and
// pg_destroy directly.
//
// 2. raft_repl_dev::destroy_group: proposes a HS_CTRL_DESTROY journal log; on commit,
// raft_repl_dev::leave() is called, then RaftReplDev::permanent_destroy, which calls
// on_destroy → pg_destroy to clean up the pg resource and superblk.
//
// 3. Member removal: the leader calls repl_dev::remove_member; the removed member receives
// nuraft::cb_func::RemovedFromCluster, which triggers repl_dev::leave(), then
// on_destroy and pg_destroy via RaftReplDev::permanent_destroy.
//
// When a destroyed repl_dev is recovered, it is skipped (see RaftReplService::load_repl_dev()),
// so no log replay occurs. We must therefore destroy the pg resource when no repl_dev is found.
destoryed_stale_pgs_.emplace_back(pg_id);
} else {
rdev = std::move(v.value());
}
auto pg_id = pg_sb->id;

std::vector< chunk_num_t > p_chunk_ids(pg_sb->get_chunk_ids(), pg_sb->get_chunk_ids() + pg_sb->num_chunks);
bool set_pg_chunks_res = chunk_selector_->recover_pg_chunks(pg_id, std::move(p_chunk_ids));
auto uuid_str = boost::uuids::to_string(pg_sb->index_table_uuid);
auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), std::move(v.value()));
auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), rdev);
if (!set_pg_chunks_res) {
hs_pg->pg_state_.set_state(PGStateMask::DISK_DOWN);
hs_pg->repl_dev_->set_stage(homestore::repl_dev_stage_t::UNREADY);
Expand All @@ -949,8 +968,6 @@ void HSHomeObject::on_pg_meta_blk_found(sisl::byte_view const& buf, void* meta_c
hs_pg->index_table_ = it->second.index_table;
it->second.pg_id = pg_id;
} else {
RELEASE_ASSERT(hs_pg->pg_sb_->state == PGState::DESTROYED, "IndexTable should be recovered before PG");
hs_pg->index_table_ = nullptr;
LOGI("Index table not found for destroyed pg={}, index_table_uuid={}", pg_id, uuid_str);
}

Expand Down Expand Up @@ -1262,17 +1279,11 @@ uint32_t HSHomeObject::get_pg_tombstone_blob_count(pg_id_t pg_id) const {
}

void HSHomeObject::refresh_pg_statistics(pg_id_t pg_id) {
RELEASE_ASSERT(is_pg_alive(pg_id), "pg={} should be alive", pg_id);
auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id));
RELEASE_ASSERT(hs_pg, "Failed to get pg={} for statistics refresh", pg_id);
auto pg_index_table = hs_pg->index_table_;
if (!pg_index_table) {
if (hs_pg->pg_sb_->state == PGState::DESTROYED) {
LOGI("pg={} is destroyed, skip statistics refresh", pg_id);
} else {
RELEASE_ASSERT(false, "index table is not found for pg={} and not in PGState::DESTROYED state", pg_id);
}
return;
}
RELEASE_ASSERT(pg_index_table, "pg is alive, index table should be found for pg={}", pg_id);

// Step 1: Scan index table to count active and tombstone blobs in one pass
uint64_t active_count = 0;
Expand Down
20 changes: 18 additions & 2 deletions src/lib/homestore_backend/replication_state_machine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ void ReplicationStateMachine::write_snapshot_obj(std::shared_ptr< homestore::sna
set_snapshot_context(context); // Update the snapshot context in case apply_snapshot is not called
auto hs_pg = home_object_->get_hs_pg(m_snp_rcv_handler->get_context_pg_id());
hs_pg->pg_state_.clear_state(PGStateMask::BASELINE_RESYNC);
// we only reset this if destroying pg happens in BR case. for other cases (on_destroy and _exit_pg),
Comment thread
xiaoxichen marked this conversation as resolved.
// since this replica will leave the PG and no later logs will be received, no need to reset this.
reset_no_space_left_error_info();
repl_dev()->reset_latch_lsn();
return;
}

Expand Down Expand Up @@ -499,7 +503,7 @@ void ReplicationStateMachine::write_snapshot_obj(std::shared_ptr< homestore::sna
if (home_object_->pg_exists(pg_data->pg_id())) {
LOGI("pg already exists, clean pg resources before snapshot, pg={} {}", pg_data->pg_id(), log_suffix);
// Need to pause state machine before destroying the PG, if fail, let raft retry.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comments out of date, as well as we dont have a branch that returns false as of now.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let`s remove this out-of-date comments after addressing other comments for this PR

if (!home_object_->pg_destroy(pg_data->pg_id(), true /* pause state machine */)) {
if (!home_object_->pg_destroy(pg_data->pg_id())) {
LOGE("failed to destroy existing pg, let raft retry, pg={} {}", pg_data->pg_id(), log_suffix);
return;
}
Expand Down Expand Up @@ -1030,7 +1034,19 @@ void ReplicationStateMachine::on_log_replay_done(const homestore::group_id_t& gr
const auto pg_id = pg_id_opt.value();
RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! fatal error!", pg_id);

const auto& shards_in_pg = (const_cast< HSHomeObject::HS_PG* >(home_object_->_get_hs_pg_unlocked(pg_id)))->shards_;
const auto hs_pg = (const_cast< HSHomeObject::HS_PG* >(home_object_->get_hs_pg(pg_id)));
RELEASE_ASSERT(hs_pg, "Failed to get pg={} when log replay done", pg_id);
if (hs_pg->pg_sb_->state == PGState::DESTROYED) {
// If we reach here, we have a repl_dev (log replay, and thus on_log_replay_done, only happens when a repl_dev
// exists), but the state of the related pg is destroyed. This can only happen when a crash occurs after the pg
// is destroyed but before pg_super_blk is destroyed in the baseline resync case.

// We need to do nothing here, since the first snapshot message (obj_id.shard_seq_num == 0) will be received
// again and pg_destroy will be called again when handling that first snapshot message.
return;
}

const auto& shards_in_pg = hs_pg->shards_;
auto chunk_selector = home_object_->chunk_selector();

for (const auto& shard_iter : shards_in_pg) {
Expand Down
2 changes: 1 addition & 1 deletion src/lib/homestore_backend/tests/homeobj_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class HomeObjectFixture : public ::testing::Test {

HSHomeObject::_hs_chunk_size = SISL_OPTIONS["chunk_size"].as< uint64_t >() * Mi;
_obj_inst = std::dynamic_pointer_cast< HSHomeObject >(g_helper->build_new_homeobject());

// Used to export metrics, it should be called after init_homeobject
if (SISL_OPTIONS["enable_http"].as< bool >()) { g_helper->app->start_http_server(); }
if (!g_helper->is_current_testcase_restarted()) {
Expand Down
Loading
Loading