From 722fc612bdca51bec6ddcced3a4b5dfa2f9910a1 Mon Sep 17 00:00:00 2001 From: mu <59917266+4eUeP@users.noreply.github.com> Date: Fri, 28 Apr 2023 17:44:59 +0800 Subject: [PATCH] Improvements for sigsegv handler (#1391) --- common/base/cbits/fatalsignal.cpp | 204 +++++++++++++++++++ common/base/cbits/hs_utils.cpp | 13 -- common/base/hstream-common-base.cabal | 1 + common/base/include/hs_common.h | 10 - common/hstream/HStream/Utils/Common.hs | 6 +- common/hstream/cbits/hs_utils.cpp | 20 -- common/hstream/include/hs_common.h | 2 +- common/hstream/test/HStream/UtilsSpec.hs | 2 +- common/stats/test/HStream/StatsSpec.hs | 4 +- hstream/app/client.hs | 2 +- hstream/app/server.hs | 4 +- hstream/src/HStream/Client/Execute.hs | 4 +- hstream/test/HStream/AdminCommandSpec.hs | 2 +- hstream/test/HStream/HandlerSpec.hs | 2 +- hstream/test/HStream/RegressionSpec.hs | 4 +- hstream/test/HStream/RunQuerySpec.hs | 4 +- hstream/test/HStream/RunSQLSpec.hs | 2 +- hstream/test/HStream/StatsIntegrationSpec.hs | 2 +- 18 files changed, 225 insertions(+), 63 deletions(-) create mode 100644 common/base/cbits/fatalsignal.cpp delete mode 100644 common/hstream/cbits/hs_utils.cpp diff --git a/common/base/cbits/fatalsignal.cpp b/common/base/cbits/fatalsignal.cpp new file mode 100644 index 000000000..fa94400f2 --- /dev/null +++ b/common/base/cbits/fatalsignal.cpp @@ -0,0 +1,204 @@ +#include "hs_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// Boost.Stacktrace does not show the details +// +// void handle_fatal_signal(int signum) { +// ::signal(signum, SIG_DFL); +// std::cerr << "handle_fatal_signal(): Caught coredump signal " << signum +// << '\n'; +// std::cerr << "Backtrace:\n" << boost::stacktrace::stacktrace() << '\n'; +// ::raise(SIGTRAP); +// } + +// The following code is copied from "logdevice/server/fatalsignal.cpp" + +// a collection of rocksdb caches +struct RocksDBCachesInfo { + std::weak_ptr block_cache; + std::weak_ptr block_cache_compressed; + std::weak_ptr metadata_block_cache; +}; + +extern RocksDBCachesInfo g_rocksdb_caches; + +RocksDBCachesInfo g_rocksdb_caches; + +static size_t unmap_count = 0, page_count = 0; + +static const size_t PAGE_SHIFT = 12; +static const size_t PAGE_SIZE = (1UL << 12); +static const size_t PAGE_MASK = ~((1UL << PAGE_SHIFT) - 1); + +struct Segment { + size_t start; + size_t end; + Segment(int s, int e) : start(s), end(e) {} +}; + +static Segment* sarray = nullptr; +static size_t s_index = 0; +static const size_t SEGMAP_SIZE = 1024 * 1024 * 1024; +static const size_t max_num_segs = SEGMAP_SIZE / sizeof(Segment); + +static void unmap_callback(void* entry, size_t charge) { + if (s_index >= max_num_segs) { + return; + } + // align both start and end to page boundaries + sarray[s_index].start = (size_t)entry & PAGE_MASK; + sarray[s_index].end = ((size_t)entry + charge + PAGE_SIZE - 1) & PAGE_MASK; + ++s_index; +} + +static void safe_print(const char msg[]) { + auto _ = write(2, msg, std::strlen(msg)); +} + +static void safe_print_unsigned(size_t num) { + char buf[64]; + size_t i = 0; + if (num == 0) { + safe_print("0"); + return; + } + while (num != 0) { + buf[i++] = (num % 10) + '0'; + num /= 10; + } + buf[i] = '\0'; + for (size_t j = 0, k = i - 1; j < k; ++j, --k) { + size_t tmp = buf[j]; + buf[j] = buf[k]; + buf[k] = tmp; + } + safe_print(buf); +} + +// Construct this on startup, since it allocates on the heap and we don't want +// to do that in a signal handler. Leak it so we don't have to worry about +// destruction order. +folly::symbolizer::SafeStackTracePrinter* gStackTrace = + new folly::symbolizer::SafeStackTracePrinter(); + +static void handle_fatal_signal(int sig) { + static std::atomic insegv(0); + pthread_t old_val(0); + + if (!insegv.compare_exchange_strong(old_val, pthread_self())) { + /* another thread is in the handler, suspend this one forever */ + if (pthread_self() != old_val) { + pause(); + _exit(EXIT_FAILURE); + } + /* recursive call from the same thread, give up and dump core*/ + raise(SIGTRAP); + _exit(EXIT_FAILURE); + } + + safe_print("handle_fatal_signal(): Caught coredump signal "); + safe_print_unsigned(sig); + safe_print("\n"); + + gStackTrace->printStackTrace(true); + + // allocate a big enough VM to hold all segments + sarray = (Segment*)mmap(nullptr, SEGMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (sarray == (Segment*)MAP_FAILED) { + safe_print("handle_fatal_signal(): allocate seg array failed.\n"); + raise(SIGTRAP); + _exit(EXIT_FAILURE); + } + + for (auto cache_weak_ptr : + {g_rocksdb_caches.block_cache, g_rocksdb_caches.block_cache_compressed, + g_rocksdb_caches.metadata_block_cache}) { + std::shared_ptr cache = cache_weak_ptr.lock(); + if (cache != nullptr) { + cache->ApplyToAllCacheEntries(unmap_callback, false); + } + } + + if (s_index == 0) { + safe_print("handle_fatal_signal(): No rocksdb caches found.\n"); + munmap((void*)sarray, SEGMAP_SIZE); + raise(SIGTRAP); + return; + } + + safe_print("handle_fatal_signal(): processed segments: "); + safe_print_unsigned(s_index); + safe_print("\n"); + + std::sort(sarray, sarray + s_index, + [](const Segment& a, const Segment& b) -> bool { + return a.start < b.start; + }); + + size_t cur_start = sarray[0].start, cur_end = sarray[0].end; + for (size_t i = 1; i <= s_index; ++i) { + // unmap a segment + if (i == s_index || sarray[i].start > cur_end) { + size_t len = cur_end - cur_start; + munmap((void*)cur_start, len); + ++unmap_count; + page_count += (len >> PAGE_SHIFT); + if (i < s_index) { + cur_start = sarray[i].start; + cur_end = sarray[i].end; + } + } + // could merge + else { + cur_end = std::max(cur_end, sarray[i].end); + } + } + + safe_print("handle_fatal_signal(): unmapped pages - unmap calls: "); + safe_print_unsigned(page_count); + safe_print(" - "); + safe_print_unsigned(unmap_count); + safe_print("\n"); + + // unmap the seg array + munmap((void*)sarray, SEGMAP_SIZE); + + raise(SIGTRAP); +} + +static void setup_signal_handler(int signum, void (*handler)(int)) { + struct sigaction sa; + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + int rv; + rv = sigaction(signum, &sa, nullptr); + ld_check(rv == 0); +} + +extern "C" { +// ---------------------------------------------------------------------------- + +void setup_fatal_signal_handler() { + for (int signum : {SIGSEGV, SIGABRT, SIGBUS, SIGQUIT, SIGILL, SIGFPE}) { + setup_signal_handler(signum, handle_fatal_signal); + } +} + +// ---------------------------------------------------------------------------- +} diff --git a/common/base/cbits/hs_utils.cpp b/common/base/cbits/hs_utils.cpp index 30a5e6790..e87655078 100644 --- a/common/base/cbits/hs_utils.cpp +++ b/common/base/cbits/hs_utils.cpp @@ -1,21 +1,8 @@ #include "hs_common.h" -#include -#include -#include - extern "C" { // ---------------------------------------------------------------------------- -void handle_fatal_signal(int signum) { - ::signal(signum, SIG_DFL); - std::cerr << "handle_fatal_signal(): Caught coredump signal " << signum - << '\n'; - std::cerr << "Backtrace:\n" << boost::stacktrace::stacktrace() << '\n'; - ::raise(SIGTRAP); -} - -void setup_sigsegv_handler() { ::signal(SIGSEGV, &handle_fatal_signal); } // ---------------------------------------------------------------------------- } diff --git a/common/base/hstream-common-base.cabal b/common/base/hstream-common-base.cabal index 4e5035a48..4fec4eb6e 100644 --- a/common/base/hstream-common-base.cabal +++ b/common/base/hstream-common-base.cabal @@ -66,6 +66,7 @@ library hs_cpp_lib.h cxx-sources: + cbits/fatalsignal.cpp cbits/hs_struct.cpp cbits/hs_utils.cpp diff --git a/common/base/include/hs_common.h b/common/base/include/hs_common.h index 180ae385f..4c6145c29 100644 --- a/common/base/include/hs_common.h +++ b/common/base/include/hs_common.h @@ -12,16 +12,6 @@ extern "C" { void setup_sigsegv_handler(); -// ---------------------------------------------------------------------------- -// Stats -// -// See: cbits/stats.cpp - -// ---------------------------------------------------------------------------- -// Query -// -// See: cbits/query.cpp - // ---------------------------------------------------------------------------- #ifdef __cplusplus } /* end extern "C" */ diff --git a/common/hstream/HStream/Utils/Common.hs b/common/hstream/HStream/Utils/Common.hs index d10a2daf7..7832e2af2 100644 --- a/common/hstream/HStream/Utils/Common.hs +++ b/common/hstream/HStream/Utils/Common.hs @@ -1,6 +1,6 @@ module HStream.Utils.Common ( maybeToEither - , setupSigsegvHandler + , setupFatalSignalHandler , newRandomText ) where @@ -13,8 +13,8 @@ import System.Random maybeToEither :: b -> Maybe a -> Either b a maybeToEither errmsg = maybe (Left errmsg) Right -foreign import ccall unsafe "hs_common.h setup_sigsegv_handler" - setupSigsegvHandler :: IO () +foreign import ccall unsafe "hs_common.h setup_fatal_signal_handler" + setupFatalSignalHandler :: IO () newRandomText :: Int -> IO Text newRandomText n = Text.pack . take n . randomRs ('a', 'z') <$> newStdGen diff --git a/common/hstream/cbits/hs_utils.cpp b/common/hstream/cbits/hs_utils.cpp deleted file mode 100644 index 5875b7961..000000000 --- a/common/hstream/cbits/hs_utils.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "hs_common.h" - -#include -#include - -extern "C" { -// ---------------------------------------------------------------------------- - -void handle_fatal_signal(int signum) { - ::signal(signum, SIG_DFL); - std::cerr << "handle_fatal_signal(): Caught coredump signal " << signum - << '\n'; - std::cerr << "Backtrace:\n" << boost::stacktrace::stacktrace() << '\n'; - ::raise(SIGTRAP); -} - -void setup_sigsegv_handler() { ::signal(SIGSEGV, &handle_fatal_signal); } - -// ---------------------------------------------------------------------------- -} diff --git a/common/hstream/include/hs_common.h b/common/hstream/include/hs_common.h index ff6b7b12b..fc10af113 100644 --- a/common/hstream/include/hs_common.h +++ b/common/hstream/include/hs_common.h @@ -14,7 +14,7 @@ extern "C" { // ---------------------------------------------------------------------------- // Utils -void setup_sigsegv_handler(); +void setup_fatal_signal_handler(); // ---------------------------------------------------------------------------- // Stats diff --git a/common/hstream/test/HStream/UtilsSpec.hs b/common/hstream/test/HStream/UtilsSpec.hs index c8643e15d..64fa2f215 100644 --- a/common/hstream/test/HStream/UtilsSpec.hs +++ b/common/hstream/test/HStream/UtilsSpec.hs @@ -22,4 +22,4 @@ timeIntervalSpec = describe "TimeInterval" $ do utilsSpec :: Spec utilsSpec = describe "HStream.Utils" $ do -- TODO - it "setupSigsegvHandler" $ setupSigsegvHandler `shouldReturn` () + it "setupFatalSignalHandler" $ setupFatalSignalHandler `shouldReturn` () diff --git a/common/stats/test/HStream/StatsSpec.hs b/common/stats/test/HStream/StatsSpec.hs index 338b90afe..ad7e0557a 100644 --- a/common/stats/test/HStream/StatsSpec.hs +++ b/common/stats/test/HStream/StatsSpec.hs @@ -9,13 +9,13 @@ import Test.Hspec import HStream.Stats import HStream.StatsSpecUtils (mkTimeSeriesTest) -import HStream.Utils (runConc, setupSigsegvHandler) +import HStream.Utils (runConc, setupFatalSignalHandler) {-# ANN module ("HLint: ignore Use head" :: String) #-} spec :: Spec spec = do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler statsSpec threadedStatsSpec diff --git a/hstream/app/client.hs b/hstream/app/client.hs index 4ca481344..388546fef 100644 --- a/hstream/app/client.hs +++ b/hstream/app/client.hs @@ -67,7 +67,7 @@ import HStream.Utils (ResourceType (..), formatResult, mkGRPCClientConfWithSSL, pattern EnumPB, - setupSigsegvHandler) + setupFatalSignalHandler) import qualified HStream.Utils.Aeson as AesonComp main :: IO () diff --git a/hstream/app/server.hs b/hstream/app/server.hs index 03bf31335..c8a4c6625 100644 --- a/hstream/app/server.hs +++ b/hstream/app/server.hs @@ -90,7 +90,7 @@ import qualified HStream.ThirdParty.Protobuf as Proto import HStream.Utils (ResourceType (..), getProtoTimestamp, pattern EnumPB, - setupSigsegvHandler) + setupFatalSignalHandler) main :: IO () @@ -98,7 +98,7 @@ main = getConfig >>= app app :: ServerOpts -> IO () app config@ServerOpts{..} = do - setupSigsegvHandler + setupFatalSignalHandler Log.setDefaultLogger _serverLogLevel _serverLogWithColor Log.LogStderr Log.setLogDeviceDbgLevel' _ldLogLevel diff --git a/hstream/src/HStream/Client/Execute.hs b/hstream/src/HStream/Client/Execute.hs index 73299386f..e7603e8cf 100644 --- a/hstream/src/HStream/Client/Execute.hs +++ b/hstream/src/HStream/Client/Execute.hs @@ -43,7 +43,7 @@ import HStream.Utils (Format, HStreamClientApi, getServerResp, mkGRPCClientConfWithSSL, serverNodeToSocketAddr, - setupSigsegvHandler) + setupFatalSignalHandler) executeShowPlan :: HStreamCliContext -> ShowObject -> IO () executeShowPlan ctx showObject = @@ -147,7 +147,7 @@ initCliContext RefinedCliConnOpts{..} = do currentServer <- newMVar addr let sslConfig = clientSSLConfig clientConfig let ctx = HStreamCliContext {..} - setupSigsegvHandler + setupFatalSignalHandler connected <- waitForServerToStart (retryTimeout * 1000000) addr sslConfig case connected of Nothing -> errorWithoutStackTrace "Connection timed out. Please check the server URI and try again." diff --git a/hstream/test/HStream/AdminCommandSpec.hs b/hstream/test/HStream/AdminCommandSpec.hs index 728b78288..a567729cf 100644 --- a/hstream/test/HStream/AdminCommandSpec.hs +++ b/hstream/test/HStream/AdminCommandSpec.hs @@ -23,7 +23,7 @@ import qualified HStream.Utils.Aeson as Aeson spec :: Spec spec = describe "HStream.AdminCommnadSpec" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR adminCommandStatsSpec diff --git a/hstream/test/HStream/HandlerSpec.hs b/hstream/test/HStream/HandlerSpec.hs index 02b21ac8b..55228e57c 100644 --- a/hstream/test/HStream/HandlerSpec.hs +++ b/hstream/test/HStream/HandlerSpec.hs @@ -27,7 +27,7 @@ import HStream.Utils spec :: Spec spec = describe "HStream.HandlerSpec" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR streamSpec diff --git a/hstream/test/HStream/RegressionSpec.hs b/hstream/test/HStream/RegressionSpec.hs index 80715d3bc..9d0b69889 100644 --- a/hstream/test/HStream/RegressionSpec.hs +++ b/hstream/test/HStream/RegressionSpec.hs @@ -16,7 +16,7 @@ import Test.Hspec import HStream.SpecUtils import HStream.Store.Logger (pattern C_DBG_ERROR, setLogDeviceDbgLevel) -import HStream.Utils (setupSigsegvHandler) +import HStream.Utils (setupFatalSignalHandler) import Network.GRPC.HighLevel.Client import Network.GRPC.LowLevel @@ -24,7 +24,7 @@ import Network.GRPC.LowLevel spec :: Spec spec = aroundAll provideHstreamApi $ describe "HStream.RegressionSpec" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR it "#391_JOIN" $ \api -> do diff --git a/hstream/test/HStream/RunQuerySpec.hs b/hstream/test/HStream/RunQuerySpec.hs index 32c38f206..e306ae3e8 100644 --- a/hstream/test/HStream/RunQuerySpec.hs +++ b/hstream/test/HStream/RunQuerySpec.hs @@ -17,7 +17,7 @@ import HStream.SpecUtils import HStream.Store.Logger (pattern C_DBG_ERROR, setLogDeviceDbgLevel) import HStream.Utils (TaskStatus (..), - setupSigsegvHandler) + setupFatalSignalHandler) getQueryResponseIdIs :: T.Text -> Query -> Bool getQueryResponseIdIs targetId Query {..} = queryId == targetId @@ -75,7 +75,7 @@ terminateQuery qid = withGRPCClient clientConfig $ \client -> do spec :: Spec spec = aroundAll provideHstreamApi $ describe "HStream.RunQuerySpec" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR source1 <- runIO $ newRandomText 20 diff --git a/hstream/test/HStream/RunSQLSpec.hs b/hstream/test/HStream/RunSQLSpec.hs index 3d10a23db..049aa72db 100644 --- a/hstream/test/HStream/RunSQLSpec.hs +++ b/hstream/test/HStream/RunSQLSpec.hs @@ -18,7 +18,7 @@ import HStream.Utils hiding (newRandomText) spec :: Spec spec = describe "HStream.RunSQLSpec" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR baseSpec diff --git a/hstream/test/HStream/StatsIntegrationSpec.hs b/hstream/test/HStream/StatsIntegrationSpec.hs index 5ddd39f86..7e0a74fbb 100644 --- a/hstream/test/HStream/StatsIntegrationSpec.hs +++ b/hstream/test/HStream/StatsIntegrationSpec.hs @@ -15,7 +15,7 @@ import HStream.Utils spec :: Spec spec = describe "HStream.StatsIntegrationTest" $ do - runIO setupSigsegvHandler + runIO setupFatalSignalHandler runIO $ setLogDeviceDbgLevel C_DBG_ERROR perStreamTimeSeriesSpec