diff --git a/common/workunit/workunit.cpp b/common/workunit/workunit.cpp
index 78b2a1c864b..a927e795349 100644
--- a/common/workunit/workunit.cpp
+++ b/common/workunit/workunit.cpp
@@ -3673,6 +3673,7 @@ EnumMapping actions[] = {
     { WUActionPause, "pause" },
     { WUActionPauseNow, "pausenow" },
     { WUActionResume, "resume" },
+    { WUActionGenerateDebugInfo, "debuginfo" },
     { WUActionSize, NULL },
 };
 
diff --git a/common/workunit/workunit.hpp b/common/workunit/workunit.hpp
index 743672a82f3..0ae02fc3e6f 100644
--- a/common/workunit/workunit.hpp
+++ b/common/workunit/workunit.hpp
@@ -152,7 +152,8 @@ enum WUAction
     WUActionPause = 5,
     WUActionPauseNow = 6,
     WUActionResume = 7,
-    WUActionSize = 8
+    WUActionGenerateDebugInfo = 8,
+    WUActionSize = 9, // NB: must be last
 };
 
diff --git a/esp/services/ws_workunits/ws_workunitsHelpers.cpp b/esp/services/ws_workunits/ws_workunitsHelpers.cpp
index 30bf1715fe8..4c35663c515 100644
--- a/esp/services/ws_workunits/ws_workunitsHelpers.cpp
+++ b/esp/services/ws_workunits/ws_workunitsHelpers.cpp
@@ -2704,7 +2704,8 @@ bool WsWuInfo::validateWUAssociatedFile(const char* file, WUFileType type)
         //which contains Post Mortem files.
         Owned<IFile> postMortemFile = createIFile(name.str());
         validatePostMortemFile(postMortemFile, file, validated);
-        return validated;
+        if (validated)
+            return true;
     }
 
     if (strieq(file, name.str()))
diff --git a/helm/hpcc/templates/eclagent.yaml b/helm/hpcc/templates/eclagent.yaml
index ac91fdb6a8f..29fe33e4422 100644
--- a/helm/hpcc/templates/eclagent.yaml
+++ b/helm/hpcc/templates/eclagent.yaml
@@ -88,7 +88,7 @@ data:
 {{- $appCmd := printf "%s %s %s _HPCC_ARGS_" $apptype (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECL Agent" "optional" false )) }}
 {{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $appCmd)) | indent 12 }}
             env:
-{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
+{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
 {{- include "hpcc.generateImageEnv" . | nindent 12 }}
             workingDir: /var/lib/HPCCSystems
             volumeMounts:
diff --git a/helm/hpcc/templates/eclccserver.yaml b/helm/hpcc/templates/eclccserver.yaml
index c3d98bfc90d..3aba72c5686 100644
--- a/helm/hpcc/templates/eclccserver.yaml
+++ b/helm/hpcc/templates/eclccserver.yaml
@@ -88,7 +88,7 @@ data:
 {{- $eclccserverCmd := printf "eclccserver %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECLCC Server" "optional" false)) }}
 {{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $eclccserverCmd)) | indent 12 }}
             env:
-{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
+{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
 {{- include "hpcc.generateImageEnv" . | nindent 12 }}
             workingDir: /tmp
             volumeMounts:
diff --git a/helm/hpcc/templates/thor.yaml b/helm/hpcc/templates/thor.yaml
index 4336a22056c..df95266fcbe 100644
--- a/helm/hpcc/templates/thor.yaml
+++ b/helm/hpcc/templates/thor.yaml
@@ -115,7 +115,7 @@ data:
 {{- $agentCmd := printf "%s %s %s _HPCC_ARGS_" $eclAgentType (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }}
 {{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $agentCmd)) | indent 12 }}
             env:
-{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 1) | nindent 12 }}
+{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }}
 {{- include "hpcc.generateImageEnv" . | nindent 12 }}
             workingDir: /var/lib/HPCCSystems
             volumeMounts:
@@ -183,7 +183,7 @@ data:
 {{- $thorManagerCmd := printf "thormaster_lcr %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }}
 {{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $thorManagerCmd)) | indent 12 }}
             env:
-{{- include "hpcc.mergeEnvironments" (dict "env" .env "defaultArenas" 2) | nindent 12 }}
+{{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 2) | nindent 12 }}
 {{- include "hpcc.generateImageEnv" . | nindent 12 }}
             workingDir: /var/lib/HPCCSystems
             volumeMounts:
@@ -253,7 +253,7 @@ data:
 {{- $thorWorkerCmd := printf "thorslave_lcr %s %s _HPCC_ARGS_ --slaveport=%d" (include "hpcc.configArg" $configCtx.me) (include "hpcc.daliArg" (dict "root" $configCtx.root "component" "Thor" "optional" false)) $slavePort }}
 {{ include "hpcc.addCommandAndLifecycle" ($configCtx | merge (dict "command" $thorWorkerCmd)) | indent 12 }}
             env:
-{{- $env := append ($configCtx.me.env | default list) (dict "name" "MY_CONTAINER_NAME" "value" (printf "%s-%d" $thorWorkerJobName $containerNum)) }}
+{{- $env := concat ($configCtx.me.env | default list) (list (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_") (dict "name" "MY_CONTAINER_NAME" "value" (printf "%s-%d" $thorWorkerJobName $containerNum))) }}
 {{- include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 8) | nindent 12 }}
 {{- include "hpcc.generateImageEnv" $configCtx | nindent 12 }}
             workingDir: /var/lib/HPCCSystems
diff --git a/initfiles/CMakeLists.txt b/initfiles/CMakeLists.txt
index 65fed209129..acc3501a34e 100644
--- a/initfiles/CMakeLists.txt
+++ b/initfiles/CMakeLists.txt
@@ -39,10 +39,11 @@ if ( PLATFORM AND UNIX )
     configure_file("${CMAKE_CURRENT_SOURCE_DIR}/bash-vars.in" "${CMAKE_BINARY_DIR}/bash-vars")
     set(bash-vars "${CMAKE_BINARY_DIR}/bash-vars")
 
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/bin/.gdbinit.in" "${CMAKE_BINARY_DIR}/bin/.gdbinit" @ONLY)
+    install ( FILES "${CMAKE_BINARY_DIR}/bin/.gdbinit" DESTINATION ${EXEC_DIR} COMPONENT Runtime )
+    install ( FILES bin/post-mortem-gdb DESTINATION ${EXEC_DIR} COMPONENT Runtime )
+    install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime )
     if ( CONTAINERIZED )
-        install ( FILES bin/.gdbinit DESTINATION ${EXEC_DIR} COMPONENT Runtime )
-        install ( FILES bin/post-mortem-gdb DESTINATION ${EXEC_DIR} COMPONENT Runtime )
-        install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime )
         install ( PROGRAMS bin/k8s_postjob_clearup.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime )
         install ( PROGRAMS bin/check_executes DESTINATION ${EXEC_DIR} COMPONENT Runtime )
     else ()
diff --git a/initfiles/bin/.gdbinit b/initfiles/bin/.gdbinit
deleted file mode 100644
index 763ce6453ff..00000000000
--- a/initfiles/bin/.gdbinit
+++ /dev/null
@@ -1,3 +0,0 @@
-# These commands will be executed by gdb on startup
-add-auto-load-safe-path /opt/HPCCSystems/lib/libjlib.so-gdb.py
-set print object 1
diff --git a/initfiles/bin/.gdbinit.in b/initfiles/bin/.gdbinit.in
new file mode 100644
index 00000000000..201312d6fbd
--- /dev/null
+++ b/initfiles/bin/.gdbinit.in
@@ -0,0 +1,3 @@
+# These commands will be executed by gdb on startup
+add-auto-load-safe-path @DESTDIR@/opt/HPCCSystems/lib/libjlib.so-gdb.py
+set print object 1
diff --git a/initfiles/lib/libjlib.so-gdb.py b/initfiles/lib/libjlib.so-gdb.py
index 423626d5b0f..432c20da810 100644
--- a/initfiles/lib/libjlib.so-gdb.py
+++ b/initfiles/lib/libjlib.so-gdb.py
@@ -165,7 +165,7 @@ def invoke (self, arg, from_tty):
         ignoreVars = set(['statsMetaData', 'roAttributes', 'roAttributeValues', 'RandomMain'])
         ignorematch = re.compile(" StatisticsMapping ")
         varmatch = re.compile("[^a-zA-Z_0-9:]([a-zA-Z_][a-z0-9_A-Z:]*)(\\[.*])?;$")
-        goodfilematch = re.compile("^File /hpcc-dev/HPCC-Platform/(.*[.]cpp):$")
+        goodfilematch = re.compile("^File /.*/HPCC-Platform/(.*[.]cpp):$")
         filematch = re.compile("^File (.*):$")
         infile = None
         file_written = False
diff --git a/system/jlib/jcontainerized.cpp b/system/jlib/jcontainerized.cpp
index aa83cde683f..ca081c6fcb1 100644
--- a/system/jlib/jcontainerized.cpp
+++ b/system/jlib/jcontainerized.cpp
@@ -20,7 +20,7 @@
 
 namespace k8s {
 
-static StringBuffer myPodName, myContainerName;
+static StringBuffer myPodName, myContainerName, myJobName;
 
 const char *queryMyPodName()
 {
@@ -32,6 +32,11 @@ const char *queryMyContainerName()
     return myContainerName;
 }
 
+const char *queryMyJobName()
+{
+    return myJobName;
+}
+
 KeepJobs translateKeepJobs(const char *keepJob)
 {
     if (!isEmptyString(keepJob)) // common case
@@ -508,6 +513,9 @@ MODULE_INIT(INIT_PRIORITY_STANDARD)
         getEnvVar("MY_CONTAINER_NAME", myContainerName.clear());
         if (myContainerName.isEmpty())
             myContainerName.set(myPodName); // if identical (standard case), not set by templates
+        getEnvVar("MY_JOB_NAME", myJobName.clear()); // only k8s jobs will have this set
+        if (myJobName.isEmpty())
+            myJobName.set(myPodName); // if no explicit job name (not a k8s job) then use pod name
     };
     if (isContainerized())
         podInfoInitCBId = installConfigUpdateHook(updateFunc, true);
diff --git a/system/jlib/jcontainerized.hpp b/system/jlib/jcontainerized.hpp
index ccb65f70d01..d868be659a7 100644
--- a/system/jlib/jcontainerized.hpp
+++ b/system/jlib/jcontainerized.hpp
@@ -28,6 +28,7 @@ namespace k8s {
 jlib_decl std::vector> getPodNodes(const char *selector);
 jlib_decl const char *queryMyPodName();
 jlib_decl const char *queryMyContainerName();
+jlib_decl const char *queryMyJobName();
 
 enum class KeepJobs { none, podfailures, all };
 jlib_decl KeepJobs translateKeepJobs(const char *keepJobs);
diff --git a/system/jlib/jexcept.cpp b/system/jlib/jexcept.cpp
index 4aecef4f168..17f22ca40b9 100644
--- a/system/jlib/jexcept.cpp
+++ b/system/jlib/jexcept.cpp
@@ -1815,7 +1815,9 @@ bool getDebuggerGetStacksCmd(StringBuffer &output)
         output.append("Unable to capture stacks");
         return false;
     }
-    return output.appendf("gdb --batch -n -ex 'thread apply all bt' %s %u", exePath, GetCurrentProcessId());
+
+    output.appendf("gdb --batch -ix %s/.gdbinit -x %s/post-mortem-gdb %s %u", hpccBuildInfo.execDir, hpccBuildInfo.execDir, exePath, GetCurrentProcessId());
+    return true;
 }
 
 bool getAllStacks(StringBuffer &output)
diff --git a/thorlcr/graph/thgraph.cpp b/thorlcr/graph/thgraph.cpp
index a8235c57985..4cb97264ad7 100644
--- a/thorlcr/graph/thgraph.cpp
+++ b/thorlcr/graph/thgraph.cpp
@@ -2810,16 +2810,6 @@ void CJobBase::startJob()
     keyFileCacheLimit = (querySlaves()+1)*2;
     setKeyIndexCacheSize(keyFileCacheLimit);
     PROGLOG("Key file cache size set to: %d", keyFileCacheLimit);
-    if (getOptBool("dumpStacks")) // mainly as an example of printAllStacks() usage
-    {
-        StringBuffer output;
-        if (getAllStacks(output))
-        {
-            IERRLOG("%s", output.str());
-        }
-        else
-            IWARNLOG("Failed to capture process stacks: %s", output.str());
-    }
 
     // NB: these defaults match defaults in jfile rename retry mechanism
     constexpr unsigned defaultNumRenameRetries = 4;
diff --git a/thorlcr/graph/thgraphmaster.cpp b/thorlcr/graph/thgraphmaster.cpp
index 9603d8a6b54..a13db163620 100644
--- a/thorlcr/graph/thgraphmaster.cpp
+++ b/thorlcr/graph/thgraphmaster.cpp
@@ -20,10 +20,11 @@
 #include
 #include
 #include
-#include "jprop.hpp"
+#include "jcontainerized.hpp"
 #include "jexcept.hpp"
 #include "jiter.ipp"
 #include "jlzw.hpp"
+#include "jprop.hpp"
 #include "jsocket.hpp"
 #include "jset.hpp"
 #include "jsort.hpp"
@@ -1892,26 +1893,110 @@ bool CJobMaster::go()
             if (flags & SubscribeOptionAction)
             {
                 job.markWuDirty();
-                bool abort = false;
-                bool pause = false;
                 wu.forceReload();
                 WUAction action = wu.getAction();
-                if (action==WUActionPause)
+                if ((WUActionPause==action) || (WUActionPauseNow==action))
                 {
                     // pause after current subgraph
-                    pause = true;
-                }
-                else if (action==WUActionPauseNow)
-                {
-                    // abort current subgraph
-                    abort = true;
-                    pause = true;
-                }
-                if (pause)
-                {
+                    bool abort = (action==WUActionPauseNow); // abort current subgraph
                     PROGLOG("Pausing job%s", abort?" [now]":"");
                     job.pause(abort);
                 }
+                else if (action==WUActionGenerateDebugInfo)
+                {
+                    StringBuffer dir;
+                    if (!getConfigurationDirectory(globals->queryPropTree("Directories"), "debug", "thor", globals->queryProp("@name"), dir))
+                    {
+                        if (!isContainerized())
+                        {
+                            appendCurrentDirectory(dir, false);
+                            addPathSepChar(dir);
+                            dir.append("debuginfo"); // use ./debuginfo in non-containerized mode
+                        }
+                        else
+                        {
+                            IWARNLOG("Failed to get debug directory");
+                            return;
+                        }
+                    }
+                    addPathSepChar(dir);
+                    dir.append(job.queryWuid());
+                    if (isContainerized())
+                    {
+                        addPathSepChar(dir);
+                        dir.append(k8s::queryMyJobName());
+                    }
+                    addPathSepChar(dir);
+                    CDateTime now;
+                    now.setNow();
+                    unsigned year, month, day, hour, minute, second, nano;
+                    now.getDate(year, month, day);
+                    now.getTime(hour, minute, second, nano);
+                    VStringBuffer dateStr("%04d%02d%02d-%02d%02d%02d", year, month, day, hour, minute, second);
+                    dir.append(dateStr);
+
+                    auto managerCaptureFunc = [&dir]()
+                    {
+                        return captureDebugInfo(dir, "thormanager", nullptr);
+                    };
+                    std::future<std::vector<std::string>> managerResultsFuture = std::async(std::launch::async, managerCaptureFunc);
+
+                    std::vector<std::string> capturedFiles;
+                    auto responseFunc = [&capturedFiles](unsigned worker, MemoryBuffer &mb)
+                    {
+                        bool res;
+                        mb.read(res);
+                        if (!res)
+                        {
+                            Owned<IException> e = deserializeException(mb);
+                            VStringBuffer msg("Failed to get stack trace from worker %u", worker);
+                            IWARNLOG(e, msg);
+                        }
+                        StringAttr file;
+                        while (true)
+                        {
+                            mb.read(file);
+                            if (file.isEmpty())
+                                break;
+                            capturedFiles.push_back(file.get());
+                        }
+                    };
+                    VStringBuffer cmd("<debuginfo dir='%s'/>", dir.str());
+                    job.issueWorkerDebugCmd(cmd, 0, responseFunc);
+                    std::vector<std::string> managerResults = managerResultsFuture.get();
+                    capturedFiles.insert(capturedFiles.end(), managerResults.begin(), managerResults.end());
+
+                    VStringBuffer description("debuginfo-%s", dateStr.str());
+                    if (isContainerized())
+                    {
+                        VStringBuffer archiveFilename("debuginfo-%s.tar.gz", dateStr.str());
+                        VStringBuffer tarCmd("cd %s && tar -czf %s --exclude=%s --remove-files *", dir.str(), archiveFilename.str(), archiveFilename.str());
+                        if (0 != system(tarCmd))
+                        {
+                            OWARNLOG("Failed to create tarball of debuginfo");
+                            return;
+                        }
+                        Owned<IWorkUnit> lw = &wu.lock();
+                        Owned<IWUQuery> query = lw->updateQuery();
+                        VStringBuffer archiveFilePath("%s/%s", dir.str(), archiveFilename.str());
+                        query->addAssociatedFile(FileTypePostMortem, archiveFilePath, "localhost", description, 0, 0, 0);
+                    }
+                    else
+                    {
+                        Owned<IWorkUnit> lw = &wu.lock();
+                        Owned<IWUQuery> query = lw->updateQuery();
+                        for (auto &file: capturedFiles)
+                        {
+                            RemoteFilename rfn;
+                            rfn.setRemotePath(file.c_str());
+                            StringBuffer localPath;
+                            rfn.getLocalPath(localPath);
+                            StringBuffer host;
+                            rfn.queryEndpoint().getEndpointHostText(host);
+                            query->addAssociatedFile(FileTypeLog, localPath, host, description, 0, 0, 0);
+                        }
+                    }
+                }
             }
         }
     } workunitStateChangeHandler(*this, *workunit);
@@ -2089,6 +2174,45 @@ void CJobMaster::pause(bool doAbort)
     }
 }
 
+void CJobMaster::issueWorkerDebugCmd(const char *rawText, unsigned workerNum, std::function<void (unsigned, MemoryBuffer &)> responseFunc)
+{
+    mptag_t replyTag = createReplyTag();
+    ICommunicator &comm = queryNodeComm();
+    CMessageBuffer mbuf;
+    mbuf.append(DebugRequest);
+    mbuf.append(queryKey());
+    serializeMPtag(mbuf, replyTag);
+    mbuf.append(rawText);
+    rank_t rank = workerNum ? workerNum : RANK_ALL_OTHER; // 0 == all workers
+    if (!comm.send(mbuf, rank, managerWorkerMpTag, MP_ASYNC_SEND))
+    {
+        DBGLOG("Failed to send debug info to slave");
+        throwUnexpected();
+    }
+
+    rank = workerNum ? workerNum : RANK_ALL;
+    unsigned numToRecv = workerNum ? 1 : queryNodes();
+    while (numToRecv)
+    {
+        rank_t sender;
+        mbuf.clear();
+        unsigned recvTimeoutCount = 0;
+        while (!comm.recv(mbuf, rank, replyTag, &sender, SHORTTIMEOUT))
+        {
+            if (queryAborted())
+                return;
+            ++recvTimeoutCount;
+            if (recvTimeoutCount == 10)
+                throw makeStringExceptionV(0, "Timed out waiting for debugcmd response from worker %u", workerNum);
+            IWARNLOG("Waiting for debugcmd response from worker %u", workerNum);
+        }
+        while (mbuf.remaining())
+            responseFunc(sender, mbuf);
+
+        numToRecv--;
+    }
+}
+
 bool CJobMaster::queryCreatedFile(const char *file)
 {
     StringBuffer scopedName;
diff --git a/thorlcr/graph/thgraphmaster.ipp b/thorlcr/graph/thgraphmaster.ipp
index ee836e059e6..433ff4605e3 100644
--- a/thorlcr/graph/thgraphmaster.ipp
+++ b/thorlcr/graph/thgraphmaster.ipp
@@ -180,6 +180,7 @@ public:
     void saveSpills();
     bool go();
    void pause(bool abort);
+    void issueWorkerDebugCmd(const char *rawText, unsigned workerNum, std::function<void (unsigned, MemoryBuffer &)> responseFunc);
 
     virtual IConstWorkUnit &queryWorkUnit() const
     {
diff --git a/thorlcr/graph/thgraphslave.cpp b/thorlcr/graph/thgraphslave.cpp
index 4c8065b555c..7e31640b07d 100644
--- a/thorlcr/graph/thgraphslave.cpp
+++ b/thorlcr/graph/thgraphslave.cpp
@@ -16,6 +16,7 @@
 ############################################################################## */
 
 #include "jlib.hpp"
+#include "jcontainerized.hpp"
 #include "jlzw.hpp"
 #include "jhtree.hpp"
 #include "rmtfile.hpp"
@@ -1888,7 +1889,47 @@ double CJobSlave::getWorkUnitValueReal(const char *prop, double defVal) const
 
 void CJobSlave::debugRequest(MemoryBuffer &msg, const char *request) const
 {
-    if (watchdog) watchdog->debugRequest(msg, request);
+    Owned<IPropertyTree> req = createPTreeFromXMLString(request);
+    const char *command = req->queryName();
+
+    if (strieq(command, "print"))
+    {
+        if (watchdog) watchdog->debugRequest(msg, req);
+    }
+    else if (strieq(command, "debuginfo"))
+    {
+        try
+        {
+            StringBuffer dir;
+            if (!req->getProp("@dir", dir))
+                throw makeStringException(0, "debuginfo command missing 'dir' attribute");
+            // NB: stacks are for all channels, but file naming is based on 1st channel
+            rank_t myRank = queryJobChannel(0).queryMyRank();
+            StringBuffer suffix;
+            suffix.append((unsigned)myRank);
+
+            std::vector<std::string> capturedFiles = captureDebugInfo(dir, "thorworker", suffix);
+            msg.append(true);
+            // this info is not really needed by containerized, where all files will be on same mount point
+            for (auto &file : capturedFiles)
+            {
+                RemoteFilename rfn;
+                rfn.setLocalPath(file.c_str());
+                StringBuffer fullPath;
+                rfn.getRemotePath(fullPath);
+                msg.append(fullPath.str());
+            }
+            msg.append("");
+        }
+        catch (IException *e)
+        {
+            msg.append(false);
+            serializeException(e, msg);
+            e->Release();
+        }
+    }
+    else
+        throw makeStringExceptionV(5300, "Command '%s' not supported by Thor", command);
 }
 
 IGraphTempHandler *CJobSlave::createTempHandler(bool errorOnMissing)
diff --git a/thorlcr/master/thgraphmanager.cpp b/thorlcr/master/thgraphmanager.cpp
index 14d40893116..475432c45dd 100644
--- a/thorlcr/master/thgraphmanager.cpp
+++ b/thorlcr/master/thgraphmanager.cpp
@@ -246,41 +246,21 @@ class CJobManager : public CSimpleInterface, implements IJobManager, implements
         FlushingStringBuffer response(&ssock, false, MarkupFmt_XML, false, false, queryDummyContextLogger());
         response.startDataset("Debug", NULL, (unsigned) -1);
-        if (strncmp(command,"print", 5) == 0)
+        if (strieq(command, "print"))
         {
             const char *edgeId = queryXml->queryProp("@edgeId");
             if (!edgeId)
                 throw MakeStringException(5300, "Debug command requires edgeId");
-
-            ICommunicator &comm = job->queryNodeComm();
-            CMessageBuffer mbuf;
-            mbuf.append(DebugRequest);
-            mbuf.append(job->queryKey());
-            mptag_t replyTag = createReplyTag();
-            serializeMPtag(mbuf, replyTag);
-            mbuf.append(rawText);
-            if (!comm.send(mbuf, RANK_ALL_OTHER, managerWorkerMpTag, MP_ASYNC_SEND))
-            {
-                DBGLOG("Failed to send debug info to slave");
-                throwUnexpected();
-            }
-            unsigned nodes = job->queryNodes();
             response.appendf("", graphId, edgeId);
-            while (nodes)
+            auto responseFunc = [&response](unsigned worker, MemoryBuffer &mb)
             {
-                rank_t sender;
-                mbuf.clear();
-                comm.recv(mbuf, RANK_ALL, replyTag, &sender, 10000);
-                while (mbuf.remaining())
-                {
-                    StringAttr row;
-                    mbuf.read(row);
-                    response.append(row);
-                }
-                nodes--;
-            }
+                StringAttr row;
+                mb.read(row);
+                response.append(row);
+            };
+            job->issueWorkerDebugCmd(rawText.str(), 0, responseFunc);
             response.append("");
         }
-        else if (strncmp(command,"quit", 4) == 0)
+        else if (strieq(command, "quit"))
        {
             DBGLOG("ABORT detected from user during debug session");
             Owned<IThorException> e = MakeThorException(TE_WorkUnitAborting, "User signalled abort during debug session");
@@ -288,7 +268,7 @@ class CJobManager : public CSimpleInterface, implements IJobManager, implements
             response.appendf("");
         }
         else
-            throw MakeStringException(5300, "Command not supported by Thor");
+            throw makeStringExceptionV(5300, "Command '%s' not supported by Thor", command);
 
         response.flush(true);
     }
diff --git a/thorlcr/slave/slwatchdog.cpp b/thorlcr/slave/slwatchdog.cpp
index b466c6a8be5..d4ce27d74f2 100644
--- a/thorlcr/slave/slwatchdog.cpp
+++ b/thorlcr/slave/slwatchdog.cpp
@@ -141,10 +141,8 @@ class CGraphProgressHandlerBase : public CInterfaceOf, implements
             DBGLOG("Stopped watchdog");
         }
     }
-    virtual void debugRequest(MemoryBuffer &msg, const char *request) const override
+    virtual void debugRequest(MemoryBuffer &msg, const IPropertyTree *req) const override
     {
-        Owned<IPropertyTree> req = createPTreeFromXMLString(request);
-
         StringBuffer edgeString;
         req->getProp("@edgeId", edgeString);
 
diff --git a/thorlcr/slave/slwatchdog.hpp b/thorlcr/slave/slwatchdog.hpp
index e9b43bb9b9f..fdb61bf15fd 100644
--- a/thorlcr/slave/slwatchdog.hpp
+++ b/thorlcr/slave/slwatchdog.hpp
@@ -27,7 +27,7 @@ interface ISlaveWatchdog : extends IInterface
     virtual void startGraph(CGraphBase &graph) = 0;
     virtual void stopGraph(CGraphBase &graph, MemoryBuffer *mb=NULL) = 0;
     virtual void stop() = 0;
-    virtual void debugRequest(MemoryBuffer &msg, const char *request) const = 0;
+    virtual void debugRequest(MemoryBuffer &msg, const IPropertyTree *req) const = 0;
 };
 
 ISlaveWatchdog *createProgressHandler(bool udp=false);
diff --git a/thorlcr/thorutil/thormisc.cpp b/thorlcr/thorutil/thormisc.cpp
index 85c15ef2007..17e43771bba 100644
--- a/thorlcr/thorutil/thormisc.cpp
+++ b/thorlcr/thorutil/thormisc.cpp
@@ -15,6 +15,8 @@
     limitations under the License.
 ############################################################################## */
 
+#include
+
 #ifndef _WIN32
 #include
 #include
@@ -1669,3 +1671,49 @@ void saveWuidToFile(const char *wuid)
     wuidFileIO->write(0, strlen(wuid), wuid);
     wuidFileIO->close();
 }
+
+std::vector<std::string> captureDebugInfo(const char *_dir, const char *prefix, const char *suffix)
+{
+    if (!recursiveCreateDirectory(_dir))
+    {
+        IWARNLOG("Failed to create debug directory: %s", _dir);
+        return {};
+    }
+
+    StringBuffer dir(_dir);
+    addPathSepChar(dir);
+
+    // utility function to build filename based on prefix, suffix, and extension
+    auto getFilename = [&](StringBuffer &result, const char *name, const char *ext) -> StringBuffer &
+    {
+        result.append(dir);
+        if (!isEmptyString(prefix))
+        {
+            result.append(prefix);
+            result.append('_');
+        }
+        result.append(name);
+        if (!isEmptyString(suffix))
+        {
+            result.append('_');
+            result.append(suffix);
+        }
+        if (!isEmptyString(ext))
+        {
+            result.append('.');
+            result.append(ext);
+        }
+        return result;
+    };
+    StringBuffer stacksFName;
+    getFilename(stacksFName, "stacks", "txt");
+    StringBuffer gdbCmd;
+    getDebuggerGetStacksCmd(gdbCmd);
+    gdbCmd.append(" > ").append(stacksFName);
+    if (0 != system(gdbCmd.str()))
+    {
+        OWARNLOG("Failed to run gdb to capture stack info. Cmd = %s", gdbCmd.str());
+        return { };
+    }
+    return { stacksFName.str() }; // JCSMORE capture/return other files
+}
\ No newline at end of file
diff --git a/thorlcr/thorutil/thormisc.hpp b/thorlcr/thorutil/thormisc.hpp
index 9858848c552..5900502e5d8 100644
--- a/thorlcr/thorutil/thormisc.hpp
+++ b/thorlcr/thorutil/thormisc.hpp
@@ -723,4 +723,6 @@ class graph_decl CThorPerfTracer : protected PerfTracer
 
 extern graph_decl void saveWuidToFile(const char *wuid);
 
+extern graph_decl std::vector<std::string> captureDebugInfo(const char *dir, const char *prefix, const char *suffix);
+
 #endif