Skip to content

Commit

Permalink
Restrict getting the restore process start time to CRaC API
Browse files Browse the repository at this point in the history
There is a race condition on the existence of the criu restore
process and retrieving its start time, when --restore-detached
is passed to criu restore. This patch restrict retrieving the
process start time to restoring via CRaC which is not affected.

Issues: eclipse-openj9#20214
Signed-off-by: Nathan Henderson <[email protected]>
  • Loading branch information
ThanHenderson authored and rmnattas committed Nov 1, 2024
1 parent 9603d39 commit ada79db
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 59 deletions.
27 changes: 14 additions & 13 deletions runtime/vm/CRIUHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1583,7 +1583,6 @@ criuCheckpointJVMImpl(JNIEnv *env,
I_32 syslogBufferSize = 0;
UDATA oldVMState = VM_VMHelpers::setVMState(currentThread, J9VMSTATE_CRIU_SUPPORT_CHECKPOINT_PHASE_START);
UDATA notSafeToCheckpoint = 0;
UDATA criuRestorePid = 0;
U_32 intGhostFileLimit = 0;
IDATA criuDumpReturnCode = 0;
bool restoreFailure = false;
Expand Down Expand Up @@ -1885,19 +1884,21 @@ criuCheckpointJVMImpl(JNIEnv *env,
}

if (hasDumpSucceeded) {
/* Calculate restore time excluding `criu restore ...` for MXBean API. */
criuRestorePid = j9sysinfo_get_ppid();
systemReturnCode = j9sysinfo_get_process_start_time(criuRestorePid, &restoreNanoUTCTime);
if (0 != systemReturnCode) {
currentExceptionClass = vm->checkpointState.criuSystemRestoreExceptionClass;
nlsMsgFormat = j9nls_lookup_message(
J9NLS_DO_NOT_PRINT_MESSAGE_TAG | J9NLS_DO_NOT_APPEND_NEWLINE,
J9NLS_VM_CRIU_J9_GET_PROCESS_START_TIME_FAILURE,
NULL);
restoreFailure = true;
/* Calculate restore time for CRaC MXBean API. */
if (J9_ARE_ALL_BITS_SET(vm->checkpointState.flags, J9VM_CRAC_IS_CHECKPOINT_ENABLED)) {
UDATA cracRestorePid = j9sysinfo_get_ppid();
systemReturnCode = j9sysinfo_get_process_start_time(cracRestorePid, &restoreNanoUTCTime);
if (0 != systemReturnCode) {
currentExceptionClass = vm->checkpointState.criuSystemRestoreExceptionClass;
nlsMsgFormat = j9nls_lookup_message(
J9NLS_DO_NOT_PRINT_MESSAGE_TAG | J9NLS_DO_NOT_APPEND_NEWLINE,
J9NLS_VM_CRIU_J9_GET_PROCESS_START_TIME_FAILURE,
NULL);
restoreFailure = true;
}
vm->checkpointState.processRestoreStartTimeInNanoseconds = (I_64)restoreNanoUTCTime;
Trc_VM_criu_process_restore_start_after_dump(currentThread, cracRestorePid, vm->checkpointState.processRestoreStartTimeInNanoseconds);
}
vm->checkpointState.processRestoreStartTimeInNanoseconds = (I_64)restoreNanoUTCTime;
Trc_VM_criu_process_restore_start_after_dump(currentThread, criuRestorePid, vm->checkpointState.processRestoreStartTimeInNanoseconds);

/* Load restore arguments from restore file or env vars. */
switch (loadRestoreArguments(currentThread, optionsFileChars, envFileChars)) {
Expand Down
16 changes: 0 additions & 16 deletions test/functional/cmdLineTests/criu/criu_nonPortable.xml
Original file line number Diff line number Diff line change
Expand Up @@ -209,22 +209,6 @@
<output type="failure" caseSensitive="yes" regex="no">User requested Java dump using</output>
</test>

<test id="Create CRIU checkpoint image and restore once - testGetProcessRestoreStartTime">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ $XTRACE_CRIU$ $STD_CMD_OPTS$" $MAINCLASS_TIMECHANGE$ testGetProcessRestoreStartTime 1 false false</command>
<output type="success" caseSensitive="no" regex="no">Killed</output>
<output type="success" caseSensitive="yes" regex="no">PASSED: InternalCRIUSupport.getProcessRestoreStartTime()</output>
<output type="failure" caseSensitive="yes" regex="no">CRIU is not enabled</output>
<output type="failure" caseSensitive="yes" regex="no">Operation not permitted</output>
<output type="failure" caseSensitive="yes" regex="no">FAILED: InternalCRIUSupport.getProcessRestoreStartTime()</output>
<!-- If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. -->
<output type="success" caseSensitive="yes" regex="no">Thread pid mismatch</output>
<output type="success" caseSensitive="yes" regex="no">do not match expected</output>
<output type="success" caseSensitive="yes" regex="no">Unable to create a thread:</output>
<!-- In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. -->
<output type="failure" caseSensitive="yes" regex="no">Could not dump the JVM processes, err=-70</output>
<output type="failure" caseSensitive="yes" regex="no">User requested Java dump using</output>
</test>

<test id="Create CRIU checkpoint image and restore once - testMXBeanUpTime">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -Xtrace:print={j9jcl.533,j9vm.684-696,j9vm.699,j9vm.717-743} $STD_CMD_OPTS$" -Xdump:java+system+jit:events=throw+systhrow,filter=org/eclipse/openj9/criu/JVMCheckpointException $MAINCLASS_TIMECHANGE$ testMXBeanUpTime 1 false false</command>
<output type="success" caseSensitive="no" regex="no">Killed</output>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@ public static void main(String args[]) throws InterruptedException {
case "testGetLastRestoreTime":
tct.testGetLastRestoreTime();
break;
case "testGetProcessRestoreStartTime":
tct.testGetProcessRestoreStartTime();
break;
case "testMXBeanUpTime":
tct.testMXBeanUpTime();
break;
Expand Down Expand Up @@ -267,33 +264,6 @@ private void testGetLastRestoreTime() {
}
}

private void testGetProcessRestoreStartTime() {
long processRestoreStartTime = InternalCRIUSupport.getProcessRestoreStartTime();
if (processRestoreStartTime != -1) {
System.out.println("FAILED: InternalCRIUSupport.getProcessRestoreStartTime() - " + processRestoreStartTime
+ " is not -1 before restore");
}
CRIUSupport criu = CRIUTestUtils.prepareCheckPointJVM(CRIUTestUtils.imagePath);
long beforeCheckpointTime = TimeUtilities.getCurrentTimeInNanoseconds();
CRIUTestUtils.checkPointJVMNoSetup(criu, CRIUTestUtils.imagePath, false);
processRestoreStartTime = InternalCRIUSupport.getProcessRestoreStartTime();
long lastRestoreTime = InternalCRIUSupport.getLastRestoreTime();
long afterRestoreTime = TimeUtilities.getCurrentTimeInNanoseconds();
if (beforeCheckpointTime >= processRestoreStartTime) {
System.out.println("FAILED: InternalCRIUSupport.getProcessRestoreStartTime() - " + processRestoreStartTime
+ " is less than beforeCheckpointTime - " + beforeCheckpointTime);
} else if (processRestoreStartTime >= lastRestoreTime) {
System.out.println("FAILED: InternalCRIUSupport.getProcessRestoreStartTime() - " + processRestoreStartTime
+ " is more than InternalCRIUSupport.getLastRestoreTime() - " + lastRestoreTime);
} else if (processRestoreStartTime >= afterRestoreTime) {
System.out.println("FAILED: InternalCRIUSupport.getProcessRestoreStartTime() - " + processRestoreStartTime
+ " is more than afterRestoreTime - " + afterRestoreTime);
} else {
System.out.println("PASSED: InternalCRIUSupport.getProcessRestoreStartTime() - " + processRestoreStartTime
+ " is between beforeCheckpointTime - " + beforeCheckpointTime + " and afterRestoreTime - " + afterRestoreTime);
}
}

private void testMXBeanUpTime() {
CRIUSupport criu = CRIUTestUtils.prepareCheckPointJVM(CRIUTestUtils.imagePath);
RuntimeMXBean mxb = ManagementFactory.getRuntimeMXBean();
Expand Down

0 comments on commit ada79db

Please sign in to comment.