Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Databricks][Cgroups] Add cgroups stats #19

Open
wants to merge 1 commit into
base: fbenkstein-db/extended-cgroups-support
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.
package com.google.devtools.build.lib.profiler;

import static java.util.Map.entry;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

Expand Down Expand Up @@ -714,6 +715,14 @@ public void logEventAtTime(long atTimeNanos, ProfilerTask type, String descripti
logTask(atTimeNanos, 0, type, description);
}

/** Log arbitrary data. */
public void logData(TraceData data) {
JsonTraceFileWriter writer = writerRef.get();
if (writer != null) {
writer.enqueue(data);
}
}

/** Used to log "events" - tasks with zero duration. */
@VisibleForTesting
void logEvent(ProfilerTask type, String description) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,7 @@ public void verifyPostCondition(
}
Optional<VirtualCGroup> cgroup = cgroups.remove(context.getId());
if (cgroup != null && cgroup.isPresent()) {
cgroup.get().logStats();
// We cannot leave the cgroups around and delete them only when we delete the sandboxes
// because linux has a hard limit of 65535 memory controllers.
// Ref. https://github.com/torvalds/linux/blob/58d4e450a490d5f02183f6834c12550ba26d3b47/include/linux/memcontrol.h#L69
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ java_library(
deps = [
"//src/main/java/com/google/devtools/build/lib/actions:exec_exception",
"//src/main/java/com/google/devtools/build/lib/events",
"//src/main/java/com/google/devtools/build/lib/profiler",
"//src/main/protobuf:failure_details_java_proto",
"//third_party:auto_value",
"//third_party:flogger",
"//third_party:guava",
"//third_party:gson",
"//third_party:jsr305",
],
)
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
import java.lang.reflect.InvocationHandler;
import java.lang.reflect.Method;
import java.lang.reflect.Proxy;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.stream.Collectors;

public interface Controller {
default boolean isLegacy() throws IOException {
Expand All @@ -33,12 +36,27 @@ protected FailureDetails.FailureDetail getFailureDetail(String message) {

Path getPath() throws IOException;

Path statFile() throws IOException;

default String getStats() throws IOException {
if (statFile() != null && statFile().toFile().exists()) {
return Files.readString(statFile());
}
return "";
}

interface Memory extends Controller {
void setMaxBytes(long bytes) throws IOException;
long getMaxBytes() throws IOException;
long oomKills() throws IOException;
long maxUsage() throws IOException;
}
interface Cpu extends Controller {
void setCpus(float cpus) throws IOException;
int getCpus() throws IOException;
int getPeriod() throws IOException;
}
interface CpuAcct extends Controller {
long getUsage() throws IOException;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.google.devtools.build.lib.sandbox.cgroups;

import com.google.gson.stream.JsonWriter;
import com.google.auto.value.AutoValue;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
Expand All @@ -8,24 +9,32 @@
import com.google.common.io.Files;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.EventHandler;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.ProfilerTask;
import com.google.devtools.build.lib.profiler.TraceData;
import com.google.devtools.build.lib.sandbox.cgroups.v1.LegacyCpu;
import com.google.devtools.build.lib.sandbox.cgroups.v1.LegacyCpuAcct;
import com.google.devtools.build.lib.sandbox.cgroups.v1.LegacyMemory;
import com.google.devtools.build.lib.sandbox.cgroups.v2.UnifiedCpu;
import com.google.devtools.build.lib.sandbox.cgroups.v2.UnifiedMemory;

import javax.annotation.Nullable;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Scanner;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;


/**
Expand All @@ -44,6 +53,9 @@ public abstract class VirtualCGroup {

public abstract Controller.Cpu cpu();
public abstract Controller.Memory memory();
@Nullable
public abstract Controller.CpuAcct cpuacct();

public abstract ImmutableSet<Path> paths();

private final Queue<VirtualCGroup> children = new ConcurrentLinkedQueue<>();
Expand Down Expand Up @@ -100,6 +112,7 @@ static VirtualCGroup create(File procMounts, File procCgroup, EventHandler repor

Controller.Memory memory = null;
Controller.Cpu cpu = null;
Controller.CpuAcct cpuacct = null;
ImmutableSet.Builder<Path> paths = ImmutableSet.builder();

for (Mount m: mounts) {
Expand Down Expand Up @@ -172,14 +185,19 @@ static VirtualCGroup create(File procMounts, File procCgroup, EventHandler repor
logger.atInfo().log("Found cgroup v1 cpu controller at %s", cgroup);
cpu = new LegacyCpu(cgroup);
break;
case "cpuacct":
if (cpuacct != null) continue;
logger.atInfo().log("Found cgroup v1 cpuacct controller at %s", cgroup);
cpuacct = new LegacyCpuAcct(cgroup);
break;
}
}
}
}

cpu = cpu != null ? cpu : Controller.getDefault(Controller.Cpu.class);
memory = memory != null ? memory : Controller.getDefault(Controller.Memory.class);
VirtualCGroup vcgroup = new AutoValue_VirtualCGroup(cpu, memory, paths.build());
VirtualCGroup vcgroup = new AutoValue_VirtualCGroup(cpu, memory, cpuacct, paths.build());
Runtime.getRuntime().addShutdownHook(new Thread(() -> vcgroup.delete()));
return vcgroup;
}
Expand All @@ -192,6 +210,7 @@ public void delete() {
public VirtualCGroup child(String name) throws IOException {
Controller.Cpu cpu = Controller.getDefault(Controller.Cpu.class);
Controller.Memory memory = Controller.getDefault(Controller.Memory.class);
Controller.CpuAcct cpuacct = null;
ImmutableSet.Builder<Path> paths = ImmutableSet.builder();
if (memory() != null && memory().getPath() != null) {
copyControllersToSubtree(memory().getPath());
Expand All @@ -207,8 +226,83 @@ public VirtualCGroup child(String name) throws IOException {
cpu = cpu().isLegacy() ? new LegacyCpu(cgroup) : new UnifiedCpu(cgroup);
paths.add(cgroup);
}
VirtualCGroup child = new AutoValue_VirtualCGroup(cpu, memory, paths.build());
if (cpuacct() != null && cpuacct().getPath() != null) {
Path cgroup = cpuacct().getPath().resolve(name);
cgroup.toFile().mkdirs();
cpuacct = new LegacyCpuAcct(cgroup);
paths.add(cgroup);
}
VirtualCGroup child = new AutoValue_VirtualCGroup(cpu, memory, cpuacct, paths.build());
this.children.add(child);
return child;
}

final class StatsData implements TraceData {
@Override
public void writeTraceData(JsonWriter jsonWriter, long profileStartTimeNanos) throws IOException {
long timestamp = TimeUnit.NANOSECONDS.toMicros(System.nanoTime() - profileStartTimeNanos);
if (cpu() != null || cpuacct() != null) {
var stats = new LinkedHashMap<String, String>();
if (cpu() != null) {
try (BufferedReader reader = new BufferedReader(new StringReader(cpu().getStats()))) {
String line;
while ((line = reader.readLine()) != null) {
String[] parts = line.split(" ", 2);
stats.put(parts[0], parts[1]);
}
}
stats.put("quota", String.valueOf(cpu().getCpus()));
stats.put("period", String.valueOf(cpu().getPeriod()));
}
if (cpuacct() != null) {
try (BufferedReader reader = new BufferedReader(new StringReader(cpuacct().getStats()))) {
String line;
while ((line = reader.readLine()) != null) {
String[] parts = line.split(" ", 2);
Double value = Long.parseLong(parts[1]) * 1e6 / LegacyCpuAcct.USER_HZ;
stats.put(parts[0] + "_usec", String.valueOf(value.longValue()));
}
}
stats.put("usage_usec", String.valueOf(cpuacct().getUsage() / 1000));
}

writeStats(jsonWriter, timestamp, "CPU stats (Sandbox)", stats);
}
if (memory() != null) {
var stats = new LinkedHashMap<String, String>();
Long kills = memory().oomKills();
Long limit = memory().getMaxBytes();
Long usage = memory().maxUsage();
if (usage > 0) stats.put("max_usage_in_bytes", String.valueOf(usage));
if (limit > 0) stats.put("limit_in_bytes", String.valueOf(limit));
if (kills > 0) stats.put("oom_kills", String.valueOf(kills));
writeStats(jsonWriter, timestamp, "Memory stats (Sandbox)", stats);
}
}

void writeStats(JsonWriter writer, long timestamp, String name, Map<String, String> stats) throws IOException {
var currentThread = Thread.currentThread();
var threadId = currentThread.threadId();
writer.setIndent(" ");
writer.beginObject();
writer.setIndent("");
writer.name("cat").value("sandbox info");
writer.name("name").value(name);
writer.name("args");
writer.beginObject();
for (var entry : stats.entrySet()) {
writer.name(entry.getKey()).value(entry.getValue());
}
writer.endObject();
writer.name("ph").value("i");
writer.name("ts").value(timestamp);
writer.name("pid").value(1);
writer.name("tid").value(threadId);
writer.endObject();
}
}

public void logStats() throws IOException {
Profiler.instance().logData(new StatsData());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.stream.Collectors;

public class LegacyCpu implements Controller.Cpu {
private final Path path;
Expand All @@ -20,6 +22,11 @@ public Path getPath() {
return path;
}

@Override
public Path statFile() throws IOException {
return path.resolve("cpu.stat");
}

@Override
public void setCpus(float cpus) throws IOException {
int quota = Math.round(cpus * period);
Expand All @@ -30,4 +37,8 @@ public void setCpus(float cpus) throws IOException {
public int getCpus() throws IOException {
return Integer.parseInt(Files.readString(path.resolve("cpu.cfs_quota_us")).trim());
}

public int getPeriod() throws IOException {
return this.period;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package com.google.devtools.build.lib.sandbox.cgroups.v1;

import com.google.devtools.build.lib.sandbox.cgroups.Controller;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class LegacyCpuAcct implements Controller.CpuAcct {
private final Path path;
// TODO get this value from the system
public static long USER_HZ = 100;

public LegacyCpuAcct(Path path) {
this.path = path;
}

@Override
public Path getPath() throws IOException {
return path;
}

@Override
public Path statFile() throws IOException {
return path.resolve("cpuacct.stat");
}

public long getUsage() throws IOException {
return Long.parseLong(Files.readString(path.resolve("cpuacct.usage")).trim());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LegacyMemory implements Controller.Memory {
private final Path path;
Expand All @@ -14,6 +17,11 @@ public Path getPath() {
return path;
}

@Override
public Path statFile() throws IOException {
return path.resolve("memory.stat");
}

public LegacyMemory(Path path) {
this.path = path;
}
Expand All @@ -27,4 +35,19 @@ public void setMaxBytes(long bytes) throws IOException {
public long getMaxBytes() throws IOException {
return Long.parseLong(Files.readString(path.resolve("memory.limit_in_bytes")).trim());
}

@Override
public long oomKills() throws IOException {
for (String line: Files.readAllLines(getPath().resolve("memory.oom_control"))) {
if (line.startsWith("oom_kill ")) {
return Long.parseLong(line.substring(line.indexOf(" ") + 1));
}
}
return -1;
}

@Override
public long maxUsage() throws IOException {
return Long.parseLong(Files.readString(path.resolve("memory.max_usage_in_bytes")).trim());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,24 @@

public class UnifiedCpu implements Controller.Cpu {
private final Path path;
public UnifiedCpu(Path path) {
private final int period;
public UnifiedCpu(Path path) throws IOException {
this.path = path;
this.period = Integer.parseInt(Files.readString(path.resolve("cpu.max")).split(" ", 2)[1]);
}

@Override
public Path getPath() {
return path;
}

@Override
public Path statFile() throws IOException {
return path.resolve("cpu.stat");
}

@Override
public void setCpus(float cpus) throws IOException {
int period = 1000_000;
int quota = Math.round(period * cpus);
String limit = String.format("%d %d", quota, period);
Files.writeString(path.resolve("cpu.max"), limit);
Expand All @@ -29,4 +35,8 @@ public void setCpus(float cpus) throws IOException {
public int getCpus() throws IOException {
return Integer.parseInt(Files.readString(path.resolve("cpu.max")).trim());
}

public int getPeriod() {
return period;
}
}
Loading