Skip to content

Commit

Permalink
feat: add multi-experiment analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
gmega committed Jan 10, 2025
1 parent 84bac45 commit 0acd2e3
Show file tree
Hide file tree
Showing 15 changed files with 1,662 additions and 5 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ locally, however, using [Minikube](https://minikube.sigs.k8s.io/) (or Kind, or D

## Limits

When running experiments locally in a Linux machine, you will bump onto a number of
limitations. I have documented those here. I won't go into how to make those changes
permanent within your system as there's significant variation across distributions.
When running experiments locally on a Linux machine, you will likely need to adjust several
of the default OS limits. I won't go into how to make those changes permanent within your
system as there's significant variation across distributions.

**ARP Cache.** The default size for the ARP cache is too small. You should bump it
significantly, e.g.:
Expand All @@ -22,7 +22,7 @@ echo 8192 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh2
echo 16384 | sudo tee /proc/sys/net/ipv4/neigh/default/gc_thresh3
```

**Inotify.** Kubernetes seems to enjoy watching the filesystem, so
**inotify.** Kubernetes seems to enjoy watching the filesystem, so
you should increase inotify limits across the board:

```bash
Expand All @@ -31,7 +31,7 @@ sudo sysctl -w fs.inotify.max_queued_events=2099999999
sudo sysctl -w fs.inotify.max_user_watches=2099999999
```

**Kernel key retention service.* Kubernetes also places a large number of keys
**Kernel key retention service.** Kubernetes also places a large number of keys
within the kernel. Make sure you have enough room:

```bash
Expand Down
4 changes: 4 additions & 0 deletions analysis/final.analysis/.Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
^renv$
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$
1 change: 1 addition & 0 deletions analysis/final.analysis/.Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Runs the script at renv/activate.R (path relative to the working directory);
# presumably this switches the session to the project-local renv library.
source("renv/activate.R")
1 change: 1 addition & 0 deletions analysis/final.analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data
16 changes: 16 additions & 0 deletions analysis/final.analysis/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Package: final.analysis
Type: Package
Title: What the Package Does (Title Case)
Version: 0.1.0
Authors@R: c(
person(
"Jane", "Doe",
email = "[email protected]",
role = c("aut", "cre")
)
)
Description: More about what it does (maybe more than one line).
Continuation lines should be indented.
License: What license is it under?
Encoding: UTF-8
LazyData: true
1 change: 1 addition & 0 deletions analysis/final.analysis/NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Exports every object whose name starts with a letter; names starting with a
# dot (e.g. `.lookup_experiment_config`) are therefore not exported.
exportPattern("^[[:alpha:]]+")
106 changes: 106 additions & 0 deletions analysis/final.analysis/R/analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#' Derive seed-set and run identifiers from torrent names
#'
#' Strips the `dataset-` prefix from `torrent_name`, then parses the leading
#' run of digits as `seed_set` and the trailing run of digits as `run`. The
#' `value` column is renamed to `piece` and the `name` column is dropped.
#'
#' @param deluge_torrent_download Data frame with (at least) the columns
#'   `torrent_name`, `value` and `name`.
#' @return The input with numeric `seed_set` and `run` columns added, `value`
#'   renamed to `piece`, and `name` removed.
extract_repetitions <- function(deluge_torrent_download) {
  with_ids <- mutate(
    deluge_torrent_download,
    # Scratch column holding the name without its prefix; removed below.
    temp = str_remove(torrent_name, '^dataset-'),
    seed_set = as.numeric(str_extract(temp, '^\\d+')),
    run = as.numeric(str_extract(temp, '\\d+$'))
  )

  renamed <- rename(with_ids, piece = value)
  select(renamed, -temp, -name)
}

#' Number pieces per download and compute the completed fraction
#'
#' Rows are sorted by `timestamp`; within each `(node, seed_set, run)` group
#' they are then numbered 1, 2, ... in that order (`piece_count`), and
#' `completed` is `piece_count / n_pieces`.
#'
#' @param deluge_torrent_download Data frame with `node`, `seed_set`, `run`
#'   and `timestamp` columns.
#' @param n_pieces Divisor used to turn `piece_count` into a fraction.
#' @return The input, ordered by `timestamp` and ungrouped, with
#'   `piece_count` and `completed` columns added.
compute_pieces <- function(deluge_torrent_download, n_pieces) {
  by_arrival <- arrange(deluge_torrent_download, timestamp)

  counted <- by_arrival |>
    group_by(node, seed_set, run) |>
    mutate(piece_count = row_number()) |>
    ungroup()

  mutate(counted, completed = piece_count / n_pieces)
}

#' Check that every download has exactly `n_pieces` rows
#'
#' Counts the rows in each `(node, seed_set, run)` group and looks for groups
#' whose count differs from `n_pieces`.
#'
#' @param deluge_torrent_download Data frame with `node`, `seed_set` and `run`
#'   columns, one row per downloaded piece.
#' @param n_pieces Expected number of rows per `(node, seed_set, run)` group.
#' @return `TRUE` when no group deviates from `n_pieces`, `FALSE` otherwise.
check_incomplete_downloads <- function(deluge_torrent_download, n_pieces) {
  # Operate on the argument; the previous version read a global `downloads`
  # object and silently ignored what the caller passed in.
  incomplete_downloads <- deluge_torrent_download |>
    group_by(node, seed_set, run) |>
    count() |>
    ungroup() |>
    filter(n != n_pieces)

  nrow(incomplete_downloads) == 0
}

#' Check that every node has the expected number of runs per seed set
#'
#' Counts the distinct `run` values seen for each `(seed_set, node)` pair and
#' looks for pairs whose count differs from `repetitions`.
#'
#' @param deluge_torrent_download Data frame with `seed_set`, `node` and `run`
#'   columns.
#' @param repetitions Expected number of distinct runs per `(seed_set, node)`.
#' @return `TRUE` when no pair deviates from `repetitions`, `FALSE` otherwise.
check_mismatching_repetitions <- function(deluge_torrent_download, repetitions) {
  # Operate on the argument; the previous version read a global `downloads`
  # object and silently ignored what the caller passed in.
  mismatching_repetitions <- deluge_torrent_download |>
    select(seed_set, node, run) |>
    distinct() |>
    group_by(seed_set, node) |>
    count() |>
    ungroup() |>
    filter(n != repetitions)

  nrow(mismatching_repetitions) == 0
}

#' Attach request times to piece downloads and compute elapsed times
#'
#' Builds a table of download start times from the `leech` /
#' `RequestEventType.end` rows of `request_event`, joins it onto
#' `deluge_torrent_download` by `(node, run, seed_set)`, and derives:
#'
#' * `elapsed_download_time`: `timestamp - seed_request_time` for each row;
#' * `lookup_time`: the earliest `timestamp` of each `(node, run, seed_set)`
#'   group minus `seed_request_time`.
#'
#' Both are expressed in seconds when the timestamps are date-times.
#' NOTE(review): if the timestamps are plain numbers the differences are
#' returned as-is -- confirm the column type produced by the reader.
#'
#' @param meta Experiment metadata; reads `nodes$network_size`, `seeders`,
#'   `repetitions` and `seeder_sets`.
#' @param request_event Data frame with `request_id`, `name`, `type`,
#'   `destination` and `timestamp` columns.
#' @param deluge_torrent_download Data frame with `node`, `run`, `seed_set`
#'   and `timestamp` columns.
#' @param group_id Suffix (after a `-`) stripped from the end of `node` before
#'   joining. It is interpolated into a regular expression unescaped, so
#'   regex metacharacters in it are interpreted as such.
#' @return `deluge_torrent_download` with `seed_request_time`,
#'   `elapsed_download_time` and `lookup_time` columns added. Rows without a
#'   matching request keep `NA` in those columns.
compute_download_times <- function(meta, request_event, deluge_torrent_download, group_id) {
  n_leechers <- meta$nodes$network_size - meta$seeders

  download_start <- request_event |>
    select(-request_id) |>
    filter(name == 'leech', type == 'RequestEventType.end') |>
    mutate(
      # We didn't log those on the runner side so I have to reconstruct them.
      # This relies on the filtered rows being ordered by seed set, then run,
      # then leecher; mutate() errors if the row count does not match.
      run = rep(
        rep(seq_len(meta$repetitions) - 1, each = n_leechers),
        times = meta$seeder_sets
      ),
      seed_set = rep(
        seq_len(meta$seeder_sets) - 1,
        each = n_leechers * meta$repetitions
      )
    ) |>
    transmute(node = destination, run, seed_set, seed_request_time = timestamp)

  download_times <- deluge_torrent_download |>
    # FIXME remove this once we fix the chart
    mutate(node = sub(pattern = glue::glue('-{group_id}$'), replacement = '', x = node)) |>
    left_join(download_start, by = c('node', 'run', 'seed_set')) |>
    mutate(
      # Pin the unit: subtracting date-times yields a difftime whose unit is
      # picked automatically (secs/mins/hours/...), so a bare as.numeric()
      # could return minutes for long downloads.
      elapsed_download_time = as.numeric(
        timestamp - seed_request_time,
        units = 'secs'
      )
    ) |>
    group_by(node, run, seed_set) |>
    # Evaluated once per group, so without a fixed unit different groups
    # could end up expressed in different units.
    mutate(
      lookup_time = as.numeric(
        min(timestamp) - seed_request_time,
        units = 'secs'
      )
    ) |>
    ungroup()

  # Rows with NA times (no matching request) are not flagged by this check.
  negative_times <- download_times |>
    filter(elapsed_download_time < 0 | lookup_time < 0)

  if (nrow(negative_times) > 0) {
    stop('Calculation for download times contains negative numbers')
  }

  download_times
}

#' Check the number of nodes without a request time in each run
#'
#' Takes the rows whose `seed_request_time` is `NA`, counts the distinct
#' nodes among them for each `(seed_set, run)` pair, and looks for pairs whose
#' count differs from `seeders`. Pairs with no such rows at all do not appear
#' in the count and are therefore not flagged.
#'
#' NOTE(review): presumably the nodes lacking a request time are the seeders
#' (they never issue a leech request) -- confirm against the caller.
#'
#' @param download_times Data frame with `node`, `seed_set`, `run` and
#'   `seed_request_time` columns.
#' @param seeders Expected number of distinct nodes per `(seed_set, run)`.
#' @return `TRUE` when no `(seed_set, run)` pair deviates from `seeders`.
check_seeder_count <- function(download_times, seeders) {
  unmatched_nodes <- download_times |>
    filter(is.na(seed_request_time)) |>
    select(node, seed_set, run) |>
    distinct()

  nodes_per_run <- unmatched_nodes |>
    group_by(seed_set, run) |>
    tally()

  offending_runs <- filter(nodes_per_run, n != seeders)
  nrow(offending_runs) == 0
}

#' Summarise elapsed download time per piece count
#'
#' Drops rows whose `elapsed_download_time` is `NA`, then computes summary
#' statistics of `elapsed_download_time` for each `(piece_count, completed)`
#' pair.
#'
#' @param download_times Data frame with `piece_count`, `completed` and
#'   `elapsed_download_time` columns.
#' @return An ungrouped data frame with one row per `(piece_count, completed)`
#'   and the columns `mean`, `median`, `max`, `min`, `p90` (90th percentile)
#'   and `p10` (10th percentile).
download_time_stats <- function(download_times) {
  download_times |>
    filter(!is.na(elapsed_download_time)) |>
    group_by(piece_count, completed) |>
    summarise(
      mean = mean(elapsed_download_time),
      median = median(elapsed_download_time),
      max = max(elapsed_download_time),
      min = min(elapsed_download_time),
      # The columns are named p90/p10, so compute the 90th and 10th
      # percentiles (these used to be 0.95 and 0.05). `probs` is spelled out
      # instead of relying on partial matching of `p`.
      p90 = quantile(elapsed_download_time, probs = 0.90, names = FALSE),
      p10 = quantile(elapsed_download_time, probs = 0.10, names = FALSE),
      .groups = 'drop'
    )
}

25 changes: 25 additions & 0 deletions analysis/final.analysis/R/read-all-experiments.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#' Read every experiment folder under a base directory
#'
#' Each sub-directory of `base_path` is read with `read_single_experiment()`
#' and stored under the directory's name. The experiment with the most
#' elements is presumed to be the most complete; any experiment lacking one
#' of its element names is reported with a warning.
#'
#' @param base_path Directory containing one folder per experiment.
#' @param skip_incomplete If `TRUE` (the default), experiments with missing
#'   elements are dropped from the result; if `FALSE` they are kept.
#' @return A named list of experiments, possibly empty.
read_all_experiments <- function(base_path, skip_incomplete = TRUE) {
  roots <- list.files(base_path,
                      include.dirs = TRUE, no.. = TRUE, full.names = TRUE)
  # Experiments are folders; stray regular files would only make the reader
  # fail, so they are ignored.
  roots <- roots[dir.exists(roots)]

  experiments <- lapply(roots, read_single_experiment)
  names(experiments) <- basename(roots)

  # Nothing to validate (and no reference key set to pick) without data.
  if (length(experiments) == 0) {
    return(experiments)
  }

  # Validates that no experiment has missing data.
  key_sets <- unique(lapply(experiments, names))
  # Selects the largest keyset which is presumably the most complete.
  key_set <- key_sets[[which.max(lengths(key_sets))]]

  is_complete <- vapply(
    experiments,
    function(experiment) all(key_set %in% names(experiment)),
    logical(1)
  )

  for (experiment in experiments[!is_complete]) {
    warning(
      'Experiment ', experiment$experiment_id, ' is missing some keys',
      if (skip_incomplete) ' and will be discarded.' else '.',
      call. = FALSE
    )
  }

  # Filter by completeness explicitly: `x[!is.null(x)]` tests the list itself
  # (always non-NULL), so it never removed anything.
  if (skip_incomplete) {
    experiments[is_complete]
  } else {
    experiments
  }
}
36 changes: 36 additions & 0 deletions analysis/final.analysis/R/read-single-experiment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#' Read the config and all CSV tables of one experiment folder
#'
#' The folder's base name is used as the experiment ID. The config file found
#' by `.lookup_experiment_config()` is parsed as JSON, and each `*.csv` file
#' in the folder is read into a table tagged with an `experiment_id` column.
#'
#' @param experiment_folder Path to the experiment's folder.
#' @return A list with one element per CSV file (named after the file, with
#'   everything from the first dot onwards removed), plus `meta` (the parsed
#'   config) and `experiment_id`.
read_single_experiment <- function(experiment_folder) {
  # Structural assumption: the folder name is the experiment's ID.
  experiment_id <- basename(experiment_folder)
  print(glue::glue('Reading experiment {experiment_id}'))

  meta <- jsonlite::read_json(.lookup_experiment_config(experiment_folder))
  table_files <- list.files(path = experiment_folder, '\\.csv$')

  # Reads one CSV and stamps every row with the experiment's ID.
  read_table <- function(table_file) {
    table <- read_csv(
      file.path(experiment_folder, table_file),
      show_col_types = FALSE
    )
    mutate(table, experiment_id = !!experiment_id)
  }

  data <- lapply(table_files, read_table)
  names(data) <- gsub('(\\..*)$', '', table_files)

  data$meta <- meta
  data$experiment_id <- experiment_id
  data
}

#' Locate the experiment config file inside an experiment folder
#'
#' Lists the files in `experiment_folder` whose names match
#' `_experiment_config_log_entry.jsonl$` and requires exactly one match.
#'
#' @param experiment_folder Path to the experiment's folder.
#' @return Path to the single matching file, prefixed with
#'   `experiment_folder`. Stops with an error when there are zero or several
#'   matches.
.lookup_experiment_config <- function(experiment_folder) {
  matches <- list.files(path = experiment_folder,
                        pattern = '_experiment_config_log_entry.jsonl$')

  if (length(matches) == 1) {
    return(file.path(experiment_folder, matches))
  }

  stop(glue::glue(
    'Cannot establish the correct config file at {experiment_folder}.'))
}
4 changes: 4 additions & 0 deletions analysis/final.analysis/R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#' Remove NULL elements from a list
#'
#' @param a_list A list, possibly containing `NULL` elements.
#' @return `a_list` without its `NULL` elements; names and order of the
#'   remaining elements are preserved.
drop_nulls <- function(a_list) {
  # Test each element: `is.null(a_list)` asks whether the list itself is
  # NULL, which yields a single FALSE and therefore never dropped anything.
  is_null <- vapply(a_list, is.null, logical(1))
  a_list[!is_null]
}
23 changes: 23 additions & 0 deletions analysis/final.analysis/renv.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"R": {
"Version": "4.3.3",
"Repositories": [
{
"Name": "CRAN",
"URL": "https://cloud.r-project.org"
}
]
},
"Packages": {
"renv": {
"Package": "renv",
"Version": "1.0.11",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"utils"
],
"Hash": "47623f66b4e80b3b0587bc5d7b309888"
}
}
}
7 changes: 7 additions & 0 deletions analysis/final.analysis/renv/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
library/
local/
cellar/
lock/
python/
sandbox/
staging/
Loading

0 comments on commit 0acd2e3

Please sign in to comment.