-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissing.R
executable file
·62 lines (50 loc) · 1.99 KB
/
missing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(polars)
missing_sites <- function(bed, ref, blacklist) {
if (!is.na(blacklist)) {
blacklist <- (
pl$scan_parquet(blacklist, parallel="row_groups")
$with_columns(
pl$col("chr")$cast(pl$dtypes$String),
pl$col("start")$cast(pl$dtypes$UInt64)
)
$select(list("chr", "start"))
)
ref <- (
pl$scan_parquet(ref, parallel="row_groups")
$with_columns(
pl$col("chr")$cast(pl$dtypes$String),
pl$col("start")$cast(pl$dtypes$UInt64),
pl$col("end")$cast(pl$dtypes$UInt64)
)
$select(list("chr", "start", "end"))
)$filter(~pl$col("chr")$is_in(c("Y", "X")))$join(blacklist, how = "anti", on = c("chr", "start"))
} else {
ref <- (
pl$scan_parquet(ref, parallel="row_groups")
$with_columns(
pl$col("chr")$cast(pl$dtypes$String),
pl$col("start")$cast(pl$dtypes$UInt64),
pl$col("end")$cast(pl$dtypes$UInt64)
)
$select(list("chr", "start", "end"))
)
}
missing <- ref$join(bed, on=c("chr", "start"), how="left")
missing_mat <- (
missing$with_columns(
pl$when(pl$col("avg")$is_not_null())$then(pl$col("start"))$alias("b_start"),
pl$when(pl$col("avg")$is_not_null())$then(pl$col("start"))$alias("f_start"),
pl$when(pl$col("avg")$is_not_null())$then(pl$col("avg"))$alias("b_meth"),
pl$when(pl$col("avg")$is_not_null())$then(pl$col("avg"))$alias("f_meth")
)
$with_columns(
pl$col(c("f_start", "f_meth"))$backward_fill()$over("chr"),
pl$col(c("b_start", "b_meth"))$forward_fill()$over("chr")
)
$with_columns(
(pl$col("start") - pl$col("b_start"))$alias("b_dist"),
(pl$col("f_start") - pl$col("start"))$alias("f_dist")
)
)
return(missing_mat)
}