forked from PIM-Data-Science/prescient-coding-challenge-2024
-
Notifications
You must be signed in to change notification settings - Fork 0
/
solution.R
160 lines (115 loc) · 4.88 KB
/
solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Load necessary libraries
library(dplyr)
library(lubridate)
library(ggplot2)
library(randomForest)
library(tidyr)
library(pacman)
# pacman::p_load(randomForest)
print('---> R Script Start')
# Define parameters
start_train <- as.Date("2017-01-01")
end_train <- as.Date("2023-11-30")
start_test <- as.Date("2024-01-01")
end_test <- as.Date("2024-06-30")
n_buys <- 10
verbose <- FALSE
print('---> initial data set up')
# Load sector data
df_sectors <- read.csv('data/data0.csv')
# Load price and financial data
df_data <- read.csv('data/data1.csv')
df_data$date <- as.Date(df_data$date)
df_x <- df_data %>% select(date, security, price, return30, ratio_pe, ratio_pcf, ratio_de, ratio_roe, ratio_roa)
df_y <- df_data %>% select(date, security, label)
list_vars1 <- c('price', 'return30', 'ratio_pe', 'ratio_pcf', 'ratio_de', 'ratio_roe', 'ratio_roa')
# Create signals DataFrame
df_signals <- data.frame(date = unique(df_x$date[df_x$date >= start_test & df_x$date <= end_test]))
df_signals <- df_signals %>% arrange(date)
# Initialize an empty list for storing accuracy results
df_signals$acc_total <- NA
df_signals$acc_current <- NA
for (i in seq_len(nrow(df_signals))) {
if (verbose) print(paste('---> doing', df_signals$date[i]))
# Training set
df_trainx <- df_x %>% filter(date < df_signals$date[i])
df_trainx <- df_trainx %>% filter(date != max(date))
df_trainy <- df_y %>% filter(date < df_signals$date[i])
df_trainy <- df_trainy %>% filter(date != max(date))
# Test set
df_testx <- df_x %>% filter(date >= df_signals$date[i])
df_testy <- df_y %>% filter(date >= df_signals$date[i])
# Scale data
for (col in list_vars1) {
scaler <- function(x) (x - min(x)) / (max(x) - min(x)) * 2 - 1
df_trainx[[col]] <- scaler(df_trainx[[col]])
df_testx[[col]] <- scaler(df_testx[[col]])
}
df_trainy <- df_trainy %>% mutate(label = as.factor(label))
# Fit the Random Forest classifier
if (i == 1) {
clf <- randomForest(x = df_trainx[list_vars1], y = df_trainy$label, ntree = 10, mtry = sqrt(length(list_vars1)), nodesize = 1000)
}
# Predictions and accuracy
pred_probs <- predict(clf, newdata = df_testx[list_vars1], type = "prob")
df_testy$signal <- pred_probs[, 2]
df_testy$pred <- ifelse(pred_probs[, 2] > 0.5, 1, 0)
df_testy$count <- 1
df_current <- df_testy %>% filter(date == df_signals$date[i])
acc_total <- mean(df_testy$label == df_testy$pred)
acc_current <- mean(df_current$label == df_current$pred)
print(paste('---> accuracy test set', round(acc_total, 2), ', accuracy current date', round(acc_current, 2)))
# Add accuracy and signal to dataframe
df_signals$acc_total[i] <- acc_total
df_signals$acc_current[i] <- acc_current
df_signals[i, df_current$security] <- df_current$signal
}
# Create buy matrix for payoff plot
df_signals$`10th` <- apply(df_signals[df_sectors$security], 1, function(x) sort(x, decreasing = TRUE)[n_buys])
df_index <- df_signals[df_sectors$security] >= df_signals$`10th`
# Set 1 for top 10 strongest signals
df_buys <- as.data.frame(matrix(0, nrow = nrow(df_signals), ncol = length(df_sectors$security)))
colnames(df_buys) <- df_sectors$security
df_buys[df_index] <- 1
# keep first 10 obs
process_row <- function(row) {
ones_indices <- which(row == 1) # Get the indices of 1's
if(length(ones_indices) > 10) {
row[ones_indices[11:length(ones_indices)]] <- 0 # Set ones after the first 10 to 0
}
return(row)
}
df_buys <- t(apply(df_buys, 1, process_row))
# add dates
library(zoo)
# df_buys <- cbind(date = as.Date(df_signals$date), df_buys)
df_buys <- cbind(date = df_signals$date, df_buys)
# df_buys[,-1] %>% rowSums()
# Plot signals
ggplot(df_signals, aes(x = date)) + geom_line(aes(y = AAPL)) + ggtitle("AAPL Signals")
image(t(df_buys[-1])) # You might need to use other visualization for the buy signals
# Create return matrix
df_returns <- read.csv('data/returns.csv')
df_returns$date <- as.Date(df_returns$date)
df_returns <- df_returns %>% filter(date >= start_test)
df_returns <- df_returns %>% pivot_wider(names_from = security, values_from = return1)
plot_payoff <- function(df_buys) {
df <- df_buys[,-1]
if (sum(rowSums(df) == 10) != nrow(df)) {
stop("---> must have exactly 10 buys each day")
}
# df_payoff <- df[, 1, drop = FALSE]
df <- df * (1 / n_buys) # equally weighted
df_payoff <- NULL
arr_ret <- as.matrix(df_returns[-1] + 1)
df_payoff$payoff <- diag(df %*% t(arr_ret-1))
df_payoff$tri <- cumprod(1+df_payoff$payoff)
df_payoff$date <- df_returns$date
df_payoff <- df_payoff %>% as_tibble()
payoff_result <- (tail(df_payoff$tri, 1) - 1) * 100
print(paste("---> payoff for these buys between period", min(df_payoff$date), "and", max(df_payoff$date), "is", round(payoff_result, 2), "%"))
return(df_payoff)
}
df_payoff <- plot_payoff(df_buys)
ggplot(df_payoff, aes(x = date, y = tri)) + geom_line() + ggtitle("Payoff Over Time")
print('---> R Script End')