forked from jvaverka/2023-09-26-parallel-computing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_simple_datascience.jl
53 lines (48 loc) · 1.06 KB
/
0_simple_datascience.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
using Plots, CSV, MultivariateStats
# Select PlotlyJS as the Plots backend used by all plotting calls below.
plotlyjs()
## Setup parameters
begin
    # Sample size: how many observations are drawn
    N = 1000
    # Scale factors for the first/second coordinate of the raw samples.
    # They multiply standard-normal draws, so they act as standard deviations
    # (not variances) along the two axes before the rotation is applied.
    σ₁, σ₂ = 0.5, 0.1
    # Angle (radians) used to build the rotation part of the transform
    θ = π / 3
end
## Create `data` that is N observations of 2 different features
begin
    # Linear map applied (on the right) to each raw sample row: an axis-aligned
    # scaling by (σ₁, σ₂) multiplied by a 2×2 rotation matrix built from θ.
    # The `let` keeps the two factors local so only `Xform` becomes a global.
    Xform = let
        scaling = [σ₁ 0; 0 σ₂]
        rotation = [cos(θ) -sin(θ); sin(θ) cos(θ)]
        scaling * rotation
    end

    # Draw N standard-normal samples as rows, transform them, then flip the
    # layout so `data` is 2×N: one row per feature, one column per observation.
    data = adjoint(randn(N, 2) * Xform)

    # Scatter the two feature rows against each other on fixed axes.
    p = let xs = data[1, :], ys = data[2, :]
        scatter(
            xs,
            ys;
            title="Scatterplot of feature 1 vs. feature 2",
            xlabel="feature 1",
            ylabel="feature 2",
            xlim=(-1.5, 1.5),
            ylim=(-1.5, 1.5),
            legend=nothing,
        )
    end
    display(p)
end
## Use PCA to discover vectors of maximal variance
# `pratio=1.0` asks PCA to keep every principal component. With the library
# default (0.99) the second component is silently dropped whenever the first
# one already explains at least 99% of the variance (e.g. when σ₂ is much
# smaller than σ₁), leaving `M.proj` with a single column. Keeping all
# components guarantees both directions are available for the overlay plot.
M = fit(PCA, data; pratio=1.0)
## Plot data with PCA vectors overlaid
begin
    # Same scatter of the raw data as before, retitled for the PCA overlay.
    p = scatter(
        data[1, :],
        data[2, :];
        xlim=(-1.5, 1.5),
        ylim=(-1.5, 1.5),
        title="Principal components",
        xlabel="feature 1",
        ylabel="feature 2",
        legend=nothing,
    )
    # Draw one segment per principal component that the fitted model retained,
    # from the data mean to mean + component direction (a column of `M.proj`).
    # Iterating over the columns that actually exist, instead of hard-coding
    # columns 1 and 2, avoids a BoundsError when fewer components are kept.
    for k in axes(M.proj, 2)
        plot!(
            p,
            [M.mean[1], M.mean[1] + M.proj[1, k]],
            [M.mean[2], M.mean[2] + M.proj[2, k]];
            linewidth=10,
        )
    end
    display(p)
end