Adds 32bit support part2

cvxgrp · Jan 1, 2025 · 11d59bc · 11d59bc
1 parent a54f000
commit 11d59bc
Showing 1 changed file with 37 additions and 13 deletions.
diff --git a/utils/sherlock_script.py b/utils/sherlock_script.py
@@ -3,6 +3,7 @@
 import numpy as np
 import time
 import os
+import pickle
 import pandas as pd
 import pgenlib as pg
 
@@ -23,7 +24,9 @@
 cache_dir = "/scratch/groups/candes/parth"
 df = pd.read_csv(os.path.join(data_dir, "phenotypes.QC.britishonly.csv"), index_col=0)
 df = df.drop('ethnicity', axis=1)
-covars_dense = df[['age', 'sex'] + [f'PC{i}' for i in range(1, 11)]]
+covars_dense = np.array(
+    df[['age', 'age_squared', 'sex'] + [f'PC{i}' for i in range(1, 11)]].to_numpy(),
+    dtype=np.float32)
 y = np.array(df['height'].to_numpy(), dtype=np.float32)
 
 chromosomes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
@@ -35,7 +38,7 @@
             ad.matrix.snp_unphased(
                 ad.io.snp_unphased(
                     os.path.join(cache_dir, f"EUR_subset_chr{chr}.snpdat"),
-                ), n_threads=32, dtype=np.float32,
+                ), n_threads=32, dtype=np.float32
             )
             for chr in chromosomes],
         axis=1,
@@ -54,16 +57,37 @@
 print(f'{X_train.shape=}')
 print(f'{X_test.shape=}')
 
-ti_solve = time.monotonic()
-state = ad.grpnet(
-    X=X_train,
-    glm=ad.glm.gaussian(y_train),
-    early_exit=False,
-    min_ratio=1e-6,
-    n_threads=32,
-)
-tf_solve = time.monotonic()
+model_cache = f'/scratch/groups/candes/parth/fit_model_{task_id}.pkl'
+
+if os.path.exists(model_cache):
+    class fake_state:
+        def __init__(self):
+            with open(model_cache, 'rb') as fd:
+                d = pickle.load(fd)
+            self.betas = d['betas']
+            self.lmda_path = d['lmda_path']
+            self.intercepts = d['intercepts']
+            self.intercept = True
+            self.X = X_train
+            self.groups = np.arange(self.X.shape[1])
+            self.alpha = 1.0
+    state = fake_state()
+    ti_solve = 0
+    tf_solve = 0
+else:
+    ti_solve = time.monotonic()
+    state = ad.grpnet(
+        X=X_train,
+        glm=ad.glm.gaussian(y_train, dtype=np.float32),
+        early_exit=False,
+        min_ratio=1e-6,
+        n_threads=32,
+        lmda_path_size=241,
+    )
+    tf_solve = time.monotonic()
 
+    with open(model_cache, 'wb') as fd:
+        pickle.dump({'betas': state.betas, 'lmda_path': state.lmda_path, 'intercepts': state.intercepts}, fd)
 
 loss = torch.nn.MSELoss()
 L = state.betas.shape[0]
@@ -76,7 +100,7 @@
     ins[i] = loss(torch.from_numpy(y_hat_train[i]), torch.from_numpy(y_train))
 
 ti_alo = time.monotonic()
-ld, alo, ts, r2 = ai.get_alo_for_sweep(y_train, state, loss, 10)
+ld, alo, ts, r2 = ai.get_alo_for_sweep_v2(y_train, state, loss, 80)
 tf_alo = time.monotonic()
 
-np.savez(sys.argv[1], lamda=ld, alo=alo, oos=oos, in_sample=ins, ts=ts, r2=r2, solve_time=tf_solve - ti_solve, alo_time=tf_alo - ti_alo)
+#np.savez(sys.argv[1], alo_lamda=ld, full_lamda=state.lmda_path, alo=alo, oos=oos, in_sample=ins, ts=ts, r2=r2, solve_time=tf_solve - ti_solve, alo_time=tf_alo - ti_alo)