From 3ac3e30b90e0238e820d44d66852562dbaa4d3cb Mon Sep 17 00:00:00 2001 From: Qirong Mao Date: Mon, 19 Feb 2024 14:25:32 +0000 Subject: [PATCH 1/2] Adding Stereoseq mouse embryo dataset --- data/stereoseq_mouse_embryo/sample_info.csv | 54 +++++++ .../stereoseq_mouse_embryo.py | 135 ++++++++++++++++++ .../stereoseq_mouse_embryo.yml | 8 ++ 3 files changed, 197 insertions(+) create mode 100644 data/stereoseq_mouse_embryo/sample_info.csv create mode 100644 data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py create mode 100644 data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.yml diff --git a/data/stereoseq_mouse_embryo/sample_info.csv b/data/stereoseq_mouse_embryo/sample_info.csv new file mode 100644 index 00000000..2a4c8b00 --- /dev/null +++ b/data/stereoseq_mouse_embryo/sample_info.csv @@ -0,0 +1,54 @@ +"filename","file_type","sample_name","size","download" +"E9.5_E1S1.MOSTA.h5ad","h5ad","E9.5_E1S1","442M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E1S1.MOSTA.h5ad" +"E9.5_E2S1.MOSTA.h5ad","h5ad","E9.5_E2S1","381M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S1.MOSTA.h5ad" +"E9.5_E2S2.MOSTA.h5ad","h5ad","E9.5_E2S2","292M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S2.MOSTA.h5ad" +"E9.5_E2S3.MOSTA.h5ad","h5ad","E9.5_E2S3","350M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S3.MOSTA.h5ad" +"E9.5_E2S4.MOSTA.h5ad","h5ad","E9.5_E2S4","297M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S4.MOSTA.h5ad" +"E10.5_E1S1.MOSTA.h5ad","h5ad","E10.5_E1S1","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S1.MOSTA.h5ad" +"E10.5_E1S2.MOSTA.h5ad","h5ad","E10.5_E1S2","944M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S2.MOSTA.h5ad" +"E10.5_E1S3.MOSTA.h5ad","h5ad","E10.5_E1S3","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S3.MOSTA.h5ad" 
+"E10.5_E2S1.MOSTA.h5ad","h5ad","E10.5_E2S1","382M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E2S1.MOSTA.h5ad" +"E11.5_E1S1.MOSTA.h5ad","h5ad","E11.5_E1S1","1.7G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S1.MOSTA.h5ad" +"E11.5_E1S2.MOSTA.h5ad","h5ad","E11.5_E1S2","1.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S2.MOSTA.h5ad" +"E11.5_E1S3.MOSTA.h5ad","h5ad","E11.5_E1S3","1.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S3.MOSTA.h5ad" +"E11.5_E1S4.MOSTA.h5ad","h5ad","E11.5_E1S4","1.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S4.MOSTA.h5ad" +"E12.5_E1S1.MOSTA.h5ad","h5ad","E12.5_E1S1","2.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S1.MOSTA.h5ad" +"E12.5_E1S2.MOSTA.h5ad","h5ad","E12.5_E1S2","2.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S2.MOSTA.h5ad" +"E12.5_E1S3.MOSTA.h5ad","h5ad","E12.5_E1S3","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S3.MOSTA.h5ad" +"E12.5_E1S4.MOSTA.h5ad","h5ad","E12.5_E1S4","2.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S4.MOSTA.h5ad" +"E12.5_E1S5.MOSTA.h5ad","h5ad","E12.5_E1S5","2.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S5.MOSTA.h5ad" +"E12.5_E2S1.MOSTA.h5ad","h5ad","E12.5_E2S1","1.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E2S1.MOSTA.h5ad" +"E13.5_E1S1.MOSTA.h5ad","h5ad","E13.5_E1S1","3.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S1.MOSTA.h5ad" +"E13.5_E1S2.MOSTA.h5ad","h5ad","E13.5_E1S2","4.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S2.MOSTA.h5ad" +"E13.5_E1S3.MOSTA.h5ad","h5ad","E13.5_E1S3","4.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S3.MOSTA.h5ad" 
+"E13.5_E1S4.MOSTA.h5ad","h5ad","E13.5_E1S4","3.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S4.MOSTA.h5ad" +"E14.5_E1S1.MOSTA.h5ad","h5ad","E14.5_E1S1","5.8G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S1.MOSTA.h5ad" +"E14.5_E1S2.MOSTA.h5ad","h5ad","E14.5_E1S2","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S2.MOSTA.h5ad" +"E14.5_E1S3.MOSTA.h5ad","h5ad","E14.5_E1S3","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S3.MOSTA.h5ad" +"E14.5_E1S4.MOSTA.h5ad","h5ad","E14.5_E1S4","4.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S4.MOSTA.h5ad" +"E14.5_E1S5.MOSTA.h5ad","h5ad","E14.5_E1S5","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S5.MOSTA.h5ad" +"E14.5_E2S1.MOSTA.h5ad","h5ad","E14.5_E2S1","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E2S1.MOSTA.h5ad" +"E14.5_E2S2.MOSTA.h5ad","h5ad","E14.5_E2S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E2S2.MOSTA.h5ad" +"E15.5_E1S1.MOSTA.h5ad","h5ad","E15.5_E1S1","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S1.MOSTA.h5ad" +"E15.5_E1S2.MOSTA.h5ad","h5ad","E15.5_E1S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S2.MOSTA.h5ad" +"E15.5_E1S3.MOSTA.h5ad","h5ad","E15.5_E1S3","4.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S3.MOSTA.h5ad" +"E15.5_E1S4.MOSTA.h5ad","h5ad","E15.5_E1S4","3.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S4.MOSTA.h5ad" +"E15.5_E2S1.MOSTA.h5ad","h5ad","E15.5_E2S1","3.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E2S1.MOSTA.h5ad" +"E16.5_E1S1.MOSTA.h5ad","h5ad","E16.5_E1S1","4.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S1.MOSTA.h5ad" 
+"E16.5_E1S2.MOSTA.h5ad","h5ad","E16.5_E1S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S2.MOSTA.h5ad" +"E16.5_E1S3.MOSTA.h5ad","h5ad","E16.5_E1S3","5.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S3.MOSTA.h5ad" +"E16.5_E1S4.MOSTA.h5ad","h5ad","E16.5_E1S4","4.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S4.MOSTA.h5ad" +"E16.5_E1S5.MOSTA.h5ad","h5ad","E16.5_E1S5","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S5.MOSTA.h5ad" +"E16.5_E2S10.MOSTA.h5ad","h5ad","E16.5_E2S10","3.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S10.MOSTA.h5ad" +"E16.5_E2S11.MOSTA.h5ad","h5ad","E16.5_E2S11","3.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S11.MOSTA.h5ad" +"E16.5_E2S12.MOSTA.h5ad","h5ad","E16.5_E2S12","2.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S12.MOSTA.h5ad" +"E16.5_E2S13.MOSTA.h5ad","h5ad","E16.5_E2S13","2.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S13.MOSTA.h5ad" +"E16.5_E2S1.MOSTA.h5ad","h5ad","E16.5_E2S1","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S1.MOSTA.h5ad" +"E16.5_E2S2.MOSTA.h5ad","h5ad","E16.5_E2S2","2.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S2.MOSTA.h5ad" +"E16.5_E2S3.MOSTA.h5ad","h5ad","E16.5_E2S3","2.8G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S3.MOSTA.h5ad" +"E16.5_E2S4.MOSTA.h5ad","h5ad","E16.5_E2S4","2.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S4.MOSTA.h5ad" +"E16.5_E2S5.MOSTA.h5ad","h5ad","E16.5_E2S5","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S5.MOSTA.h5ad" +"E16.5_E2S6.MOSTA.h5ad","h5ad","E16.5_E2S6","3.7G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S6.MOSTA.h5ad" 
+"E16.5_E2S7.MOSTA.h5ad","h5ad","E16.5_E2S7","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S7.MOSTA.h5ad" +"E16.5_E2S8.MOSTA.h5ad","h5ad","E16.5_E2S8","4.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S8.MOSTA.h5ad" +"E16.5_E2S9.MOSTA.h5ad","h5ad","E16.5_E2S9","5.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S9.MOSTA.h5ad" \ No newline at end of file diff --git a/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py new file mode 100644 index 00000000..bf71a237 --- /dev/null +++ b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python + +import urllib.request +from urllib.parse import urlparse +import os +import anndata +import argparse +import shutil +import pandas as pd +import scipy +import json +import tempfile + +# 6 available but only 2 contain region label and coordinates + +sample_info=pd.read_csv('sample_info.csv') + +LINKS = sample_info["download"].tolist() + + +META_DICT = {"technology":"Stereo-seq"} + +SAMPLE_COLUMNS = ["sample","n_clusters","directory"] + + +def download_links(links, temp_dir): + for link in links: + try: + response = urllib.request.urlopen(link) + # Extract filename from the URL + filename = os.path.join(temp_dir, urlparse(link).path.split("/")[-1]) + with open(filename, 'wb') as file: + file.write(response.read()) + print(f"Downloaded: {filename}") + except Exception as e: + print(f"Error downloading {link}: {e}") + +def process_adata(adata_path,output_folder,iteration,sample_df,sample_info): + folder_name = os.path.splitext(os.path.basename(adata_path))[0] + complete_path = os.path.join(output_folder,folder_name) + os.makedirs(complete_path, exist_ok=True) + adata = anndata.read_h5ad(adata_path) + + # Observations + obs = adata.obs.copy() + obs["selected"] = "true" + obs.to_csv(f"{complete_path}/observations.tsv",sep="\t",index_label="") + + # Features + 
vars = adata.var.copy() + vars["selected"] = "true" + vars.to_csv(f"{complete_path}/features.tsv",sep="\t",index_label="") + + # Coordinates + coords = pd.DataFrame(adata.obsm["spatial"],columns=["x","y"]) + coords.index = adata.obs.index + coords.to_csv(f"{complete_path}/coordinates.tsv",sep="\t",index_label="") + + # Matrix + + # Check if "count" key exists in adata.layers + if "count" in adata.layers: + matrix_to_write = adata.layers["count"] + file_path = f"{complete_path}/counts.mtx" + scipy.io.mmwrite(file_path, matrix_to_write) + elif "counts" in adata.layers: + matrix_to_write = adata.layers["counts"] + file_path = f"{complete_path}/counts.mtx" + scipy.io.mmwrite(file_path, matrix_to_write) + print(f"Matrix written to {file_path}") + else: + print("Neither 'count' nor 'counts' key found in adata.layers.") + + + # add info for sample.tsv + # Your sample_data_basis dictionary + sample_data_basis = {"sample":sample_info["sample_name"].iloc[iteration],"n_clusters": adata.obs.annotation.nunique(), "directory": folder_name} + + # Creating a DataFrame from the dictionary + sample_data = pd.DataFrame([sample_data_basis]) + + # Concatenating the new DataFrame to sample_df + sample_df.iloc[iteration] = sample_data_basis + + # Write labels.tsv + if "annotation" in adata.obs.columns: + labels = adata.obs["annotation"] + labels.to_csv(f"{complete_path}/labels.tsv",sep="\t",index_label="") + + + +def write_json(dict,output_path): + with open(output_path, 'w') as json_file: + json.dump(dict, json_file) + + +def main(): + # Set up command-line argument parser + parser = argparse.ArgumentParser(description="Convert Stereo-seq Mouse Embryo data to Spacehack format.") + + # Add arguments for output folder + parser.add_argument('-o','--out_dir', help="Output directory to write files to.",required=True) + + + # Parse the command-line arguments + args = parser.parse_args() + + # Download and process + with tempfile.TemporaryDirectory() as temp_dir: + 
download_links(LINKS,temp_dir) + os.makedirs(args.out_dir, exist_ok=True) + + + sample_df = pd.DataFrame(columns=SAMPLE_COLUMNS,index=range(len(LINKS))) + anndatas = [os.path.join(temp_dir, file) for file in os.listdir(temp_dir) if file.endswith(".h5ad")] + for iteration, adata in enumerate(anndatas): + process_adata(adata, args.out_dir,iteration,sample_df,sample_info) + + + # write json + write_json(META_DICT,f"{args.out_dir}/experiment.json") + + # write samples.tsv + sample_df.to_csv(f"{args.out_dir}/samples.tsv", sep="\t", index_label="") + + + + + +if __name__ == "__main__": + main() + + + + \ No newline at end of file diff --git a/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.yml b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.yml new file mode 100644 index 00000000..4283feab --- /dev/null +++ b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - python=3.11.6 + - scipy=1.11.4 + - anndata=0.10.3 + - numpy=1.26.2 + - pandas=2.1.3 \ No newline at end of file From 73daf970ec0ee50f2e53f162893452dfdc5dc930 Mon Sep 17 00:00:00 2001 From: Qirong Mao Date: Thu, 22 Feb 2024 09:41:18 +0000 Subject: [PATCH 2/2] Integrate sample_info.csv into code --- data/stereoseq_mouse_embryo/sample_info.csv | 54 ------------------- .../stereoseq_mouse_embryo.py | 23 ++++++-- 2 files changed, 18 insertions(+), 59 deletions(-) delete mode 100644 data/stereoseq_mouse_embryo/sample_info.csv diff --git a/data/stereoseq_mouse_embryo/sample_info.csv b/data/stereoseq_mouse_embryo/sample_info.csv deleted file mode 100644 index 2a4c8b00..00000000 --- a/data/stereoseq_mouse_embryo/sample_info.csv +++ /dev/null @@ -1,54 +0,0 @@ -"filename","file_type","sample_name","size","download" -"E9.5_E1S1.MOSTA.h5ad","h5ad","E9.5_E1S1","442M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E1S1.MOSTA.h5ad" 
-"E9.5_E2S1.MOSTA.h5ad","h5ad","E9.5_E2S1","381M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S1.MOSTA.h5ad" -"E9.5_E2S2.MOSTA.h5ad","h5ad","E9.5_E2S2","292M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S2.MOSTA.h5ad" -"E9.5_E2S3.MOSTA.h5ad","h5ad","E9.5_E2S3","350M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S3.MOSTA.h5ad" -"E9.5_E2S4.MOSTA.h5ad","h5ad","E9.5_E2S4","297M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E9.5_E2S4.MOSTA.h5ad" -"E10.5_E1S1.MOSTA.h5ad","h5ad","E10.5_E1S1","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S1.MOSTA.h5ad" -"E10.5_E1S2.MOSTA.h5ad","h5ad","E10.5_E1S2","944M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S2.MOSTA.h5ad" -"E10.5_E1S3.MOSTA.h5ad","h5ad","E10.5_E1S3","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E1S3.MOSTA.h5ad" -"E10.5_E2S1.MOSTA.h5ad","h5ad","E10.5_E2S1","382M","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E10.5_E2S1.MOSTA.h5ad" -"E11.5_E1S1.MOSTA.h5ad","h5ad","E11.5_E1S1","1.7G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S1.MOSTA.h5ad" -"E11.5_E1S2.MOSTA.h5ad","h5ad","E11.5_E1S2","1.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S2.MOSTA.h5ad" -"E11.5_E1S3.MOSTA.h5ad","h5ad","E11.5_E1S3","1.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S3.MOSTA.h5ad" -"E11.5_E1S4.MOSTA.h5ad","h5ad","E11.5_E1S4","1.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E11.5_E1S4.MOSTA.h5ad" -"E12.5_E1S1.MOSTA.h5ad","h5ad","E12.5_E1S1","2.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S1.MOSTA.h5ad" -"E12.5_E1S2.MOSTA.h5ad","h5ad","E12.5_E1S2","2.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S2.MOSTA.h5ad" 
-"E12.5_E1S3.MOSTA.h5ad","h5ad","E12.5_E1S3","1.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S3.MOSTA.h5ad" -"E12.5_E1S4.MOSTA.h5ad","h5ad","E12.5_E1S4","2.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S4.MOSTA.h5ad" -"E12.5_E1S5.MOSTA.h5ad","h5ad","E12.5_E1S5","2.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E1S5.MOSTA.h5ad" -"E12.5_E2S1.MOSTA.h5ad","h5ad","E12.5_E2S1","1.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E12.5_E2S1.MOSTA.h5ad" -"E13.5_E1S1.MOSTA.h5ad","h5ad","E13.5_E1S1","3.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S1.MOSTA.h5ad" -"E13.5_E1S2.MOSTA.h5ad","h5ad","E13.5_E1S2","4.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S2.MOSTA.h5ad" -"E13.5_E1S3.MOSTA.h5ad","h5ad","E13.5_E1S3","4.2G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S3.MOSTA.h5ad" -"E13.5_E1S4.MOSTA.h5ad","h5ad","E13.5_E1S4","3.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E13.5_E1S4.MOSTA.h5ad" -"E14.5_E1S1.MOSTA.h5ad","h5ad","E14.5_E1S1","5.8G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S1.MOSTA.h5ad" -"E14.5_E1S2.MOSTA.h5ad","h5ad","E14.5_E1S2","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S2.MOSTA.h5ad" -"E14.5_E1S3.MOSTA.h5ad","h5ad","E14.5_E1S3","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S3.MOSTA.h5ad" -"E14.5_E1S4.MOSTA.h5ad","h5ad","E14.5_E1S4","4.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S4.MOSTA.h5ad" -"E14.5_E1S5.MOSTA.h5ad","h5ad","E14.5_E1S5","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E1S5.MOSTA.h5ad" -"E14.5_E2S1.MOSTA.h5ad","h5ad","E14.5_E2S1","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E2S1.MOSTA.h5ad" 
-"E14.5_E2S2.MOSTA.h5ad","h5ad","E14.5_E2S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E14.5_E2S2.MOSTA.h5ad" -"E15.5_E1S1.MOSTA.h5ad","h5ad","E15.5_E1S1","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S1.MOSTA.h5ad" -"E15.5_E1S2.MOSTA.h5ad","h5ad","E15.5_E1S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S2.MOSTA.h5ad" -"E15.5_E1S3.MOSTA.h5ad","h5ad","E15.5_E1S3","4.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S3.MOSTA.h5ad" -"E15.5_E1S4.MOSTA.h5ad","h5ad","E15.5_E1S4","3.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E1S4.MOSTA.h5ad" -"E15.5_E2S1.MOSTA.h5ad","h5ad","E15.5_E2S1","3.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E15.5_E2S1.MOSTA.h5ad" -"E16.5_E1S1.MOSTA.h5ad","h5ad","E16.5_E1S1","4.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S1.MOSTA.h5ad" -"E16.5_E1S2.MOSTA.h5ad","h5ad","E16.5_E1S2","3.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S2.MOSTA.h5ad" -"E16.5_E1S3.MOSTA.h5ad","h5ad","E16.5_E1S3","5.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S3.MOSTA.h5ad" -"E16.5_E1S4.MOSTA.h5ad","h5ad","E16.5_E1S4","4.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S4.MOSTA.h5ad" -"E16.5_E1S5.MOSTA.h5ad","h5ad","E16.5_E1S5","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E1S5.MOSTA.h5ad" -"E16.5_E2S10.MOSTA.h5ad","h5ad","E16.5_E2S10","3.4G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S10.MOSTA.h5ad" -"E16.5_E2S11.MOSTA.h5ad","h5ad","E16.5_E2S11","3.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S11.MOSTA.h5ad" -"E16.5_E2S12.MOSTA.h5ad","h5ad","E16.5_E2S12","2.5G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S12.MOSTA.h5ad" 
-"E16.5_E2S13.MOSTA.h5ad","h5ad","E16.5_E2S13","2.6G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S13.MOSTA.h5ad" -"E16.5_E2S1.MOSTA.h5ad","h5ad","E16.5_E2S1","2.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S1.MOSTA.h5ad" -"E16.5_E2S2.MOSTA.h5ad","h5ad","E16.5_E2S2","2.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S2.MOSTA.h5ad" -"E16.5_E2S3.MOSTA.h5ad","h5ad","E16.5_E2S3","2.8G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S3.MOSTA.h5ad" -"E16.5_E2S4.MOSTA.h5ad","h5ad","E16.5_E2S4","2.9G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S4.MOSTA.h5ad" -"E16.5_E2S5.MOSTA.h5ad","h5ad","E16.5_E2S5","4.1G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S5.MOSTA.h5ad" -"E16.5_E2S6.MOSTA.h5ad","h5ad","E16.5_E2S6","3.7G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S6.MOSTA.h5ad" -"E16.5_E2S7.MOSTA.h5ad","h5ad","E16.5_E2S7","4.3G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S7.MOSTA.h5ad" -"E16.5_E2S8.MOSTA.h5ad","h5ad","E16.5_E2S8","4.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S8.MOSTA.h5ad" -"E16.5_E2S9.MOSTA.h5ad","h5ad","E16.5_E2S9","5.0G","https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/E16.5_E2S9.MOSTA.h5ad" \ No newline at end of file diff --git a/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py index bf71a237..f10ddcd1 100644 --- a/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py +++ b/data/stereoseq_mouse_embryo/stereoseq_mouse_embryo.py @@ -13,9 +13,22 @@ # 6 available but only 2 contain region label and coordinates -sample_info=pd.read_csv('sample_info.csv') +sample_name = ['E9.5_E1S1', 'E9.5_E2S1', 'E9.5_E2S2', 'E9.5_E2S3', 'E9.5_E2S4', + 'E10.5_E1S1', 'E10.5_E1S2', 'E10.5_E1S3', 'E10.5_E2S1', + 'E11.5_E1S1', 'E11.5_E1S2', 'E11.5_E1S3', 'E11.5_E1S4', + 
'E12.5_E1S1', 'E12.5_E1S2', 'E12.5_E1S3', 'E12.5_E1S4', + 'E12.5_E1S5', 'E12.5_E2S1', 'E13.5_E1S1', 'E13.5_E1S2', + 'E13.5_E1S3', 'E13.5_E1S4', 'E14.5_E1S1', 'E14.5_E1S2', + 'E14.5_E1S3', 'E14.5_E1S4', 'E14.5_E1S5', 'E14.5_E2S1', + 'E14.5_E2S2', 'E15.5_E1S1', 'E15.5_E1S2', 'E15.5_E1S3', + 'E15.5_E1S4', 'E15.5_E2S1', 'E16.5_E1S1', 'E16.5_E1S2', + 'E16.5_E1S3', 'E16.5_E1S4', 'E16.5_E1S5', 'E16.5_E2S10', + 'E16.5_E2S11', 'E16.5_E2S12', 'E16.5_E2S13', 'E16.5_E2S1', + 'E16.5_E2S2', 'E16.5_E2S3', 'E16.5_E2S4', 'E16.5_E2S5', + 'E16.5_E2S6', 'E16.5_E2S7', 'E16.5_E2S8', 'E16.5_E2S9'] + +LINKS = [f"https://ftp.cngb.org/pub/SciRAID/stomics/STDS0000058/stomics/{sample}.MOSTA.h5ad" for sample in sample_name] -LINKS = sample_info["download"].tolist() META_DICT = {"technology":"Stereo-seq"} @@ -35,7 +48,7 @@ def download_links(links, temp_dir): except Exception as e: print(f"Error downloading {link}: {e}") -def process_adata(adata_path,output_folder,iteration,sample_df,sample_info): +def process_adata(adata_path,output_folder,iteration,sample_df,sample_name): folder_name = os.path.splitext(os.path.basename(adata_path))[0] complete_path = os.path.join(output_folder,folder_name) os.makedirs(complete_path, exist_ok=True) @@ -74,7 +87,7 @@ def process_adata(adata_path,output_folder,iteration,sample_df,sample_info): # add info for sample.tsv # Your sample_data_basis dictionary - sample_data_basis = {"sample":sample_info["sample_name"].iloc[iteration],"n_clusters": adata.obs.annotation.nunique(), "directory": folder_name} + sample_data_basis = {"sample":sample_name[iteration],"n_clusters": adata.obs.annotation.nunique(), "directory": folder_name} # Creating a DataFrame from the dictionary sample_data = pd.DataFrame([sample_data_basis]) @@ -114,7 +127,7 @@ def main(): sample_df = pd.DataFrame(columns=SAMPLE_COLUMNS,index=range(len(LINKS))) anndatas = [os.path.join(temp_dir, file) for file in os.listdir(temp_dir) if file.endswith(".h5ad")] for iteration, adata in enumerate(anndatas): - 
process_adata(adata, args.out_dir,iteration,sample_df,sample_info) + process_adata(adata, args.out_dir,iteration,sample_df,sample_name) # write json