diff --git a/scripts/create_resources/combine/process_datasets_bruker_nebius.sh b/scripts/create_resources/combine/process_datasets_bruker_nebius.sh new file mode 100644 index 000000000..dbc103819 --- /dev/null +++ b/scripts/create_resources/combine/process_datasets_bruker_nebius.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +# Writes the Bruker CosMx param_list to /tmp/params_bruker.yaml and launches the process_datasets workflow with `tw launch`. +# TODO: The param_list metadata was mostly inferred with ChatGPT from the create resources scripts. +# Double-check that everything is correct. + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +input_dir="s3://openproblems-data/resources/datasets" +#publish_dir="s3://openproblems-data/resources/task_ist_preprocessing/datasets" +publish_dir='/scratch/task_ist_preprocessing/datasets' + +cat > /tmp/params_bruker.yaml << HERE +param_list: + + - id: "bruker_mouse_brain_cosmx_combined/rep1" + input_sp: "$input_dir/bruker_cosmx/bruker_mouse_brain_cosmx/rep1/dataset.zarr" + input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" + dataset_id: "bruker_mouse_brain_cosmx_combined/rep1" + dataset_name: "Mouse brain combined Bruker CosMx rep1 2023 Yao scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/cosmx-smi-mouse-brain-ffpe-dataset/" + dataset_reference: "https://doi.org/10.1038/s41586-023-06812-z" + dataset_summary: "Bruker CosMx Mouse Brain + ABCA Mouse Brain scRNAseq" + dataset_description: "Bruker CosMx Mouse Brain + ABCA Mouse Brain scRNAseq" + dataset_organism: "mus_musculus" + + - id: "bruker_human_liver_cosmx_combined" + input_sp: "$input_dir/bruker_cosmx/bruker_human_liver_cosmx/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2022Andrews_human_liver_sc/dataset.h5ad" + dataset_id: "bruker_human_liver_cosmx_combined" + dataset_name: "Human liver combined Bruker CosMx 2022 Andrews scRNAseq" + dataset_url: 
"https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-liver-rna-ffpe-dataset/" + dataset_reference: "https://doi.org/10.1002/hep4.1854" + dataset_summary: "Bruker CosMx Human Liver + 2022 Andrews scRNAseq" + dataset_description: "Bruker CosMx Human Liver + 2022 Andrews scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_liver_cancer_cosmx_combined" + input_sp: "$input_dir/bruker_cosmx/bruker_human_liver_cancer_cosmx/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc/dataset.h5ad" + dataset_id: "bruker_human_liver_cancer_cosmx_combined" + dataset_name: "Human liver cancer combined Bruker CosMx 2022 Lu scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-liver-rna-ffpe-dataset/" + dataset_reference: "https://doi.org/10.1038/s41467-022-32283-3" + dataset_summary: "Bruker CosMx Human Liver Cancer + 2022 Lu scRNAseq" + dataset_description: "Bruker CosMx Human Liver Cancer + 2022 Lu scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep1" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung5_rep1/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep1" + dataset_name: "Human lung cancer combined Bruker CosMx Lung5 rep1 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung5 Rep1 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung5 Rep1 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep2" + input_sp: 
"$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung5_rep2/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep2" + dataset_name: "Human lung cancer combined Bruker CosMx Lung5 rep2 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung5 Rep2 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung5 Rep2 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep3" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung5_rep3/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung5_rep3" + dataset_name: "Human lung cancer combined Bruker CosMx Lung5 rep3 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung5 Rep3 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung5 Rep3 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung6" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung6/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung6" + dataset_name: "Human lung cancer combined Bruker CosMx Lung6 2024 Zuani scRNAseq" + dataset_url: 
"https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung6 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung6 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung9_rep1" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung9_rep1/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung9_rep1" + dataset_name: "Human lung cancer combined Bruker CosMx Lung9 rep1 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung9 Rep1 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung9 Rep1 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung9_rep2" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung9_rep2/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung9_rep2" + dataset_name: "Human lung cancer combined Bruker CosMx Lung9 rep2 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung9 Rep2 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung9 Rep2 + 2024 Zuani scRNAseq" + dataset_organism: 
"homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung12" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung12/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung12" + dataset_name: "Human lung cancer combined Bruker CosMx Lung12 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung12 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung12 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "bruker_human_lung_cancer_cosmx_combined/lung13" + input_sp: "$input_dir/bruker_cosmx/bruker_human_lung_cancer_cosmx/lung13/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "bruker_human_lung_cancer_cosmx_combined/lung13" + dataset_name: "Human lung cancer combined Bruker CosMx Lung13 2024 Zuani scRNAseq" + dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/nsclc-ffpe-dataset/" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Bruker CosMx Human Lung Cancer Lung13 + 2024 Zuani scRNAseq" + dataset_description: "Bruker CosMx Human Lung Cancer Lung13 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + +output_sc: "\$id/output_sc.h5ad" +output_sp: "\$id/output_sp.zarr" +output_state: "\$id/state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 167877437119966 \ + --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \ + 
--params-file /tmp/params_bruker.yaml \ + --config src/base/labels_nebius.config \ + --labels "task_ist_preprocessing,process_datasets,bruker" \ No newline at end of file diff --git a/scripts/create_resources/combine/process_datasets_vizgen_nebius.sh b/scripts/create_resources/combine/process_datasets_vizgen_nebius.sh new file mode 100644 index 000000000..c6eb4ad78 --- /dev/null +++ b/scripts/create_resources/combine/process_datasets_vizgen_nebius.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# TODO: The param_list metadata was mostly inferred with ChatGPT from the create resources scripts. +# Double-check that everything is correct. + + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +input_dir="s3://openproblems-data/resources/datasets" +#publish_dir="s3://openproblems-data/resources/task_ist_preprocessing/datasets" +publish_dir='/scratch/task_ist_preprocessing/datasets' + +cat > /tmp/params_vizgen.yaml << HERE +param_list: + + - id: "2022_vizgen_human_breast_cancer_merfish_combined/rep1" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1/dataset.zarr" + input_sc: "$input_dir/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_breast_cancer_merfish_combined/rep1" + dataset_name: "Human breast cancer combined 2022 Vizgen MERFISH rep1 2021 Wu scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://doi.org/10.1038/s41588-021-00911-1" + dataset_summary: "Vizgen Human Breast Cancer MERFISH Patient1 + 2021 Wu scRNAseq" + dataset_description: "Vizgen Human Breast Cancer MERFISH Patient1 + 2021 Wu scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_liver_cancer_merfish_combined/rep1" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep1/dataset.zarr" + input_sc: 
"$input_dir/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_liver_cancer_merfish_combined/rep1" + dataset_name: "Human liver cancer combined 2022 Vizgen MERFISH rep1 2022 Lu scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://doi.org/10.1038/s41467-022-32283-3" + dataset_summary: "Vizgen Human Liver Cancer MERFISH Patient1 + 2022 Lu scRNAseq" + dataset_description: "Vizgen Human Liver Cancer MERFISH Patient1 + 2022 Lu scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_liver_cancer_merfish_combined/rep2" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep2/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_liver_cancer_merfish_combined/rep2" + dataset_name: "Human liver cancer combined 2022 Vizgen MERFISH rep2 2022 Lu scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://doi.org/10.1038/s41467-022-32283-3" + dataset_summary: "Vizgen Human Liver Cancer MERFISH Patient2 + 2022 Lu scRNAseq" + dataset_description: "Vizgen Human Liver Cancer MERFISH Patient2 + 2022 Lu scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_lung_cancer_merfish_combined/rep1" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep1/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_lung_cancer_merfish_combined/rep1" + dataset_name: "Human lung cancer combined 2022 Vizgen MERFISH rep1 2024 Zuani scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Vizgen Human Lung Cancer MERFISH Patient1 + 2024 Zuani scRNAseq" + dataset_description: "Vizgen Human Lung Cancer MERFISH Patient1 + 2024 Zuani 
scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_lung_cancer_merfish_combined/rep2" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep2/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_lung_cancer_merfish_combined/rep2" + dataset_name: "Human lung cancer combined 2022 Vizgen MERFISH rep2 2024 Zuani scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Vizgen Human Lung Cancer MERFISH Patient2 + 2024 Zuani scRNAseq" + dataset_description: "Vizgen Human Lung Cancer MERFISH Patient2 + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_colon_cancer_merfish_combined/rep1" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_colon_cancer_merfish/rep1/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2020Lee_human_colon_cancer_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_colon_cancer_merfish_combined/rep1" + dataset_name: "Human colon cancer combined 2022 Vizgen MERFISH rep1 2020 Lee scRNAseq" + dataset_url: "https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://doi.org/10.1038/s41588-020-0636-z" + dataset_summary: "2022 Vizgen Human Colon Cancer MERFISH Patient1 + 2020 Lee scRNAseq" + dataset_description: "2022 Vizgen Human Colon Cancer MERFISH Patient1 + 2020 Lee scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2022_vizgen_human_colon_cancer_merfish_combined/rep2" + input_sp: "$input_dir/vizgen_merscope/2022_vizgen_human_colon_cancer_merfish/rep2/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2020Lee_human_colon_cancer_sc/dataset.h5ad" + dataset_id: "2022_vizgen_human_colon_cancer_merfish_combined/rep2" + dataset_name: "Human colon cancer combined 2022 Vizgen MERFISH rep2 2020 Lee scRNAseq" + dataset_url: 
"https://info.vizgen.com/ffpe-showcase" + dataset_reference: "https://doi.org/10.1038/s41588-020-0636-z" + dataset_summary: "2022 Vizgen Human Colon Cancer MERFISH Patient2 + 2020 Lee scRNAseq" + dataset_description: "2022 Vizgen Human Colon Cancer MERFISH Patient2 + 2020 Lee scRNAseq" + dataset_organism: "homo_sapiens" + +output_sc: "\$id/output_sc.h5ad" +output_sp: "\$id/output_sp.zarr" +output_state: "\$id/state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 167877437119966 \ + --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \ + --params-file /tmp/params_vizgen.yaml \ + --config src/base/labels_nebius.config \ + --labels "task_ist_preprocessing,process_datasets,vizgen" \ No newline at end of file diff --git a/scripts/create_resources/combine/process_datasets_xenium_nebius.sh b/scripts/create_resources/combine/process_datasets_xenium_nebius.sh new file mode 100644 index 000000000..43dad7f5e --- /dev/null +++ b/scripts/create_resources/combine/process_datasets_xenium_nebius.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +# TODO: The param_list metadata was mostly inferred with ChatGPT from the create resources scripts. +# Double-check that everything is correct. 
+ + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +input_dir="s3://openproblems-data/resources/datasets" +#publish_dir="s3://openproblems-data/resources/task_ist_preprocessing/datasets" +publish_dir='/scratch/task_ist_preprocessing/datasets' + +cat > /tmp/params_xenium.yaml << HERE +param_list: + + - id: "2023_10x_mouse_brain_xenium_combined/rep1" + input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep1/dataset.zarr" + input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" + dataset_id: "2023_10x_mouse_brain_xenium_combined/rep1" + dataset_name: "Mouse brain combined 2023 10x Xenium rep1 2023 Yao scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_summary: "Xenium V1 Fresh Frozen Mouse Brain rep1 + ABCA Mouse Brain scRNAseq" + dataset_description: "Xenium V1 Fresh Frozen Mouse Brain rep1 + ABCA Mouse Brain scRNAseq" + dataset_organism: "mus_musculus" + + - id: "2023_10x_mouse_brain_xenium_combined/rep2" + input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep2/dataset.zarr" + input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" + dataset_id: "2023_10x_mouse_brain_xenium_combined/rep2" + dataset_name: "Mouse brain combined 2023 10x Xenium rep2 2023 Yao scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_summary: "Xenium V1 Fresh Frozen Mouse Brain rep2 + ABCA Mouse Brain scRNAseq" + dataset_description: "Xenium V1 Fresh Frozen Mouse Brain rep2 + ABCA Mouse Brain scRNAseq" + 
dataset_organism: "mus_musculus" + + - id: "2023_10x_mouse_brain_xenium_combined/rep3" + input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep3/dataset.zarr" + input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad" + dataset_id: "2023_10x_mouse_brain_xenium_combined/rep3" + dataset_name: "Mouse brain combined 2023 10x Xenium rep3 2023 Yao scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" + dataset_summary: "Xenium V1 Fresh Frozen Mouse Brain rep3 + ABCA Mouse Brain scRNAseq" + dataset_description: "Xenium V1 Fresh Frozen Mouse Brain rep3 + ABCA Mouse Brain scRNAseq" + dataset_organism: "mus_musculus" + + - id: "2023_10x_human_lung_xenium_combined" + input_sp: "$input_dir/10x_xenium/2023_10x_human_lung_xenium/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2020Travaglini_human_lung_sc/dataset.h5ad" + dataset_id: "2023_10x_human_lung_xenium_combined" + dataset_name: "Human lung combined 2023 10x Xenium 2020 Travaglini scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/xenium-human-lung-preview-data-1-standard" + dataset_reference: "https://doi.org/10.1038/s41586-020-2922-4" + dataset_summary: "Xenium Preview Human Non diseased Lung FFPE + 2020 Travaglini scRNAseq" + dataset_description: "Xenium Preview Human Non diseased Lung FFPE + 2020 Travaglini scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2023_10x_human_lung_cancer_xenium_combined" + input_sp: "$input_dir/10x_xenium/2023_10x_human_lung_cancer_xenium/dataset.zarr" + input_sc: "$input_dir/zuani_human_nsclc_sc/2024Zuani_human_nsclc_sc/dataset.h5ad" + dataset_id: "2023_10x_human_lung_cancer_xenium_combined" + dataset_name: "Human lung cancer combined 2023 10x Xenium 2024 Zuani scRNAseq" + dataset_url: 
"https://www.10xgenomics.com/datasets/xenium-human-lung-preview-data-1-standard" + dataset_reference: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" + dataset_summary: "Xenium Preview Human Lung Cancer FFPE + 2024 Zuani scRNAseq" + dataset_description: "Xenium Preview Human Lung Cancer FFPE + 2024 Zuani scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2024_10x_human_skin_xenium_combined" + input_sp: "$input_dir/10x_xenium/2024_10x_human_skin_xenium/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2024Ganier_human_skin_sc/dataset.h5ad" + dataset_id: "2024_10x_human_skin_xenium_combined" + dataset_name: "Human skin combined 2024 10x Xenium 2024 Ganier scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/human-skin-data-xenium-human-multi-tissue-and-cancer-panel-1-standard" + dataset_reference: "https://doi.org/10.1073/pnas.2313326120" + dataset_summary: "Xenium V1 hSkin nondiseased FFPE + 2024 Ganier scRNAseq" + dataset_description: "Xenium V1 hSkin nondiseased FFPE + 2024 Ganier scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2024_10x_human_liver_xenium_combined" + input_sp: "$input_dir/10x_xenium/2024_10x_human_liver_xenium/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2022Andrews_human_liver_sc/dataset.h5ad" + dataset_id: "2024_10x_human_liver_xenium_combined" + dataset_name: "Human liver combined 2024 10x Xenium 2022 Andrews scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/human-liver-data-xenium-human-multi-tissue-and-cancer-panel-1-standard" + dataset_reference: "https://doi.org/10.1002/hep4.1854" + dataset_summary: "Xenium V1 hLiver FFPE + 2022 Andrews scRNAseq" + dataset_description: "Xenium V1 hLiver FFPE + 2022 Andrews scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2024_10x_human_liver_cancer_xenium_combined" + input_sp: "$input_dir/10x_xenium/2024_10x_human_liver_cancer_xenium/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2022Lu_human_liver_cancer_sc/dataset.h5ad" 
+ dataset_id: "2024_10x_human_liver_cancer_xenium_combined" + dataset_name: "Human liver cancer combined 2024 10x Xenium 2022 Lu scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/human-liver-data-xenium-human-multi-tissue-and-cancer-panel-1-standard" + dataset_reference: "https://doi.org/10.1038/s41467-022-32283-3" + dataset_summary: "Xenium V1 hLiver cancer FFPE + 2022 Lu scRNAseq" + dataset_description: "Xenium V1 hLiver cancer FFPE + 2022 Lu scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2023_10x_human_colon_cancer_xenium_combined" + input_sp: "$input_dir/10x_xenium/2023_10x_human_colon_cancer_xenium/dataset.zarr" + input_sc: "$input_dir/scrnaseq_for_ist/2020Lee_human_colon_cancer_sc/dataset.h5ad" + dataset_id: "2023_10x_human_colon_cancer_xenium_combined" + dataset_name: "Human colon cancer combined 2023 10x Xenium 2020 Lee scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/human-colon-preview-data-xenium-human-colon-gene-expression-panel-1-standard" + dataset_reference: "https://doi.org/10.1038/s41588-020-0636-z" + dataset_summary: "Xenium V1 hColon Cancer FFPE + 2020 Lee scRNAseq" + dataset_description: "Xenium V1 hColon Cancer FFPE + 2020 Lee scRNAseq" + dataset_organism: "homo_sapiens" + + - id: "2023_10x_human_breast_cancer_xenium_combined" + input_sp: "$input_dir/10x_xenium/2023_10x_human_breast_cancer_xenium/dataset.zarr" + input_sc: "$input_dir/wu_human_breast_cancer_sc/2021Wu_human_breast_cancer_sc/dataset.h5ad" + dataset_id: "2023_10x_human_breast_cancer_xenium_combined" + dataset_name: "Human breast cancer combined 2023 10x Xenium 2021 Wu scRNAseq" + dataset_url: "https://www.10xgenomics.com/datasets/xenium-ffpe-human-breast-with-custom-add-on-panel-1-standard" + dataset_reference: "https://doi.org/10.1038/s41588-021-00911-1" + dataset_summary: "Xenium V1 FFPE Human Breast IDC + 2021 Wu scRNAseq" + dataset_description: "Xenium V1 FFPE Human Breast IDC + 2021 Wu scRNAseq" + dataset_organism: "homo_sapiens" + 
+output_sc: "\$id/output_sc.h5ad" +output_sp: "\$id/output_sp.zarr" +output_state: "\$id/state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 167877437119966 \ + --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \ + --params-file /tmp/params_xenium.yaml \ + --config src/base/labels_nebius.config \ + --labels "task_ist_preprocessing,process_datasets,xenium" \ No newline at end of file diff --git a/scripts/create_resources/spatial/process_10x_atera_nebius.sh b/scripts/create_resources/spatial/process_10x_atera_nebius.sh new file mode 100644 index 000000000..baf016a08 --- /dev/null +++ b/scripts/create_resources/spatial/process_10x_atera_nebius.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +publish_dir="s3://openproblems-data/resources/datasets" + +cat > /tmp/params_atera.yaml << HERE +param_list: + + - id: "10x_atera/2026_10x_human_breast_cancer_atera" + input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip + dataset_name: "Atera WTA FFPE Human Breast Cancer" + dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer" + dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells." + dataset_description: "This human FFPE breast cancer data showcases results using the pre-commercial version of the Atera Whole Transcriptome Assay (WTA), which is currently under development. 
The assay is designed to closely match the Chromium Flex Apex assay in terms of content and sensitivity, and includes 18,028 genes. A single 5µm FFPE section of breast cancer tissue (DCIS Grade 3, T1c N0 M0) was analyzed, yielding 170,057 detected cells with a median of 2,116 transcripts per cell and 624,095,990 total high-quality decoded transcripts across 58.9 million µm² of tissue area. Output files are formatted to closely resemble Xenium Onboard Analysis v4 file formats." + dataset_organism: "homo_sapiens" + segmentation_id: [cell, nucleus] + +output_dataset: "\$id/dataset.zarr" +output_state: "\$id/state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_tenx_atera/main.nf \ + --workspace 167877437119966 \ + --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \ + --params-file /tmp/params_atera.yaml \ + --config src/base/labels_nebius.config \ + --labels datasets,atera \ No newline at end of file diff --git a/src/datasets/loaders/tenx_atera/config.vsh.yaml b/src/datasets/loaders/tenx_atera/config.vsh.yaml new file mode 100644 index 000000000..25ed630fe --- /dev/null +++ b/src/datasets/loaders/tenx_atera/config.vsh.yaml @@ -0,0 +1,70 @@ +name: tenx_atera +namespace: datasets/loaders + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + required: true + description: A 10x Atera directory or zip file or download url + - type: string + name: --segmentation_id + required: true + description: The segmentation identifier + multiple: true + - name: Metadata + arguments: + - type: string + name: --dataset_id + description: "A unique identifier for the dataset" + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: --dataset_url + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: Outputs + arguments: + - name: "--output" + __merge__: /src/api/file_common_ist.yaml + direction: output + required: true + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + pypi: + - spatialdata-io + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/tenx_atera/script.py b/src/datasets/loaders/tenx_atera/script.py new file mode 100644 index 000000000..9532d049f --- /dev/null +++ b/src/datasets/loaders/tenx_atera/script.py @@ -0,0 +1,80 @@ +## code author: Florian Heyl +import spatialdata as sd +import anndata as ad +from spatialdata_io import xenium +import shutil +import os +import zipfile +import tempfile + +## VIASH START +par = { + "input": "https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip", + "segmentation_id": [ + "cell", + "nucleus", + ], + "dataset_id": "value", + "dataset_name": "value", + "dataset_url": "value", + "dataset_reference": "value", + "dataset_summary": "value", + "dataset_description": "value", + "dataset_organism": "value", + "output": "temp/datasets/10x_atera/breast/breast.zarr" +} +meta = { + "cpus": 1, +} + +## VIASH END + +# Download the data if it's a download url, extract the data if it's a zip file +par_input = par["input"] 
+with tempfile.TemporaryDirectory() as tmpdirname: + if par_input.startswith("http"): + import subprocess # local import: run wget without a shell and raise if the download fails + print(f"Downloading data to {tmpdirname}", flush=True) + par_input = os.path.join(tmpdirname, par_input.split("/")[-1]) + subprocess.run(["wget", par["input"], "-O", par_input], check=True) + + if zipfile.is_zipfile(par_input): + print(f"Extracting input zip to {tmpdirname}", flush=True) + with zipfile.ZipFile(par_input, "r") as zip_ref: + zip_ref.extractall(tmpdirname) + par_input = tmpdirname + + # read the data with the spatialdata-io xenium reader + sdata = xenium( + path=par_input, + n_jobs=meta["cpus"] or 1, + cells_boundaries=True, + nucleus_boundaries=True, + morphology_focus=True, + cells_as_circles=False, + ) + + # remove morphology_focus so it is not part of the written output + _ = sdata.images.pop("morphology_focus") + + print("Add uns to table", flush=True) + new_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_reference": par["dataset_reference"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_organism": par["dataset_organism"], + "segmentation_id": par["segmentation_id"], + } + for key, value in new_uns.items(): + sdata.tables["table"].uns[key] = value + + print(f"Output: {sdata}", flush=True) + + print(f"Writing to '{par['output']}'", flush=True) + if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) + + sdata.write(par["output"]) \ No newline at end of file diff --git a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml new file mode 100644 index 000000000..5b43ab444 --- /dev/null +++ b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml @@ -0,0 +1,85 @@ +name: process_tenx_atera +namespace: datasets/workflows + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + required: true + description: A 10x Atera directory or zip file or download url + - type: string + name: 
--segmentation_id + required: true + description: The segmentation identifier + multiple: true + - name: Metadata + arguments: + - type: string + name: --id + description: "A unique identifier for the dataset" + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: --dataset_url + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: Crop region + description: If provided, the region will be cropped to the specified coordinates. + arguments: + - type: integer + name: --crop_region_min_x + required: false + description: The minimum x coordinate of the region to crop. + - type: integer + name: --crop_region_max_x + required: false + description: The maximum x coordinate of the region to crop. + - type: integer + name: --crop_region_min_y + required: false + description: The minimum y coordinate of the region to crop. + - type: integer + name: --crop_region_max_y + required: false + description: The maximum y coordinate of the region to crop. 
+  - name: Outputs
+    arguments:
+      - name: "--output_dataset"
+        __merge__: /src/api/file_common_ist.yaml
+        direction: output
+        required: true
+        default: "$id/dataset.zarr"
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+dependencies:
+  - name: datasets/loaders/tenx_atera
+  - name: datasets/processors/crop_region
+
+runners:
+  - type: nextflow
+    directives:
+      label: [highcpu, midmem, hightime]
\ No newline at end of file
diff --git a/src/datasets/workflows/process_tenx_atera/main.nf b/src/datasets/workflows/process_tenx_atera/main.nf
new file mode 100644
index 000000000..bc5a8f316
--- /dev/null
+++ b/src/datasets/workflows/process_tenx_atera/main.nf
@@ -0,0 +1,59 @@
+// Automatic entry point: runs this workflow on the states returned by
+// findStates and publishes the resulting state.
+workflow auto {
+  findStates(params, meta.config)
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+// Loads a 10x Atera dataset with tenx_atera, optionally crops it with
+// crop_region, and stores the result in the state as output_dataset.
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // copy id to the state
+    | map{ id, state ->
+      def new_state = state + [dataset_id: id]
+      [id, new_state]
+    }
+
+    | tenx_atera.run(
+      fromState: [
+        "input",
+        "segmentation_id",
+        "dataset_id",
+        "dataset_name",
+        "dataset_url",
+        "dataset_reference",
+        "dataset_summary",
+        "dataset_description",
+        "dataset_organism",
+      ],
+      toState: ["output"]
+    )
+
+    // Crop only when a minimum x was supplied. Test against null explicitly:
+    // Groovy truth treats 0 as false, which would skip a crop starting at x = 0.
+    | crop_region.run(
+      runIf: { id, state -> state.crop_region_min_x != null },
+      fromState: [
+        "input": "output",
+        "min_x": "crop_region_min_x",
+        "min_y": "crop_region_min_y",
+        "max_x": "crop_region_max_x",
+        "max_y": "crop_region_max_y"
+      ],
+      toState: ["output"]
+    )
+
+    // expose the final "output" under the declared output_dataset argument
+    | setState([output_dataset: "output"])
+
+  emit:
+  output_ch
+}
\ No newline at end of file