From e0a7846fd429e93b3dd3713c42be667cde0ea7ba Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Wed, 27 May 2026 19:08:36 +0200
Subject: [PATCH 1/5] api for data loader: test

---
 src/api/file_tenx_xenium.yaml                 | 30 +++++++++++++++++++
 .../loaders/tenx_xenium/config.vsh.yaml       |  6 ++--
 2 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 src/api/file_tenx_xenium.yaml

diff --git a/src/api/file_tenx_xenium.yaml b/src/api/file_tenx_xenium.yaml
new file mode 100644
index 0000000..1443649
--- /dev/null
+++ b/src/api/file_tenx_xenium.yaml
@@ -0,0 +1,30 @@
+type: file
+example: "https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hLiver_cancer_section_FFPE/Xenium_V1_hLiver_cancer_section_FFPE_outs.zip"
+label: "10x Xenium Raw Dataset"
+summary: A 10x Xenium output zip archive or download URL.
+description: |
+  Raw output from the 10x Genomics Xenium platform provided as a zip archive
+  or a URL pointing to one. The archive is expected to contain the standard
+  Xenium Onboard Analysis output files.
+info:
+  format:
+    type: zip
+    files:
+      - name: transcripts.csv.gz
+        description: Per-transcript coordinates and feature assignments.
+        required: true
+      - name: cell_boundaries.parquet
+        description: Cell boundary polygon geometries.
+        required: false
+      - name: nucleus_boundaries.parquet
+        description: Nucleus boundary polygon geometries.
+        required: false
+      - name: cells.csv.gz
+        description: Per-cell summary metadata.
+        required: false
+      - name: morphology_focus.ome.tif
+        description: Morphology focus image (multi-channel).
+        required: false
+      - name: experiment.xenium
+        description: Experiment metadata JSON.
+        required: false
diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
index 109536e..f32402e 100644
--- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml
+++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
@@ -4,10 +4,10 @@ namespace: datasets/loaders
 argument_groups:
   - name: Inputs
     arguments:
-      - type: string
-        name: --input
+      - name: "--input"
+        __merge__: /src/api/file_tenx_xenium.yaml
         required: true
-        description: A 10x xenium directory or zip file or download url
+        direction: input
       - type: string
         name: --segmentation_id
         required: true

From 24439f6cfb844c117625bb452ccf624eef52922e Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Mon, 1 Jun 2026 13:36:55 +0200
Subject: [PATCH 2/5] adding the yaml for the 10X loader

---
 src/api/comp_dataset_loader_tenx_xenium.yaml | 58 ++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 src/api/comp_dataset_loader_tenx_xenium.yaml

diff --git a/src/api/comp_dataset_loader_tenx_xenium.yaml b/src/api/comp_dataset_loader_tenx_xenium.yaml
new file mode 100644
index 0000000..dd8fe40
--- /dev/null
+++ b/src/api/comp_dataset_loader_tenx_xenium.yaml
@@ -0,0 +1,58 @@
+namespace: "datasets/loaders"
+info:
+  type: dataset_loader
+  type_info:
+    label: 10x Xenium Dataset Loader
+    summary: Loads a raw 10x Xenium dataset and converts it to a Common iST Dataset.
+    description: |
+      Downloads and extracts a 10x Xenium zip archive, then converts it into
+      a task-compatible SpatialData zarr using spatialdata-io.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        __merge__: file_tenx_xenium.yaml
+        required: true
+        direction: input
+      - type: string
+        name: --segmentation_id
+        description: The segmentation identifier(s) to include (e.g. cell, nucleus).
+        required: true
+        multiple: true
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        __merge__: file_common_ist.yaml
+        direction: output
+        required: true
+  - name: Dataset Metadata
+    description: Metadata that will be stored in the output dataset.
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: A unique identifier for the dataset.
+        required: true
+      - name: --dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false

From f644ab67fc61eda2ac4a1f85452abf5b1d9c40d9 Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Mon, 1 Jun 2026 13:37:15 +0200
Subject: [PATCH 3/5] inherit the loader yaml

---
 src/datasets/loaders/tenx_xenium/config.vsh.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
index f32402e..230a03b 100644
--- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml
+++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
@@ -1,5 +1,6 @@
+__merge__: /src/api/comp_dataset_loader_tenx_xenium.yaml
+
 name: tenx_xenium
-namespace: datasets/loaders
 
 argument_groups:
   - name: Inputs

From 7674c21dd5ea7b2af0f3404556553dd16edf51b1 Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Mon, 1 Jun 2026 13:54:20 +0200
Subject: [PATCH 4/5] remove duplicate references

---
 .../loaders/tenx_xenium/config.vsh.yaml       | 51 +------------------
 1 file changed, 1 insertion(+), 50 deletions(-)

diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
index 230a03b..84b252d 100644
--- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml
+++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
@@ -2,55 +2,6 @@ __merge__: /src/api/comp_dataset_loader_tenx_xenium.yaml
 
 name: tenx_xenium
 
-argument_groups:
-  - name: Inputs
-    arguments:
-      - name: "--input"
-        __merge__: /src/api/file_tenx_xenium.yaml
-        required: true
-        direction: input
-      - type: string
-        name: --segmentation_id
-        required: true
-        description: The segmentation identifier
-        multiple: true
-  - name: Metadata
-    arguments:
-      - type: string
-        name: --dataset_id
-        description: "A unique identifier for the dataset"
-        required: true
-      - name: --dataset_name
-        type: string
-        description: Nicely formatted name.
-        required: true
-      - type: string
-        name: --dataset_url
-        description: Link to the original source of the dataset.
-        required: false
-      - name: --dataset_reference
-        type: string
-        description: Bibtex reference of the paper in which the dataset was published.
-        required: false
-      - name: --dataset_summary
-        type: string
-        description: Short description of the dataset.
-        required: true
-      - name: --dataset_description
-        type: string
-        description: Long description of the dataset.
-        required: true
-      - name: --dataset_organism
-        type: string
-        description: The organism of the sample in the dataset.
-        required: false
-  - name: Outputs
-    arguments:
-      - name: "--output"
-        __merge__: /src/api/file_common_ist.yaml
-        direction: output
-        required: true
-
 resources:
   - type: python_script
     path: script.py
@@ -68,4 +19,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [midmem, midcpu, midtime]
+      label: [midmem, midcpu, midtime]
\ No newline at end of file

From e615c0f716918d68f273f1253c4860da59b1cd9f Mon Sep 17 00:00:00 2001
From: dariarom94 <romanovskaiadaria@gmail.com>
Date: Mon, 1 Jun 2026 15:00:25 +0200
Subject: [PATCH 5/5] update readme

---
 README.md | 314 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 173 insertions(+), 141 deletions(-)

diff --git a/README.md b/README.md
index 198c62d..abecb80 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ should convince readers of the significance and relevance of your task.
 
 ``` mermaid
 flowchart TB
-  file_common_ist("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-common-ist-dataset'>Common iST Dataset</a>")
+  file_common_scrnaseq("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-common-sc-dataset'>Common SC Dataset</a>")
   comp_data_processor[/"<a href='https://github.com/openproblems-bio/task_spatial_segmentation#component-type-data-processor'>Data processor</a>"/]
   file_spatial_unlabelled("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-unlabelled'>Unlabelled</a>")
   file_spatial_solution("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-solution'>Solution</a>")
@@ -50,8 +50,10 @@ flowchart TB
   file_prediction("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-predicted-data'>Predicted data</a>")
   file_processed_prediction("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-processed-prediction'>Processed prediction</a>")
   file_score("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-score'>Score</a>")
-  file_common_scrnaseq("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-common-sc-dataset'>Common SC Dataset</a>")
-  file_common_ist---comp_data_processor
+  file_common_ist("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-common-ist-dataset'>Common iST Dataset</a>")
+  file_tenx_xenium("<a href='https://github.com/openproblems-bio/task_spatial_segmentation#file-format-10x-xenium-raw-dataset'>10x Xenium Raw Dataset</a>")
+  comp_dataset_loader_tenx_xenium[/"<a href='https://github.com/openproblems-bio/task_spatial_segmentation#component-type-10x-xenium-dataset-loader'>10x Xenium Dataset Loader</a>"/]
+  file_common_scrnaseq---comp_data_processor
   comp_data_processor-->file_spatial_unlabelled
   comp_data_processor-->file_spatial_solution
   comp_data_processor-->file_scrnaseq_reference
@@ -66,32 +68,38 @@ flowchart TB
   comp_metric-->file_score
   file_prediction---comp_output_processor
   file_processed_prediction---comp_metric
-  file_common_scrnaseq---comp_data_processor
+  file_common_ist---comp_data_processor
+  file_tenx_xenium---comp_dataset_loader_tenx_xenium
+  comp_dataset_loader_tenx_xenium-->file_common_ist
 ```
 
-## File format: Common iST Dataset
+## File format: Common SC Dataset
 
-An unprocessed spatial imaging dataset stored as a zarr file.
+An unprocessed dataset as output by a dataset loader.
 
 Example file:
-`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr`
+`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad`
 
 Description:
 
-This dataset contains raw images, labels, points, shapes, and tables as
-output by a dataset loader.
+This dataset contains raw counts and metadata as output by a dataset
+loader.
+
+The format of this file is mainly derived from the [CELLxGENE schema
+v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
 
 Format:
 
 <div class="small">
 
-    SpatialData object
-     images: 'image', 'image_3D', 'he_image'
-     labels: 'cell_labels', 'nucleus_labels', 'groundtruth_cell_labels'
-     points: 'transcripts'
-     shapes: 'cell_boundaries', 'nucleus_boundaries'
-     tables: 'table'
-     coordinate_systems: 'global'
+    AnnData object
+     obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid'
+     var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score'
+     obsm: 'X_pca'
+     obsp: 'knn_distances', 'knn_connectivities'
+     varm: 'pca_loadings'
+     layers: 'counts', 'normalized'
+     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
 
 </div>
 
@@ -99,78 +107,53 @@ Data structure:
 
 <div class="small">
 
-*images*
-
-| Name       | Description                         |
-|:-----------|:------------------------------------|
-| `image`    | The raw image data.                 |
-| `image_3D` | (*Optional*) The raw 3D image data. |
-| `he_image` | (*Optional*) H&E image data.        |
-
-*labels*
-
-| Name | Description |
-|:---|:---|
-| `cell_labels` | Vendor-provided cell segmentation labels. |
-| `nucleus_labels` | Vendor-provided nucleus segmentation labels. |
-| `groundtruth_cell_labels` | (*Optional*) Manually annotated cell segmentation labels used as ground truth for evaluation. |
-
-*points*
-
-`transcripts`: Point cloud data of transcripts.
-
-| Column | Type | Description |
-|:---|:---|:---|
-| `x` | `float` | x-coordinate of the point. |
-| `y` | `float` | y-coordinate of the point. |
-| `z` | `float` | (*Optional*) z-coordinate of the point. |
-| `feature_name` | `categorical` | Name of the feature. |
-| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. |
-| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. |
-| `cell_type` | `string` | (*Optional*) Cell type of the cell. |
-| `qv` | `float` | (*Optional*) Quality value of the point. |
-| `transcript_id` | `long` | Unique identifier of the transcript. |
-| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. |
-
-*shapes*
-
-`cell_boundaries`: Cell boundaries.
-
-| Column     | Type     | Description                    |
-|:-----------|:---------|:-------------------------------|
-| `geometry` | `object` | Geometry of the cell boundary. |
-
-`nucleus_boundaries`: Nucleus boundaries.
-
-| Column     | Type     | Description                       |
-|:-----------|:---------|:----------------------------------|
-| `geometry` | `object` | Geometry of the nucleus boundary. |
-
-*tables*
-
-`table`: Metadata of spatial dataset.
-
 | Slot | Type | Description |
 |:---|:---|:---|
-| `obs["cell_id"]` | `string` | A unique identifier for the cell. |
-| `obs["groundtruth_cell_type"]` | `string` | (*Optional*) Manually curated cell type annotations which serves as ground truth for evaluations. |
-| `var["gene_ids"]` | `string` | Unique identifier for the gene. |
-| `var["feature_types"]` | `string` | Type of the feature. |
-| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. |
-| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
+| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. |
+| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
+| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
+| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
+| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. |
+| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. |
+| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. |
+| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. |
+| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. |
+| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. |
+| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. |
+| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). |
+| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. |
+| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. |
+| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. |
+| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. |
+| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. |
+| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. |
+| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. |
+| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. |
+| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. |
+| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. |
+| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. |
+| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. |
+| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. |
+| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. |
+| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. |
+| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. |
+| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. |
+| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. |
+| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
+| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. |
+| `obsm["X_pca"]` | `double` | The resulting PCA embedding. |
+| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. |
+| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. |
+| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. |
+| `layers["counts"]` | `integer` | Raw counts. |
+| `layers["normalized"]` | `integer` | Normalized expression values. |
+| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. |
 | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. |
-| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. |
-| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. |
+| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. |
+| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. |
 | `uns["dataset_summary"]` | `string` | Short description of the dataset. |
 | `uns["dataset_description"]` | `string` | Long description of the dataset. |
-| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. |
-| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. |
-
-*coordinate_systems*
-
-| Name     | Description                         |
-|:---------|:------------------------------------|
-| `global` | Coordinate system of the replicate. |
+| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
 
 </div>
 
@@ -594,33 +577,29 @@ Data structure:
 
 </div>
 
-## File format: Common SC Dataset
+## File format: Common iST Dataset
 
-An unprocessed dataset as output by a dataset loader.
+An unprocessed spatial imaging dataset stored as a zarr file.
 
 Example file:
-`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad`
+`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr`
 
 Description:
 
-This dataset contains raw counts and metadata as output by a dataset
-loader.
-
-The format of this file is mainly derived from the [CELLxGENE schema
-v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+This dataset contains raw images, labels, points, shapes, and tables as
+output by a dataset loader.
 
 Format:
 
 <div class="small">
 
-    AnnData object
-     obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid'
-     var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score'
-     obsm: 'X_pca'
-     obsp: 'knn_distances', 'knn_connectivities'
-     varm: 'pca_loadings'
-     layers: 'counts', 'normalized'
-     uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
+    SpatialData object
+     images: 'image', 'image_3D', 'he_image'
+     labels: 'cell_labels', 'nucleus_labels', 'groundtruth_cell_labels'
+     points: 'transcripts'
+     shapes: 'cell_boundaries', 'nucleus_boundaries'
+     tables: 'table'
+     coordinate_systems: 'global'
 
 </div>
 
@@ -628,52 +607,105 @@ Data structure:
 
 <div class="small">
 
+*images*
+
+| Name       | Description                         |
+|:-----------|:------------------------------------|
+| `image`    | The raw image data.                 |
+| `image_3D` | (*Optional*) The raw 3D image data. |
+| `he_image` | (*Optional*) H&E image data.        |
+
+*labels*
+
+| Name | Description |
+|:---|:---|
+| `cell_labels` | Vendor-provided cell segmentation labels. |
+| `nucleus_labels` | Vendor-provided nucleus segmentation labels. |
+| `groundtruth_cell_labels` | (*Optional*) Manually annotated cell segmentation labels used as ground truth for evaluation. |
+
+*points*
+
+`transcripts`: Point cloud data of transcripts.
+
+| Column | Type | Description |
+|:---|:---|:---|
+| `x` | `float` | x-coordinate of the point. |
+| `y` | `float` | y-coordinate of the point. |
+| `z` | `float` | (*Optional*) z-coordinate of the point. |
+| `feature_name` | `categorical` | Name of the feature. |
+| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. |
+| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. |
+| `cell_type` | `string` | (*Optional*) Cell type of the cell. |
+| `qv` | `float` | (*Optional*) Quality value of the point. |
+| `transcript_id` | `long` | Unique identifier of the transcript. |
+| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. |
+
+*shapes*
+
+`cell_boundaries`: Cell boundaries.
+
+| Column     | Type     | Description                    |
+|:-----------|:---------|:-------------------------------|
+| `geometry` | `object` | Geometry of the cell boundary. |
+
+`nucleus_boundaries`: Nucleus boundaries.
+
+| Column     | Type     | Description                       |
+|:-----------|:---------|:----------------------------------|
+| `geometry` | `object` | Geometry of the nucleus boundary. |
+
+*tables*
+
+`table`: Metadata of the spatial dataset.
+
 | Slot | Type | Description |
 |:---|:---|:---|
-| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. |
-| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
-| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
-| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. |
-| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. |
-| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. |
-| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. |
-| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. |
-| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. |
-| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. |
-| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. |
-| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). |
-| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. |
-| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. |
-| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. |
-| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. |
-| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. |
-| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. |
-| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. |
-| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. |
-| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. |
-| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. |
-| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. |
-| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. |
-| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. |
-| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. |
-| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. |
-| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. |
-| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. |
-| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. |
-| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
-| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. |
-| `obsm["X_pca"]` | `double` | The resulting PCA embedding. |
-| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. |
-| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. |
-| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. |
-| `layers["counts"]` | `integer` | Raw counts. |
-| `layers["normalized"]` | `integer` | Normalized expression values. |
-| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. |
+| `obs["cell_id"]` | `string` | A unique identifier for the cell. |
+| `obs["groundtruth_cell_type"]` | `string` | (*Optional*) Manually curated cell type annotations that serve as ground truth for evaluations. |
+| `var["gene_ids"]` | `string` | Unique identifier for the gene. |
+| `var["feature_types"]` | `string` | Type of the feature. |
+| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. |
+| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
 | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. |
-| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. |
-| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. |
+| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. |
+| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. |
 | `uns["dataset_summary"]` | `string` | Short description of the dataset. |
 | `uns["dataset_description"]` | `string` | Long description of the dataset. |
-| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
+| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. |
+| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. |
+
+*coordinate_systems*
+
+| Name     | Description                         |
+|:---------|:------------------------------------|
+| `global` | Coordinate system of the replicate. |
+
+</div>
+
+## File format: 10x Xenium Raw Dataset
+
+A 10x Xenium output zip archive or download URL.
+
+Example file:
+`https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hLiver_cancer_section_FFPE/Xenium_V1_hLiver_cancer_section_FFPE_outs.zip`
+
+Description:
+
+Raw output from the 10x Genomics Xenium platform provided as a zip
+archive or a URL pointing to one. The archive is expected to contain the
+standard Xenium Onboard Analysis output files.
+
+## Component type: 10x Xenium Dataset Loader
+
+Loads a raw 10x Xenium dataset and converts it to a Common iST Dataset.
+
+Arguments:
+
+<div class="small">
+
+| Name | Type | Description |
+|:---|:---|:---|
+| `--input` | `file` | A 10x Xenium output zip archive or download URL. |
+| `--output` | `file` | (*Output*) An unprocessed spatial imaging dataset stored as a zarr file. |
 
 </div>