From e0a7846fd429e93b3dd3713c42be667cde0ea7ba Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Wed, 27 May 2026 19:08:36 +0200 Subject: [PATCH 1/5] api for data loader: test --- src/api/file_tenx_xenium.yaml | 30 +++++++++++++++++++ .../loaders/tenx_xenium/config.vsh.yaml | 6 ++-- 2 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/api/file_tenx_xenium.yaml diff --git a/src/api/file_tenx_xenium.yaml b/src/api/file_tenx_xenium.yaml new file mode 100644 index 0000000..1443649 --- /dev/null +++ b/src/api/file_tenx_xenium.yaml @@ -0,0 +1,30 @@ +type: file +example: "https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hLiver_cancer_section_FFPE/Xenium_V1_hLiver_cancer_section_FFPE_outs.zip" +label: "10x Xenium Raw Dataset" +summary: A 10x Xenium output zip archive or download URL. +description: | + Raw output from the 10x Genomics Xenium platform provided as a zip archive + or a URL pointing to one. The archive is expected to contain the standard + Xenium Onboard Analysis output files. +info: + format: + type: zip + files: + - name: transcripts.csv.gz + description: Per-transcript coordinates and feature assignments. + required: true + - name: cell_boundaries.parquet + description: Cell boundary polygon geometries. + required: false + - name: nucleus_boundaries.parquet + description: Nucleus boundary polygon geometries. + required: false + - name: cells.csv.gz + description: Per-cell summary metadata. + required: false + - name: morphology_focus.ome.tif + description: Morphology focus image (multi-channel). + required: false + - name: experiment.xenium + description: Experiment metadata JSON. + required: false diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml index 109536e..f32402e 100644 --- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml +++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml @@ -4,10 +4,10 @@ namespace: datasets/loaders argument_groups: - name: Inputs arguments: - - type: string - name: --input + - name: "--input" + __merge__: /src/api/file_tenx_xenium.yaml required: true - description: A 10x xenium directory or zip file or download url + direction: input - type: string name: --segmentation_id required: true From 24439f6cfb844c117625bb452ccf624eef52922e Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Mon, 1 Jun 2026 13:36:55 +0200 Subject: [PATCH 2/5] adding the yaml for the 10X loader --- src/api/comp_dataset_loader_tenx_xenium.yaml | 58 ++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/api/comp_dataset_loader_tenx_xenium.yaml diff --git a/src/api/comp_dataset_loader_tenx_xenium.yaml b/src/api/comp_dataset_loader_tenx_xenium.yaml new file mode 100644 index 0000000..dd8fe40 --- /dev/null +++ b/src/api/comp_dataset_loader_tenx_xenium.yaml @@ -0,0 +1,58 @@ +namespace: "datasets/loaders" +info: + type: dataset_loader + type_info: + label: 10x Xenium Dataset Loader + summary: Loads a raw 10x Xenium dataset and converts it to a Common iST Dataset. + description: | + Downloads and extracts a 10x Xenium zip archive, then converts it into + a task-compatible SpatialData zarr using spatialdata-io. +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: file_tenx_xenium.yaml + required: true + direction: input + - type: string + name: --segmentation_id + description: The segmentation identifier(s) to include (e.g. cell, nucleus). + required: true + multiple: true + - name: Outputs + arguments: + - name: "--output" + __merge__: file_common_ist.yaml + direction: output + required: true + - name: Dataset Metadata + description: Metadata that will be stored in the output dataset. + arguments: + - type: string + name: --dataset_id + description: A unique identifier for the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: --dataset_url + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false \ No newline at end of file From f644ab67fc61eda2ac4a1f85452abf5b1d9c40d9 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Mon, 1 Jun 2026 13:37:15 +0200 Subject: [PATCH 3/5] inherit the loader yaml --- src/datasets/loaders/tenx_xenium/config.vsh.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml index f32402e..230a03b 100644 --- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml +++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml @@ -1,5 +1,6 @@ +__merge__: /src/api/comp_dataset_loader_tenx_xenium.yaml + name: tenx_xenium -namespace: datasets/loaders argument_groups: - name: Inputs From 7674c21dd5ea7b2af0f3404556553dd16edf51b1 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Mon, 1 Jun 2026 13:54:20 +0200 Subject: [PATCH 4/5] remove duplicate references --- .../loaders/tenx_xenium/config.vsh.yaml | 51 +------------------ 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml index 230a03b..84b252d 100644 --- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml +++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml @@ -2,55 +2,6 @@ __merge__: /src/api/comp_dataset_loader_tenx_xenium.yaml name: tenx_xenium -argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/api/file_tenx_xenium.yaml - required: true - direction: input - - type: string - name: --segmentation_id - required: true - description: The segmentation identifier - multiple: true - - name: Metadata - arguments: - - type: string - name: --dataset_id - description: "A unique identifier for the dataset" - required: true - - name: --dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: --dataset_url - description: Link to the original source of the dataset. - required: false - - name: --dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: --dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: --dataset_description - type: string - description: Long description of the dataset. - required: true - - name: --dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: Outputs - arguments: - - name: "--output" - __merge__: /src/api/file_common_ist.yaml - direction: output - required: true - resources: - type: python_script path: script.py @@ -68,4 +19,4 @@ runners: - type: executable - type: nextflow directives: - label: [midmem, midcpu, midtime] + label: [midmem, midcpu, midtime] \ No newline at end of file From e615c0f716918d68f273f1253c4860da59b1cd9f Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Mon, 1 Jun 2026 15:00:25 +0200 Subject: [PATCH 5/5] update readme --- README.md | 314 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 173 insertions(+), 141 deletions(-) diff --git a/README.md b/README.md index 198c62d..abecb80 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ should convince readers of the significance and relevance of your task. ``` mermaid flowchart TB - file_common_ist("Common iST Dataset") + file_common_scrnaseq("Common SC Dataset") comp_data_processor[/"Data processor"/] file_spatial_unlabelled("Unlabelled") file_spatial_solution("Solution") @@ -50,8 +50,10 @@ flowchart TB file_prediction("Predicted data") file_processed_prediction("Processed prediction") file_score("Score") - file_common_scrnaseq("Common SC Dataset") - file_common_ist---comp_data_processor + file_common_ist("Common iST Dataset") + file_tenx_xenium("10x Xenium Raw Dataset") + comp_dataset_loader_tenx_xenium[/"10x Xenium Dataset Loader"/] + file_common_scrnaseq---comp_data_processor comp_data_processor-->file_spatial_unlabelled comp_data_processor-->file_spatial_solution comp_data_processor-->file_scrnaseq_reference @@ -66,32 +68,38 @@ flowchart TB comp_metric-->file_score file_prediction---comp_output_processor file_processed_prediction---comp_metric - file_common_scrnaseq---comp_data_processor + file_common_ist---comp_data_processor + file_tenx_xenium---comp_dataset_loader_tenx_xenium + comp_dataset_loader_tenx_xenium-->file_common_ist ``` -## File format: Common iST Dataset +## File format: Common SC Dataset -An unprocessed spatial imaging dataset stored as a zarr file. +An unprocessed dataset as output by a dataset loader. Example file: -`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr` +`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad` Description: -This dataset contains raw images, labels, points, shapes, and tables as -output by a dataset loader. +This dataset contains raw counts and metadata as output by a dataset +loader. + +The format of this file is mainly derived from the [CELLxGENE schema +v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). Format:
- SpatialData object - images: 'image', 'image_3D', 'he_image' - labels: 'cell_labels', 'nucleus_labels', 'groundtruth_cell_labels' - points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'table' - coordinate_systems: 'global' + AnnData object + obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism'
@@ -99,78 +107,53 @@ Data structure:
-*images* - -| Name | Description | -|:-----------|:------------------------------------| -| `image` | The raw image data. | -| `image_3D` | (*Optional*) The raw 3D image data. | -| `he_image` | (*Optional*) H&E image data. | - -*labels* - -| Name | Description | -|:---|:---| -| `cell_labels` | Vendor-provided cell segmentation labels. | -| `nucleus_labels` | Vendor-provided nucleus segmentation labels. | -| `groundtruth_cell_labels` | (*Optional*) Manually annotated cell segmentation labels used as ground truth for evaluation. | - -*points* - -`transcripts`: Point cloud data of transcripts. - -| Column | Type | Description | -|:---|:---|:---| -| `x` | `float` | x-coordinate of the point. | -| `y` | `float` | y-coordinate of the point. | -| `z` | `float` | (*Optional*) z-coordinate of the point. | -| `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | -| `qv` | `float` | (*Optional*) Quality value of the point. | -| `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | - -*tables* - -`table`: Metadata of spatial dataset. - | Slot | Type | Description | |:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `obs["groundtruth_cell_type"]` | `string` | (*Optional*) Manually curated cell type annotations which serves as ground truth for evaluations. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. | -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. | +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `integer` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | | `uns["dataset_summary"]` | `string` | Short description of the dataset. | | `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | - -*coordinate_systems* - -| Name | Description | -|:---------|:------------------------------------| -| `global` | Coordinate system of the replicate. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
@@ -594,33 +577,29 @@ Data structure: -## File format: Common SC Dataset +## File format: Common iST Dataset -An unprocessed dataset as output by a dataset loader. +An unprocessed spatial imaging dataset stored as a zarr file. Example file: -`resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad` +`resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr` Description: -This dataset contains raw counts and metadata as output by a dataset -loader. - -The format of this file is mainly derived from the [CELLxGENE schema -v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). +This dataset contains raw images, labels, points, shapes, and tables as +output by a dataset loader. Format:
- AnnData object - obs: 'cell_type', 'cell_type_level2', 'cell_type_level3', 'cell_type_level4', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid' - var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - varm: 'pca_loadings' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + SpatialData object + images: 'image', 'image_3D', 'he_image' + labels: 'cell_labels', 'nucleus_labels', 'groundtruth_cell_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'table' + coordinate_systems: 'global'
@@ -628,52 +607,105 @@ Data structure:
+*images* + +| Name | Description | +|:-----------|:------------------------------------| +| `image` | The raw image data. | +| `image_3D` | (*Optional*) The raw 3D image data. | +| `he_image` | (*Optional*) H&E image data. | + +*labels* + +| Name | Description | +|:---|:---| +| `cell_labels` | Vendor-provided cell segmentation labels. | +| `nucleus_labels` | Vendor-provided nucleus segmentation labels. | +| `groundtruth_cell_labels` | (*Optional*) Manually annotated cell segmentation labels used as ground truth for evaluation. | + +*points* + +`transcripts`: Point cloud data of transcripts. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | +| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | +| `cell_type` | `string` | (*Optional*) Cell type of the cell. | +| `qv` | `float` | (*Optional*) Quality value of the point. | +| `transcript_id` | `long` | Unique identifier of the transcript. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | + +*shapes* + +`cell_boundaries`: Cell boundaries. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Nucleus boundaries. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`table`: Metadata of spatial dataset. + | Slot | Type | Description | |:---|:---|:---| -| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["cell_type_level2"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["cell_type_level3"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["cell_type_level4"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | -| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | -| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | -| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | -| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | -| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | -| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | -| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | -| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | -| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | -| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | -| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | -| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | -| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | -| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. | -| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | -| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | -| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | -| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | -| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | -| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. | -| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `integer` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `obs["cell_id"]` | `string` | A unique identifier for the cell. | +| `obs["groundtruth_cell_type"]` | `string` | (*Optional*) Manually curated cell type annotations which serves as ground truth for evaluations. | +| `var["gene_ids"]` | `string` | Unique identifier for the gene. | +| `var["feature_types"]` | `string` | Type of the feature. | +| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | | `uns["dataset_summary"]` | `string` | Short description of the dataset. | | `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | +| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | + +*coordinate_systems* + +| Name | Description | +|:---------|:------------------------------------| +| `global` | Coordinate system of the replicate. | + +
+ +## File format: 10x Xenium Raw Dataset + +A 10x Xenium output zip archive or download URL. + +Example file: +`https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hLiver_cancer_section_FFPE/Xenium_V1_hLiver_cancer_section_FFPE_outs.zip` + +Description: + +Raw output from the 10x Genomics Xenium platform provided as a zip +archive or a URL pointing to one. The archive is expected to contain the +standard Xenium Onboard Analysis output files. + +## Component type: 10x Xenium Dataset Loader + +Loads a raw 10x Xenium dataset and converts it to a Common iST Dataset. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A 10x Xenium output zip archive or download URL. | +| `--output` | `file` | (*Output*) An unprocessed spatial imaging dataset stored as a zarr file. |