def arrange_manifest_channels(manifest, channels=None):
    """
    Return a wide dataframe with one row per plate/well/site and URL columns per channel.

    Parameters
    ----------
    manifest : pandas.DataFrame
        Long-format manifest with one row per image file. Must contain the
        columns ``Metadata_Plate``, ``Metadata_Well``, ``Metadata_Site``,
        ``Metadata_ChannelName`` and ``Metadata_FileUrl``.
    channels : iterable of str, optional
        Channel names to keep, in the column order wanted in the output.
        Defaults to the eight channels
        ``LZ_BF, BF, HZ_BF, DNA, Mito, AGP, ER, RNA``.

    Returns
    -------
    pandas.DataFrame
        The three key columns followed by one column per requested channel.
        Each channel cell holds the first non-null ``Metadata_FileUrl`` found
        for that plate/well/site/channel; channels with no file are NaN.
        Rows are sorted by the key columns.

    Raises
    ------
    KeyError
        If ``manifest`` lacks any of the required columns.
    """
    required_channels = (
        ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"]
        if channels is None
        else list(channels)
    )
    keys = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
    channel_col = "Metadata_ChannelName"
    url_col = "Metadata_FileUrl"

    needed = [*keys, channel_col, url_col]
    missing = [col for col in needed if col not in manifest.columns]
    if missing:
        raise KeyError(f"manifest is missing required column(s): {missing}")

    # Only the columns taking part in the pivot are carried along.
    filtered = manifest.loc[manifest[channel_col].isin(required_channels), needed]
    # "first" already skips null URLs, so dropping them up front changes
    # nothing except letting the empty guard below catch the all-null case.
    filtered = filtered.dropna(subset=[url_col])
    if filtered.empty:
        return pd.DataFrame(columns=[*keys, *required_channels])

    # Group on plain labels: a Categorical grouper with observed=False makes
    # pandas materialise every plate x well x site x channel combination
    # before throwing the empty ones away again.
    filtered = filtered.astype({channel_col: object})

    wide = (
        filtered.pivot_table(
            index=keys,
            columns=channel_col,
            values=url_col,
            aggfunc="first",
            observed=True,
        )
        # pivot_table drops channels that never occur; reindex restores them
        # (as NaN columns) and fixes the column order.
        .reindex(columns=required_channels)
        .reset_index()
    )
    # Do not leak the pivot's column-axis label into the result.
    wide.columns.name = None

    return wide
# Pinned upstream commit: every remote table below is read at this exact ref.
REPO_REF = "6ea3958c3809cd04ac95b63138937dd64a7c4c12"
REPO_BASE = f"https://github.com/WayScience/JUMP-single-cell/raw/{REPO_REF}/"

IMAGE_MANIFEST_URL = f"{REPO_BASE}0.download_data/data/2020_11_04_CPJUMP1_all_plates.parquet"
# Only these columns are read from the image manifest parquet.
IMAGE_MANIFEST_COLUMNS = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_ChannelName",
    "Metadata_PlaneID",
    "Metadata_PositionZ",
    "Metadata_FileUrl",
    "Metadata_Filename",
]

EXPERIMENT_METADATA_URL = f"{REPO_BASE}reference_plate_data/experiment-metadata.tsv"
COMPOUND_PLATEMAP_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_platemap.txt"
COMPOUND_METADATA_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_metadata_targets.tsv"

__all__ = ["build_manifest", "get_manifest", "main"]

# Process-wide cache filled by get_manifest().
_MANIFEST_CACHE: Optional[pd.DataFrame] = None


def _select_compound_plates(experiment_meta: pd.DataFrame) -> pd.DataFrame:
    """Keep compound-perturbation plates that are not in a ``*_DL`` batch.

    ``Assay_Plate_Barcode`` is renamed to ``Metadata_Plate`` so the result can
    be joined to the image manifest. The input frame is not modified.
    """
    plates = experiment_meta.rename(columns={"Assay_Plate_Barcode": "Metadata_Plate"})
    is_compound = plates["Perturbation"] == "compound"
    # exclude dl batch which is essentially duplicate in context for image data access.
    # na=False: a missing Batch is "not a DL batch" instead of poisoning the
    # boolean mask (``~`` raises TypeError on an object mask containing NaN).
    is_dl_batch = plates["Batch"].str.endswith("_DL", na=False)
    return plates[is_compound & ~is_dl_batch]


def _merge_manifest_tables(
    image_manifest: pd.DataFrame,
    compound_plates: pd.DataFrame,
    platemap: pd.DataFrame,
    compound_metadata: pd.DataFrame,
) -> pd.DataFrame:
    """Join platemap, compound metadata, plate metadata and image rows.

    Each merge passes ``validate=`` so an unexpected key cardinality raises
    ``pandas.errors.MergeError`` instead of silently duplicating rows.
    """
    compound_platemap = pd.merge(
        platemap,
        compound_metadata,
        on="broad_sample",
        how="left",
        validate="many_to_one",
    ).rename(columns={"well_position": "Metadata_Well"})

    image_manifest_compound = pd.merge(
        compound_plates,
        image_manifest,
        on="Metadata_Plate",
        how="inner",
        validate="one_to_many",  # one plate id should map to many image rows
    )

    return pd.merge(
        compound_platemap,
        image_manifest_compound,
        on="Metadata_Well",
        how="inner",
        # all the plates share the same well map so one well should map to many image rows
        validate="one_to_many",
    )


def build_manifest() -> pd.DataFrame:
    """
    Main utility function that handles all the wrangling.
    Return an enriched CPJUMP1 manifest as a pandas DataFrame.

    Reads the image manifest (parquet) and three tab-separated reference
    tables from the pinned ``REPO_BASE`` URLs on every call, so it needs
    network access; use :func:`get_manifest` to avoid repeated reads.
    """
    image_manifest = pd.read_parquet(IMAGE_MANIFEST_URL, columns=IMAGE_MANIFEST_COLUMNS)
    compound_plates = _select_compound_plates(
        pd.read_csv(EXPERIMENT_METADATA_URL, delimiter="\t")
    )
    return _merge_manifest_tables(
        image_manifest,
        compound_plates,
        pd.read_csv(COMPOUND_PLATEMAP_URL, delimiter="\t"),
        pd.read_csv(COMPOUND_METADATA_URL, delimiter="\t"),
    )


def get_manifest(refresh: bool = False) -> pd.DataFrame:
    """
    Return a cached manifest to avoid repeated network reads.

    The cached DataFrame object itself is returned (not a copy), so in-place
    changes made by a caller are visible to later callers. Pass
    ``refresh=True`` to rebuild the manifest and replace the cache.
    """
    global _MANIFEST_CACHE
    if refresh or _MANIFEST_CACHE is None:
        _MANIFEST_CACHE = build_manifest()
    return _MANIFEST_CACHE


def _infer_format(output: str) -> str:
    """Return ``"parquet"`` for ``.parquet``/``.pq`` paths, otherwise ``"csv"``."""
    return "parquet" if output.lower().endswith((".parquet", ".pq")) else "csv"


def _write_manifest(df: pd.DataFrame, output: str, fmt: str) -> None:
    """Write ``df`` to ``output`` as ``fmt`` (``"csv"`` or ``"parquet"``).

    Raises ``ValueError`` for any other format name.
    """
    if fmt == "csv":
        df.to_csv(output, index=False)
    elif fmt == "parquet":
        df.to_parquet(output, index=False)
    else:
        raise ValueError(f"Unsupported format: {fmt}")


def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line interface for building and outputting the CPJUMP1 manifest.
    By default, it prints a summary and preview of the manifest.
    Use --output or --stdout to write the full manifest to a file or stdout.
    If both are given, --stdout takes precedence and no file is written.

    ``argv`` is the argument list to parse; ``None`` makes argparse read
    ``sys.argv``. Returns the process exit code (always 0 on success).
    """
    parser = argparse.ArgumentParser(description="Build CPJUMP1 enriched manifest.")
    parser.add_argument(
        "--output",
        help="Write manifest to a file (CSV or Parquet).",
    )
    parser.add_argument(
        "--format",
        choices=["csv", "parquet"],
        default=None,
        help=(
            "Output file format when using --output "
            "(default: inferred from the file extension, otherwise csv)."
        ),
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Write manifest to stdout as CSV.",
    )
    parser.add_argument(
        "--head",
        type=int,
        default=5,
        help="Rows to display when no output is specified (default: 5).",
    )
    args = parser.parse_args(argv)

    manifest = get_manifest()

    if args.stdout:
        manifest.to_csv(sys.stdout, index=False)
        return 0

    if args.output:
        # An explicit --format always wins; otherwise follow the extension so
        # "--output manifest.parquet" no longer writes CSV into a .parquet file.
        fmt = args.format or _infer_format(args.output)
        _write_manifest(manifest, args.output, fmt)
        return 0

    print(f"Rows: {len(manifest):,} | Columns: {len(manifest.columns)}")
    print(manifest.head(args.head).to_string(index=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
a0095d8033af99973a0a4531d8b7e6bbcc83290a Mon Sep 17 00:00:00 2001 From: Weishan Li Date: Wed, 27 May 2026 14:35:42 -0600 Subject: [PATCH 3/3] Add script to download JUMP pilot plate data from S3 for training examples --- examples/0.download_example_dataset.ipynb | 2250 +++++++++++++++++ .../nbconverted/0.download_example_dataset.py | 169 ++ 2 files changed, 2419 insertions(+) create mode 100644 examples/0.download_example_dataset.ipynb create mode 100644 examples/nbconverted/0.download_example_dataset.py diff --git a/examples/0.download_example_dataset.ipynb b/examples/0.download_example_dataset.ipynb new file mode 100644 index 0000000..986c997 --- /dev/null +++ b/examples/0.download_example_dataset.ipynb @@ -0,0 +1,2250 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b013fcef", + "metadata": {}, + "source": [ + "# Download JUMP pilot plate data from AWS S3 bucket for example training" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6c8de60f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from urllib.parse import urlparse\n", + "\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest\n", + "from virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels" + ] + }, + { + "cell_type": "markdown", + "id": "c879dacc", + "metadata": {}, + "source": [ + "## Pathing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3baf91c", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DOWNLOAD_DIR = Path(\"/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1\")\n", + "DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)" + ] + }, + { + "cell_type": "markdown", + "id": "63c8d005", + "metadata": {}, + "source": [ + "## S3 download helpers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d789a28", + "metadata": {}, + 
"outputs": [], + "source": [ + "def _parse_s3_url(url):\n", + " parsed = urlparse(url)\n", + " if parsed.scheme != \"s3\":\n", + " raise ValueError(f\"Expected s3:// URL, got: {url}\")\n", + " return parsed.netloc, parsed.path.lstrip(\"/\")\n", + "\n", + "def download_wide_manifest_channels(\n", + " wide_manifest,\n", + " dest_dir,\n", + " channel_columns=None,\n", + " overwrite=False,\n", + "):\n", + " \"\"\"\n", + " Download S3 TIFFs for each channel and write a local file_index.csv with paths.\n", + " \"\"\"\n", + " if channel_columns is None:\n", + " channel_columns = [\"LZ_BF\", \"BF\", \"HZ_BF\", \"DNA\", \"Mito\", \"AGP\", \"ER\", \"RNA\"]\n", + " dest_dir = Path(dest_dir)\n", + " dest_dir.mkdir(parents=True, exist_ok=True)\n", + " try:\n", + " import boto3\n", + " from botocore import UNSIGNED\n", + " from botocore.config import Config\n", + " except ImportError as exc:\n", + " raise ImportError(\n", + " \"boto3 is required for S3 downloads. Install with: pip install boto3\"\n", + " ) from exc\n", + " s3 = boto3.client(\"s3\", config=Config(signature_version=UNSIGNED))\n", + " local_rows = []\n", + " for row_idx, row in wide_manifest.iterrows():\n", + " prefix_parts = []\n", + " for key in [\"Metadata_Plate\", \"Metadata_Well\", \"Metadata_Site\"]:\n", + " if key in wide_manifest.columns:\n", + " prefix_parts.append(str(row[key]))\n", + " prefix = \"_\".join(prefix_parts) if prefix_parts else f\"row_{row_idx}\"\n", + " local_row = {}\n", + " for channel in channel_columns:\n", + " url = row[channel] if channel in wide_manifest.columns else None\n", + " if pd.isna(url):\n", + " local_row[channel] = None\n", + " continue\n", + " bucket, key = _parse_s3_url(url)\n", + " suffix = Path(key).suffix or \".tif\"\n", + " local_path = dest_dir / f\"{prefix}_{channel}{suffix}\"\n", + " if overwrite or not local_path.exists():\n", + " s3.download_file(bucket, key, str(local_path))\n", + " local_row[channel] = str(local_path)\n", + " local_rows.append(local_row)\n", + " 
file_index = pd.DataFrame(local_rows, columns=channel_columns)\n", + " file_index.to_csv(dest_dir / \"file_index.csv\", index=False)\n", + " return file_index\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e5e7baf", + "metadata": {}, + "source": [ + "## Retrieve compound manifest" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3452cbb4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "broad_sample", + "rawType": "object", + "type": "string" + }, + { + "name": "solvent", + "rawType": "object", + "type": "string" + }, + { + "name": "InChIKey", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_iname", + "rawType": "object", + "type": "string" + }, + { + "name": "pubchem_cid", + "rawType": "float64", + "type": "float" + }, + { + "name": "target", + "rawType": "object", + "type": "string" + }, + { + "name": "target_list", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_type", + "rawType": "object", + "type": "string" + }, + { + "name": "control_type", + "rawType": "object", + "type": "unknown" + }, + { + "name": "smiles", + "rawType": "object", + "type": "string" + }, + { + "name": "Batch", + "rawType": "object", + "type": "string" + }, + { + "name": "Plate_Map_Name", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + "name": "Perturbation", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_type", + "rawType": "object", + "type": "string" + }, + { + "name": "Time", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Density", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Antibiotics", + "rawType": "object", + "type": "string" + }, + { + "name": 
"Cell_line", + "rawType": "object", + "type": "string" + }, + { + "name": "Time_delay", + "rawType": "object", + "type": "string" + }, + { + "name": "Times_imaged", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Anomaly", + "rawType": "object", + "type": "string" + }, + { + "name": "Number_of_images", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_ChannelName", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_PlaneID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_PositionZ", + "rawType": "float64", + "type": "float" + }, + { + "name": "Metadata_FileUrl", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Filename", + "rawType": "object", + "type": "string" + } + ], + "ref": "89505c13-a772-4c21-9d8e-9de5bd7c7087", + "rows": [ + [ + "0", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "1", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f01p01-ch5sk1fk1fl1.tiff", + "r01c01f01p01-ch5sk1fk1fl1.tiff" + ], + [ + "1", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + 
"CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "2", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f02p01-ch5sk1fk1fl1.tiff", + "r01c01f02p01-ch5sk1fk1fl1.tiff" + ], + [ + "2", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "3", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f03p01-ch5sk1fk1fl1.tiff", + "r01c01f03p01-ch5sk1fk1fl1.tiff" + ], + [ + "3", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + 
"JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "4", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f04p01-ch5sk1fk1fl1.tiff", + "r01c01f04p01-ch5sk1fk1fl1.tiff" + ], + [ + "4", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "5", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f05p01-ch5sk1fk1fl1.tiff", + "r01c01f05p01-ch5sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 30, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_Wellbroad_samplesolventInChIKeypert_inamepubchem_cidtargettarget_listpert_typecontrol_type...Time_delayTimes_imagedAnomalyNumber_of_imagesMetadata_SiteMetadata_ChannelNameMetadata_PlaneIDMetadata_PositionZMetadata_FileUrlMetadata_Filename
0A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276481DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f01p01-ch5sk1fk1fl1.tiff
1A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276482DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f02p01-ch5sk1fk1fl1.tiff
2A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276483DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f03p01-ch5sk1fk1fl1.tiff
3A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276484DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f04p01-ch5sk1fk1fl1.tiff
4A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276485DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f05p01-ch5sk1fk1fl1.tiff
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_Well broad_sample solvent InChIKey \\\n", + "0 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "1 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "2 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "3 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "4 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "\n", + " pert_iname pubchem_cid target \\\n", + "0 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "1 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "2 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "3 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "4 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "\n", + " target_list pert_type control_type \\\n", + "0 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "1 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "2 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "3 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "4 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "\n", + " ... Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n", + "0 ... Day0 1 WGA 27648 1 \n", + "1 ... Day0 1 WGA 27648 2 \n", + "2 ... Day0 1 WGA 27648 3 \n", + "3 ... Day0 1 WGA 27648 4 \n", + "4 ... Day0 1 WGA 27648 5 \n", + "\n", + " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n", + "0 DNA 1 -0.000002 \n", + "1 DNA 1 -0.000002 \n", + "2 DNA 1 -0.000002 \n", + "3 DNA 1 -0.000002 \n", + "4 DNA 1 -0.000002 \n", + "\n", + " Metadata_FileUrl \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + " Metadata_Filename \n", + "0 r01c01f01p01-ch5sk1fk1fl1.tiff \n", + "1 r01c01f02p01-ch5sk1fk1fl1.tiff \n", + "2 r01c01f03p01-ch5sk1fk1fl1.tiff \n", + "3 r01c01f04p01-ch5sk1fk1fl1.tiff \n", + "4 r01c01f05p01-ch5sk1fk1fl1.tiff \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MANIFEST = get_manifest()\n", + "MANIFEST.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c62e8252", + "metadata": {}, + "source": [ + "## Filter manifest\n", + "For the sake of demoing training here we restricted timepoint to 24, and selected untreated U2-OS cells" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "82d77177", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "broad_sample", + "rawType": "object", + "type": "unknown" + }, + { + "name": "solvent", + "rawType": "object", + "type": "string" + }, + { + "name": "InChIKey", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_iname", + "rawType": "object", + "type": "string" + }, + { + "name": "pubchem_cid", + "rawType": "float64", + "type": "float" + }, + { + "name": "target", + "rawType": "object", + "type": "unknown" + }, + { + "name": "target_list", + "rawType": "object", + "type": "unknown" + }, + { + "name": "pert_type", + "rawType": "object", + "type": "string" + }, + { + "name": "control_type", + "rawType": "object", + "type": "string" + }, + { + "name": "smiles", + "rawType": "object", + "type": "string" + }, + { + "name": "Batch", + "rawType": "object", + "type": "string" + }, + { + "name": "Plate_Map_Name", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + 
"name": "Perturbation", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_type", + "rawType": "object", + "type": "string" + }, + { + "name": "Time", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Density", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Antibiotics", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_line", + "rawType": "object", + "type": "string" + }, + { + "name": "Time_delay", + "rawType": "object", + "type": "string" + }, + { + "name": "Times_imaged", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Anomaly", + "rawType": "object", + "type": "string" + }, + { + "name": "Number_of_images", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_ChannelName", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_PlaneID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_PositionZ", + "rawType": "float64", + "type": "float" + }, + { + "name": "Metadata_FileUrl", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Filename", + "rawType": "object", + "type": "string" + } + ], + "ref": "e21369f3-038a-44f7-bea4-0965696eaf8a", + "rows": [ + [ + "2240", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116995", + "compound", + "U2OS", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "1", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff", + "r01c02f01p01-ch5sk1fk1fl1.tiff" + ], + [ + "2241", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", 
+ "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116995", + "compound", + "U2OS", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "2", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff", + "r01c02f02p01-ch5sk1fk1fl1.tiff" + ], + [ + "2242", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116995", + "compound", + "U2OS", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "3", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff", + "r01c02f03p01-ch5sk1fk1fl1.tiff" + ], + [ + "2243", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116995", + "compound", + "U2OS", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "4", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff", + "r01c02f04p01-ch5sk1fk1fl1.tiff" + ], + [ + "2244", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116995", + "compound", + "U2OS", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "5", + "DNA", + "1", + "-2e-06", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff", + "r01c02f05p01-ch5sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 30, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_Wellbroad_samplesolventInChIKeypert_inamepubchem_cidtargettarget_listpert_typecontrol_type...Time_delayTimes_imagedAnomalyNumber_of_imagesMetadata_SiteMetadata_ChannelNameMetadata_PlaneIDMetadata_PositionZMetadata_FileUrlMetadata_Filename
2240A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01WGA276481DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f01p01-ch5sk1fk1fl1.tiff
2241A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01WGA276482DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f02p01-ch5sk1fk1fl1.tiff
2242A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01WGA276483DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f03p01-ch5sk1fk1fl1.tiff
2243A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01WGA276484DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f04p01-ch5sk1fk1fl1.tiff
2244A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01WGA276485DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f05p01-ch5sk1fk1fl1.tiff
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_Well broad_sample solvent InChIKey \\\n", + "2240 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2241 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2242 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2243 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2244 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "\n", + " pert_iname pubchem_cid target target_list pert_type control_type ... \\\n", + "2240 DMSO 679.0 NaN NaN control negcon ... \n", + "2241 DMSO 679.0 NaN NaN control negcon ... \n", + "2242 DMSO 679.0 NaN NaN control negcon ... \n", + "2243 DMSO 679.0 NaN NaN control negcon ... \n", + "2244 DMSO 679.0 NaN NaN control negcon ... \n", + "\n", + " Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n", + "2240 Day0 1 WGA 27648 1 \n", + "2241 Day0 1 WGA 27648 2 \n", + "2242 Day0 1 WGA 27648 3 \n", + "2243 Day0 1 WGA 27648 4 \n", + "2244 Day0 1 WGA 27648 5 \n", + "\n", + " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n", + "2240 DNA 1 -0.000002 \n", + "2241 DNA 1 -0.000002 \n", + "2242 DNA 1 -0.000002 \n", + "2243 DNA 1 -0.000002 \n", + "2244 DNA 1 -0.000002 \n", + "\n", + " Metadata_FileUrl \\\n", + "2240 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2241 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2242 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2243 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2244 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + " Metadata_Filename \n", + "2240 r01c02f01p01-ch5sk1fk1fl1.tiff \n", + "2241 r01c02f02p01-ch5sk1fk1fl1.tiff \n", + "2242 r01c02f03p01-ch5sk1fk1fl1.tiff \n", + "2243 r01c02f04p01-ch5sk1fk1fl1.tiff \n", + "2244 r01c02f05p01-ch5sk1fk1fl1.tiff \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "negcon_u2os_24_manifest = MANIFEST[\n", + " (MANIFEST[\"Batch\"] == \"2020_11_04_CPJUMP1\") &\n", + " (MANIFEST[\"control_type\"] == \"negcon\") &\n", + " (MANIFEST[\"Cell_type\"] == \"U2OS\") &\n", + " (MANIFEST[\"Time\"] == 24)\n", + "]\n", + "negcon_u2os_24_manifest.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6b4431f0", + "metadata": {}, + "source": [ + "## Arrange as wide to be in the anticipated format for virtual stain flow datasets and also the format the download helper expects" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "952e2717", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "LZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "BF", + "rawType": "object", + "type": "string" + }, + { + "name": "HZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "DNA", + "rawType": "object", + "type": "string" + }, + { + "name": "Mito", + "rawType": "object", + "type": "string" + }, + { + "name": "AGP", + "rawType": "object", + "type": "string" + }, + { + "name": "ER", + "rawType": "object", + "type": "string" + }, + { + "name": "RNA", + "rawType": "object", + "type": "string" + } + ], + "ref": 
"512f84e8-44b9-410e-80d4-2d0926f51491", + "rows": [ + [ + "0", + "BR00116995", + "A02", + "1", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch3sk1fk1fl1.tiff" + ], + [ + "1", + "BR00116995", + "A02", + "2", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch8sk1fk1fl1.tiff", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch3sk1fk1fl1.tiff" + ], + [ + "2", + "BR00116995", + "A02", + "3", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch1sk1fk1fl1.tiff", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch3sk1fk1fl1.tiff" + ], + [ + "3", + "BR00116995", + "A02", + "4", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch3sk1fk1fl1.tiff" + ], + [ + "4", + 
"BR00116995", + "A02", + "5", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch3sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 11, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_ChannelNameMetadata_PlateMetadata_WellMetadata_SiteLZ_BFBFHZ_BFDNAMitoAGPERRNA
0BR00116995A021s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
1BR00116995A022s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
2BR00116995A023s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
3BR00116995A024s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
4BR00116995A025s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
\n", + "
" + ], + "text/plain": [ + "Metadata_ChannelName Metadata_Plate Metadata_Well Metadata_Site \\\n", + "0 BR00116995 A02 1 \n", + "1 BR00116995 A02 2 \n", + "2 BR00116995 A02 3 \n", + "3 BR00116995 A02 4 \n", + "4 BR00116995 A02 5 \n", + "\n", + "Metadata_ChannelName LZ_BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName HZ_BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName DNA \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName Mito \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + "Metadata_ChannelName AGP \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName ER \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName RNA \n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wide_manifest = arrange_manifest_channels(negcon_u2os_24_manifest)\n", + "wide_manifest.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d62c00e5", + "metadata": {}, + "source": [ + "## Data split" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "848fd839", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train plates: 3, Test plates: 1\n", + "Train samples: 1728, Test samples: 576\n" + ] + } + ], + "source": [ + "# Get unique plates\n", + "unique_plates = wide_manifest['Metadata_Plate'].unique()\n", + "\n", + "# Split plates into train (75%) and test (25%) with seed\n", + "train_plates, test_plates = train_test_split(\n", + " unique_plates, \n", + " test_size=0.25, \n", + " random_state=42\n", + ")\n", + "\n", + "# Create train and test manifests based on plate split\n", + "train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)]\n", + "test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)]\n", + "\n", + "print(f\"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}\")\n", + "print(f\"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "70d2ff65", + "metadata": {}, + "source": [ + "## Write the final split download manifests with metadata and download all needed data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bc058e2c", + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest_wide.to_csv(DATA_DOWNLOAD_DIR/ \"train_manifest.csv\", index=False)\n", + "test_manifest_wide.to_csv(DATA_DOWNLOAD_DIR / \"test_manifest.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bec96da0", + "metadata": {}, + "outputs": [ + { + "data": { + 
"application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "LZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "BF", + "rawType": "object", + "type": "string" + }, + { + "name": "HZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "DNA", + "rawType": "object", + "type": "string" + }, + { + "name": "Mito", + "rawType": "object", + "type": "string" + }, + { + "name": "AGP", + "rawType": "object", + "type": "string" + }, + { + "name": "ER", + "rawType": "object", + "type": "string" + }, + { + "name": "RNA", + "rawType": "object", + "type": "string" + } + ], + "ref": "227a6f35-3cca-4c7f-99f5-93c70f489a1b", + "rows": [ + [ + "0", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_RNA.tiff" + ], + [ + "1", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_AGP.tiff", + 
"/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_RNA.tiff" + ], + [ + "2", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_RNA.tiff" + ], + [ + "3", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_RNA.tiff" + ], + [ + "4", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_Mito.tiff", + 
"/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_RNA.tiff" + ] + ], + "shape": { + "columns": 8, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LZ_BFBFHZ_BFDNAMitoAGPERRNA
0/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
1/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
2/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
3/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
4/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
\n", + "
" + ], + "text/plain": [ + " LZ_BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " HZ_BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " DNA \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " Mito \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " AGP \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... 
\n", + "\n", + " ER \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " RNA \n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_download_summary = download_wide_manifest_channels(\n", + " train_manifest_wide,\n", + " dest_dir = DATA_DOWNLOAD_DIR / \"cpjump1_u2os_train\" \n", + ")\n", + "train_download_summary.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "70ab975c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "LZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "BF", + "rawType": "object", + "type": "string" + }, + { + "name": "HZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "DNA", + "rawType": "object", + "type": "string" + }, + { + "name": "Mito", + "rawType": "object", + "type": "string" + }, + { + "name": "AGP", + "rawType": "object", + "type": "string" + }, + { + "name": "ER", + "rawType": "object", + "type": "string" + }, + { + "name": "RNA", + "rawType": "object", + "type": "string" + } + ], + "ref": "779ce221-d8ff-4070-af9e-a65d1780cdfd", + "rows": [ + [ + "0", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_LZ_BF.tiff", + 
"/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_RNA.tiff" + ], + [ + "1", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_RNA.tiff" + ], + [ + "2", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_RNA.tiff" + ], + [ + "3", + 
"/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_RNA.tiff" + ], + [ + "4", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_LZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_HZ_BF.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_DNA.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_Mito.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_AGP.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_ER.tiff", + "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_RNA.tiff" + ] + ], + "shape": { + "columns": 8, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LZ_BFBFHZ_BFDNAMitoAGPERRNA
0/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
1/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
2/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
3/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
4/home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os.../home/weishanli/data_fast/cpjump1/cpjump1_u2os...
\n", + "
" + ], + "text/plain": [ + " LZ_BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " HZ_BF \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " DNA \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " Mito \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " AGP \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... 
\n", + "\n", + " ER \\\n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "\n", + " RNA \n", + "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n", + "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_download_summary =download_wide_manifest_channels(\n", + " test_manifest_wide,\n", + " dest_dir = DATA_DOWNLOAD_DIR / \"cpjump1_u2os_test\" \n", + ")\n", + "test_download_summary.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "virtual_stain_flow", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/nbconverted/0.download_example_dataset.py b/examples/nbconverted/0.download_example_dataset.py new file mode 100644 index 0000000..ea4e561 --- /dev/null +++ b/examples/nbconverted/0.download_example_dataset.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Download JUMP pilot plate data from AWS S3 bucket for example training + +# In[1]: + + +from pathlib import Path +from urllib.parse import urlparse + +import pandas as pd +from sklearn.model_selection import train_test_split + +from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest +from 
virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels + + +# ## Pathing + +# In[ ]: + + +DATA_DOWNLOAD_DIR = Path("/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1") +DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True) + + +# ## S3 download helpers + +# In[3]: + + +def _parse_s3_url(url): + parsed = urlparse(url) + if parsed.scheme != "s3": + raise ValueError(f"Expected s3:// URL, got: {url}") + return parsed.netloc, parsed.path.lstrip("/") + +def download_wide_manifest_channels( + wide_manifest, + dest_dir, + channel_columns=None, + overwrite=False, +): + """ + Download S3 TIFFs for each channel and write a local file_index.csv with paths. + """ + if channel_columns is None: + channel_columns = ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"] + dest_dir = Path(dest_dir) + dest_dir.mkdir(parents=True, exist_ok=True) + try: + import boto3 + from botocore import UNSIGNED + from botocore.config import Config + except ImportError as exc: + raise ImportError( + "boto3 is required for S3 downloads. 
Install with: pip install boto3" + ) from exc + s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) + local_rows = [] + for row_idx, row in wide_manifest.iterrows(): + prefix_parts = [] + for key in ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]: + if key in wide_manifest.columns: + prefix_parts.append(str(row[key])) + prefix = "_".join(prefix_parts) if prefix_parts else f"row_{row_idx}" + local_row = {} + for channel in channel_columns: + url = row[channel] if channel in wide_manifest.columns else None + if pd.isna(url): + local_row[channel] = None + continue + bucket, key = _parse_s3_url(url) + suffix = Path(key).suffix or ".tif" + local_path = dest_dir / f"{prefix}_{channel}{suffix}" + if overwrite or not local_path.exists(): + s3.download_file(bucket, key, str(local_path)) + local_row[channel] = str(local_path) + local_rows.append(local_row) + file_index = pd.DataFrame(local_rows, columns=channel_columns) + file_index.to_csv(dest_dir / "file_index.csv", index=False) + return file_index + + +# ## Retrieve compound manifest + +# In[4]: + + +MANIFEST = get_manifest() +MANIFEST.head() + + +# ## Filter manifest +# For the sake of demoing training here we restricted timepoint to 24, and selected untreated U2-OS cells + +# In[5]: + + +negcon_u2os_24_manifest = MANIFEST[ + (MANIFEST["Batch"] == "2020_11_04_CPJUMP1") & + (MANIFEST["control_type"] == "negcon") & + (MANIFEST["Cell_type"] == "U2OS") & + (MANIFEST["Time"] == 24) +] +negcon_u2os_24_manifest.head() + + +# ## Arrange as wide to be in anticipated format for virtual stain flow datasets and also the format the download helper expects + +# In[6]: + + +wide_manifest = arrange_manifest_channels(negcon_u2os_24_manifest) +wide_manifest.head() + + +# ## Data split + +# In[7]: + + +# Get unique plates +unique_plates = wide_manifest['Metadata_Plate'].unique() + +# Split plates into train (75%) and test (25%) with seed +train_plates, test_plates = train_test_split( + unique_plates, + test_size=0.25, +
random_state=42 +) + +# Create train and test manifests based on plate split +train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)] +test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)] + +print(f"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}") +print(f"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}") + + +# ## Write final split download manifest with metadata and download all needed data + +# In[8]: + + +train_manifest_wide.to_csv(DATA_DOWNLOAD_DIR/ "train_manifest.csv", index=False) +test_manifest_wide.to_csv(DATA_DOWNLOAD_DIR / "test_manifest.csv", index=False) + + +# In[9]: + + +train_download_summary = download_wide_manifest_channels( + train_manifest_wide, + dest_dir = DATA_DOWNLOAD_DIR / "cpjump1_u2os_train" +) +train_download_summary.head() + + +# In[10]: + + +test_download_summary =download_wide_manifest_channels( + test_manifest_wide, + dest_dir = DATA_DOWNLOAD_DIR / "cpjump1_u2os_test" +) +test_download_summary.head() +