diff --git a/examples/0.download_example_dataset.ipynb b/examples/0.download_example_dataset.ipynb
new file mode 100644
index 0000000..986c997
--- /dev/null
+++ b/examples/0.download_example_dataset.ipynb
@@ -0,0 +1,2250 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b013fcef",
+ "metadata": {},
+ "source": [
+ "# Download JUMP pilot plate data from AWS S3 bucket for example training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6c8de60f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "from urllib.parse import urlparse\n",
+ "\n",
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest\n",
+ "from virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c879dacc",
+ "metadata": {},
+ "source": [
+ "## Pathing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3baf91c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_DOWNLOAD_DIR = Path(\"/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1\")\n",
+ "DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "63c8d005",
+ "metadata": {},
+ "source": [
+ "## S3 download helpers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "6d789a28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _parse_s3_url(url):\n",
+ " parsed = urlparse(url)\n",
+ " if parsed.scheme != \"s3\":\n",
+ " raise ValueError(f\"Expected s3:// URL, got: {url}\")\n",
+ " return parsed.netloc, parsed.path.lstrip(\"/\")\n",
+ "\n",
+ "def download_wide_manifest_channels(\n",
+ " wide_manifest,\n",
+ " dest_dir,\n",
+ " channel_columns=None,\n",
+ " overwrite=False,\n",
+ "):\n",
+ " \"\"\"\n",
+ " Download S3 TIFFs for each channel and write a local file_index.csv with paths.\n",
+ " \"\"\"\n",
+ " if channel_columns is None:\n",
+ " channel_columns = [\"LZ_BF\", \"BF\", \"HZ_BF\", \"DNA\", \"Mito\", \"AGP\", \"ER\", \"RNA\"]\n",
+ " dest_dir = Path(dest_dir)\n",
+ " dest_dir.mkdir(parents=True, exist_ok=True)\n",
+ " try:\n",
+ " import boto3\n",
+ " from botocore import UNSIGNED\n",
+ " from botocore.config import Config\n",
+ " except ImportError as exc:\n",
+ " raise ImportError(\n",
+ " \"boto3 is required for S3 downloads. Install with: pip install boto3\"\n",
+ " ) from exc\n",
+ " s3 = boto3.client(\"s3\", config=Config(signature_version=UNSIGNED))\n",
+ " local_rows = []\n",
+ " for row_idx, row in wide_manifest.iterrows():\n",
+ " prefix_parts = []\n",
+ " for key in [\"Metadata_Plate\", \"Metadata_Well\", \"Metadata_Site\"]:\n",
+ " if key in wide_manifest.columns:\n",
+ " prefix_parts.append(str(row[key]))\n",
+ " prefix = \"_\".join(prefix_parts) if prefix_parts else f\"row_{row_idx}\"\n",
+ " local_row = {}\n",
+ " for channel in channel_columns:\n",
+ " url = row[channel] if channel in wide_manifest.columns else None\n",
+ " if pd.isna(url):\n",
+ " local_row[channel] = None\n",
+ " continue\n",
+ " bucket, key = _parse_s3_url(url)\n",
+ " suffix = Path(key).suffix or \".tif\"\n",
+ " local_path = dest_dir / f\"{prefix}_{channel}{suffix}\"\n",
+ " if overwrite or not local_path.exists():\n",
+ " s3.download_file(bucket, key, str(local_path))\n",
+ " local_row[channel] = str(local_path)\n",
+ " local_rows.append(local_row)\n",
+ " file_index = pd.DataFrame(local_rows, columns=channel_columns)\n",
+ " file_index.to_csv(dest_dir / \"file_index.csv\", index=False)\n",
+ " return file_index\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e5e7baf",
+ "metadata": {},
+ "source": [
+ "## Retrieve compound manifest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "3452cbb4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_Well",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "broad_sample",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "solvent",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "InChIKey",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "pert_iname",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "pubchem_cid",
+ "rawType": "float64",
+ "type": "float"
+ },
+ {
+ "name": "target",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "target_list",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "pert_type",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "control_type",
+ "rawType": "object",
+ "type": "unknown"
+ },
+ {
+ "name": "smiles",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Batch",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Plate_Map_Name",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Plate",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Perturbation",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Cell_type",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Time",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Density",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Antibiotics",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Cell_line",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Time_delay",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Times_imaged",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Anomaly",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Number_of_images",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_Site",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_ChannelName",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_PlaneID",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_PositionZ",
+ "rawType": "float64",
+ "type": "float"
+ },
+ {
+ "name": "Metadata_FileUrl",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Filename",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "ref": "89505c13-a772-4c21-9d8e-9de5bd7c7087",
+ "rows": [
+ [
+ "0",
+ "A01",
+ "BRD-A86665761-001-01-1",
+ "DMSO",
+ "TZDUHAJSIBHXDL-UHFFFAOYSA-N",
+ "gabapentin-enacarbil",
+ "9883933.0",
+ "CACNB4",
+ "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8",
+ "trt",
+ null,
+ "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116991",
+ "compound",
+ "A549",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "1",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f01p01-ch5sk1fk1fl1.tiff",
+ "r01c01f01p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "1",
+ "A01",
+ "BRD-A86665761-001-01-1",
+ "DMSO",
+ "TZDUHAJSIBHXDL-UHFFFAOYSA-N",
+ "gabapentin-enacarbil",
+ "9883933.0",
+ "CACNB4",
+ "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8",
+ "trt",
+ null,
+ "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116991",
+ "compound",
+ "A549",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "2",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f02p01-ch5sk1fk1fl1.tiff",
+ "r01c01f02p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "2",
+ "A01",
+ "BRD-A86665761-001-01-1",
+ "DMSO",
+ "TZDUHAJSIBHXDL-UHFFFAOYSA-N",
+ "gabapentin-enacarbil",
+ "9883933.0",
+ "CACNB4",
+ "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8",
+ "trt",
+ null,
+ "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116991",
+ "compound",
+ "A549",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "3",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f03p01-ch5sk1fk1fl1.tiff",
+ "r01c01f03p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "3",
+ "A01",
+ "BRD-A86665761-001-01-1",
+ "DMSO",
+ "TZDUHAJSIBHXDL-UHFFFAOYSA-N",
+ "gabapentin-enacarbil",
+ "9883933.0",
+ "CACNB4",
+ "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8",
+ "trt",
+ null,
+ "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116991",
+ "compound",
+ "A549",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "4",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f04p01-ch5sk1fk1fl1.tiff",
+ "r01c01f04p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "4",
+ "A01",
+ "BRD-A86665761-001-01-1",
+ "DMSO",
+ "TZDUHAJSIBHXDL-UHFFFAOYSA-N",
+ "gabapentin-enacarbil",
+ "9883933.0",
+ "CACNB4",
+ "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8",
+ "trt",
+ null,
+ "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116991",
+ "compound",
+ "A549",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "5",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f05p01-ch5sk1fk1fl1.tiff",
+ "r01c01f05p01-ch5sk1fk1fl1.tiff"
+ ]
+ ],
+ "shape": {
+ "columns": 30,
+ "rows": 5
+ }
+ },
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metadata_Well | \n",
+ " broad_sample | \n",
+ " solvent | \n",
+ " InChIKey | \n",
+ " pert_iname | \n",
+ " pubchem_cid | \n",
+ " target | \n",
+ " target_list | \n",
+ " pert_type | \n",
+ " control_type | \n",
+ " ... | \n",
+ " Time_delay | \n",
+ " Times_imaged | \n",
+ " Anomaly | \n",
+ " Number_of_images | \n",
+ " Metadata_Site | \n",
+ " Metadata_ChannelName | \n",
+ " Metadata_PlaneID | \n",
+ " Metadata_PositionZ | \n",
+ " Metadata_FileUrl | \n",
+ " Metadata_Filename | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " A01 | \n",
+ " BRD-A86665761-001-01-1 | \n",
+ " DMSO | \n",
+ " TZDUHAJSIBHXDL-UHFFFAOYSA-N | \n",
+ " gabapentin-enacarbil | \n",
+ " 9883933.0 | \n",
+ " CACNB4 | \n",
+ " CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... | \n",
+ " trt | \n",
+ " NaN | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 1 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c01f01p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " A01 | \n",
+ " BRD-A86665761-001-01-1 | \n",
+ " DMSO | \n",
+ " TZDUHAJSIBHXDL-UHFFFAOYSA-N | \n",
+ " gabapentin-enacarbil | \n",
+ " 9883933.0 | \n",
+ " CACNB4 | \n",
+ " CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... | \n",
+ " trt | \n",
+ " NaN | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 2 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c01f02p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " A01 | \n",
+ " BRD-A86665761-001-01-1 | \n",
+ " DMSO | \n",
+ " TZDUHAJSIBHXDL-UHFFFAOYSA-N | \n",
+ " gabapentin-enacarbil | \n",
+ " 9883933.0 | \n",
+ " CACNB4 | \n",
+ " CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... | \n",
+ " trt | \n",
+ " NaN | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 3 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c01f03p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " A01 | \n",
+ " BRD-A86665761-001-01-1 | \n",
+ " DMSO | \n",
+ " TZDUHAJSIBHXDL-UHFFFAOYSA-N | \n",
+ " gabapentin-enacarbil | \n",
+ " 9883933.0 | \n",
+ " CACNB4 | \n",
+ " CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... | \n",
+ " trt | \n",
+ " NaN | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 4 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c01f04p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " A01 | \n",
+ " BRD-A86665761-001-01-1 | \n",
+ " DMSO | \n",
+ " TZDUHAJSIBHXDL-UHFFFAOYSA-N | \n",
+ " gabapentin-enacarbil | \n",
+ " 9883933.0 | \n",
+ " CACNB4 | \n",
+ " CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... | \n",
+ " trt | \n",
+ " NaN | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 5 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c01f05p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 30 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metadata_Well broad_sample solvent InChIKey \\\n",
+ "0 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n",
+ "1 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n",
+ "2 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n",
+ "3 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n",
+ "4 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n",
+ "\n",
+ " pert_iname pubchem_cid target \\\n",
+ "0 gabapentin-enacarbil 9883933.0 CACNB4 \n",
+ "1 gabapentin-enacarbil 9883933.0 CACNB4 \n",
+ "2 gabapentin-enacarbil 9883933.0 CACNB4 \n",
+ "3 gabapentin-enacarbil 9883933.0 CACNB4 \n",
+ "4 gabapentin-enacarbil 9883933.0 CACNB4 \n",
+ "\n",
+ " target_list pert_type control_type \\\n",
+ "0 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n",
+ "1 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n",
+ "2 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n",
+ "3 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n",
+ "4 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n",
+ "\n",
+ " ... Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n",
+ "0 ... Day0 1 WGA 27648 1 \n",
+ "1 ... Day0 1 WGA 27648 2 \n",
+ "2 ... Day0 1 WGA 27648 3 \n",
+ "3 ... Day0 1 WGA 27648 4 \n",
+ "4 ... Day0 1 WGA 27648 5 \n",
+ "\n",
+ " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n",
+ "0 DNA 1 -0.000002 \n",
+ "1 DNA 1 -0.000002 \n",
+ "2 DNA 1 -0.000002 \n",
+ "3 DNA 1 -0.000002 \n",
+ "4 DNA 1 -0.000002 \n",
+ "\n",
+ " Metadata_FileUrl \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ " Metadata_Filename \n",
+ "0 r01c01f01p01-ch5sk1fk1fl1.tiff \n",
+ "1 r01c01f02p01-ch5sk1fk1fl1.tiff \n",
+ "2 r01c01f03p01-ch5sk1fk1fl1.tiff \n",
+ "3 r01c01f04p01-ch5sk1fk1fl1.tiff \n",
+ "4 r01c01f05p01-ch5sk1fk1fl1.tiff \n",
+ "\n",
+ "[5 rows x 30 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "MANIFEST = get_manifest()\n",
+ "MANIFEST.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c62e8252",
+ "metadata": {},
+ "source": [
+ "## Filter manifest\n",
+ "For the sake of demonstrating training, we restrict the manifest here to the `2020_11_04_CPJUMP1` batch, timepoint 24, and untreated (negative control, `negcon`) U2OS cells."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "82d77177",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_Well",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "broad_sample",
+ "rawType": "object",
+ "type": "unknown"
+ },
+ {
+ "name": "solvent",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "InChIKey",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "pert_iname",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "pubchem_cid",
+ "rawType": "float64",
+ "type": "float"
+ },
+ {
+ "name": "target",
+ "rawType": "object",
+ "type": "unknown"
+ },
+ {
+ "name": "target_list",
+ "rawType": "object",
+ "type": "unknown"
+ },
+ {
+ "name": "pert_type",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "control_type",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "smiles",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Batch",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Plate_Map_Name",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Plate",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Perturbation",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Cell_type",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Time",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Density",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Antibiotics",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Cell_line",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Time_delay",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Times_imaged",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Anomaly",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Number_of_images",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_Site",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_ChannelName",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_PlaneID",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_PositionZ",
+ "rawType": "float64",
+ "type": "float"
+ },
+ {
+ "name": "Metadata_FileUrl",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Filename",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "ref": "e21369f3-038a-44f7-bea4-0965696eaf8a",
+ "rows": [
+ [
+ "2240",
+ "A02",
+ null,
+ "DMSO",
+ "IAZDPXIOMUYVGZ-UHFFFAOYSA-N",
+ "DMSO",
+ "679.0",
+ null,
+ null,
+ "control",
+ "negcon",
+ "CS(=O)C",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116995",
+ "compound",
+ "U2OS",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "1",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff",
+ "r01c02f01p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "2241",
+ "A02",
+ null,
+ "DMSO",
+ "IAZDPXIOMUYVGZ-UHFFFAOYSA-N",
+ "DMSO",
+ "679.0",
+ null,
+ null,
+ "control",
+ "negcon",
+ "CS(=O)C",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116995",
+ "compound",
+ "U2OS",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "2",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff",
+ "r01c02f02p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "2242",
+ "A02",
+ null,
+ "DMSO",
+ "IAZDPXIOMUYVGZ-UHFFFAOYSA-N",
+ "DMSO",
+ "679.0",
+ null,
+ null,
+ "control",
+ "negcon",
+ "CS(=O)C",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116995",
+ "compound",
+ "U2OS",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "3",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff",
+ "r01c02f03p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "2243",
+ "A02",
+ null,
+ "DMSO",
+ "IAZDPXIOMUYVGZ-UHFFFAOYSA-N",
+ "DMSO",
+ "679.0",
+ null,
+ null,
+ "control",
+ "negcon",
+ "CS(=O)C",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116995",
+ "compound",
+ "U2OS",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "4",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff",
+ "r01c02f04p01-ch5sk1fk1fl1.tiff"
+ ],
+ [
+ "2244",
+ "A02",
+ null,
+ "DMSO",
+ "IAZDPXIOMUYVGZ-UHFFFAOYSA-N",
+ "DMSO",
+ "679.0",
+ null,
+ null,
+ "control",
+ "negcon",
+ "CS(=O)C",
+ "2020_11_04_CPJUMP1",
+ "JUMP-Target-1_compound_platemap",
+ "BR00116995",
+ "compound",
+ "U2OS",
+ "24",
+ "100",
+ "absent",
+ "Parental",
+ "Day0",
+ "1",
+ "WGA",
+ "27648",
+ "5",
+ "DNA",
+ "1",
+ "-2e-06",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff",
+ "r01c02f05p01-ch5sk1fk1fl1.tiff"
+ ]
+ ],
+ "shape": {
+ "columns": 30,
+ "rows": 5
+ }
+ },
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metadata_Well | \n",
+ " broad_sample | \n",
+ " solvent | \n",
+ " InChIKey | \n",
+ " pert_iname | \n",
+ " pubchem_cid | \n",
+ " target | \n",
+ " target_list | \n",
+ " pert_type | \n",
+ " control_type | \n",
+ " ... | \n",
+ " Time_delay | \n",
+ " Times_imaged | \n",
+ " Anomaly | \n",
+ " Number_of_images | \n",
+ " Metadata_Site | \n",
+ " Metadata_ChannelName | \n",
+ " Metadata_PlaneID | \n",
+ " Metadata_PositionZ | \n",
+ " Metadata_FileUrl | \n",
+ " Metadata_Filename | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2240 | \n",
+ " A02 | \n",
+ " NaN | \n",
+ " DMSO | \n",
+ " IAZDPXIOMUYVGZ-UHFFFAOYSA-N | \n",
+ " DMSO | \n",
+ " 679.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " control | \n",
+ " negcon | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 1 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c02f01p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 2241 | \n",
+ " A02 | \n",
+ " NaN | \n",
+ " DMSO | \n",
+ " IAZDPXIOMUYVGZ-UHFFFAOYSA-N | \n",
+ " DMSO | \n",
+ " 679.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " control | \n",
+ " negcon | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 2 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c02f02p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 2242 | \n",
+ " A02 | \n",
+ " NaN | \n",
+ " DMSO | \n",
+ " IAZDPXIOMUYVGZ-UHFFFAOYSA-N | \n",
+ " DMSO | \n",
+ " 679.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " control | \n",
+ " negcon | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 3 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c02f03p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 2243 | \n",
+ " A02 | \n",
+ " NaN | \n",
+ " DMSO | \n",
+ " IAZDPXIOMUYVGZ-UHFFFAOYSA-N | \n",
+ " DMSO | \n",
+ " 679.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " control | \n",
+ " negcon | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 4 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c02f04p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ " | 2244 | \n",
+ " A02 | \n",
+ " NaN | \n",
+ " DMSO | \n",
+ " IAZDPXIOMUYVGZ-UHFFFAOYSA-N | \n",
+ " DMSO | \n",
+ " 679.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " control | \n",
+ " negcon | \n",
+ " ... | \n",
+ " Day0 | \n",
+ " 1 | \n",
+ " WGA | \n",
+ " 27648 | \n",
+ " 5 | \n",
+ " DNA | \n",
+ " 1 | \n",
+ " -0.000002 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " r01c02f05p01-ch5sk1fk1fl1.tiff | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 30 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metadata_Well broad_sample solvent InChIKey \\\n",
+ "2240 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n",
+ "2241 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n",
+ "2242 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n",
+ "2243 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n",
+ "2244 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n",
+ "\n",
+ " pert_iname pubchem_cid target target_list pert_type control_type ... \\\n",
+ "2240 DMSO 679.0 NaN NaN control negcon ... \n",
+ "2241 DMSO 679.0 NaN NaN control negcon ... \n",
+ "2242 DMSO 679.0 NaN NaN control negcon ... \n",
+ "2243 DMSO 679.0 NaN NaN control negcon ... \n",
+ "2244 DMSO 679.0 NaN NaN control negcon ... \n",
+ "\n",
+ " Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n",
+ "2240 Day0 1 WGA 27648 1 \n",
+ "2241 Day0 1 WGA 27648 2 \n",
+ "2242 Day0 1 WGA 27648 3 \n",
+ "2243 Day0 1 WGA 27648 4 \n",
+ "2244 Day0 1 WGA 27648 5 \n",
+ "\n",
+ " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n",
+ "2240 DNA 1 -0.000002 \n",
+ "2241 DNA 1 -0.000002 \n",
+ "2242 DNA 1 -0.000002 \n",
+ "2243 DNA 1 -0.000002 \n",
+ "2244 DNA 1 -0.000002 \n",
+ "\n",
+ " Metadata_FileUrl \\\n",
+ "2240 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2241 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2242 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2243 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2244 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ " Metadata_Filename \n",
+ "2240 r01c02f01p01-ch5sk1fk1fl1.tiff \n",
+ "2241 r01c02f02p01-ch5sk1fk1fl1.tiff \n",
+ "2242 r01c02f03p01-ch5sk1fk1fl1.tiff \n",
+ "2243 r01c02f04p01-ch5sk1fk1fl1.tiff \n",
+ "2244 r01c02f05p01-ch5sk1fk1fl1.tiff \n",
+ "\n",
+ "[5 rows x 30 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "negcon_u2os_24_manifest = MANIFEST[\n",
+ " (MANIFEST[\"Batch\"] == \"2020_11_04_CPJUMP1\") &\n",
+ " (MANIFEST[\"control_type\"] == \"negcon\") &\n",
+ " (MANIFEST[\"Cell_type\"] == \"U2OS\") &\n",
+ " (MANIFEST[\"Time\"] == 24)\n",
+ "]\n",
+ "negcon_u2os_24_manifest.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b4431f0",
+ "metadata": {},
+ "source": [
+ "## Arrange as wide to match the format anticipated by virtual stain flow datasets, which is also the format the download helper expects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "952e2717",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "Metadata_Plate",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Well",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Metadata_Site",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "LZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "HZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "DNA",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Mito",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "AGP",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "ER",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "RNA",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "ref": "512f84e8-44b9-410e-80d4-2d0926f51491",
+ "rows": [
+ [
+ "0",
+ "BR00116995",
+ "A02",
+ "1",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch7sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch8sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch6sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch1sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch2sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch4sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f01p01-ch3sk1fk1fl1.tiff"
+ ],
+ [
+ "1",
+ "BR00116995",
+ "A02",
+ "2",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch7sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch8sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch6sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch1sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch2sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch4sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f02p01-ch3sk1fk1fl1.tiff"
+ ],
+ [
+ "2",
+ "BR00116995",
+ "A02",
+ "3",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch7sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch8sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch6sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch1sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch2sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch4sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f03p01-ch3sk1fk1fl1.tiff"
+ ],
+ [
+ "3",
+ "BR00116995",
+ "A02",
+ "4",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch7sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch8sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch6sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch1sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch2sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch4sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f04p01-ch3sk1fk1fl1.tiff"
+ ],
+ [
+ "4",
+ "BR00116995",
+ "A02",
+ "5",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch7sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch8sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch6sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch1sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch2sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch4sk1fk1fl1.tiff",
+ "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116995__2020-11-06T02_41_05-Measurement1/Images/r01c02f05p01-ch3sk1fk1fl1.tiff"
+ ]
+ ],
+ "shape": {
+ "columns": 11,
+ "rows": 5
+ }
+ },
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | Metadata_ChannelName | \n",
+ " Metadata_Plate | \n",
+ " Metadata_Well | \n",
+ " Metadata_Site | \n",
+ " LZ_BF | \n",
+ " BF | \n",
+ " HZ_BF | \n",
+ " DNA | \n",
+ " Mito | \n",
+ " AGP | \n",
+ " ER | \n",
+ " RNA | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " BR00116995 | \n",
+ " A02 | \n",
+ " 1 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " BR00116995 | \n",
+ " A02 | \n",
+ " 2 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " BR00116995 | \n",
+ " A02 | \n",
+ " 3 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " BR00116995 | \n",
+ " A02 | \n",
+ " 4 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " BR00116995 | \n",
+ " A02 | \n",
+ " 5 | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ " s3://cellpainting-gallery/cpg0000-jump-pilot/s... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Metadata_ChannelName Metadata_Plate Metadata_Well Metadata_Site \\\n",
+ "0 BR00116995 A02 1 \n",
+ "1 BR00116995 A02 2 \n",
+ "2 BR00116995 A02 3 \n",
+ "3 BR00116995 A02 4 \n",
+ "4 BR00116995 A02 5 \n",
+ "\n",
+ "Metadata_ChannelName LZ_BF \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName BF \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName HZ_BF \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName DNA \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName Mito \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName AGP \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName ER \\\n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "\n",
+ "Metadata_ChannelName RNA \n",
+ "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n",
+ "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wide_manifest = arrange_manifest_channels(negcon_u2os_24_manifest)\n",
+ "wide_manifest.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d62c00e5",
+ "metadata": {},
+ "source": [
+ "## Data split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "848fd839",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train plates: 3, Test plates: 1\n",
+ "Train samples: 1728, Test samples: 576\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get unique plates\n",
+ "unique_plates = wide_manifest['Metadata_Plate'].unique()\n",
+ "\n",
+ "# Split plates into train (75%) and test (25%) with seed\n",
+ "train_plates, test_plates = train_test_split(\n",
+ " unique_plates, \n",
+ " test_size=0.25, \n",
+ " random_state=42\n",
+ ")\n",
+ "\n",
+ "# Create train and test manifests based on plate split\n",
+ "train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)]\n",
+ "test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)]\n",
+ "\n",
+ "print(f\"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}\")\n",
+ "print(f\"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "70d2ff65",
+ "metadata": {},
+ "source": [
+    "## Write the final split download manifests (with metadata) and download all needed data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "bc058e2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_manifest_wide.to_csv(DATA_DOWNLOAD_DIR/ \"train_manifest.csv\", index=False)\n",
+ "test_manifest_wide.to_csv(DATA_DOWNLOAD_DIR / \"test_manifest.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "bec96da0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "LZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "HZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "DNA",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Mito",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "AGP",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "ER",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "RNA",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "ref": "227a6f35-3cca-4c7f-99f5-93c70f489a1b",
+ "rows": [
+ [
+ "0",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_1_RNA.tiff"
+ ],
+ [
+ "1",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_2_RNA.tiff"
+ ],
+ [
+ "2",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_3_RNA.tiff"
+ ],
+ [
+ "3",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_4_RNA.tiff"
+ ],
+ [
+ "4",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_train/BR00116995_A02_5_RNA.tiff"
+ ]
+ ],
+ "shape": {
+ "columns": 8,
+ "rows": 5
+ }
+ },
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LZ_BF | \n",
+ " BF | \n",
+ " HZ_BF | \n",
+ " DNA | \n",
+ " Mito | \n",
+ " AGP | \n",
+ " ER | \n",
+ " RNA | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LZ_BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " HZ_BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " DNA \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " Mito \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " AGP \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " ER \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " RNA \n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_download_summary = download_wide_manifest_channels(\n",
+ " train_manifest_wide,\n",
+ " dest_dir = DATA_DOWNLOAD_DIR / \"cpjump1_u2os_train\" \n",
+ ")\n",
+ "train_download_summary.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "70ab975c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
+ "columns": [
+ {
+ "name": "index",
+ "rawType": "int64",
+ "type": "integer"
+ },
+ {
+ "name": "LZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "HZ_BF",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "DNA",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "Mito",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "AGP",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "ER",
+ "rawType": "object",
+ "type": "string"
+ },
+ {
+ "name": "RNA",
+ "rawType": "object",
+ "type": "string"
+ }
+ ],
+ "ref": "779ce221-d8ff-4070-af9e-a65d1780cdfd",
+ "rows": [
+ [
+ "0",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_1_RNA.tiff"
+ ],
+ [
+ "1",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_2_RNA.tiff"
+ ],
+ [
+ "2",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_3_RNA.tiff"
+ ],
+ [
+ "3",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_4_RNA.tiff"
+ ],
+ [
+ "4",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_LZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_HZ_BF.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_DNA.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_Mito.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_AGP.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_ER.tiff",
+ "/home/weishanli/data_fast/cpjump1/cpjump1_u2os_test/BR00117024_A02_5_RNA.tiff"
+ ]
+ ],
+ "shape": {
+ "columns": 8,
+ "rows": 5
+ }
+ },
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LZ_BF | \n",
+ " BF | \n",
+ " HZ_BF | \n",
+ " DNA | \n",
+ " Mito | \n",
+ " AGP | \n",
+ " ER | \n",
+ " RNA | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ " /home/weishanli/data_fast/cpjump1/cpjump1_u2os... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LZ_BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " HZ_BF \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " DNA \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " Mito \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " AGP \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " ER \\\n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "\n",
+ " RNA \n",
+ "0 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "1 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "2 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "3 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... \n",
+ "4 /home/weishanli/data_fast/cpjump1/cpjump1_u2os... "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_download_summary =download_wide_manifest_channels(\n",
+ " test_manifest_wide,\n",
+ " dest_dir = DATA_DOWNLOAD_DIR / \"cpjump1_u2os_test\" \n",
+ ")\n",
+ "test_download_summary.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "virtual_stain_flow",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/nbconverted/0.download_example_dataset.py b/examples/nbconverted/0.download_example_dataset.py
new file mode 100644
index 0000000..ea4e561
--- /dev/null
+++ b/examples/nbconverted/0.download_example_dataset.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# # Download JUMP pilot plate data from AWS S3 bucket for example training
+
+# In[1]:
+
+
+from pathlib import Path
+from urllib.parse import urlparse
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest
+from virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels
+
+
+# ## Pathing
+
+# In[ ]:
+
+
+DATA_DOWNLOAD_DIR = Path("/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1")
+DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)
+
+
+# ## S3 download helpers
+
+# In[3]:
+
+
+def _parse_s3_url(url):
+ parsed = urlparse(url)
+ if parsed.scheme != "s3":
+ raise ValueError(f"Expected s3:// URL, got: {url}")
+ return parsed.netloc, parsed.path.lstrip("/")
+
+def download_wide_manifest_channels(
+ wide_manifest,
+ dest_dir,
+ channel_columns=None,
+ overwrite=False,
+):
+ """
+ Download S3 TIFFs for each channel and write a local file_index.csv with paths.
+ """
+ if channel_columns is None:
+ channel_columns = ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"]
+ dest_dir = Path(dest_dir)
+ dest_dir.mkdir(parents=True, exist_ok=True)
+ try:
+ import boto3
+ from botocore import UNSIGNED
+ from botocore.config import Config
+ except ImportError as exc:
+ raise ImportError(
+ "boto3 is required for S3 downloads. Install with: pip install boto3"
+ ) from exc
+ s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
+ local_rows = []
+ for row_idx, row in wide_manifest.iterrows():
+ prefix_parts = []
+ for key in ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]:
+ if key in wide_manifest.columns:
+ prefix_parts.append(str(row[key]))
+ prefix = "_".join(prefix_parts) if prefix_parts else f"row_{row_idx}"
+ local_row = {}
+ for channel in channel_columns:
+ url = row[channel] if channel in wide_manifest.columns else None
+ if pd.isna(url):
+ local_row[channel] = None
+ continue
+ bucket, key = _parse_s3_url(url)
+ suffix = Path(key).suffix or ".tif"
+ local_path = dest_dir / f"{prefix}_{channel}{suffix}"
+ if overwrite or not local_path.exists():
+ s3.download_file(bucket, key, str(local_path))
+ local_row[channel] = str(local_path)
+ local_rows.append(local_row)
+ file_index = pd.DataFrame(local_rows, columns=channel_columns)
+ file_index.to_csv(dest_dir / "file_index.csv", index=False)
+ return file_index
+
+
+# ## Retrieve compound manifest
+
+# In[4]:
+
+
+MANIFEST = get_manifest()
+MANIFEST.head()
+
+
+# ## Filter manifest
+# For the sake of demonstrating training, we restrict the timepoint to 24 and select untreated (negative-control) U2-OS cells from the 2020_11_04_CPJUMP1 batch
+
+# In[5]:
+
+
+negcon_u2os_24_manifest = MANIFEST[
+ (MANIFEST["Batch"] == "2020_11_04_CPJUMP1") &
+ (MANIFEST["control_type"] == "negcon") &
+ (MANIFEST["Cell_type"] == "U2OS") &
+ (MANIFEST["Time"] == 24)
+]
+negcon_u2os_24_manifest.head()
+
+
+# ## Arrange as wide to be in the format anticipated for virtual stain flow datasets, which is also the format the download helper expects
+
+# In[6]:
+
+
+wide_manifest = arrange_manifest_channels(negcon_u2os_24_manifest)
+wide_manifest.head()
+
+
+# ## Data split
+
+# In[7]:
+
+
+# Get unique plates
+unique_plates = wide_manifest['Metadata_Plate'].unique()
+
+# Split plates into train (75%) and test (25%) with seed
+train_plates, test_plates = train_test_split(
+ unique_plates,
+ test_size=0.25,
+ random_state=42
+)
+
+# Create train and test manifests based on plate split
+train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)]
+test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)]
+
+print(f"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}")
+print(f"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}")
+
+
+# ## Write the final split download manifests with metadata and download all needed data
+
+# In[8]:
+
+
+train_manifest_wide.to_csv(DATA_DOWNLOAD_DIR/ "train_manifest.csv", index=False)
+test_manifest_wide.to_csv(DATA_DOWNLOAD_DIR / "test_manifest.csv", index=False)
+
+
+# In[9]:
+
+
+train_download_summary = download_wide_manifest_channels(
+ train_manifest_wide,
+ dest_dir = DATA_DOWNLOAD_DIR / "cpjump1_u2os_train"
+)
+train_download_summary.head()
+
+
+# In[10]:
+
+
+test_download_summary =download_wide_manifest_channels(
+ test_manifest_wide,
+ dest_dir = DATA_DOWNLOAD_DIR / "cpjump1_u2os_test"
+)
+test_download_summary.head()
+
diff --git a/pyproject.toml b/pyproject.toml
index 1dfdd8a..2f7b3dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
"notebook",
"tifffile",
"pandera[pandas]",
+ "boto3"
]
[project.optional-dependencies]
diff --git a/src/virtual_stain_flow/datasets/example/arrange_as_wide.py b/src/virtual_stain_flow/datasets/example/arrange_as_wide.py
new file mode 100644
index 0000000..f150bf3
--- /dev/null
+++ b/src/virtual_stain_flow/datasets/example/arrange_as_wide.py
@@ -0,0 +1,32 @@
+"""
+Helper utility specifically to support the example CPJUMP1 dataset:
+ pivots the manifest and arranges it as a wide file index.
+"""
+
+import pandas as pd
+
+
+def arrange_manifest_channels(manifest):
+ """
+ Return a wide dataframe with one row per plate/well/site and URL columns per channel.
+ """
+ required_channels = ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"]
+ keys = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
+ filtered = manifest[manifest["Metadata_ChannelName"].isin(required_channels)].copy()
+ filtered["Metadata_ChannelName"] = filtered["Metadata_ChannelName"].astype(
+ pd.CategoricalDtype(categories=required_channels, ordered=True)
+ )
+ filtered = filtered.sort_values(keys + ["Metadata_ChannelName"])
+ wide = (
+ filtered.pivot_table(
+ index=keys,
+ columns="Metadata_ChannelName",
+ values="Metadata_FileUrl",
+ aggfunc="first",
+ observed=False,
+ )
+ .reindex(columns=required_channels)
+ .reset_index()
+ )
+
+ return wide
diff --git a/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py b/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py
new file mode 100644
index 0000000..4a1f508
--- /dev/null
+++ b/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py
@@ -0,0 +1,145 @@
+"""Build an enriched image manifest for CPJUMP1 dataset access.
+
+Only compound perturbations (no CRISPR or ORF) are included, which is
+appropriate for virtual staining experiments.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import Optional
+
+import pandas as pd
+
+# Most recent commit ref as of Mar 25 2026.
+REPO_REF = "6ea3958c3809cd04ac95b63138937dd64a7c4c12"
+REPO_BASE = f"https://github.com/WayScience/JUMP-single-cell/raw/{REPO_REF}/"
+
+IMAGE_MANIFEST_URL = f"{REPO_BASE}0.download_data/data/2020_11_04_CPJUMP1_all_plates.parquet"
+IMAGE_MANIFEST_COLUMNS = [
+ "Metadata_Plate",
+ "Metadata_Well",
+ "Metadata_Site",
+ "Metadata_ChannelName",
+ "Metadata_PlaneID",
+ "Metadata_PositionZ",
+ "Metadata_FileUrl",
+ "Metadata_Filename",
+]
+
+EXPERIMENT_METADATA_URL = f"{REPO_BASE}reference_plate_data/experiment-metadata.tsv"
+COMPOUND_PLATEMAP_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_platemap.txt"
+COMPOUND_METADATA_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_metadata_targets.tsv"
+
+__all__ = ["build_manifest", "get_manifest", "main"]
+
+_MANIFEST_CACHE: Optional[pd.DataFrame] = None
+
+
+def build_manifest() -> pd.DataFrame:
+ """
+ Main utility function that handles all the wrangling.
+ Return an enriched CPJUMP1 manifest as a pandas DataFrame.
+ """
+ image_manifest = pd.read_parquet(IMAGE_MANIFEST_URL, columns=IMAGE_MANIFEST_COLUMNS)
+
+ experiment_meta = pd.read_csv(EXPERIMENT_METADATA_URL, delimiter="\t")
+ experiment_meta.rename(columns={"Assay_Plate_Barcode": "Metadata_Plate"}, inplace=True)
+ experiment_meta = experiment_meta[experiment_meta["Perturbation"] == "compound"]
+ # Exclude batches whose name ends in "_DL"; they are essentially duplicates for the purposes of image data access.
+ experiment_meta = experiment_meta[~experiment_meta["Batch"].str.endswith("_DL")]
+
+ compound_platemap = pd.merge(
+ pd.read_csv(COMPOUND_PLATEMAP_URL, delimiter="\t"),
+ pd.read_csv(COMPOUND_METADATA_URL, delimiter="\t"),
+ on="broad_sample",
+ how="left",
+ validate="many_to_one"
+ ).rename(columns={"well_position": "Metadata_Well"}, inplace=False)
+
+ image_manifest_compound = pd.merge(
+ experiment_meta,
+ image_manifest,
+ on="Metadata_Plate",
+ how="inner",
+ validate="one_to_many" # one plate id should map to many image rows
+ )
+
+ return pd.merge(
+ compound_platemap,
+ image_manifest_compound,
+ on="Metadata_Well",
+ how="inner",
+ # all the plates share the same well map so one well should map to many image rows
+ validate="one_to_many"
+ )
+
+
+def get_manifest() -> pd.DataFrame:
+ """
+ Return a cached manifest to avoid repeated network reads.
+ """
+ global _MANIFEST_CACHE
+ if _MANIFEST_CACHE is None:
+ _MANIFEST_CACHE = build_manifest()
+ return _MANIFEST_CACHE
+
+
+def _write_manifest(df: pd.DataFrame, output: str, fmt: str) -> None:
+ if fmt == "csv":
+ df.to_csv(output, index=False)
+ elif fmt == "parquet":
+ df.to_parquet(output, index=False)
+ else:
+ raise ValueError(f"Unsupported format: {fmt}")
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+ """
+ Command-line interface for building and outputting the CPJUMP1 manifest.
+ By default, it prints a summary and preview of the manifest.
+ Use --output or --stdout to write the full manifest to a file or stdout.
+ May or may not be useful.
+ """
+ parser = argparse.ArgumentParser(description="Build CPJUMP1 enriched manifest.")
+ parser.add_argument(
+ "--output",
+ help="Write manifest to a file (CSV or Parquet).",
+ )
+ parser.add_argument(
+ "--format",
+ choices=["csv", "parquet"],
+ default="csv",
+ help="Output file format when using --output (default: csv).",
+ )
+ parser.add_argument(
+ "--stdout",
+ action="store_true",
+ help="Write manifest to stdout as CSV.",
+ )
+ parser.add_argument(
+ "--head",
+ type=int,
+ default=5,
+ help="Rows to display when no output is specified (default: 5).",
+ )
+ args = parser.parse_args(argv)
+
+ manifest = get_manifest()
+
+ if args.stdout:
+ manifest.to_csv(sys.stdout, index=False)
+ return 0
+
+ if args.output:
+ _write_manifest(manifest, args.output, args.format)
+ return 0
+
+ print(f"Rows: {len(manifest):,} | Columns: {len(manifest.columns)}")
+ print(manifest.head(args.head).to_string(index=False))
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())