diff --git a/Cargo.toml b/Cargo.toml index cbe8b7c..f299d31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,8 @@ pre-release-hook = ["git-cliff", "-o", "CHANGELOG.md", "--tag", "v{{version}}" ] publish = false pre-release-replacements = [ { file = "skills/hotdata/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 }, + { file = "skills/hotdata-search/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 }, + { file = "skills/hotdata-analytics/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 }, { file = "skills/hotdata-geospatial/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 }, { file = "README.md", search = "version-[0-9.]+-blue", replace = "version-{{version}}-blue", exactly = 1 }, ] diff --git a/scripts/release.sh b/scripts/release.sh index 85544b9..fdd3215 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -2,8 +2,8 @@ # release.sh — two-phase release wrapper around cargo-release # # Usage: -# scripts/release.sh prepare # steps 0-2: branch, bump, push PR -# scripts/release.sh finish # step 4: tag, publish, trigger dist +# scripts/release.sh prepare # branch, bump, changelog PR +# scripts/release.sh finish # tag only (main is branch-protected) set -euo pipefail @@ -13,7 +13,7 @@ VERSION="${2:-}" usage() { echo "Usage:" echo " scripts/release.sh prepare # create release branch and open PR" - echo " scripts/release.sh finish # tag and publish from main" + echo " scripts/release.sh finish # push v tag from main (no main push)" exit 1 } @@ -24,6 +24,16 @@ require_clean_tree() { fi } +read_crate_version() { + local ver + ver="$(grep -E '^version = ' Cargo.toml | head -1 | sed -E 's/^version = "([^"]+)".*/\1/')" + if [ -z "$ver" ]; then + echo "error: could not read version from Cargo.toml" >&2 + exit 1 + fi + printf '%s' "$ver" +} + case "$COMMAND" in prepare) if [ -z "$VERSION" ]; then @@ -35,17 +45,21 @@ case "$COMMAND" in 
require_clean_tree - # step 0: create release branch echo "→ Creating branch $BRANCH" git checkout -b "$BRANCH" - # step 2: bump versions, commit, push branch echo "" echo "→ Running cargo release (no publish, no tag)..." - # git-cliff (pre-release hook) is often installed via cargo install export PATH="${HOME}/.cargo/bin:${PATH}" cargo release --no-publish --no-tag --no-confirm --allow-branch="$BRANCH" --execute "$VERSION" + if [ -f scripts/validate-changelog.py ]; then + echo "" + echo "→ Validating CHANGELOG.md against origin/main..." + git fetch origin main 2>/dev/null || true + python3 scripts/validate-changelog.py origin/main + fi + echo "" echo "→ Opening pull request..." PR_URL=$(gh pr create \ @@ -77,15 +91,33 @@ case "$COMMAND" in fi echo "→ Pulling latest main..." - git pull + git pull origin main + + VERSION="$(read_crate_version)" + TAG="v${VERSION}" echo "" - echo "→ Running cargo release (tagging release)..." - export PATH="${HOME}/.cargo/bin:${PATH}" - cargo release --no-confirm --execute + echo "→ Release version from Cargo.toml: $VERSION (tag $TAG)" + + if git rev-parse "$TAG" >/dev/null 2>&1; then + echo "error: tag $TAG already exists locally. Delete it or pick a new version." >&2 + exit 1 + fi + + if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null 2>&1; then + echo "error: tag $TAG already exists on origin." >&2 + exit 1 + fi + + echo "→ Creating annotated tag $TAG (no commit to main)..." + git tag -a "$TAG" -m "Release hotdata-cli version $VERSION" + + echo "→ Pushing tag to origin..." + git push origin "$TAG" echo "" - echo "✓ Release complete. Tag pushed and dist workflow triggered." + echo "✓ Tag $TAG pushed. Dist/release workflow should run on GitHub." 
+ echo " (main was not pushed — version bump must already be merged via release PR.)" ;; *) diff --git a/skills/hotdata-analytics/SKILL.md b/skills/hotdata-analytics/SKILL.md new file mode 100644 index 0000000..37cbda9 --- /dev/null +++ b/skills/hotdata-analytics/SKILL.md @@ -0,0 +1,124 @@ +--- +name: hotdata-analytics +description: Use this skill when the user wants OLAP-style SQL analytics in Hotdata — aggregations, GROUP BY, JOINs, reporting, exploratory queries, query run history, stored results, or materialized follow-up tables (Chain via datasets or managed databases). Activate for "analyze", "aggregate", "rollup", "pivot", "report", "metrics", "GROUP BY", "query history", "past queries", "query runs", "stored results", "materialize", "chain", "intermediate table", or sorted indexes for filters/range scans. Do not load for BM25/vector search or geospatial SQL — use hotdata-search or hotdata-geospatial. Requires the core hotdata skill for connections, tables, datasets, and auth. +version: 0.2.3 +--- + +# Hotdata Analytics Skill + +**OLAP-style analytics** in Hotdata: PostgreSQL-dialect SQL, query execution, run history, stored results, **Chain** materializations, and **sorted** indexes for filters and joins. + +**Prerequisites:** Authenticate, workspace, and catalog discovery via the **`hotdata`** skill (`connections`, `tables`, `datasets`, `databases`). + +**Related skills:** **`hotdata-search`** (BM25, vector, retrieval indexes), **`hotdata-geospatial`** (spatial SQL). + +--- + +## Execute SQL + +```bash +hotdata query "" [--workspace-id ] [--connection ] [--output table|json|csv] +hotdata query status [--output table|json|csv] +``` + +- **PostgreSQL dialect.** Quote mixed-case identifiers: `"CustomerName"`. +- Use **`hotdata tables list`** for schema discovery — not `information_schema` via `query`. +- Fully qualified names: `..`, `datasets..
<table>`, `<database>.<schema>.<table>
`. +- Long-running queries may return `query_run_id` → poll with **`query status`** (exit `2` = still running). Do not re-run identical heavy SQL while polling. +- For **workspace-wide** joins and naming, load **context:DATAMODEL** when listed (`hotdata context list` → `show DATAMODEL`) — see **`hotdata`** skill. + +### OLAP patterns + +Typical analytics SQL (all via `hotdata query`): + +- **Aggregations:** `COUNT`, `SUM`, `AVG`, `MIN`, `MAX` with `GROUP BY` +- **Joins:** `INNER` / `LEFT JOIN` across `..
` names +- **Filtering:** `WHERE` on partition-friendly columns (consider **sorted** indexes below) +- **Ordering:** `ORDER BY` on metrics or dimensions +- **Bounded exploration:** always `LIMIT` while iterating; widen once validated + +Column names from CSV uploads may be case-sensitive — use double quotes when not all-lowercase. + +--- + +## Query run history + +Uses the **active workspace only** (no `--workspace-id`; set with `hotdata workspaces set`). + +```bash +hotdata queries list [--limit ] [--cursor ] [--status ] [--output table|json|yaml] +hotdata queries [--output table|json|yaml] +``` + +- `list` — status, duration, row count, SQL preview (default limit 20). Filter: `--status running,failed`. +- `` — full metadata, formatted SQL, `result_id` when present. +- Use history to find recurring `WHERE` / `JOIN` / `GROUP BY` patterns before adding indexes (search skill) or chains. + +--- + +## Stored results + +```bash +hotdata results list [--workspace-id ] [--limit ] [--offset ] [--output table|json|yaml] +hotdata results [--workspace-id ] [--output table|json|csv] +``` + +- Prefer **`results `** over re-running identical heavy queries. +- Query footers may include `[result-id: rslt...]`; also available from `queries `. + +--- + +## Chain (materialized follow-ups) + +**Pattern:** run SQL → materialize a smaller table → query the materialized name. + +1. **Base query** + + ```bash + hotdata query "SELECT ..." + hotdata query status # if async + ``` + +2. **Materialize** (pick one) + + ```bash + hotdata datasets create --label "chain slice" --sql "SELECT ..." [--table-name chain_slice] + hotdata datasets create --label "from saved" --query-id [--table-name ...] + ``` + + Or managed parquet: + + ```bash + hotdata databases create --name analytics --table slice + hotdata databases tables load analytics slice --file ./slice.parquet + ``` + +3. 
**Chain query** — use printed **`full_name`** or `datasets list` **FULL NAME** column: + + ```bash + hotdata query "SELECT * FROM datasets.main.chain_slice WHERE ..." + hotdata query "SELECT * FROM analytics.public.slice WHERE ..." + ``` + +Document stable chains in **context:DATAMODEL → Derived tables (Chain)**. + +Full procedure: [references/WORKFLOWS.md](references/WORKFLOWS.md). + +--- + +## Sorted indexes (filters and range scans) + +For equality, range, and sort-heavy OLAP — not full-text or vector (see **`hotdata-search`**): + +```bash +hotdata indexes create --connection-id --schema --table
<table> \ + --name idx_orders_created --columns created_at --type sorted [--async] +``` + +List and delete use the same `hotdata indexes` commands as in the search skill; only **`--type sorted`** is the analytics focus here. + +--- + +## Sandboxes and chains + +Sandbox datasets use **`datasets.<sandbox_id>.<table>
`**, not `datasets.main`. Run queries with active sandbox config or `hotdata sandbox run hotdata query "..."`. See **`hotdata`** skill **Sandboxes**. diff --git a/skills/hotdata-analytics/references/WORKFLOWS.md b/skills/hotdata-analytics/references/WORKFLOWS.md new file mode 100644 index 0000000..0a11385 --- /dev/null +++ b/skills/hotdata-analytics/references/WORKFLOWS.md @@ -0,0 +1,116 @@ +# Analytics workflows + +OLAP-style SQL, **History** (query runs and stored results), and **Chain** (materialized follow-ups). Requires **`hotdata`** for auth, workspaces, and catalog commands. + +**Related:** **`hotdata-search`** for BM25/vector indexes and `hotdata search`; **`hotdata`** [WORKFLOWS.md](../../hotdata/references/WORKFLOWS.md) for datasets vs managed databases. + +--- + +## History + +**Goal:** Find prior work: query runs (execution history) and stored result rows. + +### Query runs + +Uses the **active workspace only** — no `--workspace-id` on `queries`. Set default workspace with `hotdata workspaces set` first. + +```bash +hotdata queries list [--limit N] [--cursor ] [--status ] +hotdata queries +``` + +- `list` — status, creation time, duration, row count, truncated SQL preview (default limit 20). +- `--status` — filter comma-separated values, e.g. `--status running,failed`. +- `` — full metadata (timings, `result_id`, snapshot, hashes) and formatted SQL. +- If a run has a `result_id`, fetch rows with `hotdata results ` below. + +Use history to spot recurring `WHERE`, `JOIN`, `GROUP BY`, or search-style SQL before adding indexes (**`hotdata-search`**) or new Chain tables. + +### Stored results + +```bash +hotdata results list [--workspace-id ] [--limit N] [--offset N] +hotdata results [--workspace-id ] [--output table|json|csv] +``` + +- Query footers may include `[result-id: rslt...]` — record it for later. +- Pick up `result_id` from `queries ` when present. 
+- **Prefer `hotdata results ` over re-running identical heavy SQL.** Re-runs waste resources and may return different data. + +Results are paginated; the CLI hints the next `--offset` when more rows exist. + +--- + +## Chain + +**Goal:** Follow-up analysis on a **bounded** intermediate without rescanning huge base tables. + +**Pattern:** run SQL → materialize → query the materialized **qualified name**. + +### 1. Base query + +```bash +hotdata query "SELECT ..." +``` + +- Quote mixed-case columns with double quotes (PostgreSQL dialect). +- If the CLI returns a `query_run_id`, poll instead of re-running: + + ```bash + hotdata query status + ``` + + Exit codes: `0` succeeded, `1` failed, `2` still running. + +### 2. Materialize + +Land a smaller table — pick one: + +**Datasets** (CSV/JSON/URL/SQL snapshot → `datasets..
<table>`): + +```bash +hotdata datasets create --label "chain revenue slice" --sql "SELECT ..." [--table-name chain_revenue_slice] +hotdata datasets create --label "from saved" --query-id <query_id> [--table-name ...] +``` + +**Managed database** (parquet → `<database>.<schema>.<table>
`): + +```bash +hotdata databases create --name chain_db --table revenue_slice +hotdata databases tables load chain_db revenue_slice --file ./revenue_slice.parquet +``` + +Note the printed **`full_name`** (e.g. `datasets.main.chain_revenue_slice` or `chain_db.public.revenue_slice`). For datasets, **`FULL NAME`** from `datasets list` is authoritative. + +### 3. Chain query + +Query using that name — do not hardcode `datasets.main` if the schema segment is a sandbox id: + +```bash +hotdata datasets list +hotdata query "SELECT * FROM datasets.main.chain_revenue_slice WHERE ..." +# Sandbox example (use actual full_name from create or list): +# hotdata query "SELECT * FROM datasets.s_ufmblmvq.chain_revenue_slice WHERE ..." +# Managed database: +# hotdata query "SELECT * FROM chain_db.public.revenue_slice WHERE ..." +``` + +### Sandbox context + +For **sandbox-scoped** chain tables: + +- Qualified name is **`datasets..
`**, not `datasets.main`. +- Run queries with **active sandbox** in config (`hotdata sandbox set`) **or** inside **`hotdata sandbox run hotdata query "…"`**. +- Without sandbox context, you may get **access denied** on sandbox-only tables. + +### Naming and documentation + +- Prefer predictable `--table-name` values: `chain__`. +- Record long-lived chains in **context:DATAMODEL → Derived tables (Chain)** with the **full** SQL name you use (`datasets.…` or `database.schema.table`). +- Promote join/grain findings to **context:DATAMODEL** when they should outlive the sandbox (**`hotdata`** skill). + +### Guardrails + +- Materialize when the base scan is large and the follow-up runs many times. +- Keep Chain tables focused; avoid wide `SELECT *` materializations when a narrow projection suffices. +- For upload format choice (datasets vs databases), see **`hotdata`** WORKFLOWS — [Datasets vs managed databases](../../hotdata/references/WORKFLOWS.md#datasets-vs-managed-databases). diff --git a/skills/hotdata-geospatial/SKILL.md b/skills/hotdata-geospatial/SKILL.md index 3a92b8a..e282d0e 100644 --- a/skills/hotdata-geospatial/SKILL.md +++ b/skills/hotdata-geospatial/SKILL.md @@ -8,6 +8,8 @@ version: 0.2.3 Use this skill when working with geospatial data in Hotdata. Hotdata supports a subset of PostGIS-style functions using **PostgreSQL dialect SQL**. This reference is dataset-agnostic — apply it to any table with geometry columns. +**Related skills:** **`hotdata`** (core CLI), **`hotdata-search`** (BM25/vector), **`hotdata-analytics`** (OLAP SQL). + --- ## Geometry Columns diff --git a/skills/hotdata-search/SKILL.md b/skills/hotdata-search/SKILL.md new file mode 100644 index 0000000..7531bdc --- /dev/null +++ b/skills/hotdata-search/SKILL.md @@ -0,0 +1,94 @@ +--- +name: hotdata-search +description: Use this skill when the user wants full-text search, BM25 keyword search, vector similarity search, semantic search, embeddings, or retrieval indexes in Hotdata. 
Activate for "hotdata search", "BM25", "full-text", "vector search", "semantic search", "similarity", "embedding", "embedding provider", "create an index" (bm25 or vector), "list indexes" for search, or SQL using bm25_search or vector_distance. Do not load for general SQL analytics (aggregations, GROUP BY) or geospatial work — use hotdata-analytics or hotdata-geospatial instead. Requires the core hotdata skill for auth and workspace basics. +version: 0.2.3 +--- + +# Hotdata Search Skill + +Retrieval workloads in Hotdata: **BM25 full-text**, **vector similarity**, and the **indexes** and **embedding providers** that power them. + +**Prerequisites:** Authenticate and select a workspace (see the **`hotdata`** skill). Use fully qualified table names: `..
`. + +**Related skills:** **`hotdata-analytics`** (OLAP SQL, query history, materialized chains), **`hotdata-geospatial`** (PostGIS-style functions). + +--- + +## Search CLI + +`--type` is **required**: `bm25` or `vector`. Both run server-side. + +```bash +# BM25 (requires a BM25 index on the column) +hotdata search "" --type bm25 --table --column \ + [--select ] [--limit ] [--workspace-id ] [--output table|json|csv] + +# Vector (requires a vector index; server auto-embeds the query text) +hotdata search "" --type vector --table --column \ + [--select ] [--limit ] [--workspace-id ] [--output table|json|csv] +``` + +| Type | Behavior | +|------|----------| +| **`bm25`** | Server generates `bm25_search(table, col, 'text')`. Results sort by score (descending). | +| **`vector`** | Pass plain-text query; name the **source text column** (e.g. `title`). Server embeds using the same provider/metric/dimensions as the index. SQL uses `vector_distance(col, 'text')`. Results sort by distance (ascending). | + +- **No vector index, or custom embedding model?** Use raw SQL via `hotdata query` (e.g. `cosine_distance(col, [])`). The removed `--model` / stdin-vector paths hardcoded `l2_distance` and are not supported. +- **Before search:** create the right index (`indexes create --type bm25` or `--type vector`). See [references/INDEXES.md](references/INDEXES.md). +- Default `--limit` is 10. + +--- + +## Indexes (BM25 and vector) + +Indexes attach to a **connection table** (`--connection-id` + `--schema` + `--table`) or a **dataset** (`--dataset-id`). Scopes are mutually exclusive for create/delete. + +```bash +# List — workspace scan on connection tables (filter with -c / --schema / --table) +hotdata indexes list [--connection-id ] [--schema ] [--table
<table>] [--workspace-id <workspace_id>] [--output table|json|yaml] +hotdata indexes list --dataset-id <dataset_id> [--workspace-id <workspace_id>] [--output table|json|yaml] + +# Connection table +hotdata indexes create --connection-id <connection_id> --schema <schema> --table <table>
<table> \ + --name <name> --columns <columns> --type bm25|vector \ + [--metric l2|cosine|dot] [--async] \ + [--embedding-provider-id <id>] [--dimensions <n>] [--output-column <name>] [--description <text>] +hotdata indexes delete --connection-id <connection_id> --schema <schema> --table <table>
--name + +# Dataset +hotdata indexes create --dataset-id --name --columns --type bm25|vector ... +hotdata indexes delete --dataset-id --name +``` + +- **`--type` is required** on create: `bm25` (one text column) or `vector` (exactly one column; often embeddings or auto-embedded text). +- **`sorted`** indexes (range/equality for OLAP filters) are documented in **`hotdata-analytics`** — this skill focuses on retrieval types. +- **`--async`:** poll with `hotdata jobs ` (see **`hotdata`** skill **Jobs**). +- **Auto-embedding:** `--type vector` on a **text** column generates embeddings server-side. Optional `--embedding-provider-id`; default output column `{column}_embedding` (override with `--output-column`). + +Full workflow (gather workload → compare existing → create → verify): [references/INDEXES.md](references/INDEXES.md). + +--- + +## Embedding providers + +```bash +hotdata embedding-providers list [--workspace-id ] [--output table|json|yaml] +hotdata embedding-providers get [--workspace-id ] [--output table|json|yaml] +hotdata embedding-providers create --name --provider-type service|local \ + [--config ''] [--provider-api-key | --secret-name ] [--workspace-id ] +hotdata embedding-providers update [--name ] [--config ''] [--provider-api-key | --secret-name ] [--workspace-id ] [--output table|json|yaml] +hotdata embedding-providers delete [--workspace-id ] +``` + +- System providers (e.g. `sys_emb_openai`) are pre-configured; use `list` for IDs to pass to `--embedding-provider-id`. +- `--provider-api-key` is the **embedding service** key (not Hotdata `--api-key`). `--secret-name` references an existing secret. + +--- + +## Quick workflow + +1. `hotdata tables list --connection-id ` — confirm column types. +2. `hotdata indexes list` — avoid duplicate indexes. +3. `hotdata indexes create ... --type bm25|vector` (add `--async` if large). +4. `hotdata search "..." --type bm25|vector --table ... --column ...` +5. 
Record what exists in **context:DATAMODEL** (core skill) when the workspace should remember index choices. diff --git a/skills/hotdata-search/references/INDEXES.md b/skills/hotdata-search/references/INDEXES.md new file mode 100644 index 0000000..98fd783 --- /dev/null +++ b/skills/hotdata-search/references/INDEXES.md @@ -0,0 +1,51 @@ +# Index workflow (BM25 and vector) + +**Goal:** Find full-text and vector access patterns that lack indexes, then create **bm25** or **vector** indexes when the benefit is clear. + +## 1. Gather workload and schema + +- **Query-run history** — recurring predicates or search-style SQL (`bm25_search`, `vector_distance`, or planned `hotdata search`): + + ```bash + hotdata queries list + hotdata queries + ``` + +- **Columns** — confirm types: + + ```bash + hotdata tables list --connection-id + ``` + +High-cardinality **text** (`title`, `body`, …) → **bm25**. **Embedding** / float list columns → **vector** (+ `--metric`). + +## 2. Compare to existing indexes + +```bash +hotdata indexes list [--connection-id ] [--schema ] [--table
<table>] +hotdata indexes list --dataset-id <dataset_id> +``` + +Skip duplicates (same table, column, and purpose). + +## 3. Create indexes + +```bash +hotdata indexes create --connection-id <connection_id> --schema <schema> --table <table>
<table> \ + --name idx_posts_body_bm25 --columns body --type bm25 + +hotdata indexes create --connection-id <connection_id> --schema <schema> --table <table>
\ + --name idx_chunks_embedding --columns embedding --type vector --metric cosine +``` + +Large builds: `--async`, then `hotdata jobs list` / `hotdata jobs `. + +## 4. Verify + +Re-run `hotdata search` or representative SQL. Update **context:DATAMODEL → Search & index summary** via `hotdata context push DATAMODEL` (core skill). + +## Guardrails + +- Prefer evidence (repeated search workloads) over speculative indexes. +- Get approval before production `indexes create` when cost/impact is uncertain. +- Align connection/schema/table with `hotdata tables list` output. diff --git a/skills/hotdata/SKILL.md b/skills/hotdata/SKILL.md index 7987f3e..bfae105 100644 --- a/skills/hotdata/SKILL.md +++ b/skills/hotdata/SKILL.md @@ -1,6 +1,6 @@ --- name: hotdata -description: Use this skill when the user wants to run hotdata CLI commands, query the Hotdata API, list workspaces, list connections, create connections, list or create managed databases, load parquet into database tables, list tables, manage datasets, execute SQL queries, inspect query run history, search tables, manage indexes, manage sandboxes, manage workspace context and stored docs such as context:DATAMODEL via the context API (`hotdata context`), install or update the bundled agent skills (`hotdata skills`), generate shell completions (`hotdata completions`), or interact with the hotdata service. Activate when the user says "run hotdata", "query hotdata", "list workspaces", "list connections", "create a connection", "list databases", "create a database", "managed database", "load parquet", "list tables", "list datasets", "create a dataset", "upload a dataset", "execute a query", "search a table", "list indexes", "create an index", "list query runs", "list past queries", "query history", "list sandboxes", "create a sandbox", "run a sandbox", "workspace context", "pull context", "push context", "data model", "context:DATAMODEL", or asks you to use the hotdata CLI. 
+description: Use this skill when the user wants to run core hotdata CLI commands — auth, workspaces, connections, managed databases, datasets, tables, basic SQL query, sandboxes, workspace context (context:DATAMODEL), jobs, and skill install. Activate for "run hotdata", "list workspaces", "list connections", "create a connection", "list databases", "managed database", "load parquet", "list tables", "list datasets", "create a dataset", "execute a query", "list sandboxes", "workspace context", "context:DATAMODEL", or general Hotdata CLI usage. For full-text/vector search and retrieval indexes use hotdata-search; for OLAP analytics, query history, stored results, and Chain materializations use hotdata-analytics; for geospatial/GIS use hotdata-geospatial. version: 0.2.3 --- @@ -14,6 +14,17 @@ hotdata [args] Or if installed on PATH: `hotdata [args]` +## Bundled sub-skills + +Install all skills with **`hotdata skills install`**. Load specialized skills only when the task needs them: + +| Skill | Use for | +|-------|---------| +| **`hotdata`** (this file) | Auth, workspaces, connections, databases, datasets, tables, basic `query`, context, sandboxes, jobs | +| **`hotdata-search`** | BM25, vector search, `hotdata search`, bm25/vector indexes, embedding providers | +| **`hotdata-analytics`** | OLAP SQL, aggregations, query/results history, Chain materializations, sorted indexes | +| **`hotdata-geospatial`** | PostGIS-style `ST_*`, WKB, spatial joins | + ## Authentication Run **`hotdata auth login`** (or **`hotdata auth`** with no subcommand—same behavior) to authenticate via browser login. Config is stored in `~/.hotdata/config.yml`. @@ -67,20 +78,19 @@ Keep two layers separate: Use [references/DATA_MODEL.template.md](references/DATA_MODEL.template.md) and [references/MODEL_BUILD.md](references/MODEL_BUILD.md) for **what to write inside** the Markdown you store under **context:** stems. 
Never put workspace-specific model text inside agent skill install paths—only in **workspace context** (and transient `./.md` for push/pull when needed). -## Multi-step workflows (Model, History, Chain, Indexes) +## Multi-step workflows These are **patterns** built from the commands below—not separate CLI subcommands: - **Model (`context:DATAMODEL`)** — The **shared** Markdown semantic map of the workspace (entities, keys, joins across connections). **Store and read it only via workspace context** (`hotdata context list`, then `hotdata context show DATAMODEL` **only when listed**, `context push DATAMODEL`); refresh using `connections`, `connections refresh`, `tables list`, and `datasets list`. For a **deep** pass (connector enrichment, indexes, per-table detail), see [references/MODEL_BUILD.md](references/MODEL_BUILD.md). Contrast **analysis modeling** in sandboxes or chat (see [Analysis modeling vs context:DATAMODEL](#analysis-modeling-vs-contextdatamodel)). -- **History** — Inspect prior activity via `hotdata queries list` (query runs) and `hotdata results list` / `results ` (row data). -- **Chain** — Follow-ups via **`datasets create`** then `query` against `datasets..
`, or via **`databases create`** + **`databases tables load`** (parquet) then `query` against `..
`. -- **Indexes** — Review SQL and schema, compare to existing indexes, create **sorted**, **bm25**, or **vector** indexes when it clearly helps; see [references/WORKFLOWS.md](references/WORKFLOWS.md#indexes). +- **History / Chain / OLAP SQL** — See **`hotdata-analytics`** and [references/WORKFLOWS.md](references/WORKFLOWS.md). +- **Search / retrieval indexes** — See **`hotdata-search`**. -Full step-by-step procedures: [references/WORKFLOWS.md](references/WORKFLOWS.md). +Catalog, skill decision tree, epic flows (onboard, chain, retrieval), datasets vs databases, and sandbox procedures: [references/WORKFLOWS.md](references/WORKFLOWS.md). ## Available Commands -Top-level subcommands (each detailed below): **`auth`**, **`datasets`**, **`query`**, **`workspaces`**, **`connections`**, **`databases`**, **`tables`**, **`skills`**, **`results`**, **`jobs`**, **`indexes`**, **`embedding-providers`**, **`search`**, **`queries`**, **`sandbox`**, **`context`**, **`completions`**. +Top-level subcommands (each detailed below): **`auth`**, **`datasets`**, **`query`**, **`workspaces`**, **`connections`**, **`databases`**, **`tables`**, **`skills`**, **`results`**, **`jobs`**, **`indexes`**, **`embedding-providers`**, **`search`**, **`queries`**, **`sandbox`**, **`context`**, **`completions`**. Search, indexes (bm25/vector), and embedding providers are documented in **`hotdata-search`**; query history, results, Chain, and OLAP patterns in **`hotdata-analytics`**. Global CLI options: **`--api-key`**, **`-v` / `--version`**, **`-h` / `--help`**. Hidden developer flag: **`--debug`** (verbose HTTP logs). @@ -300,112 +310,20 @@ hotdata context push [--workspace-id ] [--dry-run] **Convention:** **context:DATAMODEL** is the primary workspace semantic map; **context:GLOSSARY** (or other **`context:`** docs) for additional narrative context. Same identifier rules as SQL table names. CLI: `hotdata context show DATAMODEL` (bare stem). 
### Execute SQL Query + ``` hotdata query "" [--workspace-id ] [--connection ] [--output table|json|csv] hotdata query status [--output table|json|csv] ``` -- Default output is `table`, which prints results with row count and execution time. -- Use `--connection` to scope the query to a specific connection. -- Use `hotdata tables list` to discover tables and columns — do not query `information_schema` directly. -- **Always use PostgreSQL dialect SQL.** Column names that are **not** all-lowercase (e.g. from CSV headers like `CustomerName`) are **case-sensitive**; quote them with **double quotes** in SQL, e.g. `"CustomerName"`. -- Long-running queries automatically fall back to async execution and return a `query_run_id`. -- Use `hotdata query status ` to poll for results. -- Exit codes for `query status`: `0` = succeeded, `1` = failed, `2` = still running (poll again). -- **When a query returns a `query_run_id`, use `query status` to poll rather than re-running the query.** - -### Query results -#### List stored results -``` -hotdata results list [--workspace-id ] [--limit ] [--offset ] [--output table|json|yaml] -``` -- Lists recent stored query results with `id`, `status`, and `created_at`. -- Results are paginated; when more are available, the CLI prints a hint with the next `--offset`. -- Use a row’s `id` with `hotdata results ` below. - -#### Get result by ID -``` -hotdata results [--workspace-id ] [--output table|json|csv] -``` -- Retrieves a previously executed query result by its result ID. -- Query output also includes a `result-id` in the footer (e.g. `[result-id: rslt...]`). -- **Always use `results list` / `results ` to retrieve past query results rather than re-running the same query.** Re-running queries wastes resources and may return different results. 
- -### Query Run History -``` -hotdata queries list [--limit ] [--cursor ] [--status ] [--output table|json|yaml] -hotdata queries [--output table|json|yaml] -``` -These commands use the **active workspace only** (the `queries` command has no `--workspace-id` flag); set the default workspace with `workspaces set` if needed. -- `list` shows query runs with status, creation time, duration, row count, and a truncated SQL preview (default limit 20). -- `--status` filters by run status (comma-separated, e.g. `--status running,failed`). -- View a run by ID to see full metadata (timings, `result_id`, snapshot, hashes) and the formatted, syntax-highlighted SQL. -- If a run has a `result_id`, fetch its rows with `hotdata results `. - -To create a dataset from a **saved query** still registered for the workspace, use **`hotdata datasets create --query-id `** (this CLI does not expose separate saved-query create/run subcommands). - -### Search -`--type` is **required**. Pass `vector` or `bm25`. Both run entirely server-side. +- Default output is `table` (row count and execution time). +- Use `hotdata tables list` for discovery — not `information_schema` via `query`. +- **PostgreSQL dialect.** Quote non-lowercase columns with double quotes. +- Async runs return `query_run_id` → poll with `query status` (do not re-run the same heavy SQL). +- **OLAP** (aggregations, history, Chain, sorted indexes): **`hotdata-analytics`** skill. +- **Search** (BM25, vector): **`hotdata-search`** skill. 
-``` -# BM25 full-text search (requires BM25 index on the column) -hotdata search "" --type bm25 --table --column \ - [--select ] [--limit ] [--workspace-id ] [--output table|json|csv] - -# Vector similarity search via server-side auto-embed (requires a vector index on the column) -hotdata search "" --type vector --table --column \ - [--select ] [--limit ] [--workspace-id ] [--output table|json|csv] -``` -- **`--type vector`** — pass the query as **plain text** and name the **source text column** (e.g. `title`). The server embeds the query at the same time, using the same provider that auto-embedded the column when the index was built — distance metric, model, and dimensions match automatically. No client-side embedding, no `OPENAI_API_KEY` required. Generated SQL: `vector_distance(col, 'text')`. -- **`--type bm25`** generates `bm25_search(table, col, 'text')` server-side; requires a BM25 index on the column. -- **No vector index on the column, or want a different embedding model?** `hotdata search` won't help — drop down to raw SQL via `hotdata query` (e.g. `SELECT *, cosine_distance(col, []) FROM ...`). See the SQL reference for available distance functions and table UDFs. -- BM25 results sort by score (descending). Vector results sort by distance (ascending). -- `--select` specifies which columns to return (comma-separated, defaults to all). -- Default limit is 10. -- **For BM25 search, create a BM25 index on the target column first (`hotdata indexes create ... --type bm25`). For vector search, create a vector index, optionally with auto-embedding on a text column.** -- The earlier `--model` flag and stdin-piped-vector path have both been removed. They hardcoded `l2_distance` regardless of the index's metric (silently wrong on cosine indexes). For client-side embedding or precomputed-vector workflows, use raw SQL via `hotdata query`. 
- -### Indexes - -Indexes attach to either a connection-table (`--connection-id` + `--schema` + `--table`) or a dataset (`--dataset-id`) — the two scopes are mutually exclusive for **create** / **delete**. **`indexes list`** supports three ways to scope (below). - -For **create**, `--type` is required (no default). - -``` -# List — default: all indexes on connection tables in the workspace (from information_schema; parallel fetch). -# Narrow the scan with any subset of: --connection-id (-c), --schema, --table. With all three set, uses one table API call. -# Dataset indexes are not included; use --dataset-id per dataset. -hotdata indexes list [--connection-id ] [--schema ] [--table
] [--workspace-id ] [--output table|json|yaml] -hotdata indexes list --dataset-id [--workspace-id ] [--output table|json|yaml] - -# Connection-table scope — create / delete -hotdata indexes create --connection-id --schema --table
\ - --name --columns --type sorted|bm25|vector \ - [--metric l2|cosine|dot] [--async] \ - [--embedding-provider-id ] [--dimensions ] [--output-column ] [--description ] -hotdata indexes delete --connection-id --schema --table
--name - -# Dataset scope — create / delete (same --dataset-id flag) -hotdata indexes create --dataset-id --name --columns --type sorted|bm25|vector ... -hotdata indexes delete --dataset-id --name -``` -- **`indexes list`:** With no `--dataset-id`, lists indexes on **connection** tables (workspace scan or filtered scan). **Dataset** indexes are listed only via `--dataset-id` (one dataset per invocation). -- `--type` accepts `sorted` (B-tree-like; range/exact lookups), `bm25` (full-text), or `vector` (similarity). It is **required** for **create**. -- `--type vector` requires exactly one column. -- `--async` submits index creation as a background job; poll with `hotdata jobs `. -- **Auto-embedding:** with `--type vector` on a **text** column, the server generates embeddings automatically. Pass `--embedding-provider-id` to pick a specific provider; if omitted, the first system provider is used. The generated column defaults to `{column}_embedding` (override with `--output-column`). - -### Embedding providers -``` -hotdata embedding-providers list [--workspace-id ] [--output table|json|yaml] -hotdata embedding-providers get [--workspace-id ] [--output table|json|yaml] -hotdata embedding-providers create --name --provider-type service|local \ - [--config ''] [--provider-api-key | --secret-name ] [--workspace-id ] -hotdata embedding-providers update [--name ] [--config ''] [--provider-api-key | --secret-name ] [--workspace-id ] [--output table|json|yaml] -hotdata embedding-providers delete [--workspace-id ] -``` -- System providers (e.g. `sys_emb_openai`) come pre-configured. `list` shows IDs to pass to `--embedding-provider-id`. -- `--provider-api-key` (the embedding service's own key, e.g. an OpenAI `sk-...`) auto-creates a managed secret. Pairs with `--provider-type`; named to avoid colliding with the global `--api-key` (Hotdata auth). `--secret-name` references an existing secret. Mutually exclusive. 
+To create a dataset from a saved query: **`hotdata datasets create --query-id `**. ### Jobs ``` @@ -419,7 +337,7 @@ hotdata jobs [--workspace-id ] [--output table|json|yaml] ### Agent skills (`skills`) -Bundled Markdown skills (**`hotdata`**, **`hotdata-geospatial`**) ship with the CLI release tarball. +Bundled Markdown skills (**`hotdata`**, **`hotdata-search`**, **`hotdata-analytics`**, **`hotdata-geospatial`**) ship with the CLI release tarball. ``` hotdata skills install [--project] diff --git a/skills/hotdata/references/DATA_MODEL.template.md b/skills/hotdata/references/DATA_MODEL.template.md index c55fd5c..6c1833e 100644 --- a/skills/hotdata/references/DATA_MODEL.template.md +++ b/skills/hotdata/references/DATA_MODEL.template.md @@ -58,7 +58,7 @@ Document safe join paths and caveats (fan-out, timing, different refresh cadence |-------|--------|--------------------------|--------------|-------| | | | | | | -_Use `hotdata indexes list` for connection tables across the workspace (add `-c` / `--schema` / `--table` to narrow), or per table with all three set; use `hotdata indexes list --dataset-id ` for uploaded datasets._ +_Use `hotdata indexes list` for connection tables (see **hotdata-search** skill). Record bm25/vector indexes here; sorted indexes for OLAP filters in **hotdata-analytics**._ ## Datasets (uploaded) diff --git a/skills/hotdata/references/MODEL_BUILD.md b/skills/hotdata/references/MODEL_BUILD.md index c43938e..9bdec6e 100644 --- a/skills/hotdata/references/MODEL_BUILD.md +++ b/skills/hotdata/references/MODEL_BUILD.md @@ -100,7 +100,7 @@ Note: - **Time** columns — event grain vs slowly changing dimensions. - **Facts vs dimensions** — for analytics-oriented workspaces. -When suggesting a new index, use the same connection/schema/table/column names as in `tables list` and the main skill’s `indexes create` examples. 
+When suggesting a new index, use the same connection/schema/table/column names as in `tables list` and **`hotdata-search`** / **`hotdata-analytics`** `indexes create` examples (bm25/vector vs sorted). --- diff --git a/skills/hotdata/references/WORKFLOWS.md b/skills/hotdata/references/WORKFLOWS.md index 8b5edf4..fbcf6f1 100644 --- a/skills/hotdata/references/WORKFLOWS.md +++ b/skills/hotdata/references/WORKFLOWS.md @@ -1,217 +1,197 @@ # Hotdata CLI workflows -Procedures for **Model**, **History**, **Chain**, **Indexes**, and **sandboxes with datasets** (see **Sandboxes and datasets**). These compose existing `hotdata` commands; they are not separate subcommands. - -**Notation:** **`context:`** (e.g. **`context:DATAMODEL`**, **`context:GLOSSARY`**) means the **workspace document** stored under that stem via the **context API**—not generic “data model” language and not local files except as `pull`/`push` transport. **CLI** still uses bare stems: `hotdata context show DATAMODEL`. - -## Where things live - -| Concept | Location | -|--------|----------| -| **Model** | **`context:DATAMODEL`** — workspace context API (`hotdata context list` then `show` / `pull` / `push` with `./DATAMODEL.md` in the project cwd only as the CLI file surface; **list before `show`** so missing `DATAMODEL` does not error). Never store workspace-specific model text inside agent skill directories. | -| **History** | `hotdata queries list` / `queries ` for query runs (execution history); `hotdata results list` / `results ` for row data. | -| **Chain** | Intermediate tables in **`datasets..
`** — usually **`datasets.main.*`** for workspace-wide materializations; **sandbox uploads** use **`datasets..*`** (see **Sandboxes and datasets** below). Document stable chains in **context:DATAMODEL** under **Derived tables (Chain)**. | -| **Indexes** | Recommendations and live objects in Hotdata (`indexes list` / `indexes create`). Record rationale in **context:DATAMODEL** (e.g. Search & index summary) or a dedicated **context:** stem if you split concerns. | +**Notation:** **`context:`** (e.g. **`context:DATAMODEL`**) means the workspace document stored via the **context API**—CLI uses bare stems: `hotdata context show DATAMODEL`. --- -## Model - -**Goal:** A markdown map of entities, keys, grain, and how connections relate—stored as **context:DATAMODEL** on top of the live **catalog** from Hotdata. +## Which skill? -### Initialize +Load **`hotdata`** first for auth and workspace setup. Add a sub-skill only when the task needs it. -1. Use [DATA_MODEL.template.md](DATA_MODEL.template.md) in this skill bundle as the **structure** for what you store as **context:DATAMODEL**. -2. Run **`hotdata context list`**. **Only if** `DATAMODEL` appears, you may use `hotdata context show DATAMODEL` or `pull` to hydrate `./DATAMODEL.md`. If it does **not** appear, start from the template only—**do not** run `show` (it exits 1). In the **project directory** where you run `hotdata`, create or refresh `./DATAMODEL.md`, fill workspace-specific sections as you discover schema, then **`hotdata context push DATAMODEL`** so the server owns **context:DATAMODEL**. -3. Agents that skip local files: **`context list`** first; **`context show DATAMODEL` only when listed** to read **context:DATAMODEL**; when updating, write `./DATAMODEL.md` then `hotdata context push DATAMODEL`. 
+| User goal | Skill | Key commands | +|-----------|--------|----------------| +| Login, workspaces, connections, tables, context, sandboxes | **`hotdata`** | `auth`, `workspaces`, `connections`, `tables`, `context`, `sandbox` | +| Upload CSV/JSON/URL or SQL-derived tables | **`hotdata`** | `datasets create`, `databases …` (see below) | +| SQL analytics, aggregations, history, Chain | **`hotdata-analytics`** | `query`, `queries`, `results`, `datasets create --sql` | +| BM25 / vector search, retrieval indexes | **`hotdata-search`** | `search`, `indexes create`, `embedding-providers` | +| Geospatial / PostGIS-style SQL | **`hotdata-geospatial`** | `query` with `ST_*`, WKB columns | -### Deep model pass (optional) +| Concept | Where documented | +|--------|------------------| +| **Model** | This file — [Model](#model) | +| **Upload path (datasets vs databases)** | This file — [Datasets vs managed databases](#datasets-vs-managed-databases) | +| **Sandboxes** | This file — [Sandboxes and datasets](#sandboxes-and-datasets) | +| **History / Chain** | **`hotdata-analytics`** — [WORKFLOWS.md](../../hotdata-analytics/references/WORKFLOWS.md) | +| **Search indexes** | **`hotdata-search`** — [INDEXES.md](../../hotdata-search/references/INDEXES.md) | +| **Epic flows** | This file — [Epic flows](#epic-flows) | -For a **full** catalog-style document—datasets, enrichment from connector or loader docs (e.g. dlt), relationships, search/index notes, and stricter documentation rules—follow **[MODEL_BUILD.md](MODEL_BUILD.md)**. Use it when the light template is not enough; skip it for small or fast-moving workspaces. - -### Refresh catalog facts (run from project root) +--- -When metadata may be **stale**, run `connections refresh` for affected connections **before** relying on `tables list` (same order as below). 
+## Epic flows -```bash -hotdata workspaces list -hotdata connections list -# For each connection you care about: -hotdata connections refresh # after DDL / stale metadata -hotdata tables list -hotdata tables list --connection-id -hotdata datasets list -hotdata datasets # schema detail per dataset -``` +End-to-end checklists. Use the linked sections for command detail and guardrails. -`datasets list` returns **every** dataset in the workspace (no sandbox-only filter). Use the **`FULL NAME`** column (`datasets..
`): **`main`** in the middle segment is the usual workspace catalog; a value like **`s_…`** is the **sandbox id** for sandbox-scoped datasets. +### Onboard a workspace -Use output to update **Connections**, **Tables**, **Columns**, and **Datasets** in **context:DATAMODEL** (edit via `./DATAMODEL.md` + `hotdata context push DATAMODEL`, or your editor workflow). Optional: small exploratory queries once names are known: +**Skill:** **`hotdata`** (optional **`hotdata-analytics`** for first queries) -```bash -hotdata query "SELECT * FROM ..
LIMIT 5" -``` +1. [ ] `hotdata auth login` (or `hotdata auth`) +2. [ ] `hotdata workspaces list` → `hotdata workspaces set` if not on the right workspace +3. [ ] `hotdata connections list` — note connection ids and names +4. [ ] (Optional) `hotdata connections create …` — see **`hotdata`** skill **Create a Connection** +5. [ ] `hotdata connections refresh ` if catalog may be stale +6. [ ] `hotdata tables list` and `hotdata tables list --connection-id ` for columns +7. [ ] (Optional) `hotdata context list` — if `DATAMODEL` is listed, `hotdata context show DATAMODEL`; else skip `show` +8. [ ] (Optional) Bootstrap **context:DATAMODEL** — [Model](#model), [DATA_MODEL.template.md](DATA_MODEL.template.md) -**Rule:** Use `hotdata tables list` for discovery; do not use `query` against `information_schema` for that (see main skill). +**Next:** upload data ([Datasets vs managed databases](#datasets-vs-managed-databases)) or run analytics (**Chain** below). ---- +### Chain (materialize then query) -## Sandboxes and datasets +**Skill:** **`hotdata-analytics`** (catalog via **`hotdata`**) -Use this when work is isolated in a **sandbox** (exploratory runs, ephemeral datasets). +1. [ ] Run base SQL: `hotdata query "SELECT …"` — poll `hotdata query status ` if async +2. [ ] Materialize one way: + - [ ] **Dataset:** `hotdata datasets create --label "…" --sql "SELECT …" [--table-name …]` + - [ ] **Managed DB:** `hotdata databases create --name … --table …` then `hotdata databases tables load … --file ./….parquet` +3. [ ] Copy **`full_name`** from create output (or `datasets list` **FULL NAME**) +4. [ ] Chain: `hotdata query "SELECT … FROM WHERE …"` +5. [ ] (Sandbox) Use `datasets..
` and active sandbox or `hotdata sandbox run …` +6. [ ] Record stable chains in **context:DATAMODEL** when they should outlive the session -**Active sandbox vs `sandbox run`:** After `hotdata sandbox new` or `hotdata sandbox set `, run **`hotdata datasets create`**, **`hotdata query`**, etc. **directly** — the CLI attaches the sandbox from saved config. **`hotdata sandbox run `** (no sandbox id before `run`) **always creates a new sandbox**; it does **not** reuse the active one. To wrap a command in an **existing** sandbox, use **`hotdata sandbox run [args…]`**. +**Detail:** [hotdata-analytics WORKFLOWS — Chain](../../hotdata-analytics/references/WORKFLOWS.md#chain) -**Qualified table names:** Workspace-wide dataset tables are typically **`datasets.main.`**. Datasets created **inside** a sandbox use **`datasets..`**, not `main`. After **`datasets create`**, use the printed **`full_name`**; after **`datasets list`**, use the **`FULL NAME`** column — do not assume `datasets.main` for sandbox data. +### Retrieval (index then search) -**Access:** Queries against sandbox-only tables need sandbox context: **active sandbox in config** (`sandbox set`) **or** commands run under **`hotdata sandbox run …`**. Otherwise you may see **access denied**. +**Skill:** **`hotdata-search`** (schema via **`hotdata`**) -**Listing:** `datasets list` does not filter by sandbox; use **`FULL NAME`** to distinguish `…main…` from `…s_…` rows. +1. [ ] `hotdata tables list --connection-id ` — pick text column (BM25) or embedding/text column (vector) +2. [ ] `hotdata indexes list` — avoid duplicate bm25/vector indexes on the same column +3. [ ] Create index: + - [ ] **Keyword:** `hotdata indexes create … --type bm25 --columns ` + - [ ] **Semantic:** `hotdata indexes create … --type vector --columns [--metric cosine|l2|dot]` + - [ ] Large build: add `--async`, then `hotdata jobs ` +4. 
[ ] Search: + - [ ] `hotdata search "…" --type bm25 --table --column ` + - [ ] `hotdata search "…" --type vector --table … --column ` +5. [ ] (Optional) Note indexes in **context:DATAMODEL → Search & index summary** -**SQL:** Column names from uploads that are not all-lowercase are **case-sensitive** in PostgreSQL; quote with double quotes (e.g. `"CustomerName"`). +**Detail:** [hotdata-search INDEXES.md](../../hotdata-search/references/INDEXES.md) --- -## History +## Datasets vs managed databases -**Goal:** Find prior work: query runs (execution history) and stored result rows. +Both land queryable tables in the workspace; the path depends on **format** and **how you want to name tables in SQL**. -### Query runs +| | **Datasets** | **Managed databases** | +|---|-------------|------------------------| +| **Best for** | CSV, JSON, URL import, stdin, SQL/query snapshot | Parquet files you own; catalog-style `name.schema.table` | +| **SQL prefix** | `datasets..
<table>` (often `datasets.main.*`) | `<database>.<schema>.<table>
` (database = connection name) | +| **CLI** | `hotdata datasets create` | `hotdata databases create` + `databases tables load` | +| **Declare schema up front** | No | Yes — `--table` on create (required before load on current API) | +| **Parquet** | Yes (`--file`, `--url`, `--upload-id`) | **Only** parquet on `tables load` | +| **Refresh upstream** | `datasets refresh` (URL/query sources) | Replace via `tables load` again | -```bash -hotdata queries list [--limit N] [--cursor ] [--status ] -hotdata queries -``` - -`queries list` returns recent executions with status, duration, row count, and a SQL preview (default limit 20). Filter with `--status` (e.g. `--status failed`). The detail view shows full timings, the `result_id` (if any), and the formatted SQL. +**Rule of thumb:** CSV/JSON or “upload a file from a URL” → **datasets**. Parquet catalog you control as **`mydb.public.orders`** → **databases**. -### Results +### Workflow: dataset upload and query -```bash -hotdata results list [--workspace-id ] [--limit N] [--offset N] -hotdata results [--workspace-id ] -``` - -Query footers include a `result-id` when applicable—record it for later, or pick it up from `queries `. **Prefer `hotdata results ` over re-running identical heavy SQL.** - ---- +1. Authenticate and set workspace (`hotdata auth`, `hotdata workspaces set` if needed). +2. Create the dataset (one source): -## Chain - -**Goal:** Follow-up analysis on a **bounded** intermediate without rescanning huge base tables. - -**Pattern:** materialize → query using the dataset’s **qualified name** (`datasets..
`). + ```bash + hotdata datasets create --label "Orders" --file ./orders.csv + # or: --url "https://example.com/orders.parquet" + # or: --sql "SELECT ..." # materialize from a query + ``` -1. **Base** — run SQL: +3. Note the printed **`full_name`** (e.g. `datasets.main.orders`) — do not assume `datasets.main`. +4. Inspect if needed: `hotdata datasets list`, `hotdata datasets `. +5. Query: ```bash - hotdata query "SELECT ..." + hotdata query "SELECT count(*) FROM datasets.main.orders" ``` - If the CLI returns a `query_run_id`, poll: +### Workflow: managed database (parquet) + +1. Create the database and **declare tables** up front: ```bash - hotdata query status + hotdata databases create --name sales --table orders --table customers ``` -2. **Materialize** — land a table in datasets (pick one): +2. Load parquet per table: ```bash - hotdata datasets create --label "chain revenue slice" --sql "SELECT ..." [--table-name chain_revenue_slice] - hotdata datasets create --label "from saved" --query-id [--table-name ...] + hotdata databases tables load sales orders --file ./orders.parquet ``` - Note the **`full_name`** line in the output (e.g. `datasets.main.chain_revenue_slice` or `datasets.s_….…` inside a sandbox). + If load fails with *not declared*, add `--table` at create time. There is no `--url` on load — download parquet locally first. -3. **Chain** — query the dataset using that **`full_name`** (or **`FULL NAME`** from `datasets list`); do not hardcode `datasets.main` if the schema segment is a sandbox id: +3. Confirm and query: ```bash - hotdata datasets list # FULL NAME column: datasets..
- hotdata query "SELECT * FROM datasets.main. WHERE ..." # workspace / no sandbox - # Sandbox example (use the actual full_name from create or list): - # hotdata query "SELECT * FROM datasets.s_ufmblmvq. WHERE ..." + hotdata databases tables list sales + hotdata query "SELECT count(*) FROM sales.public.orders" ``` - For **sandbox-scoped** chain tables, ensure an **active sandbox** (`sandbox set`) or run the query inside **`hotdata sandbox run hotdata query "…"`**. Quote mixed-case columns: e.g. `"Revenue"`. - -**Naming:** Prefer predictable `--table-name` values, e.g. `chain__`, and list long-lived chains in **context:DATAMODEL → Derived tables (Chain)** (record the **full** `datasets..
` you use in SQL). +For **Chain** materializations into datasets or databases, see **`hotdata-analytics`**. --- -## Indexes - -**Goal:** Find filters, joins, sorts, full-text, and vector access patterns that are **missing** indexes, then **create** them when the benefit is clear. - -### 1. Gather workload and schema +## Model -- **Query-run history** — Inspect recent runs for recurring `WHERE`, `JOIN`, `GROUP BY`, `ORDER BY`, and any use of full-text or vector access (e.g. SQL that calls `bm25_search`, or workloads you run via **`hotdata search`** — see main skill **Search**). +**Goal:** A markdown map of entities, keys, grain, and how connections relate—stored as **context:DATAMODEL** on top of the live **catalog** from Hotdata. - ```bash - hotdata queries list - hotdata queries - ``` +### Initialize -- **Table/column types** — Confirm columns exist and types fit the index you plan: +1. Use [DATA_MODEL.template.md](DATA_MODEL.template.md) as the **structure** for **context:DATAMODEL**. +2. Run **`hotdata context list`**. **Only if** `DATAMODEL` appears, use `show` or `pull`. If absent, start from the template—**do not** run `show` (exits 1). +3. Edit `./DATAMODEL.md` in the project directory, then **`hotdata context push DATAMODEL`**. - ```bash - hotdata tables list --connection-id - ``` +### Deep model pass (optional) -High-cardinality **text** columns (`title`, `body`, `description`, …) may warrant **bm25** if you use or plan text search. **Embedding** / list-of-float columns may warrant **vector** (+ `--metric`). Equality/range/sort on discrete fields often map to **sorted** (default index type)—confirm fit with your workload and product limits when in doubt. +Follow **[MODEL_BUILD.md](MODEL_BUILD.md)** for connector enrichment, per-table detail, and index/search notes in the data model. -### 2. Compare to existing indexes +### Refresh catalog facts -Start broad, then narrow: +When metadata may be **stale**, run `connections refresh` before `tables list`. 
After **`databases tables load`**, refresh is not required for the new table—use `databases tables list` or `tables list`. ```bash -# All indexes on connection tables in the workspace (optional: -c / --schema / --table to filter) -hotdata indexes list [--workspace-id ] -``` - -For a single table, or to avoid scanning the whole workspace: - -```bash -hotdata indexes list --connection-id --schema --table
[--workspace-id ] +hotdata workspaces list +hotdata connections list +hotdata connections refresh # after DDL / stale remote metadata +hotdata tables list +hotdata tables list --connection-id +hotdata datasets list +hotdata datasets +hotdata databases list ``` -Indexes on **uploaded datasets** are not included in that workspace scan — use `hotdata indexes list --dataset-id ` per dataset. - -Skip creating a duplicate: same table + overlapping columns + same purpose (e.g. another bm25 on the same column). +Use `hotdata tables list` for discovery; do not query `information_schema` for that. -### 3. Create indexes when justified - -Use stable names (e.g. `idx_
__`). Examples: - -```bash -# Sorted (default) — filters, joins, ordering on scalar columns -hotdata indexes create --connection-id --schema --table
\ - --name idx_orders_created --columns created_at --type sorted +--- -# BM25 — full-text on one text column (required for bm25_search on that column) -hotdata indexes create --connection-id --schema --table
\ - --name idx_posts_body_bm25 --columns body --type bm25 +## Sandboxes and datasets -# Vector — embeddings; requires --metric -hotdata indexes create --connection-id --schema --table
\ - --name idx_chunks_embedding --columns embedding --type vector --metric l2 -``` +Use this when work is isolated in a **sandbox** (exploratory runs, ephemeral datasets). -Large builds: add `--async` and track with **`hotdata jobs list`** / **`hotdata jobs `** (see main skill **Indexes** and **Jobs**). +**Active sandbox vs `sandbox run`:** After `sandbox new` or `sandbox set`, run **`datasets create`**, **`query`**, etc. **directly**. **`sandbox run `** (no id before `run`) **always creates a new sandbox**. -### 4. Verify +**Qualified names:** Workspace datasets → **`datasets.main.
<table>`**. Sandbox datasets → **`datasets.<sandbox_id>.<table>
`**. Use **`full_name`** from create or **FULL NAME** from `datasets list`. -Re-run representative **`hotdata query`** or **`hotdata search`** workloads. Update **context:DATAMODEL → Search & index summary** (`hotdata context push DATAMODEL` after editing `./DATAMODEL.md`) so future agents see what exists. +**Access:** Sandbox-only tables need active sandbox config or **`hotdata sandbox run …`**. -### Guardrails +**SQL:** Quote mixed-case columns with double quotes. -- Prefer **evidence** (repeated predicates, slow queries, or planned search) over speculative indexes. -- **Production:** get explicit approval before `indexes create` when impact or cost is uncertain. -- Align **connection id**, **schema**, and **table** with `hotdata tables list` output. +**Listing:** `datasets list` returns all workspace datasets; use **FULL NAME** to spot sandbox vs `main` rows. --- ## Cross-cutting -- **Workspace:** Use active workspace or `--workspace-id` when targeting a non-default workspace. -- **Sandboxes:** See **Sandboxes and datasets** above (`sandbox run` vs direct commands, `full_name`, access denied without context). -- **Jobs:** For async work (indexes, some refreshes), `hotdata jobs list` and `hotdata jobs `. +- **Workspace:** Active workspace or `--workspace-id`. **`hotdata queries`** uses the active workspace only (no `--workspace-id`). +- **Jobs:** `hotdata jobs list` / `jobs ` for async refreshes, dataset refresh, and index builds. +- **Discovery:** `hotdata tables list` — not `query` on `information_schema`. diff --git a/src/auth.rs b/src/auth.rs index e15ac16..d9d4e10 100644 --- a/src/auth.rs +++ b/src/auth.rs @@ -169,7 +169,16 @@ struct WsListResponse { workspaces: Vec } struct WsItem { public_id: String, name: String } /// Wait for the browser callback, verify state, and extract the authorization code. 
-fn receive_callback(server: &tiny_http::Server, expected_state: &str) -> Result { +/// +/// `success_title` and `success_body` are interpolated directly into HTML +/// without escaping. Callers **must** pass static, trusted strings — never +/// dynamic or user-supplied content. +fn receive_callback( + server: &tiny_http::Server, + expected_state: &str, + success_title: &str, + success_body: &str, +) -> Result { let request = server.recv().map_err(|e| format!("failed to receive callback: {e}"))?; let raw_url = request.url().to_string(); let params = parse_query_params(&raw_url); @@ -187,15 +196,16 @@ fn receive_callback(server: &tiny_http::Server, expected_state: &str) -> Result< } }; - let html = r#" + let html = format!( + r#" - Hotdata — Login Successful + Hotdata — {success_title} @@ -235,11 +245,12 @@ fn receive_callback(server: &tiny_http::Server, expected_state: &str) -> Result< -

Login successful

-

You're now authenticated with Hotdata.
You can close this tab and return to the terminal.

+

{success_title}

+

{success_body}

-"#; +"# + ); let response = tiny_http::Response::from_string(html).with_header( "Content-Type: text/html" .parse::() @@ -254,71 +265,53 @@ fn is_already_signed_in(profile_config: &config::ProfileConfig) -> bool { check_status(profile_config) == AuthStatus::Authenticated } -pub fn login() { - let profile_config = config::load("default").unwrap_or_default(); +/// Shared PKCE browser-handoff loop used by both `login` and `register`. +/// +/// 1. Generates PKCE params and starts the local loopback callback server. +/// 2. Calls `build_url(app_url, code_challenge, state, port)` to construct +/// the browser URL. +/// 3. Opens the browser and waits for the OAuth/registration callback. +/// 4. Calls `exchange(code, code_verifier, port)` to mint a JWT session. +/// 5. Saves the session, prints `success_print`, and displays the workspace. +fn run_browser_auth( + profile_config: &config::ProfileConfig, + opening_msg: &str, + waiting_msg: &str, + success_print: &str, + success_title: &str, + success_body: &str, + build_url: impl Fn(&str, &str, &str, u16) -> String, + exchange: impl Fn(&str, &str, u16) -> Result, +) { let app_url = profile_config.app_url.to_string(); - - // Check if already authenticated - if is_already_signed_in(&profile_config) { - println!("{}", "You are already signed in.".green()); - if !crate::util::is_interactive() { - return; - } - print!("Do you want to log in again? [y/N] "); - use std::io::Write; - std::io::stdout().flush().unwrap(); - let mut input = String::new(); - std::io::stdin().read_line(&mut input).unwrap(); - if !input.trim().eq_ignore_ascii_case("y") { - return; - } - } - let code_verifier = generate_code_verifier(); let code_challenge = generate_code_challenge(&code_verifier); let state = generate_random_string(32); - // Bind to port 0 so the OS picks an available port. DOT's consent - // page will redirect here with `?code=...&state=...`. 
let server = tiny_http::Server::http("127.0.0.1:0").expect("failed to start local callback server"); let port = server.server_addr().to_ip().unwrap().port(); - let redirect_uri = format!("http://127.0.0.1:{port}/"); - // DOT's `/o/authorize/` endpoint is mounted off the app URL (the - // browser-facing one; allauth session cookies live here). We send - // no `scope` parameter — the consent page picks permissions and - // workspace scope interactively, then composes the scope string - // server-side (see HotdataAllowForm). - let login_url = format!( - "{app_url}/o/authorize/\ - ?client_id=hotdata-cli\ - &response_type=code\ - &redirect_uri={redirect_uri}\ - &code_challenge={code_challenge}\ - &code_challenge_method=S256\ - &state={state}", - app_url = app_url.trim_end_matches('/'), - ); + let url = build_url(app_url.trim_end_matches('/'), &code_challenge, &state, port); - println!("Opening browser to log in..."); + println!("{opening_msg}"); stdout() .execute(Print("If your browser does not open, visit:\n ")) .unwrap() .execute(SetForegroundColor(Color::DarkGrey)) .unwrap() - .execute(Print(format!("{login_url}\n"))) + .execute(Print(format!("{url}\n"))) .unwrap() .execute(ResetColor) .unwrap(); - if let Err(e) = open::that(&login_url) { + if let Err(e) = open::that(&url) { eprintln!("failed to open browser: {e}"); } - println!("Waiting for login callback..."); + println!("{waiting_msg}"); - let code = match receive_callback(&server, &state) { + let code = match receive_callback(&server, &state, success_title, success_body) { Ok(c) => c, Err(e) => { eprintln!("error: {e}"); @@ -326,7 +319,7 @@ pub fn login() { } }; - match crate::jwt::mint_from_pkce_code(&profile_config, &code, &code_verifier, &redirect_uri) { + match exchange(&code, &code_verifier, port) { Ok(session) => { if let Err(e) = crate::jwt::save_session(&session) { eprintln!("warning: could not save session: {e}"); @@ -334,19 +327,23 @@ pub fn login() { stdout() .execute(SetForegroundColor(Color::Green)) 
.unwrap() - .execute(Print("Logged in successfully.\n")) + .execute(Print(format!("{success_print}\n"))) .unwrap() .execute(ResetColor) .unwrap(); - // Best-effort workspace cache using the freshly minted JWT. - // Fall back to the existing on-disk list if the fetch fails. - let workspaces = cache_workspaces(&profile_config, &session.access_token) - .unwrap_or(profile_config.workspaces); + let workspaces = cache_workspaces(profile_config, &session.access_token) + .unwrap_or_else(|_| profile_config.workspaces.clone()); match workspaces.first() { Some(w) => { - print_row("Workspace", &format!("{} {}", w.name.as_str().cyan(), format!("({})", w.public_id).dark_grey())); - print_row("", &"use 'hotdata workspaces set' to switch workspaces".dark_grey().to_string()); + print_row( + "Workspace", + &format!("{} {}", w.name.as_str().cyan(), format!("({})", w.public_id).dark_grey()), + ); + print_row( + "", + &"use 'hotdata workspaces set' to switch workspaces".dark_grey().to_string(), + ); } None => print_row("Workspace", &"None".dark_grey().to_string()), } @@ -358,6 +355,90 @@ pub fn login() { } } +pub fn login() { + let profile_config = config::load("default").unwrap_or_default(); + + if is_already_signed_in(&profile_config) { + println!("{}", "You are already signed in.".green()); + if !crate::util::is_interactive() { + return; + } + print!("Do you want to log in again? [y/N] "); + use std::io::Write; + std::io::stdout().flush().unwrap(); + let mut input = String::new(); + std::io::stdin().read_line(&mut input).unwrap(); + if !input.trim().eq_ignore_ascii_case("y") { + return; + } + } + + // DOT's `/o/authorize/` endpoint is mounted off the app URL (the + // browser-facing one; allauth session cookies live here). We send + // no `scope` parameter — the consent page picks permissions and + // workspace scope interactively, then composes the scope string + // server-side (see HotdataAllowForm). 
+ run_browser_auth( + &profile_config, + "Opening browser to log in...", + "Waiting for login callback...", + "Logged in successfully.", + "Login successful", + "You're now authenticated with Hotdata.
You can close this tab and return to the terminal.", + |app_url, code_challenge, state, port| { + let redirect_uri = format!("http://127.0.0.1:{port}/"); + format!( + "{app_url}/o/authorize/\ + ?client_id=hotdata-cli\ + &response_type=code\ + &redirect_uri={redirect_uri}\ + &code_challenge={code_challenge}\ + &code_challenge_method=S256\ + &state={state}" + ) + }, + |code, code_verifier, port| { + let redirect_uri = format!("http://127.0.0.1:{port}/"); + crate::jwt::mint_from_pkce_code(&profile_config, code, code_verifier, &redirect_uri) + }, + ); +} + +pub fn register(use_email: bool) { + let profile_config = config::load("default").unwrap_or_default(); + + if is_already_signed_in(&profile_config) { + println!( + "{}", + "You are already signed in. Use 'hotdata auth login' to log in with a different account.".green() + ); + return; + } + + let method = if use_email { "email" } else { "github" }; + run_browser_auth( + &profile_config, + "Opening browser to create your account...", + "Waiting for account setup to complete...", + "Account created and logged in.", + "Account created", + "Your Hotdata account is ready.
You can close this tab and return to the terminal.", + |app_url, code_challenge, state, port| { + format!( + "{app_url}/auth/cli-register/\ + ?code_challenge={code_challenge}\ + &code_challenge_method=S256\ + &state={state}\ + &callback_port={port}\ + &method={method}" + ) + }, + |code, code_verifier, _port| { + crate::jwt::exchange_cli_register_code(&profile_config, code, code_verifier) + }, + ); +} + /// Fetch workspaces with a freshly minted JWT and cache them in config. /// Returns the freshly fetched list so callers can display it without /// having to reload config from disk. @@ -650,7 +731,7 @@ mod tests { .unwrap(); }); - let result = receive_callback(&server, "expected-state"); + let result = receive_callback(&server, "expected-state", "", ""); handle.join().unwrap(); assert_eq!(result.unwrap(), "test-auth-code"); @@ -670,7 +751,7 @@ mod tests { .send(); }); - let result = receive_callback(&server, "expected-state"); + let result = receive_callback(&server, "expected-state", "", ""); handle.join().unwrap(); assert!(result.is_err()); @@ -754,7 +835,7 @@ mod tests { .send(); }); - let result = receive_callback(&server, "expected-state"); + let result = receive_callback(&server, "expected-state", "", ""); handle.join().unwrap(); assert!(result.is_err()); diff --git a/src/command.rs b/src/command.rs index 2480ce8..0e4fca4 100644 --- a/src/command.rs +++ b/src/command.rs @@ -274,6 +274,13 @@ pub enum AuthCommands { /// Log in via browser (same as `hotdata auth` with no subcommand) Login, + /// Create a new account via browser (defaults to GitHub OAuth) + Register { + /// Sign up with email and password instead of GitHub + #[arg(long)] + email: bool, + }, + /// Remove authentication for a profile Logout, diff --git a/src/jwt.rs b/src/jwt.rs index 2f30930..c7de483 100644 --- a/src/jwt.rs +++ b/src/jwt.rs @@ -153,6 +153,49 @@ fn redacted_form_body(params: &[(&str, &str)]) -> serde_json::Value { /// body so the caller can still parse real values out of it. 
const TOKEN_REDACT_KEYS: &[&str] = &["access_token", "refresh_token"]; +/// Exchange a CLI registration PKCE code for a session. +/// +/// The `/auth/cli-register/` flow issues a short-lived `CLIAuthCode` (not a +/// full OAuth code). This function POSTs it to `/v1/auth/token` to get an +/// opaque API token, then immediately mints a full JWT session via +/// `mint_from_api_token` so the on-disk state is identical to a normal login. +pub fn exchange_cli_register_code( + profile: &config::ProfileConfig, + code: &str, + code_verifier: &str, +) -> Result { + let url = format!("{}/v1/auth/token", oauth_base(profile)); + let body = serde_json::json!({ "code": code, "code_verifier": code_verifier }); + let body_log = serde_json::json!({ + "code": util::mask_credential(code), + "code_verifier": util::mask_credential(code_verifier), + }); + + let client = reqwest::blocking::Client::new(); + let req = client.post(&url).json(&body); + let (status, body_text) = util::send_debug_with_redaction( + &client, + req, + Some(&body_log), + &["token"], + ) + .map_err(|e| format!("connection error: {e}"))?; + if !status.is_success() { + return Err(format!( + "registration token exchange failed: HTTP {status}: {body_text}" + )); + } + + #[derive(Deserialize)] + struct RegisterResponse { + token: String, + } + let resp: RegisterResponse = serde_json::from_str(&body_text) + .map_err(|e| format!("malformed token response: {e}"))?; + + mint_from_api_token(profile, &resp.token) +} + /// Exchange a PKCE authorization code for a session. pub fn mint_from_pkce_code( profile: &config::ProfileConfig, @@ -501,6 +544,79 @@ mod tests { assert!(err.contains("connection"), "got: {err}"); } + // --- exchange_cli_register_code --- + + #[test] + fn exchange_cli_register_code_success() { + let mut server = mockito::Server::new(); + // Step 1: exchange the PKCE code for an opaque API token. 
+ let token_mock = server + .mock("POST", "/v1/auth/token") + .match_body(mockito::Matcher::Json(serde_json::json!({ + "code": "reg-code", + "code_verifier": "verifier", + }))) + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"token":"hd_tok"}"#) + .create(); + // Step 2: mint_from_api_token exchanges the opaque token for a JWT. + let mint_mock = server + .mock("POST", "/o/token/") + .match_body(mockito::Matcher::AllOf(vec![ + mockito::Matcher::UrlEncoded("grant_type".into(), "api_token".into()), + mockito::Matcher::UrlEncoded("api_token".into(), "hd_tok".into()), + ])) + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"access_token":"jwt-abc","expires_in":300,"refresh_token":"r"}"#) + .create(); + + let profile = mock_profile(&server.url()); + let session = exchange_cli_register_code(&profile, "reg-code", "verifier").unwrap(); + token_mock.assert(); + mint_mock.assert(); + assert_eq!(session.access_token, "jwt-abc"); + assert_eq!(session.source, "api_token"); + } + + #[test] + fn exchange_cli_register_code_http_error() { + let mut server = mockito::Server::new(); + let m = server + .mock("POST", "/v1/auth/token") + .with_status(401) + .with_body("invalid code") + .create(); + + let profile = mock_profile(&server.url()); + let err = exchange_cli_register_code(&profile, "bad-code", "v").unwrap_err(); + m.assert(); + assert!(err.contains("401"), "got: {err}"); + } + + #[test] + fn exchange_cli_register_code_malformed_response() { + let mut server = mockito::Server::new(); + let m = server + .mock("POST", "/v1/auth/token") + .with_status(200) + .with_body("not json") + .create(); + + let profile = mock_profile(&server.url()); + let err = exchange_cli_register_code(&profile, "code", "v").unwrap_err(); + m.assert(); + assert!(err.contains("malformed"), "got: {err}"); + } + + #[test] + fn exchange_cli_register_code_connection_error() { + let profile = mock_profile("http://127.0.0.1:1"); + let 
err = exchange_cli_register_code(&profile, "code", "v").unwrap_err(); + assert!(err.contains("connection"), "got: {err}"); + } + // --- mint_from_api_token --- #[test] diff --git a/src/main.rs b/src/main.rs index ca20713..5766fdc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -158,6 +158,7 @@ fn main() { Some(cmd) => match cmd { Commands::Auth { command } => match command { None | Some(AuthCommands::Login) => auth::login(), + Some(AuthCommands::Register { email }) => auth::register(email), Some(AuthCommands::Status) => auth::status("default"), Some(AuthCommands::Logout) => auth::logout("default"), }, diff --git a/src/skill.rs b/src/skill.rs index 1cacd11..700d00e 100644 --- a/src/skill.rs +++ b/src/skill.rs @@ -6,7 +6,12 @@ use std::path::PathBuf; const REPO: &str = "hotdata-dev/hotdata-cli"; const PRIMARY_SKILL_NAME: &str = "hotdata"; -const SKILL_NAMES: &[&str] = &["hotdata", "hotdata-geospatial"]; +const SKILL_NAMES: &[&str] = &[ + "hotdata", + "hotdata-search", + "hotdata-analytics", + "hotdata-geospatial", +]; const CURRENT_VERSION: &str = env!("CARGO_PKG_VERSION"); /// Agent root directories to check for symlink installation.