From da1d0acc8cd4c9a211d39f5210e84725afa6d623 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 30 May 2026 15:07:39 -0400 Subject: [PATCH] fix(waterdata): raise RequestTooLarge for an unchunkable over-budget request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChunkPlan's "no chunkable axes" branch returned immediately without sizing the request, deliberately leaving an over-budget URL for the server to reject. So a single large CQL-text `filter` with one big `IN (...)` clause (no top-level `OR`, hence no chunk axis) was shipped verbatim and failed with an opaque HTTP 414 — and not even RequestTooLarge. (The equivalent monitoring_location_id=[...] chunks fine.) Size-check the no-axes path: if the single request fits, pass through as before; if it's over budget there's nothing to split, so raise RequestTooLarge with actionable guidance instead of shipping it. Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/waterdata/chunking.py | 21 +++++++++++++++------ tests/waterdata_chunking_test.py | 17 +++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index ab079070..04a25171 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -871,13 +871,22 @@ def __init__( self.canonical_url: str | None = None axes = _extract_axes(args) - # No chunkable axes → skip ``build_request`` entirely; the - # common Water Data call shape shouldn't pay for an unused - # request prep on the passthrough hot path. The fetcher - # will run with the user's args verbatim; if that produces - # an over-budget URL, the server (or httpx itself) rejects. if not axes: - return + # No chunkable axis: nothing to split. If the single request fits, + # run it verbatim (the common passthrough). If it's over budget we + # can't chunk it (e.g. a single large CQL ``IN`` clause with no + # top-level ``OR``, or one oversized value), so raise an actionable + # error instead of shipping it for the server to reject with an + # opaque HTTP 414. ``_safe_request_bytes`` treats an un-constructable + # URL (httpx.InvalidURL, > 64 KB) as over budget. + if _safe_request_bytes(build_request, args, url_limit) <= url_limit: + return + raise RequestTooLarge( + f"Request exceeds {url_limit} bytes (URL + body) and has no " + f"chunkable multi-value argument to split (e.g. a single large " + f"CQL `IN` clause, or one oversized value). Narrow the query, " + f"simplify the filter, or split the call manually." + ) # Constructing the initial request can itself trip # ``httpx.InvalidURL`` (URL > 64 KB) — that's the canonical diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index a435a4de..98ed70c9 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -161,15 +161,24 @@ def test_extract_axes_skips_singletons_and_never_chunk_params(): def test_chunk_plan_returns_passthrough_when_no_chunkable_axes(): - """Scalar args with nothing to chunk → passthrough, even at a - URL limit the request technically exceeds (the server may 414, - but ``ChunkPlan`` has nothing to split).""" + """Scalar args with nothing to chunk and a request within the limit → + passthrough (no axes).""" args = {"monitoring_location_id": "scalar-only"} - plan = ChunkPlan(args, _fake_build, url_limit=10) + plan = ChunkPlan(args, _fake_build, url_limit=1000) assert plan.axes == [] assert plan.total == 1 +def test_chunk_plan_raises_when_unchunkable_request_exceeds_limit(): + """A request with nothing to chunk that still exceeds the byte limit (e.g. + a single large CQL ``IN`` clause with no top-level ``OR``) raises + RequestTooLarge instead of being shipped for the server to reject with an + opaque HTTP 414.""" + args = {"monitoring_location_id": "scalar-only"} + with pytest.raises(RequestTooLarge): + ChunkPlan(args, _fake_build, url_limit=10) + + def test_chunk_plan_greedy_halving_targets_largest_axis_chunk(): """The biggest chunk across all axes halves first — when one list axis dominates URL bytes, only it gets split until it stops being