From 56a01ad2d1b0dde0eea4259ce0d7b7679f2ab831 Mon Sep 17 00:00:00 2001 From: Denys Fedoryshchenko Date: Thu, 21 May 2026 23:52:23 +0300 Subject: [PATCH] Add API token validation at startup We need to validate at startup if we have all needed permission. This is additional validation to reduce headache of troubleshooting, if there is any mistake in permissions and node is not being updated. Signed-off-by: Denys Fedoryshchenko --- src/kernel_ci_cloud_labs/pull_labs_poller.py | 78 ++++++++++++++++++++ tests/test_pull_labs_poller.py | 68 ++++++++++++++++- 2 files changed, 144 insertions(+), 2 deletions(-) diff --git a/src/kernel_ci_cloud_labs/pull_labs_poller.py b/src/kernel_ci_cloud_labs/pull_labs_poller.py index f5c2690..121bd5e 100644 --- a/src/kernel_ci_cloud_labs/pull_labs_poller.py +++ b/src/kernel_ci_cloud_labs/pull_labs_poller.py @@ -141,6 +141,80 @@ def _http_put_json( return json.loads(resp_body) if resp_body else None +def _validate_api_token( + api_base_uri: str, api_token: Optional[str], runtime_name: str +) -> None: + """Startup preflight: confirm api_token authenticates and can edit nodes. + + Calls GET /whoami once and logs the outcome. Never fatal -- a transient + API error must not stop the poller from starting, and node updates have + their own per-call error handling. It surfaces, at startup, the two + failure modes that otherwise only show up as a 401 on every job: + * the token does not authenticate at all (no token / invalid / expired); + * the token authenticates but the user cannot edit job nodes for this + runtime (kernelci-api _user_can_edit_node). + """ + if not api_token: + logger.warning( + "No kernelci-api token set (KERNELCI_API_TOKEN / UNIFIED_TOKEN / " + "config kernelci.api_token) -- node claim/finish updates will " + "fail with HTTP 401" + ) + return + + url = f"{api_base_uri.rstrip('/')}/whoami" + try: + whoami = _http_get_json(url, token=api_token) or {} + except urllib.error.HTTPError as e: + if e.code in (401, 403): + logger.error( + "kernelci-api token rejected by %s (HTTP %s) -- the token is " + "invalid, expired, or not a kernelci-api token; node updates " + "will fail", + url, e.code, + ) + else: + logger.warning( + "Could not validate kernelci-api token via %s: HTTP %s", + url, e.code, + ) + return + except (urllib.error.URLError, json.JSONDecodeError) as e: + logger.warning( + "Could not reach %s to validate the kernelci-api token (%s) -- " + "continuing; node updates will be retried per job", + url, e, + ) + return + + username = whoami.get("username") or whoami.get("email") or "" + is_superuser = bool(whoami.get("is_superuser")) + groups = { + g.get("name") + for g in whoami.get("groups", []) + if isinstance(g, dict) and g.get("name") + } + # Groups that let a user edit a job node it does not own + # (kernelci-api _user_can_edit_node). + editor_groups = { + "node:edit:any", + f"runtime:{runtime_name}:node-editor", + f"runtime:{runtime_name}:node-admin", + } + logger.info( + "kernelci-api token OK: user=%s superuser=%s groups=%s", + username, is_superuser, sorted(groups) or [], + ) + if not is_superuser and not (groups & editor_groups): + logger.warning( + "kernelci-api user %s cannot edit job nodes for runtime '%s': " + "not a superuser and in none of %s -- node claim/finish updates " + "will fail with HTTP 401 unless the user owns the nodes. Add the " + "user to group 'runtime:%s:node-editor'.", + username, runtime_name, sorted(editor_groups), runtime_name, + ) + + # --------------------------------------------------------------------------- # Cursor persistence — generic filesystem backend by default. # A deployment can swap in a custom CursorStore (e.g. backed by S3) by @@ -392,6 +466,10 @@ def __init__( if job_executor is None: _validate_default_executor_deps() + # Startup preflight: surface a bad/under-privileged kernelci-api token + # now, rather than as a 401 on every job's claim/finish update. + _validate_api_token(self.api_base_uri, self.api_token, self.runtime_name) + # -- Credential resolution ------------------------------------------- @staticmethod diff --git a/tests/test_pull_labs_poller.py b/tests/test_pull_labs_poller.py index 1e4a1b5..75bc2e9 100644 --- a/tests/test_pull_labs_poller.py +++ b/tests/test_pull_labs_poller.py @@ -6,6 +6,7 @@ """Unit tests for pull_labs_poller (no network, no AWS).""" import json +import logging import os import tempfile import urllib.error @@ -28,9 +29,10 @@ _GET = "kernel_ci_cloud_labs.pull_labs_poller._http_get_json" _PUT = "kernel_ci_cloud_labs.pull_labs_poller._http_put_json" -# Capture the real validator at import time so a specific test can restore it -# after the autouse fixture has stubbed it out. +# Capture the real validators at import time so a specific test can call them +# after the autouse fixtures have stubbed them out. _REAL_VALIDATE_DEFAULT_EXECUTOR_DEPS = poller_mod._validate_default_executor_deps +_REAL_VALIDATE_API_TOKEN = poller_mod._validate_api_token # --------------------------------------------------------------------------- @@ -60,6 +62,16 @@ def _skip_default_executor_deps_check(monkeypatch): monkeypatch.setattr(poller_mod, "_validate_default_executor_deps", lambda: None) +@pytest.fixture(autouse=True) +def _skip_api_token_check(monkeypatch): + """Bypass the startup /whoami token preflight (no network in unit tests). + + Dedicated tests call the real _validate_api_token via the captured + reference with _http_get_json patched. + """ + monkeypatch.setattr(poller_mod, "_validate_api_token", lambda *a, **k: None) + + def _minimal_kc(**overrides): base = { "api_base_uri": "https://api.example/latest", @@ -554,3 +566,55 @@ def _fail_if_called(): # Custom executor — validator must be skipped, no SystemExit. PullLabsPoller(_minimal_kc(), job_executor=lambda cfg: ([], None)) assert called["validator"] is False + + +# --------------------------------------------------------------------------- +# Startup /whoami token preflight +# --------------------------------------------------------------------------- + + +class TestValidateApiToken: + """_validate_api_token() -- never fatal, logs token validity and groups.""" + + URI = "https://api.example/latest" + RUNTIME = "pull-labs-aws-ec2" + + def test_no_token_warns(self, caplog): + with caplog.at_level(logging.WARNING): + _REAL_VALIDATE_API_TOKEN(self.URI, None, self.RUNTIME) + assert "No kernelci-api token" in caplog.text + + def test_401_logs_error(self, caplog): + err = urllib.error.HTTPError(self.URI, 401, "Unauthorized", {}, None) + with patch(_GET, side_effect=err), caplog.at_level(logging.ERROR): + _REAL_VALIDATE_API_TOKEN(self.URI, "bad-token", self.RUNTIME) + assert "rejected" in caplog.text + + def test_network_error_is_not_fatal(self): + # A transient API error must not raise -- it cannot block startup. + with patch(_GET, side_effect=urllib.error.URLError("boom")): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + + def test_valid_token_with_editor_group(self, caplog): + whoami = { + "username": "pullbot", + "groups": [{"name": "runtime:pull-labs-aws-ec2:node-editor"}], + } + with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "token OK" in caplog.text + assert "cannot edit" not in caplog.text + + def test_superuser_token_ok(self, caplog): + whoami = {"username": "root", "is_superuser": True, "groups": []} + with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "cannot edit" not in caplog.text + + def test_valid_token_without_editor_group_warns(self, caplog): + whoami = {"username": "pullbot", "groups": [{"name": "some-other-group"}]} + with patch(_GET, return_value=whoami), caplog.at_level(logging.WARNING): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "cannot edit job nodes" in caplog.text + # The required group is named in the hint. + assert "runtime:pull-labs-aws-ec2:node-editor" in caplog.text