diff --git a/src/kernel_ci_cloud_labs/pull_labs_poller.py b/src/kernel_ci_cloud_labs/pull_labs_poller.py index f5c2690..121bd5e 100644 --- a/src/kernel_ci_cloud_labs/pull_labs_poller.py +++ b/src/kernel_ci_cloud_labs/pull_labs_poller.py @@ -141,6 +141,80 @@ def _http_put_json( return json.loads(resp_body) if resp_body else None +def _validate_api_token( + api_base_uri: str, api_token: Optional[str], runtime_name: str +) -> None: + """Startup preflight: confirm api_token authenticates and can edit nodes. + + Calls GET /whoami once and logs the outcome. Never fatal -- a transient + API error must not stop the poller from starting, and node updates have + their own per-call error handling. It surfaces, at startup, the two + failure modes that otherwise only show up as a 401 on every job: + * the token does not authenticate at all (no token / invalid / expired); + * the token authenticates but the user cannot edit job nodes for this + runtime (kernelci-api _user_can_edit_node). + """ + if not api_token: + logger.warning( + "No kernelci-api token set (KERNELCI_API_TOKEN / UNIFIED_TOKEN / " + "config kernelci.api_token) -- node claim/finish updates will " + "fail with HTTP 401" + ) + return + + url = f"{api_base_uri.rstrip('/')}/whoami" + try: + whoami = _http_get_json(url, token=api_token) or {} + except urllib.error.HTTPError as e: + if e.code in (401, 403): + logger.error( + "kernelci-api token rejected by %s (HTTP %s) -- the token is " + "invalid, expired, or not a kernelci-api token; node updates " + "will fail", + url, e.code, + ) + else: + logger.warning( + "Could not validate kernelci-api token via %s: HTTP %s", + url, e.code, + ) + return + except (urllib.error.URLError, json.JSONDecodeError) as e: + logger.warning( + "Could not reach %s to validate the kernelci-api token (%s) -- " + "continuing; node updates will be retried per job", + url, e, + ) + return + + username = whoami.get("username") or whoami.get("email") or "" + is_superuser = bool(whoami.get("is_superuser")) + groups = { + g.get("name") + for g in whoami.get("groups", []) + if isinstance(g, dict) and g.get("name") + } + # Groups that let a user edit a job node it does not own + # (kernelci-api _user_can_edit_node). + editor_groups = { + "node:edit:any", + f"runtime:{runtime_name}:node-editor", + f"runtime:{runtime_name}:node-admin", + } + logger.info( + "kernelci-api token OK: user=%s superuser=%s groups=%s", + username, is_superuser, sorted(groups) or [], + ) + if not is_superuser and not (groups & editor_groups): + logger.warning( + "kernelci-api user %s cannot edit job nodes for runtime '%s': " + "not a superuser and in none of %s -- node claim/finish updates " + "will fail with HTTP 401 unless the user owns the nodes. Add the " + "user to group 'runtime:%s:node-editor'.", + username, runtime_name, sorted(editor_groups), runtime_name, + ) + + # --------------------------------------------------------------------------- # Cursor persistence — generic filesystem backend by default. # A deployment can swap in a custom CursorStore (e.g. backed by S3) by @@ -392,6 +466,10 @@ def __init__( if job_executor is None: _validate_default_executor_deps() + # Startup preflight: surface a bad/under-privileged kernelci-api token + # now, rather than as a 401 on every job's claim/finish update. + _validate_api_token(self.api_base_uri, self.api_token, self.runtime_name) + # -- Credential resolution ------------------------------------------- @staticmethod diff --git a/tests/test_pull_labs_poller.py b/tests/test_pull_labs_poller.py index 1e4a1b5..75bc2e9 100644 --- a/tests/test_pull_labs_poller.py +++ b/tests/test_pull_labs_poller.py @@ -6,6 +6,7 @@ """Unit tests for pull_labs_poller (no network, no AWS).""" import json +import logging import os import tempfile import urllib.error @@ -28,9 +29,10 @@ _GET = "kernel_ci_cloud_labs.pull_labs_poller._http_get_json" _PUT = "kernel_ci_cloud_labs.pull_labs_poller._http_put_json" -# Capture the real validator at import time so a specific test can restore it -# after the autouse fixture has stubbed it out. +# Capture the real validators at import time so a specific test can call them +# after the autouse fixtures have stubbed them out. _REAL_VALIDATE_DEFAULT_EXECUTOR_DEPS = poller_mod._validate_default_executor_deps +_REAL_VALIDATE_API_TOKEN = poller_mod._validate_api_token # --------------------------------------------------------------------------- @@ -60,6 +62,16 @@ def _skip_default_executor_deps_check(monkeypatch): monkeypatch.setattr(poller_mod, "_validate_default_executor_deps", lambda: None) +@pytest.fixture(autouse=True) +def _skip_api_token_check(monkeypatch): + """Bypass the startup /whoami token preflight (no network in unit tests). + + Dedicated tests call the real _validate_api_token via the captured + reference with _http_get_json patched. + """ + monkeypatch.setattr(poller_mod, "_validate_api_token", lambda *a, **k: None) + + def _minimal_kc(**overrides): base = { "api_base_uri": "https://api.example/latest", @@ -554,3 +566,55 @@ def _fail_if_called(): # Custom executor — validator must be skipped, no SystemExit. PullLabsPoller(_minimal_kc(), job_executor=lambda cfg: ([], None)) assert called["validator"] is False + + +# --------------------------------------------------------------------------- +# Startup /whoami token preflight +# --------------------------------------------------------------------------- + + +class TestValidateApiToken: + """_validate_api_token() -- never fatal, logs token validity and groups.""" + + URI = "https://api.example/latest" + RUNTIME = "pull-labs-aws-ec2" + + def test_no_token_warns(self, caplog): + with caplog.at_level(logging.WARNING): + _REAL_VALIDATE_API_TOKEN(self.URI, None, self.RUNTIME) + assert "No kernelci-api token" in caplog.text + + def test_401_logs_error(self, caplog): + err = urllib.error.HTTPError(self.URI, 401, "Unauthorized", {}, None) + with patch(_GET, side_effect=err), caplog.at_level(logging.ERROR): + _REAL_VALIDATE_API_TOKEN(self.URI, "bad-token", self.RUNTIME) + assert "rejected" in caplog.text + + def test_network_error_is_not_fatal(self): + # A transient API error must not raise -- it cannot block startup. + with patch(_GET, side_effect=urllib.error.URLError("boom")): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + + def test_valid_token_with_editor_group(self, caplog): + whoami = { + "username": "pullbot", + "groups": [{"name": "runtime:pull-labs-aws-ec2:node-editor"}], + } + with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "token OK" in caplog.text + assert "cannot edit" not in caplog.text + + def test_superuser_token_ok(self, caplog): + whoami = {"username": "root", "is_superuser": True, "groups": []} + with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "cannot edit" not in caplog.text + + def test_valid_token_without_editor_group_warns(self, caplog): + whoami = {"username": "pullbot", "groups": [{"name": "some-other-group"}]} + with patch(_GET, return_value=whoami), caplog.at_level(logging.WARNING): + _REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME) + assert "cannot edit job nodes" in caplog.text + # The required group is named in the hint. + assert "runtime:pull-labs-aws-ec2:node-editor" in caplog.text