Commit ba96199

🚧 added initial http error handling
1 parent 53c3a67 commit ba96199

7 files changed

Lines changed: 481 additions & 34 deletions

nemoguardrails/actions/llm/utils.py

Lines changed: 33 additions & 3 deletions
@@ -26,7 +26,7 @@
     reasoning_trace_var,
     tool_calls_var,
 )
-from nemoguardrails.exceptions import LLMCallException
+from nemoguardrails.exceptions import LLMCallException, LLMClientError
 from nemoguardrails.logging.explain import LLMCallInfo
 from nemoguardrails.logging.llm_tracker import track_llm_call
 from nemoguardrails.types import ChatMessage, LLMModel, LLMResponse, LLMResponseChunk, UsageInfo
@@ -311,6 +311,34 @@ def _update_token_stats_from_chunk(chunk: LLMResponseChunk) -> None:
         llm_stats.inc("total_completion_tokens", chunk.usage.output_tokens)


+def _extract_http_status(exception: BaseException) -> Optional[int]:
+    """Extract an HTTP status code from a provider exception, if present.
+
+    Checks, in order:
+    1. ``LLMClientError.status_code`` (NeMo Guardrails client layer).
+    2. ``exception.status_code`` (OpenAI SDK, httpx).
+    3. ``exception.response.status_code`` (requests-style wrappers).
+
+    Returns ``None`` when no status can be determined or when the status
+    is ``0`` (used by ``LLMTimeoutError`` / ``LLMConnectionError`` for
+    client-side failures where no HTTP response was received).
+    """
+    if isinstance(exception, LLMClientError):
+        return exception.status_code if exception.status_code > 0 else None
+
+    status = getattr(exception, "status_code", None)
+    if isinstance(status, int) and status > 0:
+        return status
+
+    response = getattr(exception, "response", None)
+    if response is not None:
+        status = getattr(response, "status_code", None)
+        if isinstance(status, int) and status > 0:
+            return status
+
+    return None
+
+
 def _raise_llm_call_exception(
     exception: Exception,
     model: LLMModel,
@@ -328,11 +356,13 @@ def _raise_llm_call_exception(
     if endpoint_url:
         context_parts.append(f"endpoint={endpoint_url}")

+    status = _extract_http_status(exception)
+
     if context_parts:
         detail = f"Error invoking LLM ({', '.join(context_parts)})"
-        raise LLMCallException(exception, detail=detail) from exception
+        raise LLMCallException(exception, detail=detail, status=status) from exception
     else:
-        raise LLMCallException(exception) from exception
+        raise LLMCallException(exception, status=status) from exception


 def _store_reasoning_traces(response: LLMResponse) -> None:
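A standalone sketch of the lookup chain above, using hypothetical stand-in exception classes instead of real provider SDK errors (the LLMClientError fast path behaves the same way via its status_code):

from typing import Optional


class FakeSDKError(Exception):
    """Stand-in for an OpenAI/httpx-style error exposing .status_code directly."""

    def __init__(self, status_code: int):
        super().__init__(f"HTTP {status_code}")
        self.status_code = status_code


class FakeResponse:
    def __init__(self, status_code: int):
        self.status_code = status_code


class FakeRequestsError(Exception):
    """Stand-in for a requests-style wrapper exposing .response.status_code."""

    def __init__(self, response: FakeResponse):
        super().__init__(f"HTTP {response.status_code}")
        self.response = response


def extract_http_status(exception: BaseException) -> Optional[int]:
    # Mirrors _extract_http_status above, minus the LLMClientError fast path.
    status = getattr(exception, "status_code", None)
    if isinstance(status, int) and status > 0:
        return status
    response = getattr(exception, "response", None)
    if response is not None:
        status = getattr(response, "status_code", None)
        if isinstance(status, int) and status > 0:
            return status
    return None


assert extract_http_status(FakeSDKError(429)) == 429
assert extract_http_status(FakeSDKError(0)) is None  # status 0 means "no HTTP response received"
assert extract_http_status(FakeRequestsError(FakeResponse(503))) == 503
assert extract_http_status(ValueError("no status attached")) is None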

nemoguardrails/exceptions.py

Lines changed: 12 additions & 9 deletions
@@ -69,25 +69,28 @@ class StreamingNotSupportedError(InvalidRailsConfigurationError):
 class LLMCallException(Exception):
     """A wrapper around the LLM call invocation exception.

-    This is used to propagate the exception out of the `generate_async` call. The default behavior is to
-    catch it and return an "Internal server error." message.
+    This is used to propagate the exception out of the ``generate_async`` call.
+    When the inner exception carries an HTTP status code (e.g. a
+    :class:`LLMClientError`), callers can inspect :attr:`status` to decide
+    which HTTP response code to return to the upstream client.
     """

     inner_exception: Union[BaseException, str]
     detail: Optional[str]
+    status: Optional[int]

-    def __init__(self, inner_exception: Union[BaseException, str], detail: Optional[str] = None):
-        """Initialize LLMCallException.
-
-        Args:
-            inner_exception: The original exception that occurred
-            detail: Optional context to prepend (for example, the model name or endpoint)
-        """
+    def __init__(
+        self,
+        inner_exception: Union[BaseException, str],
+        detail: Optional[str] = None,
+        status: Optional[int] = None,
+    ):
         message = f"{detail or 'LLM Call Exception'}: {str(inner_exception)}"
         super().__init__(message)

         self.inner_exception = inner_exception
         self.detail = detail
+        self.status = status


 class LLMClientError(Exception):
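A small usage sketch against the constructor signature shown above; the model name and message are illustrative only:

from nemoguardrails.exceptions import LLMCallException

try:
    raise LLMCallException(
        RuntimeError("429 Too Many Requests"),
        detail="Error invoking LLM (model=example-model)",
        status=429,  # carried through so the server layer can return 429 upstream
    )
except LLMCallException as exc:
    http_status = exc.status or 500  # fall back to 500 when no status is known
    print(http_status, exc)
    # 429 Error invoking LLM (model=example-model): 429 Too Many Requests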

nemoguardrails/guardrails/iorails.py

Lines changed: 13 additions & 3 deletions
@@ -53,6 +53,7 @@
     stream_active_metric,
     traced_request,
 )
+from nemoguardrails.llm.clients._errors import _redact_secrets
 from nemoguardrails.llm.taskmanager import LLMTaskManager
 from nemoguardrails.rails.llm.buffer import get_buffer_strategy
 from nemoguardrails.rails.llm.config import RailsConfig
@@ -413,8 +414,15 @@ async def _generation_task(request_span):
                 # streaming path.
                 if self._metrics_enabled:
                     record_request_error(e)
+                status = getattr(e, "status", None)
                 error_payload = json.dumps(
-                    {"error": {"message": str(e), "type": _GENERATION_ERROR_TYPE, "code": "generation_failed"}}
+                    {
+                        "error": {
+                            "message": _redact_secrets(str(e)),
+                            "type": "downstream_error" if status is not None else _GENERATION_ERROR_TYPE,
+                            "code": status if status is not None else "generation_failed",
+                        }
+                    }
                 )
                 await streaming_handler.push_chunk(error_payload)
                 await streaming_handler.push_chunk(END_OF_STREAM)  # type: ignore[arg-type]
@@ -521,12 +529,14 @@ async def _run_output_rails_in_streaming(
             user_output_chunks = chunk_batch.user_output_chunks
             bot_response_chunk = buffer_strategy.format_chunks(chunk_batch.processing_context)

-            # If the batch contains a generation error from _generation_task,
+            # If the batch contains an error chunk (generation or downstream HTTP),
             # yield it directly and stop — don't feed error JSON through output rails.
             for chunk in user_output_chunks:
                 try:
                     parsed = json.loads(chunk)
-                    if isinstance(parsed, dict) and parsed.get("error", {}).get("type") == _GENERATION_ERROR_TYPE:
+                    error_obj = parsed.get("error") if isinstance(parsed, dict) else None
+                    error_type = error_obj.get("type") if isinstance(error_obj, dict) else None
+                    if error_type in (_GENERATION_ERROR_TYPE, "downstream_error"):
                         yield chunk
                         return
                 except (json.JSONDecodeError, TypeError):
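The widened check now recognizes both error types and tolerates a non-dict "error" value. A self-contained sketch of the same detection, assuming _GENERATION_ERROR_TYPE is the string "generation_error":

import json

# Assumed value of _GENERATION_ERROR_TYPE for this sketch.
GENERATION_ERROR_TYPE = "generation_error"


def is_error_chunk(chunk: str) -> bool:
    try:
        parsed = json.loads(chunk)
    except (json.JSONDecodeError, TypeError):
        return False
    error_obj = parsed.get("error") if isinstance(parsed, dict) else None
    error_type = error_obj.get("type") if isinstance(error_obj, dict) else None
    return error_type in (GENERATION_ERROR_TYPE, "downstream_error")


assert is_error_chunk('{"error": {"message": "boom", "type": "generation_error", "code": "generation_failed"}}')
assert is_error_chunk('{"error": {"message": "rate limited", "type": "downstream_error", "code": 429}}')
assert not is_error_chunk("Hello, a normal text chunk")
assert not is_error_chunk('{"error": "a bare string, not a dict"}')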

nemoguardrails/guardrails/rail_action.py

Lines changed: 11 additions & 2 deletions
@@ -26,14 +26,17 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Optional, Union

+from nemoguardrails.guardrails.api_engine import APIEngineError
 from nemoguardrails.guardrails.engine_registry import EngineRegistry
 from nemoguardrails.guardrails.guardrails_types import (
     LLMMessages,
     RailResult,
     get_request_id,
     truncate,
 )
+from nemoguardrails.guardrails.model_engine import ModelEngineError
 from nemoguardrails.guardrails.telemetry import action_span, record_span_error
+from nemoguardrails.llm.clients._errors import _redact_secrets
 from nemoguardrails.llm.taskmanager import LLMTaskManager
 from nemoguardrails.rails.llm.config import _get_flow_model, _get_flow_name
 from nemoguardrails.types import LLMResponse
@@ -101,11 +104,17 @@ async def run(
             response = await self._get_response(model_type, prompt)
             log.debug("[%s] %s response: %s", req_id, base_flow, truncate(response))
             return self._parse_response(response)
+        except (ModelEngineError, APIEngineError) as e:
+            record_span_error(span, e)
+            if e.status is not None:
+                log.error("[%s] %s failed (HTTP %d): %s", req_id, base_flow, e.status, e)
+                raise
+            log.error("[%s] %s failed: %s", req_id, base_flow, e)
+            return RailResult(is_safe=False, reason=_redact_secrets(f"{base_flow} error: {e}"))
         except Exception as e:
-            # Record an error on the OTEL span
             record_span_error(span, e)
             log.error("[%s] %s failed: %s", req_id, base_flow, e)
-            return RailResult(is_safe=False, reason=f"{base_flow} error: {e}")
+            return RailResult(is_safe=False, reason=_redact_secrets(f"{base_flow} error: {e}"))

     def _get_model_type(self, flow: str) -> Optional[str]:
         """Extract model from the flow's ``$model=`` parameter, falling back to :attr:`fallback_model`."""

nemoguardrails/rails/llm/llmrails.py

Lines changed: 42 additions & 11 deletions
@@ -77,6 +77,7 @@
 )
 from nemoguardrails.kb.kb import KnowledgeBase
 from nemoguardrails.llm.cache import CacheInterface, LFUCache
+from nemoguardrails.llm.clients._errors import _redact_secrets
 from nemoguardrails.llm.models.initializer import (
     ModelInitializationError,
     init_llm_model,
@@ -899,14 +900,9 @@ async def generate_async(
             log.error("Error in generate_async: %s", e, exc_info=True)
             streaming_handler = streaming_handler_var.get()
             if streaming_handler:
-                # Push an error chunk instead of None.
-                error_message = str(e)
-                error_dict = extract_error_json(error_message)
-                error_payload: str = json.dumps(error_dict)
+                error_payload: str = _build_streaming_error_payload(e)
                 await streaming_handler.push_chunk(error_payload)
-                # push a termination signal
                 await streaming_handler.push_chunk(END_OF_STREAM)  # type: ignore
-                # Re-raise the exact exception
                 raise
             else:
                 # In generation mode, by default the bot response is an instant action.
@@ -1265,12 +1261,8 @@ async def _generation_task():
                         state=state,
                     )
                 except Exception as e:
-                    # If an exception occurs during generation, push it to the streaming handler as a json string
-                    # This ensures the streaming pipeline is properly terminated
                     log.error(f"Error in generation task: {e}", exc_info=True)
-                    error_message = str(e)
-                    error_dict = extract_error_json(error_message)
-                    error_payload = json.dumps(error_dict)
+                    error_payload = _build_streaming_error_payload(e)
                     await streaming_handler.push_chunk(error_payload)
                     await streaming_handler.push_chunk(END_OF_STREAM)  # type: ignore
@@ -1931,3 +1923,42 @@ def _get_last_response_content(response: "GenerationResponse") -> str:
     if isinstance(response.response, str):
         return response.response
     return ""
+
+
+def _build_streaming_error_payload(e: Exception) -> str:
+    """Build a JSON error payload for SSE streaming from an exception.
+
+    Normalizes all error shapes from extract_error_json into the
+    {"error": {"message", "type", "code"}} format that iorails.py
+    expects for error chunk detection.
+    """
+    error_dict = extract_error_json(str(e))
+    if not isinstance(error_dict, dict):
+        error_dict = {}
+    error_val = error_dict.get("error")
+    status = getattr(e, "status", None)
+    error_type = "downstream_error" if status is not None else "generation_error"
+    error_code = status if status is not None else "generation_failed"
+
+    if isinstance(error_val, dict):
+        error_val["message"] = _redact_secrets(error_val.get("message", ""))
+        if status is not None:
+            error_val["code"] = status
+            error_val["type"] = "downstream_error"
+        else:
+            error_val.setdefault("type", error_type)
+            error_val.setdefault("code", error_code)
+    elif isinstance(error_val, str):
+        error_dict["error"] = {
+            "message": _redact_secrets(error_val),
+            "type": error_type,
+            "code": error_code,
+        }
+    else:
+        error_dict["error"] = {
+            "message": _redact_secrets(str(e)),
+            "type": error_type,
+            "code": error_code,
+        }
+
+    return json.dumps(error_dict)
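A condensed sketch of what the helper emits in the two cases, skipping extract_error_json and _redact_secrets (internal helpers this sketch does not reimplement) and using a hypothetical exception that carries .status:

import json
from typing import Optional


class FakeLLMCallException(Exception):
    """Stand-in for LLMCallException with the new .status attribute."""

    def __init__(self, message: str, status: Optional[int] = None):
        super().__init__(message)
        self.status = status


def build_payload(e: Exception) -> str:
    # Condensed mirror of _build_streaming_error_payload for the common case
    # where extract_error_json yields no structured error dict.
    status = getattr(e, "status", None)
    return json.dumps(
        {
            "error": {
                "message": str(e),
                "type": "downstream_error" if status is not None else "generation_error",
                "code": status if status is not None else "generation_failed",
            }
        }
    )


print(build_payload(FakeLLMCallException("model endpoint returned 503", status=503)))
# {"error": {"message": "model endpoint returned 503", "type": "downstream_error", "code": 503}}
print(build_payload(ValueError("output parsing failed")))
# {"error": {"message": "output parsing failed", "type": "generation_error", "code": "generation_failed"}}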

nemoguardrails/server/api.py

Lines changed: 24 additions & 6 deletions
@@ -31,9 +31,13 @@
 from openai.types.chat.chat_completion import Choice
 from openai.types.chat.chat_completion_message import ChatCompletionMessage
 from pydantic import BaseModel, ValidationError
-from starlette.responses import RedirectResponse, StreamingResponse
+from starlette.responses import JSONResponse, RedirectResponse, StreamingResponse

 from nemoguardrails import LLMRails, RailsConfig, utils
+from nemoguardrails.exceptions import LLMCallException
+from nemoguardrails.guardrails.api_engine import APIEngineError
+from nemoguardrails.guardrails.model_engine import ModelEngineError
+from nemoguardrails.llm.clients._errors import _redact_secrets
 from nemoguardrails.rails.llm.config import Model
 from nemoguardrails.rails.llm.options import GenerationResponse
 from nemoguardrails.server.datastore.datastore import DataStore
@@ -364,7 +368,7 @@ class ChunkErrorMetadata(BaseModel):
     message: str
     type: Optional[str] = None
     param: Optional[str] = None
-    code: Optional[str] = None
+    code: Union[str, int, None] = None


 class ChunkError(BaseModel):
@@ -589,12 +593,26 @@ async def chat_completion(body: GuardrailsChatCompletionRequest, request: Reques

     except HTTPException:
         raise
+    except (LLMCallException, ModelEngineError, APIEngineError) as ex:
+        log.exception(ex)
+        status = getattr(ex, "status", None) or 500
+        return JSONResponse(
+            status_code=status,
+            content=create_error_chat_completion(
+                model=body.model,
+                error_message=_redact_secrets(str(ex)),
+                config_id=config_ids[0] if config_ids else None,
+            ).model_dump(),
+        )
     except Exception as ex:
         log.exception(ex)
-        return create_error_chat_completion(
-            model=body.model,
-            error_message="Internal server error",
-            config_id=config_ids[0] if config_ids else None,
+        return JSONResponse(
+            status_code=500,
+            content=create_error_chat_completion(
+                model=body.model,
+                error_message="Internal server error",
+                config_id=config_ids[0] if config_ids else None,
+            ).model_dump(),
         )
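With this handler, a 429 or 503 from the downstream model is mirrored to the client instead of a blanket 500. A sketch of the status resolution, assuming only that the caught exception may carry a .status that can be None:

from typing import Optional


def resolve_response_status(status: Optional[int]) -> int:
    # Mirrors `getattr(ex, "status", None) or 500`: None and 0 both fall back to 500.
    return status or 500


assert resolve_response_status(429) == 429   # downstream rate limit mirrored
assert resolve_response_status(503) == 503   # downstream outage mirrored
assert resolve_response_status(None) == 500  # no HTTP status known
assert resolve_response_status(0) == 500     # status 0 (timeout/connection) also maps to 500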