fix: #2797 accept raw image_url content parts on chat completions input (#2799)

seratch · web-flow · commit a7b4851e69c3 · 2026-03-28T11:02:39.000+09:00
diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py
@@ -329,6 +329,41 @@ def extract_text_content(
                 raise UserError(f"Only text content is supported here, got: {c}")
         return out
 
+    @classmethod
+    def _normalize_input_content_part_alias(
+        cls,
+        content_part: ResponseInputContentWithAudioParam,
+    ) -> ResponseInputContentWithAudioParam:
+        """Accept raw Chat Completions parts by mapping them to SDK canonical shapes."""
+        if not isinstance(content_part, dict):  # non-dict parts are returned untouched
+            return content_part
+
+        content_type = content_part.get("type")
+        if content_type == "text":  # raw chat-style text part -> "input_text"
+            text = content_part.get("text")
+            if not isinstance(text, str):
+                raise UserError(f"Only text content is supported here, got: {content_part}")
+            # The dict is built by hand, so cast it to the canonical text TypedDict.
+            return cast(ResponseInputTextParam, {"type": "input_text", "text": text})
+
+        if content_type != "image_url":  # every other type passes through unchanged
+            return content_part
+
+        image_payload = content_part.get("image_url")  # raw shape nests url/detail in a dict
+        if not isinstance(image_payload, dict):
+            raise UserError(f"Only image URLs are supported for image_url {content_part}")
+
+        image_url = image_payload.get("url")
+        if not isinstance(image_url, str) or not image_url:  # url must be a non-empty str
+            raise UserError(f"Only image URLs are supported for image_url {content_part}")
+
+        normalized: dict[str, Any] = {"type": "input_image", "image_url": image_url}
+        detail = image_payload.get("detail")
+        if detail is not None:  # copy "detail" only when the caller supplied one
+            normalized["detail"] = detail
+        # The dict is built by hand, so cast it to the canonical image TypedDict.
+        return cast(ResponseInputImageParam, normalized)
+
     @classmethod
     def extract_all_content(
         cls, content: str | Iterable[ResponseInputContentWithAudioParam]
@@ -338,6 +373,7 @@ def extract_all_content(
         out: list[ChatCompletionContentPartParam] = []
 
         for c in content:
+            c = cls._normalize_input_content_part_alias(c)
             if isinstance(c, dict) and c.get("type") == "input_text":
                 casted_text_param = cast(ResponseInputTextParam, c)
                 out.append(
diff --git a/src/agents/models/openai_chatcompletions.py b/src/agents/models/openai_chatcompletions.py
@@ -87,7 +87,11 @@ def _validate_official_openai_input_content_types(
                 if not isinstance(part, dict):
                     continue
 
-                content_type = part.get("type")
+                normalized_part = Converter._normalize_input_content_part_alias(part)
+                if not isinstance(normalized_part, dict):
+                    continue
+
+                content_type = normalized_part.get("type")
                 if content_type in self._OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES:
                     continue
 
diff --git a/tests/test_openai_chatcompletions.py b/tests/test_openai_chatcompletions.py
@@ -384,6 +384,90 @@ def __init__(self, completions: DummyCompletions) -> None:
     assert kwargs["stream_options"] is omit
 
 
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_get_response_accepts_raw_chat_completions_image_content() -> None:
+    """
+    Raw Chat Completions content parts should be accepted on the SDK input path
+    when using the Chat Completions backend.
+    """
+
+    class DummyCompletions:  # stands in for client.chat.completions
+        def __init__(self) -> None:
+            self.kwargs: dict[str, Any] = {}
+
+        async def create(self, **kwargs: Any) -> Any:
+            self.kwargs = kwargs  # record the request so the test can inspect it
+            return chat  # canned ChatCompletion built further down
+
+    class DummyClient:  # exposes only .chat.completions and .base_url
+        def __init__(self, completions: DummyCompletions) -> None:
+            self.chat = type("_Chat", (), {"completions": completions})()
+            self.base_url = httpx.URL("https://api.openai.com/v1/")
+
+    msg = ChatCompletionMessage(role="assistant", content="ok")
+    choice = Choice(index=0, finish_reason="stop", message=msg)
+    chat = ChatCompletion(
+        id="resp-id",
+        created=0,
+        model="fake",
+        object="chat.completion",
+        choices=[choice],
+        usage=None,
+    )
+    completions = DummyCompletions()
+    dummy_client = DummyClient(completions)
+    model = OpenAIChatCompletionsModel(model="gpt-4", openai_client=dummy_client)  # type: ignore[arg-type]
+
+    await model.get_response(
+        system_instructions=None,
+        input=[
+            # Cast the fixture because the raw chat-style alias is intentionally outside the
+            # canonical TypedDict shape that mypy expects for ordinary SDK inputs.
+            cast(
+                Any,
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "data:image/png;base64,AAAA",
+                                "detail": "high",
+                            },
+                        },
+                    ],
+                },
+            )
+        ],
+        model_settings=ModelSettings(),
+        tools=[],
+        output_schema=None,
+        handoffs=[],
+        tracing=ModelTracing.DISABLED,
+        previous_response_id=None,
+        conversation_id=None,
+        prompt=None,
+    )
+
+    assert completions.kwargs["messages"] == [  # request carries the raw chat-style parts
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "data:image/png;base64,AAAA",
+                        "detail": "high",
+                    },
+                },
+            ],
+        }
+    ]
+
+
 @pytest.mark.asyncio
 async def test_fetch_response_stream(monkeypatch) -> None:
     """
diff --git a/tests/test_openai_chatcompletions_converter.py b/tests/test_openai_chatcompletions_converter.py
@@ -140,6 +140,49 @@ def test_items_to_messages_with_easy_input_message():
     assert out["content"] == "How are you?"
 
 
+def test_items_to_messages_accepts_raw_chat_completions_user_content_parts():
+    """
+    Raw Chat Completions content parts should be accepted as aliases for the SDK's
+    canonical input content shapes.
+    """
+    items: list[TResponseInputItem] = [
+        # Cast the fixture because mypy cannot infer this raw chat-style dict as a specific
+        # member of the TResponseInputItem TypedDict union on its own.
+        cast(
+            TResponseInputItem,
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://example.com/image.png",
+                            "detail": "high",
+                        },
+                    },
+                ],
+            },
+        )
+    ]
+
+    messages = Converter.items_to_messages(items)  # convert the single user item
+
+    assert len(messages) == 1
+    message = messages[0]
+    assert message["role"] == "user"
+    assert message["content"] == [  # both parts come out in Chat Completions shape
+        {"type": "text", "text": "What is in this image?"},
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://example.com/image.png",
+                "detail": "high",
+            },
+        },
+    ]
+
+
 def test_items_to_messages_with_output_message_and_function_call():
     """
     Given a sequence of one ResponseOutputMessageParam followed by a