@@ -264,7 +264,9 @@ class ChatNVIDIA(BaseChatModel):
         None, description="Sampling temperature in [0, 1]"
     )
     max_tokens: Optional[int] = Field(
-        1024, description="Maximum # of tokens to generate"
+        1024,
+        description="Maximum # of tokens to generate",
+        alias="max_completion_tokens",
     )
     top_p: Optional[float] = Field(None, description="Top-p for distribution sampling")
     seed: Optional[int] = Field(None, description="The seed for deterministic results")
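Note on the hunk above: the Pydantic `alias` is what lets callers pass `max_completion_tokens` while the attribute keeps its old name. A minimal sketch of that behavior, assuming Pydantic v2 with `populate_by_name` enabled (as LangChain's models configure it); `Demo` is a stand-in class, not part of this change:

```
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class Demo(BaseModel):
    # populate_by_name allows both the field name and its alias at init time
    model_config = ConfigDict(populate_by_name=True)

    max_tokens: Optional[int] = Field(
        1024,
        description="Maximum # of tokens to generate",
        alias="max_completion_tokens",
    )


print(Demo(max_completion_tokens=256).max_tokens)  # 256, set via the alias
print(Demo(max_tokens=128).max_tokens)             # 128, set via the field name
```

With that config both spellings construct the model, so existing `max_tokens` callers keep working while the new name is accepted.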
@@ -287,6 +289,8 @@ def __init__(self, **kwargs: Any):
                 Format for base URL is http://host:port
             temperature (float): Sampling temperature in [0, 1].
             max_tokens (int): Maximum number of tokens to generate.
+                Deprecated, use max_completion_tokens instead.
+            max_completion_tokens (int): Maximum number of tokens to generate.
             top_p (float): Top-p for distribution sampling.
             seed (int): A seed for deterministic results.
             stop (list[str]): A list of cased stop words.
@@ -303,6 +307,16 @@ def __init__(self, **kwargs: Any):
                 model="meta-llama3-8b-instruct"
             )
         """
+        # Show deprecation warning if max_tokens was used
+        if "max_tokens" in kwargs:
+            warnings.warn(
+                "The 'max_tokens' parameter is deprecated and will be removed "
+                "in a future version. "
+                "Please use 'max_completion_tokens' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
         super().__init__(**kwargs)
         # allow nvidia_base_url as an alternative for base_url
         base_url = kwargs.pop("nvidia_base_url", self.base_url)
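The warning added above fires once at construction, and `stacklevel=2` attributes it to the caller's line rather than to `__init__` itself. A self-contained sketch of the same pattern; the helper name here is illustrative, not part of this change:

```
import warnings


def _check_deprecated(**kwargs):
    # mirrors the __init__ check above: warn, but keep accepting max_tokens
    if "max_tokens" in kwargs:
        warnings.warn(
            "The 'max_tokens' parameter is deprecated and will be removed "
            "in a future version. Please use 'max_completion_tokens' instead.",
            DeprecationWarning,
            stacklevel=2,
        )


# the warning is reported at this call site; run with `python -W default`
# if your environment filters DeprecationWarning
_check_deprecated(max_tokens=512)
```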
@@ -359,7 +373,11 @@ def _get_ls_params(
             ls_model_name=self.model or "UNKNOWN",
             ls_model_type="chat",
             ls_temperature=params.get("temperature", self.temperature),
-            ls_max_tokens=params.get("max_tokens", self.max_tokens),
+            # TODO: remove max_tokens once all models support max_completion_tokens
+            ls_max_tokens=(
+                params.get("max_completion_tokens", self.max_tokens)
+                or params.get("max_tokens", self.max_tokens)
+            ),
             # mypy error: Extra keys ("ls_top_p", "ls_seed")
             # for TypedDict "LangSmithParams" [typeddict-item]
             # ls_top_p=params.get("top_p", self.top_p),
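One subtlety in the `ls_max_tokens` fallback above: `dict.get` returns an explicitly stored `None`, so the `or` is what actually lets a missing or `None` `max_completion_tokens` fall through to `max_tokens`. A small sketch with invented `params` values:

```
self_max_tokens = 1024
params = {"max_completion_tokens": None, "max_tokens": 300}

# .get() returns the stored None here, so `or` falls through to max_tokens
ls_max_tokens = (
    params.get("max_completion_tokens", self_max_tokens)
    or params.get("max_tokens", self_max_tokens)
)
print(ls_max_tokens)  # 300

# caveat: any falsy value (e.g. 0) would also fall through, though a
# limit of 0 is not meaningful for token generation
```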
@@ -765,7 +783,7 @@ class Choices(enum.Enum):
         For Pydantic schema and Enum, the output will be None if the response is
         insufficient to construct the object or otherwise invalid. For instance,
         ```
-        llm = ChatNVIDIA(max_tokens=1)
+        llm = ChatNVIDIA(max_completion_tokens=1)
         structured_llm = llm.with_structured_output(Joke)
         print(structured_llm.invoke("Tell me a joke about NVIDIA"))
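A hedged usage note on the docstring example above, reusing `Joke` and `structured_llm` from it: since a single completion token cannot carry valid JSON, the call returns `None`, and callers should guard for that (the retry shown is illustrative):

```
result = structured_llm.invoke("Tell me a joke about NVIDIA")
if result is None:
    # response was too short to parse; retry with a larger limit
    llm = ChatNVIDIA(max_completion_tokens=512)
    result = llm.with_structured_output(Joke).invoke(
        "Tell me a joke about NVIDIA"
    )
```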