
Commit bc86b0c

add thinking_budget and fix run_async
1 parent b9c43c9 commit bc86b0c

File tree: 6 files changed, +78 -39 lines


matrix/app_server/app_api.py (+2 -2)

@@ -38,7 +38,7 @@
 from matrix.client.endpoint_cache import EndpointCache
 from matrix.common.cluster_info import ClusterInfo, get_head_http_host
 from matrix.utils.json import convert_to_json_compatible
-from matrix.utils.os import lock_file
+from matrix.utils.os import lock_file, run_async
 from matrix.utils.ray import (
     ACTOR_NAME_SPACE,
     Action,
@@ -335,7 +335,7 @@ async def dummy_updater():
             ttl=endpoint_ttl_sec,
             serve_app=serve_app,
         )
-        workers = asyncio.run(endpoint_cache())
+        workers = run_async(endpoint_cache())
         metadata["endpoints"] = {
             "head": head,
             "workers": workers,

matrix/app_server/deploy_utils.py (+4)

@@ -61,6 +61,8 @@
     "aws_account",
     "aws_region",
     "endpoint_name",
+    "anthropic_version",
+    "thinking_budget",
 ]

 vllm_app_template = """
@@ -116,6 +118,7 @@
   args:
     model: {{ app.model_name }}
     api_key: {{ app.api_key }}
+    thinking_budget: {{ app.thinking_budget }}
   deployments:
   - name: GeminiDeployment
     max_ongoing_requests: {{ app.max_ongoing_requests }}
@@ -394,6 +397,7 @@ def get_yaml_for_deployment(
     default_params = {
         "name": "gemini",
         "max_ongoing_requests": 10,
+        "thinking_budget": 1024,
     }
     app.update({k: v for k, v in default_params.items() if k not in app})
     assert "api_key" in app, "add api_key to gemini app"

matrix/app_server/llm/gemini_proxy.py (+41 -7)

@@ -5,9 +5,11 @@
 # LICENSE file in the root directory of this source tree.

 import logging
+import re
 from argparse import ArgumentParser
 from typing import Any, Dict, List, Optional

+import packaging
 from fastapi import FastAPI, HTTPException
 from google import genai
 from ray import serve
@@ -19,6 +21,11 @@
 app = FastAPI()


+def _extract_version(name: str) -> packaging.version.Version | None:
+    match = re.search(r"gemini-(\d+\.\d+)", name)
+    return packaging.version.parse(match.group(1)) if match else None
+
+
 @serve.deployment(
     autoscaling_config={
         "min_replicas": 1,
@@ -33,9 +40,15 @@ def __init__(
         self,
         api_key: str,
         model_name: str,
+        thinking_budget: int,
     ):
         self.model_name = model_name
         self.client = genai.Client(api_key=api_key)
+        self.thinking_budget = thinking_budget
+        version = _extract_version(model_name)
+        self.reasoning = version is not None and version >= packaging.version.parse(
+            "2.5"
+        )

     def _transform_messages(
         self, messages: List[Dict[str, str]]
@@ -98,7 +111,7 @@ async def create_chat_completion(
             completion_request.get("messages", [])
         )

-        request_params = {
+        request_params: Dict[str, Any] = {
             "contents": messages_transformed,
             "config": {
                 "temperature": completion_request.get("temperature", 0.6),
@@ -110,22 +123,36 @@
                 "system_instruction": system_instruction_content,
             },
         }
+        if self.reasoning:
+            request_params["config"]["thinking_config"] = {
+                "thinking_budget": self.thinking_budget
+            }
         try:
             response = await self.client.aio.models.generate_content(
                 model=self.model_name, **request_params
             )
         except genai.errors.APIError as e:
             raise HTTPException(status_code=e.code, detail=str(e))

-        completion_response: Dict[str, Any] = {
-            "id": response.response_id,
-            "usage": {
+        if response.usage_metadata:
+            usage = {
                 "prompt_tokens": response.usage_metadata.prompt_token_count,
                 "total_tokens": response.usage_metadata.total_token_count,
                 "completion_tokens": response.usage_metadata.candidates_token_count,
-            },
+            }
+        else:
+            usage = {
+                "prompt_tokens": 0,
+                "total_tokens": 0,
+                "completion_tokens": 0,
+            }
+
+        completion_response: Dict[str, Any] = {
+            "id": response.response_id,
+            "usage": usage,
         }
         if response.candidates is None:
+            error: Dict[str, Any] = {}
             if response.prompt_feedback and response.prompt_feedback.block_reason:
                 error = { # Adding an error field for clarity, not standard OpenAI format
                     "message": f"Content blocked due to: {response.prompt_feedback.block_reason.name}",
@@ -143,17 +170,22 @@ async def create_chat_completion(
         choices = [
             {
                 "index": i,
-                "finish_reason": [candidate.finish_reason.value],
+                "finish_reason": (
+                    candidate.finish_reason.value
+                    if candidate.finish_reason is not None
+                    else ""
+                ),
                 "message": {
                     "content": "".join(
                         part.text
-                        for part in candidate.content.parts
+                        for part in (candidate.content.parts or [])
                         if part.text is not None
                     ),
                     "role": "assistant",
                 },
             }
             for i, candidate in enumerate(response.candidates)
+            if candidate.content
         ]
         completion_response["choices"] = choices

@@ -168,6 +200,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application:
     argparse = ArgumentParser()
     argparse.add_argument("--api_key", type=str, required=True)
     argparse.add_argument("--model_name", type=str, required=True)
+    argparse.add_argument("--thinking_budget", type=int, default=1024)

     arg_strings = []
     for key, value in cli_args.items():
@@ -187,4 +220,5 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application:
     ).bind(
         args.api_key,
         args.model_name,
+        args.thinking_budget,
     )
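A standalone sketch of how the new version gate behaves: _extract_version and the 2.5 cutoff are taken from the diff above; the model names and printed output are illustrative only.

import re
import packaging.version

def _extract_version(name):
    # Pull major.minor out of names like "gemini-2.5-pro".
    match = re.search(r"gemini-(\d+\.\d+)", name)
    return packaging.version.parse(match.group(1)) if match else None

for model_name in ["gemini-2.5-pro", "gemini-2.0-flash", "some-other-model"]:
    version = _extract_version(model_name)
    reasoning = version is not None and version >= packaging.version.parse("2.5")
    config = {"temperature": 0.6}
    if reasoning:
        # Only Gemini >= 2.5 models get a thinking budget attached to the request config.
        config["thinking_config"] = {"thinking_budget": 1024}
    print(model_name, reasoning)
# gemini-2.5-pro True, gemini-2.0-flash False, some-other-model False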

matrix/client/query_llm.py (+2 -29)

@@ -27,6 +27,7 @@
 from matrix.app_server.llm import openai_pb2, openai_pb2_grpc
 from matrix.client.client_utils import get_an_endpoint_url, save_to_jsonl
 from matrix.client.endpoint_cache import EndpointCache
+from matrix.utils.os import run_async

 CHAR_PER_TOKEN = 3.61
 logging.basicConfig(
@@ -451,35 +452,7 @@ async def _process_requests():
            *[make_request(url, model, request, **kwargs) for request in requests]
        )

-    # Get the event loop
-    try:
-        loop = asyncio.get_event_loop()
-    except RuntimeError:
-        # No event loop in this thread, create a new one
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-    # Check if we're already in an async context
-    if loop.is_running():
-        # We're in an async context and can't use run_until_complete
-        # Create a new thread to run our async code
-        import concurrent.futures
-        import threading
-
-        def run_in_new_loop():
-            # Create a new event loop for this thread
-            new_loop = asyncio.new_event_loop()
-            try:
-                return new_loop.run_until_complete(_process_requests())
-            finally:
-                new_loop.close()
-
-        # Run in an executor to avoid blocking the current event loop
-        with concurrent.futures.ThreadPoolExecutor() as pool:
-            return pool.submit(run_in_new_loop).result()
-    else:
-        # We're in a sync context, use the current loop
-        return loop.run_until_complete(_process_requests())
+    return run_async(_process_requests())


 async def main(

matrix/utils/os.py (+28)

@@ -4,6 +4,8 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.

+import asyncio
+import concurrent
 import os
 import select
 import signal
@@ -233,3 +235,29 @@ def lock_file(filepath, mode, timeout=10, poll_interval=0.1):
                 f"Could not acquire lock for {filepath} within {timeout} seconds."
             )
         time.sleep(poll_interval)
+
+
+def run_async(coro: tp.Awaitable[tp.Any]) -> tp.Any:
+    """
+    Run an async coroutine from a synchronous context.
+    Handles cases where an event loop is already running (e.g., Jupyter, FastAPI).
+    """
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+    if loop.is_running():
+
+        def run_in_new_loop():
+            new_loop = asyncio.new_event_loop()
+            try:
+                return new_loop.run_until_complete(coro)
+            finally:
+                new_loop.close()
+
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            return pool.submit(run_in_new_loop).result()
+    else:
+        return loop.run_until_complete(coro)
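Usage sketch for the new helper; the fetch_value coroutine is a made-up stand-in for coroutines such as endpoint_cache() or _process_requests().

import asyncio

from matrix.utils.os import run_async

async def fetch_value():
    await asyncio.sleep(0.1)
    return 42

# From plain synchronous code, run_async drives the coroutine on an event loop.
print(run_async(fetch_value()))  # 42

# If a loop is already running (e.g. Jupyter or a Ray Serve / FastAPI handler),
# run_async runs the coroutine on a fresh loop in a worker thread instead of
# failing the way a bare asyncio.run() call would.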

pyproject.toml (+1 -1)

@@ -23,7 +23,7 @@ dependencies = [
     "pyyaml",
     "portalocker",
     "boto3",
-    "google-genai==1.9.0",
+    "google-genai>=1.13.0",
     "datasketch",
 ]
 # zip_safe = false
