
Commit 7fb2d35

Merge pull request #368 from makaveli10/upgrade_trt_v0_18

Upgrade tensorrt_llm to v0.18.2

2 parents: af50fed + 47ee035

8 files changed (+134, -57 lines)

README.md (+2 -7)

````diff
@@ -141,16 +141,11 @@ client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/b
 
 ## Browser Extensions
 - Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server).
-- Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and [Audio-Transcription-Firefox](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Firefox#readme) for setup instructions.
-
-## Whisper Live Server in Docker
-- GPU
-  - Faster-Whisper
-  ```bash
+- Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md
 
   docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest
   ```
 
-  - TensorRT.
+  - TensorRT. Refer to [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup and more tensorrt backend configurations.
   ```bash
   docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt
 
````

TensorRT_whisper.md (+9 -1)

````diff
@@ -1,6 +1,6 @@
 # WhisperLive-TensorRT
 We have only tested the TensorRT backend in docker so, we recommend docker for a smooth TensorRT backend setup.
-**Note**: We use `tensorrt_llm==0.15.0.dev2024111200`
+**Note**: We use `tensorrt_llm==0.18.2`
 
 ## Installation
 - Install [docker](https://docs.docker.com/engine/install/)
@@ -36,3 +36,11 @@ python3 run_server.py --port 9090 \
                   --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
                   --trt_multilingual
 ```
+
+By default trt_backend uses cpp_session, to use python session pass `--trt_py_session` to run_server.py
+```bash
+python3 run_server.py --port 9090 \
+                  --backend tensorrt \
+                  --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
+                  --trt_py_session
+```
````
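For reference, the same server start can be done from Python — a minimal sketch against the `TranscriptionServer.run()` signature this PR updates in `whisper_live/server.py`; host/port mirror the CLI above, and the exact call shape is an assumption rather than documented API:

```python
# Minimal sketch: programmatic equivalent of the run_server.py command above,
# based on the TranscriptionServer.run() signature updated in this PR.
from whisper_live.server import TranscriptionServer

server = TranscriptionServer()
server.run(
    "0.0.0.0",
    port=9090,
    backend="tensorrt",
    whisper_tensorrt_path="/app/TensorRT-LLM-examples/whisper/whisper_small_float16",
    trt_py_session=True,  # new in this PR; omit to keep the default C++ session
)
```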

docker/Dockerfile.tensorrt (+7 -8)

```diff
@@ -1,19 +1,19 @@
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS base
+FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base
 
 ARG DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && apt-get install -y \
     python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \
+    && apt install python-is-python3 \
+    && pip install --upgrade pip setuptools \
     && rm -rf /var/lib/apt/lists/*
 
 FROM base AS devel
-RUN pip3 install --no-cache-dir -U tensorrt_llm==0.15.0.dev2024111200 --extra-index-url https://pypi.nvidia.com
+RUN pip install --no-cache-dir -U tensorrt_llm==0.18.2 --extra-index-url https://pypi.nvidia.com
 WORKDIR /app
-RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && cd TensorRT-LLM && \
-    git checkout c629546ce429623c8a163633095230154a6f0574 && cd ../ && \
-    mv TensorRT-LLM/examples ./TensorRT-LLM-examples && \
-    rm -rf TensorRT-LLM
-
+RUN git clone -b v0.18.2 https://github.com/NVIDIA/TensorRT-LLM.git \
+    && mv TensorRT-LLM/examples ./TensorRT-LLM-examples \
+    && rm -rf TensorRT-LLM
 
 FROM devel AS release
 WORKDIR /app
@@ -25,7 +25,6 @@ RUN apt update && bash setup.sh && rm setup.sh
 
 COPY requirements/server.txt .
 RUN pip install --no-cache-dir -r server.txt && rm server.txt
-RUN pip install pynvml==11.5.0
 COPY whisper_live ./whisper_live
 COPY scripts/build_whisper_tensorrt.sh .
 COPY run_server.py .
```

run_server.py (+4)

```diff
@@ -21,6 +21,9 @@
     parser.add_argument('--trt_multilingual', '-m',
                         action="store_true",
                         help='Boolean only for TensorRT model. True if multilingual.')
+    parser.add_argument('--trt_py_session',
+                        action="store_true",
+                        help='Boolean only for TensorRT model. Use python session or cpp session, By default uses Cpp.')
     parser.add_argument('--omp_num_threads', '-omp',
                         type=int,
                         default=1,
@@ -46,5 +49,6 @@
         faster_whisper_custom_model_path=args.faster_whisper_custom_model_path,
         whisper_tensorrt_path=args.trt_model_path,
         trt_multilingual=args.trt_multilingual,
+        trt_py_session=args.trt_py_session,
         single_model=not args.no_single_model,
     )
```
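Since the new flag is a plain `store_true` switch, it defaults to `False`, which is what keeps the C++ session the default — a small standalone sketch of just that behaviour:

```python
import argparse

# The new --trt_py_session switch defaults to False (C++ session) and flips
# to True only when passed explicitly on the command line.
parser = argparse.ArgumentParser()
parser.add_argument('--trt_py_session', action="store_true")

print(parser.parse_args([]).trt_py_session)                    # False -> C++ session
print(parser.parse_args(['--trt_py_session']).trt_py_session)  # True  -> python session
```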

scripts/build_whisper_tensorrt.sh (+3 -5)

```diff
@@ -54,7 +54,7 @@ download_and_build_model() {
     local inference_precision="float16"
     local weight_only_precision="${2:-float16}"
     local max_beam_width=4
-    local max_batch_size=1
+    local max_batch_size=4
 
     echo "Downloading $model_name..."
     # wget --directory-prefix=assets "$model_url"
@@ -80,7 +80,6 @@ download_and_build_model() {
         --checkpoint_dir "${checkpoint_dir}/encoder" \
         --output_dir "${output_dir}/encoder" \
         --moe_plugin disable \
-        --enable_xqa disable \
         --max_batch_size "$max_batch_size" \
         --gemm_plugin disable \
         --bert_attention_plugin "$inference_precision" \
@@ -92,11 +91,10 @@ download_and_build_model() {
         --checkpoint_dir "${checkpoint_dir}/decoder" \
         --output_dir "${output_dir}/decoder" \
         --moe_plugin disable \
-        --enable_xqa disable \
         --max_beam_width "$max_beam_width" \
         --max_batch_size "$max_batch_size" \
-        --max_seq_len 200 \
-        --max_input_len 14 \
+        --max_seq_len 225 \
+        --max_input_len 32 \
         --max_encoder_input_len 3000 \
         --gemm_plugin "$inference_precision" \
         --bert_attention_plugin "$inference_precision" \
```

whisper_live/backend/trt_backend.py (+22 -6)

```diff
@@ -11,7 +11,18 @@ class ServeClientTensorRT(ServeClientBase):
     SINGLE_MODEL = None
     SINGLE_MODEL_LOCK = threading.Lock()
 
-    def __init__(self, websocket, task="transcribe", multilingual=False, language=None, client_uid=None, model=None, single_model=False):
+    def __init__(
+        self,
+        websocket,
+        task="transcribe",
+        multilingual=False,
+        language=None,
+        client_uid=None,
+        model=None,
+        single_model=False,
+        use_py_session=False,
+        max_new_tokens=225,
+    ):
         """
         Initialize a ServeClient instance.
         The Whisper model is initialized based on the client's language and device availability.
@@ -26,21 +37,24 @@ def __init__(self, websocket, task="transcribe", multilingual=False, language=No
             language (str, optional): The language for transcription. Defaults to None.
             client_uid (str, optional): A unique identifier for the client. Defaults to None.
             single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False.
+            use_py_session (bool, optional): Use python session or cpp session. Defaults to Cpp Session.
+            max_new_tokens (int, optional): Max number of tokens to generate.
 
         """
         super().__init__(client_uid, websocket)
         self.language = language if multilingual else "en"
         self.task = task
         self.eos = False
+        self.max_new_tokens = max_new_tokens
 
         if single_model:
             if ServeClientTensorRT.SINGLE_MODEL is None:
-                self.create_model(model, multilingual)
+                self.create_model(model, multilingual, use_py_session=use_py_session)
                 ServeClientTensorRT.SINGLE_MODEL = self.transcriber
             else:
                 self.transcriber = ServeClientTensorRT.SINGLE_MODEL
         else:
-            self.create_model(model, multilingual)
+            self.create_model(model, multilingual, use_py_session=use_py_session)
 
         # threading
         self.trans_thread = threading.Thread(target=self.speech_to_text)
@@ -52,7 +66,7 @@ def __init__(self, websocket, task="transcribe", multilingual=False, language=No
             "backend": "tensorrt"
         }))
 
-    def create_model(self, model, multilingual, warmup=True):
+    def create_model(self, model, multilingual, warmup=True, use_py_session=False):
         """
         Instantiates a new model, sets it as the transcriber and does warmup if desired.
         """
@@ -62,7 +76,9 @@ def create_model(self, model, multilingual, warmup=True):
             device="cuda",
             is_multilingual=multilingual,
             language=self.language,
-            task=self.task
+            task=self.task,
+            use_py_session=use_py_session,
+            max_output_len=self.max_new_tokens,
         )
         if warmup:
             self.warmup()
@@ -117,7 +133,7 @@ def transcribe_audio(self, input_bytes):
         mel, duration = self.transcriber.log_mel_spectrogram(input_bytes)
         last_segment = self.transcriber.transcribe(
             mel,
-            text_prefix=f"<|startoftranscript|><|{self.language}|><|{self.task}|><|notimestamps|>"
+            text_prefix=f"<|startoftranscript|><|{self.language}|><|{self.task}|><|notimestamps|>",
         )
         if ServeClientTensorRT.SINGLE_MODEL:
             ServeClientTensorRT.SINGLE_MODEL_LOCK.release()
```
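To make the new constructor surface concrete, here is a hypothetical direct call with the two added keyword arguments. In practice the server builds this object in `initialize_client()`, and `websocket` stands for an already-accepted connection, so this is illustrative rather than standalone:

```python
from whisper_live.backend.trt_backend import ServeClientTensorRT

def make_trt_client(websocket):
    # Sketch only: mirrors the updated __init__ signature from this diff.
    return ServeClientTensorRT(
        websocket,
        multilingual=False,
        client_uid="demo-uid",  # hypothetical uid; the server takes it from client options
        model="/app/TensorRT-LLM-examples/whisper/whisper_small_float16",
        single_model=True,      # share one engine across clients (SINGLE_MODEL + lock)
        use_py_session=False,   # new: False -> C++ session (default), True -> python session
        max_new_tokens=225,     # new: forwarded to WhisperTRTLLM as max_output_len
    )
```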

whisper_live/server.py (+11 -7)

```diff
@@ -153,7 +153,7 @@ def __init__(self):
 
     def initialize_client(
         self, websocket, options, faster_whisper_custom_model_path,
-        whisper_tensorrt_path, trt_multilingual
+        whisper_tensorrt_path, trt_multilingual, trt_py_session=False,
     ):
         client: Optional[ServeClientBase] = None
 
@@ -168,6 +168,7 @@ def initialize_client(
                     client_uid=options["uid"],
                     model=whisper_tensorrt_path,
                     single_model=self.single_model,
+                    use_py_session=trt_py_session,
                 )
                 logging.info("Running TensorRT backend.")
             except Exception as e:
@@ -248,7 +249,7 @@ def get_audio_from_websocket(self, websocket):
         return np.frombuffer(frame_data, dtype=np.float32)
 
     def handle_new_connection(self, websocket, faster_whisper_custom_model_path,
-                              whisper_tensorrt_path, trt_multilingual):
+                              whisper_tensorrt_path, trt_multilingual, trt_py_session=False):
         try:
             logging.info("New client connected")
             options = websocket.recv()
@@ -267,7 +268,7 @@ def handle_new_connection(self, websocket, faster_whisper_custom_model_path,
             if self.backend.is_tensorrt():
                 self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
             self.initialize_client(websocket, options, faster_whisper_custom_model_path,
-                                   whisper_tensorrt_path, trt_multilingual)
+                                   whisper_tensorrt_path, trt_multilingual, trt_py_session=trt_py_session)
             return True
         except json.JSONDecodeError:
             logging.error("Failed to decode JSON from client")
@@ -299,11 +300,12 @@ def process_audio_frames(self, websocket):
         return True
 
     def recv_audio(self,
-                   websocket,
+                   websocket,
                    backend: BackendType = BackendType.FASTER_WHISPER,
                    faster_whisper_custom_model_path=None,
                    whisper_tensorrt_path=None,
-                   trt_multilingual=False):
+                   trt_multilingual=False,
+                   trt_py_session=False):
         """
         Receive audio chunks from a client in an infinite loop.
 
@@ -330,7 +332,7 @@ def recv_audio(self,
         """
         self.backend = backend
         if not self.handle_new_connection(websocket, faster_whisper_custom_model_path,
-                                          whisper_tensorrt_path, trt_multilingual):
+                                          whisper_tensorrt_path, trt_multilingual, trt_py_session=trt_py_session):
             return
 
         try:
@@ -354,6 +356,7 @@ def run(self,
             faster_whisper_custom_model_path=None,
             whisper_tensorrt_path=None,
             trt_multilingual=False,
+            trt_py_session=False,
             single_model=False):
         """
         Run the transcription server.
@@ -381,7 +384,8 @@
                 backend=BackendType(backend),
                 faster_whisper_custom_model_path=faster_whisper_custom_model_path,
                 whisper_tensorrt_path=whisper_tensorrt_path,
-                trt_multilingual=trt_multilingual
+                trt_multilingual=trt_multilingual,
+                trt_py_session=trt_py_session,
             ),
             host,
             port
```
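A quick way to exercise a server started with `trt_py_session=True` is the project's Python client — a sketch following the client examples in the README (the parameters shown are assumptions based on those examples; adjust host, port, and file to your setup):

```python
from whisper_live.client import TranscriptionClient

# Connect to the running server and stream a local audio file for transcription.
client = TranscriptionClient("localhost", 9090, lang="en", translate=False)
client("tests/jfk.flac")
# Or transcribe an HLS stream, as in the README: client(hls_url="...")
```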
