Skip to content

Commit 16447e5

Browse files
committed
fix
1 parent 2f49454 commit 16447e5

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class EngineArgs:
3535
quantization: Optional[str] = None
3636
enforce_eager: bool = False
3737
max_context_len_to_capture: int = 8192
38-
disable_fast_allreduce = False
38+
disable_fast_allreduce: bool = False
3939

4040
def __post_init__(self):
4141
if self.tokenizer is None:

vllm/worker/model_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None:
408408
context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
409409
block_tables = torch.from_numpy(self.graph_block_tables).cuda()
410410

411-
if not self.model_config.disable_fast_allreduce:
411+
if not self.parallel_config.disable_fast_allreduce:
412412
comm_op.init_fast_ar()
413413
comm_op.begin_capture()
414414
# NOTE: Capturing the largest batch size first may help reduce the

0 commit comments

Comments (0)