
Commit e7cbe32

fakezeta and mudler authored
feat: Openvino runtime for transformer backend and streaming support for Openvino and CUDA (#1892)
* fixes #1775 and #1774: add BitsAndBytes quantization and fix embeddings on CUDA devices
* Manage 4-bit and 8-bit quantization: handle the different BitsAndBytes options via the quantization: parameter in the model YAML
* fix compilation errors on non-CUDA environments
* OpenVINO draft: first draft of the OpenVINO integration in the transformer backend
* first working implementation
* Streaming working
* Small fix for regression on CUDA and XPU
* use pip version of optimum[openvino]
* Update backend/python/transformers/transformers_server.py

Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Co-authored-by: Ettore Di Giacinto <[email protected]>
1 parent b500cea commit e7cbe32
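
For context on the quantization half of this change, the stand-alone sketch below shows the usual way 4-bit and 8-bit BitsAndBytes configurations are built with the transformers API. It is an illustration only, not code from this commit, and the model id is a placeholder.

    # Illustration only: typical 4-bit / 8-bit BitsAndBytesConfig setups.
    # Requires a CUDA device plus the bitsandbytes package; the model id is a placeholder.
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_4bit = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    bnb_8bit = BitsAndBytesConfig(load_in_8bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",           # placeholder model id
        quantization_config=bnb_4bit,  # or bnb_8bit
        device_map="auto",
    )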

File tree

2 files changed: +91 -19 lines changed


backend/python/common-env/transformers/transformers.yml

Lines changed: 6 additions & 1 deletion
@@ -34,6 +34,7 @@ dependencies:
     - boto3==1.28.61
     - botocore==1.31.61
     - certifi==2023.7.22
+    - coloredlogs==15.0.1
     - TTS==0.22.0
     - charset-normalizer==3.3.0
     - datasets==2.14.5
@@ -48,6 +49,7 @@ dependencies:
     - funcy==2.0
     - grpcio==1.59.0
     - huggingface-hub
+    - humanfriendly==10.0
     - idna==3.4
     - jinja2==3.1.2
     - jmespath==1.0.1
@@ -57,7 +59,10 @@ dependencies:
     - multiprocess==0.70.15
     - networkx
     - numpy==1.26.0
-    - optimum==1.17.1
+    - onnx==1.15.0
+    - openvino==2024.0.0
+    - openvino-telemetry==2023.2.1
+    - optimum[openvino]==1.17.1
     - packaging==23.2
     - pandas
     - peft==0.5.0
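
The onnx, openvino, and optimum[openvino] pins added above back the new OVModelForCausalLM path in transformers_server.py. A rough stand-alone sketch of what they provide (the model id is a placeholder, and export=True assumes a plain PyTorch checkpoint that optimum converts to OpenVINO IR on the fly):

    from optimum.intel.openvino import OVModelForCausalLM
    from transformers import AutoTokenizer

    model_id = "gpt2"  # placeholder; any causal LM checkpoint
    # compile/device mirror the arguments used later in transformers_server.py
    model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True, device="CPU")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    inputs = tokenizer("OpenVINO on CPU:", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=16)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])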

backend/python/transformers/transformers_server.py

Lines changed: 85 additions & 18 deletions
@@ -8,6 +8,7 @@
 import signal
 import sys
 import os
+from threading import Thread

 import time
 import backend_pb2
@@ -17,13 +18,16 @@
 import torch
 import torch.cuda

+
 XPU=os.environ.get("XPU", "0") == "1"
 if XPU:
     import intel_extension_for_pytorch as ipex
     from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
-    from transformers import AutoTokenizer, AutoModel, set_seed
+    from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
+    from optimum.intel.openvino import OVModelForCausalLM
+    from openvino.runtime import Core
 else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -81,6 +85,7 @@ def LoadModel(self, request, context):
             compute=torch.bfloat16

         self.CUDA = request.CUDA
+        self.OV=False

         device_map="cpu"

@@ -105,23 +110,55 @@ def LoadModel(self, request, context):
                 bnb_4bit_compute_dtype = None,
                 load_in_8bit=True,
             )
-
-
+
         try:
             if request.Type == "AutoModelForCausalLM":
                 if XPU:
-                    if quantization == "xpu_4bit":
+                    device_map="xpu"
+                    compute=torch.float16
+                    if request.Quantization == "xpu_4bit":
                         xpu_4bit = True
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
-                                                                      device_map="xpu", load_in_4bit=xpu_4bit)
+                        xpu_8bit = False
+                    elif request.Quantization == "xpu_8bit":
+                        xpu_4bit = False
+                        xpu_8bit = True
+                    else:
+                        xpu_4bit = False
+                        xpu_8bit = False
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                                      trust_remote_code=request.TrustRemoteCode,
+                                                                      use_safetensors=True,
+                                                                      device_map=device_map,
+                                                                      load_in_4bit=xpu_4bit,
+                                                                      load_in_8bit=xpu_8bit,
+                                                                      torch_dtype=compute)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                                      trust_remote_code=request.TrustRemoteCode,
+                                                                      use_safetensors=True,
+                                                                      quantization_config=quantization,
+                                                                      device_map=device_map,
+                                                                      torch_dtype=compute)
+            elif request.Type == "OVModelForCausalLM":
+                if "GPU" in Core().available_devices:
+                    device_map="GPU"
                 else:
-                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                    device_map="CPU"
+                self.model = OVModelForCausalLM.from_pretrained(model_name,
+                                                                compile=True,
+                                                                device=device_map)
+                self.OV = True
             else:
-                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                self.model = AutoModel.from_pretrained(model_name,
+                                                       trust_remote_code=request.TrustRemoteCode,
+                                                       use_safetensors=True,
+                                                       quantization_config=quantization,
+                                                       device_map=device_map,
+                                                       torch_dtype=compute)
             self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
             self.XPU = False

-            if XPU:
+            if XPU and self.OV == False:
                 self.XPU = True
                 try:
                     print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
@@ -130,6 +167,7 @@ def LoadModel(self, request, context):
                     print("Not using XPU:", err, file=sys.stderr)

         except Exception as err:
+            print("Error:", err, file=sys.stderr)
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
         # Replace this with your desired response
@@ -167,7 +205,7 @@ def Embedding(self, request, context):
         print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])

-    def Predict(self, request, context):
+    def Predict(self, request, context, streaming=False):
         """
         Generates text based on the given prompt and sampling parameters.

@@ -186,15 +224,42 @@ def Predict(self, request, context):
         if request.Tokens > 0:
             max_tokens = request.Tokens

-        inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
+        inputs = self.tokenizer(request.Prompt, return_tensors="pt")
         if self.CUDA:
             inputs = inputs.to("cuda")
-        if XPU:
+        if XPU and self.OV == False:
             inputs = inputs.to("xpu")
-
-        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-        generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
-
+            streaming = False
+
+        if streaming:
+            streamer=TextIteratorStreamer(self.tokenizer,
+                                          skip_prompt=True,
+                                          skip_special_tokens=True)
+            config=dict(inputs,
+                        max_new_tokens=max_tokens,
+                        temperature=request.Temperature,
+                        top_p=request.TopP,
+                        top_k=request.TopK,
+                        do_sample=True,
+                        attention_mask=inputs["attention_mask"],
+                        eos_token_id=self.tokenizer.eos_token_id,
+                        pad_token_id=self.tokenizer.eos_token_id,
+                        streamer=streamer)
+            thread=Thread(target=self.model.generate, kwargs=config)
+            thread.start()
+            generated_text = ""
+            for new_text in streamer:
+                generated_text += new_text
+                yield backend_pb2.Reply(message=bytes(new_text, encoding='utf-8'))
+        else:
+            outputs = self.model.generate(inputs["input_ids"],
+                                          max_new_tokens=max_tokens,
+                                          temperature=request.Temperature,
+                                          top_p=request.TopP,
+                                          top_k=request.TopK,
+                                          do_sample=True,
+                                          pad_token=self.tokenizer.eos_token_id)
+            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
         return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

     def PredictStream(self, request, context):
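
The streaming branch above follows the standard transformers recipe: run generate() on a worker thread and consume a TextIteratorStreamer on the calling side. Stripped of the gRPC plumbing (placeholder model id, illustration only), the pattern is roughly:

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    model_id = "gpt2"  # placeholder
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    inputs = tokenizer("Streaming test:", return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so it runs on a worker thread while the main thread drains the streamer.
    kwargs = dict(inputs, streamer=streamer, max_new_tokens=32, do_sample=True,
                  pad_token_id=tokenizer.eos_token_id)
    Thread(target=model.generate, kwargs=kwargs).start()

    for new_text in streamer:
        print(new_text, end="", flush=True)  # each chunk is yielded as soon as it is decoded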
@@ -208,7 +273,9 @@ def PredictStream(self, request, context):
         Returns:
             backend_pb2.Result: The predict stream result.
         """
-        yield self.Predict(request, context)
+        iterations = self.Predict(request, context, streaming=True)
+        for iteration in iterations:
+            yield iteration


 def serve(address):
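
With PredictStream now a real generator, a gRPC client receives one Reply per decoded chunk instead of a single message. A hedged client-side sketch, assuming the generated modules are backend_pb2/backend_pb2_grpc with a BackendStub and a PredictOptions request message, and that the backend listens on localhost:50051 (all assumptions for illustration, not taken from this commit):

    import grpc
    import backend_pb2
    import backend_pb2_grpc  # assumed name of the generated gRPC module

    channel = grpc.insecure_channel("localhost:50051")  # assumed address
    stub = backend_pb2_grpc.BackendStub(channel)        # assumed stub name

    req = backend_pb2.PredictOptions(Prompt="Hello", Tokens=64,
                                     Temperature=0.7, TopP=0.9, TopK=40)
    for reply in stub.PredictStream(req):                # server-streaming RPC
        print(reply.message.decode("utf-8"), end="", flush=True)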
