import signal
import sys
import os
+ from threading import Thread

import time
import backend_pb2
import torch
import torch.cuda

+
XPU = os.environ.get("XPU", "0") == "1"
if XPU:
    import intel_extension_for_pytorch as ipex
    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
-     from transformers import AutoTokenizer, AutoModel, set_seed
+     from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
+     from optimum.intel.openvino import OVModelForCausalLM
+     from openvino.runtime import Core
else:
-     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
+     from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer


_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -81,6 +85,7 @@ def LoadModel(self, request, context):
            compute = torch.bfloat16

        self.CUDA = request.CUDA
+         self.OV = False

        device_map = "cpu"

@@ -105,23 +110,55 @@ def LoadModel(self, request, context):
                bnb_4bit_compute_dtype=None,
                load_in_8bit=True,
            )
-
-
+
        try:
            if request.Type == "AutoModelForCausalLM":
                if XPU:
-                     if quantization == "xpu_4bit":
+                     device_map = "xpu"
+                     compute = torch.float16
+                     if request.Quantization == "xpu_4bit":
                        xpu_4bit = True
-                     self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
-                                                                        device_map="xpu", load_in_4bit=xpu_4bit)
+                         xpu_8bit = False
+                     elif request.Quantization == "xpu_8bit":
+                         xpu_4bit = False
+                         xpu_8bit = True
+                     else:
+                         xpu_4bit = False
+                         xpu_8bit = False
+                     self.model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                                        trust_remote_code=request.TrustRemoteCode,
+                                                                        use_safetensors=True,
+                                                                        device_map=device_map,
+                                                                        load_in_4bit=xpu_4bit,
+                                                                        load_in_8bit=xpu_8bit,
+                                                                        torch_dtype=compute)
+                 else:
+                     self.model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                                        trust_remote_code=request.TrustRemoteCode,
+                                                                        use_safetensors=True,
+                                                                        quantization_config=quantization,
+                                                                        device_map=device_map,
+                                                                        torch_dtype=compute)
+             elif request.Type == "OVModelForCausalLM":
+                 if "GPU" in Core().available_devices:
+                     device_map = "GPU"
                else:
-                     self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                     device_map = "CPU"
+                 self.model = OVModelForCausalLM.from_pretrained(model_name,
+                                                                  compile=True,
+                                                                  device=device_map)
+                 self.OV = True
            else:
-                 self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+                 self.model = AutoModel.from_pretrained(model_name,
+                                                         trust_remote_code=request.TrustRemoteCode,
+                                                         use_safetensors=True,
+                                                         quantization_config=quantization,
+                                                         device_map=device_map,
+                                                         torch_dtype=compute)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
            self.XPU = False

-         if XPU:
+         if XPU and self.OV == False:
            self.XPU = True
            try:
                print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
@@ -130,6 +167,7 @@ def LoadModel(self, request, context):
                    print("Not using XPU:", err, file=sys.stderr)

        except Exception as err:
+             print("Error:", err, file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
        # Replace this with your desired response
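The OVModelForCausalLM branch added above selects the Intel GPU when OpenVINO reports one and otherwise compiles for CPU. A minimal standalone sketch of that loading pattern, assuming optimum-intel is installed and the model id (a placeholder below) already points to a model in OpenVINO IR format:

# Minimal sketch of the OpenVINO loading path; "some-org/model-ov" is a
# placeholder id and is assumed to already be exported to OpenVINO IR.
from openvino.runtime import Core
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

device = "GPU" if "GPU" in Core().available_devices else "CPU"
model = OVModelForCausalLM.from_pretrained("some-org/model-ov", compile=True, device=device)
tokenizer = AutoTokenizer.from_pretrained("some-org/model-ov")

inputs = tokenizer("OpenVINO is", return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_new_tokens=16)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])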
@@ -167,7 +205,7 @@ def Embedding(self, request, context):
        print("Embeddings:", sentence_embeddings, file=sys.stderr)
        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])

-     def Predict(self, request, context):
+     def Predict(self, request, context, streaming=False):
        """
        Generates text based on the given prompt and sampling parameters.

@@ -186,15 +224,42 @@ def Predict(self, request, context):
        if request.Tokens > 0:
            max_tokens = request.Tokens

-         inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
+         inputs = self.tokenizer(request.Prompt, return_tensors="pt")
        if self.CUDA:
            inputs = inputs.to("cuda")
-         if XPU:
+         if XPU and self.OV == False:
            inputs = inputs.to("xpu")
-
-         outputs = self.model.generate(inputs, max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
-         generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
-
+             streaming = False
+
+         if streaming:
+             streamer = TextIteratorStreamer(self.tokenizer,
+                                             skip_prompt=True,
+                                             skip_special_tokens=True)
+             config = dict(inputs,
+                           max_new_tokens=max_tokens,
+                           temperature=request.Temperature,
+                           top_p=request.TopP,
+                           top_k=request.TopK,
+                           do_sample=True,
+                           attention_mask=inputs["attention_mask"],
+                           eos_token_id=self.tokenizer.eos_token_id,
+                           pad_token_id=self.tokenizer.eos_token_id,
+                           streamer=streamer)
+             thread = Thread(target=self.model.generate, kwargs=config)
+             thread.start()
+             generated_text = ""
+             for new_text in streamer:
+                 generated_text += new_text
+                 yield backend_pb2.Reply(message=bytes(new_text, encoding='utf-8'))
+         else:
+             outputs = self.model.generate(inputs["input_ids"],
+                                           max_new_tokens=max_tokens,
+                                           temperature=request.Temperature,
+                                           top_p=request.TopP,
+                                           top_k=request.TopK,
+                                           do_sample=True,
+                                           pad_token_id=self.tokenizer.eos_token_id)
+             generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

    def PredictStream(self, request, context):
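The streaming branch added above uses transformers' TextIteratorStreamer: generate() runs on a worker thread while the caller iterates over decoded text chunks as they become available. A minimal standalone sketch of that pattern (the model id below is a placeholder, not what the backend actually loads):

# Minimal sketch of the TextIteratorStreamer pattern used in Predict above.
# "gpt2" is a placeholder model id for illustration only.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs on a worker thread while the
# main thread drains the streamer one decoded chunk at a time.
thread = Thread(target=model.generate,
                kwargs=dict(inputs, streamer=streamer, max_new_tokens=32, do_sample=True))
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()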
@@ -208,7 +273,9 @@ def PredictStream(self, request, context):
        Returns:
            backend_pb2.Result: The predict stream result.
        """
-         yield self.Predict(request, context)
+         iterations = self.Predict(request, context, streaming=True)
+         for iteration in iterations:
+             yield iteration


def serve(address):
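Since PredictStream is a server-streaming gRPC method, a client consumes it as an iterator of Reply messages. A rough client-side sketch, assuming the stub generated from the backend.proto is named backend_pb2_grpc.BackendStub and the request message is PredictOptions (treat both names, and the address, as assumptions):

# Hypothetical client sketch: BackendStub, PredictOptions, and the address are
# assumptions for illustration, not verified against the generated code.
import grpc
import backend_pb2
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = backend_pb2_grpc.BackendStub(channel)
request = backend_pb2.PredictOptions(Prompt="Hello", Tokens=32)
for reply in stub.PredictStream(request):
    print(reply.message.decode("utf-8"), end="", flush=True)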