# Standard-library imports.
import argparse
import datetime
import os
import sys

# Third-party imports.
import numpy as np
import requests

# NOTE: these environment variables must be exported before `import paddle`
# below so that the framework reads them during its initialization.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["FLAGS_use_cuda_managed_memory"] = "true"

import paddle
from paddle import inference
from paddlenlp.transformers import MiniGPT4Processor
from PIL import Image

from utils import load_real_time_tokens
class Predictor(object):
    """Two-stage static-graph predictor for MiniGPT4 inference.

    Stage 1 (``first_model_path``) is the image encoder that turns pixel
    values into language-model input features; stage 2 (``second_model_path``)
    is the language model that generates token ids conditioned on those
    features.  Token decoding back to text goes through ``MiniGPT4Processor``.
    """

    def __init__(self, args):
        """Load both inference predictors and the tokenizer/processor.

        Args:
            args: parsed CLI namespace; must provide ``first_model_path``,
                ``second_model_path``, ``minigpt4_path`` and ``device``.
        """
        self.args = args
        self.first_predictor, self.first_input_handles, self.first_output_handles = self.create_predictor(
            args.first_model_path)
        self.second_predictor, self.second_input_handles, self.second_output_handles = self.create_predictor(
            args.second_model_path)
        print(f"first_model_path: {args.first_model_path}, {self.first_predictor}")
        print(f"second_model_path: {args.second_model_path}, {self.second_predictor}")
        self.processor = MiniGPT4Processor.from_pretrained(args.minigpt4_path)

    def create_predictor(self, model_path):
        """Build a Paddle inference predictor from an exported model.

        Args:
            model_path: path prefix of the exported model; the method expects
                ``<model_path>.pdmodel`` and ``<model_path>.pdiparams`` to exist.

        Returns:
            Tuple ``(predictor, input_handles, output_handles)``.

        Raises:
            ValueError: if the model or params file is missing.
        """
        # Importing these modules registers the custom fused ops that the
        # exported language model graph references.
        from paddlenlp.utils.import_utils import import_module

        for op_name in (
            "paddlenlp_ops.encode_rotary_qk",
            "paddlenlp_ops.get_padding_offset",
            "paddlenlp_ops.qkv_transpose_split",
            "paddlenlp_ops.rebuild_padding",
            "paddlenlp_ops.transpose_remove_padding",
            "paddlenlp_ops.write_cache_kv",
        ):
            import_module(op_name)

        model_file = model_path + ".pdmodel"
        params_file = model_path + ".pdiparams"
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        config = paddle.inference.Config(model_file, params_file)

        shape_range_file = model_file + "shape.txt"
        # On the very first run, uncomment the next line so Paddle collects the
        # dynamic-shape range info that tuned TensorRT dynamic shapes require.
        # config.collect_shape_range_info(shape_range_file)

        config.switch_ir_optim(True)
        # TensorRT is currently disabled for both submodels.  The original code
        # branched on whether the path contains "llama" but assigned False in
        # both branches, so the branch was dead; flip this to True (for the
        # appropriate submodel) to re-enable the TRT path below.
        self.args.use_tensorrt = False

        if self.args.device == "gpu":
            # Initial GPU memory pool of 100 MB on device 0.
            config.enable_use_gpu(100, 0)
            precision_mode = inference.PrecisionType.Half
            if self.args.use_tensorrt:
                config.enable_tuned_tensorrt_dynamic_shape(shape_range_file, True)
                config.enable_tensorrt_engine(
                    max_batch_size=-1, min_subgraph_size=30, precision_mode=precision_mode,
                    use_static=True)

        config.switch_use_feed_fetch_ops(False)
        predictor = paddle.inference.create_predictor(config)
        input_handles = [predictor.get_input_handle(name) for name in predictor.get_input_names()]
        output_handles = [predictor.get_output_handle(name) for name in predictor.get_output_names()]

        return predictor, input_handles, output_handles

    @paddle.no_grad()
    def encode_images(self, pixel_values):
        """Run the image encoder.

        Args:
            pixel_values: preprocessed image tensor (already on GPU).

        Returns:
            Tuple ``(language_model_inputs, language_model_attention_mask)``.
        """
        [language_model_inputs, language_model_attention_mask] = self.first_predictor.run([pixel_values])
        return language_model_inputs, language_model_attention_mask

    @paddle.no_grad()
    def generate_with_image_features(self,
                                     image_features,
                                     first_input_ids,
                                     second_input_ids,
                                     image_attention_mask=None,
                                     first_attention_mask=None,
                                     second_attention_mask=None,
                                     **generate_kwargs, ):
        """Generate token ids from image features plus two text segments.

        Args:
            image_features: encoder output, shape (batch, image_seq, hidden).
            first_input_ids / second_input_ids: token ids of the prompt text
                before / after the image placeholder, shape (batch, seq_i).
            image_attention_mask, first_attention_mask, second_attention_mask:
                accepted for interface compatibility but currently unused —
                the masks are rebuilt locally below.

        Returns:
            Tuple ``(generate_ids, None)`` where ``generate_ids`` is a nested
            list of generated token ids.
        """
        batch = image_features.shape[0]
        # Total prompt length: image tokens plus both text segments.
        seq = image_features.shape[1] + first_input_ids.shape[1] + second_input_ids.shape[1]
        max_len = 204  # fixed maximum total length (prompt + generation) of the exported graph
        dtype = "float16"

        # Decoding-step mask: each generated token may attend to the full prompt.
        tgt_generation_mask = paddle.full([batch, 1, 1, max_len], 0, dtype=dtype)
        tgt_generation_mask[:, 0, 0, :seq] = 1

        # Causal (lower-triangular) mask over the prompt for the prefill pass.
        attention_mask = paddle.full([batch, 1, max_len, max_len], 0, dtype=dtype)
        attention_mask[:, 0, :seq, :seq] = paddle.tril(
            paddle.ones(shape=(seq, seq), dtype=dtype)
        )

        # Every row is simply 0..seq-1 (replaces the original per-row Python loop).
        position_ids = paddle.arange(seq, dtype="int64").unsqueeze(0).tile([batch, 1])

        inputs = [image_features,
                  first_input_ids,
                  second_input_ids,
                  attention_mask,
                  # image_attention_mask,
                  # first_attention_mask,
                  # second_attention_mask,
                  position_ids,  # position_ids
                  paddle.full([batch, 1], 1.0, dtype="float32"),  # penalty_score
                  paddle.full([batch, 1], 0.0, dtype="float32"),  # frequency_score,
                  paddle.full([batch, 1], 0.0, dtype="float32"),  # presence_score,
                  paddle.full([batch, 1], 1, dtype="int64"),  # min_length,
                  paddle.full([batch, 1], max_len - seq, dtype="int64"),  # max_length,
                  paddle.full([batch, 1], 1.0, dtype="float32"),  # temperature,
                  paddle.full([batch, 1], 0.0, dtype="float32"),  # top_p,
                  paddle.full([1], 2277, dtype="int64"),  # eos_token_id,
                  paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_encoder,
                  paddle.full([batch, 1], seq, dtype="int32"),  # seq_len_decoder,
                  paddle.full([batch, 1], 0, dtype="int64"),  # step_idx,
                  paddle.full([batch, 1], False, dtype="bool"),  # stop_flags,
                  paddle.full([batch, 1], -123, dtype="int64"),  # tgt_ids can be initialized arbitrarily
                  paddle.full([batch, 1], seq - 1, dtype="int64"),  # tgt_pos,
                  tgt_generation_mask,  # tgt_generation_mask,
                  paddle.full([batch, max_len], -100, dtype="int64"),  # pre_ids, can be initialized arbitrarily
                  paddle.full([1], batch, dtype="int64")  # stop_nums, be batch
                  ]
        # Pre-allocate one KV-cache buffer per transformer layer.  The shape
        # presumably is (2 [k/v], batch, num_heads=40, max_len, head_dim=128)
        # for a 40-layer model — TODO confirm against the exported graph.
        # Contents are arbitrary; the graph overwrites them during decoding.
        for _ in range(40):
            inputs.append(paddle.rand(shape=[2, batch, 40, max_len, 128], dtype=dtype))

        self.second_predictor.run(inputs)
        # Generated ids are streamed out-of-band by the custom ops and
        # collected from the helper rather than from predictor outputs.
        tokens: np.ndarray = load_real_time_tokens()
        generate_ids = tokens.tolist()
        return generate_ids, None

    def pre_processing(self, images, text, prompt=None):
        """Tokenize/transform raw image + text into model-ready tensors."""
        processed_contents = self.processor(images, text, prompt=prompt)
        return processed_contents

    def post_processing(self, generate_ids):
        """Decode generated token ids back to text."""
        msg = self.processor.batch_decode(generate_ids)
        return msg

    def predict(self, images, text, prompt=None):
        """End-to-end prediction: preprocess, encode, generate, decode.

        Args:
            images: input image(s) accepted by ``MiniGPT4Processor``.
            text: user question/instruction text.
            prompt: optional prompt template containing the image placeholder.

        Returns:
            Decoded output message(s).
        """
        processed_contents = self.pre_processing(images, text, prompt=prompt)
        batch = 1
        # Tile every input along the batch dimension (batch is 1 here, so this
        # is a no-op kept for easy batch-size experiments).
        processed_contents["pixel_values"] = paddle.tile(processed_contents["pixel_values"], repeat_times=[batch, 1, 1, 1])
        image_features, image_attention_mask = self.encode_images(processed_contents["pixel_values"])
        print(image_attention_mask.shape)
        processed_contents["first_input_ids"] = paddle.tile(processed_contents["first_input_ids"], repeat_times=[batch, 1])
        processed_contents["second_input_ids"] = paddle.tile(processed_contents["second_input_ids"], repeat_times=[batch, 1])
        processed_contents["first_attention_mask"] = paddle.tile(processed_contents["first_attention_mask"], repeat_times=[batch, 1])
        processed_contents["second_attention_mask"] = paddle.tile(processed_contents["second_attention_mask"], repeat_times=[batch, 1])
        generate_ids, _ = self.generate_with_image_features(
            image_features,
            processed_contents["first_input_ids"],
            processed_contents["second_input_ids"],
            image_attention_mask,
            processed_contents["first_attention_mask"],
            processed_contents["second_attention_mask"],
        )

        msg = self.post_processing(generate_ids)

        return msg
if __name__ == "__main__":
    # CLI: paths to the two exported submodels, the tokenizer dir, and
    # device/precision knobs for Paddle inference.
    parser = argparse.ArgumentParser()
    parser.add_argument("--first_model_path", default='The dir name of image encoder model', type=str, help="", )
    parser.add_argument("--second_model_path", default='The dir name of language model', type=str, help="", )
    parser.add_argument("--minigpt4_path", type=str,
                        default="The minigpt4 dir name of saving tokenizer",
                        help="The path of extraction model path that you want to load.")
    parser.add_argument("--use_tensorrt", action='store_true', help="Whether to use inference engin TensorRT.")
    parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"],
                        help='The tensorrt precision.')
    parser.add_argument("--device", default="gpu", choices=["gpu", "cpu", "xpu"], help="Device selected for inference.")
    parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
    parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False],
                        help='Enable to use mkldnn to speed up when using cpu.')
    args = parser.parse_args()

    predictor = Predictor(args)

    # Sample image fetched over HTTP for a smoke test / benchmark.
    url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
    image = Image.open(requests.get(url, stream=True).raw)

    text = "describe this image"
    prompt = "Give the following image: <Img>ImageContent</Img>. You will be able to see the image once I provide it to you. Please answer my questions.###Human: <Img><ImageHere></Img> <TextHere>###Assistant:"

    # Warm up so one-time costs (op registration, cache allocation) are
    # excluded from the timed runs.
    warm_up_times = 2
    repeat_times = 10
    for _ in range(warm_up_times):
        msg = predictor.predict(image, text, prompt)

    # Timed benchmark over `repeat_times` runs.
    starttime = datetime.datetime.now()
    for _ in range(repeat_times):
        msg = predictor.predict(image, text, prompt)
    endtime = datetime.datetime.now()
    duringtime = endtime - starttime
    # total_seconds() accounts for the full timedelta (the original
    # `.seconds * 1000 + .microseconds / 1000` silently dropped `.days`).
    time_ms = duringtime.total_seconds() * 1000.0

    print("Reference: The image shows two black and white cats sitting next to each other on a blue background. The cats have black fur and white fur with black noses, eyes, and paws. They are both looking at the camera with a curious expression. The mugs are also blue with the same design of the cats on them. There is a small white flower on the left side of the mug. The background is a light blue color.")
    print("Outputs: ", msg)
    print("The whole time on average: ", time_ms / repeat_times, "ms")