oleksandrfluxon committed on
Commit 65f5921
1 Parent(s): 25659de

Update handler.py

Files changed (1)
  1. handler.py +31 -29
handler.py CHANGED
@@ -6,38 +6,40 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 
 class EndpointHandler:
     def __init__(self, path=""):
-        # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", padding_side="left")
-
-        config = AutoConfig.from_pretrained(path, trust_remote_code=True)
-        # config.attn_config['attn_impl'] = 'triton'
-        config.init_device = 'cuda:0' # For fast initialization directly on GPU!
-        config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
-
-        self.model = AutoModelForCausalLM.from_pretrained(
-            path,
-            config,
-            torch_dtype=torch.float16,
-            trust_remote_code=True
-        )
-        # self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = 'cuda'
+        with torch.autocast('cuda'):
+            # load model and tokenizer from path
+            self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", padding_side="left")
+
+            config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+            # config.attn_config['attn_impl'] = 'triton'
+            config.init_device = 'cuda:0' # For fast initialization directly on GPU!
+            config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
+
+            self.model = AutoModelForCausalLM.from_pretrained(
+                path,
+                config,
+                torch_dtype=torch.float16,
+                trust_remote_code=True
+            )
+            # self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.device = 'cuda'
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
 
-        # preprocess
-        inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
-
-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            outputs = self.model.generate(**inputs, **parameters)
-        else:
-            outputs = self.model.generate(**inputs)
-
-        # postprocess the prediction
-        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        return [{"generated_text": prediction}]
+        with torch.autocast('cuda'):
+            # preprocess
+            inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
+
+            # pass inputs with all kwargs in data
+            if parameters is not None:
+                outputs = self.model.generate(**inputs, **parameters)
+            else:
+                outputs = self.model.generate(**inputs)
+
+            # postprocess the prediction
+            prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            return [{"generated_text": prediction}]