You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, QuantoConfig
运行报错:
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 1568, in workers_main
executor = worker_cls(engine, executor_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 817, in __init__
self.engine = _create_engine()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 811, in _create_engine
return unique_create_executor(engine,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/backend_registry.py", line 88, in unique_create_executor
engine = create_py_executor_by_config(executor_config.backend,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/backend_registry.py", line 59, in create_py_executor_by_config
py_executor = backend_registry[name].func(executor_config, checkpoint_dir,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/pytorch_model_registry.py", line 108, in create_pytorch_model_based_executor
kv_cache_max_tokens = estimate_max_kv_cache_tokens(model_engine,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/_util.py", line 93, in estimate_max_kv_cache_tokens
model_engine.forward(req, resource_manager)
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py", line 943, in forward
return self._forward_step(inputs, gather_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py", line 980, in _forward_step
logits = self.model.forward(**inputs,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_utils.py", line 187, in forward
hidden_states = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_qwen.py", line 178, in forward
hidden_states, residual = decoder_layer(position_ids=position_ids,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_qwen.py", line 113, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/attention.py", line 104, in forward
qkv = self.qkv_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/linear.py", line 313, in forward
output = self.apply_linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/linear.py", line 256, in apply_linear
output = torch.ops.trtllm.cublas_scaled_mm(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1123, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: bias is not support yet
additional notes
None
The text was updated successfully, but these errors were encountered:
System Info
Who can help?
No response
Information
Tasks
examples
folder (such as GLUE/SQuAD, ...)

Reproduction
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
cd TensorRT-Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model <model_path> --quant fp8 --export_fmt hf,得到量化后的模型
使用以下脚本调用pytorch backend的generate:
import argparse, time
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, QuantoConfig
def parse_arguments(argv=None):
    """Parse the command-line options of the reproduction script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behavior of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``model_dir``, ``tp_size``,
        ``enable_overlap_scheduler``, ``enable_chunked_prefill`` and
        ``kv_cache_dtype``.
    """
    parser = argparse.ArgumentParser()
    # The default is a placeholder string, not a real path; callers are
    # expected to pass --model_dir explicitly.
    parser.add_argument('--model_dir',
                        type=str,
                        default='the quantized fp8 model dir')
    parser.add_argument('--tp_size', type=int, default=1)
    parser.add_argument('--enable_overlap_scheduler',
                        default=False,
                        action='store_true')
    parser.add_argument('--enable_chunked_prefill',
                        default=False,
                        action='store_true')
    parser.add_argument('--kv_cache_dtype', type=str, default='auto')
    return parser.parse_args(argv)
def main():
    """Entry point of the reproduction script.

    Only the argument-parsing step is present in this excerpt.
    NOTE(review): the reported failure occurs while the executor is being
    created, so the full script presumably goes on to build an LLM from
    ``args`` and call generate -- that part is not shown here; confirm
    against the original report.
    """
    args = parse_arguments()
# Run the reproduction only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Expected behavior
正确输出生成文本
actual behavior
运行报错:
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 1568, in workers_main
executor = worker_cls(engine, executor_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 817, in __init__
self.engine = _create_engine()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor.py", line 811, in _create_engine
return unique_create_executor(engine,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/backend_registry.py", line 88, in unique_create_executor
engine = create_py_executor_by_config(executor_config.backend,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/backend_registry.py", line 59, in create_py_executor_by_config
py_executor = backend_registry[name].func(executor_config, checkpoint_dir,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/pytorch_model_registry.py", line 108, in create_pytorch_model_based_executor
kv_cache_max_tokens = estimate_max_kv_cache_tokens(model_engine,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/backend_registries/_util.py", line 93, in estimate_max_kv_cache_tokens
model_engine.forward(req, resource_manager)
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py", line 943, in forward
return self._forward_step(inputs, gather_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py", line 980, in _forward_step
logits = self.model.forward(**inputs,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_utils.py", line 187, in forward
hidden_states = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_qwen.py", line 178, in forward
hidden_states, residual = decoder_layer(position_ids=position_ids,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/models/modeling_qwen.py", line 113, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/attention.py", line 104, in forward
qkv = self.qkv_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/linear.py", line 313, in forward
output = self.apply_linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/tensorrt_llm/_torch/modules/linear.py", line 256, in apply_linear
output = torch.ops.trtllm.cublas_scaled_mm(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1123, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: bias is not support yet
additional notes
None
The text was updated successfully, but these errors were encountered: