Install the support_peft branch of vLLM from source:

git clone --branch support_peft https://github.com/SuperBruceJia/vllm.git
cd vllm
pip install -e . --user
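If you want to confirm that the editable install picked up the fork, a quick check from a Python shell (the exact version string depends on the checked-out branch) is:

import vllm
print(vllm.__version__)  # version reported by the installed fork
print(vllm.__file__)     # should point inside the cloned vllm/ directory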
Please note that the following is just a demo of running inference with a LoRA adapter on top of the base model:
from vllm import LLM, SamplingParams
from vllm.model_executor.adapters import lora
from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel  # only needed if you free the GPU afterwards (see below)

def stop_token_list():
    # Strings that end generation as soon as the model emits them
    stop_tokens = ["Question:",
                   "Question",
                   "USER:",
                   "USER",
                   "ASSISTANT:",
                   "ASSISTANT",
                   "Instruction:",
                   "Instruction",
                   "Response:",
                   "Response"]
    return stop_tokens

stop_tokens = stop_token_list()
# temperature=0.0 makes decoding greedy (deterministic)
sampling_params = SamplingParams(temperature=0.0, top_p=1, max_tokens=128, stop=stop_tokens)

# Load the base model
llm = LLM(model="meta-llama/Llama-2-7b-hf", load_format="auto", tensor_parallel_size=1, gpu_memory_utilization=0.90)

# Attach the LoRA adapter to the worker's model ('/adapter' is the adapter's saved path)
lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, '/adapter')

prompts = ["Hello World", "Hello Python"]
completions = llm.generate(prompts, sampling_params)
for output in completions:
    gens = output.outputs[0].text
    print(gens, '\n')
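destroy_model_parallel is imported above but never called in the demo. If you want to delete the model and reclaim GPU memory in the same process once generation is done (for example, to load another checkpoint), a minimal cleanup sketch along these lines should work; it assumes torch is installed and that the fork keeps its tensor-parallel state in the parallel_state module imported above:

import gc
import torch

destroy_model_parallel()    # tear down vLLM's tensor-parallel groups
del llm                     # drop the engine and its weights
gc.collect()                # release Python-side references
torch.cuda.empty_cache()    # return cached GPU memory to the driver
if torch.distributed.is_initialized():
    torch.distributed.destroy_process_group()

After this, the same process should be able to initialize a fresh LLM without running out of GPU memory.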