LLM Laboratory

Date: 29.09.2025

Mistral 7B Bad CUDA PyTorch Test

Test environment

My test environment: HP Z440 + NVIDIA Tesla P100

Steps

Prepare a Python environment for CUDA:

mkdir -p ~/llm && cd ~/llm
python3 -m venv .venv_llm_mistral
source ./.venv_llm_mistral/bin/activate
python -m pip install --upgrade pip
pip install "torch==2.5.0" "torchvision==0.20.0" "torchaudio==2.5.0" --index-url https://download.pytorch.org/whl/cu124
pip install transformers accelerate
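
Before pulling several gigabytes of weights, it can be worth a quick sanity check that this PyTorch build actually sees the Tesla P100. A minimal one-liner (the exact device name string depends on the card variant):

python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0))"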

Get the Mistral models

git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-v0.1 mistral
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 mistral-instruct
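
If git-lfs is not an option, the same snapshots can be fetched with huggingface_hub (installed as a dependency of transformers). A hedged sketch; if the repositories are gated, logging in with a Hugging Face token (huggingface-cli login) may be required first:

from huggingface_hub import snapshot_download

# Download both model snapshots into the same directories the git clones would use
snapshot_download(repo_id="mistralai/Mistral-7B-v0.1", local_dir="mistral")
snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.1", local_dir="mistral-instruct")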

Create script test_cuda_mistral.py:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Confirm PyTorch can see the GPU before loading the model
print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

# Path to the locally cloned base model
model_path = "/home/sysadmin/llm/mistral"

# Load the weights in float16 and move the whole model onto the GPU
tokenizer = AutoTokenizer.from_pretrained(model_path)
model     = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
).to("cuda")

inputs = tokenizer("Hello!", return_tensors="pt").to("cuda")

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Run it:

python test_cuda_mistral.py
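
With these defaults generate() does greedy decoding, so the base model returns the same continuation every run. A hedged variation that enables sampling (the parameter values below are only illustrative):

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
    do_sample=True,     # sample instead of always taking the most likely token
    temperature=0.7,    # illustrative value, tune to taste
    top_p=0.9,          # illustrative value
)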

Create script test_cuda_mistral_chat_mode.py:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

model_path = "/home/sysadmin/llm/mistral"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model     = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
).to("cuda")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
]

# Build a simple "role: content" prompt by hand and end with an
# assistant turn, since this is the plain base model
parts = []
for m in messages:
    role = m.get("role", "user")
    content = m.get("content", "")
    parts.append(f"{role}: {content}")
prompt = "\n".join(parts) + "\nassistant:"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Run it:

python test_cuda_mistral_chat_mode.py
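
Since this is the plain base model, the hand-built prompt above is only a workaround and the output tends to continue the text rather than answer like an assistant. A hedged sketch for choosing the prompt format depending on whether the tokenizer ships a chat template:

# Prefer the tokenizer's chat template if one is defined; otherwise fall back
# to the manual "role: content" formatting used above.
if getattr(tokenizer, "chat_template", None):
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
else:
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"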

Create script test_cuda_mistral_instruct.py:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

model_path = "/home/sysadmin/llm/mistral-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model     = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
).to("cuda")

inputs = tokenizer("Hello!", return_tensors="pt").to("cuda")

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Run it:

python test_cuda_mistral_instruct.py
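
The Instruct model was fine-tuned on prompts wrapped in [INST] ... [/INST] tags, so a bare "Hello!" works but usually gets a better reply with the instruction format. A hedged sketch of wrapping the message by hand (the next script does the same thing via the tokenizer's chat template):

# Wrap the user message in Mistral's instruction tags by hand;
# the tokenizer normally adds the leading BOS token itself.
user_message = "Hello!"
prompt = f"[INST] {user_message} [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")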

Create script test_cuda_mistral_instruct_chat_mode.py:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))

model_path = "/home/sysadmin/llm/mistral-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model     = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
).to("cuda")

# Note: the Mistral Instruct v0.1 chat template expects strictly alternating
# user/assistant turns and may reject a separate "system" role, so only a
# user message is passed here.
messages = [
    {"role": "user", "content": "Hello!"}
]

# Let the tokenizer format the conversation with the model's own chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Run it:

python test_cuda_mistral_instruct_chat_mode.py
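
To watch the reply appear token by token instead of waiting for the whole output, generate() can be given a streamer. A minimal sketch using transformers' TextStreamer:

from transformers import TextStreamer

# Print tokens to stdout as they are generated, skipping the echoed prompt
streamer = TextStreamer(tokenizer, skip_prompt=True)

output_ids = model.generate(
    **inputs,
    max_new_tokens=160,
    streamer=streamer,
)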

Enjoy the result!