Source code for models.devstral_local_model
"""
Script
------
devstral_local_model.py
Path
----
python/hillstar/models/devstral_local_model.py
Purpose
-------
LOCAL DEVSTRAL-SMALL-2 MODEL - OPTIONAL ADVANCED SETUP
Integrates Devstral-Small-2 via local llama.cpp HTTP server.
This is an OPTIONAL setup for power users with appropriate hardware.
Connects to llama.cpp server running on localhost:8080.
Uses OpenAI-compatible /v1/chat/completions endpoint (not Ollama API).
Free, local execution on GPU. The near-zero default temperature (0.00000073) keeps sampling close to deterministic, which helps reduce hallucination.
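The request body posted by call() follows the standard OpenAI chat format; the
payload below is an illustrative sketch only (prompt text and limits are
placeholders)::

    payload = {
        "model": "devstral",
        "messages": [{"role": "user", "content": "..."}],
        "temperature": 0.00000073,
        "max_tokens": 2048,
    }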
HARDWARE REQUIREMENTS (MANDATORY)
-----------------------------------
Minimum: 16GB VRAM GPU (RTX 4080, RTX 4090, A100, etc.)
Model: Quantized GGUF format (~14GB) from HuggingFace
Setup: Requires devstral_server.sh running on port 8080
NOT suitable for CPU-only systems
Setup Instructions
------------------
1. GPU required (16GB+ VRAM)
2. Download quantized GGUF model from HuggingFace
3. Update devstral_server.sh with model path
4. Start server: ~/bin/devstral_server.sh
5. Verify the server is reachable (see the sketch below), then use this model in workflows
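Once the server is up, a quick reachability check catches setup problems early;
this sketch probes the same /health route the class itself uses::

    import requests

    resp = requests.get("http://127.0.0.1:8080/health", timeout=2)
    print("server ready" if resp.status_code == 200 else "server not ready")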
Inputs
------
model_name (str): Model identifier (any value accepted by llama.cpp)
endpoint (str): llama.cpp server URL (default: http://127.0.0.1:8080)
Outputs
-------
Dictionary: {output, model, tokens_used, provider, error}
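For a successful call the dictionary looks like this (values are illustrative);
on failure, "output" is None and "error" carries the message instead::

    {
        "output": "def hello():\n    print('hello')",
        "model": "devstral",
        "tokens_used": 412,
        "provider": "devstral_local",
    }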
Assumptions
-----------
- llama.cpp server running on localhost:8080 (started via devstral_server.sh)
- Server exposes OpenAI-compatible /v1/chat/completions endpoint
- Local GPU with 16GB+ VRAM available
- Quantized GGUF model loaded in llama.cpp
Parameters
----------
temperature: Default 0.00000073
max_tokens: Configurable per call
system: Optional system prompt
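A typical invocation, assuming the server from the setup steps is running
(prompt, token budget, and system text are illustrative)::

    model = DevstralLocalModel(endpoint="http://127.0.0.1:8080")
    result = model.call(
        "Refactor this function to remove the nested loop.",
        max_tokens=512,
        temperature=0.1,
        system="You are a concise coding assistant.",
    )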
Failure Modes
-------------
- Server not running: error "llama.cpp server not responding"
- Insufficient VRAM: server crashes or OOM errors
- Model not loaded: server connection fails
- Timeout: requests.exceptions.Timeout
- Model file missing: server startup failure
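Continuing the example above: failures come back in the result dictionary
rather than as raised exceptions, so callers should branch on the "error" key
(handle_failure is a hypothetical placeholder)::

    if result.get("error"):
        handle_failure(result["error"])  # hypothetical fallback, e.g. a cloud provider
    else:
        print(result["output"])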
When NOT to Use This
--------------------
- No GPU, or GPU < 16GB VRAM: use Ollama cloud models instead
- Need reliability/uptime: use cloud API providers
- Learning/exploration: start with Ollama local models
Alternative: Use claude-ollama --model devstral-2:123b-cloud via Ollama
Compliance
----------
- Local execution (no external API calls)
- Free (no licensing costs)
- Optional: users must explicitly set it up
- Not included in the standard hillstar installation
Author: Julen Gamboa <julen.gamboa.ds@gmail.com>
Created
-------
2026-02-07
Last Edited
-----------
2026-02-14
Status
------
OPTIONAL ADVANCED SETUP
Users must explicitly configure and understand GPU requirements
"""
from __future__ import annotations
from typing import Any
import requests
class DevstralLocalModel:
    """LOCAL Devstral-Small-2 via llama.cpp (OpenAI-compatible API).

    OPTIONAL - Requires 16GB+ VRAM GPU and quantized GGUF model.
    """

    TEMPERATURE_DEFAULT = 0.00000073  # Minimize hallucination
    def __init__(
        self,
        model_name: str = "devstral",
        endpoint: str = "http://127.0.0.1:8080",
    ):
        """
        Args:
            model_name: Model identifier (llama.cpp accepts any value)
            endpoint: llama.cpp server endpoint (OpenAI-compatible)

        Warning:
            Requires 16GB+ VRAM GPU and running devstral_server.sh
        """
        self.model_name = model_name
        self.endpoint = endpoint
        self.api_url = f"{endpoint}/v1/chat/completions"
    def _check_server(self) -> bool:
        """Check if llama.cpp server is running via /health endpoint."""
        try:
            response = requests.get(f"{self.endpoint}/health", timeout=2)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False
    def call(
        self,
        prompt: str,
        max_tokens: int = 2048,
        temperature: float | None = None,
        system: str | None = None,
    ) -> dict[str, Any]:
        """
        Call Devstral via llama.cpp OpenAI-compatible chat completions endpoint.

        Args:
            prompt: User message content
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (default: 0.00000073)
            system: System prompt

        Returns:
            Dictionary with response and metadata

        Note:
            Requires devstral_server.sh running on localhost:8080
        """
        if temperature is None:
            temperature = self.TEMPERATURE_DEFAULT

        if not self._check_server():
            return {
                "output": None,
                "error": (
                    f"llama.cpp server not responding at {self.endpoint}. "
                    "Start with: ~/bin/devstral_server.sh "
                    "(requires 16GB+ VRAM GPU and quantized GGUF model)"
                ),
                "provider": "devstral_local",
            }

        messages: list[dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        try:
            payload = {
                "model": self.model_name,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
            }
            response = requests.post(self.api_url, json=payload, timeout=120)
            response.raise_for_status()
            data = response.json()

            content = (
                data.get("choices", [{}])[0]
                .get("message", {})
                .get("content", "")
                .strip()
            )
            usage = data.get("usage", {})

            return {
                "output": content,
                "model": self.model_name,
                "tokens_used": usage.get("total_tokens", 0),
                "provider": "devstral_local",
            }
        except Exception as e:
            return {
                "output": None,
                "error": str(e),
                "provider": "devstral_local",
            }
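

# Minimal manual check (sketch, not part of the workflow API): run this module
# directly once devstral_server.sh is serving on localhost:8080. The prompt and
# token budget below are illustrative.
if __name__ == "__main__":
    demo = DevstralLocalModel()
    result = demo.call("Reply with the single word: ready", max_tokens=16)
    if result.get("error"):
        print(f"Devstral local check failed: {result['error']}")
    else:
        print(f"Devstral local check passed: {result['output']}")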