local-llm-router

Skill from hoodini/ai-agents-skills
What it does

Routes AI coding queries to local language models in air-gapped networks, enabling secure and offline code development with intelligent model selection.

Part of hoodini/ai-agents-skills (24 items)


Installation

Install Python package (pip):
pip install serena

Run with uvx (Python):
uvx --from git+https://github.com/oraios/serena serena start-mcp-server
Extracted from docs: hoodini/ai-agents-skills. Added Feb 4, 2026.

Skill Details

SKILL.md

Route AI coding queries to local LLMs in air-gapped networks. Integrates Serena MCP for semantic code understanding. Use when working offline, with local models (Ollama, LM Studio, Jan, OpenWebUI), or in secure/closed environments. Triggers on local LLM, Ollama, LM Studio, Jan, air-gapped, offline AI, Serena, local inference, closed network, model routing, defense network, secure coding.

Overview

# Local LLM Router for Air-Gapped Networks

Intelligent routing of AI coding queries to local LLMs with Serena LSP integration for secure, offline-capable development environments.

Prerequisites (CRITICAL)

Before using this skill, ensure:

  1. Serena MCP Server installed and running (PRIMARY TOOL)
  2. At least one local LLM service running (Ollama, LM Studio, Jan, etc.)

```bash
# Install Serena (required)
pip install serena

# Or via uvx
uvx --from git+https://github.com/oraios/serena serena start-mcp-server

# Verify local LLM services
curl http://localhost:11434/api/version  # Ollama
curl http://localhost:1234/v1/models     # LM Studio
curl http://localhost:1337/v1/models     # Jan
```

Quick Start

```python
import asyncio
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import httpx


class TaskCategory(Enum):
    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"


@dataclass
class RouterConfig:
    """Local LLM Router configuration."""
    ollama_url: str = "http://localhost:11434"
    lmstudio_url: str = "http://localhost:1234"
    jan_url: str = "http://localhost:1337"
    serena_enabled: bool = True
    timeout: int = 30


# NOTE: discover_services / select_model / execute_query are thin wrappers around
# the classes defined in the later sections of this skill.
async def quick_route(query: str, config: RouterConfig = RouterConfig()):
    """Quick routing example - detects services and routes query."""
    # 1. Detect available services (see the Service Discovery section below)
    services = await discover_services(config)
    if not services:
        raise RuntimeError("No local LLM services available")

    # 2. Classify task (see the Task Classification section below)
    category = classify_task(query)

    # 3. Select best model for task (see the Model Selection section below)
    model = select_model(category, services)

    # 4. Execute query
    return await execute_query(query, model, services[0])


# Example usage
async def main():
    response = await quick_route("Write a function to parse JSON safely")
    print(response)

asyncio.run(main())
```

Serena Integration (PRIMARY TOOL)

CRITICAL: Serena MCP MUST be invoked FIRST for all code-related tasks. This provides semantic understanding of the codebase before routing to an LLM.

Why Serena First?

  1. Token Efficiency: Serena extracts only relevant code context
  2. Accuracy: Symbol-level operations vs grep-style searches
  3. Codebase Awareness: Understands types, references, call hierarchies
  4. Edit Precision: Applies changes at symbol level, not string matching

Serena MCP Setup

```python
import json
import subprocess
from typing import Any


class SerenaMCP:
    """Serena MCP client for code intelligence."""

    def __init__(self, workspace_root: str):
        self.workspace = workspace_root
        self.process = None

    async def start(self):
        """Start Serena MCP server."""
        self.process = subprocess.Popen(
            ["serena", "start-mcp-server", "--workspace", self.workspace],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )

    async def call(self, method: str, params: dict) -> Any:
        """Call Serena MCP method."""
        request = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": method,
            "params": params
        }
        self.process.stdin.write(json.dumps(request).encode() + b"\n")
        self.process.stdin.flush()
        response = self.process.stdout.readline()
        return json.loads(response)

    async def find_symbol(self, name: str) -> dict:
        """Find symbol definition by name."""
        return await self.call("find_symbol", {"name": name})

    async def get_references(self, file: str, line: int, char: int) -> list:
        """Get all references to symbol at position."""
        return await self.call("get_references", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_hover_info(self, file: str, line: int, char: int) -> dict:
        """Get type/documentation info at position."""
        return await self.call("get_hover_info", {
            "file": file,
            "line": line,
            "character": char
        })

    async def get_diagnostics(self, file: str) -> list:
        """Get errors/warnings for file."""
        return await self.call("get_diagnostics", {"file": file})

    async def apply_edit(self, file: str, edits: list) -> bool:
        """Apply code edits to file."""
        return await self.call("apply_edit", {"file": file, "edits": edits})


# Serena tools by priority (always use higher priority first)
SERENA_TOOLS = {
    # Priority 1: Symbol-level operations (highest)
    "find_symbol": {"priority": 1, "use_for": ["navigation", "definition"]},
    "get_references": {"priority": 1, "use_for": ["refactoring", "impact analysis"]},
    "get_hover_info": {"priority": 1, "use_for": ["type info", "documentation"]},
    # Priority 2: Code navigation
    "go_to_definition": {"priority": 2, "use_for": ["navigation"]},
    "go_to_type_definition": {"priority": 2, "use_for": ["type navigation"]},
    "go_to_implementation": {"priority": 2, "use_for": ["interface impl"]},
    # Priority 3: Code understanding
    "get_document_symbols": {"priority": 3, "use_for": ["file structure"]},
    "get_workspace_symbols": {"priority": 3, "use_for": ["codebase search"]},
    "get_call_hierarchy": {"priority": 3, "use_for": ["call analysis"]},
    # Priority 4: Code modification
    "apply_edit": {"priority": 4, "use_for": ["editing"]},
    "rename_symbol": {"priority": 4, "use_for": ["refactoring"]},
    # Priority 5: Diagnostics
    "get_diagnostics": {"priority": 5, "use_for": ["errors", "warnings"]},
    "get_code_actions": {"priority": 5, "use_for": ["quick fixes"]},
}
```
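
A minimal usage sketch of the client above. It assumes Serena is installed and the workspace path exists; the call shapes follow the `SerenaMCP` wrapper as defined here, not a verified Serena API.

```python
# Illustrative only: method and parameter names follow the SerenaMCP wrapper above.
import asyncio

async def demo():
    serena = SerenaMCP(workspace_root="/path/to/project")  # hypothetical workspace path
    await serena.start()
    symbol = await serena.find_symbol("parse_config")      # hypothetical symbol name
    print(symbol)

asyncio.run(demo())
```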

Serena-First Request Handler

```python
from typing import Optional


async def handle_code_request(
    query: str,
    file_context: Optional[dict] = None,
    serena: SerenaMCP = None,
    router: "LLMRouter" = None
):
    """
    Handle code request with Serena-first pattern.

    CRITICAL: Serena is ALWAYS invoked first for code tasks.
    """
    # Step 1: Classify the task
    category = classify_task(query)

    # Step 2: ALWAYS use Serena for code context (if available)
    serena_context = {}
    if serena and file_context:
        # Gather semantic context from Serena
        if file_context.get("file") and file_context.get("position"):
            file = file_context["file"]
            line = file_context["position"]["line"]
            char = file_context["position"]["character"]

            # Get hover info (type, docs)
            serena_context["hover"] = await serena.get_hover_info(file, line, char)

            # For refactoring/analysis, get references
            if category in [TaskCategory.ANALYSIS, TaskCategory.CODING]:
                if "refactor" in query.lower() or "rename" in query.lower():
                    serena_context["references"] = await serena.get_references(
                        file, line, char
                    )

            # Always get diagnostics for the file
            serena_context["diagnostics"] = await serena.get_diagnostics(file)

    # Step 3: Build enriched prompt with Serena context
    enriched_query = build_enriched_query(query, serena_context)

    # Step 4: Select and route to appropriate LLM
    model = router.select_model(category)
    response = await router.execute(enriched_query, model)

    # Step 5: If response contains edits, apply via Serena
    if serena and contains_code_edit(response):
        edits = parse_code_edits(response)
        await serena.apply_edit(file_context["file"], edits)

    return response


def build_enriched_query(query: str, serena_context: dict) -> str:
    """Build query enriched with Serena context."""
    parts = [query]

    if serena_context.get("hover"):
        hover = serena_context["hover"]
        parts.append(f"\n## Type Information\n``\n{hover}\n``")

    if serena_context.get("references"):
        refs = serena_context["references"]
        parts.append(f"\n## References ({len(refs)} found)\n")
        for ref in refs[:10]:  # Limit to first 10
            parts.append(f"- {ref['file']}:{ref['line']}")

    if serena_context.get("diagnostics"):
        diags = serena_context["diagnostics"]
        if diags:
            parts.append(f"\n## Current Issues ({len(diags)})\n")
            for diag in diags[:5]:
                parts.append(f"- Line {diag['line']}: {diag['message']}")

    return "\n".join(parts)
```
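
For illustration, a quick check of what `build_enriched_query` produces when fed hand-made (hypothetical) Serena results:

```python
# Hypothetical hover/diagnostic payloads, just to show the enriched prompt shape.
ctx = {
    "hover": "def parse_json(raw: str) -> dict",
    "diagnostics": [{"line": 12, "message": "Unused variable 'tmp'"}],
}
print(build_enriched_query("Add error handling to parse_json", ctx))
```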

Service Discovery

Supported Services

| Service | Default Endpoint | Health Check | Models Endpoint | Chat Endpoint | API Style |
|---------|-----------------|--------------|-----------------|---------------|-----------|
| Ollama | localhost:11434 | /api/version | /api/tags | /api/chat | Native |
| LM Studio | localhost:1234 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| Jan | localhost:1337 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| OpenWebUI | localhost:3000 | /api/health | /api/models | /api/chat | Custom |
| LocalAI | localhost:8080 | /readyz | /v1/models | /v1/chat/completions | OpenAI |
| vLLM | localhost:8000 | /health | /v1/models | /v1/chat/completions | OpenAI |
| llama.cpp | localhost:8080 | /health | /v1/models | /v1/chat/completions | OpenAI |
| Kobold.cpp | localhost:5001 | /api/v1/info | /api/v1/models | /api/v1/generate | Custom |
| GPT4All | localhost:4891 | /v1/models | /v1/models | /v1/chat/completions | OpenAI |
| text-generation-webui | localhost:5000 | /api/v1/model | /api/v1/models | /api/v1/chat | Custom |
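
A minimal probe of a few of the default endpoints from the table above (ports and health paths as listed; adjust for your own setup). This is a sketch, not the full discovery class that follows.

```python
import asyncio
import httpx

# Health-check URLs taken from the table above.
HEALTH_URLS = {
    "Ollama": "http://localhost:11434/api/version",
    "LM Studio": "http://localhost:1234/v1/models",
    "Jan": "http://localhost:1337/v1/models",
}

async def probe():
    async with httpx.AsyncClient(timeout=2.0) as client:
        for name, url in HEALTH_URLS.items():
            try:
                r = await client.get(url)
                print(f"{name}: {'online' if r.status_code == 200 else 'offline'}")
            except httpx.HTTPError:
                print(f"{name}: offline")

asyncio.run(probe())
```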

OS Detection

```python
import os
import platform
import sys
from dataclasses import dataclass


@dataclass
class OSInfo:
    platform: str      # 'windows', 'linux', 'darwin'
    release: str
    arch: str          # 'x64', 'arm64'
    is_wsl: bool
    is_container: bool


def detect_os() -> OSInfo:
    """Detect operating system and environment."""
    plat = sys.platform

    # Normalize platform name
    if plat == 'win32':
        plat = 'windows'
    elif plat == 'darwin':
        plat = 'darwin'
    else:
        plat = 'linux'

    # WSL detection
    is_wsl = False
    if plat == 'linux':
        try:
            with open('/proc/version', 'r') as f:
                is_wsl = 'microsoft' in f.read().lower()
        except FileNotFoundError:
            pass
        is_wsl = is_wsl or os.environ.get('WSL_DISTRO_NAME') is not None

    # Container detection
    is_container = (
        os.path.exists('/.dockerenv') or
        os.environ.get('KUBERNETES_SERVICE_HOST') is not None
    )
    if not is_container and plat == 'linux':
        try:
            with open('/proc/1/cgroup', 'r') as f:
                # Read once and check both markers
                cgroup = f.read()
                is_container = 'docker' in cgroup or 'kubepods' in cgroup
        except FileNotFoundError:
            pass

    return OSInfo(
        platform=plat,
        release=platform.release(),
        arch=platform.machine(),
        is_wsl=is_wsl,
        is_container=is_container
    )


def adjust_endpoint_for_os(endpoint: str, os_info: OSInfo) -> str:
    """Adjust endpoint based on OS environment."""
    if os_info.is_wsl or os_info.is_container:
        # In WSL/containers, localhost services are on the host
        return endpoint.replace('localhost', 'host.docker.internal')
    return endpoint
```
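
A quick sanity check using the helpers above (output varies by machine):

```python
info = detect_os()
print(info)

# Remap an Ollama endpoint when running inside WSL or a container.
print(adjust_endpoint_for_os("http://localhost:11434", info))
```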

Service Discovery Implementation

```python
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import httpx


@dataclass
class DiscoveredModel:
    id: str
    name: str
    size: int = 0
    family: Optional[str] = None
    context_length: int = 4096
    quantization: Optional[str] = None


@dataclass
class LLMService:
    name: str
    type: str                  # 'ollama', 'lmstudio', 'jan', 'openwebui', 'custom'
    endpoint: str
    status: str = 'unknown'    # 'online', 'offline', 'unknown'
    models: list = field(default_factory=list)
    last_checked: datetime = None
    api_style: str = 'openai'  # 'openai', 'native'
    # Endpoint paths
    health_path: str = '/v1/models'
    models_path: str = '/v1/models'
    chat_path: str = '/v1/chat/completions'


# Default service configurations
SERVICE_DEFAULTS = {
    'ollama': LLMService(
        name='Ollama', type='ollama', endpoint='http://localhost:11434',
        health_path='/api/version', models_path='/api/tags',
        chat_path='/api/chat', api_style='native'
    ),
    'lmstudio': LLMService(
        name='LM Studio', type='lmstudio', endpoint='http://localhost:1234',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
    'jan': LLMService(
        name='Jan', type='jan', endpoint='http://localhost:1337',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
    'openwebui': LLMService(
        name='Open WebUI', type='openwebui', endpoint='http://localhost:3000',
        health_path='/api/health', models_path='/api/models',
        chat_path='/api/chat', api_style='custom'
    ),
    'localai': LLMService(
        name='LocalAI', type='localai', endpoint='http://localhost:8080',
        health_path='/readyz', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
    'vllm': LLMService(
        name='vLLM', type='vllm', endpoint='http://localhost:8000',
        health_path='/health', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
    'llamacpp': LLMService(
        name='llama.cpp', type='llamacpp', endpoint='http://localhost:8080',
        health_path='/health', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
    'koboldcpp': LLMService(
        name='Kobold.cpp', type='koboldcpp', endpoint='http://localhost:5001',
        health_path='/api/v1/info', models_path='/api/v1/model',
        chat_path='/api/v1/generate', api_style='custom'
    ),
    'gpt4all': LLMService(
        name='GPT4All', type='gpt4all', endpoint='http://localhost:4891',
        health_path='/v1/models', models_path='/v1/models',
        chat_path='/v1/chat/completions', api_style='openai'
    ),
}


class ServiceDiscovery:
    """Discover and monitor local LLM services."""

    def __init__(self, custom_endpoints: list = None):
        self.services: dict[str, LLMService] = {}
        self.os_info = detect_os()
        self.custom_endpoints = custom_endpoints or []
        self._client = httpx.AsyncClient(timeout=5.0)

    async def discover_all(self) -> list[LLMService]:
        """Discover all available LLM services."""
        discovered = []

        # Check default services
        tasks = []
        for key, default in SERVICE_DEFAULTS.items():
            service = LLMService(
                name=default.name,
                type=default.type,
                endpoint=adjust_endpoint_for_os(default.endpoint, self.os_info),
                health_path=default.health_path,
                models_path=default.models_path,
                chat_path=default.chat_path,
                api_style=default.api_style
            )
            tasks.append(self._check_service(service))

        # Check custom endpoints
        for custom in self.custom_endpoints:
            service = LLMService(
                name=custom.get('name', 'Custom'),
                type='custom',
                endpoint=custom['endpoint'],
                health_path=custom.get('health_path', '/v1/models'),
                models_path=custom.get('models_path', '/v1/models'),
                chat_path=custom.get('chat_path', '/v1/chat/completions'),
                api_style=custom.get('api_style', 'openai')
            )
            tasks.append(self._check_service(service))

        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, LLMService) and result.status == 'online':
                discovered.append(result)
                self.services[result.type] = result

        return discovered

    async def _check_service(self, service: LLMService) -> LLMService:
        """Check if service is online and discover models."""
        try:
            # Health check
            response = await self._client.get(
                f"{service.endpoint}{service.health_path}"
            )
            if response.status_code == 200:
                service.status = 'online'
                service.last_checked = datetime.now()
                # Discover models
                service.models = await self._discover_models(service)
            else:
                service.status = 'offline'
        except (httpx.ConnectError, httpx.TimeoutException):
            service.status = 'offline'
        return service

    async def _discover_models(self, service: LLMService) -> list[DiscoveredModel]:
        """Discover available models on service."""
        try:
            response = await self._client.get(
                f"{service.endpoint}{service.models_path}"
            )
            data = response.json()

            # Parse based on service type
            if service.type == 'ollama':
                return [
                    DiscoveredModel(
                        id=m['name'],
                        name=m['name'],
                        size=m.get('size', 0),
                        family=m.get('details', {}).get('family'),
                        context_length=self._infer_context_length(m['name'])
                    )
                    for m in data.get('models', [])
                ]
            else:  # OpenAI-style
                return [
                    DiscoveredModel(
                        id=m['id'],
                        name=m['id'],
                        context_length=m.get('context_length', 4096)
                    )
                    for m in data.get('data', [])
                ]
        except Exception:
            return []

    def _infer_context_length(self, model_name: str) -> int:
        """Infer context length from model name."""
        name_lower = model_name.lower()

        # Check for explicit context markers
        if '128k' in name_lower or '131k' in name_lower:
            return 131072
        if '64k' in name_lower:
            return 65536
        if '32k' in name_lower:
            return 32768
        if '16k' in name_lower:
            return 16384

        # Model family defaults
        if 'qwen' in name_lower:
            return 131072  # Qwen models typically have 128K+
        if 'deepseek' in name_lower:
            return 128000
        if 'llama-3' in name_lower or 'llama3' in name_lower:
            return 128000
        if 'codellama' in name_lower:
            return 100000
        if 'mixtral' in name_lower:
            return 65536

        return 8192  # Safe default
```
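
End to end, discovery looks like this. A sketch: it assumes at least one of the services from the table above is running locally.

```python
import asyncio

async def list_local_services():
    discovery = ServiceDiscovery()
    services = await discovery.discover_all()
    for svc in services:
        names = [m.name for m in svc.models]
        print(f"{svc.name} ({svc.endpoint}): {len(names)} models -> {names[:3]}")

asyncio.run(list_local_services())
```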

Task Classification

Classification System

```python
import re
from dataclasses import dataclass
from enum import Enum


class TaskCategory(Enum):
    CODING = "coding"
    REASONING = "reasoning"
    ANALYSIS = "analysis"
    DOCUMENTATION = "documentation"


@dataclass
class ClassificationResult:
    category: TaskCategory
    confidence: float          # 0.0 - 1.0
    requires_serena: bool
    keywords_matched: list[str]


# Task patterns (regex)
TASK_PATTERNS = {
    TaskCategory.CODING: [
        r"(?:write|create|implement|code|generate)\s+(?:a\s+)?(?:function|class|method|component)",
        r"(?:fix|debug|solve)\s+(?:this|the)\s+(?:bug|error|issue)",
        r"refactor\s+(?:this|the)",
        r"add\s+(?:error\s+handling|validation|logging|tests?)",
        r"complete\s+(?:this|the)\s+code",
        r"(?:convert|translate)\s+(?:this|the)\s+code",
        r"(?:optimize|improve)\s+(?:this|the)\s+(?:function|code|performance)",
    ],
    TaskCategory.REASONING: [
        r"(?:design|architect|plan)\s+(?:a|the)\s+(?:system|architecture|solution)",
        r"how\s+should\s+(?:I|we)\s+(?:approach|structure|implement)",
        r"what\s+(?:is|would\s+be)\s+the\s+best\s+(?:way|approach|pattern)",
        r"explain\s+the\s+(?:logic|reasoning|algorithm)",
        r"compare\s+(?:and\s+contrast|between)",
        r"(?:recommend|suggest)\s+(?:an?\s+)?(?:approach|solution|pattern)",
        r"trade-?offs?\s+(?:between|of)",
    ],
    TaskCategory.ANALYSIS: [
        r"(?:review|analyze|audit)\s+(?:this|the)\s+code",
        r"find\s+(?:potential\s+)?(?:issues|vulnerabilities|bugs|problems)",
        r"(?:security|performance)\s+(?:review|analysis|audit)",
        r"what\s+(?:could|might)\s+go\s+wrong",
        r"identify\s+(?:problems|improvements|issues)",
        r"(?:check|scan)\s+for\s+(?:vulnerabilities|issues)",
    ],
    TaskCategory.DOCUMENTATION: [
        r"(?:write|create|generate)\s+(?:documentation|docs|docstring)",
        r"(?:add|write)\s+(?:comments|jsdoc|docstring|type\s+hints)",
        r"(?:document|explain)\s+(?:this|the)\s+(?:code|function|api)",
        r"(?:create|write)\s+(?:a\s+)?readme",
        r"(?:generate|write)\s+(?:api\s+)?documentation",
        r"describe\s+(?:what|how)\s+(?:this|the)",
    ],
}

# Keyword weights for scoring
KEYWORD_WEIGHTS = {
    # Coding
    "function": (TaskCategory.CODING, 0.3),
    "implement": (TaskCategory.CODING, 0.4),
    "code": (TaskCategory.CODING, 0.2),
    "debug": (TaskCategory.CODING, 0.5),
    "refactor": (TaskCategory.CODING, 0.6),
    "fix": (TaskCategory.CODING, 0.4),
    "test": (TaskCategory.CODING, 0.3),
    "bug": (TaskCategory.CODING, 0.5),
    # Reasoning
    "architecture": (TaskCategory.REASONING, 0.6),
    "design": (TaskCategory.REASONING, 0.4),
    "approach": (TaskCategory.REASONING, 0.3),
    "strategy": (TaskCategory.REASONING, 0.5),
    "tradeoff": (TaskCategory.REASONING, 0.5),
    "compare": (TaskCategory.REASONING, 0.4),
    "recommend": (TaskCategory.REASONING, 0.4),
    # Analysis
    "review": (TaskCategory.ANALYSIS, 0.5),
    "analyze": (TaskCategory.ANALYSIS, 0.6),
    "security": (TaskCategory.ANALYSIS, 0.4),
    "vulnerability": (TaskCategory.ANALYSIS, 0.7),
    "performance": (TaskCategory.ANALYSIS, 0.3),
    "audit": (TaskCategory.ANALYSIS, 0.6),
    # Documentation
    "document": (TaskCategory.DOCUMENTATION, 0.6),
    "readme": (TaskCategory.DOCUMENTATION, 0.8),
    "docstring": (TaskCategory.DOCUMENTATION, 0.8),
    "comment": (TaskCategory.DOCUMENTATION, 0.4),
    "explain": (TaskCategory.DOCUMENTATION, 0.3),
}


def classify_task(query: str) -> ClassificationResult:
    """Classify a query into a task category."""
    query_lower = query.lower()
    scores = {cat: 0.0 for cat in TaskCategory}
    matched_keywords = []

    # Pattern matching (weight: 0.5)
    for category, patterns in TASK_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, query_lower):
                scores[category] += 0.5

    # Keyword scoring (weight: 0.5)
    words = re.findall(r'\w+', query_lower)
    for word in words:
        if word in KEYWORD_WEIGHTS:
            category, weight = KEYWORD_WEIGHTS[word]
            scores[category] += weight * 0.5
            matched_keywords.append(word)

    # Find highest scoring category
    best_category = max(scores, key=scores.get)
    confidence = min(scores[best_category], 1.0)

    # Default to CODING if no clear match
    if confidence < 0.2:
        best_category = TaskCategory.CODING
        confidence = 0.5

    # Determine if Serena is required
    requires_serena = (
        best_category == TaskCategory.ANALYSIS or
        any(kw in query_lower for kw in [
            'definition', 'reference', 'symbol', 'rename',
            'where is', 'find all', 'go to', 'jump to'
        ])
    )

    return ClassificationResult(
        category=best_category,
        confidence=confidence,
        requires_serena=requires_serena,
        keywords_matched=matched_keywords
    )
```
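
A quick check of the classifier on a sample query:

```python
result = classify_task("Review this code for security vulnerabilities")
print(result.category)           # expected: TaskCategory.ANALYSIS
print(round(result.confidence, 2), result.requires_serena, result.keywords_matched)
```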

Model Selection

Model Capability Matrix

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelCapability:
    id: str
    family: str
    context_window: int
    vram_gb: float
    categories: list[TaskCategory]
    performance_scores: dict[TaskCategory, int]  # 0-100
    tier: int                                    # 1=best, 2=good, 3=basic
    quantization: Optional[str] = None


# Comprehensive model database (30+ models) - Updated January 2025
MODEL_DATABASE: dict[str, ModelCapability] = {
    # === CODING SPECIALISTS (Tier 1) ===
    "deepseek-v3": ModelCapability(
        id="deepseek-v3", family="deepseek", context_window=128000,
        vram_gb=48,  # MoE: 685B total, 37B active
        categories=[TaskCategory.CODING, TaskCategory.REASONING, TaskCategory.ANALYSIS],
        performance_scores={TaskCategory.CODING: 99, TaskCategory.REASONING: 97,
                            TaskCategory.ANALYSIS: 96, TaskCategory.DOCUMENTATION: 92},
        tier=1
    ),
    "qwen2.5-coder-32b": ModelCapability(
        id="qwen2.5-coder-32b", family="qwen", context_window=131072, vram_gb=22,
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS],
        performance_scores={TaskCategory.CODING: 96, TaskCategory.REASONING: 82,
                            TaskCategory.ANALYSIS: 92, TaskCategory.DOCUMENTATION: 88},
        tier=1
    ),
    "deepseek-coder-v2": ModelCapability(
        id="deepseek-coder-v2", family="deepseek", context_window=128000,
        vram_gb=48,  # MoE: 236B total, 21B active
        categories=[TaskCategory.CODING, TaskCategory.ANALYSIS, TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 95, TaskCategory.REASONING: 88,
                            TaskCategory.ANALYSIS: 92, TaskCategory.DOCUMENTATION: 80},
        tier=1
    ),
    "codellama-70b": ModelCapability(
        id="codellama-70b", family="llama", context_window=100000, vram_gb=40,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 90, TaskCategory.REASONING: 70,
                            TaskCategory.ANALYSIS: 85, TaskCategory.DOCUMENTATION: 75},
        tier=1
    ),
    "codellama-34b": ModelCapability(
        id="codellama-34b", family="llama", context_window=100000, vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 85, TaskCategory.REASONING: 65,
                            TaskCategory.ANALYSIS: 80, TaskCategory.DOCUMENTATION: 70},
        tier=2
    ),
    "qwen2.5-coder-14b": ModelCapability(
        id="qwen2.5-coder-14b", family="qwen", context_window=131072, vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 82, TaskCategory.REASONING: 60,
                            TaskCategory.ANALYSIS: 75, TaskCategory.DOCUMENTATION: 70},
        tier=2
    ),
    "starcoder2-15b": ModelCapability(
        id="starcoder2-15b", family="starcoder", context_window=16384, vram_gb=10,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 80, TaskCategory.REASONING: 50,
                            TaskCategory.ANALYSIS: 70, TaskCategory.DOCUMENTATION: 60},
        tier=2
    ),
    "deepseek-coder-6.7b": ModelCapability(
        id="deepseek-coder-6.7b", family="deepseek", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 75, TaskCategory.REASONING: 50,
                            TaskCategory.ANALYSIS: 65, TaskCategory.DOCUMENTATION: 55},
        tier=3
    ),
    "codellama-7b": ModelCapability(
        id="codellama-7b", family="llama", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 70, TaskCategory.REASONING: 45,
                            TaskCategory.ANALYSIS: 60, TaskCategory.DOCUMENTATION: 50},
        tier=3
    ),
    # === REASONING SPECIALISTS ===
    "deepseek-r1": ModelCapability(
        id="deepseek-r1", family="deepseek", context_window=128000,
        vram_gb=160,  # 671B total
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 92, TaskCategory.REASONING: 99,
                            TaskCategory.ANALYSIS: 95, TaskCategory.DOCUMENTATION: 90},
        tier=1
    ),
    "deepseek-r1-distill-70b": ModelCapability(
        id="deepseek-r1-distill-70b", family="deepseek", context_window=128000, vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 88, TaskCategory.REASONING: 94,
                            TaskCategory.ANALYSIS: 90, TaskCategory.DOCUMENTATION: 86},
        tier=1
    ),
    "qwen2.5-72b-instruct": ModelCapability(
        id="qwen2.5-72b-instruct", family="qwen", context_window=131072, vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 88, TaskCategory.REASONING: 95,
                            TaskCategory.ANALYSIS: 92, TaskCategory.DOCUMENTATION: 94},
        tier=1
    ),
    "llama-3.3-70b-instruct": ModelCapability(
        id="llama-3.3-70b-instruct", family="llama", context_window=128000, vram_gb=42,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 85, TaskCategory.REASONING: 92,
                            TaskCategory.ANALYSIS: 88, TaskCategory.DOCUMENTATION: 90},
        tier=1
    ),
    "deepseek-r1-distill-32b": ModelCapability(
        id="deepseek-r1-distill-32b", family="deepseek", context_window=128000, vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 82, TaskCategory.REASONING: 90,
                            TaskCategory.ANALYSIS: 85, TaskCategory.DOCUMENTATION: 82},
        tier=2
    ),
    "mistral-small-24b": ModelCapability(
        id="mistral-small-24b", family="mistral", context_window=32768, vram_gb=16,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 80, TaskCategory.REASONING: 85,
                            TaskCategory.ANALYSIS: 82, TaskCategory.DOCUMENTATION: 84},
        tier=2
    ),
    "qwen2.5-32b-instruct": ModelCapability(
        id="qwen2.5-32b-instruct", family="qwen", context_window=131072, vram_gb=22,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 78, TaskCategory.REASONING: 86,
                            TaskCategory.ANALYSIS: 82, TaskCategory.DOCUMENTATION: 88},
        tier=2
    ),
    "phi-4": ModelCapability(
        id="phi-4", family="phi", context_window=16384, vram_gb=10,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 82, TaskCategory.REASONING: 88,
                            TaskCategory.ANALYSIS: 80, TaskCategory.DOCUMENTATION: 78},
        tier=2
    ),
    "deepseek-r1-distill-14b": ModelCapability(
        id="deepseek-r1-distill-14b", family="deepseek", context_window=128000, vram_gb=10,
        categories=[TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 75, TaskCategory.REASONING: 85,
                            TaskCategory.ANALYSIS: 78, TaskCategory.DOCUMENTATION: 76},
        tier=2
    ),
    "llama-3.2-11b-vision": ModelCapability(
        id="llama-3.2-11b-vision", family="llama", context_window=128000, vram_gb=8,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 68, TaskCategory.REASONING: 78,
                            TaskCategory.ANALYSIS: 75, TaskCategory.DOCUMENTATION: 80},
        tier=2
    ),
    "gemma-2-27b": ModelCapability(
        id="gemma-2-27b", family="gemma", context_window=8192, vram_gb=18,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 78, TaskCategory.REASONING: 82,
                            TaskCategory.ANALYSIS: 78, TaskCategory.DOCUMENTATION: 80},
        tier=2
    ),
    "deepseek-r1-distill-8b": ModelCapability(
        id="deepseek-r1-distill-8b", family="deepseek", context_window=128000, vram_gb=6,
        categories=[TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 68, TaskCategory.REASONING: 78,
                            TaskCategory.ANALYSIS: 70, TaskCategory.DOCUMENTATION: 68},
        tier=3
    ),
    "gemma-2-9b": ModelCapability(
        id="gemma-2-9b", family="gemma", context_window=8192, vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 72, TaskCategory.REASONING: 75,
                            TaskCategory.ANALYSIS: 70, TaskCategory.DOCUMENTATION: 74},
        tier=3
    ),
    "llama-3.2-3b": ModelCapability(
        id="llama-3.2-3b", family="llama", context_window=128000, vram_gb=3,
        categories=[TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 55, TaskCategory.REASONING: 65,
                            TaskCategory.ANALYSIS: 58, TaskCategory.DOCUMENTATION: 65},
        tier=3
    ),
    # === ANALYSIS SPECIALISTS (Serena Required) ===
    "codellama-34b-instruct": ModelCapability(
        id="codellama-34b-instruct", family="llama", context_window=100000, vram_gb=20,
        categories=[TaskCategory.ANALYSIS],
        performance_scores={TaskCategory.CODING: 80, TaskCategory.REASONING: 70,
                            TaskCategory.ANALYSIS: 88, TaskCategory.DOCUMENTATION: 75},
        tier=2
    ),
    # === DOCUMENTATION SPECIALISTS ===
    "mistral-nemo-12b": ModelCapability(
        id="mistral-nemo-12b", family="mistral", context_window=128000, vram_gb=8,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 65, TaskCategory.REASONING: 70,
                            TaskCategory.ANALYSIS: 65, TaskCategory.DOCUMENTATION: 82},
        tier=2
    ),
    "mistral-7b": ModelCapability(
        id="mistral-7b", family="mistral", context_window=32768, vram_gb=5,
        categories=[TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 55, TaskCategory.REASONING: 60,
                            TaskCategory.ANALYSIS: 55, TaskCategory.DOCUMENTATION: 72},
        tier=3
    ),
    # === ADDITIONAL MODELS ===
    "phi-3-medium": ModelCapability(
        id="phi-3-medium", family="phi", context_window=128000, vram_gb=8,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 72, TaskCategory.REASONING: 75,
                            TaskCategory.ANALYSIS: 68, TaskCategory.DOCUMENTATION: 70},
        tier=2
    ),
    "yi-34b": ModelCapability(
        id="yi-34b", family="yi", context_window=200000, vram_gb=20,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 72, TaskCategory.REASONING: 82,
                            TaskCategory.ANALYSIS: 75, TaskCategory.DOCUMENTATION: 80},
        tier=2
    ),
    "command-r-plus": ModelCapability(
        id="command-r-plus", family="cohere", context_window=128000, vram_gb=48,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 70, TaskCategory.REASONING: 85,
                            TaskCategory.ANALYSIS: 78, TaskCategory.DOCUMENTATION: 88},
        tier=1
    ),
    "wizardcoder-33b": ModelCapability(
        id="wizardcoder-33b", family="wizard", context_window=16384, vram_gb=20,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 85, TaskCategory.REASONING: 60,
                            TaskCategory.ANALYSIS: 75, TaskCategory.DOCUMENTATION: 65},
        tier=2
    ),
    "magicoder-7b": ModelCapability(
        id="magicoder-7b", family="magicoder", context_window=16384, vram_gb=5,
        categories=[TaskCategory.CODING],
        performance_scores={TaskCategory.CODING: 78, TaskCategory.REASONING: 50,
                            TaskCategory.ANALYSIS: 65, TaskCategory.DOCUMENTATION: 55},
        tier=3
    ),
    "dolphin-mixtral-8x7b": ModelCapability(
        id="dolphin-mixtral-8x7b", family="dolphin", context_window=32768, vram_gb=28,
        categories=[TaskCategory.CODING, TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 75, TaskCategory.REASONING: 78,
                            TaskCategory.ANALYSIS: 72, TaskCategory.DOCUMENTATION: 75},
        tier=2
    ),
    "nous-hermes-2-mixtral": ModelCapability(
        id="nous-hermes-2-mixtral", family="nous", context_window=32768, vram_gb=28,
        categories=[TaskCategory.REASONING],
        performance_scores={TaskCategory.CODING: 72, TaskCategory.REASONING: 82,
                            TaskCategory.ANALYSIS: 75, TaskCategory.DOCUMENTATION: 78},
        tier=2
    ),
    "solar-10.7b": ModelCapability(
        id="solar-10.7b", family="solar", context_window=4096, vram_gb=7,
        categories=[TaskCategory.REASONING, TaskCategory.DOCUMENTATION],
        performance_scores={TaskCategory.CODING: 60, TaskCategory.REASONING: 72,
                            TaskCategory.ANALYSIS: 65, TaskCategory.DOCUMENTATION: 75},
        tier=3
    ),
}


# Task-to-model priority mapping (Updated January 2025)
TASK_MODEL_PRIORITY = {
    TaskCategory.CODING: [
        # Tier 1 - Best
        "deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2",
        # Tier 2 - Good
        "codellama-70b", "qwen2.5-coder-14b", "codellama-34b",
        "starcoder2-15b", "phi-4",
        # Tier 3 - Basic
        "qwen2.5-coder-7b", "codellama-7b", "deepseek-coder-6.7b"
    ],
    TaskCategory.REASONING: [
        # Tier 1 - Best
        "deepseek-r1", "deepseek-v3", "deepseek-r1-distill-70b",
        "qwen2.5-72b-instruct", "llama-3.3-70b-instruct",
        # Tier 2 - Good
        "deepseek-r1-distill-32b", "mistral-small-24b", "qwen2.5-32b-instruct",
        "phi-4", "gemma-2-27b",
        # Tier 3 - Basic
        "deepseek-r1-distill-14b", "deepseek-r1-distill-8b", "gemma-2-9b"
    ],
    TaskCategory.ANALYSIS: [
        # Requires Serena LSP
        "deepseek-v3", "qwen2.5-coder-32b", "deepseek-coder-v2",
        "codellama-34b-instruct", "qwen2.5-72b-instruct"
    ],
    TaskCategory.DOCUMENTATION: [
        "qwen2.5-72b-instruct", "llama-3.3-70b-instruct", "qwen2.5-32b-instruct",
        "mistral-small-24b", "mistral-nemo-12b", "gemma-2-27b"
    ],
}
```

Model Selection Logic

```python
from typing import Optional


class ModelSelector:
    """Select optimal model for task based on availability and requirements."""

    def __init__(self, available_models: list[str]):
        self.available = set(m.lower() for m in available_models)

    def select(
        self,
        category: TaskCategory,
        required_context: int = 0,
        max_vram_gb: Optional[float] = None
    ) -> Optional[str]:
        """Select best available model for task category."""
        # Get priority list for category
        priority_list = TASK_MODEL_PRIORITY.get(category, [])

        for model_id in priority_list:
            # Check if model is available
            if not self._is_available(model_id):
                continue

            # Check model capability
            capability = MODEL_DATABASE.get(model_id)
            if not capability:
                continue

            # Check context window requirement
            if required_context > 0 and capability.context_window < required_context:
                continue

            # Check VRAM constraint
            if max_vram_gb and capability.vram_gb > max_vram_gb:
                continue

            return model_id

        # Fallback: return any available model
        for model_id, capability in MODEL_DATABASE.items():
            if self._is_available(model_id):
                return model_id

        return None

    def _is_available(self, model_id: str) -> bool:
        """Check if model is available (fuzzy matching)."""
        model_lower = model_id.lower()

        # Exact match
        if model_lower in self.available:
            return True

        # Partial match (model name contained in available)
        for avail in self.available:
            if model_lower in avail or avail in model_lower:
                return True

        return False

    def get_fallback_models(self, category: TaskCategory) -> list[str]:
        """Get list of fallback models for category."""
        priority_list = TASK_MODEL_PRIORITY.get(category, [])
        available_in_priority = [
            m for m in priority_list if self._is_available(m)
        ]

        # Return tier 2 and 3 models as fallbacks
        fallbacks = []
        for model_id in available_in_priority:
            capability = MODEL_DATABASE.get(model_id)
            if capability and capability.tier >= 2:
                fallbacks.append(model_id)
        return fallbacks
```
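
For example, selecting a coding model under a 24 GB VRAM budget. The model names passed in are hypothetical examples of what a local install might report; fuzzy matching maps them to database entries.

```python
# Hypothetical locally installed model names.
selector = ModelSelector(["qwen2.5-coder-32b-instruct-q4", "codellama:7b", "mistral:7b"])
best = selector.select(TaskCategory.CODING, required_context=32000, max_vram_gb=24)
print(best)  # fuzzy matching should resolve to "qwen2.5-coder-32b" here
```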

Context Management

Token Counting

```python
import re
from abc import ABC, abstractmethod


class TokenCounter(ABC):
    """Base class for token counting."""

    @abstractmethod
    def count(self, text: str) -> int:
        pass


class EstimationCounter(TokenCounter):
    """Estimation-based token counter (no external dependencies)."""

    def __init__(self, chars_per_token: float = 4.0):
        self.chars_per_token = chars_per_token

    def count(self, text: str) -> int:
        return int(len(text) / self.chars_per_token)


class QwenCounter(TokenCounter):
    """Token counter for Qwen models."""

    def count(self, text: str) -> int:
        # Qwen uses slightly different tokenization
        return int(len(text) / 3.5)


class LlamaCounter(TokenCounter):
    """Token counter for Llama models."""

    def count(self, text: str) -> int:
        # Llama uses SentencePiece
        return int(len(text) / 3.8)


# Model family to counter mapping
TOKEN_COUNTERS = {
    "qwen": QwenCounter(),
    "deepseek": EstimationCounter(4.0),
    "llama": LlamaCounter(),
    "mistral": EstimationCounter(4.0),
    "mixtral": EstimationCounter(4.0),
    "default": EstimationCounter(4.0),
}


def get_token_counter(model_id: str) -> TokenCounter:
    """Get appropriate token counter for model."""
    capability = MODEL_DATABASE.get(model_id)
    if capability:
        return TOKEN_COUNTERS.get(capability.family, TOKEN_COUNTERS["default"])
    return TOKEN_COUNTERS["default"]
```
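
These counters are character-ratio estimates, not real tokenizers; usage looks like this:

```python
counter = get_token_counter("qwen2.5-coder-32b")
prompt = "Explain how the router selects a model for a refactoring task."
print(counter.count(prompt))  # rough estimate: len(prompt) / 3.5 characters per token
```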

Context Manager

```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional


@dataclass
class Message:
    role: str  # 'system', 'user', 'assistant', 'tool'
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    token_count: int = 0
    metadata: dict = field(default_factory=dict)


@dataclass
class ConversationContext:
    session_id: str
    messages: list[Message] = field(default_factory=list)
    total_tokens: int = 0
    system_prompt: str = ""
    system_prompt_tokens: int = 0
    active_model: str = ""
    model_history: list[str] = field(default_factory=list)
    compaction_count: int = 0


class ContextManager:
    """Manage conversation context with compaction support."""

    def __init__(
        self,
        session_id: str,
        system_prompt: str = "",
        compaction_threshold: float = 0.8,  # 80% of context window
        compaction_target: float = 0.5,     # Compact to 50%
        preserve_recent: int = 10           # Keep last N messages
    ):
        self.context = ConversationContext(
            session_id=session_id,
            system_prompt=system_prompt
        )
        self.compaction_threshold = compaction_threshold
        self.compaction_target = compaction_target
        self.preserve_recent = preserve_recent
        self._counter: Optional[TokenCounter] = None

    def set_model(self, model_id: str):
        """Set active model and update token counter."""
        if self.context.active_model:
            self.context.model_history.append(self.context.active_model)
        self.context.active_model = model_id
        self._counter = get_token_counter(model_id)
        # Recount all tokens with new counter
        self._recount_tokens()

    def add_message(self, role: str, content: str, metadata: dict = None):
        """Add message to context."""
        token_count = self._counter.count(content) if self._counter else 0
        message = Message(
            role=role,
            content=content,
            token_count=token_count,
            metadata=metadata or {}
        )
        self.context.messages.append(message)
        self.context.total_tokens += token_count

    def check_and_compact(self, max_tokens: int) -> bool:
        """Check if compaction needed and perform if so."""
        threshold = int(max_tokens * self.compaction_threshold)
        if self.context.total_tokens > threshold:
            self._compact(max_tokens)
            return True
        return False

    def _compact(self, max_tokens: int):
        """Compact context to target size."""
        target = int(max_tokens * self.compaction_target)

        # Step 1: Truncate large tool outputs
        for msg in self.context.messages:
            if msg.role == 'tool' and msg.token_count > 500:
                original = msg.token_count
                msg.content = f"[Tool output truncated - {msg.metadata.get('tool_name', 'unknown')}]"
                msg.token_count = self._counter.count(msg.content)
                msg.metadata['truncated'] = True
                msg.metadata['original_tokens'] = original

        self._recalculate_total()
        if self.context.total_tokens <= target:
            return

        # Step 2: Summarize older messages
        if len(self.context.messages) > self.preserve_recent:
            older = self.context.messages[:-self.preserve_recent]
            recent = self.context.messages[-self.preserve_recent:]

            # Create summary of older messages
            summary = self._create_summary(older)
            summary_msg = Message(
                role='system',
                content=f"[Previous conversation summary]\n{summary}",
                token_count=self._counter.count(summary),
                metadata={'compacted': True}
            )
            self.context.messages = [summary_msg] + recent
            self.context.compaction_count += 1
            self._recalculate_total()

    def _create_summary(self, messages: list[Message]) -> str:
        """Create summary of messages (simple implementation)."""
        # In production, this would use a lightweight LLM
        key_points = []
        for msg in messages:
            if msg.role == 'user':
                # Extract first