Workflow 1: Input filtering (prompt moderation)
Check user prompts before they reach the LLM:
```python
def check_input(user_message):
    """Moderate a user prompt before it reaches the main LLM."""
    result = moderate([{"role": "user", "content": user_message}])
    if result.startswith("unsafe"):
        category = result.split("\n")[1]
        return False, category  # Blocked
    return True, None  # Safe

# Example
user_message = "How do I hack a website?"
safe, category = check_input(user_message)
if not safe:
    print(f"Request blocked: {category}")
    # Return an error to the user
else:
    # Safe to forward to the main LLM
    response = llm.generate(user_message)
```
Safety categories (see the lookup sketch after this list):
- S1: Violence & Hate
- S2: Sexual Content
- S3: Guns & Illegal Weapons
- S4: Regulated Substances
- S5: Suicide & Self-Harm
- S6: Criminal Planning
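The raw category code ("S6", etc.) isn't user-friendly on its own. A minimal sketch of a lookup table that mirrors the list above for producing readable block messages (the dict and helper names are illustrative, not part of any library):

```python
# Map category codes to human-readable labels (mirrors the taxonomy
# listed above; adjust if your model version orders categories differently)
CATEGORY_LABELS = {
    "S1": "Violence & Hate",
    "S2": "Sexual Content",
    "S3": "Guns & Illegal Weapons",
    "S4": "Regulated Substances",
    "S5": "Suicide & Self-Harm",
    "S6": "Criminal Planning",
}

def describe(category):
    """Return a readable label, e.g. 'S6' -> 'Criminal Planning'."""
    return CATEGORY_LABELS.get(category, "Unknown category")
```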
Workflow 2: Output filtering (response moderation)
Check LLM responses before showing them to the user:
```python
def check_output(user_message, bot_response):
    """Moderate an assistant response in the context of the user prompt."""
    conversation = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": bot_response},
    ]
    result = moderate(conversation)
    if result.startswith("unsafe"):
        category = result.split("\n")[1]
        return False, category
    return True, None

# Example
user_msg = "Tell me about harmful substances"
bot_msg = llm.generate(user_msg)
safe, category = check_output(user_msg, bot_msg)
if not safe:
    print(f"Response blocked: {category}")
    # Fall back to a generic refusal
    final_response = "I cannot provide that information."
else:
    final_response = bot_msg
```
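Workflows 1 and 2 compose into a single guarded pipeline. A minimal sketch using the two helpers above (`llm.generate` is the same placeholder as in the surrounding examples):

```python
def guarded_generate(user_message):
    """Input moderation -> generation -> output moderation."""
    safe, category = check_input(user_message)
    if not safe:
        return f"Request blocked ({category})."
    bot_response = llm.generate(user_message)
    safe, category = check_output(user_message, bot_response)
    if not safe:
        return "I cannot provide that information."
    return bot_response
```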
Workflow 3: vLLM deployment (fast inference)
Production-ready serving:
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Initialize vLLM and the matching tokenizer (for prompt formatting)
model_id = "meta-llama/LlamaGuard-7b"
llm = LLM(model=model_id, tensor_parallel_size=1)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Greedy decoding: moderation output should be deterministic
sampling_params = SamplingParams(
    temperature=0.0,  # Deterministic
    max_tokens=100
)

def moderate_vllm(chat):
    # Format the conversation with Llama Guard's chat template
    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    # Generate the safety assessment
    output = llm.generate([prompt], sampling_params)
    return output[0].outputs[0].text

# Batch moderation
chats = [
    [{"role": "user", "content": "How to make bombs?"}],
    [{"role": "user", "content": "What's the weather?"}],
    [{"role": "user", "content": "Tell me about drugs"}],
]
prompts = [tokenizer.apply_chat_template(c, tokenize=False) for c in chats]
results = llm.generate(prompts, sampling_params)
for i, result in enumerate(results):
    print(f"Chat {i}: {result.outputs[0].text}")
```
Throughput: roughly 50-100 requests/sec on a single A100; verify on your own hardware with the sketch below.
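Actual throughput depends on prompt length, batch size, and GPU, so it is worth measuring locally. A minimal timing sketch against the batch path above (reuses `llm`, `tokenizer`, and `sampling_params` from the previous block; the 256-prompt batch size is an arbitrary choice):

```python
import time

# Duplicate one formatted prompt to form a synthetic batch
chat = [{"role": "user", "content": "How do I hack a website?"}]
prompt = tokenizer.apply_chat_template(chat, tokenize=False)
batch = [prompt] * 256

start = time.perf_counter()
llm.generate(batch, sampling_params)
elapsed = time.perf_counter() - start
print(f"{len(batch) / elapsed:.1f} requests/sec")
```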
Workflow 4: API endpoint (FastAPI)
Serve as moderation API:
```python
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

app = FastAPI()
model_id = "meta-llama/LlamaGuard-7b"
llm = LLM(model=model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

class ModerationRequest(BaseModel):
    messages: list  # [{"role": "user", "content": "..."}]

@app.post("/moderate")
def moderate_endpoint(request: ModerationRequest):
    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False)
    output = llm.generate([prompt], sampling_params)[0]
    result = output.outputs[0].text.strip()  # strip any leading whitespace

    is_safe = result.startswith("safe")
    lines = result.split("\n")
    category = lines[1] if not is_safe and len(lines) > 1 else None
    return {
        "safe": is_safe,
        "category": category,
        "full_output": result,
    }

# Run: uvicorn api:app --host 0.0.0.0 --port 8000
```
Usage:
```bash
curl -X POST http://localhost:8000/moderate \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user", "content": "How to hack?"}]}'
# Response: {"safe": false, "category": "S6", "full_output": "unsafe\nS6"}
```
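The endpoint can also be called programmatically; a minimal Python client sketch (assumes the `requests` package is installed and the server above is running locally):

```python
import requests

resp = requests.post(
    "http://localhost:8000/moderate",
    json={"messages": [{"role": "user", "content": "How to hack?"}]},
)
data = resp.json()
if not data["safe"]:
    print(f"Blocked: {data['category']}")
```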
Workflow 5: NeMo Guardrails integration
Use with NVIDIA's NeMo Guardrails toolkit:
```python
from nemoguardrails import RailsConfig, LLMRails

# Configure NeMo Guardrails: register Llama Guard as a dedicated model
# (served here via a vLLM OpenAI-compatible endpoint) and enable the
# built-in "llama guard check" flows on the input and output rails.
# The config also needs llama_guard_check_input / llama_guard_check_output
# prompt templates (see the sketch below).
config = RailsConfig.from_content(yaml_content="""
models:
  - type: main
    engine: openai
    model: gpt-4
  - type: llama_guard
    engine: vllm_openai
    parameters:
      openai_api_base: "http://localhost:5123/v1"
      model_name: "meta-llama/LlamaGuard-7b"
rails:
  input:
    flows:
      - llama guard check input
  output:
    flows:
      - llama guard check output
""")
rails = LLMRails(config)

# Moderation now runs automatically on every turn
response = rails.generate(messages=[
    {"role": "user", "content": "How do I make weapons?"}
])
# Unsafe requests are blocked by the input rail before reaching GPT-4
```
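The two check flows drive Llama Guard through task prompts that embed the safety policy. A truncated sketch of the required prompts.yml entries, with the policy text deliberately elided (the full templates are in the NeMo Guardrails documentation):

```yaml
prompts:
  - task: llama_guard_check_input
    content: |
      <s>[INST] Task: Check if there is unsafe content in 'User' messages in
      conversations according to our safety policy with the below categories.
      ...safety policy and conversation placeholder elided...
      [/INST]
  - task: llama_guard_check_output
    content: |
      <s>[INST] Task: Check if there is unsafe content in 'Agent' messages in
      conversations according to our safety policy with the below categories.
      ...safety policy and conversation placeholder elided...
      [/INST]
```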