+
+ {label}
+ {pct}%
-
);
}
diff --git a/go-backend/internal/services/openrouter_service.go b/go-backend/internal/services/openrouter_service.go
index 8fc9d77..ee6d678 100644
--- a/go-backend/internal/services/openrouter_service.go
+++ b/go-backend/internal/services/openrouter_service.go
@@ -28,14 +28,14 @@ type OpenRouterService struct {
// OpenRouterModel represents a model available on OpenRouter
type OpenRouterModel struct {
- ID string `json:"id"`
- Name string `json:"name"`
- Description string `json:"description,omitempty"`
- Pricing OpenRouterPricing `json:"pricing"`
- ContextLength int `json:"context_length"`
- Architecture map[string]any `json:"architecture,omitempty"`
- TopProvider map[string]any `json:"top_provider,omitempty"`
- PerRequestLimits map[string]any `json:"per_request_limits,omitempty"`
+ ID string `json:"id"`
+ Name string `json:"name"`
+ Description string `json:"description,omitempty"`
+ Pricing OpenRouterPricing `json:"pricing"`
+ ContextLength int `json:"context_length"`
+ Architecture map[string]any `json:"architecture,omitempty"`
+ TopProvider map[string]any `json:"top_provider,omitempty"`
+ PerRequestLimits map[string]any `json:"per_request_limits,omitempty"`
}
type OpenRouterPricing struct {
@@ -47,14 +47,14 @@ type OpenRouterPricing struct {
// ModerationConfigEntry represents a row in ai_moderation_config
type ModerationConfigEntry struct {
- ID string `json:"id"`
- ModerationType string `json:"moderation_type"`
- ModelID string `json:"model_id"`
- ModelName string `json:"model_name"`
- SystemPrompt string `json:"system_prompt"`
- Enabled bool `json:"enabled"`
- UpdatedAt time.Time `json:"updated_at"`
- UpdatedBy *string `json:"updated_by,omitempty"`
+ ID string `json:"id"`
+ ModerationType string `json:"moderation_type"`
+ ModelID string `json:"model_id"`
+ ModelName string `json:"model_name"`
+ SystemPrompt string `json:"system_prompt"`
+ Enabled bool `json:"enabled"`
+ UpdatedAt time.Time `json:"updated_at"`
+ UpdatedBy *string `json:"updated_by,omitempty"`
}
// OpenRouterChatMessage represents a message in a chat completion request
@@ -216,12 +216,16 @@ func (s *OpenRouterService) ModerateVideo(ctx context.Context, frameURLs []strin
// ModerationResult is the parsed response from OpenRouter moderation
type ModerationResult struct {
- Flagged bool `json:"flagged"`
- Reason string `json:"reason"`
- Hate float64 `json:"hate"`
- Greed float64 `json:"greed"`
- Delusion float64 `json:"delusion"`
- RawContent string `json:"raw_content"`
+ Flagged bool `json:"flagged"`
+ Reason string `json:"reason"`
+ Explanation string `json:"explanation"`
+ Hate float64 `json:"hate"`
+ HateDetail string `json:"hate_detail"`
+ Greed float64 `json:"greed"`
+ GreedDetail string `json:"greed_detail"`
+ Delusion float64 `json:"delusion"`
+ DelusionDetail string `json:"delusion_detail"`
+ RawContent string `json:"raw_content"`
}
// callModel sends a chat completion request to OpenRouter
@@ -247,7 +251,7 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
}
for _, url := range imageURLs {
parts = append(parts, map[string]any{
- "type": "image_url",
+ "type": "image_url",
"image_url": map[string]string{"url": url},
})
}
@@ -320,18 +324,26 @@ func parseModerationResponse(raw string) *ModerationResult {
cleaned = strings.TrimSpace(cleaned)
var parsed struct {
- Flagged bool `json:"flagged"`
- Reason string `json:"reason"`
- Hate float64 `json:"hate"`
- Greed float64 `json:"greed"`
- Delusion float64 `json:"delusion"`
+ Flagged bool `json:"flagged"`
+ Reason string `json:"reason"`
+ Explanation string `json:"explanation"`
+ Hate float64 `json:"hate"`
+ HateDetail string `json:"hate_detail"`
+ Greed float64 `json:"greed"`
+ GreedDetail string `json:"greed_detail"`
+ Delusion float64 `json:"delusion"`
+ DelusionDetail string `json:"delusion_detail"`
}
if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
result.Flagged = parsed.Flagged
result.Reason = parsed.Reason
+ result.Explanation = parsed.Explanation
result.Hate = parsed.Hate
+ result.HateDetail = parsed.HateDetail
result.Greed = parsed.Greed
+ result.GreedDetail = parsed.GreedDetail
result.Delusion = parsed.Delusion
+ result.DelusionDetail = parsed.DelusionDetail
return result
}
@@ -345,22 +357,27 @@ func parseModerationResponse(raw string) *ModerationResult {
return result
}
-const defaultModerationSystemPrompt = `You are a content moderation AI for Sojorn, a social media platform.
+const defaultModerationSystemPrompt = `You are a content moderation AI for Sojorn, a social media platform.
Analyze the provided content for policy violations.
Respond ONLY with a JSON object in this exact format:
{
"flagged": true/false,
- "reason": "brief reason if flagged, empty string if not",
+ "reason": "one-line summary if flagged, empty string if clean",
+ "explanation": "Detailed paragraph explaining your full analysis. Describe exactly what you observed in the content, what specific elements you checked, why each category scored the way it did, and your overall reasoning for the flagged/clean decision.",
"hate": 0.0-1.0,
+ "hate_detail": "Explain exactly what you found (or didn't find) related to hate. E.g. 'No hate speech, slurs, threats, violence, sexual content, or discriminatory language detected.' or 'Contains racial slur targeting [group] in aggressive context.'",
"greed": 0.0-1.0,
- "delusion": 0.0-1.0
+ "greed_detail": "Explain exactly what you found (or didn't find) related to greed. E.g. 'No spam, scam language, or promotional manipulation detected.' or 'Contains crypto pump-and-dump language with fake earnings claims.'",
+ "delusion": 0.0-1.0,
+ "delusion_detail": "Explain exactly what you found (or didn't find) related to delusion. E.g. 'No misinformation, self-harm, or conspiracy content detected.' or 'Promotes unverified medical cure with dangerous dosage advice.'"
}
Scoring guide (Three Poisons framework):
-- hate: harassment, threats, violence, sexual content, hate speech, discrimination
-- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick
-- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice
+- hate: harassment, threats, violence, sexual content, nudity, hate speech, discrimination, graphic imagery
+- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick, MLM recruitment
+- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice, deepfakes
Score 0.0 = no concern, 1.0 = extreme violation. Flag if any score > 0.5.
+ALWAYS provide detailed explanations even when content is clean — explain what you checked and why it passed.
Only respond with the JSON, no other text.`