AI moderation: detailed explanations per category in test results

2026-02-06 20:06:23 -06:00 · 2026-02-06 20:06:23 -06:00 · d40baf9bee
parent 7c52a1a1ed
commit d40baf9bee
2 changed files with 81 additions and 50 deletions
--- a/admin/src/app/ai-moderation/page.tsx
+++ b/admin/src/app/ai-moderation/page.tsx
@@ -364,25 +364,37 @@ function ConfigEditor({ moderationType, config, onSaved }: {
        {!modelId && <p className="text-xs text-amber-600">Select and save a model first to test</p>}

        {testResult && (
-          <div className={`p-3 rounded-lg text-sm ${testResult.error ? 'bg-red-50 text-red-700' : testResult.flagged ? 'bg-red-50' : 'bg-green-50'}`}>
+          <div className={`p-4 rounded-lg text-sm ${testResult.error ? 'bg-red-50 text-red-700' : testResult.flagged ? 'bg-red-50' : 'bg-green-50'}`}>
            {testResult.error ? (
              <p>{testResult.error}</p>
            ) : (
-              <div>
-                <div className="flex items-center gap-2 mb-2">
-                  <span className={`font-bold ${testResult.flagged ? 'text-red-700' : 'text-green-700'}`}>
-                    {testResult.flagged ? 'FLAGGED' : 'CLEAN'}
+              <div className="space-y-3">
+                {/* Verdict */}
+                <div className="flex items-center gap-2">
+                  <span className={`text-lg font-bold ${testResult.flagged ? 'text-red-700' : 'text-green-700'}`}>
+                    {testResult.flagged ? '⛔ FLAGGED' : '✅ CLEAN'}
                  </span>
                  {testResult.reason && <span className="text-gray-600">— {testResult.reason}</span>}
                </div>
-                <div className="grid grid-cols-3 gap-2">
-                  <ScoreBar label="Hate" value={testResult.hate} />
-                  <ScoreBar label="Greed" value={testResult.greed} />
-                  <ScoreBar label="Delusion" value={testResult.delusion} />
+
+                {/* Overall Explanation */}
+                {testResult.explanation && (
+                  <div className="bg-white/60 rounded-lg p-3 border border-warm-200">
+                    <p className="text-xs font-semibold text-gray-500 uppercase mb-1">AI Analysis</p>
+                    <p className="text-sm text-gray-700 leading-relaxed">{testResult.explanation}</p>
+                  </div>
+                )}
+
+                {/* Score Bars with Detail */}
+                <div className="space-y-2">
+                  <ScoreBarDetailed label="Hate" value={testResult.hate} detail={testResult.hate_detail} />
+                  <ScoreBarDetailed label="Greed" value={testResult.greed} detail={testResult.greed_detail} />
+                  <ScoreBarDetailed label="Delusion" value={testResult.delusion} detail={testResult.delusion_detail} />
                </div>
+
                {testResult.raw_content && (
                  <details className="mt-2">
-                    <summary className="text-xs text-gray-400 cursor-pointer">Raw response</summary>
+                    <summary className="text-xs text-gray-400 cursor-pointer">Raw model response</summary>
                    <pre className="mt-1 text-xs bg-white p-2 rounded border border-warm-200 overflow-x-auto whitespace-pre-wrap">{testResult.raw_content}</pre>
                  </details>
                )}
@@ -395,18 +407,20 @@ function ConfigEditor({ moderationType, config, onSaved }: {
  );
 }

-function ScoreBar({ label, value }: { label: string; value: number }) {
+function ScoreBarDetailed({ label, value, detail }: { label: string; value: number; detail?: string }) {
  const pct = Math.round((value || 0) * 100);
  const color = pct > 50 ? 'bg-red-500' : pct > 25 ? 'bg-amber-400' : 'bg-green-400';
+  const textColor = pct > 50 ? 'text-red-700' : pct > 25 ? 'text-amber-700' : 'text-green-700';
  return (
-    <div>
-      <div className="flex justify-between text-xs mb-0.5">
-        <span className="text-gray-500">{label}</span>
-        <span className="font-mono text-gray-700">{pct}%</span>
+    <div className="bg-white/50 rounded-lg p-2.5 border border-warm-100">
+      <div className="flex justify-between text-xs mb-1">
+        <span className="font-semibold text-gray-700">{label}</span>
+        <span className={`font-mono font-bold ${textColor}`}>{pct}%</span>
      </div>
-      <div className="h-1.5 bg-gray-200 rounded-full overflow-hidden">
+      <div className="h-1.5 bg-gray-200 rounded-full overflow-hidden mb-1.5">
        <div className={`h-full ${color} rounded-full transition-all`} style={{ width: `${pct}%` }} />
      </div>
+      {detail && <p className="text-xs text-gray-500 leading-relaxed">{detail}</p>}
    </div>
  );
 }
--- a/go-backend/internal/services/openrouter_service.go
+++ b/go-backend/internal/services/openrouter_service.go
@@ -28,14 +28,14 @@ type OpenRouterService struct {

 // OpenRouterModel represents a model available on OpenRouter
 type OpenRouterModel struct {
-	ID            string            `json:"id"`
-	Name          string            `json:"name"`
-	Description   string            `json:"description,omitempty"`
-	Pricing       OpenRouterPricing `json:"pricing"`
-	ContextLength int               `json:"context_length"`
-	Architecture  map[string]any    `json:"architecture,omitempty"`
-	TopProvider   map[string]any    `json:"top_provider,omitempty"`
-	PerRequestLimits map[string]any `json:"per_request_limits,omitempty"`
+	ID               string            `json:"id"`
+	Name             string            `json:"name"`
+	Description      string            `json:"description,omitempty"`
+	Pricing          OpenRouterPricing `json:"pricing"`
+	ContextLength    int               `json:"context_length"`
+	Architecture     map[string]any    `json:"architecture,omitempty"`
+	TopProvider      map[string]any    `json:"top_provider,omitempty"`
+	PerRequestLimits map[string]any    `json:"per_request_limits,omitempty"`
 }

 type OpenRouterPricing struct {
@@ -47,14 +47,14 @@ type OpenRouterPricing struct {

 // ModerationConfigEntry represents a row in ai_moderation_config
 type ModerationConfigEntry struct {
-	ID              string    `json:"id"`
-	ModerationType  string    `json:"moderation_type"`
-	ModelID         string    `json:"model_id"`
-	ModelName       string    `json:"model_name"`
-	SystemPrompt    string    `json:"system_prompt"`
-	Enabled         bool      `json:"enabled"`
-	UpdatedAt       time.Time `json:"updated_at"`
-	UpdatedBy       *string   `json:"updated_by,omitempty"`
+	ID             string    `json:"id"`
+	ModerationType string    `json:"moderation_type"`
+	ModelID        string    `json:"model_id"`
+	ModelName      string    `json:"model_name"`
+	SystemPrompt   string    `json:"system_prompt"`
+	Enabled        bool      `json:"enabled"`
+	UpdatedAt      time.Time `json:"updated_at"`
+	UpdatedBy      *string   `json:"updated_by,omitempty"`
 }

 // OpenRouterChatMessage represents a message in a chat completion request
@@ -216,12 +216,16 @@ func (s *OpenRouterService) ModerateVideo(ctx context.Context, frameURLs []strin

 // ModerationResult is the parsed response from OpenRouter moderation
 type ModerationResult struct {
-	Flagged    bool    `json:"flagged"`
-	Reason     string  `json:"reason"`
-	Hate       float64 `json:"hate"`
-	Greed      float64 `json:"greed"`
-	Delusion   float64 `json:"delusion"`
-	RawContent string  `json:"raw_content"`
+	Flagged        bool    `json:"flagged"`
+	Reason         string  `json:"reason"`
+	Explanation    string  `json:"explanation"`
+	Hate           float64 `json:"hate"`
+	HateDetail     string  `json:"hate_detail"`
+	Greed          float64 `json:"greed"`
+	GreedDetail    string  `json:"greed_detail"`
+	Delusion       float64 `json:"delusion"`
+	DelusionDetail string  `json:"delusion_detail"`
+	RawContent     string  `json:"raw_content"`
 }

 // callModel sends a chat completion request to OpenRouter
@@ -247,7 +251,7 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
 		}
 		for _, url := range imageURLs {
 			parts = append(parts, map[string]any{
-				"type": "image_url",
+				"type":      "image_url",
 				"image_url": map[string]string{"url": url},
 			})
 		}
@@ -320,18 +324,26 @@ func parseModerationResponse(raw string) *ModerationResult {
 	cleaned = strings.TrimSpace(cleaned)

 	var parsed struct {
-		Flagged  bool    `json:"flagged"`
-		Reason   string  `json:"reason"`
-		Hate     float64 `json:"hate"`
-		Greed    float64 `json:"greed"`
-		Delusion float64 `json:"delusion"`
+		Flagged        bool    `json:"flagged"`
+		Reason         string  `json:"reason"`
+		Explanation    string  `json:"explanation"`
+		Hate           float64 `json:"hate"`
+		HateDetail     string  `json:"hate_detail"`
+		Greed          float64 `json:"greed"`
+		GreedDetail    string  `json:"greed_detail"`
+		Delusion       float64 `json:"delusion"`
+		DelusionDetail string  `json:"delusion_detail"`
 	}
 	if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
 		result.Flagged = parsed.Flagged
 		result.Reason = parsed.Reason
+		result.Explanation = parsed.Explanation
 		result.Hate = parsed.Hate
+		result.HateDetail = parsed.HateDetail
 		result.Greed = parsed.Greed
+		result.GreedDetail = parsed.GreedDetail
 		result.Delusion = parsed.Delusion
+		result.DelusionDetail = parsed.DelusionDetail
 		return result
 	}

@@ -351,16 +363,21 @@ Analyze the provided content for policy violations.
 Respond ONLY with a JSON object in this exact format:
 {
  "flagged": true/false,
-  "reason": "brief reason if flagged, empty string if not",
+  "reason": "one-line summary if flagged, empty string if clean",
+  "explanation": "Detailed paragraph explaining your full analysis. Describe exactly what you observed in the content, what specific elements you checked, why each category scored the way it did, and your overall reasoning for the flagged/clean decision.",
  "hate": 0.0-1.0,
+  "hate_detail": "Explain exactly what you found (or didn't find) related to hate. E.g. 'No hate speech, slurs, threats, violence, sexual content, or discriminatory language detected.' or 'Contains racial slur targeting [group] in aggressive context.'",
  "greed": 0.0-1.0,
-  "delusion": 0.0-1.0
+  "greed_detail": "Explain exactly what you found (or didn't find) related to greed. E.g. 'No spam, scam language, or promotional manipulation detected.' or 'Contains crypto pump-and-dump language with fake earnings claims.'",
+  "delusion": 0.0-1.0,
+  "delusion_detail": "Explain exactly what you found (or didn't find) related to delusion. E.g. 'No misinformation, self-harm, or conspiracy content detected.' or 'Promotes unverified medical cure with dangerous dosage advice.'"
 }

 Scoring guide (Three Poisons framework):
- hate: harassment, threats, violence, sexual content, hate speech, discrimination
- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick
- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice
+- hate: harassment, threats, violence, sexual content, nudity, hate speech, discrimination, graphic imagery
+- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick, MLM recruitment
+- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice, deepfakes

 Score 0.0 = no concern, 1.0 = extreme violation. Flag if any score > 0.5.
+ALWAYS provide detailed explanations even when content is clean — explain what you checked and why it passed.
 Only respond with the JSON, no other text.`