Fix AI moderation parser: robust JSON extraction + score-based flagging override

2026-02-06 20:19:36 -06:00 · 2026-02-06 20:19:36 -06:00 · e81e9e52b7
parent d40baf9bee
commit e81e9e52b7
1 changed file with 46 additions and 21 deletions
--- a/go-backend/internal/services/openrouter_service.go
+++ b/go-backend/internal/services/openrouter_service.go
@@ -307,21 +307,34 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
 func parseModerationResponse(raw string) *ModerationResult {
 	result := &ModerationResult{RawContent: raw}

-	// Try to parse JSON from the response
-	// Models may wrap JSON in markdown code blocks
+	// Strategy: try multiple ways to extract JSON from the response
+	candidates := []string{}
+
+	// 1. Strip markdown code fences
 	cleaned := raw
 	if idx := strings.Index(cleaned, "```json"); idx >= 0 {
 		cleaned = cleaned[idx+7:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	} else if idx := strings.Index(cleaned, "```"); idx >= 0 {
 		cleaned = cleaned[idx+3:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	}
-	cleaned = strings.TrimSpace(cleaned)
+
+	// 2. Find first '{' and last '}' in raw text (greedy JSON extraction)
+	if start := strings.Index(raw, "{"); start >= 0 {
+		if end := strings.LastIndex(raw, "}"); end > start {
+			candidates = append(candidates, raw[start:end+1])
+		}
+	}
+
+	// 3. Try the raw text as-is
+	candidates = append(candidates, strings.TrimSpace(raw))

 	var parsed struct {
 		Flagged        bool    `json:"flagged"`
@@ -334,7 +347,9 @@ func parseModerationResponse(raw string) *ModerationResult {
 		Delusion       float64 `json:"delusion"`
 		DelusionDetail string  `json:"delusion_detail"`
 	}
-	if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
+
+	for _, candidate := range candidates {
+		if err := json.Unmarshal([]byte(candidate), &parsed); err == nil {
 			result.Flagged = parsed.Flagged
 			result.Reason = parsed.Reason
 			result.Explanation = parsed.Explanation
@@ -344,16 +359,26 @@ func parseModerationResponse(raw string) *ModerationResult {
 			result.GreedDetail = parsed.GreedDetail
 			result.Delusion = parsed.Delusion
 			result.DelusionDetail = parsed.DelusionDetail
+
+			// Safety: re-derive flagged from scores — if any score > 0.5, it's flagged
+			// regardless of what the model put in the "flagged" field
+			scoreFlagged := parsed.Hate > 0.5 || parsed.Greed > 0.5 || parsed.Delusion > 0.5
+			if scoreFlagged != result.Flagged {
+				result.Flagged = scoreFlagged
+				if scoreFlagged && result.Reason == "" {
+					result.Reason = "Flagged: score exceeded 0.5 threshold"
+				}
+				if !scoreFlagged {
+					result.Reason = ""
+				}
+			}
+
 			return result
 		}
-
-	// Fallback: check for keywords in raw text
-	lower := strings.ToLower(raw)
-	if strings.Contains(lower, "violation") || strings.Contains(lower, "inappropriate") || strings.Contains(lower, "flagged") {
-		result.Flagged = true
-		result.Reason = "Content flagged by AI moderation"
 	}

+	// All parsing failed — mark as error so admin can see the raw output
+	result.Explanation = "Failed to parse model response as JSON. Check raw response below."
 	return result
 }