Fix AI moderation parser: robust JSON extraction + score-based flagging override
parent d40baf9bee
commit e81e9e52b7
@@ -307,21 +307,34 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
 func parseModerationResponse(raw string) *ModerationResult {
 	result := &ModerationResult{RawContent: raw}
 
-	// Try to parse JSON from the response
-	// Models may wrap JSON in markdown code blocks
+	// Strategy: try multiple ways to extract JSON from the response
+	candidates := []string{}
 
+	// 1. Strip markdown code fences
 	cleaned := raw
 	if idx := strings.Index(cleaned, "```json"); idx >= 0 {
 		cleaned = cleaned[idx+7:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	} else if idx := strings.Index(cleaned, "```"); idx >= 0 {
 		cleaned = cleaned[idx+3:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	}
-	cleaned = strings.TrimSpace(cleaned)
+
+	// 2. Find first '{' and last '}' in raw text (greedy JSON extraction)
+	if start := strings.Index(raw, "{"); start >= 0 {
+		if end := strings.LastIndex(raw, "}"); end > start {
+			candidates = append(candidates, raw[start:end+1])
+		}
+	}
+
+	// 3. Try the raw text as-is
+	candidates = append(candidates, strings.TrimSpace(raw))
+
 	var parsed struct {
 		Flagged bool `json:"flagged"`
@@ -334,7 +347,9 @@ func parseModerationResponse(raw string) *ModerationResult {
 		Delusion float64 `json:"delusion"`
 		DelusionDetail string `json:"delusion_detail"`
 	}
-	if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
+
+	for _, candidate := range candidates {
+		if err := json.Unmarshal([]byte(candidate), &parsed); err == nil {
 		result.Flagged = parsed.Flagged
 		result.Reason = parsed.Reason
 		result.Explanation = parsed.Explanation
@@ -344,16 +359,26 @@ func parseModerationResponse(raw string) *ModerationResult {
 		result.GreedDetail = parsed.GreedDetail
 		result.Delusion = parsed.Delusion
 		result.DelusionDetail = parsed.DelusionDetail
 
+		// Safety: re-derive flagged from scores — if any score > 0.5, it's flagged
+		// regardless of what the model put in the "flagged" field
+		scoreFlagged := parsed.Hate > 0.5 || parsed.Greed > 0.5 || parsed.Delusion > 0.5
+		if scoreFlagged != result.Flagged {
+			result.Flagged = scoreFlagged
+			if scoreFlagged && result.Reason == "" {
+				result.Reason = "Flagged: score exceeded 0.5 threshold"
+			}
+			if !scoreFlagged {
+				result.Reason = ""
+			}
+		}
+
 		return result
 	}
 
-	// Fallback: check for keywords in raw text
-	lower := strings.ToLower(raw)
-	if strings.Contains(lower, "violation") || strings.Contains(lower, "inappropriate") || strings.Contains(lower, "flagged") {
-		result.Flagged = true
-		result.Reason = "Content flagged by AI moderation"
 	}
 
+	// All parsing failed — mark as error so admin can see the raw output
+	result.Explanation = "Failed to parse model response as JSON. Check raw response below."
 	return result
 }
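As a quick illustration of the new behaviour, here is a rough test sketch for parseModerationResponse. It is not part of this commit: the package name is assumed, and it relies only on the struct fields and JSON tags visible in the diff ("flagged", "delusion").

// Illustrative only: a minimal sketch, assuming it sits in the same
// (hypothetical) package as parseModerationResponse.
package moderation

import "testing"

func TestParseModerationResponseSketch(t *testing.T) {
	// 1. Fenced JSON whose "flagged" field contradicts its scores: the
	// score-based override should flip it to flagged and set a reason.
	fenced := "```json\n{\"flagged\": false, \"delusion\": 0.9}\n```"
	if res := parseModerationResponse(fenced); !res.Flagged || res.Reason == "" {
		t.Errorf("expected score override to flag fenced response, got %+v", res)
	}

	// 2. JSON embedded in prose: picked up by the greedy '{'..'}' extraction.
	prose := "Sure, here is my assessment: {\"flagged\": true, \"delusion\": 0.7} Let me know."
	if res := parseModerationResponse(prose); !res.Flagged {
		t.Errorf("expected embedded JSON to be parsed, got %+v", res)
	}

	// 3. No JSON at all: parsing fails, content stays unflagged and the
	// explanation records the parse failure.
	if res := parseModerationResponse("I cannot assess this content."); res.Flagged || res.Explanation == "" {
		t.Errorf("expected unparseable output to stay unflagged with an explanation, got %+v", res)
	}
}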