From e81e9e52b71389d1df4c4fcf4bd064c3afbd10ac Mon Sep 17 00:00:00 2001 From: Patrick Britton Date: Fri, 6 Feb 2026 20:19:36 -0600 Subject: [PATCH] Fix AI moderation parser: robust JSON extraction + score-based flagging override --- .../internal/services/openrouter_service.go | 67 +++++++++++++------ 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/go-backend/internal/services/openrouter_service.go b/go-backend/internal/services/openrouter_service.go index ee6d678..c430b3a 100644 --- a/go-backend/internal/services/openrouter_service.go +++ b/go-backend/internal/services/openrouter_service.go @@ -307,21 +307,34 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt func parseModerationResponse(raw string) *ModerationResult { result := &ModerationResult{RawContent: raw} - // Try to parse JSON from the response - // Models may wrap JSON in markdown code blocks + // Strategy: try multiple ways to extract JSON from the response + candidates := []string{} + + // 1. Strip markdown code fences cleaned := raw if idx := strings.Index(cleaned, "```json"); idx >= 0 { cleaned = cleaned[idx+7:] if end := strings.Index(cleaned, "```"); end >= 0 { cleaned = cleaned[:end] } + candidates = append(candidates, strings.TrimSpace(cleaned)) } else if idx := strings.Index(cleaned, "```"); idx >= 0 { cleaned = cleaned[idx+3:] if end := strings.Index(cleaned, "```"); end >= 0 { cleaned = cleaned[:end] } + candidates = append(candidates, strings.TrimSpace(cleaned)) } - cleaned = strings.TrimSpace(cleaned) + + // 2. Find first '{' and last '}' in raw text (greedy JSON extraction) + if start := strings.Index(raw, "{"); start >= 0 { + if end := strings.LastIndex(raw, "}"); end > start { + candidates = append(candidates, raw[start:end+1]) + } + } + + // 3. Try the raw text as-is + candidates = append(candidates, strings.TrimSpace(raw)) var parsed struct { Flagged bool `json:"flagged"` @@ -334,26 +347,38 @@ func parseModerationResponse(raw string) *ModerationResult { Delusion float64 `json:"delusion"` DelusionDetail string `json:"delusion_detail"` } - if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil { - result.Flagged = parsed.Flagged - result.Reason = parsed.Reason - result.Explanation = parsed.Explanation - result.Hate = parsed.Hate - result.HateDetail = parsed.HateDetail - result.Greed = parsed.Greed - result.GreedDetail = parsed.GreedDetail - result.Delusion = parsed.Delusion - result.DelusionDetail = parsed.DelusionDetail - return result - } - - // Fallback: check for keywords in raw text - lower := strings.ToLower(raw) - if strings.Contains(lower, "violation") || strings.Contains(lower, "inappropriate") || strings.Contains(lower, "flagged") { - result.Flagged = true - result.Reason = "Content flagged by AI moderation" + + for _, candidate := range candidates { + if err := json.Unmarshal([]byte(candidate), &parsed); err == nil { + result.Flagged = parsed.Flagged + result.Reason = parsed.Reason + result.Explanation = parsed.Explanation + result.Hate = parsed.Hate + result.HateDetail = parsed.HateDetail + result.Greed = parsed.Greed + result.GreedDetail = parsed.GreedDetail + result.Delusion = parsed.Delusion + result.DelusionDetail = parsed.DelusionDetail + + // Safety: re-derive flagged from scores — if any score > 0.5, it's flagged + // regardless of what the model put in the "flagged" field + scoreFlagged := parsed.Hate > 0.5 || parsed.Greed > 0.5 || parsed.Delusion > 0.5 + if scoreFlagged != result.Flagged { + result.Flagged = scoreFlagged + if scoreFlagged && result.Reason == "" { + result.Reason = "Flagged: score exceeded 0.5 threshold" + } + if !scoreFlagged { + result.Reason = "" + } + } + + return result + } } + // All parsing failed — mark as error so admin can see the raw output + result.Explanation = "Failed to parse model response as JSON. Check raw response below." return result }