Fix AI moderation parser: robust JSON extraction + score-based flagging override

This commit is contained in:
Patrick Britton 2026-02-06 20:19:36 -06:00
parent d40baf9bee
commit e81e9e52b7

View file

@ -307,21 +307,34 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
func parseModerationResponse(raw string) *ModerationResult {
result := &ModerationResult{RawContent: raw}
// Try to parse JSON from the response
// Models may wrap JSON in markdown code blocks
// Strategy: try multiple ways to extract JSON from the response
candidates := []string{}
// 1. Strip markdown code fences
cleaned := raw
if idx := strings.Index(cleaned, "```json"); idx >= 0 {
cleaned = cleaned[idx+7:]
if end := strings.Index(cleaned, "```"); end >= 0 {
cleaned = cleaned[:end]
}
candidates = append(candidates, strings.TrimSpace(cleaned))
} else if idx := strings.Index(cleaned, "```"); idx >= 0 {
cleaned = cleaned[idx+3:]
if end := strings.Index(cleaned, "```"); end >= 0 {
cleaned = cleaned[:end]
}
candidates = append(candidates, strings.TrimSpace(cleaned))
}
cleaned = strings.TrimSpace(cleaned)
// 2. Find first '{' and last '}' in raw text (greedy JSON extraction)
if start := strings.Index(raw, "{"); start >= 0 {
if end := strings.LastIndex(raw, "}"); end > start {
candidates = append(candidates, raw[start:end+1])
}
}
// 3. Try the raw text as-is
candidates = append(candidates, strings.TrimSpace(raw))
var parsed struct {
Flagged bool `json:"flagged"`
@ -334,7 +347,9 @@ func parseModerationResponse(raw string) *ModerationResult {
Delusion float64 `json:"delusion"`
DelusionDetail string `json:"delusion_detail"`
}
if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
for _, candidate := range candidates {
if err := json.Unmarshal([]byte(candidate), &parsed); err == nil {
result.Flagged = parsed.Flagged
result.Reason = parsed.Reason
result.Explanation = parsed.Explanation
@ -344,16 +359,26 @@ func parseModerationResponse(raw string) *ModerationResult {
result.GreedDetail = parsed.GreedDetail
result.Delusion = parsed.Delusion
result.DelusionDetail = parsed.DelusionDetail
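// Safety: re-derive flagged from the scores instead of trusting the model's
// "flagged" field. Any score > 0.5 forces flagged=true; conversely, if the
// model set flagged but no score exceeds 0.5, the flag and its reason are cleared.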
scoreFlagged := parsed.Hate > 0.5 || parsed.Greed > 0.5 || parsed.Delusion > 0.5
if scoreFlagged != result.Flagged {
result.Flagged = scoreFlagged
if scoreFlagged && result.Reason == "" {
result.Reason = "Flagged: score exceeded 0.5 threshold"
}
if !scoreFlagged {
result.Reason = ""
}
}
return result
}
// Fallback: check for keywords in raw text
lower := strings.ToLower(raw)
if strings.Contains(lower, "violation") || strings.Contains(lower, "inappropriate") || strings.Contains(lower, "flagged") {
result.Flagged = true
result.Reason = "Content flagged by AI moderation"
}
// All parsing failed — mark as error so admin can see the raw output
result.Explanation = "Failed to parse model response as JSON. Check raw response below."
return result
}