Fix AI moderation parser: robust JSON extraction + score-based flagging override
parent d40baf9bee
commit e81e9e52b7
@@ -307,21 +307,34 @@ func (s *OpenRouterService) callModel(ctx context.Context, modelID, systemPrompt
 func parseModerationResponse(raw string) *ModerationResult {
 	result := &ModerationResult{RawContent: raw}
 
-	// Try to parse JSON from the response
-	// Models may wrap JSON in markdown code blocks
+	// Strategy: try multiple ways to extract JSON from the response
+	candidates := []string{}
 
+	// 1. Strip markdown code fences
 	cleaned := raw
 	if idx := strings.Index(cleaned, "```json"); idx >= 0 {
 		cleaned = cleaned[idx+7:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	} else if idx := strings.Index(cleaned, "```"); idx >= 0 {
 		cleaned = cleaned[idx+3:]
 		if end := strings.Index(cleaned, "```"); end >= 0 {
 			cleaned = cleaned[:end]
 		}
+		candidates = append(candidates, strings.TrimSpace(cleaned))
 	}
-	cleaned = strings.TrimSpace(cleaned)
+
+	// 2. Find first '{' and last '}' in raw text (greedy JSON extraction)
+	if start := strings.Index(raw, "{"); start >= 0 {
+		if end := strings.LastIndex(raw, "}"); end > start {
+			candidates = append(candidates, raw[start:end+1])
+		}
+	}
+
+	// 3. Try the raw text as-is
+	candidates = append(candidates, strings.TrimSpace(raw))
+
 	var parsed struct {
 		Flagged bool `json:"flagged"`
@@ -334,7 +347,9 @@ func parseModerationResponse(raw string) *ModerationResult {
 		Delusion float64 `json:"delusion"`
 		DelusionDetail string `json:"delusion_detail"`
 	}
-	if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
+
+	for _, candidate := range candidates {
+		if err := json.Unmarshal([]byte(candidate), &parsed); err == nil {
 		result.Flagged = parsed.Flagged
 		result.Reason = parsed.Reason
 		result.Explanation = parsed.Explanation
@@ -344,16 +359,26 @@ func parseModerationResponse(raw string) *ModerationResult {
 		result.GreedDetail = parsed.GreedDetail
 		result.Delusion = parsed.Delusion
 		result.DelusionDetail = parsed.DelusionDetail
 
+		// Safety: re-derive flagged from scores — if any score > 0.5, it's flagged
+		// regardless of what the model put in the "flagged" field
+		scoreFlagged := parsed.Hate > 0.5 || parsed.Greed > 0.5 || parsed.Delusion > 0.5
+		if scoreFlagged != result.Flagged {
+			result.Flagged = scoreFlagged
+			if scoreFlagged && result.Reason == "" {
+				result.Reason = "Flagged: score exceeded 0.5 threshold"
+			}
+			if !scoreFlagged {
+				result.Reason = ""
+			}
+		}
+
 		return result
 	}
 
-	// Fallback: check for keywords in raw text
-	lower := strings.ToLower(raw)
-	if strings.Contains(lower, "violation") || strings.Contains(lower, "inappropriate") || strings.Contains(lower, "flagged") {
-		result.Flagged = true
-		result.Reason = "Content flagged by AI moderation"
 	}
 
+	// All parsing failed — mark as error so admin can see the raw output
+	result.Explanation = "Failed to parse model response as JSON. Check raw response below."
 	return result
 }
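As a quick illustration of the new behaviour, here is a rough test sketch for parseModerationResponse. It is not part of this commit: the package name is assumed, and it relies only on the struct fields and JSON tags visible in the diff ("flagged", "delusion").

// Illustrative only: a minimal sketch, assuming it sits in the same
// (hypothetical) package as parseModerationResponse.
package moderation

import "testing"

func TestParseModerationResponseSketch(t *testing.T) {
	// 1. Fenced JSON whose "flagged" field contradicts its scores: the
	// score-based override should flip it to flagged and set a reason.
	fenced := "```json\n{\"flagged\": false, \"delusion\": 0.9}\n```"
	if res := parseModerationResponse(fenced); !res.Flagged || res.Reason == "" {
		t.Errorf("expected score override to flag fenced response, got %+v", res)
	}

	// 2. JSON embedded in prose: picked up by the greedy '{'..'}' extraction.
	prose := "Sure, here is my assessment: {\"flagged\": true, \"delusion\": 0.7} Let me know."
	if res := parseModerationResponse(prose); !res.Flagged {
		t.Errorf("expected embedded JSON to be parsed, got %+v", res)
	}

	// 3. No JSON at all: parsing fails, content stays unflagged and the
	// explanation records the parse failure.
	if res := parseModerationResponse("I cannot assess this content."); res.Flagged || res.Explanation == "" {
		t.Errorf("expected unparseable output to stay unflagged with an explanation, got %+v", res)
	}
}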