From 66fe4bd60ef81691203ca54d90b3b23259f8246d Mon Sep 17 00:00:00 2001 From: Patrick Britton Date: Fri, 6 Feb 2026 11:12:00 -0600 Subject: [PATCH] Fix OpenAI Moderation API: correct response parsing, use omni-moderation-latest model --- .../internal/services/moderation_service.go | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/go-backend/internal/services/moderation_service.go b/go-backend/internal/services/moderation_service.go index 788d1c7..5016251 100644 --- a/go-backend/internal/services/moderation_service.go +++ b/go-backend/internal/services/moderation_service.go @@ -42,26 +42,30 @@ type ThreePoisonsScore struct { type OpenAIModerationResponse struct { Results []struct { Categories struct { - Hate float64 `json:"hate"` - HateThreatening float64 `json:"hate/threatening"` - SelfHarm float64 `json:"self-harm"` - SelfHarmIntent float64 `json:"self-harm/intent"` - SelfHarmInstructions float64 `json:"self-harm/instructions"` - Sexual float64 `json:"sexual"` - SexualMinors float64 `json:"sexual/minors"` - Violence float64 `json:"violence"` - ViolenceGraphic float64 `json:"violence/graphic"` + Hate bool `json:"hate"` + HateThreatening bool `json:"hate/threatening"` + Harassment bool `json:"harassment"` + HarassmentThreatening bool `json:"harassment/threatening"` + SelfHarm bool `json:"self-harm"` + SelfHarmIntent bool `json:"self-harm/intent"` + SelfHarmInstructions bool `json:"self-harm/instructions"` + Sexual bool `json:"sexual"` + SexualMinors bool `json:"sexual/minors"` + Violence bool `json:"violence"` + ViolenceGraphic bool `json:"violence/graphic"` } `json:"categories"` CategoryScores struct { - Hate float64 `json:"hate"` - HateThreatening float64 `json:"hate/threatening"` - SelfHarm float64 `json:"self-harm"` - SelfHarmIntent float64 `json:"self-harm/intent"` - SelfHarmInstructions float64 `json:"self-harm/instructions"` - Sexual float64 `json:"sexual"` - SexualMinors float64 `json:"sexual/minors"` - Violence float64 `json:"violence"` - ViolenceGraphic float64 `json:"violence/graphic"` + Hate float64 `json:"hate"` + HateThreatening float64 `json:"hate/threatening"` + Harassment float64 `json:"harassment"` + HarassmentThreatening float64 `json:"harassment/threatening"` + SelfHarm float64 `json:"self-harm"` + SelfHarmIntent float64 `json:"self-harm/intent"` + SelfHarmInstructions float64 `json:"self-harm/instructions"` + Sexual float64 `json:"sexual"` + SexualMinors float64 `json:"sexual/minors"` + Violence float64 `json:"violence"` + ViolenceGraphic float64 `json:"violence/graphic"` } `json:"category_scores"` Flagged bool `json:"flagged"` } `json:"results"` @@ -150,7 +154,7 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin requestBody := map[string]interface{}{ "input": content, - "model": "text-moderation-latest", + "model": "omni-moderation-latest", } jsonBody, err := json.Marshal(requestBody) @@ -187,22 +191,28 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin } result := moderationResp.Results[0] + scores := result.CategoryScores score := &ThreePoisonsScore{ - // Map OpenAI categories to Three Poisons + // Map OpenAI category scores to Three Poisons Hate: max( - result.Categories.Hate, - result.Categories.HateThreatening, - result.Categories.Violence, - result.Categories.ViolenceGraphic, + scores.Hate, + scores.HateThreatening, + scores.Harassment, + scores.HarassmentThreatening, + scores.Violence, + scores.ViolenceGraphic, + scores.Sexual, + scores.SexualMinors, ), - Greed: 0, // OpenAI doesn't detect greed/spam well + Greed: 0, // OpenAI doesn't detect greed/spam — handled by keyword fallback Delusion: max( - result.Categories.SelfHarm, - result.Categories.SelfHarmIntent, - result.Categories.SelfHarmInstructions, + scores.SelfHarm, + scores.SelfHarmIntent, + scores.SelfHarmInstructions, ), } + fmt.Printf("OpenAI moderation: flagged=%v hate=%.3f greed=%.3f delusion=%.3f\n", result.Flagged, score.Hate, score.Greed, score.Delusion) return score, nil }