Fix OpenAI Moderation API: correct response parsing, use omni-moderation-latest model

2026-02-06 11:12:00 -06:00 · 2026-02-06 11:12:00 -06:00 · 66fe4bd60e
parent ec5a0aad8b
commit 66fe4bd60e
1 changed file with 38 additions and 28 deletions
--- a/go-backend/internal/services/moderation_service.go
+++ b/go-backend/internal/services/moderation_service.go
@@ -42,19 +42,23 @@ type ThreePoisonsScore struct {
 type OpenAIModerationResponse struct {
 	Results []struct {
 		Categories struct {
-			Hate                 float64 `json:"hate"`
-			HateThreatening      float64 `json:"hate/threatening"`
-			SelfHarm             float64 `json:"self-harm"`
-			SelfHarmIntent       float64 `json:"self-harm/intent"`
-			SelfHarmInstructions float64 `json:"self-harm/instructions"`
-			Sexual               float64 `json:"sexual"`
-			SexualMinors         float64 `json:"sexual/minors"`
-			Violence             float64 `json:"violence"`
-			ViolenceGraphic      float64 `json:"violence/graphic"`
+			Hate                  bool `json:"hate"`
+			HateThreatening       bool `json:"hate/threatening"`
+			Harassment            bool `json:"harassment"`
+			HarassmentThreatening bool `json:"harassment/threatening"`
+			SelfHarm              bool `json:"self-harm"`
+			SelfHarmIntent        bool `json:"self-harm/intent"`
+			SelfHarmInstructions  bool `json:"self-harm/instructions"`
+			Sexual                bool `json:"sexual"`
+			SexualMinors          bool `json:"sexual/minors"`
+			Violence              bool `json:"violence"`
+			ViolenceGraphic       bool `json:"violence/graphic"`
 		} `json:"categories"`
 		CategoryScores struct {
 			Hate                  float64 `json:"hate"`
 			HateThreatening       float64 `json:"hate/threatening"`
+			Harassment            float64 `json:"harassment"`
+			HarassmentThreatening float64 `json:"harassment/threatening"`
 			SelfHarm              float64 `json:"self-harm"`
 			SelfHarmIntent        float64 `json:"self-harm/intent"`
 			SelfHarmInstructions  float64 `json:"self-harm/instructions"`
@@ -150,7 +154,7 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin

 	requestBody := map[string]interface{}{
 		"input": content,
-		"model": "text-moderation-latest",
+		"model": "omni-moderation-latest",
 	}

 	jsonBody, err := json.Marshal(requestBody)
@@ -187,22 +191,28 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin
 	}

 	result := moderationResp.Results[0]
+	scores := result.CategoryScores
 	score := &ThreePoisonsScore{
-		// Map OpenAI categories to Three Poisons
+		// Map OpenAI category scores to Three Poisons
 		Hate: max(
-			result.Categories.Hate,
-			result.Categories.HateThreatening,
-			result.Categories.Violence,
-			result.Categories.ViolenceGraphic,
+			scores.Hate,
+			scores.HateThreatening,
+			scores.Harassment,
+			scores.HarassmentThreatening,
+			scores.Violence,
+			scores.ViolenceGraphic,
+			scores.Sexual,
+			scores.SexualMinors,
 		),
-		Greed: 0, // OpenAI doesn't detect greed/spam well
+		Greed: 0, // OpenAI doesn't detect greed/spam — handled by keyword fallback
 		Delusion: max(
-			result.Categories.SelfHarm,
-			result.Categories.SelfHarmIntent,
-			result.Categories.SelfHarmInstructions,
+			scores.SelfHarm,
+			scores.SelfHarmIntent,
+			scores.SelfHarmInstructions,
 		),
 	}

+	fmt.Printf("OpenAI moderation: flagged=%v hate=%.3f greed=%.3f delusion=%.3f\n", result.Flagged, score.Hate, score.Greed, score.Delusion)
 	return score, nil
 }