Fix OpenAI Moderation API: correct response parsing, use omni-moderation-latest model

This commit is contained in:
Patrick Britton 2026-02-06 11:12:00 -06:00
parent ec5a0aad8b
commit 66fe4bd60e

View file

@@ -42,19 +42,23 @@ type ThreePoisonsScore struct {
 type OpenAIModerationResponse struct {
 	Results []struct {
 		Categories struct {
-			Hate                 float64 `json:"hate"`
-			HateThreatening      float64 `json:"hate/threatening"`
-			SelfHarm             float64 `json:"self-harm"`
-			SelfHarmIntent       float64 `json:"self-harm/intent"`
-			SelfHarmInstructions float64 `json:"self-harm/instructions"`
-			Sexual               float64 `json:"sexual"`
-			SexualMinors         float64 `json:"sexual/minors"`
-			Violence             float64 `json:"violence"`
-			ViolenceGraphic      float64 `json:"violence/graphic"`
+			Hate                  bool `json:"hate"`
+			HateThreatening       bool `json:"hate/threatening"`
+			Harassment            bool `json:"harassment"`
+			HarassmentThreatening bool `json:"harassment/threatening"`
+			SelfHarm              bool `json:"self-harm"`
+			SelfHarmIntent        bool `json:"self-harm/intent"`
+			SelfHarmInstructions  bool `json:"self-harm/instructions"`
+			Sexual                bool `json:"sexual"`
+			SexualMinors          bool `json:"sexual/minors"`
+			Violence              bool `json:"violence"`
+			ViolenceGraphic       bool `json:"violence/graphic"`
 		} `json:"categories"`
 		CategoryScores struct {
 			Hate                  float64 `json:"hate"`
 			HateThreatening       float64 `json:"hate/threatening"`
+			Harassment            float64 `json:"harassment"`
+			HarassmentThreatening float64 `json:"harassment/threatening"`
 			SelfHarm              float64 `json:"self-harm"`
 			SelfHarmIntent        float64 `json:"self-harm/intent"`
 			SelfHarmInstructions  float64 `json:"self-harm/instructions"`
@@ -150,7 +154,7 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin
 	requestBody := map[string]interface{}{
 		"input": content,
-		"model": "text-moderation-latest",
+		"model": "omni-moderation-latest",
 	}

 	jsonBody, err := json.Marshal(requestBody)
@@ -187,22 +191,28 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin
 	}

 	result := moderationResp.Results[0]
+	scores := result.CategoryScores

 	score := &ThreePoisonsScore{
-		// Map OpenAI categories to Three Poisons
+		// Map OpenAI category scores to Three Poisons
 		Hate: max(
-			result.Categories.Hate,
-			result.Categories.HateThreatening,
-			result.Categories.Violence,
-			result.Categories.ViolenceGraphic,
+			scores.Hate,
+			scores.HateThreatening,
+			scores.Harassment,
+			scores.HarassmentThreatening,
+			scores.Violence,
+			scores.ViolenceGraphic,
+			scores.Sexual,
+			scores.SexualMinors,
 		),
-		Greed: 0, // OpenAI doesn't detect greed/spam well
+		Greed: 0, // OpenAI doesn't detect greed/spam — handled by keyword fallback
 		Delusion: max(
-			result.Categories.SelfHarm,
-			result.Categories.SelfHarmIntent,
-			result.Categories.SelfHarmInstructions,
+			scores.SelfHarm,
+			scores.SelfHarmIntent,
+			scores.SelfHarmInstructions,
 		),
 	}

+	fmt.Printf("OpenAI moderation: flagged=%v hate=%.3f greed=%.3f delusion=%.3f\n", result.Flagged, score.Hate, score.Greed, score.Delusion)
+
 	return score, nil
 }