Fix OpenAI Moderation API: parse Categories as boolean flags and read scores from CategoryScores; use omni-moderation-latest model

This commit is contained in:
Patrick Britton 2026-02-06 11:12:00 -06:00
parent ec5a0aad8b
commit 66fe4bd60e

View file

@ -42,19 +42,23 @@ type ThreePoisonsScore struct {
type OpenAIModerationResponse struct {
Results []struct {
Categories struct {
Hate float64 `json:"hate"`
HateThreatening float64 `json:"hate/threatening"`
SelfHarm float64 `json:"self-harm"`
SelfHarmIntent float64 `json:"self-harm/intent"`
SelfHarmInstructions float64 `json:"self-harm/instructions"`
Sexual float64 `json:"sexual"`
SexualMinors float64 `json:"sexual/minors"`
Violence float64 `json:"violence"`
ViolenceGraphic float64 `json:"violence/graphic"`
Hate bool `json:"hate"`
HateThreatening bool `json:"hate/threatening"`
Harassment bool `json:"harassment"`
HarassmentThreatening bool `json:"harassment/threatening"`
SelfHarm bool `json:"self-harm"`
SelfHarmIntent bool `json:"self-harm/intent"`
SelfHarmInstructions bool `json:"self-harm/instructions"`
Sexual bool `json:"sexual"`
SexualMinors bool `json:"sexual/minors"`
Violence bool `json:"violence"`
ViolenceGraphic bool `json:"violence/graphic"`
} `json:"categories"`
CategoryScores struct {
Hate float64 `json:"hate"`
HateThreatening float64 `json:"hate/threatening"`
Harassment float64 `json:"harassment"`
HarassmentThreatening float64 `json:"harassment/threatening"`
SelfHarm float64 `json:"self-harm"`
SelfHarmIntent float64 `json:"self-harm/intent"`
SelfHarmInstructions float64 `json:"self-harm/instructions"`
@ -150,7 +154,7 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin
requestBody := map[string]interface{}{
"input": content,
"model": "text-moderation-latest",
"model": "omni-moderation-latest",
}
jsonBody, err := json.Marshal(requestBody)
@ -187,22 +191,28 @@ func (s *ModerationService) analyzeWithOpenAI(ctx context.Context, content strin
}
result := moderationResp.Results[0]
scores := result.CategoryScores
score := &ThreePoisonsScore{
// Map OpenAI categories to Three Poisons
// Map OpenAI category scores to Three Poisons
Hate: max(
result.Categories.Hate,
result.Categories.HateThreatening,
result.Categories.Violence,
result.Categories.ViolenceGraphic,
scores.Hate,
scores.HateThreatening,
scores.Harassment,
scores.HarassmentThreatening,
scores.Violence,
scores.ViolenceGraphic,
scores.Sexual,
scores.SexualMinors,
),
Greed: 0, // OpenAI doesn't detect greed/spam well
Greed: 0, // OpenAI doesn't detect greed/spam — handled by keyword fallback
Delusion: max(
result.Categories.SelfHarm,
result.Categories.SelfHarmIntent,
result.Categories.SelfHarmInstructions,
scores.SelfHarm,
scores.SelfHarmIntent,
scores.SelfHarmInstructions,
),
}
fmt.Printf("OpenAI moderation: flagged=%v hate=%.3f greed=%.3f delusion=%.3f\n", result.Flagged, score.Hate, score.Greed, score.Delusion)
return score, nil
}