AI moderation: detailed explanations per category in test results
This commit is contained in:
parent
7c52a1a1ed
commit
d40baf9bee
|
|
@@ -364,25 +364,37 @@ function ConfigEditor({ moderationType, config, onSaved }: {
|
||||||
{!modelId && <p className="text-xs text-amber-600">Select and save a model first to test</p>}
|
{!modelId && <p className="text-xs text-amber-600">Select and save a model first to test</p>}
|
||||||
|
|
||||||
{testResult && (
|
{testResult && (
|
||||||
<div className={`p-3 rounded-lg text-sm ${testResult.error ? 'bg-red-50 text-red-700' : testResult.flagged ? 'bg-red-50' : 'bg-green-50'}`}>
|
<div className={`p-4 rounded-lg text-sm ${testResult.error ? 'bg-red-50 text-red-700' : testResult.flagged ? 'bg-red-50' : 'bg-green-50'}`}>
|
||||||
{testResult.error ? (
|
{testResult.error ? (
|
||||||
<p>{testResult.error}</p>
|
<p>{testResult.error}</p>
|
||||||
) : (
|
) : (
|
||||||
<div>
|
<div className="space-y-3">
|
||||||
<div className="flex items-center gap-2 mb-2">
|
{/* Verdict */}
|
||||||
<span className={`font-bold ${testResult.flagged ? 'text-red-700' : 'text-green-700'}`}>
|
<div className="flex items-center gap-2">
|
||||||
{testResult.flagged ? 'FLAGGED' : 'CLEAN'}
|
<span className={`text-lg font-bold ${testResult.flagged ? 'text-red-700' : 'text-green-700'}`}>
|
||||||
|
{testResult.flagged ? '⛔ FLAGGED' : '✅ CLEAN'}
|
||||||
</span>
|
</span>
|
||||||
{testResult.reason && <span className="text-gray-600">— {testResult.reason}</span>}
|
{testResult.reason && <span className="text-gray-600">— {testResult.reason}</span>}
|
||||||
</div>
|
</div>
|
||||||
<div className="grid grid-cols-3 gap-2">
|
|
||||||
<ScoreBar label="Hate" value={testResult.hate} />
|
{/* Overall Explanation */}
|
||||||
<ScoreBar label="Greed" value={testResult.greed} />
|
{testResult.explanation && (
|
||||||
<ScoreBar label="Delusion" value={testResult.delusion} />
|
<div className="bg-white/60 rounded-lg p-3 border border-warm-200">
|
||||||
|
<p className="text-xs font-semibold text-gray-500 uppercase mb-1">AI Analysis</p>
|
||||||
|
<p className="text-sm text-gray-700 leading-relaxed">{testResult.explanation}</p>
|
||||||
</div>
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Score Bars with Detail */}
|
||||||
|
<div className="space-y-2">
|
||||||
|
<ScoreBarDetailed label="Hate" value={testResult.hate} detail={testResult.hate_detail} />
|
||||||
|
<ScoreBarDetailed label="Greed" value={testResult.greed} detail={testResult.greed_detail} />
|
||||||
|
<ScoreBarDetailed label="Delusion" value={testResult.delusion} detail={testResult.delusion_detail} />
|
||||||
|
</div>
|
||||||
|
|
||||||
{testResult.raw_content && (
|
{testResult.raw_content && (
|
||||||
<details className="mt-2">
|
<details className="mt-2">
|
||||||
<summary className="text-xs text-gray-400 cursor-pointer">Raw response</summary>
|
<summary className="text-xs text-gray-400 cursor-pointer">Raw model response</summary>
|
||||||
<pre className="mt-1 text-xs bg-white p-2 rounded border border-warm-200 overflow-x-auto whitespace-pre-wrap">{testResult.raw_content}</pre>
|
<pre className="mt-1 text-xs bg-white p-2 rounded border border-warm-200 overflow-x-auto whitespace-pre-wrap">{testResult.raw_content}</pre>
|
||||||
</details>
|
</details>
|
||||||
)}
|
)}
|
||||||
|
|
@@ -395,18 +407,20 @@ function ConfigEditor({ moderationType, config, onSaved }: {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function ScoreBar({ label, value }: { label: string; value: number }) {
|
function ScoreBarDetailed({ label, value, detail }: { label: string; value: number; detail?: string }) {
|
||||||
const pct = Math.round((value || 0) * 100);
|
const pct = Math.round((value || 0) * 100);
|
||||||
const color = pct > 50 ? 'bg-red-500' : pct > 25 ? 'bg-amber-400' : 'bg-green-400';
|
const color = pct > 50 ? 'bg-red-500' : pct > 25 ? 'bg-amber-400' : 'bg-green-400';
|
||||||
|
const textColor = pct > 50 ? 'text-red-700' : pct > 25 ? 'text-amber-700' : 'text-green-700';
|
||||||
return (
|
return (
|
||||||
<div>
|
<div className="bg-white/50 rounded-lg p-2.5 border border-warm-100">
|
||||||
<div className="flex justify-between text-xs mb-0.5">
|
<div className="flex justify-between text-xs mb-1">
|
||||||
<span className="text-gray-500">{label}</span>
|
<span className="font-semibold text-gray-700">{label}</span>
|
||||||
<span className="font-mono text-gray-700">{pct}%</span>
|
<span className={`font-mono font-bold ${textColor}`}>{pct}%</span>
|
||||||
</div>
|
</div>
|
||||||
<div className="h-1.5 bg-gray-200 rounded-full overflow-hidden">
|
<div className="h-1.5 bg-gray-200 rounded-full overflow-hidden mb-1.5">
|
||||||
<div className={`h-full ${color} rounded-full transition-all`} style={{ width: `${pct}%` }} />
|
<div className={`h-full ${color} rounded-full transition-all`} style={{ width: `${pct}%` }} />
|
||||||
</div>
|
</div>
|
||||||
|
{detail && <p className="text-xs text-gray-500 leading-relaxed">{detail}</p>}
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -218,9 +218,13 @@ func (s *OpenRouterService) ModerateVideo(ctx context.Context, frameURLs []strin
|
||||||
type ModerationResult struct {
|
type ModerationResult struct {
|
||||||
Flagged bool `json:"flagged"`
|
Flagged bool `json:"flagged"`
|
||||||
Reason string `json:"reason"`
|
Reason string `json:"reason"`
|
||||||
|
Explanation string `json:"explanation"`
|
||||||
Hate float64 `json:"hate"`
|
Hate float64 `json:"hate"`
|
||||||
|
HateDetail string `json:"hate_detail"`
|
||||||
Greed float64 `json:"greed"`
|
Greed float64 `json:"greed"`
|
||||||
|
GreedDetail string `json:"greed_detail"`
|
||||||
Delusion float64 `json:"delusion"`
|
Delusion float64 `json:"delusion"`
|
||||||
|
DelusionDetail string `json:"delusion_detail"`
|
||||||
RawContent string `json:"raw_content"`
|
RawContent string `json:"raw_content"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -322,16 +326,24 @@ func parseModerationResponse(raw string) *ModerationResult {
|
||||||
var parsed struct {
|
var parsed struct {
|
||||||
Flagged bool `json:"flagged"`
|
Flagged bool `json:"flagged"`
|
||||||
Reason string `json:"reason"`
|
Reason string `json:"reason"`
|
||||||
|
Explanation string `json:"explanation"`
|
||||||
Hate float64 `json:"hate"`
|
Hate float64 `json:"hate"`
|
||||||
|
HateDetail string `json:"hate_detail"`
|
||||||
Greed float64 `json:"greed"`
|
Greed float64 `json:"greed"`
|
||||||
|
GreedDetail string `json:"greed_detail"`
|
||||||
Delusion float64 `json:"delusion"`
|
Delusion float64 `json:"delusion"`
|
||||||
|
DelusionDetail string `json:"delusion_detail"`
|
||||||
}
|
}
|
||||||
if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
|
if err := json.Unmarshal([]byte(cleaned), &parsed); err == nil {
|
||||||
result.Flagged = parsed.Flagged
|
result.Flagged = parsed.Flagged
|
||||||
result.Reason = parsed.Reason
|
result.Reason = parsed.Reason
|
||||||
|
result.Explanation = parsed.Explanation
|
||||||
result.Hate = parsed.Hate
|
result.Hate = parsed.Hate
|
||||||
|
result.HateDetail = parsed.HateDetail
|
||||||
result.Greed = parsed.Greed
|
result.Greed = parsed.Greed
|
||||||
|
result.GreedDetail = parsed.GreedDetail
|
||||||
result.Delusion = parsed.Delusion
|
result.Delusion = parsed.Delusion
|
||||||
|
result.DelusionDetail = parsed.DelusionDetail
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@@ -351,16 +363,21 @@ Analyze the provided content for policy violations.
|
||||||
Respond ONLY with a JSON object in this exact format:
|
Respond ONLY with a JSON object in this exact format:
|
||||||
{
|
{
|
||||||
"flagged": true/false,
|
"flagged": true/false,
|
||||||
"reason": "brief reason if flagged, empty string if not",
|
"reason": "one-line summary if flagged, empty string if clean",
|
||||||
|
"explanation": "Detailed paragraph explaining your full analysis. Describe exactly what you observed in the content, what specific elements you checked, why each category scored the way it did, and your overall reasoning for the flagged/clean decision.",
|
||||||
"hate": 0.0-1.0,
|
"hate": 0.0-1.0,
|
||||||
|
"hate_detail": "Explain exactly what you found (or didn't find) related to hate. E.g. 'No hate speech, slurs, threats, violence, sexual content, or discriminatory language detected.' or 'Contains racial slur targeting [group] in aggressive context.'",
|
||||||
"greed": 0.0-1.0,
|
"greed": 0.0-1.0,
|
||||||
"delusion": 0.0-1.0
|
"greed_detail": "Explain exactly what you found (or didn't find) related to greed. E.g. 'No spam, scam language, or promotional manipulation detected.' or 'Contains crypto pump-and-dump language with fake earnings claims.'",
|
||||||
|
"delusion": 0.0-1.0,
|
||||||
|
"delusion_detail": "Explain exactly what you found (or didn't find) related to delusion. E.g. 'No misinformation, self-harm, or conspiracy content detected.' or 'Promotes unverified medical cure with dangerous dosage advice.'"
|
||||||
}
|
}
|
||||||
|
|
||||||
Scoring guide (Three Poisons framework):
|
Scoring guide (Three Poisons framework):
|
||||||
- hate: harassment, threats, violence, sexual content, hate speech, discrimination
|
- hate: harassment, threats, violence, sexual content, nudity, hate speech, discrimination, graphic imagery
|
||||||
- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick
|
- greed: spam, scams, crypto schemes, misleading promotions, get-rich-quick, MLM recruitment
|
||||||
- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice
|
- delusion: misinformation, self-harm content, conspiracy theories, dangerous medical advice, deepfakes
|
||||||
|
|
||||||
Score 0.0 = no concern, 1.0 = extreme violation. Flag if any score > 0.5.
|
Score 0.0 = no concern, 1.0 = extreme violation. Flag if any score > 0.5.
|
||||||
|
ALWAYS provide detailed explanations even when content is clean — explain what you checked and why it passed.
|
||||||
Only respond with the JSON, no other text.`
|
Only respond with the JSON, no other text.`
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue