feat: full NSFW system - Cinemax rules, auto-reclassify with warning, not-allowed removal with appeal email, blur toggle setting, user self-labeling

Patrick Britton 2026-02-07 16:58:57 -06:00
parent 68dd8d3544
commit 27b48128fe
7 changed files with 192 additions and 29 deletions

View file

@@ -2,6 +2,7 @@ package handlers
 import (
     "context"
+    "fmt"
     "net/http"
     "strings"
     "time"
@@ -209,6 +210,8 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
     BeaconLat  *float64 `json:"beacon_lat"`
     BeaconLong *float64 `json:"beacon_long"`
     TTLHours   *int     `json:"ttl_hours"`
+    IsNSFW     bool     `json:"is_nsfw"`
+    NSFWReason string   `json:"nsfw_reason"`
 }
 if err := c.ShouldBindJSON(&req); err != nil {
@@ -293,6 +296,8 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
     AllowChain: allowChain,
     Visibility: "public",
     ExpiresAt:  expiresAt,
+    IsNSFW:     req.IsNSFW,
+    NSFWReason: req.NSFWReason,
     Lat:        req.BeaconLat,
     Long:       req.BeaconLong,
 }
@@ -338,18 +343,24 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
 }
 // 5b. OpenRouter AI Moderation — NSFW vs Flag decision
+userSelfLabeledNSFW := req.IsNSFW
+orDecision := ""
 if h.openRouterService != nil {
     orResult, orErr := h.openRouterService.ModerateText(c.Request.Context(), req.Body)
     if orErr == nil && orResult != nil {
+        orDecision = orResult.Action
         switch orResult.Action {
         case "nsfw":
             post.IsNSFW = true
+            if orResult.NSFWReason != "" {
                 post.NSFWReason = orResult.NSFWReason
+            }
             if post.Status != "pending_moderation" {
                 post.Status = "active" // NSFW posts are active but blurred
             }
         case "flag":
-            post.Status = "pending_moderation"
+            // NOT ALLOWED — will be removed after creation
+            post.Status = "removed"
         }
         // Update CIS from OpenRouter scores if available
         if orResult.Hate > 0 || orResult.Greed > 0 || orResult.Delusion > 0 {
@@ -367,8 +378,7 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
 }
 // Handle Flags - Comprehensive Content Flagging
-if h.moderationService != nil && post.Status == "pending_moderation" {
-    // Extract all media URLs for flagging
+if h.moderationService != nil && (post.Status == "pending_moderation" || post.Status == "removed") {
     mediaURLs := []string{}
     if req.ImageURL != nil && *req.ImageURL != "" {
         mediaURLs = append(mediaURLs, *req.ImageURL)
@@ -384,6 +394,64 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
     _ = h.moderationService.FlagPost(c.Request.Context(), post.ID, scores, reason)
 }
+// NSFW auto-reclassify: AI says NSFW but user didn't self-label → send warning
+if post.IsNSFW && !userSelfLabeledNSFW && h.notificationService != nil {
+    go func() {
+        ctx := context.Background()
+        h.notificationService.NotifyNSFWWarning(ctx, userID.String(), post.ID.String())
+        log.Info().Str("post_id", post.ID.String()).Str("author_id", userID.String()).Msg("NSFW warning sent — post auto-labeled")
+    }()
+}
+// NOT ALLOWED: AI flagged → post removed, create violation, send appeal notification + email
+if post.Status == "removed" && orDecision == "flag" {
+    go func() {
+        ctx := context.Background()
+        // Send in-app notification
+        if h.notificationService != nil {
+            h.notificationService.NotifyContentRemoved(ctx, userID.String(), post.ID.String())
+        }
+        // Create moderation violation record
+        if h.moderationService != nil {
+            h.moderationService.FlagPost(ctx, post.ID, &services.ThreePoisonsScore{Hate: 1.0}, "not_allowed")
+        }
+        // Send appeal email — get email from users table, display name from profiles
+        var userEmail string
+        h.postRepo.Pool().QueryRow(ctx, `SELECT email FROM users WHERE id = $1`, userID).Scan(&userEmail)
+        profile, _ := h.userRepo.GetProfileByID(ctx, userID.String())
+        if userEmail != "" {
+            displayName := "there"
+            if profile != nil && profile.DisplayName != nil {
+                displayName = *profile.DisplayName
+            }
+            snippet := req.Body
+            if len(snippet) > 100 {
+                snippet = snippet[:100] + "..."
+            }
+            appealBody := fmt.Sprintf(
+                "Hi %s,\n\n"+
+                    "Your recent post on Sojorn was removed because it was found to violate our community guidelines.\n\n"+
+                    "Post content: \"%s\"\n\n"+
+                    "If you believe this was a mistake, you can appeal this decision in your Sojorn app:\n"+
+                    "Go to Profile → Settings → Appeals\n\n"+
+                    "Our moderation team will review your appeal within 48 hours.\n\n"+
+                    "— The Sojorn Team",
+                displayName, snippet,
+            )
+            log.Info().Str("email", userEmail).Msg("Sending content removal appeal email")
+            h.postRepo.Pool().Exec(ctx,
+                `INSERT INTO email_queue (to_email, subject, body, created_at) VALUES ($1, $2, $3, NOW()) ON CONFLICT DO NOTHING`,
+                userEmail, "Your Sojorn post was removed", appealBody,
+            )
+        }
+        log.Warn().Str("post_id", post.ID.String()).Str("author_id", userID.String()).Msg("Post removed by AI moderation — not allowed content")
+    }()
+}
 // Log AI moderation decision to audit log
 if h.moderationService != nil {
     decision := "pass"
@@ -391,7 +459,9 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
     if post.ToneLabel != nil && *post.ToneLabel != "" {
         flagReason = *post.ToneLabel
     }
-    if post.Status == "pending_moderation" {
+    if post.Status == "removed" {
+        decision = "flag"
+    } else if post.Status == "pending_moderation" {
         decision = "flag"
     } else if post.IsNSFW {
         decision = "nsfw"
@@ -403,7 +473,7 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
     } else {
         scores = &services.ThreePoisonsScore{}
     }
-    h.moderationService.LogAIDecision(c.Request.Context(), "post", post.ID, userID, req.Body, scores, nil, decision, flagReason, "", nil)
+    h.moderationService.LogAIDecision(c.Request.Context(), "post", post.ID, userID, req.Body, scores, nil, decision, flagReason, orDecision, nil)
 }

 // Check for @mentions and notify mentioned users
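
Taken together, the handler now resolves one of three outcomes: clean posts stay active, NSFW posts stay active but blurred, and flagged posts are removed with an appeal path. A condensed sketch of that decision matrix, assuming post.Status starts as "active" unless an earlier filter already set "pending_moderation" (the helper name and simplified signature are illustrative, not part of this commit):

// Illustrative distillation of the CreatePost moderation logic above;
// not actual project code. status is the post status before the
// OpenRouter decision is applied.
func resolveModeration(status, aiAction string, selfLabeled bool) (newStatus string, nsfw, warnAuthor, removed bool) {
    newStatus, nsfw = status, selfLabeled
    switch aiAction {
    case "nsfw":
        nsfw = true
        if status != "pending_moderation" {
            newStatus = "active" // visible, but blurred client-side
        }
        warnAuthor = !selfLabeled // auto-reclassified: author gets an NSFW warning
    case "flag":
        newStatus = "removed" // not allowed: appeal notification + email follow
        removed = true
    }
    return
}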

View file

@@ -22,6 +22,8 @@ const (
     NotificationTypeBeaconReport   = "beacon_report"
     NotificationTypeShare          = "share"
     NotificationTypeQuipReaction   = "quip_reaction"
+    NotificationTypeNSFWWarning    = "nsfw_warning"
+    NotificationTypeContentRemoved = "content_removed"
 )

 // NotificationPriority constants

View file

@@ -31,5 +31,6 @@ type UserSettings struct {
     DataSaverMode   *bool     `json:"data_saver_mode" db:"data_saver_mode"`
     DefaultPostTtl  *int      `json:"default_post_ttl" db:"default_post_ttl"`
     NSFWEnabled     *bool     `json:"nsfw_enabled" db:"nsfw_enabled"`
+    NSFWBlurEnabled *bool     `json:"nsfw_blur_enabled" db:"nsfw_blur_enabled"`
     UpdatedAt       time.Time `json:"updated_at" db:"updated_at"`
 }
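
Client-side, the two settings are intended to compose: nsfw_enabled gates whether NSFW posts are shown at all (the repository below defaults it to FALSE), while nsfw_blur_enabled controls whether shown NSFW posts start blurred (default TRUE). A hedged sketch of that gate; the exact composition is an assumption drawn from the field names and defaults, not something this diff spells out:

// Assumed viewer-side gate; not part of this commit.
func nsfwVisibility(postIsNSFW bool, s models.UserSettings) (show, blur bool) {
    if !postIsNSFW {
        return true, false
    }
    show = s.NSFWEnabled != nil && *s.NSFWEnabled         // opt-in required
    blur = s.NSFWBlurEnabled == nil || *s.NSFWBlurEnabled // blur defaults on
    return
}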

View file

@@ -19,6 +19,10 @@ func NewPostRepository(pool *pgxpool.Pool) *PostRepository {
     return &PostRepository{pool: pool}
 }

+func (r *PostRepository) Pool() *pgxpool.Pool {
+    return r.pool
+}
+
 func (r *PostRepository) CreatePost(ctx context.Context, post *models.Post) error {
     // Calculate confidence score if it's a beacon
     if post.IsBeacon {
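
The new Pool() accessor exposes the underlying pgxpool so handlers can run one-off queries against tables the repository does not model; the email_queue insert in the post handler above relies on it. A minimal sketch of that pattern (the helper name is hypothetical):

// Hypothetical helper illustrating the ad-hoc query pattern Pool() enables.
func queueEmail(ctx context.Context, repo *PostRepository, to, subject, body string) error {
    _, err := repo.Pool().Exec(ctx,
        `INSERT INTO email_queue (to_email, subject, body, created_at)
         VALUES ($1, $2, $3, NOW()) ON CONFLICT DO NOTHING`,
        to, subject, body)
    return err
}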

View file

@@ -729,7 +729,7 @@ func (r *UserRepository) GetUserSettings(ctx context.Context, userID string) (*m
     query := `
         SELECT user_id, theme, language, notifications_enabled, email_notifications,
                push_notifications, content_filter_level, auto_play_videos, data_saver_mode,
-               default_post_ttl, COALESCE(nsfw_enabled, FALSE), updated_at
+               default_post_ttl, COALESCE(nsfw_enabled, FALSE), COALESCE(nsfw_blur_enabled, TRUE), updated_at
         FROM public.user_settings
         WHERE user_id = $1::uuid
     `
@@ -737,7 +737,7 @@ func (r *UserRepository) GetUserSettings(ctx context.Context, userID string) (*m
     err := r.pool.QueryRow(ctx, query, userID).Scan(
         &us.UserID, &us.Theme, &us.Language, &us.NotificationsEnabled, &us.EmailNotifications,
         &us.PushNotifications, &us.ContentFilterLevel, &us.AutoPlayVideos, &us.DataSaverMode,
-        &us.DefaultPostTtl, &us.NSFWEnabled, &us.UpdatedAt,
+        &us.DefaultPostTtl, &us.NSFWEnabled, &us.NSFWBlurEnabled, &us.UpdatedAt,
     )
     if err != nil {
         if err.Error() == "no rows in result set" || err.Error() == "pgx: no rows in result set" {
@@ -759,6 +759,7 @@ func (r *UserRepository) GetUserSettings(ctx context.Context, userID string) (*m
             AutoPlayVideos:  &t,
             DataSaverMode:   &f,
             NSFWEnabled:     &f,
+            NSFWBlurEnabled: &t,
             UpdatedAt:       time.Now(),
         }, nil
     }
@@ -772,8 +773,8 @@ func (r *UserRepository) UpdateUserSettings(ctx context.Context, us *models.User
         INSERT INTO public.user_settings (
             user_id, theme, language, notifications_enabled, email_notifications,
             push_notifications, content_filter_level, auto_play_videos, data_saver_mode,
-            default_post_ttl, nsfw_enabled, updated_at
-        ) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, NOW())
+            default_post_ttl, nsfw_enabled, nsfw_blur_enabled, updated_at
+        ) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NOW())
         ON CONFLICT (user_id) DO UPDATE SET
             theme = COALESCE(EXCLUDED.theme, user_settings.theme),
             language = COALESCE(EXCLUDED.language, user_settings.language),
@@ -785,12 +786,13 @@ func (r *UserRepository) UpdateUserSettings(ctx context.Context, us *models.User
             data_saver_mode = COALESCE(EXCLUDED.data_saver_mode, user_settings.data_saver_mode),
             default_post_ttl = COALESCE(EXCLUDED.default_post_ttl, user_settings.default_post_ttl),
             nsfw_enabled = COALESCE(EXCLUDED.nsfw_enabled, user_settings.nsfw_enabled),
+            nsfw_blur_enabled = COALESCE(EXCLUDED.nsfw_blur_enabled, user_settings.nsfw_blur_enabled),
             updated_at = NOW()
     `
     _, err := r.pool.Exec(ctx, query,
         us.UserID, us.Theme, us.Language, us.NotificationsEnabled, us.EmailNotifications,
         us.PushNotifications, us.ContentFilterLevel, us.AutoPlayVideos, us.DataSaverMode,
-        us.DefaultPostTtl, us.NSFWEnabled,
+        us.DefaultPostTtl, us.NSFWEnabled, us.NSFWBlurEnabled,
     )
     return err
 }
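
Because every column in the upsert goes through COALESCE, a caller can change one setting by populating only that field; nil fields keep their stored values. A sketch of toggling just the blur setting under that reading (illustrative call, not project code):

// Flip only nsfw_blur_enabled; every other field is nil and therefore
// preserved by the COALESCE clauses above.
blur := false
err := userRepo.UpdateUserSettings(ctx, &models.UserSettings{
    UserID:          userID,
    NSFWBlurEnabled: &blur,
})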

View file

@@ -206,6 +206,32 @@ func (s *NotificationService) NotifyBeaconReport(ctx context.Context, beaconAuth
     })
 }

+// NotifyNSFWWarning sends a warning when a post is auto-labeled as NSFW
+func (s *NotificationService) NotifyNSFWWarning(ctx context.Context, authorID string, postID string) error {
+    authorUUID := uuid.MustParse(authorID)
+    return s.sendNotification(ctx, models.PushNotificationRequest{
+        UserID:   authorUUID,
+        Type:     models.NotificationTypeNSFWWarning,
+        ActorID:  authorUUID, // system-generated, actor is self
+        PostID:   uuidPtr(postID),
+        PostType: "standard",
+        Priority: models.PriorityHigh,
+    })
+}
+
+// NotifyContentRemoved sends a notification when content is removed by AI moderation
+func (s *NotificationService) NotifyContentRemoved(ctx context.Context, authorID string, postID string) error {
+    authorUUID := uuid.MustParse(authorID)
+    return s.sendNotification(ctx, models.PushNotificationRequest{
+        UserID:   authorUUID,
+        Type:     models.NotificationTypeContentRemoved,
+        ActorID:  authorUUID, // system-generated
+        PostID:   uuidPtr(postID),
+        PostType: "standard",
+        Priority: models.PriorityUrgent,
+    })
+}
+
 // ============================================================================
 // Core Send Logic
 // ============================================================================
@@ -402,6 +428,16 @@ func (s *NotificationService) buildPushPayload(req models.PushNotificationReques
         body = fmt.Sprintf("%s reacted to your quip", actorName)
     }
+case models.NotificationTypeNSFWWarning:
+    title = "Content Labeled as Sensitive"
+    body = "Your post was automatically labeled as NSFW. Please label sensitive content when posting to avoid further action."
+    data["target"] = "main_feed"
+case models.NotificationTypeContentRemoved:
+    title = "Content Removed"
+    body = "Your post was removed for violating community guidelines. You can appeal this decision in your profile settings."
+    data["target"] = "profile_settings"
 default:
     title = "Sojorn"
     body = "You have a new notification"

View file

@@ -414,18 +414,66 @@ const defaultModerationSystemPrompt = `You are a content moderation AI for Sojor
 Analyze the provided content and decide one of three actions:

 1. "clean" → Content is appropriate for all users. No issues.
-2. "nsfw" → Content is NOT illegal or bannable, but is mature/sensitive. Examples: mild violence, suggestive (but not explicit) imagery, dark humor, intense themes, horror content, heated political speech, depictions of alcohol/smoking. This content will be blurred with a warning label so users who opted in can choose to view it.
-3. "flag" → Content violates platform policy and should be reviewed by moderators. Examples: explicit nudity/pornography, graphic gore, illegal activity, credible threats, child exploitation, hard drug use instructions, doxxing, extreme hate speech.
-
-When unsure, prefer "nsfw" over "flag"; only flag content you believe is clearly illegal or extremely graphic.
+2. "nsfw" → Content is mature/sensitive but ALLOWED on the platform. It will be blurred behind a warning label for users who have opted in. Think "Cinemax late night": permissive but not extreme.
+3. "flag" → Content is NOT ALLOWED and will be removed. The user will receive an appeal notice.
+
+NUDITY & SEXUAL CONTENT RULES (Cinemax Rule)
+NSFW (allowed, blurred):
+- Partial or full nudity (breasts, buttocks, genitalia visible)
+- Suggestive or sensual poses, lingerie, implied sexual situations
+- Artistic nude photography, figure drawing, body-positive content
+- Breastfeeding, non-sexual nudity in natural contexts
+
+NOT ALLOWED (flag):
+- Explicit sexual intercourse (penetration, oral sex, any sex acts)
+- Hardcore pornography of any kind
+- Any sexual content involving minors (ZERO TOLERANCE → always flag)
+- Non-consensual sexual content, revenge porn
+- Bestiality
+
+VIOLENCE RULES (1-10 Scale)
+Rate the violence level on a 1-10 scale in your explanation:
+1-3: Mild (arguments, shoving, cartoon violence) → "clean"
+4-5: Moderate (blood from injuries, protest footage with blood, boxing/MMA, hunting) → "nsfw"
+6-7: Graphic (open wounds, significant bloodshed, war footage) → "flag"
+8-10: Extreme (torture, dismemberment, gore, execution) → "flag"
+Only violence rated 5 or below is allowed. 6+ is always flagged and removed.
+Protest footage showing blood or injuries = NSFW (4-5), NOT flagged.
+
+OTHER CONTENT RULES
+NSFW (allowed, blurred):
+- Dark humor, edgy memes, intense themes
+- Horror content, gore in fiction/movies (≤5 on violence scale)
+- Drug/alcohol references, smoking imagery
+- Heated political speech, strong profanity
+- Depictions of self-harm recovery (educational/supportive context)
+
+NOT ALLOWED (flag):
+- Credible threats of violence against real people
+- Doxxing (sharing private info to harass)
+- Illegal activity instructions (bomb-making, drug synthesis)
+- Extreme hate speech targeting protected groups
+- Spam/scam content designed to defraud users
+- Dangerous medical misinformation that could cause harm
+- Deepfakes designed to deceive or defame
+
+When unsure between clean and nsfw, prefer "nsfw" (better safe, user sees it blurred).
+When unsure between nsfw and flag, prefer "nsfw"; only flag content that clearly crosses the lines above.
 Respond ONLY with a JSON object in this exact format:
 {
   "action": "clean" or "nsfw" or "flag",
-  "nsfw_reason": "If action is nsfw, a short label users will see: e.g. 'Violence', 'Suggestive Content', '18+ Themes', 'Gore', 'Drug References'. Empty string if clean or flag.",
+  "nsfw_reason": "If action is nsfw, a short label: e.g. 'Nudity', 'Violence', 'Suggestive Content', '18+ Themes', 'Gore', 'Drug References'. Empty string if clean or flag.",
   "flagged": true/false,
   "reason": "one-line summary if flagged or nsfw, empty string if clean",
-  "explanation": "Detailed paragraph explaining your full analysis and why you chose this action.",
+  "explanation": "Detailed paragraph explaining your analysis. For violence, include your 1-10 rating. For nudity, explain what is shown and why it does or does not cross the intercourse line.",
   "hate": 0.0-1.0,
   "hate_detail": "What you found or didn't find related to hate/violence/sexual content.",
   "greed": 0.0-1.0,