Add layered content moderation: hard blocklist + strike system + client-side filter

2026-02-06 11:46:30 -06:00 · 2026-02-06 11:46:30 -06:00 · f6c4bb88e0
parent 35740f3fc6
commit f6c4bb88e0
6 changed files with 391 additions and 5 deletions
--- a/go-backend/cmd/api/main.go
+++ b/go-backend/cmd/api/main.go
@ -120,11 +120,14 @@ func main() {
 	// Initialize appeal service
 	appealService := services.NewAppealService(dbPool)
 	// Initialize content filter (hard blocklist + strike system)
 	contentFilter := services.NewContentFilter(dbPool)
 	hub := realtime.NewHub()
 	wsHandler := handlers.NewWSHandler(hub, cfg.JWTSecret)
 	userHandler := handlers.NewUserHandler(userRepo, postRepo, notificationService, assetService)
-	postHandler := handlers.NewPostHandler(postRepo, userRepo, feedService, assetService, notificationService, moderationService)
+	postHandler := handlers.NewPostHandler(postRepo, userRepo, feedService, assetService, notificationService, moderationService, contentFilter)
 	chatHandler := handlers.NewChatHandler(chatRepo, notificationService, hub)
 	authHandler := handlers.NewAuthHandler(userRepo, cfg, emailService)
 	categoryHandler := handlers.NewCategoryHandler(categoryRepo)
--- a/go-backend/internal/handlers/post_handler.go
+++ b/go-backend/internal/handlers/post_handler.go
@ -22,9 +22,10 @@ type PostHandler struct {
 	assetService        *services.AssetService
 	notificationService *services.NotificationService
 	moderationService   *services.ModerationService
 	contentFilter       *services.ContentFilter
 }
-func NewPostHandler(postRepo *repository.PostRepository, userRepo *repository.UserRepository, feedService *services.FeedService, assetService *services.AssetService, notificationService *services.NotificationService, moderationService *services.ModerationService) *PostHandler {
+func NewPostHandler(postRepo *repository.PostRepository, userRepo *repository.UserRepository, feedService *services.FeedService, assetService *services.AssetService, notificationService *services.NotificationService, moderationService *services.ModerationService, contentFilter *services.ContentFilter) *PostHandler {
 	return &PostHandler{
 		postRepo:            postRepo,
 		userRepo:            userRepo,
@ -32,6 +33,7 @@ func NewPostHandler(postRepo *repository.PostRepository, userRepo *repository.Us
 		assetService:        assetService,
 		notificationService: notificationService,
 		moderationService:   moderationService,
 		contentFilter:       contentFilter,
 	}
 }
@ -55,6 +57,23 @@ func (h *PostHandler) CreateComment(c *gin.Context) {
 		return
 	}
 	// Layer 0: Hard blocklist check — reject immediately, never save
 	if h.contentFilter != nil {
 		result := h.contentFilter.CheckContent(req.Body)
 		if result.Blocked {
 			// Record strike
 			strikeCount, consequence, _ := h.contentFilter.RecordStrike(c.Request.Context(), userID, result.Category, req.Body)
 			c.JSON(http.StatusUnprocessableEntity, gin.H{
 				"error":       result.Message,
 				"blocked":     true,
 				"category":    result.Category,
 				"strikes":     strikeCount,
 				"consequence": consequence,
 			})
 			return
 		}
 	}
 	tags := utils.ExtractHashtags(req.Body)
 	tone := "neutral"
 	cis := 0.8
@ -185,6 +204,22 @@ func (h *PostHandler) CreatePost(c *gin.Context) {
 		return
 	}
 	// Layer 0: Hard blocklist check — reject immediately, never save
 	if h.contentFilter != nil {
 		result := h.contentFilter.CheckContent(req.Body)
 		if result.Blocked {
 			strikeCount, consequence, _ := h.contentFilter.RecordStrike(c.Request.Context(), userID, result.Category, req.Body)
 			c.JSON(http.StatusUnprocessableEntity, gin.H{
 				"error":       result.Message,
 				"blocked":     true,
 				"category":    result.Category,
 				"strikes":     strikeCount,
 				"consequence": consequence,
 			})
 			return
 		}
 	}
 	// 1. Check rate limit (Simplification)
 	trustState, err := h.userRepo.GetTrustState(c.Request.Context(), userID.String())
 	if err == nil && trustState.PostsToday >= 50 { // Example hard limit
--- a/go-backend/internal/services/content_filter.go
+++ b/go-backend/internal/services/content_filter.go
@ -0,0 +1,210 @@
 package services
 import (
 	"context"
 	"fmt"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/google/uuid"
 	"github.com/jackc/pgx/v5/pgxpool"
 )
 // ContentFilter provides hard blocklist checking and strike tracking.
 // Layer 0: Instant rejection for obvious slurs — post never saves.
 //
 // Build instances with NewContentFilter so the pattern list is compiled;
 // a zero-value ContentFilter has no patterns and a nil pool.
 type ContentFilter struct {
 	// pool is the database handle used to insert and count rows in
 	// content_strikes and to update a user's status.
 	pool     *pgxpool.Pool
 	// patterns holds the compiled blocklist, filled in by buildPatterns and
 	// scanned in order by CheckContent.
 	patterns []*blockedPattern
 }
 // blockedPattern pairs one compiled blocklist expression with the metadata
 // reported when it matches.
 type blockedPattern struct {
 	// regex is the compiled expression matched against normalized text.
 	regex    *regexp.Regexp
 	// category is copied into ContentCheckResult.Category on a match.
 	category string // "slur", "threat", etc.
 	// severity gates enforcement: CheckContent only acts on "hard" entries.
 	// NOTE(review): no handling of "soft" entries is visible in this file.
 	severity string // "hard" = instant block, "soft" = warning
 }
 // ContentCheckResult is returned by CheckContent.
 // Category and Message are only populated when Blocked is true.
 type ContentCheckResult struct {
 	// Blocked reports whether a hard blocklist pattern matched the text.
 	Blocked  bool   `json:"blocked"`
 	// Category is the category of the first pattern that matched.
 	Category string `json:"category,omitempty"`
 	// Message is the user-facing explanation for the rejection.
 	Message  string `json:"message,omitempty"`
 }
 // NewContentFilter returns a ContentFilter that uses pool for strike
 // bookkeeping and has its blocklist patterns compiled and ready to use.
 func NewContentFilter(pool *pgxpool.Pool) *ContentFilter {
 	filter := new(ContentFilter)
 	filter.pool = pool
 	// Compile the blocklist once up front so CheckContent never compiles.
 	filter.buildPatterns()
 	return filter
 }
 // buildPatterns compiles the hard blocklist and stores it in cf.patterns.
 //
 // The expressions are case-insensitive, anchored on word boundaries, and
 // written to tolerate common evasion tactics:
 //   - whitespace inserted between letters
 //   - leetspeak digit/symbol substitutions
 //   - repeated characters
 //
 // An expression that fails to compile is reported on stdout and skipped; the
 // remaining expressions are still loaded.
 //
 // NOTE(review): normalizeText deletes '*' and '|' before matching, so masked
 // spellings reach these expressions with the masking character removed rather
 // than substituted — confirm that gives the intended coverage.
 func (cf *ContentFilter) buildPatterns() {
 	type spec struct {
 		expr  string
 		kind  string
 		level string
 	}
 	// Everything listed here is hard-blocked: matching content is never posted.
 	// Order matters only in that CheckContent reports the first match.
 	specs := []spec{
 		// N-word and variants
 		{expr: `(?i)\bn[i1!|l][gq9][gq9]+[e3a@]?[r0d]?s?\b`, kind: "slur", level: "hard"},
 		{expr: `(?i)\bn[i1!|l][gq9]+[aA@]\b`, kind: "slur", level: "hard"},
 		{expr: `(?i)\bn\s*[i1!]\s*[gq9]\s*[gq9]\s*[e3a]?\s*[r0]?\b`, kind: "slur", level: "hard"},
 		// F-word (homophobic slur) and variants
 		{expr: `(?i)\bf[a@4][gq9][gq9]?[o0]?[t7]?s?\b`, kind: "slur", level: "hard"},
 		{expr: `(?i)\bf\s*[a@4]\s*[gq9]\s*[gq9]?\s*[o0]?\s*[t7]?\b`, kind: "slur", level: "hard"},
 		// K-word (anti-Jewish slur)
 		{expr: `(?i)\bk[i1][k]+[e3]?s?\b`, kind: "slur", level: "hard"},
 		// C-word (racial slur against Asian people)
 		{expr: `(?i)\bch[i1]n[k]+s?\b`, kind: "slur", level: "hard"},
 		// S-word (anti-Hispanic slur)
 		{expr: `(?i)\bsp[i1][ck]+s?\b`, kind: "slur", level: "hard"},
 		// W-word (racial slur)
 		{expr: `(?i)\bw[e3][t7]b[a@]ck+s?\b`, kind: "slur", level: "hard"},
 		// R-word (ableist slur)
 		{expr: `(?i)\br[e3]t[a@]rd+s?\b`, kind: "slur", level: "hard"},
 		// T-word (transphobic slur)
 		{expr: `(?i)\btr[a@4]nn[yie]+s?\b`, kind: "slur", level: "hard"},
 		// Direct death/violence threats
 		{expr: `(?i)\b(i('?m| am) go(ing|nna)|i('?ll| will)) (to )?(kill|murder|shoot|stab|rape)\b`, kind: "threat", level: "hard"},
 		{expr: `(?i)\b(kill|murder|shoot|stab|rape) (you|them|him|her|all)\b`, kind: "threat", level: "hard"},
 	}
 	compiled := make([]*blockedPattern, 0, len(specs))
 	for i := range specs {
 		s := specs[i]
 		re, err := regexp.Compile(s.expr)
 		if err == nil {
 			compiled = append(compiled, &blockedPattern{regex: re, category: s.kind, severity: s.level})
 			continue
 		}
 		// Keep loading the rest; one bad expression must not disable the filter.
 		fmt.Printf("Content filter: failed to compile pattern %q: %v\n", s.expr, err)
 	}
 	cf.patterns = compiled
 	fmt.Printf("Content filter: loaded %d patterns\n", len(cf.patterns))
 }
 // CheckContent reports whether text trips the hard blocklist.
 //
 // The text is passed through normalizeText and then tested against each
 // compiled pattern in order; only patterns with severity "hard" are
 // considered. The first match wins: the result has Blocked set, carries that
 // pattern's category, and includes a fixed user-facing message. Empty text
 // and text with no match yield a result with Blocked false.
 func (cf *ContentFilter) CheckContent(text string) *ContentCheckResult {
 	verdict := &ContentCheckResult{Blocked: false}
 	if text == "" {
 		return verdict
 	}
 	// Match against the normalized form so stripped separator and zero-width
 	// characters cannot hide a hit.
 	cleaned := normalizeText(text)
 	for i := range cf.patterns {
 		rule := cf.patterns[i]
 		if rule.severity != "hard" {
 			continue
 		}
 		if !rule.regex.MatchString(cleaned) {
 			continue
 		}
 		verdict.Blocked = true
 		verdict.Category = rule.category
 		verdict.Message = "This content contains language that isn't allowed on Sojorn. Please revise your post."
 		return verdict
 	}
 	return verdict
 }
 // RecordStrike stores a content-violation strike for userID and applies the
 // escalation that the user's recent strike total calls for.
 //
 // A row holding category and a snippet of content (capped at 100 bytes) is
 // inserted into content_strikes. Strikes from the last 30 days, including the
 // new one, are then counted and mapped to a consequence:
 //
 //	1-2 strikes: "warning"     (no account change)
 //	3-4 strikes: "suspend_24h" (status 'suspended', suspended_until = now + 24h)
 //	5-6 strikes: "suspend_7d"  (status 'suspended', suspended_until = now + 7d)
 //	7+ strikes:  "ban"         (status 'banned')
 //
 // It returns the recent strike count and the consequence string. If the
 // insert or the count query fails it returns 0, "" and the error. If the
 // strike was recorded but the users update fails, the count and the intended
 // consequence are still returned together with a non-nil error, so callers
 // can tell that the sanction was not actually applied.
 func (cf *ContentFilter) RecordStrike(ctx context.Context, userID uuid.UUID, category, content string) (int, string, error) {
 	// Insert strike
 	if _, err := cf.pool.Exec(ctx, `
 		INSERT INTO content_strikes (user_id, category, content_snippet, created_at)
 		VALUES ($1, $2, $3, NOW())
 	`, userID, category, truncate(content, 100)); err != nil {
 		return 0, "", fmt.Errorf("failed to record strike: %w", err)
 	}
 	// Count recent strikes (last 30 days); the row inserted above is included.
 	var count int
 	if err := cf.pool.QueryRow(ctx, `
 		SELECT COUNT(*) FROM content_strikes
 		WHERE user_id = $1 AND created_at > NOW() - INTERVAL '30 days'
 	`, userID).Scan(&count); err != nil {
 		return 0, "", fmt.Errorf("failed to count strikes: %w", err)
 	}
 	// Determine and apply the consequence. Update errors are surfaced instead
 	// of dropped: a sanction that was not written must not be reported as
 	// enforced without any signal.
 	consequence := "warning"
 	switch {
 	case count >= 7:
 		consequence = "ban"
 		if _, err := cf.pool.Exec(ctx, `UPDATE users SET status = 'banned' WHERE id = $1`, userID); err != nil {
 			return count, consequence, fmt.Errorf("failed to ban user %s: %w", userID, err)
 		}
 		fmt.Printf("Content filter: user %s BANNED (%d strikes)\n", userID, count)
 	case count >= 5:
 		consequence = "suspend_7d"
 		suspendUntil := time.Now().Add(7 * 24 * time.Hour)
 		if _, err := cf.pool.Exec(ctx, `UPDATE users SET status = 'suspended', suspended_until = $2 WHERE id = $1`, userID, suspendUntil); err != nil {
 			return count, consequence, fmt.Errorf("failed to suspend user %s for 7 days: %w", userID, err)
 		}
 		fmt.Printf("Content filter: user %s suspended 7 days (%d strikes)\n", userID, count)
 	case count >= 3:
 		consequence = "suspend_24h"
 		suspendUntil := time.Now().Add(24 * time.Hour)
 		if _, err := cf.pool.Exec(ctx, `UPDATE users SET status = 'suspended', suspended_until = $2 WHERE id = $1`, userID, suspendUntil); err != nil {
 			return count, consequence, fmt.Errorf("failed to suspend user %s for 24h: %w", userID, err)
 		}
 		fmt.Printf("Content filter: user %s suspended 24h (%d strikes)\n", userID, count)
 	default:
 		fmt.Printf("Content filter: user %s warning (%d strikes)\n", userID, count)
 	}
 	return count, consequence, nil
 }
 // GetUserStrikes returns how many content_strikes rows were recorded for
 // userID within the last 30 days, along with any error from the query.
 func (cf *ContentFilter) GetUserStrikes(ctx context.Context, userID uuid.UUID) (int, error) {
 	const recentStrikesSQL = `
 		SELECT COUNT(*) FROM content_strikes
 		WHERE user_id = $1 AND created_at > NOW() - INTERVAL '30 days'
 	`
 	var recent int
 	row := cf.pool.QueryRow(ctx, recentStrikesSQL, userID)
 	scanErr := row.Scan(&recent)
 	return recent, scanErr
 }
 // evasionStripper deletes, in a single pass, the zero-width characters and
 // separator characters that are commonly inserted to break up blocked words.
 var evasionStripper = strings.NewReplacer(
 	"\u200b", "", // zero-width space
 	"\u200c", "", // zero-width non-joiner
 	"\u200d", "", // zero-width joiner
 	"\ufeff", "", // BOM
 	".", "",
 	"-", "",
 	"_", "",
 	"*", "",
 	"|", "",
 )
 // normalizeText strips common evasion characters and collapses spacing.
 //
 // Zero-width and separator characters are removed first; every run of
 // whitespace (spaces, tabs, newlines) is then reduced to a single space and
 // leading/trailing whitespace is dropped. The collapse runs after stripping
 // because deleting a separator can itself leave a doubled space, which would
 // otherwise defeat patterns that expect exactly one space between words.
 func normalizeText(text string) string {
 	stripped := evasionStripper.Replace(text)
 	return strings.Join(strings.Fields(stripped), " ")
 }
 // truncate returns s cut down to at most maxLen bytes without splitting a
 // multi-byte UTF-8 character: when the byte limit falls inside a character,
 // that whole character is dropped. Strings that already fit are returned
 // unchanged, and a non-positive maxLen yields "".
 func truncate(s string, maxLen int) string {
 	if maxLen <= 0 {
 		return ""
 	}
 	if len(s) <= maxLen {
 		return s
 	}
 	// Ranging over a string yields the byte offset at which each character
 	// starts; the largest such offset not beyond maxLen is a safe cut point.
 	cut := 0
 	for i := range s {
 		if i > maxLen {
 			break
 		}
 		cut = i
 	}
 	return s[:cut]
 }
--- a/go-backend/scripts/create_content_strikes.sql
+++ b/go-backend/scripts/create_content_strikes.sql
@ -0,0 +1,18 @@
 -- One row per blocked post/comment attempt. Rows are removed together with
 -- the owning user (ON DELETE CASCADE).
 CREATE TABLE IF NOT EXISTS content_strikes (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    category TEXT NOT NULL,
    content_snippet TEXT,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
 );
 -- Support the per-user, recent-window strike count.
 CREATE INDEX IF NOT EXISTS idx_content_strikes_user_id ON content_strikes(user_id);
 CREATE INDEX IF NOT EXISTS idx_content_strikes_created_at ON content_strikes(created_at);
 -- Add suspended_until column to users if not exists.
 -- ADD COLUMN IF NOT EXISTS checks the table actually being altered, unlike an
 -- information_schema lookup by table name alone, which is also satisfied by a
 -- same-named table in a different schema.
 ALTER TABLE users ADD COLUMN IF NOT EXISTS suspended_until TIMESTAMP WITH TIME ZONE;
--- a/sojorn_app/lib/screens/compose/compose_screen.dart
+++ b/sojorn_app/lib/screens/compose/compose_screen.dart
@ -15,6 +15,7 @@ import '../../providers/feed_refresh_provider.dart';
 import '../../services/image_upload_service.dart';
 import '../../theme/app_theme.dart';
 import '../../widgets/composer/composer_toolbar.dart';
 import '../../services/content_filter.dart';
 import '../../widgets/sojorn_snackbar.dart';
 import 'image_editor_screen.dart';
 import '../quips/create/quip_studio_screen.dart'; // Added import
@ -322,6 +323,13 @@ class _ComposeScreenState extends ConsumerState<ComposeScreen> {
      return;
    }
    // Layer 0: Client-side hard blocklist — never even send to server
    final blockMessage = ContentFilter.instance.check(_bodyController.text.trim());
    if (blockMessage != null) {
      await _showBlockedDialog(blockMessage);
      return;
    }
    setState(() {
      _isLoading = true;
      _errorMessage = null;
@ -400,9 +408,15 @@ class _ComposeScreenState extends ConsumerState<ComposeScreen> {
            'Content verification temporarily unavailable. Please try again.';
      });
    } catch (e) {
-      setState(() {
+      final msg = e.toString().replaceAll('Exception: ', '');
-        _errorMessage = e.toString().replaceAll('Exception: ', '');
+      // Server-side blocklist catch (422 with blocked content message)
-      });
+      if (msg.contains("isn't allowed on Sojorn") || msg.contains('not allowed')) {
        if (mounted) await _showBlockedDialog(msg);
      } else {
        setState(() {
          _errorMessage = msg;
        });
      }
    } finally {
      if (mounted) {
        setState(() {
@ -440,6 +454,38 @@ class _ComposeScreenState extends ConsumerState<ComposeScreen> {
    return result ?? false;
  }
  /// Shows a modal "Not Allowed" alert containing [message].
  ///
  /// The dialog cannot be dismissed by tapping outside it; the returned
  /// future completes once the user presses "Edit My Post".
  Future<void> _showBlockedDialog(String message) async {
    await showDialog<void>(
      context: context,
      barrierDismissible: false,
      builder: (dialogContext) {
        final heading = Row(
          children: [
            Icon(Icons.block, color: AppTheme.error, size: 24),
            const SizedBox(width: 8),
            const Text('Not Allowed'),
          ],
        );
        final editButton = ElevatedButton(
          // Pop via the dialog's own context so only the dialog route closes.
          onPressed: () => Navigator.of(dialogContext).pop(),
          style: ElevatedButton.styleFrom(
            backgroundColor: AppTheme.brightNavy,
            foregroundColor: AppTheme.white,
            shape: const StadiumBorder(),
          ),
          child: const Text('Edit My Post'),
        );
        return AlertDialog(
          shape: RoundedRectangleBorder(borderRadius: BorderRadius.circular(16)),
          title: heading,
          content: Text(
            message,
            style: AppTheme.textTheme.bodyMedium,
          ),
          actions: [editButton],
        );
      },
    );
  }
  bool get _canPublish {
    return _bodyController.text.trim().isNotEmpty &&
        _bodyController.text.trim().length <= _maxCharacters &&
--- a/sojorn_app/lib/services/content_filter.dart
+++ b/sojorn_app/lib/services/content_filter.dart
@ -0,0 +1,74 @@
 /// Client-side content filter for Sojorn.
 ///
 /// Layer 0: screens text for obvious slurs and direct threats on the device,
 /// so a blocked post can be rejected before it is sent to the server.
 /// Accessed through the shared [instance].
 class ContentFilter {
  ContentFilter._();
  /// The single shared filter.
  static final instance = ContentFilter._();
  /// Checks [text] against the hard blocklist.
  ///
  /// Returns `null` when the text is empty or clean, otherwise a
  /// user-friendly message explaining that the post must be revised.
  String? check(String text) {
    if (text.isEmpty) return null;
    final cleaned = _normalize(text);
    final blocked = _hardBlockPatterns.any((re) => re.hasMatch(cleaned));
    return blocked
        ? "We don't allow that kind of language on Sojorn. Please revise your post."
        : null;
  }
  /// Lower-cases [text], then removes zero-width characters and the separator
  /// characters commonly inserted to break up blocked words.
  String _normalize(String text) {
    // Zero-width space, non-joiner, joiner, and BOM.
    const invisibles = ['\u200b', '\u200c', '\u200d', '\ufeff'];
    var cleaned = text.toLowerCase();
    for (final ch in invisibles) {
      cleaned = cleaned.replaceAll(ch, '');
    }
    // Separator characters used to evade filters.
    return cleaned.replaceAll(RegExp(r'[.\-_*|]'), '');
  }
  // Hard-blocked patterns: slurs and direct threats, matched case-insensitively.
  // Mirrors the server-side patterns in content_filter.go.
  static final List<RegExp> _hardBlockPatterns = [
    // N-word and variants
    RegExp(r'\bn[i1!|l][gq9][gq9]+[e3a@]?[r0d]?s?\b', caseSensitive: false),
    RegExp(r'\bn[i1!|l][gq9]+[aA@]\b', caseSensitive: false),
    RegExp(r'\bn\s*[i1!]\s*[gq9]\s*[gq9]\s*[e3a]?\s*[r0]?\b', caseSensitive: false),
    // F-word (homophobic slur) and variants
    RegExp(r'\bf[a@4][gq9][gq9]?[o0]?[t7]?s?\b', caseSensitive: false),
    RegExp(r'\bf\s*[a@4]\s*[gq9]\s*[gq9]?\s*[o0]?\s*[t7]?\b', caseSensitive: false),
    // K-word (anti-Jewish slur)
    RegExp(r'\bk[i1][k]+[e3]?s?\b', caseSensitive: false),
    // C-word (racial slur against Asian people)
    RegExp(r'\bch[i1]n[k]+s?\b', caseSensitive: false),
    // S-word (anti-Hispanic slur)
    RegExp(r'\bsp[i1][ck]+s?\b', caseSensitive: false),
    // W-word (racial slur)
    RegExp(r'\bw[e3][t7]b[a@]ck+s?\b', caseSensitive: false),
    // R-word (ableist slur)
    RegExp(r'\br[e3]t[a@]rd+s?\b', caseSensitive: false),
    // T-word (transphobic slur)
    RegExp(r'\btr[a@4]nn[yie]+s?\b', caseSensitive: false),
    // Direct death/violence threats
    RegExp(r"\b(i('?m| am) go(ing|nna)|i('?ll| will)) (to )?(kill|murder|shoot|stab|rape)\b", caseSensitive: false),
    RegExp(r'\b(kill|murder|shoot|stab|rape) (you|them|him|her|all)\b', caseSensitive: false),
  ];
 }