// sojorn/go-backend/internal/services/username_validation_service.go

package services

import (
	"context"
	"regexp"
	"strings"
	"unicode/utf8"

	"github.com/jackc/pgx/v5/pgxpool"
)

// UsernameViolation identifies which validation rule, if any, a handle or
// display name failed.
type UsernameViolation int

const (
	// UsernameOK is the zero value and means no rule was violated.
	UsernameOK UsernameViolation = iota
	// UsernameReserved means the name is on a reserved list.
	UsernameReserved
	// UsernameInappropriate means the name matched the inappropriate-content filter.
	UsernameInappropriate
	// UsernameInvalidFormat means the name failed a length or character-set check.
	UsernameInvalidFormat
)

// UsernameCheckResult is the outcome of a validation check.
type UsernameCheckResult struct {
	// Violation is UsernameOK when the name passed every check.
	Violation UsernameViolation
	// Message is a user-facing explanation; it is empty when Violation is UsernameOK.
	Message   string
}

// ValidateUsernameWithDB runs the static ValidateUsername checks and, when
// those pass and pool is non-nil, additionally looks the handle up in the
// reserved_usernames table.
//
// The lookup uses the trimmed, lower-cased handle. It is best-effort: a nil
// pool skips it, and a query or scan error is treated the same as "not
// reserved", so the function fails open when the database is unavailable.
func ValidateUsernameWithDB(ctx context.Context, pool *pgxpool.Pool, handle string) UsernameCheckResult {
	if static := ValidateUsername(handle); static.Violation != UsernameOK {
		return static
	}

	accepted := UsernameCheckResult{Violation: UsernameOK, Message: ""}
	if pool == nil {
		return accepted
	}

	// Query with the same normalized form the static checks operate on.
	normalized := strings.ToLower(strings.TrimSpace(handle))
	var matches int
	row := pool.QueryRow(ctx, `SELECT COUNT(*) FROM reserved_usernames WHERE username = $1`, normalized)
	if err := row.Scan(&matches); err != nil || matches <= 0 {
		return accepted
	}

	return UsernameCheckResult{
		Violation: UsernameReserved,
		Message:   "This username is reserved. If you officially represent this brand, company, or public figure, you can submit a verification request at support@sojorn.net to claim it.",
	}
}

// ValidateUsername validates a handle without touching the database.
//
// The handle is trimmed and lower-cased, then checked in this order: length
// (3 to 30), allowed characters, reserved names, inappropriate content. The
// first failing rule determines the returned violation and its user-facing
// message; a handle that passes everything yields UsernameOK with an empty
// message.
func ValidateUsername(handle string) UsernameCheckResult {
	h := strings.ToLower(strings.TrimSpace(handle))

	// Cases are evaluated top to bottom, so the rule order is preserved.
	switch {
	case len(h) < 3, len(h) > 30:
		return UsernameCheckResult{
			Violation: UsernameInvalidFormat,
			Message:   "Username must be between 3 and 30 characters.",
		}
	case !validHandleRegex.MatchString(h):
		return UsernameCheckResult{
			Violation: UsernameInvalidFormat,
			Message:   "Username can only contain letters, numbers, underscores, and periods.",
		}
	case isReserved(h):
		return UsernameCheckResult{
			Violation: UsernameReserved,
			Message:   "This username is reserved. If you officially represent this brand, company, or public figure, you can submit a verification request at support@sojorn.net to claim it.",
		}
	}

	if why := isInappropriate(h); why != "" {
		return UsernameCheckResult{
			Violation: UsernameInappropriate,
			Message:   "This username is not allowed: " + why,
		}
	}

	return UsernameCheckResult{Violation: UsernameOK, Message: ""}
}

// ValidateDisplayName checks a display name for length and inappropriate
// content and returns a result with a user-facing message.
//
// The name is trimmed and lower-cased before checking. Length is measured in
// Unicode code points rather than bytes, so names written in non-ASCII scripts
// get the full 1 to 50 character allowance that the error message promises
// (a byte count would reject, for example, a 20-character CJK name).
func ValidateDisplayName(name string) UsernameCheckResult {
	n := strings.ToLower(strings.TrimSpace(name))
	if n == "" || utf8.RuneCountInString(n) > 50 {
		return UsernameCheckResult{UsernameInvalidFormat, "Display name must be between 1 and 50 characters."}
	}
	if reason := isInappropriate(n); reason != "" {
		return UsernameCheckResult{UsernameInappropriate, "This display name is not allowed: " + reason}
	}
	return UsernameCheckResult{UsernameOK, ""}
}

// validHandleRegex matches handles made up solely of lower-case ASCII letters,
// digits, underscores, and periods. ValidateUsername lower-cases the handle
// before matching, so upper-case input is accepted as its lower-case form.
var validHandleRegex = regexp.MustCompile(`^[a-z0-9_.]+$`)

// -------------------------------------------------------------------
// Reserved usernames
// -------------------------------------------------------------------

// isReserved reports whether h is reserved. A handle is reserved when it is an
// exact entry in reservedSet, starts with any entry of reservedPrefixes
// (e.g. "sojorn_anything", "admin_anything"), or contains any entry of
// reservedSubstrings anywhere in it.
func isReserved(h string) bool {
	// Exact match first; the scans below only run while nothing has matched.
	hit := reservedSet[h]

	for i := 0; !hit && i < len(reservedPrefixes); i++ {
		hit = strings.HasPrefix(h, reservedPrefixes[i])
	}

	// Brand names that must not appear even as substrings.
	for i := 0; !hit && i < len(reservedSubstrings); i++ {
		hit = strings.Contains(h, reservedSubstrings[i])
	}

	return hit
}

// platformReserved lists platform terms, system accounts, roles, and
// route-like words (e.g. "login", "settings"). Its entries are loaded into
// reservedSet at init and compared by exact match against the handle.
var platformReserved = []string{
	"sojorn", "admin", "administrator", "moderator", "mod",
	"support", "help", "helpdesk", "system", "official",
	"root", "superuser", "staff", "team", "security",
	"abuse", "postmaster", "webmaster", "info", "contact",
	"noreply", "no_reply", "mailer", "daemon", "bot",
	"api", "dev", "developer", "ceo", "cto", "cfo", "coo",
	"founder", "cofounder", "intern", "hr",
	"legal", "compliance", "privacy", "terms",
	"news", "press", "media", "blog", "status",
	"feedback", "report", "bug", "feature",
	"billing", "payment", "sales", "marketing",
	"everyone", "all", "here", "channel",
	"null", "undefined", "anonymous", "unknown",
	"test", "testing", "demo", "example",
	"signup", "signin", "login", "logout", "register",
	"settings", "account", "profile", "dashboard",
	"home", "feed", "explore", "discover", "search",
	"notification", "notifications", "message", "messages",
	"chat", "dm", "dms", "inbox", "outbox",
	"verified", "verification", "verify",
	"beacon", "beacons", "quip", "quips",
}

// techCompanyReserved lists major tech companies and social platforms. Its
// entries are loaded into reservedSet at init and compared by exact match
// against the handle.
var techCompanyReserved = []string{
	"google", "apple", "microsoft", "amazon", "meta",
	"facebook", "instagram", "twitter", "tiktok", "snapchat",
	"linkedin", "reddit", "pinterest", "youtube", "twitch",
	"discord", "telegram", "whatsapp", "signal",
	"netflix", "spotify", "hulu", "disney", "disneyplus",
	"openai", "chatgpt", "anthropic", "claude",
	"nvidia", "amd", "intel", "samsung", "sony",
	"tesla", "spacex", "nasa", "boeing", "airbus",
	"uber", "lyft", "airbnb", "doordash", "grubhub",
	"paypal", "stripe", "venmo", "cashapp", "zelle",
	"coinbase", "binance", "robinhood", "fidelity",
	"github", "gitlab", "stackoverflow", "atlassian",
	"slack", "zoom", "teams", "webex",
	"shopify", "squarespace", "wordpress", "wix",
	"dropbox", "icloud", "onedrive",
	"oracle", "ibm", "salesforce", "adobe", "canva",
}

// brandReserved lists major consumer brands, sports leagues, news outlets, and
// entertainment companies. Its entries are loaded into reservedSet at init and
// compared by exact match against the handle.
var brandReserved = []string{
	"nike", "adidas", "puma", "reebok", "underarmour",
	"cocacola", "coca_cola", "pepsi", "starbucks", "mcdonalds",
	"walmart", "target", "costco", "kroger", "wholefoods",
	"bmw", "mercedes", "audi", "porsche", "ferrari",
	"lamborghini", "ford", "chevrolet", "toyota", "honda",
	"gucci", "louisvuitton", "chanel", "prada", "hermes",
	"rolex", "cartier", "tiffany", "burberry",
	"nfl", "nba", "mlb", "nhl", "mls", "fifa", "ufc",
	"espn", "cnn", "bbc", "foxnews", "msnbc", "nytimes",
	"washingtonpost", "wsj", "reuters", "apnews",
	"marvel", "dccomics", "nintendo", "playstation", "xbox",
	"paramount", "warner", "universal",
}

// publicFigureReserved lists public figures, politicians, and other notable
// people, usually in both joined and underscore-separated spellings. Its
// entries are loaded into reservedSet at init and compared by exact match
// against the handle.
//
// NOTE(review): "mrbeaast" looks like a misspelling of the adjacent "mrbeast";
// confirm whether it is an intentional look-alike entry.
var publicFigureReserved = []string{
	"elonmusk", "elon_musk", "jeffbezos", "jeff_bezos",
	"markzuckerberg", "mark_zuckerberg", "zuckerberg",
	"timcook", "tim_cook", "billgates", "bill_gates",
	"satyanadella", "sundarpichai", "samaltman",
	"joebiden", "joe_biden", "donaldtrump", "donald_trump",
	"barackobama", "barack_obama", "kamalaharris",
	"taylorswift", "taylor_swift", "beyonce", "rihanna",
	"drake", "kanyewest", "kanye_west", "ye",
	"kimkardashian", "kim_kardashian", "kyliejenner",
	"cristiano", "ronaldo", "messi", "lebronjames", "lebron_james",
	"therock", "the_rock", "dwaynejohnson",
	"mrbeaast", "mrbeast", "pewdiepie", "ninja",
	"joerogan", "joe_rogan", "oprah",
	"pope", "popefrancis", "dalailama",
}

// reservedPrefixes lists handle prefixes that are reserved regardless of what
// follows them. isReserved tests each one with strings.HasPrefix, so both the
// underscore and the period form of every role word are listed.
var reservedPrefixes = []string{
	"sojorn_", "sojorn.", "official_", "official.",
	"admin_", "admin.", "mod_", "mod.",
	"support_", "support.", "team_", "team.",
	"staff_", "staff.", "system_", "system.",
}

// reservedSubstrings lists names that may not appear anywhere inside a handle.
// isReserved tests each one with strings.Contains.
var reservedSubstrings = []string{
	"sojorn",
}

// reservedSet is the exact-match lookup table used by isReserved. It is filled
// once at package initialization from the four reserved-name lists.
var reservedSet map[string]bool

// init merges platformReserved, techCompanyReserved, brandReserved, and
// publicFigureReserved into reservedSet.
func init() {
	sources := [][]string{
		platformReserved,
		techCompanyReserved,
		brandReserved,
		publicFigureReserved,
	}

	reservedSet = make(map[string]bool)
	for i := range sources {
		for j := range sources[i] {
			reservedSet[sources[i][j]] = true
		}
	}
}

// -------------------------------------------------------------------
// Inappropriate content filter
// -------------------------------------------------------------------

// isInappropriate returns the reason attached to the first entry of
// inappropriatePatterns that matches text, or "" when nothing matches.
//
// Each pattern is tried against both the leet-speak-normalized form of text
// (to catch substitution-based bypass attempts) and the text as given.
func isInappropriate(text string) string {
	normalized := normalizeUsername(text)

	for i := range inappropriatePatterns {
		rule := &inappropriatePatterns[i]
		if !rule.regex.MatchString(normalized) && !rule.regex.MatchString(text) {
			continue
		}
		return rule.reason
	}
	return ""
}

// inappropriateEntry pairs a compiled pattern with the user-facing reason
// reported when the pattern matches.
type inappropriateEntry struct {
	regex  *regexp.Regexp
	reason string
}

// inappropriatePatterns is the ordered filter table consulted by
// isInappropriate; the first matching entry wins. It is populated once at
// package initialization.
var inappropriatePatterns []inappropriateEntry

// init compiles the filter table. Every pattern is compiled case-insensitively
// by prefixing it with "(?i)".
//
// Note: in Go's regexp syntax `\b` is an ASCII word boundary and `_` counts as
// a word character, so a pattern anchored with `\b` does not match immediately
// after an underscore or a letter/digit.
func init() {
	type rule struct {
		expr string
		why  string
	}
	rules := []rule{
		// Slurs and hate speech
		{`\bn[i1!|]gg[e3a@][r]?\b`, "contains a racial slur"},
		{`\bf[a@]gg?[o0][t]?\b`, "contains a homophobic slur"},
		{`\bk[i1!]ke\b`, "contains an antisemitic slur"},
		{`\bsp[i1!]c\b`, "contains a racial slur"},
		{`\bch[i1!]nk\b`, "contains a racial slur"},
		{`\bw[e3]tb[a@]ck\b`, "contains a racial slur"},
		{`\bcoon\b`, "contains a racial slur"},
		{`\btr[a@]nn[yie]\b`, "contains a transphobic slur"},
		{`\bdyke\b`, "contains a homophobic slur"},
		{`\bretard(ed)?\b`, "contains an ableist slur"},

		// Sexually explicit
		{`\bp[o0]rn`, "contains sexually explicit content"},
		{`\bx{2,}`, "contains sexually explicit content"},
		{`\bhentai\b`, "contains sexually explicit content"},
		{`\bcum(sl[u]t|dump|bucket)\b`, "contains sexually explicit content"},
		{`\bpussy\b`, "contains sexually explicit content"},
		{`\bd[i1!]ck(head|face|sucker)`, "contains sexually explicit content"},
		{`\bc[o0]ck(sucker)?`, "contains sexually explicit content"},

		// Violent / threatening
		{`\bk[i1!]ll(er)?_(yo)?u`, "contains threatening language"},
		{`\bschool.?shoot`, "contains violent content"},
		{`\bmass.?murder`, "contains violent content"},
		{`\bgenocide\b`, "contains violent content"},
		{`\bterroris[tm]`, "contains references to terrorism"},
		{`\bisis\b`, "contains references to terrorism"},
		{`\bal.?qaeda\b`, "contains references to terrorism"},
		{`\bjihad(i|ist)?\b`, "contains references to terrorism"},

		// Drugs (hard)
		{`\bmeth(head|lab)\b`, "contains drug references"},
		{`\bcrackhead\b`, "contains drug references"},
		{`\bheroin(e)?\b`, "contains drug references"},
		{`\bfentanyl\b`, "contains drug references"},

		// Impersonation indicators
		{`\breal_?\b`, "may imply impersonation"},
		{`\bthe_?real\b`, "may imply impersonation"},
		{`\bofficial_\b`, "may imply impersonation"},
		{`\bnot_?fake\b`, "may imply impersonation"},

		// Scam / fraud
		{`\bfree.?money\b`, "suggests fraudulent activity"},
		{`\bcrypto.?scam\b`, "suggests fraudulent activity"},
		{`\bget.?rich\b`, "suggests fraudulent activity"},

		// Self-harm
		{`\bsu[i1!]c[i1!]de\b`, "contains references to self-harm"},
		{`\bkill.?myself\b`, "contains references to self-harm"},
		{`\bcut.?myself\b`, "contains references to self-harm"},

		// General profanity as usernames (strong)
		{`\bfuck`, "contains strong profanity"},
		{`\bsh[i1!]t(head|face|stain)`, "contains strong profanity"},
		{`\bass(hole|wipe|face|hat)`, "contains strong profanity"},
		{`\bbitch\b`, "contains strong profanity"},
		{`\bwhore\b`, "contains strong profanity"},
		{`\bslut\b`, "contains strong profanity"},
		{`\bcunt\b`, "contains strong profanity"},
	}

	// Fill a fixed-size table in place; order matches the rule list above.
	compiled := make([]inappropriateEntry, len(rules))
	for i := range rules {
		compiled[i] = inappropriateEntry{
			regex:  regexp.MustCompile("(?i)" + rules[i].expr),
			reason: rules[i].why,
		}
	}
	inappropriatePatterns = compiled
}

// leetReplacer maps common leet-speak digits and symbols to the letters they
// stand in for. It is built once at package initialization instead of on every
// call; a *strings.Replacer is safe for concurrent use, so it can be shared.
var leetReplacer = strings.NewReplacer(
	"0", "o",
	"1", "i",
	"3", "e",
	"4", "a",
	"5", "s",
	"7", "t",
	"@", "a",
	"$", "s",
	"!", "i",
	"|", "l",
)

// normalizeUsername applies common leet-speak substitutions to s (for example
// "0" becomes "o" and "@" becomes "a") so that bypass attempts can be matched
// against the plain-letter patterns. Characters without a mapping are returned
// unchanged.
func normalizeUsername(s string) string {
	return leetReplacer.Replace(s)
}