fix: unescape HTML entities in OG tag parsing for link previews

This commit is contained in:
Patrick Britton 2026-02-09 08:58:56 -06:00
parent 2aa4eb77cf
commit 0f7874b429
2 changed files with 21 additions and 5 deletions

15
fix_entities.sql Normal file
View file

@ -0,0 +1,15 @@
-- One-off data cleanup: decode HTML entities that were stored verbatim in
-- link preview and article text columns.
--
-- The entity literals below must stay spelled out ('&#39;', '&quot;',
-- '&amp;'). If they are rendered as the characters they stand for, each
-- statement degrades into a no-op or into malformed SQL.
--
-- For every column, '&amp;' is decoded LAST so that double-escaped text such
-- as '&amp;quot;' ends up as '&quot;' (exactly one level of decoding, which
-- is what a single-pass HTML unescape yields) instead of collapsing to '"'.
-- The WHERE ... LIKE filters restrict each UPDATE to rows that actually
-- contain the entity being replaced.

-- Fix HTML entities in link preview titles
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&#39;', '''') WHERE link_preview_title LIKE '%&#39;%';
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&quot;', '"') WHERE link_preview_title LIKE '%&quot;%';
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&amp;', '&') WHERE link_preview_title LIKE '%&amp;%';
-- Fix HTML entities in link preview descriptions
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&#39;', '''') WHERE link_preview_description LIKE '%&#39;%';
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&quot;', '"') WHERE link_preview_description LIKE '%&quot;%';
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&amp;', '&') WHERE link_preview_description LIKE '%&amp;%';
-- Fix HTML entities in article titles/descriptions
UPDATE official_account_articles SET title = REPLACE(title, '&#39;', '''') WHERE title LIKE '%&#39;%';
UPDATE official_account_articles SET title = REPLACE(title, '&quot;', '"') WHERE title LIKE '%&quot;%';
UPDATE official_account_articles SET title = REPLACE(title, '&amp;', '&') WHERE title LIKE '%&amp;%';
UPDATE official_account_articles SET description = REPLACE(description, '&#39;', '''') WHERE description LIKE '%&#39;%';
UPDATE official_account_articles SET description = REPLACE(description, '&quot;', '"') WHERE description LIKE '%&quot;%';
UPDATE official_account_articles SET description = REPLACE(description, '&amp;', '&') WHERE description LIKE '%&amp;%';

View file

@ -3,6 +3,7 @@ package services
import (
"context"
"fmt"
"html"
"io"
"net"
"net/http"
@ -182,19 +183,19 @@ func (s *LinkPreviewService) validateURL(u *url.URL) error {
}
// parseOGTags extracts OpenGraph meta tags from raw HTML.
func (s *LinkPreviewService) parseOGTags(html string, sourceURL string) *LinkPreview {
func (s *LinkPreviewService) parseOGTags(htmlStr string, sourceURL string) *LinkPreview {
preview := &LinkPreview{}
// Use regex to extract meta tags — lightweight, no dependency needed
metaRe := regexp.MustCompile(`(?i)<meta\s+[^>]*>`)
metas := metaRe.FindAllString(html, -1)
metas := metaRe.FindAllString(htmlStr, -1)
for _, tag := range metas {
prop := extractAttr(tag, "property")
if prop == "" {
prop = extractAttr(tag, "name")
}
content := extractAttr(tag, "content")
content := html.UnescapeString(extractAttr(tag, "content"))
if content == "" {
continue
}
@ -227,8 +228,8 @@ func (s *LinkPreviewService) parseOGTags(html string, sourceURL string) *LinkPre
// Fallback: try <title> tag if no og:title
if preview.Title == "" {
titleRe := regexp.MustCompile(`(?i)<title[^>]*>(.*?)</title>`)
if m := titleRe.FindStringSubmatch(html); len(m) > 1 {
preview.Title = strings.TrimSpace(m[1])
if m := titleRe.FindStringSubmatch(htmlStr); len(m) > 1 {
preview.Title = html.UnescapeString(strings.TrimSpace(m[1]))
}
}