fix: unescape HTML entities in OG tag parsing for link previews
This commit is contained in:
parent
2aa4eb77cf
commit
0f7874b429
15
fix_entities.sql
Normal file
15
fix_entities.sql
Normal file
|
|
@@ -0,0 +1,15 @@
|
||||||
|
-- Fix HTML entities in link preview titles
|
||||||
|
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&#39;', '''') WHERE link_preview_title LIKE '%&#39;%';
|
||||||
|
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&amp;', '&') WHERE link_preview_title LIKE '%&amp;%';
|
||||||
|
UPDATE posts SET link_preview_title = REPLACE(link_preview_title, '&quot;', '"') WHERE link_preview_title LIKE '%&quot;%';
|
||||||
|
|
||||||
|
-- Fix HTML entities in link preview descriptions
|
||||||
|
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&#39;', '''') WHERE link_preview_description LIKE '%&#39;%';
|
||||||
|
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&amp;', '&') WHERE link_preview_description LIKE '%&amp;%';
|
||||||
|
UPDATE posts SET link_preview_description = REPLACE(link_preview_description, '&quot;', '"') WHERE link_preview_description LIKE '%&quot;%';
|
||||||
|
|
||||||
|
-- Fix HTML entities in article titles/descriptions
|
||||||
|
UPDATE official_account_articles SET title = REPLACE(title, '&#39;', '''') WHERE title LIKE '%&#39;%';
|
||||||
|
UPDATE official_account_articles SET title = REPLACE(title, '&amp;', '&') WHERE title LIKE '%&amp;%';
|
||||||
|
UPDATE official_account_articles SET description = REPLACE(description, '&#39;', '''') WHERE description LIKE '%&#39;%';
|
||||||
|
UPDATE official_account_articles SET description = REPLACE(description, '&amp;', '&') WHERE description LIKE '%&amp;%';
|
||||||
|
|
@@ -3,6 +3,7 @@ package services
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"html"
|
||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
|
@@ -182,19 +183,19 @@ func (s *LinkPreviewService) validateURL(u *url.URL) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseOGTags extracts OpenGraph meta tags from raw HTML.
|
// parseOGTags extracts OpenGraph meta tags from raw HTML.
|
||||||
func (s *LinkPreviewService) parseOGTags(html string, sourceURL string) *LinkPreview {
|
func (s *LinkPreviewService) parseOGTags(htmlStr string, sourceURL string) *LinkPreview {
|
||||||
preview := &LinkPreview{}
|
preview := &LinkPreview{}
|
||||||
|
|
||||||
// Use regex to extract meta tags — lightweight, no dependency needed
|
// Use regex to extract meta tags — lightweight, no dependency needed
|
||||||
metaRe := regexp.MustCompile(`(?i)<meta\s+[^>]*>`)
|
metaRe := regexp.MustCompile(`(?i)<meta\s+[^>]*>`)
|
||||||
metas := metaRe.FindAllString(html, -1)
|
metas := metaRe.FindAllString(htmlStr, -1)
|
||||||
|
|
||||||
for _, tag := range metas {
|
for _, tag := range metas {
|
||||||
prop := extractAttr(tag, "property")
|
prop := extractAttr(tag, "property")
|
||||||
if prop == "" {
|
if prop == "" {
|
||||||
prop = extractAttr(tag, "name")
|
prop = extractAttr(tag, "name")
|
||||||
}
|
}
|
||||||
content := extractAttr(tag, "content")
|
content := html.UnescapeString(extractAttr(tag, "content"))
|
||||||
if content == "" {
|
if content == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
@@ -227,8 +228,8 @@ func (s *LinkPreviewService) parseOGTags(html string, sourceURL string) *LinkPre
|
||||||
// Fallback: try <title> tag if no og:title
|
// Fallback: try <title> tag if no og:title
|
||||||
if preview.Title == "" {
|
if preview.Title == "" {
|
||||||
titleRe := regexp.MustCompile(`(?i)<title[^>]*>(.*?)</title>`)
|
titleRe := regexp.MustCompile(`(?i)<title[^>]*>(.*?)</title>`)
|
||||||
if m := titleRe.FindStringSubmatch(html); len(m) > 1 {
|
if m := titleRe.FindStringSubmatch(htmlStr); len(m) > 1 {
|
||||||
preview.Title = strings.TrimSpace(m[1])
|
preview.Title = html.UnescapeString(strings.TrimSpace(m[1]))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue