fix: resolve Google News URLs to actual source articles before fetching link previews

This commit is contained in:
Patrick Britton 2026-02-08 18:48:49 -06:00
parent 6de8a475d1
commit 4704708c2c
2 changed files with 86 additions and 11 deletions

View file

@ -98,6 +98,14 @@ func (s *LinkPreviewService) FetchPreview(ctx context.Context, rawURL string, tr
return nil, fmt.Errorf("empty URL")
}
// Resolve Google News URLs to the actual source article
if strings.Contains(rawURL, "news.google.com") {
resolved := ResolveGoogleNewsURL(rawURL)
if resolved != rawURL {
rawURL = resolved
}
}
parsed, err := url.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)

View file

@ -7,6 +7,7 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync"
"time"
@ -271,35 +272,101 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
return feed.Channel.Items, nil
}
// resolveGoogleNewsLink returns the article URL behind a Google News link.
// It is a thin method wrapper: the receiver is not used and the work is
// delegated entirely to the package-level ResolveGoogleNewsURL.
func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
return ResolveGoogleNewsURL(googleURL)
}
// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
// to the underlying article URL by following the full redirect chain.
func ResolveGoogleNewsURL(googleURL string) string {
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
return googleURL
}
// Use a client that does NOT follow redirects
noRedirectClient := &http.Client{
Timeout: 10 * time.Second,
// Track the final URL after all redirects
var finalURL string
client := &http.Client{
Timeout: 15 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
// Track every hop; stop if we've left Google domains
host := req.URL.Hostname()
if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
finalURL = req.URL.String()
return http.ErrUseLastResponse
}
return nil
},
}
resp, err := noRedirectClient.Get(googleURL)
req, err := http.NewRequest("GET", googleURL, nil)
if err != nil {
return googleURL
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
req.Header.Set("Accept", "text/html,application/xhtml+xml")
resp, err := client.Do(req)
if err != nil {
// If we captured a non-Google URL before the error, use it
if finalURL != "" {
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
return finalURL
}
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
return googleURL
}
defer resp.Body.Close()
if resp.StatusCode >= 300 && resp.StatusCode < 400 {
loc := resp.Header.Get("Location")
if loc != "" {
return loc
// If we captured a non-Google URL during redirects, use that
if finalURL != "" {
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
return finalURL
}
// Check final response URL
if resp.Request != nil && resp.Request.URL != nil {
final := resp.Request.URL.String()
if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
return final
}
}
// Some Google News links use JS-based redirect; try parsing from body as fallback
// Fallback: parse meta refresh or JS redirect from response body
body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
if err == nil {
html := string(body)
// Look for <meta http-equiv="refresh" content="0;url=...">
metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
resolved := strings.TrimSpace(m[1])
if !strings.Contains(resolved, "google.com") {
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
return resolved
}
}
// Look for window.location or document.location JS redirects
jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
resolved := strings.TrimSpace(m[1])
if !strings.Contains(resolved, "google.com") {
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
return resolved
}
}
// Look for <a href="..."> with data-n-au attribute (Google News article link)
auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
if m := auRe.FindStringSubmatch(html); len(m) > 1 {
log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
return m[1]
}
}
log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
return googleURL
}