fix: resolve Google News URLs to actual source articles before fetching link previews
This commit is contained in:
parent
6de8a475d1
commit
4704708c2c
|
|
@ -98,6 +98,14 @@ func (s *LinkPreviewService) FetchPreview(ctx context.Context, rawURL string, tr
|
|||
return nil, fmt.Errorf("empty URL")
|
||||
}
|
||||
|
||||
// Resolve Google News URLs to the actual source article
|
||||
if strings.Contains(rawURL, "news.google.com") {
|
||||
resolved := ResolveGoogleNewsURL(rawURL)
|
||||
if resolved != rawURL {
|
||||
rawURL = resolved
|
||||
}
|
||||
}
|
||||
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
|
@ -271,35 +272,101 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
|
|||
return feed.Channel.Items, nil
|
||||
}
|
||||
|
||||
// resolveGoogleNewsLink resolves a Google News URL to the actual article URL.
|
||||
// It is a thin method wrapper that delegates to the package-level ResolveGoogleNewsURL.
|
||||
func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
|
||||
return ResolveGoogleNewsURL(googleURL)
|
||||
}
|
||||
|
||||
// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
|
||||
// to the underlying article URL by following the full redirect chain.
|
||||
func ResolveGoogleNewsURL(googleURL string) string {
|
||||
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
|
||||
return googleURL
|
||||
}
|
||||
|
||||
// Use a client that does NOT follow redirects
|
||||
noRedirectClient := &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
// Track the final URL after all redirects
|
||||
var finalURL string
|
||||
client := &http.Client{
|
||||
Timeout: 15 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
// Track every hop; stop if we've left Google domains
|
||||
host := req.URL.Hostname()
|
||||
if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
|
||||
finalURL = req.URL.String()
|
||||
return http.ErrUseLastResponse
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
resp, err := noRedirectClient.Get(googleURL)
|
||||
req, err := http.NewRequest("GET", googleURL, nil)
|
||||
if err != nil {
|
||||
return googleURL
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
// If we captured a non-Google URL before the error, use it
|
||||
if finalURL != "" {
|
||||
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
|
||||
return finalURL
|
||||
}
|
||||
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
|
||||
return googleURL
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 && resp.StatusCode < 400 {
|
||||
loc := resp.Header.Get("Location")
|
||||
if loc != "" {
|
||||
return loc
|
||||
// If we captured a non-Google URL during redirects, use that
|
||||
if finalURL != "" {
|
||||
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
|
||||
return finalURL
|
||||
}
|
||||
|
||||
// Check final response URL
|
||||
if resp.Request != nil && resp.Request.URL != nil {
|
||||
final := resp.Request.URL.String()
|
||||
if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
|
||||
log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
|
||||
return final
|
||||
}
|
||||
}
|
||||
|
||||
// Some Google News links use JS-based redirect; try parsing from body as fallback
|
||||
// Fallback: parse meta refresh or JS redirect from response body
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
|
||||
if err == nil {
|
||||
html := string(body)
|
||||
// Look for <meta http-equiv="refresh" content="0;url=...">
|
||||
metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
|
||||
if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
resolved := strings.TrimSpace(m[1])
|
||||
if !strings.Contains(resolved, "google.com") {
|
||||
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
|
||||
return resolved
|
||||
}
|
||||
}
|
||||
// Look for window.location or document.location JS redirects
|
||||
jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
|
||||
if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
resolved := strings.TrimSpace(m[1])
|
||||
if !strings.Contains(resolved, "google.com") {
|
||||
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
|
||||
return resolved
|
||||
}
|
||||
}
|
||||
// Look for <a href="..."> with data-n-au attribute (Google News article link)
|
||||
auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
|
||||
if m := auRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
|
||||
return m[1]
|
||||
}
|
||||
}
|
||||
|
||||
log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
|
||||
return googleURL
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue