diff --git a/go-backend/internal/services/link_preview_service.go b/go-backend/internal/services/link_preview_service.go
index bf0f695..4903415 100644
--- a/go-backend/internal/services/link_preview_service.go
+++ b/go-backend/internal/services/link_preview_service.go
@@ -98,6 +98,14 @@ func (s *LinkPreviewService) FetchPreview(ctx context.Context, rawURL string, tr
return nil, fmt.Errorf("empty URL")
}
+ // Resolve Google News URLs to the actual source article
+ if strings.Contains(rawURL, "news.google.com") {
+ resolved := ResolveGoogleNewsURL(rawURL)
+ if resolved != rawURL {
+ rawURL = resolved
+ }
+ }
+
parsed, err := url.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %w", err)
diff --git a/go-backend/internal/services/official_accounts_service.go b/go-backend/internal/services/official_accounts_service.go
index 3decd29..16f30f3 100644
--- a/go-backend/internal/services/official_accounts_service.go
+++ b/go-backend/internal/services/official_accounts_service.go
@@ -7,6 +7,7 @@ import (
"fmt"
"io"
"net/http"
+ "regexp"
"strings"
"sync"
"time"
@@ -271,35 +272,101 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
return feed.Channel.Items, nil
}
-// resolveGoogleNewsLink follows the Google News redirect to get the actual article URL.
+// resolveGoogleNewsLink returns the actual article URL behind a Google News link by delegating to ResolveGoogleNewsURL.
func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
+ return ResolveGoogleNewsURL(googleURL)
+}
+
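+// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL to the underlying article URL: it follows redirects until the first non-Google host, then falls back to scanning the response body for a redirect target.
+// Input that is empty or does not contain "news.google.com", and input that cannot be resolved, is returned unchanged.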
+func ResolveGoogleNewsURL(googleURL string) string {
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
return googleURL
}
- // Use a client that does NOT follow redirects
- noRedirectClient := &http.Client{
- Timeout: 10 * time.Second,
+ // Holds the first non-Google redirect target, set by CheckRedirect below
+ var finalURL string
+ client := &http.Client{
+ Timeout: 15 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
- return http.ErrUseLastResponse
+ if len(via) >= 10 {
+ return fmt.Errorf("too many redirects")
+ }
+ // Stop at the first hop that leaves Google domains and remember its URL
+ host := req.URL.Hostname()
+ if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
+ finalURL = req.URL.String()
+ return http.ErrUseLastResponse
+ }
+ return nil
},
}
- resp, err := noRedirectClient.Get(googleURL)
+ req, err := http.NewRequest("GET", googleURL, nil)
if err != nil {
+ return googleURL
+ }
+ req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+ req.Header.Set("Accept", "text/html,application/xhtml+xml")
+
+ resp, err := client.Do(req)
+ if err != nil {
+ // If we captured a non-Google URL before the error, use it
+ if finalURL != "" {
+ log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
+ return finalURL
+ }
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
return googleURL
}
defer resp.Body.Close()
- if resp.StatusCode >= 300 && resp.StatusCode < 400 {
- loc := resp.Header.Get("Location")
- if loc != "" {
- return loc
+ // If we captured a non-Google URL during redirects, use that
+ if finalURL != "" {
+ log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
+ return finalURL
+ }
+
+ // Check final response URL
+ if resp.Request != nil && resp.Request.URL != nil {
+ final := resp.Request.URL.String()
+ if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
+ log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
+ return final
}
}
- // Some Google News links use JS-based redirect; try parsing from body as fallback
+ // Fallback: parse meta refresh or JS redirect from response body
+ body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
+ if err == nil {
+ html := string(body)
+ // Look for a meta http-equiv="refresh" tag and take the URL from its content attribute
+ metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
+ if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
+ resolved := strings.TrimSpace(m[1])
+ if !strings.Contains(resolved, "google.com") {
+ log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
+ return resolved
+ }
+ }
+ // Look for window.location or document.location JS redirects
+ jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
+ if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
+ resolved := strings.TrimSpace(m[1])
+ if !strings.Contains(resolved, "google.com") {
+ log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
+ return resolved
+ }
+ }
+ // Look for an element with a data-n-au attribute (Google News article link)
+ auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
+ if m := auRe.FindStringSubmatch(html); len(m) > 1 {
+ log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
+ return m[1]
+ }
+ }
+
+ log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
return googleURL
}