diff --git a/go-backend/internal/services/link_preview_service.go b/go-backend/internal/services/link_preview_service.go index bf0f695..4903415 100644 --- a/go-backend/internal/services/link_preview_service.go +++ b/go-backend/internal/services/link_preview_service.go @@ -98,6 +98,14 @@ func (s *LinkPreviewService) FetchPreview(ctx context.Context, rawURL string, tr return nil, fmt.Errorf("empty URL") } + // Resolve Google News URLs to the actual source article + if strings.Contains(rawURL, "news.google.com") { + resolved := ResolveGoogleNewsURL(rawURL) + if resolved != rawURL { + rawURL = resolved + } + } + parsed, err := url.Parse(rawURL) if err != nil { return nil, fmt.Errorf("invalid URL: %w", err) diff --git a/go-backend/internal/services/official_accounts_service.go b/go-backend/internal/services/official_accounts_service.go index 3decd29..16f30f3 100644 --- a/go-backend/internal/services/official_accounts_service.go +++ b/go-backend/internal/services/official_accounts_service.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "net/http" + "regexp" "strings" "sync" "time" @@ -271,35 +272,101 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ( return feed.Channel.Items, nil } -// resolveGoogleNewsLink follows the Google News redirect to get the actual article URL. +// resolveGoogleNewsLink follows the Google News redirect chain to get the actual article URL. func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string { + return ResolveGoogleNewsURL(googleURL) +} + +// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL +// to the underlying article URL by following the full redirect chain. +func ResolveGoogleNewsURL(googleURL string) string { if googleURL == "" || !strings.Contains(googleURL, "news.google.com") { return googleURL } - // Use a client that does NOT follow redirects - noRedirectClient := &http.Client{ - Timeout: 10 * time.Second, + // Track the final URL after all redirects + var finalURL string + client := &http.Client{ + Timeout: 15 * time.Second, CheckRedirect: func(req *http.Request, via []*http.Request) error { - return http.ErrUseLastResponse + if len(via) >= 10 { + return fmt.Errorf("too many redirects") + } + // Track every hop; stop if we've left Google domains + host := req.URL.Hostname() + if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") { + finalURL = req.URL.String() + return http.ErrUseLastResponse + } + return nil }, } - resp, err := noRedirectClient.Get(googleURL) + req, err := http.NewRequest("GET", googleURL, nil) if err != nil { + return googleURL + } + req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + req.Header.Set("Accept", "text/html,application/xhtml+xml") + + resp, err := client.Do(req) + if err != nil { + // If we captured a non-Google URL before the error, use it + if finalURL != "" { + log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain") + return finalURL + } log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link") return googleURL } defer resp.Body.Close() - if resp.StatusCode >= 300 && resp.StatusCode < 400 { - loc := resp.Header.Get("Location") - if loc != "" { - return loc + // If we captured a non-Google URL during redirects, use that + if finalURL != "" { + log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain") + return finalURL + } + + // Check final response URL + if resp.Request != nil && resp.Request.URL != nil { + final := resp.Request.URL.String() + if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") { + log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL") + return final } } - // Some Google News links use JS-based redirect; try parsing from body as fallback + // Fallback: parse meta refresh or JS redirect from response body + body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) + if err == nil { + html := string(body) + // Look for + metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`) + if m := metaRe.FindStringSubmatch(html); len(m) > 1 { + resolved := strings.TrimSpace(m[1]) + if !strings.Contains(resolved, "google.com") { + log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh") + return resolved + } + } + // Look for window.location or document.location JS redirects + jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`) + if m := jsRe.FindStringSubmatch(html); len(m) > 1 { + resolved := strings.TrimSpace(m[1]) + if !strings.Contains(resolved, "google.com") { + log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect") + return resolved + } + } + // Look for with data-n-au attribute (Google News article link) + auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`) + if m := auRe.FindStringSubmatch(html); len(m) > 1 { + log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au") + return m[1] + } + } + + log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article") return googleURL }