From 70261d839bf4723959e00711661007748b672ec2 Mon Sep 17 00:00:00 2001 From: Patrick Britton Date: Sun, 8 Feb 2026 18:58:54 -0600 Subject: [PATCH] fix: replace slow HTTP-based Google News URL resolution with instant base64 protobuf decode --- .../services/official_accounts_service.go | 153 +++++++++--------- 1 file changed, 78 insertions(+), 75 deletions(-) diff --git a/go-backend/internal/services/official_accounts_service.go b/go-backend/internal/services/official_accounts_service.go index 16f30f3..0610342 100644 --- a/go-backend/internal/services/official_accounts_service.go +++ b/go-backend/internal/services/official_accounts_service.go @@ -2,12 +2,12 @@ package services import ( "context" + "encoding/base64" "encoding/json" "encoding/xml" "fmt" "io" "net/http" - "regexp" "strings" "sync" "time" @@ -278,98 +278,101 @@ func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string } // ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL -// to the underlying article URL by following the full redirect chain. +// to the underlying article URL by decoding the base64-encoded article ID. +// This is instant (no network request) — Google News embeds the real URL in the article ID. func ResolveGoogleNewsURL(googleURL string) string { if googleURL == "" || !strings.Contains(googleURL, "news.google.com") { return googleURL } - // Track the final URL after all redirects - var finalURL string - client := &http.Client{ - Timeout: 15 * time.Second, - CheckRedirect: func(req *http.Request, via []*http.Request) error { - if len(via) >= 10 { - return fmt.Errorf("too many redirects") - } - // Track every hop; stop if we've left Google domains - host := req.URL.Hostname() - if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") { - finalURL = req.URL.String() - return http.ErrUseLastResponse - } - return nil - }, + // Extract the article ID from URLs like: + // https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5 + articleID := "" + if idx := strings.Index(googleURL, "/articles/"); idx != -1 { + articleID = googleURL[idx+len("/articles/"):] + } else if idx := strings.Index(googleURL, "/read/"); idx != -1 { + articleID = googleURL[idx+len("/read/"):] } - - req, err := http.NewRequest("GET", googleURL, nil) - if err != nil { + if articleID == "" { return googleURL } - req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") - req.Header.Set("Accept", "text/html,application/xhtml+xml") - resp, err := client.Do(req) + // Strip query params + if qIdx := strings.Index(articleID, "?"); qIdx != -1 { + articleID = articleID[:qIdx] + } + + // Base64url decode the article ID + decoded, err := base64DecodeGNews(articleID) if err != nil { - // If we captured a non-Google URL before the error, use it - if finalURL != "" { - log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain") - return finalURL - } - log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link") + log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID") return googleURL } - defer resp.Body.Close() - // If we captured a non-Google URL during redirects, use that - if finalURL != "" { - log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain") - return finalURL + // The decoded protobuf contains the article URL as an embedded string. + // Scan for "http" to find the URL within the decoded bytes. + resolved := extractURLFromBytes(decoded) + if resolved != "" { + log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode") + return resolved } - // Check final response URL - if resp.Request != nil && resp.Request.URL != nil { - final := resp.Request.URL.String() - if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") { - log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL") - return final - } - } - - // Fallback: parse meta refresh or JS redirect from response body - body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) - if err == nil { - html := string(body) - // Look for - metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`) - if m := metaRe.FindStringSubmatch(html); len(m) > 1 { - resolved := strings.TrimSpace(m[1]) - if !strings.Contains(resolved, "google.com") { - log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh") - return resolved - } - } - // Look for window.location or document.location JS redirects - jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`) - if m := jsRe.FindStringSubmatch(html); len(m) > 1 { - resolved := strings.TrimSpace(m[1]) - if !strings.Contains(resolved, "google.com") { - log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect") - return resolved - } - } - // Look for with data-n-au attribute (Google News article link) - auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`) - if m := auRe.FindStringSubmatch(html); len(m) > 1 { - log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au") - return m[1] - } - } - - log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article") + log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID") return googleURL } +// base64DecodeGNews decodes a Google News article ID which uses base64url encoding +// with optional padding. +func base64DecodeGNews(s string) ([]byte, error) { + // Replace URL-safe chars + s = strings.ReplaceAll(s, "-", "+") + s = strings.ReplaceAll(s, "_", "/") + // Add padding if needed + switch len(s) % 4 { + case 2: + s += "==" + case 3: + s += "=" + } + return base64.StdEncoding.DecodeString(s) +} + +// extractURLFromBytes scans decoded protobuf bytes for an embedded HTTP(S) URL. +func extractURLFromBytes(data []byte) string { + s := string(data) + // Find "http://" or "https://" in the decoded data + for _, prefix := range []string{"https://", "http://"} { + idx := strings.Index(s, prefix) + if idx == -1 { + continue + } + // Extract the URL — it ends at the first non-URL byte + urlStart := idx + urlEnd := urlStart + for urlEnd < len(s) { + c := s[urlEnd] + // URL-safe characters + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || + c == ':' || c == '/' || c == '.' || c == '-' || c == '_' || c == '~' || + c == '?' || c == '&' || c == '=' || c == '%' || c == '#' || c == '+' || + c == '@' || c == '!' || c == '$' || c == '(' || c == ')' || c == ',' || c == ';' { + urlEnd++ + } else { + break + } + } + candidate := s[urlStart:urlEnd] + // Must be a real URL, not a Google News internal URL + if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") { + continue + } + if len(candidate) > 20 { // Minimum viable URL length + return candidate + } + } + return "" +} + // FetchNewArticles fetches new articles from all enabled news sources for a config, // filtering out already-posted articles. func (s *OfficialAccountsService) FetchNewArticles(ctx context.Context, configID string) ([]RSSItem, []string, error) {