fix: replace slow HTTP-based Google News URL resolution with instant base64 protobuf decode

This commit is contained in:
Patrick Britton 2026-02-08 18:58:54 -06:00
parent 4704708c2c
commit 70261d839b

View file

@@ -2,12 +2,12 @@ package services
import ( import (
"context" "context"
"encoding/base64"
"encoding/json" "encoding/json"
"encoding/xml" "encoding/xml"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"regexp"
"strings" "strings"
"sync" "sync"
"time" "time"
@@ -278,98 +278,101 @@ func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string
} }
// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL // ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
// to the underlying article URL by following the full redirect chain. // to the underlying article URL by decoding the base64-encoded article ID.
// This is instant (no network request) — Google News embeds the real URL in the article ID.
func ResolveGoogleNewsURL(googleURL string) string { func ResolveGoogleNewsURL(googleURL string) string {
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") { if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
return googleURL return googleURL
} }
// Track the final URL after all redirects // Extract the article ID from URLs like:
var finalURL string // https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
client := &http.Client{ articleID := ""
Timeout: 15 * time.Second, if idx := strings.Index(googleURL, "/articles/"); idx != -1 {
CheckRedirect: func(req *http.Request, via []*http.Request) error { articleID = googleURL[idx+len("/articles/"):]
if len(via) >= 10 { } else if idx := strings.Index(googleURL, "/read/"); idx != -1 {
return fmt.Errorf("too many redirects") articleID = googleURL[idx+len("/read/"):]
}
// Track every hop; stop if we've left Google domains
host := req.URL.Hostname()
if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
finalURL = req.URL.String()
return http.ErrUseLastResponse
}
return nil
},
} }
if articleID == "" {
req, err := http.NewRequest("GET", googleURL, nil)
if err != nil {
return googleURL return googleURL
} }
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
req.Header.Set("Accept", "text/html,application/xhtml+xml")
resp, err := client.Do(req) // Strip query params
if qIdx := strings.Index(articleID, "?"); qIdx != -1 {
articleID = articleID[:qIdx]
}
// Base64url decode the article ID
decoded, err := base64DecodeGNews(articleID)
if err != nil { if err != nil {
// If we captured a non-Google URL before the error, use it log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
if finalURL != "" {
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
return finalURL
}
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
return googleURL return googleURL
} }
defer resp.Body.Close()
// If we captured a non-Google URL during redirects, use that // The decoded protobuf contains the article URL as an embedded string.
if finalURL != "" { // Scan for "http" to find the URL within the decoded bytes.
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain") resolved := extractURLFromBytes(decoded)
return finalURL if resolved != "" {
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
return resolved
} }
// Check final response URL log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
if resp.Request != nil && resp.Request.URL != nil {
final := resp.Request.URL.String()
if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
return final
}
}
// Fallback: parse meta refresh or JS redirect from response body
body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
if err == nil {
html := string(body)
// Look for <meta http-equiv="refresh" content="0;url=...">
metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
resolved := strings.TrimSpace(m[1])
if !strings.Contains(resolved, "google.com") {
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
return resolved
}
}
// Look for window.location or document.location JS redirects
jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
resolved := strings.TrimSpace(m[1])
if !strings.Contains(resolved, "google.com") {
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
return resolved
}
}
// Look for <a href="..."> with data-n-au attribute (Google News article link)
auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
if m := auRe.FindStringSubmatch(html); len(m) > 1 {
log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
return m[1]
}
}
log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
return googleURL return googleURL
} }
// base64DecodeGNews decodes a Google News article ID, which uses base64url
// encoding ("-"/"_" alphabet) with optional "=" padding.
//
// The standard library's base64.RawURLEncoding already speaks the URL-safe
// alphabet and requires no padding, so stripping any trailing "=" and decoding
// directly replaces the hand-rolled character substitution and padding
// arithmetic (which also silently left a len%4==1 input to fail only inside
// StdEncoding). Returns the decoded bytes or a base64.CorruptInputError on
// malformed input.
func base64DecodeGNews(s string) ([]byte, error) {
	return base64.RawURLEncoding.DecodeString(strings.TrimRight(s, "="))
}
// extractURLFromBytes scans decoded protobuf bytes for an embedded HTTP(S) URL.
//
// The decoded Google News article ID is a protobuf whose fields include the
// article URL as a plain string, so a byte scan for an "http(s)://" prefix is
// sufficient. Google-internal URLs (news.google.com, consent.google) are
// skipped, as are implausibly short candidates.
//
// Unlike a single strings.Index probe per prefix, EVERY occurrence of each
// prefix is examined: the payload can carry a Google-internal URL before the
// real article URL, and stopping at the first https:// match would miss the
// real one entirely. Returns "" when no plausible external URL is found.
func extractURLFromBytes(data []byte) string {
	s := string(data)
	for _, prefix := range []string{"https://", "http://"} {
		for start := 0; ; {
			rel := strings.Index(s[start:], prefix)
			if rel == -1 {
				break
			}
			urlStart := start + rel
			urlEnd := urlStart
			for urlEnd < len(s) && isGNewsURLByte(s[urlEnd]) {
				urlEnd++
			}
			candidate := s[urlStart:urlEnd]
			// Resume scanning after this candidate on the next iteration.
			// (urlEnd > urlStart always, since the prefix bytes themselves
			// are URL bytes, so the loop makes progress.)
			start = urlEnd
			// Skip Google-internal URLs — they are not the source article.
			if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
				continue
			}
			if len(candidate) > 20 { // minimum viable article URL length
				return candidate
			}
		}
	}
	return ""
}

// isGNewsURLByte reports whether c may appear inside an embedded article URL.
func isGNewsURLByte(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		return true
	}
	switch c {
	case ':', '/', '.', '-', '_', '~', '?', '&', '=', '%', '#', '+',
		'@', '!', '$', '(', ')', ',', ';':
		return true
	}
	return false
}
// FetchNewArticles fetches new articles from all enabled news sources for a config, // FetchNewArticles fetches new articles from all enabled news sources for a config,
// filtering out already-posted articles. // filtering out already-posted articles.
func (s *OfficialAccountsService) FetchNewArticles(ctx context.Context, configID string) ([]RSSItem, []string, error) { func (s *OfficialAccountsService) FetchNewArticles(ctx context.Context, configID string) ([]RSSItem, []string, error) {