fix: replace slow HTTP-based Google News URL resolution with instant base64 protobuf decode
This commit is contained in:
parent
4704708c2c
commit
70261d839b
|
|
@ -2,12 +2,12 @@ package services
|
|||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
|
@ -278,98 +278,101 @@ func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string
|
|||
}
|
||||
|
||||
// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
|
||||
// to the underlying article URL by following the full redirect chain.
|
||||
// to the underlying article URL by decoding the base64-encoded article ID.
|
||||
// This is instant (no network request) — Google News embeds the real URL in the article ID.
|
||||
func ResolveGoogleNewsURL(googleURL string) string {
|
||||
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
|
||||
return googleURL
|
||||
}
|
||||
|
||||
// Track the final URL after all redirects
|
||||
var finalURL string
|
||||
client := &http.Client{
|
||||
Timeout: 15 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
// Track every hop; stop if we've left Google domains
|
||||
host := req.URL.Hostname()
|
||||
if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
|
||||
finalURL = req.URL.String()
|
||||
return http.ErrUseLastResponse
|
||||
}
|
||||
return nil
|
||||
},
|
||||
// Extract the article ID from URLs like:
|
||||
// https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
|
||||
articleID := ""
|
||||
if idx := strings.Index(googleURL, "/articles/"); idx != -1 {
|
||||
articleID = googleURL[idx+len("/articles/"):]
|
||||
} else if idx := strings.Index(googleURL, "/read/"); idx != -1 {
|
||||
articleID = googleURL[idx+len("/read/"):]
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", googleURL, nil)
|
||||
if err != nil {
|
||||
if articleID == "" {
|
||||
return googleURL
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
// Strip query params
|
||||
if qIdx := strings.Index(articleID, "?"); qIdx != -1 {
|
||||
articleID = articleID[:qIdx]
|
||||
}
|
||||
|
||||
// Base64url decode the article ID
|
||||
decoded, err := base64DecodeGNews(articleID)
|
||||
if err != nil {
|
||||
// If we captured a non-Google URL before the error, use it
|
||||
if finalURL != "" {
|
||||
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
|
||||
return finalURL
|
||||
}
|
||||
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
|
||||
log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
|
||||
return googleURL
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If we captured a non-Google URL during redirects, use that
|
||||
if finalURL != "" {
|
||||
log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
|
||||
return finalURL
|
||||
// The decoded protobuf contains the article URL as an embedded string.
|
||||
// Scan for "http" to find the URL within the decoded bytes.
|
||||
resolved := extractURLFromBytes(decoded)
|
||||
if resolved != "" {
|
||||
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
|
||||
return resolved
|
||||
}
|
||||
|
||||
// Check final response URL
|
||||
if resp.Request != nil && resp.Request.URL != nil {
|
||||
final := resp.Request.URL.String()
|
||||
if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
|
||||
log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
|
||||
return final
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: parse meta refresh or JS redirect from response body
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
|
||||
if err == nil {
|
||||
html := string(body)
|
||||
// Look for <meta http-equiv="refresh" content="0;url=...">
|
||||
metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
|
||||
if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
resolved := strings.TrimSpace(m[1])
|
||||
if !strings.Contains(resolved, "google.com") {
|
||||
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
|
||||
return resolved
|
||||
}
|
||||
}
|
||||
// Look for window.location or document.location JS redirects
|
||||
jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
|
||||
if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
resolved := strings.TrimSpace(m[1])
|
||||
if !strings.Contains(resolved, "google.com") {
|
||||
log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
|
||||
return resolved
|
||||
}
|
||||
}
|
||||
// Look for <a href="..."> with data-n-au attribute (Google News article link)
|
||||
auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
|
||||
if m := auRe.FindStringSubmatch(html); len(m) > 1 {
|
||||
log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
|
||||
return m[1]
|
||||
}
|
||||
}
|
||||
|
||||
log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
|
||||
log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
|
||||
return googleURL
|
||||
}
|
||||
|
||||
// base64DecodeGNews decodes a Google News article ID, which uses the base64url
// alphabet ("-" and "_" instead of "+" and "/") and may appear with or without
// trailing "=" padding.
//
// It tries the padded URL-safe encoding first, then falls back to the raw
// (unpadded) variant, which replaces the previous hand-rolled character
// substitution and manual padding with the stdlib encoders.
func base64DecodeGNews(s string) ([]byte, error) {
	if b, err := base64.URLEncoding.DecodeString(s); err == nil {
		return b, nil
	}
	return base64.RawURLEncoding.DecodeString(s)
}
|
||||
|
||||
// extractURLFromBytes scans decoded protobuf bytes for an embedded HTTP(S) URL.
//
// The decoded Google News payload embeds the article URL as a plain string;
// rather than parsing the protobuf, scan for an "https://" (preferred) or
// "http://" prefix and take the run of URL-safe bytes that follows.
// Google-internal URLs (news.google.com, consent.google*) and implausibly
// short matches are skipped. Returns "" when no suitable URL is found.
func extractURLFromBytes(data []byte) string {
	s := string(data)
	for _, prefix := range []string{"https://", "http://"} {
		// Examine every occurrence of the prefix, not just the first, so a
		// leading Google-internal URL cannot hide the real article URL that
		// follows it in the same payload.
		for from := 0; from < len(s); {
			rel := strings.Index(s[from:], prefix)
			if rel == -1 {
				break
			}
			start := from + rel
			// The URL ends at the first byte outside the URL-safe set.
			end := start
			for end < len(s) && isGNewsURLByte(s[end]) {
				end++
			}
			from = end + 1
			candidate := s[start:end]
			// Skip Google News internal URLs.
			if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
				continue
			}
			if len(candidate) > 20 { // minimum viable URL length
				return candidate
			}
		}
	}
	return ""
}

// isGNewsURLByte reports whether c may appear inside a URL embedded in the
// decoded article payload; the first byte outside this set terminates the URL.
func isGNewsURLByte(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		return true
	}
	switch c {
	case ':', '/', '.', '-', '_', '~', '?', '&', '=', '%', '#', '+',
		'@', '!', '$', '(', ')', ',', ';':
		return true
	}
	return false
}
|
||||
|
||||
// FetchNewArticles fetches new articles from all enabled news sources for a config,
|
||||
// filtering out already-posted articles.
|
||||
func (s *OfficialAccountsService) FetchNewArticles(ctx context.Context, configID string) ([]RSSItem, []string, error) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue