diff --git a/go-backend/internal/services/official_accounts_service.go b/go-backend/internal/services/official_accounts_service.go
index 16f30f3..0610342 100644
--- a/go-backend/internal/services/official_accounts_service.go
+++ b/go-backend/internal/services/official_accounts_service.go
@@ -2,12 +2,12 @@ package services
import (
"context"
+ "encoding/base64"
"encoding/json"
"encoding/xml"
"fmt"
"io"
"net/http"
- "regexp"
"strings"
"sync"
"time"
@@ -278,98 +278,101 @@ func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string
}
// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
-// to the underlying article URL by following the full redirect chain.
+// to the underlying article URL by base64-decoding the article ID in its path and scanning the result for a URL.
+// It makes no network request and returns googleURL unchanged when nothing can be extracted. NOTE(review): assumes the ID embeds the real URL — confirm for every ID format.
func ResolveGoogleNewsURL(googleURL string) string {
if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
return googleURL
}
- // Track the final URL after all redirects
- var finalURL string
- client := &http.Client{
- Timeout: 15 * time.Second,
- CheckRedirect: func(req *http.Request, via []*http.Request) error {
- if len(via) >= 10 {
- return fmt.Errorf("too many redirects")
- }
- // Track every hop; stop if we've left Google domains
- host := req.URL.Hostname()
- if !strings.Contains(host, "google.com") && !strings.Contains(host, "google.") {
- finalURL = req.URL.String()
- return http.ErrUseLastResponse
- }
- return nil
- },
+ // Take everything after the first "/articles/" (or, failing that, the first "/read/") as the article ID, e.g.:
+ // https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
+ articleID := ""
+ if idx := strings.Index(googleURL, "/articles/"); idx != -1 {
+ articleID = googleURL[idx+len("/articles/"):]
+ } else if idx := strings.Index(googleURL, "/read/"); idx != -1 {
+ articleID = googleURL[idx+len("/read/"):]
}
-
- req, err := http.NewRequest("GET", googleURL, nil)
- if err != nil {
+ if articleID == "" {
return googleURL
}
- req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
- req.Header.Set("Accept", "text/html,application/xhtml+xml")
- resp, err := client.Do(req)
+ // Drop the query string: only the text before the first "?" is kept (a "#fragment" or extra path segment would remain in the ID).
+ if qIdx := strings.Index(articleID, "?"); qIdx != -1 {
+ articleID = articleID[:qIdx]
+ }
+
+ // Decode the ID; on failure, log at debug level and fall back to the original link.
+ decoded, err := base64DecodeGNews(articleID)
if err != nil {
- // If we captured a non-Google URL before the error, use it
- if finalURL != "" {
- log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
- return finalURL
- }
- log.Debug().Err(err).Str("url", googleURL).Msg("Failed to resolve Google News link")
+ log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
return googleURL
}
- defer resp.Body.Close()
- // If we captured a non-Google URL during redirects, use that
- if finalURL != "" {
- log.Debug().Str("resolved", finalURL).Str("original", googleURL).Msg("Resolved Google News link via redirect chain")
- return finalURL
+ // The decoded bytes are presumably a protobuf message with the article URL embedded as a string — TODO confirm.
+ // extractURLFromBytes scans them for an http(s) URL and returns "" when it finds none it accepts.
+ resolved := extractURLFromBytes(decoded)
+ if resolved != "" {
+ log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
+ return resolved
}
- // Check final response URL
- if resp.Request != nil && resp.Request.URL != nil {
- final := resp.Request.URL.String()
- if !strings.Contains(final, "news.google.com") && !strings.Contains(final, "consent.google") {
- log.Debug().Str("resolved", final).Str("original", googleURL).Msg("Resolved Google News link via final URL")
- return final
- }
- }
-
- // Fallback: parse meta refresh or JS redirect from response body
- body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
- if err == nil {
- html := string(body)
- // Look for
- metaRe := regexp.MustCompile(`(?i)http-equiv\s*=\s*["']refresh["'][^>]*content\s*=\s*["'][^;]*;\s*url\s*=\s*([^"'\s>]+)`)
- if m := metaRe.FindStringSubmatch(html); len(m) > 1 {
- resolved := strings.TrimSpace(m[1])
- if !strings.Contains(resolved, "google.com") {
- log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via meta refresh")
- return resolved
- }
- }
- // Look for window.location or document.location JS redirects
- jsRe := regexp.MustCompile(`(?:window|document)\.location\s*[=]\s*["']([^"']+)["']`)
- if m := jsRe.FindStringSubmatch(html); len(m) > 1 {
- resolved := strings.TrimSpace(m[1])
- if !strings.Contains(resolved, "google.com") {
- log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via JS redirect")
- return resolved
- }
- }
- // Look for with data-n-au attribute (Google News article link)
- auRe := regexp.MustCompile(`data-n-au\s*=\s*["']([^"']+)["']`)
- if m := auRe.FindStringSubmatch(html); len(m) > 1 {
- log.Debug().Str("resolved", m[1]).Str("original", googleURL).Msg("Resolved Google News link via data-n-au")
- return m[1]
- }
- }
-
- log.Warn().Str("url", googleURL).Msg("Could not resolve Google News link to source article")
+ log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
return googleURL
}
+// base64DecodeGNews decodes s, a Google News article ID, by mapping the base64url characters "-" and "_"
+// to the standard "+" and "/", appending any missing "=" padding, and decoding with base64.StdEncoding.
+func base64DecodeGNews(s string) ([]byte, error) {
+ // Translate URL-safe characters to their standard-alphabet equivalents.
+ s = strings.ReplaceAll(s, "-", "+")
+ s = strings.ReplaceAll(s, "_", "/")
+ // Pad to a multiple of 4; a remainder of 1 gets no padding, so such input reaches the decoder as-is.
+ switch len(s) % 4 {
+ case 2:
+ s += "=="
+ case 3:
+ s += "="
+ }
+ return base64.StdEncoding.DecodeString(s)
+}
+
+// extractURLFromBytes returns the first "https://" (else "http://") URL in data that is longer than 20 bytes and does not contain "news.google.com" or "consent.google", or "" if there is none.
+func extractURLFromBytes(data []byte) string {
+ s := string(data)
+ // Try "https://" before "http://"; only the first occurrence of each prefix is examined.
+ for _, prefix := range []string{"https://", "http://"} {
+ idx := strings.Index(s, prefix)
+ if idx == -1 {
+ continue
+ }
+ // Extend the candidate byte by byte until a character outside the allowed set (or the end of data) is reached.
+ urlStart := idx
+ urlEnd := urlStart
+ for urlEnd < len(s) {
+ c := s[urlEnd]
+ // Allowed: ASCII letters, digits and the punctuation listed below; any other byte ends the URL.
+ if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
+ c == ':' || c == '/' || c == '.' || c == '-' || c == '_' || c == '~' ||
+ c == '?' || c == '&' || c == '=' || c == '%' || c == '#' || c == '+' ||
+ c == '@' || c == '!' || c == '$' || c == '(' || c == ')' || c == ',' || c == ';' {
+ urlEnd++
+ } else {
+ break
+ }
+ }
+ candidate := s[urlStart:urlEnd]
+ // Reject candidates that point back at Google News or a Google consent page.
+ if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
+ continue // NOTE(review): moves on to the next prefix; later occurrences of this same prefix are never tried — confirm intended.
+ }
+ if len(candidate) > 20 { // Heuristic: candidates of 20 bytes or fewer are not accepted as an article URL.
+ return candidate
+ }
+ }
+ return ""
+}
+
// FetchNewArticles fetches new articles from all enabled news sources for a config,
// filtering out already-posted articles.
func (s *OfficialAccountsService) FetchNewArticles(ctx context.Context, configID string) ([]RSSItem, []string, error) {