feat: replace Google News RSS with SearXNG news API for article discovery

2026-02-09 08:05:47 -06:00 · 2026-02-09 08:05:47 -06:00 · 6860916792
parent 541a409806
commit 6860916792
1 changed files with 105 additions and 141 deletions
--- a/go-backend/internal/services/official_accounts_service.go
+++ b/go-backend/internal/services/official_accounts_service.go
@ -2,7 +2,6 @@ package services
 import (
 	"context"
 	"encoding/base64"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@ -44,9 +43,9 @@ type OfficialAccountConfig struct {
 	AvatarURL   string `json:"avatar_url,omitempty"`
 }
-// NewsSource represents a single RSS feed configuration.
+// NewsSource represents a single news feed configuration.
-// If Site is set, the Google News RSS URL is auto-constructed.
+// If Site is set, SearXNG is used to find news articles for that site.
-// If RSSURL is set directly, it's used as-is (legacy/fallback).
+// If RSSURL is set directly, the RSS feed is fetched as-is.
 type NewsSource struct {
 	Name    string `json:"name"`
 	Site    string `json:"site,omitempty"`
@ -54,16 +53,32 @@ type NewsSource struct {
 	Enabled bool   `json:"enabled"`
 }
-// GoogleNewsRSSURL builds a Google News RSS search URL for the given site domain.
+// SearXNG endpoint — local Docker instance
-func GoogleNewsRSSURL(site string) string {
+const searxngBaseURL = "http://localhost:8888"
-	return fmt.Sprintf("https://news.google.com/rss/search?q=site:%s&hl=en-US&gl=US&ceid=US:en", site)
+
 // SearXNGResponse represents the JSON response from SearXNG /search endpoint.
 // Only the "results" array is decoded; any other top-level keys in the
 // payload are ignored by encoding/json.
 type SearXNGResponse struct {
 	Results []SearXNGResult `json:"results"` // one entry per search hit
 }
-// EffectiveRSSURL returns the RSS URL to fetch — Google News if Site is set, otherwise RSSURL.
+// SearXNGResult represents a single search result from SearXNG.
 type SearXNGResult struct {
 	// URL is the result link; FetchSearXNGNews uses it as both Link and GUID.
 	URL string `json:"url"`
 	// Title is the result headline; results without a URL or Title are skipped.
 	Title string `json:"title"`
 	// Content is the result snippet, mapped to RSSItem.Description.
 	Content string `json:"content"`
 	// PublishedDate is kept as the raw string from the response; it is parsed
 	// later in the article pipeline, not here.
 	PublishedDate string `json:"publishedDate"`
 	// Thumbnail, Engine and Category are decoded but not read by the code
 	// visible in this file section.
 	Thumbnail string `json:"thumbnail"`
 	Engine    string `json:"engine"`
 	Category  string `json:"category"`
 }
 // UseSearXNG reports whether this source is site-based, i.e. whether a Site
 // domain has been configured, in which case articles are looked up through
 // SearXNG rather than a direct feed.
 func (ns *NewsSource) UseSearXNG() bool {
 	siteConfigured := len(ns.Site) > 0
 	return siteConfigured
 }
 // EffectiveRSSURL returns the directly configured RSS feed URL, or the empty
 // string when none is set.
 //
 // Site is deliberately not consulted: site-based sources are fetched through
 // SearXNG (see UseSearXNG), and the Google News RSS URL builder this method
 // used to delegate to has been removed.
 func (ns *NewsSource) EffectiveRSSURL() string {
 	return ns.RSSURL
 }
@ -237,8 +252,61 @@ func (s *OfficialAccountsService) ToggleEnabled(ctx context.Context, id string,
 	return err
 }
-// ── RSS News Fetching ────────────────────────────────
+// ── News Fetching (SearXNG + RSS) ───────────────────
 // FetchSearXNGNews queries the SearXNG instance at searxngBaseURL for news
 // results restricted to the given site (query "site:<site>", category "news",
 // JSON output, language "en").
 //
 // Results are converted to RSSItems so the rest of the pipeline can handle
 // them like feed entries: Title, Link, Description and PubDate are copied
 // over, and the result URL doubles as the GUID used for deduplication.
 // Results lacking a URL or a title are dropped.
 //
 // It returns an error when site is empty, the request cannot be built or
 // sent, SearXNG answers with a non-200 status, or the body is not valid JSON.
 func (s *OfficialAccountsService) FetchSearXNGNews(ctx context.Context, site string) ([]RSSItem, error) {
 	if site == "" {
 		return nil, fmt.Errorf("SearXNG site must not be empty")
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searxngBaseURL+"/search", nil)
 	if err != nil {
 		return nil, fmt.Errorf("building SearXNG request for site %s: %w", site, err)
 	}
 	// Build the query string through url.Values so that characters in site
 	// such as '&', '#', '+' or spaces are percent-encoded instead of being
 	// interpreted as extra parameters or a fragment.
 	query := req.URL.Query()
 	query.Set("q", "site:"+site)
 	query.Set("categories", "news")
 	query.Set("format", "json")
 	query.Set("language", "en")
 	req.URL.RawQuery = query.Encode()
 	resp, err := s.httpClient.Do(req)
 	if err != nil {
 		return nil, fmt.Errorf("SearXNG request failed for site %s: %w", site, err)
 	}
 	defer resp.Body.Close()
 	if resp.StatusCode != http.StatusOK {
 		return nil, fmt.Errorf("SearXNG returned status %d for site %s", resp.StatusCode, site)
 	}
 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, fmt.Errorf("reading SearXNG response for site %s: %w", site, err)
 	}
 	var sxResp SearXNGResponse
 	if err := json.Unmarshal(body, &sxResp); err != nil {
 		return nil, fmt.Errorf("failed to parse SearXNG response: %w", err)
 	}
 	// Convert SearXNG results to RSSItems.
 	var items []RSSItem
 	for _, r := range sxResp.Results {
 		if r.URL == "" || r.Title == "" {
 			continue
 		}
 		items = append(items, RSSItem{
 			Title:       r.Title,
 			Link:        r.URL,
 			Description: r.Content,
 			GUID:        r.URL, // the article URL is the dedup key
 			PubDate:     r.PublishedDate,
 		})
 	}
 	log.Debug().Int("results", len(items)).Str("site", site).Msg("[SearXNG] Fetched news articles")
 	return items, nil
 }
 // FetchRSS fetches and parses a standard RSS/XML feed.
 func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ([]RSSItem, error) {
 	req, err := http.NewRequestWithContext(ctx, "GET", rssURL, nil)
 	if err != nil {
@ -266,131 +334,9 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
 		return nil, fmt.Errorf("failed to parse RSS from %s: %w", rssURL, err)
 	}
 	// If items come from Google News, resolve redirect links to actual article URLs
 	isGoogleNews := strings.Contains(rssURL, "news.google.com/rss")
 	if isGoogleNews {
 		for i := range feed.Channel.Items {
 			item := &feed.Channel.Items[i]
 			// Preserve original Google News URL in GUID for dedup
 			if item.GUID == "" {
 				item.GUID = item.Link
 			}
 			resolved := ResolveGoogleNewsURL(item.Link)
 			if resolved != item.Link && resolved != "" {
 				// Base64 decode succeeded — use real article URL
 				item.Link = resolved
 			} else if item.Source.URL != "" {
 				// Fall back to the <source url="..."> from the RSS item
 				item.Link = item.Source.URL
 				log.Debug().Str("source", item.Source.Name).Str("url", item.Source.URL).Msg("Using RSS source URL as fallback for Google News link")
 			}
 		}
 	}
 	return feed.Channel.Items, nil
 }
 // resolveGoogleNewsLink is the method form of ResolveGoogleNewsURL: it hands
 // googleURL to the package-level helper and returns whatever that yields.
 func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
 	articleURL := ResolveGoogleNewsURL(googleURL)
 	return articleURL
 }
 // ResolveGoogleNewsURL tries to turn a Google News link into the URL of the
 // underlying article without making a network request.
 //
 // The article ID is taken from the part of the link after "/articles/" (or,
 // failing that, "/read/"), stripped of any query string, base64-decoded, and
 // scanned for an embedded http(s) URL. Inputs that are empty or do not contain
 // "news.google.com", and links for which any step fails, are returned
 // unchanged.
 func ResolveGoogleNewsURL(googleURL string) string {
 	if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
 		return googleURL
 	}
 	// Example: https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
 	// The first marker present in the link decides where the ID starts.
 	var id string
 	for _, marker := range []string{"/articles/", "/read/"} {
 		if pos := strings.Index(googleURL, marker); pos != -1 {
 			id = googleURL[pos+len(marker):]
 			break
 		}
 	}
 	if id == "" {
 		return googleURL
 	}
 	// Drop the query string, if any.
 	if q := strings.IndexByte(id, '?'); q != -1 {
 		id = id[:q]
 	}
 	raw, err := base64DecodeGNews(id)
 	if err != nil {
 		log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
 		return googleURL
 	}
 	// The decoded bytes carry the article URL as a plain embedded string.
 	resolved := extractURLFromBytes(raw)
 	if resolved == "" {
 		log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
 		return googleURL
 	}
 	log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
 	return resolved
 }
 // base64DecodeGNews decodes a Google News article ID. The ID is mapped from
 // the URL-safe base64 alphabet to the standard one ('-' becomes '+', '_'
 // becomes '/'), padded with '=' when its length leaves a remainder of 2 or 3
 // modulo 4, and then decoded with the standard encoding.
 func base64DecodeGNews(s string) ([]byte, error) {
 	std := strings.NewReplacer("-", "+", "_", "/").Replace(s)
 	// A remainder of 1 can never be valid base64, so it is left unpadded and
 	// the decoder reports the error.
 	if rem := len(std) % 4; rem == 2 || rem == 3 {
 		std += strings.Repeat("=", 4-rem)
 	}
 	return base64.StdEncoding.DecodeString(std)
 }
 // extractURLFromBytes looks for an http(s) URL embedded in data. For each
 // scheme, "https://" first and then "http://", it takes the first occurrence
 // and extends it over the run of URL-safe bytes that follows. A candidate is
 // returned only if it is longer than 20 bytes and does not mention
 // "news.google.com" or "consent.google"; otherwise the next scheme is tried.
 // It returns "" when nothing qualifies.
 func extractURLFromBytes(data []byte) string {
 	text := string(data)
 	// isURLByte reports whether b may appear inside the extracted URL.
 	isURLByte := func(b byte) bool {
 		switch {
 		case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z', '0' <= b && b <= '9':
 			return true
 		}
 		return strings.IndexByte(":/.-_~?&=%#+@!$(),;", b) >= 0
 	}
 	for _, scheme := range []string{"https://", "http://"} {
 		begin := strings.Index(text, scheme)
 		if begin < 0 {
 			continue
 		}
 		// Advance to the first byte that cannot belong to a URL.
 		end := begin
 		for end < len(text) && isURLByte(text[end]) {
 			end++
 		}
 		found := text[begin:end]
 		// Skip Google-internal links; they are not the article itself.
 		googleInternal := strings.Contains(found, "news.google.com") || strings.Contains(found, "consent.google")
 		if !googleInternal && len(found) > 20 { // minimum viable URL length
 			return found
 		}
 	}
 	return ""
 }
 // ── Article Pipeline ─────────────────────────────────
 // DiscoverArticles fetches articles for each enabled source (SearXNG for site-based sources, RSS for direct feed URLs) and caches all new articles in the DB as 'discovered'.
@ -408,13 +354,22 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 	newCount := 0
 	for _, src := range sources {
-		rssURL := src.EffectiveRSSURL()
+		if !src.Enabled {
 		if !src.Enabled || rssURL == "" {
 			continue
 		}
-		items, err := s.FetchRSS(ctx, rssURL)
+
-		if err != nil {
+		// Fetch articles: SearXNG for site-based sources, RSS for direct feed URLs
-			log.Warn().Err(err).Str("source", src.Name).Msg("Failed to fetch RSS feed")
+		var items []RSSItem
 		var fetchErr error
 		if src.UseSearXNG() {
 			items, fetchErr = s.FetchSearXNGNews(ctx, src.Site)
 		} else if rssURL := src.EffectiveRSSURL(); rssURL != "" {
 			items, fetchErr = s.FetchRSS(ctx, rssURL)
 		} else {
 			continue
 		}
 		if fetchErr != nil {
 			log.Warn().Err(fetchErr).Str("source", src.Name).Msg("Failed to fetch articles")
 			continue
 		}
@ -427,13 +382,16 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 				continue
 			}
-			// Parse pub date
+			// Parse pub date — support multiple formats
 			var pubDate *time.Time
 			if item.PubDate != "" {
 				for _, layout := range []string{
 					time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822,
 					"Mon, 2 Jan 2006 15:04:05 -0700",
 					"2006-01-02T15:04:05Z",
 					"2006-01-02T15:04:05", // SearXNG format
 					"2006-01-02 15:04:05", // SearXNG alt format
 					time.RFC3339,
 				} {
 					if t, err := time.Parse(layout, item.PubDate); err == nil {
 						pubDate = &t
@ -448,13 +406,19 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 				desc = desc[:1000]
 			}
 			// Source URL: use RSS source element if present, otherwise build from site
 			sourceURL := item.Source.URL
 			if sourceURL == "" && src.Site != "" {
 				sourceURL = "https://" + src.Site
 			}
 			// Insert into pipeline — ON CONFLICT means we already know about this article
 			tag, err := s.pool.Exec(ctx, `
 				INSERT INTO official_account_articles
 					(config_id, guid, title, link, source_name, source_url, description, pub_date, status)
 				VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'discovered')
 				ON CONFLICT (config_id, guid) DO NOTHING
-			`, configID, guid, item.Title, item.Link, src.Name, item.Source.URL, desc, pubDate)
+			`, configID, guid, item.Title, item.Link, src.Name, sourceURL, desc, pubDate)
 			if err != nil {
 				log.Warn().Err(err).Str("guid", guid).Msg("Failed to cache article")
 				continue