feat: replace Google News RSS with SearXNG news API for article discovery

This commit is contained in:
Patrick Britton 2026-02-09 08:05:47 -06:00
parent 541a409806
commit 6860916792

View file

@ -2,7 +2,6 @@ package services
import ( import (
"context" "context"
"encoding/base64"
"encoding/json" "encoding/json"
"encoding/xml" "encoding/xml"
"fmt" "fmt"
@ -44,9 +43,9 @@ type OfficialAccountConfig struct {
AvatarURL string `json:"avatar_url,omitempty"` AvatarURL string `json:"avatar_url,omitempty"`
} }
// NewsSource represents a single RSS feed configuration. // NewsSource represents a single news feed configuration.
// If Site is set, the Google News RSS URL is auto-constructed. // If Site is set, SearXNG is used to find news articles for that site.
// If RSSURL is set directly, it's used as-is (legacy/fallback). // If RSSURL is set directly, the RSS feed is fetched as-is.
type NewsSource struct { type NewsSource struct {
Name string `json:"name"` Name string `json:"name"`
Site string `json:"site,omitempty"` Site string `json:"site,omitempty"`
@ -54,16 +53,32 @@ type NewsSource struct {
Enabled bool `json:"enabled"` Enabled bool `json:"enabled"`
} }
// GoogleNewsRSSURL builds a Google News RSS search URL for the given site domain. // SearXNG endpoint — local Docker instance
func GoogleNewsRSSURL(site string) string { const searxngBaseURL = "http://localhost:8888"
return fmt.Sprintf("https://news.google.com/rss/search?q=site:%s&hl=en-US&gl=US&ceid=US:en", site)
// SearXNGResponse represents the JSON response from SearXNG /search endpoint.
type SearXNGResponse struct {
Results []SearXNGResult `json:"results"`
} }
// EffectiveRSSURL returns the RSS URL to fetch — Google News if Site is set, otherwise RSSURL. // SearXNGResult represents a single search result from SearXNG.
type SearXNGResult struct {
// URL is the link of the result. FetchSearXNGNews skips results where it is
// empty and otherwise uses it as both the item's Link and its GUID.
URL string `json:"url"`
// Title is the headline of the result; results with an empty title are skipped.
Title string `json:"title"`
// Content is the text snippet of the result; it becomes the item's Description.
Content string `json:"content"`
// PublishedDate is kept as the raw string from the "publishedDate" key and is
// copied verbatim into the item's PubDate; it is not parsed here.
PublishedDate string `json:"publishedDate"`
// Thumbnail, Engine and Category are decoded from the response but are not
// read by FetchSearXNGNews.
Thumbnail string `json:"thumbnail"`
Engine string `json:"engine"`
Category string `json:"category"`
}
// UseSearXNG reports whether articles for this source are discovered through
// SearXNG, which is the case exactly when Site is non-empty.
func (ns *NewsSource) UseSearXNG() bool {
	return len(ns.Site) > 0
}
// EffectiveRSSURL returns the direct RSS URL if set, empty string otherwise.
func (ns *NewsSource) EffectiveRSSURL() string { func (ns *NewsSource) EffectiveRSSURL() string {
if ns.Site != "" {
return GoogleNewsRSSURL(ns.Site)
}
return ns.RSSURL return ns.RSSURL
} }
@ -237,8 +252,61 @@ func (s *OfficialAccountsService) ToggleEnabled(ctx context.Context, id string,
return err return err
} }
// ── RSS News Fetching ──────────────────────────────── // ── News Fetching (SearXNG + RSS) ───────────────────
// FetchSearXNGNews queries the SearXNG instance at searxngBaseURL for news
// results restricted to the given site (a "site:<site>" query in the news
// category, JSON format, English language).
//
// Every result that has both a URL and a title is converted to an RSSItem so
// the rest of the pipeline can handle it like a feed entry: the result URL is
// used as Link and as GUID, Content becomes Description, and PublishedDate is
// copied verbatim into PubDate (left empty when SearXNG reports none).
//
// An error is returned when the request cannot be built or sent, when SearXNG
// answers with a status other than 200, or when the response body cannot be
// read or decoded as JSON.
func (s *OfficialAccountsService) FetchSearXNGNews(ctx context.Context, site string) ([]RSSItem, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searxngBaseURL+"/search", nil)
	if err != nil {
		return nil, fmt.Errorf("building SearXNG request for site %s: %w", site, err)
	}

	// Encode the parameters instead of formatting site straight into the URL:
	// a site value containing '&', '#', '+' or spaces would otherwise corrupt
	// the query string or smuggle in extra parameters.
	query := req.URL.Query()
	query.Set("q", "site:"+site)
	query.Set("categories", "news")
	query.Set("format", "json")
	query.Set("language", "en")
	req.URL.RawQuery = query.Encode()

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("SearXNG request failed for site %s: %w", site, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("SearXNG returned status %d for site %s", resp.StatusCode, site)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading SearXNG response for site %s: %w", site, err)
	}

	var sxResp SearXNGResponse
	if err := json.Unmarshal(body, &sxResp); err != nil {
		return nil, fmt.Errorf("failed to parse SearXNG response: %w", err)
	}

	// Convert SearXNG results to RSSItems, dropping entries that cannot be
	// linked to or displayed.
	var items []RSSItem
	for _, r := range sxResp.Results {
		if r.URL == "" || r.Title == "" {
			continue
		}
		items = append(items, RSSItem{
			Title:       r.Title,
			Link:        r.URL,
			Description: r.Content,
			GUID:        r.URL, // use the actual URL as GUID for dedup
			PubDate:     r.PublishedDate,
		})
	}

	log.Debug().Int("results", len(items)).Str("site", site).Msg("[SearXNG] Fetched news articles")
	return items, nil
}
// FetchRSS fetches and parses a standard RSS/XML feed.
func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ([]RSSItem, error) { func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ([]RSSItem, error) {
req, err := http.NewRequestWithContext(ctx, "GET", rssURL, nil) req, err := http.NewRequestWithContext(ctx, "GET", rssURL, nil)
if err != nil { if err != nil {
@ -266,131 +334,9 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
return nil, fmt.Errorf("failed to parse RSS from %s: %w", rssURL, err) return nil, fmt.Errorf("failed to parse RSS from %s: %w", rssURL, err)
} }
// If items come from Google News, resolve redirect links to actual article URLs
isGoogleNews := strings.Contains(rssURL, "news.google.com/rss")
if isGoogleNews {
for i := range feed.Channel.Items {
item := &feed.Channel.Items[i]
// Preserve original Google News URL in GUID for dedup
if item.GUID == "" {
item.GUID = item.Link
}
resolved := ResolveGoogleNewsURL(item.Link)
if resolved != item.Link && resolved != "" {
// Base64 decode succeeded — use real article URL
item.Link = resolved
} else if item.Source.URL != "" {
// Fall back to the <source url="..."> from the RSS item
item.Link = item.Source.URL
log.Debug().Str("source", item.Source.Name).Str("url", item.Source.URL).Msg("Using RSS source URL as fallback for Google News link")
}
}
}
return feed.Channel.Items, nil return feed.Channel.Items, nil
} }
// resolveGoogleNewsLink returns the article URL wrapped by a Google News link.
// It is a thin method wrapper around the package-level ResolveGoogleNewsURL:
// it does not follow redirects or issue any request, and the receiver is unused.
func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
return ResolveGoogleNewsURL(googleURL)
}
// ResolveGoogleNewsURL maps a Google News link to the article URL it wraps.
// The article ID that follows "/articles/" or "/read/" is base64-decoded and
// the decoded bytes are searched for an embedded http(s) URL, so no network
// request is made. Input that is empty, is not a news.google.com URL, or
// cannot be resolved is returned unchanged.
func ResolveGoogleNewsURL(googleURL string) string {
	if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
		return googleURL
	}

	// Article links look like:
	//   https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
	// The first marker that is present decides where the ID starts.
	var id string
	for _, marker := range []string{"/articles/", "/read/"} {
		if at := strings.Index(googleURL, marker); at >= 0 {
			id = googleURL[at+len(marker):]
			break
		}
	}
	if id == "" {
		return googleURL
	}

	// Drop the query string, if any.
	if q := strings.IndexByte(id, '?'); q >= 0 {
		id = id[:q]
	}

	raw, err := base64DecodeGNews(id)
	if err != nil {
		log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
		return googleURL
	}

	// The decoded payload carries the article URL as an embedded string.
	article := extractURLFromBytes(raw)
	if article == "" {
		log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
		return googleURL
	}

	log.Debug().Str("resolved", article).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
	return article
}
// base64DecodeGNews decodes a Google News article ID. The ID is base64url
// text whose trailing '=' padding may be missing, so the URL-safe characters
// are mapped back to the standard alphabet and the padding is restored before
// decoding with the standard encoding. Input that is already in the standard
// alphabet or already padded decodes as well.
func base64DecodeGNews(s string) ([]byte, error) {
	std := strings.NewReplacer("-", "+", "_", "/").Replace(s)

	// Only remainders 2 and 3 can be completed with padding; a remainder of 1
	// is never valid base64 and is left for the decoder to reject.
	if rem := len(std) % 4; rem == 2 || rem == 3 {
		std += strings.Repeat("=", 4-rem)
	}

	return base64.StdEncoding.DecodeString(std)
}
// extractURLFromBytes scans decoded article-ID bytes for an embedded HTTP(S)
// URL and returns it, or "" when none qualifies.
//
// "https://" URLs are preferred over "http://" ones. For each scheme every
// occurrence is examined in order, not just the first, so a real article URL
// that follows a Google-internal URL of the same scheme is still found. A
// candidate extends from the scheme up to the first byte that is not in the
// URL character whitelist. It is rejected when it points at news.google.com
// or consent.google, or when it is 20 bytes or shorter.
func extractURLFromBytes(data []byte) string {
	s := string(data)
	for _, prefix := range []string{"https://", "http://"} {
		for from := 0; from < len(s); {
			rel := strings.Index(s[from:], prefix)
			if rel == -1 {
				break
			}
			start := from + rel

			// The URL ends at the first non-URL byte.
			end := start
			for end < len(s) && isGNewsURLByte(s[end]) {
				end++
			}
			candidate := s[start:end]

			// Resume right after this scheme so a URL nested inside a
			// rejected candidate (e.g. in its query string) is still seen.
			from = start + len(prefix)

			// Must be a real URL, not a Google News internal URL.
			if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
				continue
			}
			if len(candidate) > 20 { // Minimum viable URL length
				return candidate
			}
		}
	}
	return ""
}

// isGNewsURLByte reports whether c may appear inside a URL embedded in a
// decoded Google News article ID: ASCII letters, digits and a fixed set of
// punctuation characters.
func isGNewsURLByte(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		return true
	}
	switch c {
	case ':', '/', '.', '-', '_', '~',
		'?', '&', '=', '%', '#', '+',
		'@', '!', '$', '(', ')', ',', ';':
		return true
	}
	return false
}
// ── Article Pipeline ───────────────────────────────── // ── Article Pipeline ─────────────────────────────────
// DiscoverArticles fetches RSS feeds and caches all new articles in the DB as 'discovered'. // DiscoverArticles fetches RSS feeds and caches all new articles in the DB as 'discovered'.
@ -408,13 +354,22 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
newCount := 0 newCount := 0
for _, src := range sources { for _, src := range sources {
rssURL := src.EffectiveRSSURL() if !src.Enabled {
if !src.Enabled || rssURL == "" {
continue continue
} }
items, err := s.FetchRSS(ctx, rssURL)
if err != nil { // Fetch articles: SearXNG for site-based sources, RSS for direct feed URLs
log.Warn().Err(err).Str("source", src.Name).Msg("Failed to fetch RSS feed") var items []RSSItem
var fetchErr error
if src.UseSearXNG() {
items, fetchErr = s.FetchSearXNGNews(ctx, src.Site)
} else if rssURL := src.EffectiveRSSURL(); rssURL != "" {
items, fetchErr = s.FetchRSS(ctx, rssURL)
} else {
continue
}
if fetchErr != nil {
log.Warn().Err(fetchErr).Str("source", src.Name).Msg("Failed to fetch articles")
continue continue
} }
@ -427,13 +382,16 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
continue continue
} }
// Parse pub date // Parse pub date — support multiple formats
var pubDate *time.Time var pubDate *time.Time
if item.PubDate != "" { if item.PubDate != "" {
for _, layout := range []string{ for _, layout := range []string{
time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822, time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822,
"Mon, 2 Jan 2006 15:04:05 -0700", "Mon, 2 Jan 2006 15:04:05 -0700",
"2006-01-02T15:04:05Z", "2006-01-02T15:04:05Z",
"2006-01-02T15:04:05", // SearXNG format
"2006-01-02 15:04:05", // SearXNG alt format
time.RFC3339,
} { } {
if t, err := time.Parse(layout, item.PubDate); err == nil { if t, err := time.Parse(layout, item.PubDate); err == nil {
pubDate = &t pubDate = &t
@ -448,13 +406,19 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
desc = desc[:1000] desc = desc[:1000]
} }
// Source URL: use RSS source element if present, otherwise build from site
sourceURL := item.Source.URL
if sourceURL == "" && src.Site != "" {
sourceURL = "https://" + src.Site
}
// Insert into pipeline — ON CONFLICT means we already know about this article // Insert into pipeline — ON CONFLICT means we already know about this article
tag, err := s.pool.Exec(ctx, ` tag, err := s.pool.Exec(ctx, `
INSERT INTO official_account_articles INSERT INTO official_account_articles
(config_id, guid, title, link, source_name, source_url, description, pub_date, status) (config_id, guid, title, link, source_name, source_url, description, pub_date, status)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'discovered') VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'discovered')
ON CONFLICT (config_id, guid) DO NOTHING ON CONFLICT (config_id, guid) DO NOTHING
`, configID, guid, item.Title, item.Link, src.Name, item.Source.URL, desc, pubDate) `, configID, guid, item.Title, item.Link, src.Name, sourceURL, desc, pubDate)
if err != nil { if err != nil {
log.Warn().Err(err).Str("guid", guid).Msg("Failed to cache article") log.Warn().Err(err).Str("guid", guid).Msg("Failed to cache article")
continue continue