feat: replace Google News RSS with SearXNG news API for article discovery

parent 541a409806
commit 6860916792
@@ -2,7 +2,6 @@ package services
 
 import (
 	"context"
-	"encoding/base64"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -44,9 +43,9 @@ type OfficialAccountConfig struct {
 	AvatarURL string `json:"avatar_url,omitempty"`
 }
 
-// NewsSource represents a single RSS feed configuration.
-// If Site is set, the Google News RSS URL is auto-constructed.
-// If RSSURL is set directly, it's used as-is (legacy/fallback).
+// NewsSource represents a single news feed configuration.
+// If Site is set, SearXNG is used to find news articles for that site.
+// If RSSURL is set directly, the RSS feed is fetched as-is.
 type NewsSource struct {
 	Name string `json:"name"`
 	Site string `json:"site,omitempty"`
@@ -54,16 +53,32 @@ type NewsSource struct {
 	Enabled bool `json:"enabled"`
 }
 
-// GoogleNewsRSSURL builds a Google News RSS search URL for the given site domain.
-func GoogleNewsRSSURL(site string) string {
-	return fmt.Sprintf("https://news.google.com/rss/search?q=site:%s&hl=en-US&gl=US&ceid=US:en", site)
+// SearXNG endpoint — local Docker instance
+const searxngBaseURL = "http://localhost:8888"
+
+// SearXNGResponse represents the JSON response from SearXNG /search endpoint.
+type SearXNGResponse struct {
+	Results []SearXNGResult `json:"results"`
 }
 
-// EffectiveRSSURL returns the RSS URL to fetch — Google News if Site is set, otherwise RSSURL.
+// SearXNGResult represents a single search result from SearXNG.
+type SearXNGResult struct {
+	URL           string `json:"url"`
+	Title         string `json:"title"`
+	Content       string `json:"content"`
+	PublishedDate string `json:"publishedDate"`
+	Thumbnail     string `json:"thumbnail"`
+	Engine        string `json:"engine"`
+	Category      string `json:"category"`
+}
+
+// UseSearXNG returns true if this source should use SearXNG (has a Site configured).
+func (ns *NewsSource) UseSearXNG() bool {
+	return ns.Site != ""
+}
+
+// EffectiveRSSURL returns the direct RSS URL if set, empty string otherwise.
 func (ns *NewsSource) EffectiveRSSURL() string {
-	if ns.Site != "" {
-		return GoogleNewsRSSURL(ns.Site)
-	}
 	return ns.RSSURL
 }
 
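Note: after this hunk a NewsSource routes by shape. Any Site value selects the SearXNG path, while only a bare RSSURL keeps the direct-feed path. A minimal standalone sketch of that routing, using trimmed copies of the types above (the source names and URLs are made up for illustration):

package main

import "fmt"

// Trimmed copy of the NewsSource shape from this diff, for illustration only.
type NewsSource struct {
	Name    string
	Site    string
	RSSURL  string
	Enabled bool
}

// UseSearXNG mirrors the method added above: any Site value selects SearXNG.
func (ns NewsSource) UseSearXNG() bool { return ns.Site != "" }

// EffectiveRSSURL mirrors the simplified method: only a direct RSSURL counts.
func (ns NewsSource) EffectiveRSSURL() string { return ns.RSSURL }

func main() {
	bySite := NewsSource{Name: "Example Tech News", Site: "example-news.com", Enabled: true}
	byFeed := NewsSource{Name: "Direct feed", RSSURL: "https://example.com/feed.xml", Enabled: true}

	fmt.Println(bySite.UseSearXNG(), bySite.EffectiveRSSURL()) // true ""   -> SearXNG path
	fmt.Println(byFeed.UseSearXNG(), byFeed.EffectiveRSSURL()) // false URL -> RSS path
}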
@@ -237,8 +252,61 @@ func (s *OfficialAccountsService) ToggleEnabled(ctx context.Context, id string,
 	return err
 }
 
-// ── RSS News Fetching ────────────────────────────────
+// ── News Fetching (SearXNG + RSS) ───────────────────
+
+// FetchSearXNGNews queries the local SearXNG instance for news about a site.
+// Returns results as RSSItems for uniform handling in the pipeline.
+func (s *OfficialAccountsService) FetchSearXNGNews(ctx context.Context, site string) ([]RSSItem, error) {
+	searchURL := fmt.Sprintf("%s/search?q=site:%s&categories=news&format=json&language=en", searxngBaseURL, site)
+
+	req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("SearXNG request failed for site %s: %w", site, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("SearXNG returned status %d for site %s", resp.StatusCode, site)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	var sxResp SearXNGResponse
+	if err := json.Unmarshal(body, &sxResp); err != nil {
+		return nil, fmt.Errorf("failed to parse SearXNG response: %w", err)
+	}
+
+	// Convert SearXNG results to RSSItems
+	var items []RSSItem
+	for _, r := range sxResp.Results {
+		if r.URL == "" || r.Title == "" {
+			continue
+		}
+		item := RSSItem{
+			Title:       r.Title,
+			Link:        r.URL,
+			Description: r.Content,
+			GUID:        r.URL, // use the actual URL as GUID for dedup
+		}
+		if r.PublishedDate != "" {
+			item.PubDate = r.PublishedDate
+		}
+		items = append(items, item)
+	}
+
+	log.Debug().Int("results", len(items)).Str("site", site).Msg("[SearXNG] Fetched news articles")
+	return items, nil
+}
+
 // FetchRSS fetches and parses a standard RSS/XML feed.
 func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ([]RSSItem, error) {
 	req, err := http.NewRequestWithContext(ctx, "GET", rssURL, nil)
 	if err != nil {
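For reference, FetchSearXNGNews expects a JSON body shaped like the sample below. The payload is hand-written to match the struct tags in this diff, not captured from a live SearXNG instance:

package main

import (
	"encoding/json"
	"fmt"
)

// Same shapes as the types added in this diff (trimmed to the decoded fields).
type SearXNGResult struct {
	URL           string `json:"url"`
	Title         string `json:"title"`
	Content       string `json:"content"`
	PublishedDate string `json:"publishedDate"`
}

type SearXNGResponse struct {
	Results []SearXNGResult `json:"results"`
}

func main() {
	// Hand-written sample; field names follow the json tags above.
	body := []byte(`{"results":[{"url":"https://example.com/story","title":"Example story","content":"Short summary","publishedDate":"2024-01-02T15:04:05"}]}`)

	var resp SearXNGResponse
	if err := json.Unmarshal(body, &resp); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", resp.Results[0])
}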
@@ -266,131 +334,9 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
 		return nil, fmt.Errorf("failed to parse RSS from %s: %w", rssURL, err)
 	}
 
-	// If items come from Google News, resolve redirect links to actual article URLs
-	isGoogleNews := strings.Contains(rssURL, "news.google.com/rss")
-	if isGoogleNews {
-		for i := range feed.Channel.Items {
-			item := &feed.Channel.Items[i]
-			// Preserve original Google News URL in GUID for dedup
-			if item.GUID == "" {
-				item.GUID = item.Link
-			}
-			resolved := ResolveGoogleNewsURL(item.Link)
-			if resolved != item.Link && resolved != "" {
-				// Base64 decode succeeded — use real article URL
-				item.Link = resolved
-			} else if item.Source.URL != "" {
-				// Fall back to the <source url="..."> from the RSS item
-				item.Link = item.Source.URL
-				log.Debug().Str("source", item.Source.Name).Str("url", item.Source.URL).Msg("Using RSS source URL as fallback for Google News link")
-			}
-		}
-	}
-
 	return feed.Channel.Items, nil
 }
 
-// resolveGoogleNewsLink follows the Google News redirect chain to get the actual article URL.
-func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
-	return ResolveGoogleNewsURL(googleURL)
-}
-
-// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
-// to the underlying article URL by decoding the base64-encoded article ID.
-// This is instant (no network request) — Google News embeds the real URL in the article ID.
-func ResolveGoogleNewsURL(googleURL string) string {
-	if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
-		return googleURL
-	}
-
-	// Extract the article ID from URLs like:
-	// https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
-	articleID := ""
-	if idx := strings.Index(googleURL, "/articles/"); idx != -1 {
-		articleID = googleURL[idx+len("/articles/"):]
-	} else if idx := strings.Index(googleURL, "/read/"); idx != -1 {
-		articleID = googleURL[idx+len("/read/"):]
-	}
-	if articleID == "" {
-		return googleURL
-	}
-
-	// Strip query params
-	if qIdx := strings.Index(articleID, "?"); qIdx != -1 {
-		articleID = articleID[:qIdx]
-	}
-
-	// Base64url decode the article ID
-	decoded, err := base64DecodeGNews(articleID)
-	if err != nil {
-		log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
-		return googleURL
-	}
-
-	// The decoded protobuf contains the article URL as an embedded string.
-	// Scan for "http" to find the URL within the decoded bytes.
-	resolved := extractURLFromBytes(decoded)
-	if resolved != "" {
-		log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
-		return resolved
-	}
-
-	log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
-	return googleURL
-}
-
-// base64DecodeGNews decodes a Google News article ID which uses base64url encoding
-// with optional padding.
-func base64DecodeGNews(s string) ([]byte, error) {
-	// Replace URL-safe chars
-	s = strings.ReplaceAll(s, "-", "+")
-	s = strings.ReplaceAll(s, "_", "/")
-	// Add padding if needed
-	switch len(s) % 4 {
-	case 2:
-		s += "=="
-	case 3:
-		s += "="
-	}
-	return base64.StdEncoding.DecodeString(s)
-}
-
-// extractURLFromBytes scans decoded protobuf bytes for an embedded HTTP(S) URL.
-func extractURLFromBytes(data []byte) string {
-	s := string(data)
-	// Find "http://" or "https://" in the decoded data
-	for _, prefix := range []string{"https://", "http://"} {
-		idx := strings.Index(s, prefix)
-		if idx == -1 {
-			continue
-		}
-		// Extract the URL — it ends at the first non-URL byte
-		urlStart := idx
-		urlEnd := urlStart
-		for urlEnd < len(s) {
-			c := s[urlEnd]
-			// URL-safe characters
-			if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
-				c == ':' || c == '/' || c == '.' || c == '-' || c == '_' || c == '~' ||
-				c == '?' || c == '&' || c == '=' || c == '%' || c == '#' || c == '+' ||
-				c == '@' || c == '!' || c == '$' || c == '(' || c == ')' || c == ',' || c == ';' {
-				urlEnd++
-			} else {
-				break
-			}
-		}
-		candidate := s[urlStart:urlEnd]
-		// Must be a real URL, not a Google News internal URL
-		if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
-			continue
-		}
-		if len(candidate) > 20 { // Minimum viable URL length
-			return candidate
-		}
-	}
-	return ""
-}
-
 // ── Article Pipeline ─────────────────────────────────
 
 // DiscoverArticles fetches RSS feeds and caches all new articles in the DB as 'discovered'.
@@ -408,13 +354,22 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 
 	newCount := 0
 	for _, src := range sources {
-		rssURL := src.EffectiveRSSURL()
-		if !src.Enabled || rssURL == "" {
+		if !src.Enabled {
 			continue
 		}
-		items, err := s.FetchRSS(ctx, rssURL)
-		if err != nil {
-			log.Warn().Err(err).Str("source", src.Name).Msg("Failed to fetch RSS feed")
+
+		// Fetch articles: SearXNG for site-based sources, RSS for direct feed URLs
+		var items []RSSItem
+		var fetchErr error
+		if src.UseSearXNG() {
+			items, fetchErr = s.FetchSearXNGNews(ctx, src.Site)
+		} else if rssURL := src.EffectiveRSSURL(); rssURL != "" {
+			items, fetchErr = s.FetchRSS(ctx, rssURL)
+		} else {
+			continue
+		}
+		if fetchErr != nil {
+			log.Warn().Err(fetchErr).Str("source", src.Name).Msg("Failed to fetch articles")
 			continue
 		}
 
@@ -427,13 +382,16 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 			continue
 		}
 
-		// Parse pub date
+		// Parse pub date — support multiple formats
 		var pubDate *time.Time
 		if item.PubDate != "" {
 			for _, layout := range []string{
 				time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822,
 				"Mon, 2 Jan 2006 15:04:05 -0700",
 				"2006-01-02T15:04:05Z",
+				"2006-01-02T15:04:05", // SearXNG format
+				"2006-01-02 15:04:05", // SearXNG alt format
+				time.RFC3339,
 			} {
 				if t, err := time.Parse(layout, item.PubDate); err == nil {
 					pubDate = &t
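The layout list is tried in order and the first successful parse wins, so the new SearXNG layouts only run after the RSS-style ones fail to match. A standalone check with two made-up SearXNG-style timestamps:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Same layout list as the loop in this diff.
	layouts := []string{
		time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822,
		"Mon, 2 Jan 2006 15:04:05 -0700",
		"2006-01-02T15:04:05Z",
		"2006-01-02T15:04:05", // SearXNG format
		"2006-01-02 15:04:05", // SearXNG alt format
		time.RFC3339,
	}
	for _, in := range []string{"2024-05-01T09:30:00", "2024-05-01 09:30:00"} {
		for _, layout := range layouts {
			if t, err := time.Parse(layout, in); err == nil {
				fmt.Println(in, "->", t.UTC())
				break
			}
		}
	}
}

Both sample inputs carry no zone, so time.Parse yields UTC times; that matches how the unzoned SearXNG layouts behave in the loop above.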
@@ -448,13 +406,19 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 			desc = desc[:1000]
 		}
 
+		// Source URL: use RSS source element if present, otherwise build from site
+		sourceURL := item.Source.URL
+		if sourceURL == "" && src.Site != "" {
+			sourceURL = "https://" + src.Site
+		}
+
 		// Insert into pipeline — ON CONFLICT means we already know about this article
 		tag, err := s.pool.Exec(ctx, `
 			INSERT INTO official_account_articles
 				(config_id, guid, title, link, source_name, source_url, description, pub_date, status)
 			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'discovered')
 			ON CONFLICT (config_id, guid) DO NOTHING
-		`, configID, guid, item.Title, item.Link, src.Name, item.Source.URL, desc, pubDate)
+		`, configID, guid, item.Title, item.Link, src.Name, sourceURL, desc, pubDate)
 		if err != nil {
 			log.Warn().Err(err).Str("guid", guid).Msg("Failed to cache article")
 			continue
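One assumption worth noting: in Postgres, ON CONFLICT (config_id, guid) DO NOTHING only works if official_account_articles has a unique index or constraint on exactly those columns. The migration is not part of this diff, but the dedup relies on something like the following existing in the schema (hypothetical constraint name; the real migration may differ):

-- Assumed shape, not taken from this repository's migrations.
ALTER TABLE official_account_articles
    ADD CONSTRAINT official_account_articles_config_guid_key
    UNIQUE (config_id, guid);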