diff --git a/go-backend/internal/services/official_accounts_service.go b/go-backend/internal/services/official_accounts_service.go
index 76a038e..fd5f679 100644
--- a/go-backend/internal/services/official_accounts_service.go
+++ b/go-backend/internal/services/official_accounts_service.go
@@ -2,7 +2,6 @@ package services
 
 import (
 	"context"
-	"encoding/base64"
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -44,9 +43,9 @@ type OfficialAccountConfig struct {
 	AvatarURL string `json:"avatar_url,omitempty"`
 }
 
-// NewsSource represents a single RSS feed configuration.
-// If Site is set, the Google News RSS URL is auto-constructed.
-// If RSSURL is set directly, it's used as-is (legacy/fallback).
+// NewsSource represents a single news feed configuration.
+// If Site is set, SearXNG is used to find news articles for that site.
+// If RSSURL is set directly, the RSS feed is fetched as-is.
 type NewsSource struct {
 	Name    string `json:"name"`
 	Site    string `json:"site,omitempty"`
@@ -54,16 +53,32 @@ type NewsSource struct {
 	Enabled bool   `json:"enabled"`
 }
 
-// GoogleNewsRSSURL builds a Google News RSS search URL for the given site domain.
-func GoogleNewsRSSURL(site string) string {
-	return fmt.Sprintf("https://news.google.com/rss/search?q=site:%s&hl=en-US&gl=US&ceid=US:en", site)
+// searxngBaseURL is the local SearXNG metasearch endpoint (Docker instance). NOTE(review): hard-coded — consider making this configurable via env/config so non-local deployments work.
+const searxngBaseURL = "http://localhost:8888"
+
+// SearXNGResponse represents the JSON response from SearXNG's /search endpoint.
+type SearXNGResponse struct {
+	Results []SearXNGResult `json:"results"`
 }
 
-// EffectiveRSSURL returns the RSS URL to fetch — Google News if Site is set, otherwise RSSURL.
+// SearXNGResult represents a single search result returned by SearXNG.
+type SearXNGResult struct {
+	URL           string `json:"url"`
+	Title         string `json:"title"`
+	Content       string `json:"content"`
+	PublishedDate string `json:"publishedDate"`
+	Thumbnail     string `json:"thumbnail"`
+	Engine        string `json:"engine"`
+	Category      string `json:"category"`
+}
+
+// UseSearXNG reports whether this source is fetched via SearXNG (i.e. a Site is configured).
+func (ns *NewsSource) UseSearXNG() bool {
+	return ns.Site != ""
+}
+
+// EffectiveRSSURL returns the direct RSS URL if set, empty string otherwise.
 func (ns *NewsSource) EffectiveRSSURL() string {
-	if ns.Site != "" {
-		return GoogleNewsRSSURL(ns.Site)
-	}
 	return ns.RSSURL
 }
@@ -237,8 +252,61 @@ func (s *OfficialAccountsService) ToggleEnabled(ctx context.Context, id string,
 	return err
 }
 
-// ── RSS News Fetching ────────────────────────────────
+// ── News Fetching (SearXNG + RSS) ───────────────────
 
+// FetchSearXNGNews queries the local SearXNG instance for news about a site,
+// returning results as RSSItems so the rest of the pipeline is source-agnostic.
+func (s *OfficialAccountsService) FetchSearXNGNews(ctx context.Context, site string) ([]RSSItem, error) {
+	searchURL := fmt.Sprintf("%s/search?q=site:%s&categories=news&format=json&language=en", searxngBaseURL, site) // NOTE(review): site is interpolated unescaped — fine for bare domains, but use url.QueryEscape if sites may contain reserved characters
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("building searxng request for site %s: %w", site, err)
+	}
+
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("searxng request for site %s: %w", site, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("searxng returned status %d for site %s", resp.StatusCode, site)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("reading searxng response for site %s: %w", site, err)
+	}
+
+	var sxResp SearXNGResponse
+	if err := json.Unmarshal(body, &sxResp); err != nil {
+		return nil, fmt.Errorf("parsing searxng response: %w", err)
+	}
+
+	// Convert SearXNG results to RSSItems, skipping entries missing a URL or title.
+	items := make([]RSSItem, 0, len(sxResp.Results))
+	for _, r := range sxResp.Results {
+		if r.URL == "" || r.Title == "" {
+			continue
+		}
+		item := RSSItem{
+			Title:       r.Title,
+			Link:        r.URL,
+			Description: r.Content,
+			GUID:        r.URL, // use the actual article URL as GUID for dedup
+		}
+		if r.PublishedDate != "" {
+			item.PubDate = r.PublishedDate
+		}
+		items = append(items, item)
+	}
+
+	log.Debug().Int("results", len(items)).Str("site", site).Msg("[SearXNG] Fetched news articles")
+	return items, nil
+}
+
+// FetchRSS fetches and parses a standard RSS/XML feed.
 func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) ([]RSSItem, error) {
 	req, err := http.NewRequestWithContext(ctx, "GET", rssURL, nil)
 	if err != nil {
@@ -266,131 +334,9 @@ func (s *OfficialAccountsService) FetchRSS(ctx context.Context, rssURL string) (
 		return nil, fmt.Errorf("failed to parse RSS from %s: %w", rssURL, err)
 	}
 
-	// If items come from Google News, resolve redirect links to actual article URLs
-	isGoogleNews := strings.Contains(rssURL, "news.google.com/rss")
-	if isGoogleNews {
-		for i := range feed.Channel.Items {
-			item := &feed.Channel.Items[i]
-			// Preserve original Google News URL in GUID for dedup
-			if item.GUID == "" {
-				item.GUID = item.Link
-			}
-			resolved := ResolveGoogleNewsURL(item.Link)
-			if resolved != item.Link && resolved != "" {
-				// Base64 decode succeeded — use real article URL
-				item.Link = resolved
-			} else if item.Source.URL != "" {
-				// Fall back to the from the RSS item
-				item.Link = item.Source.URL
-				log.Debug().Str("source", item.Source.Name).Str("url", item.Source.URL).Msg("Using RSS source URL as fallback for Google News link")
-			}
-		}
-	}
-
 	return feed.Channel.Items, nil
 }
 
-// resolveGoogleNewsLink follows the Google News redirect chain to get the actual article URL.
-func (s *OfficialAccountsService) resolveGoogleNewsLink(googleURL string) string {
-	return ResolveGoogleNewsURL(googleURL)
-}
-
-// ResolveGoogleNewsURL is a package-level helper that resolves a Google News URL
-// to the underlying article URL by decoding the base64-encoded article ID.
-// This is instant (no network request) — Google News embeds the real URL in the article ID.
-func ResolveGoogleNewsURL(googleURL string) string {
-	if googleURL == "" || !strings.Contains(googleURL, "news.google.com") {
-		return googleURL
-	}
-
-	// Extract the article ID from URLs like:
-	// https://news.google.com/rss/articles/CBMisgFBVV95cUxQ...?oc=5
-	articleID := ""
-	if idx := strings.Index(googleURL, "/articles/"); idx != -1 {
-		articleID = googleURL[idx+len("/articles/"):]
-	} else if idx := strings.Index(googleURL, "/read/"); idx != -1 {
-		articleID = googleURL[idx+len("/read/"):]
-	}
-	if articleID == "" {
-		return googleURL
-	}
-
-	// Strip query params
-	if qIdx := strings.Index(articleID, "?"); qIdx != -1 {
-		articleID = articleID[:qIdx]
-	}
-
-	// Base64url decode the article ID
-	decoded, err := base64DecodeGNews(articleID)
-	if err != nil {
-		log.Debug().Err(err).Str("url", googleURL).Msg("Failed to base64-decode Google News article ID")
-		return googleURL
-	}
-
-	// The decoded protobuf contains the article URL as an embedded string.
-	// Scan for "http" to find the URL within the decoded bytes.
-	resolved := extractURLFromBytes(decoded)
-	if resolved != "" {
-		log.Debug().Str("resolved", resolved).Str("original", googleURL).Msg("Resolved Google News link via base64 decode")
-		return resolved
-	}
-
-	log.Debug().Str("url", googleURL).Msg("Could not extract URL from Google News article ID")
-	return googleURL
-}
-
-// base64DecodeGNews decodes a Google News article ID which uses base64url encoding
-// with optional padding.
-func base64DecodeGNews(s string) ([]byte, error) {
-	// Replace URL-safe chars
-	s = strings.ReplaceAll(s, "-", "+")
-	s = strings.ReplaceAll(s, "_", "/")
-	// Add padding if needed
-	switch len(s) % 4 {
-	case 2:
-		s += "=="
-	case 3:
-		s += "="
-	}
-	return base64.StdEncoding.DecodeString(s)
-}
-
-// extractURLFromBytes scans decoded protobuf bytes for an embedded HTTP(S) URL.
-func extractURLFromBytes(data []byte) string {
-	s := string(data)
-	// Find "http://" or "https://" in the decoded data
-	for _, prefix := range []string{"https://", "http://"} {
-		idx := strings.Index(s, prefix)
-		if idx == -1 {
-			continue
-		}
-		// Extract the URL — it ends at the first non-URL byte
-		urlStart := idx
-		urlEnd := urlStart
-		for urlEnd < len(s) {
-			c := s[urlEnd]
-			// URL-safe characters
-			if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
-				c == ':' || c == '/' || c == '.' || c == '-' || c == '_' || c == '~' ||
-				c == '?' || c == '&' || c == '=' || c == '%' || c == '#' || c == '+' ||
-				c == '@' || c == '!' || c == '$' || c == '(' || c == ')' || c == ',' || c == ';' {
-				urlEnd++
-			} else {
-				break
-			}
-		}
-		candidate := s[urlStart:urlEnd]
-		// Must be a real URL, not a Google News internal URL
-		if strings.Contains(candidate, "news.google.com") || strings.Contains(candidate, "consent.google") {
-			continue
-		}
-		if len(candidate) > 20 { // Minimum viable URL length
-			return candidate
-		}
-	}
-	return ""
-}
-
 // ── Article Pipeline ─────────────────────────────────
 
 // DiscoverArticles fetches RSS feeds and caches all new articles in the DB as 'discovered'.
@@ -408,13 +354,22 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 	newCount := 0
 
 	for _, src := range sources {
-		rssURL := src.EffectiveRSSURL()
-		if !src.Enabled || rssURL == "" {
+		if !src.Enabled {
 			continue
 		}
-		items, err := s.FetchRSS(ctx, rssURL)
-		if err != nil {
-			log.Warn().Err(err).Str("source", src.Name).Msg("Failed to fetch RSS feed")
+
+		// Fetch articles: SearXNG for site-based sources, direct RSS for explicit feed URLs.
+		var items []RSSItem
+		var fetchErr error
+		if src.UseSearXNG() {
+			items, fetchErr = s.FetchSearXNGNews(ctx, src.Site)
+		} else if rssURL := src.EffectiveRSSURL(); rssURL != "" {
+			items, fetchErr = s.FetchRSS(ctx, rssURL)
+		} else {
+			continue
+		}
+		if fetchErr != nil {
+			log.Warn().Err(fetchErr).Str("source", src.Name).Msg("Failed to fetch articles")
 			continue
 		}
 
@@ -427,13 +382,16 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 			continue
 		}
 
-		// Parse pub date
+		// Parse pub date — try multiple layouts; first match wins.
 		var pubDate *time.Time
 		if item.PubDate != "" {
 			for _, layout := range []string{
 				time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822,
 				"Mon, 2 Jan 2006 15:04:05 -0700", "2006-01-02T15:04:05Z",
+				"2006-01-02T15:04:05", // SearXNG format (no zone — time.Parse treats it as UTC)
+				"2006-01-02 15:04:05", // SearXNG alt format (no zone — parsed as UTC)
+				time.RFC3339, // ISO-8601 with explicit offset
 			} {
 				if t, err := time.Parse(layout, item.PubDate); err == nil {
 					pubDate = &t
@@ -448,13 +406,19 @@ func (s *OfficialAccountsService) DiscoverArticles(ctx context.Context, configID
 			desc = desc[:1000]
 		}
 
+		// Source URL: prefer the RSS <source> element; otherwise derive it from the configured site domain.
+		sourceURL := item.Source.URL
+		if sourceURL == "" && src.Site != "" {
+			sourceURL = "https://" + src.Site
+		}
+
 		// Insert into pipeline — ON CONFLICT means we already know about this article
 		tag, err := s.pool.Exec(ctx, `
 			INSERT INTO official_account_articles (config_id, guid, title, link, source_name, source_url, description, pub_date, status)
 			VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'discovered')
 			ON CONFLICT (config_id, guid) DO NOTHING
-		`, configID, guid, item.Title, item.Link, src.Name, item.Source.URL, desc, pubDate)
+		`, configID, guid, item.Title, item.Link, src.Name, sourceURL, desc, pubDate)
 		if err != nil {
 			log.Warn().Err(err).Str("guid", guid).Msg("Failed to cache article")
 			continue