130 lines
3.2 KiB
Go
130 lines
3.2 KiB
Go
package reuters
|
|
|
|
import (
|
|
"context"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/tradarr/backend/internal/scraper"
|
|
)
|
|
|
|
// Reuters RSS est bloqué par Cloudflare. On utilise des flux RSS financiers
|
|
// publics fiables à la place : MarketWatch, CNBC, Seeking Alpha.
|
|
// feeds lists the public financial RSS feeds polled by this scraper.
// Each entry pairs a human-readable name (used only in log output) with
// the feed URL. All feeds are fetched on every Scrape call and their
// articles are merged and de-duplicated by URL.
var feeds = []struct {
	name string // display name used in log lines
	url  string // RSS feed endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
|
|
|
|
// Reuters is a scraper that collects financial headlines from public RSS
// feeds (MarketWatch, CNBC). Reuters' own RSS is blocked by Cloudflare, so
// substitute feeds are used while the historical "reuters" scraper name is
// kept (see Name).
type Reuters struct {
	// client is the HTTP client shared by all feed requests; New configures
	// it with a 15-second timeout.
	client *http.Client
}
|
|
|
|
func New() *Reuters {
|
|
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
|
|
}
|
|
|
|
// Name returns the scraper's stable identifier.
func (r *Reuters) Name() string {
	return "reuters"
}
|
|
|
|
// rssFeed mirrors the minimal subset of an RSS 2.0 document that this
// scraper consumes: the <channel> element and, within it, each <item>'s
// title, link, description and pubDate. All other feed fields are ignored
// by the XML decoder.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			// PubDate is kept as the raw string; parsing into time.Time
			// happens in fetchFeed, where multiple layouts are tried.
			PubDate string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
|
|
|
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
|
var articles []scraper.Article
|
|
seen := make(map[string]bool)
|
|
|
|
for i, feed := range feeds {
|
|
if i > 0 {
|
|
select {
|
|
case <-ctx.Done():
|
|
return articles, ctx.Err()
|
|
case <-time.After(300 * time.Millisecond):
|
|
}
|
|
}
|
|
items, err := r.fetchFeed(ctx, feed.url)
|
|
if err != nil {
|
|
fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
|
|
continue
|
|
}
|
|
for _, a := range items {
|
|
if !seen[a.URL] {
|
|
seen[a.URL] = true
|
|
articles = append(articles, a)
|
|
}
|
|
}
|
|
fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
|
req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
|
|
|
|
resp, err := r.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
|
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
|
}
|
|
|
|
var feed rssFeed
|
|
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
|
return nil, fmt.Errorf("parse RSS: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
for _, item := range feed.Channel.Items {
|
|
title := strings.TrimSpace(item.Title)
|
|
link := strings.TrimSpace(item.Link)
|
|
if title == "" || link == "" {
|
|
continue
|
|
}
|
|
|
|
var publishedAt *time.Time
|
|
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
|
if t, err := time.Parse(f, item.PubDate); err == nil {
|
|
publishedAt = &t
|
|
break
|
|
}
|
|
}
|
|
|
|
content := strings.TrimSpace(item.Description)
|
|
if content == "" {
|
|
content = title
|
|
}
|
|
|
|
articles = append(articles, scraper.Article{
|
|
Title: title,
|
|
Content: content,
|
|
URL: link,
|
|
PublishedAt: publishedAt,
|
|
})
|
|
}
|
|
return articles, nil
|
|
}
|