// Package reuters scrapes financial news headlines.
//
// Reuters' own RSS feeds are blocked by Cloudflare, so this scraper pulls
// from reliable public financial RSS feeds instead: MarketWatch and CNBC.
package reuters

import (
	"context"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/tradarr/backend/internal/scraper"
)

// feeds lists the public RSS endpoints polled by Scrape.
var feeds = []struct {
	name string
	url  string
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}

// pubDateFormats are the timestamp layouts tried, in order, when parsing an
// item's <pubDate>. RFC1123Z already covers the numeric-zone form
// ("Mon, 02 Jan 2006 15:04:05 -0700"); the last entry accepts the
// single-digit-day variant some feeds emit. Hoisted to package level so the
// slice is not rebuilt for every item.
var pubDateFormats = []string{
	time.RFC1123Z,
	time.RFC1123,
	"Mon, 2 Jan 2006 15:04:05 -0700",
}

// Reuters fetches articles from the public financial RSS feeds above.
type Reuters struct {
	client *http.Client
}

// New returns a Reuters scraper with a 15-second HTTP client timeout.
func New() *Reuters {
	return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
}

// Name returns the scraper's identifier.
func (r *Reuters) Name() string { return "reuters" }

// rssFeed mirrors the subset of an RSS 2.0 document this scraper reads.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			PubDate     string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}

// Scrape fetches every configured feed, deduplicates articles by URL, and
// returns the combined list. Individual feed failures are logged and skipped
// rather than aborting the run. The topics argument is ignored because the
// feed set is fixed.
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	var articles []scraper.Article
	seen := make(map[string]bool)
	for i, feed := range feeds {
		// Brief pause between feeds to stay polite to the servers; abort
		// promptly if the context is cancelled while waiting.
		if i > 0 {
			select {
			case <-ctx.Done():
				return articles, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		items, err := r.fetchFeed(ctx, feed.url)
		if err != nil {
			fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
			continue
		}
		for _, a := range items {
			if !seen[a.URL] {
				seen[a.URL] = true
				articles = append(articles, a)
			}
		}
		fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
	}
	return articles, nil
}

// fetchFeed downloads one RSS feed and converts its items into articles.
// Items missing a title or link are skipped; an unparseable pubDate yields a
// nil PublishedAt rather than an error.
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short body snippet to aid debugging blocked/erroring feeds.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue // malformed item
		}
		var publishedAt *time.Time
		for _, f := range pubDateFormats {
			if t, err := time.Parse(f, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title // fall back to the headline when there is no summary
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}