feat: add news sources and split the AI analysis into 2 steps to limit the number of news items
This commit is contained in:
129
backend/internal/scraper/reuters/reuters.go
Normal file
129
backend/internal/scraper/reuters/reuters.go
Normal file
@ -0,0 +1,129 @@
|
||||
package reuters
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// Reuters' own RSS is blocked by Cloudflare, so we fall back to reliable
// public financial RSS feeds instead: MarketWatch and CNBC.
var feeds = []struct {
	name string // human-readable label, used only in log output
	url  string // RSS feed endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
|
||||
|
||||
// Reuters scrapes financial news headlines from public RSS feeds. Despite
// the name, it does not hit Reuters directly (their RSS is Cloudflare-blocked);
// the actual sources are listed in the package-level feeds variable.
type Reuters struct {
	client *http.Client // shared HTTP client with a per-request timeout
}
|
||||
|
||||
func New() *Reuters {
|
||||
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
|
||||
}
|
||||
|
||||
func (r *Reuters) Name() string { return "reuters" }
|
||||
|
||||
// rssFeed mirrors the minimal subset of the RSS 2.0 document structure we
// need: a <channel> holding repeated <item> elements.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			// PubDate is kept as a raw string; its format varies by
			// feed and is parsed best-effort downstream.
			PubDate string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for i, feed := range feeds {
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return articles, ctx.Err()
|
||||
case <-time.After(300 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
items, err := r.fetchFeed(ctx, feed.url)
|
||||
if err != nil {
|
||||
fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
|
||||
continue
|
||||
}
|
||||
for _, a := range items {
|
||||
if !seen[a.URL] {
|
||||
seen[a.URL] = true
|
||||
articles = append(articles, a)
|
||||
}
|
||||
}
|
||||
fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
|
||||
|
||||
resp, err := r.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
content := strings.TrimSpace(item.Description)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
Reference in New Issue
Block a user