feat: add sources to retrieve news and divide the AI reflections into 2 steps to limit the number of news items

This commit is contained in:
2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions

View File

@ -0,0 +1,129 @@
package reuters
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// The real Reuters RSS feed is blocked by Cloudflare, so we scrape reliable
// public financial RSS feeds instead (MarketWatch and CNBC).
//
// feeds lists those fallback sources in the order Scrape visits them.
var feeds = []struct {
	name string // human-readable label, used only in log output
	url  string // RSS feed endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
// Reuters is a scraper for financial news headlines. The name is historical:
// the original Reuters feed is Cloudflare-blocked, so it actually pulls from
// the public feeds listed in the package-level feeds variable.
type Reuters struct {
	client *http.Client // shared HTTP client carrying the request timeout
}
// New returns a Reuters scraper whose HTTP client enforces a 15-second
// per-request timeout.
func New() *Reuters {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &Reuters{client: httpClient}
}
// Name reports the identifier under which this scraper is registered.
func (r *Reuters) Name() string {
	return "reuters"
}
// rssFeed mirrors the subset of the RSS 2.0 document we consume:
// <rss><channel><item> elements with title, link, description and pubDate.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			PubDate     string `xml:"pubDate"` // raw date text; parsed later against several layouts
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape fetches every configured RSS feed and returns the combined articles,
// de-duplicated by URL. A failing feed is logged and skipped rather than
// aborting the run. If the context is cancelled between feeds, the articles
// collected so far are returned together with ctx.Err(). The symbols
// argument is ignored: these feeds cannot be filtered by ticker.
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	byURL := make(map[string]bool)
	for idx, src := range feeds {
		// Short pause between feeds to stay polite with the providers.
		if idx > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		fetched, err := r.fetchFeed(ctx, src.url)
		if err != nil {
			fmt.Printf("reuters/financial %s: %v\n", src.name, err)
			continue
		}
		for _, art := range fetched {
			if byURL[art.URL] {
				continue
			}
			byURL[art.URL] = true
			collected = append(collected, art)
		}
		fmt.Printf("reuters/financial %s: %d articles\n", src.name, len(fetched))
	}
	return collected, nil
}
// pubDateLayouts are the timestamp layouts tried, in order, when parsing an
// RSS <pubDate>. RFC1123Z and RFC1123 cover the standard RSS 2.0 forms; the
// RFC822 variants cover legacy feeds that use two-digit years. (The original
// list contained "Mon, 02 Jan 2006 15:04:05 -0700", which is byte-identical
// to time.RFC1123Z and therefore dead — it has been replaced.)
var pubDateLayouts = []string{time.RFC1123Z, time.RFC1123, time.RFC822Z, time.RFC822}

// parsePubDate converts an RSS pubDate string into a *time.Time. It returns
// nil when the string is empty or matches none of the known layouts, in
// which case the article simply has no publication date.
func parsePubDate(raw string) *time.Time {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return nil
	}
	for _, layout := range pubDateLayouts {
		if t, err := time.Parse(layout, raw); err == nil {
			return &t
		}
	}
	return nil
}

// fetchFeed downloads and parses one RSS feed, returning one Article per
// <item> that has both a non-empty title and link. Items without a
// description fall back to the title as content. A non-200 response is
// returned as an error that includes a short excerpt of the response body.
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	// Some feed hosts reject requests that lack a browser-like User-Agent.
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Keep a short excerpt for diagnostics without reading an
		// unbounded error page into memory.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue // skip malformed items
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title // fall back to the headline when there is no summary
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: parsePubDate(item.PubDate),
		})
	}
	return articles, nil
}