feat: add news sources and split the AI analysis into 2 steps to limit the number of news items
This commit is contained in:
129
backend/internal/scraper/reuters/reuters.go
Normal file
129
backend/internal/scraper/reuters/reuters.go
Normal file
@ -0,0 +1,129 @@
|
||||
package reuters
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// Reuters' own RSS is blocked by Cloudflare, so we fall back to reliable
// public financial RSS feeds instead: MarketWatch and CNBC.
var feeds = []struct {
	name string // human-readable label, used only in log output
	url  string // RSS feed endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
|
||||
|
||||
// Reuters scrapes financial news headlines from public RSS feeds. Despite
// the name, it does not hit Reuters directly (their RSS is Cloudflare-blocked);
// the actual sources are listed in the package-level feeds variable.
type Reuters struct {
	client *http.Client // shared HTTP client with a per-request timeout
}
|
||||
|
||||
func New() *Reuters {
|
||||
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
|
||||
}
|
||||
|
||||
func (r *Reuters) Name() string { return "reuters" }
|
||||
|
||||
// rssFeed mirrors the minimal subset of the RSS 2.0 document structure we
// need: a <channel> holding repeated <item> elements.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			// PubDate is kept as a raw string; its format varies by
			// feed and is parsed best-effort downstream.
			PubDate string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for i, feed := range feeds {
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return articles, ctx.Err()
|
||||
case <-time.After(300 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
items, err := r.fetchFeed(ctx, feed.url)
|
||||
if err != nil {
|
||||
fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
|
||||
continue
|
||||
}
|
||||
for _, a := range items {
|
||||
if !seen[a.URL] {
|
||||
seen[a.URL] = true
|
||||
articles = append(articles, a)
|
||||
}
|
||||
}
|
||||
fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
|
||||
|
||||
resp, err := r.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
content := strings.TrimSpace(item.Description)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
Reference in New Issue
Block a user