130 lines
3.2 KiB
Go
130 lines
3.2 KiB
Go
package reuters
|
|
|
|
import (
|
|
"context"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/tradarr/backend/internal/scraper"
|
|
)
|
|
|
|
// Reuters RSS est bloqué par Cloudflare. On utilise des flux RSS financiers
|
|
// publics fiables à la place : MarketWatch, CNBC, Seeking Alpha.
|
|
// feeds lists the public financial RSS feeds polled by this scraper.
// Each entry pairs a human-readable name (used only in log output) with
// the feed URL. All feeds are fetched on every Scrape call and their
// articles are merged and de-duplicated by URL.
var feeds = []struct {
	name string // display name used in log lines
	url  string // RSS feed endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
|
|
|
|
// Reuters is a scraper that collects financial headlines from public RSS
// feeds (MarketWatch, CNBC). Reuters' own RSS is blocked by Cloudflare, so
// substitute feeds are used while the historical "reuters" scraper name is
// kept (see Name).
type Reuters struct {
	// client is the HTTP client shared by all feed requests; New configures
	// it with a 15-second timeout.
	client *http.Client
}
|
|
|
|
func New() *Reuters {
|
|
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
|
|
}
|
|
|
|
// Name returns the scraper's stable identifier.
func (r *Reuters) Name() string {
	return "reuters"
}
|
|
|
|
// rssFeed mirrors the minimal subset of an RSS 2.0 document that this
// scraper consumes: the <channel> element and, within it, each <item>'s
// title, link, description and pubDate. All other feed fields are ignored
// by the XML decoder.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			// PubDate is kept as the raw string; parsing into time.Time
			// happens in fetchFeed, where multiple layouts are tried.
			PubDate string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
|
|
|
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
|
var articles []scraper.Article
|
|
seen := make(map[string]bool)
|
|
|
|
for i, feed := range feeds {
|
|
if i > 0 {
|
|
select {
|
|
case <-ctx.Done():
|
|
return articles, ctx.Err()
|
|
case <-time.After(300 * time.Millisecond):
|
|
}
|
|
}
|
|
items, err := r.fetchFeed(ctx, feed.url)
|
|
if err != nil {
|
|
fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
|
|
continue
|
|
}
|
|
for _, a := range items {
|
|
if !seen[a.URL] {
|
|
seen[a.URL] = true
|
|
articles = append(articles, a)
|
|
}
|
|
}
|
|
fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
|
req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
|
|
|
|
resp, err := r.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
|
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
|
}
|
|
|
|
var feed rssFeed
|
|
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
|
return nil, fmt.Errorf("parse RSS: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
for _, item := range feed.Channel.Items {
|
|
title := strings.TrimSpace(item.Title)
|
|
link := strings.TrimSpace(item.Link)
|
|
if title == "" || link == "" {
|
|
continue
|
|
}
|
|
|
|
var publishedAt *time.Time
|
|
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
|
if t, err := time.Parse(f, item.PubDate); err == nil {
|
|
publishedAt = &t
|
|
break
|
|
}
|
|
}
|
|
|
|
content := strings.TrimSpace(item.Description)
|
|
if content == "" {
|
|
content = title
|
|
}
|
|
|
|
articles = append(articles, scraper.Article{
|
|
Title: title,
|
|
Content: content,
|
|
URL: link,
|
|
PublishedAt: publishedAt,
|
|
})
|
|
}
|
|
return articles, nil
|
|
}
|