feat: add news sources and split the AI analysis into 2 steps to limit the number of news articles

2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions
--- a/backend/internal/scraper/bloomberg/bloomberg.go
+++ b/backend/internal/scraper/bloomberg/bloomberg.go
@ -1,206 +1,94 @@
 package bloomberg

 import (
+	"bytes"
 	"context"
+	"encoding/json"
 	"fmt"
+	"io"
+	"net/http"
 	"strings"
 	"time"

-	"github.com/chromedp/chromedp"
 	"github.com/tradarr/backend/internal/scraper"
 )

+// Bloomberg fetches Bloomberg articles by calling an external scraper service
+// over HTTP. It holds the service base URL and the HTTP client used for the
+// calls.
 type Bloomberg struct {
-	username   string
-	password   string
-	chromePath string
+	scraperURL string
+	client     *http.Client
 }

-func New(username, password, chromePath string) *Bloomberg {
-	return &Bloomberg{username: username, password: password, chromePath: chromePath}
+// New returns a Bloomberg scraper that sends its requests to the scraper
+// service at scraperURL. An empty scraperURL falls back to
+// "http://scraper:3001". The HTTP client is created with a 10-minute timeout.
+func New(scraperURL string) *Bloomberg {
+	if scraperURL == "" {
+		scraperURL = "http://scraper:3001"
+	}
+	return &Bloomberg{
+		scraperURL: scraperURL,
+		client:     &http.Client{Timeout: 10 * time.Minute},
+	}
 }

+// Name returns the identifier of this scraper, "bloomberg".
 func (b *Bloomberg) Name() string { return "bloomberg" }

-func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
-	if b.username == "" || b.password == "" {
-		return nil, fmt.Errorf("bloomberg credentials not configured")
-	}
-
-	opts := []chromedp.ExecAllocatorOption{
-		chromedp.NoFirstRun,
-		chromedp.NoDefaultBrowserCheck,
-		chromedp.Headless,
-		chromedp.DisableGPU,
-		chromedp.Flag("no-sandbox", true),
-		chromedp.Flag("disable-setuid-sandbox", true),
-		chromedp.Flag("disable-dev-shm-usage", true),
-		chromedp.Flag("disable-blink-features", "AutomationControlled"),
-		chromedp.Flag("disable-infobars", true),
-		chromedp.Flag("window-size", "1920,1080"),
-		chromedp.Flag("ignore-certificate-errors", true),
-		chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
-	}
-	if b.chromePath != "" {
-		opts = append(opts, chromedp.ExecPath(b.chromePath))
-	}
-
-	allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
-	defer cancelAlloc()
-
-	chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
-	defer cancelChrome()
-
-	timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
-	defer cancelTimeout()
-
-	if err := b.login(timeoutCtx); err != nil {
-		return nil, fmt.Errorf("bloomberg login: %w", err)
-	}
-
-	var articles []scraper.Article
-	pages := []string{
-		"https://www.bloomberg.com/markets",
-		"https://www.bloomberg.com/technology",
-		"https://www.bloomberg.com/economics",
-	}
-	for _, u := range pages {
-		pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
-		if err != nil {
-			fmt.Printf("bloomberg scrape %s: %v\n", u, err)
-			continue
-		}
-		articles = append(articles, pageArticles...)
-	}
-	fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
-	return articles, nil
+// scraperRequest is the JSON body posted to the scraper service. It carries
+// the Bloomberg account credentials.
+type scraperRequest struct {
+	Username string `json:"username"`
+	Password string `json:"password"`
 }

-func (b *Bloomberg) login(ctx context.Context) error {
-	loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
-	defer cancel()
-
-	// Masquer la détection d'automation via JS
-	if err := chromedp.Run(loginCtx,
-		chromedp.ActionFunc(func(ctx context.Context) error {
-			return chromedp.Evaluate(`
-				Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
-				window.chrome = { runtime: {} };
-			`, nil).Do(ctx)
-		}),
-	); err != nil {
-		fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
-	}
-
-	err := chromedp.Run(loginCtx,
-		chromedp.Navigate("https://www.bloomberg.com/account/signin"),
-		chromedp.Sleep(2*time.Second),
-		// Essayer plusieurs sélecteurs pour l'email
-		chromedp.ActionFunc(func(ctx context.Context) error {
-			selectors := []string{
-				`input[name="email"]`,
-				`input[type="email"]`,
-				`input[data-type="email"]`,
-				`input[placeholder*="email" i]`,
-				`input[placeholder*="mail" i]`,
-			}
-			for _, sel := range selectors {
-				var count int
-				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
-					fmt.Printf("bloomberg: using email selector: %s\n", sel)
-					return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
-				}
-			}
-			return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
-		}),
-		chromedp.Sleep(500*time.Millisecond),
-		// Submit email
-		chromedp.ActionFunc(func(ctx context.Context) error {
-			selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
-			for _, sel := range selectors {
-				var count int
-				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
-					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
-				}
-			}
-			// Fallback: press Enter
-			return chromedp.KeyEvent("\r").Do(ctx)
-		}),
-		chromedp.Sleep(2*time.Second),
-		// Password
-		chromedp.ActionFunc(func(ctx context.Context) error {
-			selectors := []string{`input[type="password"]`, `input[name="password"]`}
-			for _, sel := range selectors {
-				var count int
-				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
-					fmt.Printf("bloomberg: using password selector: %s\n", sel)
-					return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
-				}
-			}
-			return fmt.Errorf("could not find password input")
-		}),
-		chromedp.Sleep(500*time.Millisecond),
-		chromedp.ActionFunc(func(ctx context.Context) error {
-			selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
-			for _, sel := range selectors {
-				var count int
-				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
-					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
-				}
-			}
-			return chromedp.KeyEvent("\r").Do(ctx)
-		}),
-		chromedp.Sleep(3*time.Second),
-	)
-	return err
+// scraperArticle is one article entry in the scraper service response: a
+// title and a URL.
+type scraperArticle struct {
+	Title string `json:"title"`
+	URL   string `json:"url"`
 }

-func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
-	pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
-	defer cancel()
+// scraperResponse is the JSON document decoded from the scraper service
+// reply: a list of articles and an optional error message.
+type scraperResponse struct {
+	Articles []scraperArticle `json:"articles"`
+	Error    string           `json:"error,omitempty"`
+}

-	var articleNodes []map[string]string
-	err := chromedp.Run(pageCtx,
-		chromedp.Navigate(pageURL),
-		chromedp.Sleep(3*time.Second),
-		chromedp.Evaluate(`
-			(function() {
-				var items = [];
-				var seen = new Set();
-				var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
-				links.forEach(function(a) {
-					if (seen.has(a.href)) return;
-					seen.add(a.href);
-					var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
-					var text = title ? title.innerText.trim() : a.innerText.trim();
-					if (text.length > 20 && a.href.includes('bloomberg.com')) {
-						items.push({title: text, url: a.href});
-					}
-				});
-				return items.slice(0, 25);
-			})()
-		`, &articleNodes),
-	)
+// ScrapeWithCredentials posts the given Bloomberg credentials to the scraper
+// service endpoint "/bloomberg/scrape" and converts the returned entries into
+// scraper.Article values.
+//
+// Entries with an empty title or URL are dropped. Symbols are detected from
+// each title with scraper.DetectSymbols against symbols. An error is returned
+// when the request cannot be built or sent, when the service answers with a
+// non-200 status, when the body cannot be read or decoded, or when the
+// response carries a non-empty error field.
+func (b *Bloomberg) ScrapeWithCredentials(ctx context.Context, username, password string, symbols []string) ([]scraper.Article, error) {
+	payload, err := json.Marshal(scraperRequest{Username: username, Password: password})
+	if err != nil {
+		return nil, fmt.Errorf("encode scraper request: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, b.scraperURL+"/bloomberg/scrape", bytes.NewReader(payload))
 	if err != nil {
-		return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
+		return nil, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := b.client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("scraper service unreachable: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// The status is reported first so that a failing service is still
+	// identified by its HTTP code even when its body could not be read fully.
+	body, readErr := io.ReadAll(resp.Body)
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("scraper service HTTP %d: %s", resp.StatusCode, body)
+	}
+	if readErr != nil {
+		return nil, fmt.Errorf("read scraper response: %w", readErr)
+	}
+
+	var result scraperResponse
+	if err := json.Unmarshal(body, &result); err != nil {
+		return nil, fmt.Errorf("parse scraper response: %w", err)
+	}
+	if result.Error != "" {
+		return nil, fmt.Errorf("bloomberg: %s", result.Error)
 	}

-	var articles []scraper.Article
 	now := time.Now()
-	for _, node := range articleNodes {
-		title := strings.TrimSpace(node["title"])
-		url := node["url"]
-		if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
+	var articles []scraper.Article
+	for _, a := range result.Articles {
+		title := strings.TrimSpace(a.Title)
+		url := a.URL
+		if title == "" || url == "" {
 			continue
 		}
 		syms := scraper.DetectSymbols(title, symbols)
 		articles = append(articles, scraper.Article{
 			Title:       title,
-			Content:     title, // contenu minimal — l'article complet nécessite un accès payant
+			Content:     title, // the service returns only a title and a URL, so the title doubles as content
 			URL:         url,
 			PublishedAt: &now,
 			Symbols:     syms,
 		})
 	}
+	fmt.Printf("bloomberg: %d articles fetched\n", len(articles))
 	return articles, nil
 }
--- a/backend/internal/scraper/bloomberg/dynamic.go
+++ b/backend/internal/scraper/bloomberg/dynamic.go
@ -9,21 +9,19 @@ import (
 	"github.com/tradarr/backend/internal/scraper"
 )

-// DynamicBloomberg charge les credentials depuis la DB avant chaque scraping
+// DynamicBloomberg is a Bloomberg scraper that looks up its credentials
+// through the repository on every Scrape call and then delegates to the
+// scraper service at scraperURL.
 type DynamicBloomberg struct {
 	repo       *models.Repository
 	enc        *crypto.Encryptor
-	chromePath string
+	scraperURL string
 }

-func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
-	return &DynamicBloomberg{repo: repo, enc: enc, chromePath: chromePath}
+// NewDynamic returns a DynamicBloomberg that uses repo, enc and scraperURL
+// as given; no validation is performed here.
+func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, scraperURL string) *DynamicBloomberg {
+	return &DynamicBloomberg{repo: repo, enc: enc, scraperURL: scraperURL}
 }

+// Name returns the identifier of this scraper, "bloomberg".
 func (d *DynamicBloomberg) Name() string { return "bloomberg" }

 func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
-	// Récupérer la source Bloomberg
 	source, err := d.repo.GetSourceByType("bloomberg")
 	if err != nil || source == nil {
 		return nil, fmt.Errorf("bloomberg source not found")
@ -34,7 +32,7 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
 		return nil, fmt.Errorf("get bloomberg credentials: %w", err)
 	}
 	if cred == nil || cred.Username == "" {
-		return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
+		return nil, fmt.Errorf("bloomberg credentials not configured — configure them in the admin panel")
 	}

 	password := ""
@ -45,6 +43,6 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
 		}
 	}

-	b := New(cred.Username, password, d.chromePath)
-	return b.Scrape(ctx, symbols)
+	b := New(d.scraperURL)
+	return b.ScrapeWithCredentials(ctx, cred.Username, password, symbols)
 }
--- a/backend/internal/scraper/reuters/reuters.go
+++ b/backend/internal/scraper/reuters/reuters.go
@ -0,0 +1,129 @@
+package reuters
+
+import (
+	"context"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/tradarr/backend/internal/scraper"
+)
+
+// Reuters RSS is blocked by Cloudflare, so reliable public financial RSS
+// feeds are used instead. The list below currently contains MarketWatch and
+// CNBC feeds.
+var feeds = []struct {
+	name string
+	url  string
+}{
+	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
+	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
+	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
+	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
+}
+
+// Reuters is the scraper registered under the name "reuters". It reads the
+// RSS feeds listed in feeds through its HTTP client.
+type Reuters struct {
+	client *http.Client
+}
+
+// New returns a Reuters scraper whose HTTP client has a 15-second timeout.
+func New() *Reuters {
+	httpClient := &http.Client{Timeout: 15 * time.Second}
+	return &Reuters{client: httpClient}
+}
+
+// Name returns the identifier of this scraper, "reuters".
+func (r *Reuters) Name() string {
+	return "reuters"
+}
+
+// rssFeed holds the parts of an RSS document that are decoded: the items of
+// the channel with their title, link, description and pubDate elements.
+type rssFeed struct {
+	Channel struct {
+		Items []struct {
+			Title       string `xml:"title"`
+			Link        string `xml:"link"`
+			Description string `xml:"description"`
+			PubDate     string `xml:"pubDate"`
+		} `xml:"item"`
+	} `xml:"channel"`
+}
+
+// Scrape fetches every feed in feeds in order and returns the articles found,
+// keeping only the first article seen for each URL. The symbols argument is
+// ignored.
+//
+// A feed that fails is logged and skipped. A 300 ms pause separates
+// consecutive feeds; if ctx ends during a pause, the articles collected so
+// far are returned together with ctx.Err().
+func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
+	var (
+		collected []scraper.Article
+		known     = make(map[string]bool)
+	)
+
+	for idx := range feeds {
+		src := feeds[idx]
+
+		// No pause is needed before the very first feed.
+		if idx != 0 {
+			select {
+			case <-time.After(300 * time.Millisecond):
+			case <-ctx.Done():
+				return collected, ctx.Err()
+			}
+		}
+
+		fetched, err := r.fetchFeed(ctx, src.url)
+		if err != nil {
+			fmt.Printf("reuters/financial %s: %v\n", src.name, err)
+			continue
+		}
+		for _, art := range fetched {
+			if known[art.URL] {
+				continue
+			}
+			known[art.URL] = true
+			collected = append(collected, art)
+		}
+		fmt.Printf("reuters/financial %s: %d articles\n", src.name, len(fetched))
+	}
+	return collected, nil
+}
+
+// fetchFeed downloads the RSS document at feedURL and converts its items into
+// articles.
+//
+// Items with an empty title or link are skipped. When an item has no
+// description, its title is used as content. PublishedAt is left nil when the
+// pubDate matches none of the accepted layouts. A non-200 response yields an
+// error that includes up to 256 bytes of the response body.
+func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
+	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
+
+	resp, err := r.client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 256)) // best effort: the snippet only enriches the error
+		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
+	}
+
+	var feed rssFeed
+	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
+		return nil, fmt.Errorf("parse RSS: %w", err)
+	}
+
+	// The single-digit-day layouts also accept two-digit days; they cover
+	// feeds that do not zero-pad the day of the month.
+	layouts := []string{
+		time.RFC1123Z,
+		time.RFC1123,
+		"Mon, 2 Jan 2006 15:04:05 -0700",
+		"Mon, 2 Jan 2006 15:04:05 MST",
+	}
+
+	var articles []scraper.Article
+	for _, item := range feed.Channel.Items {
+		title := strings.TrimSpace(item.Title)
+		link := strings.TrimSpace(item.Link)
+		if title == "" || link == "" {
+			continue
+		}
+
+		// Element text may carry surrounding whitespace, which time.Parse
+		// rejects, so the date is trimmed like the other fields.
+		pubDate := strings.TrimSpace(item.PubDate)
+		var publishedAt *time.Time
+		for _, layout := range layouts {
+			if t, err := time.Parse(layout, pubDate); err == nil {
+				publishedAt = &t
+				break
+			}
+		}
+
+		content := strings.TrimSpace(item.Description)
+		if content == "" {
+			content = title
+		}
+
+		articles = append(articles, scraper.Article{
+			Title:       title,
+			Content:     content,
+			URL:         link,
+			PublishedAt: publishedAt,
+		})
+	}
+	return articles, nil
+}
--- a/backend/internal/scraper/watcherguru/watcherguru.go
+++ b/backend/internal/scraper/watcherguru/watcherguru.go
@ -0,0 +1,200 @@
+package watcherguru
+
+import (
+	"context"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"golang.org/x/net/html"
+
+	"github.com/tradarr/backend/internal/scraper"
+)
+
+// baseURL is the site root that feed and page paths are appended to.
+const baseURL = "https://watcher.guru"
+
+// WatcherGuru is the scraper registered under the name "watcherguru". It
+// issues its requests through the embedded HTTP client.
+type WatcherGuru struct {
+	client *http.Client
+}
+
+// New returns a WatcherGuru scraper whose HTTP client has a 15-second timeout.
+func New() *WatcherGuru {
+	httpClient := &http.Client{Timeout: 15 * time.Second}
+	return &WatcherGuru{client: httpClient}
+}
+
+// Name returns the identifier of this scraper, "watcherguru".
+func (w *WatcherGuru) Name() string {
+	return "watcherguru"
+}
+
+// rssFeed holds the parts of an RSS document that are decoded: the items of
+// the channel with their title, link, pubDate and description elements.
+type rssFeed struct {
+	Channel struct {
+		Items []struct {
+			Title   string `xml:"title"`
+			Link    string `xml:"link"`
+			PubDate string `xml:"pubDate"`
+			Desc    string `xml:"description"`
+		} `xml:"item"`
+	} `xml:"channel"`
+}
+
+// Scrape returns Watcher Guru articles. It tries the "/feed/" and
+// "/news/feed/" RSS endpoints in that order and returns the first non-empty
+// result; RSS errors are ignored. When neither feed yields articles, it
+// falls back to scraping the HTML news page, and an error from that step is
+// returned wrapped with the "watcherguru" prefix. The symbols argument is
+// ignored.
+func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
+	// RSS is preferred: the first candidate feed that yields articles wins.
+	candidates := [...]string{baseURL + "/feed/", baseURL + "/news/feed/"}
+	for i := range candidates {
+		fromRSS, rssErr := w.fetchRSS(ctx, candidates[i])
+		if rssErr != nil || len(fromRSS) == 0 {
+			continue
+		}
+		fmt.Printf("watcherguru rss: %d articles\n", len(fromRSS))
+		return fromRSS, nil
+	}
+
+	// No feed produced anything, so scrape the HTML page instead.
+	fromHTML, htmlErr := w.scrapeHTML(ctx)
+	if htmlErr != nil {
+		return nil, fmt.Errorf("watcherguru: %w", htmlErr)
+	}
+	fmt.Printf("watcherguru html: %d articles\n", len(fromHTML))
+	return fromHTML, nil
+}
+
+// fetchRSS downloads the RSS document at feedURL and converts its items into
+// articles.
+//
+// Items with an empty title or link are skipped. When an item has no
+// description, its title is used as content. PublishedAt is left nil when the
+// pubDate matches none of the accepted layouts. A non-200 response yields an
+// error carrying the status code.
+func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
+
+	resp, err := w.client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
+	}
+
+	var feed rssFeed
+	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
+		return nil, fmt.Errorf("parse RSS: %w", err)
+	}
+
+	// The single-digit-day layouts also accept two-digit days; they cover
+	// feeds that do not zero-pad the day of the month.
+	layouts := []string{
+		time.RFC1123Z,
+		time.RFC1123,
+		"Mon, 2 Jan 2006 15:04:05 -0700",
+		"Mon, 2 Jan 2006 15:04:05 MST",
+	}
+
+	var articles []scraper.Article
+	for _, item := range feed.Channel.Items {
+		title := strings.TrimSpace(item.Title)
+		link := strings.TrimSpace(item.Link)
+		if title == "" || link == "" {
+			continue
+		}
+
+		// Element text may carry surrounding whitespace, which time.Parse
+		// rejects, so the date is trimmed like the other fields.
+		pubDate := strings.TrimSpace(item.PubDate)
+		var publishedAt *time.Time
+		for _, layout := range layouts {
+			if t, err := time.Parse(layout, pubDate); err == nil {
+				publishedAt = &t
+				break
+			}
+		}
+
+		content := strings.TrimSpace(item.Desc)
+		if content == "" {
+			content = title
+		}
+		articles = append(articles, scraper.Article{
+			Title:       title,
+			Content:     content,
+			URL:         link,
+			PublishedAt: publishedAt,
+		})
+	}
+	return articles, nil
+}
+
+// scrapeHTML downloads the "/news/" page and collects anchors that look like
+// news articles.
+//
+// An anchor is kept when its href contains "/news/" or "watcher.guru" and its
+// trimmed text is longer than 20 bytes. Hrefs that do not start with "http"
+// are prefixed with baseURL, each resulting URL is kept once, and at most 40
+// articles are returned. The page exposes no date here, so PublishedAt is
+// set to the time of the scrape. A non-200 response yields an error that
+// includes up to 512 bytes of the response body.
+func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
+	req.Header.Set("Accept", "text/html,application/xhtml+xml")
+
+	resp, err := w.client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) // best effort: the snippet only enriches the error
+		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
+	}
+
+	doc, err := html.Parse(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("parse HTML: %w", err)
+	}
+
+	var articles []scraper.Article
+	seen := make(map[string]bool)
+	now := time.Now()
+
+	var walk func(*html.Node)
+	walk = func(n *html.Node) {
+		if n == nil {
+			return
+		}
+		if n.Type == html.ElementNode && n.Data == "a" {
+			href := attrVal(n, "href")
+			// Collect links that look like news articles.
+			if href != "" && (strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru")) {
+				text := strings.TrimSpace(nodeText(n))
+				if len(text) > 20 {
+					url := href
+					if !strings.HasPrefix(url, "http") {
+						url = baseURL + url
+					}
+					if !seen[url] {
+						seen[url] = true
+						articles = append(articles, scraper.Article{
+							Title:       text,
+							Content:     text,
+							URL:         url,
+							PublishedAt: &now,
+						})
+					}
+				}
+			}
+		}
+		// Every node, anchors included, has all of its children visited; a
+		// childless node simply ends the recursion.
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walk(c)
+		}
+	}
+	walk(doc)
+
+	if len(articles) > 40 {
+		articles = articles[:40]
+	}
+	return articles, nil
+}
+
+// attrVal returns the value of the first attribute of n named key, or the
+// empty string when n has no such attribute.
+func attrVal(n *html.Node, key string) string {
+	for i := range n.Attr {
+		if attr := n.Attr[i]; attr.Key == key {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// nodeText returns the concatenated data of every text node in the subtree
+// rooted at n, in document order.
+func nodeText(n *html.Node) string {
+	if n.Type != html.TextNode {
+		var text strings.Builder
+		child := n.FirstChild
+		for child != nil {
+			text.WriteString(nodeText(child))
+			child = child.NextSibling
+		}
+		return text.String()
+	}
+	return n.Data
+}
--- a/backend/internal/scraper/yahoofinance/yahoofinance.go
+++ b/backend/internal/scraper/yahoofinance/yahoofinance.go
@ -86,8 +86,13 @@ func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scrape
 		return nil, fmt.Errorf("parse RSS: %w", err)
 	}

+	const maxPerSymbol = 5
+
 	var articles []scraper.Article
 	for _, item := range feed.Channel.Items {
+		if len(articles) >= maxPerSymbol {
+			break
+		}
 		title := strings.TrimSpace(item.Title)
 		link := strings.TrimSpace(item.Link)
 		if title == "" || link == "" {