feat: add frontend + backend + database to retrieve and compute news from Yahoo

2026-04-18 23:53:57 +02:00
parent f9b6d35c49
commit 93668273ff
84 changed files with 15431 additions and 0 deletions
--- a/backend/internal/scraper/bloomberg/bloomberg.go
+++ b/backend/internal/scraper/bloomberg/bloomberg.go
@ -0,0 +1,206 @@
+package bloomberg
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/chromedp/chromedp"
+	"github.com/tradarr/backend/internal/scraper"
+)
+
+// Bloomberg is a scraper that signs in to bloomberg.com through a Chrome
+// instance driven by chromedp and collects headline links.
+type Bloomberg struct {
+	// username and password are the account credentials typed into the
+	// sign-in form; Scrape refuses to run when either one is empty.
+	username, password string
+	// chromePath is an optional path to the Chrome executable. When empty,
+	// no explicit executable path is passed to chromedp.
+	chromePath string
+}
+
+// New returns a Bloomberg scraper that will sign in with the given username
+// and password. chromePath may be empty, in which case no explicit Chrome
+// executable path is configured.
+func New(username, password, chromePath string) *Bloomberg {
+	scr := Bloomberg{
+		username:   username,
+		password:   password,
+		chromePath: chromePath,
+	}
+	return &scr
+}
+
+// Name returns the identifier of this scraper, "bloomberg".
+func (b *Bloomberg) Name() string {
+	return "bloomberg"
+}
+
+// Scrape starts a headless Chrome instance, signs in to Bloomberg and
+// collects headline links from a fixed list of section pages.
+//
+// symbols is forwarded to the per-page scraper, which uses it to tag each
+// article with the symbols detected in its title.
+//
+// An error is returned when the credentials are not configured, when the
+// login sequence fails, or when none of the section pages could be scraped.
+// A failure on an individual page is logged and skipped as long as at least
+// one page succeeds. The whole browser session is bounded to five minutes.
+func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
+	if b.username == "" || b.password == "" {
+		return nil, fmt.Errorf("bloomberg credentials not configured")
+	}
+
+	opts := []chromedp.ExecAllocatorOption{
+		chromedp.NoFirstRun,
+		chromedp.NoDefaultBrowserCheck,
+		chromedp.Headless,
+		chromedp.DisableGPU,
+		chromedp.Flag("no-sandbox", true),
+		chromedp.Flag("disable-setuid-sandbox", true),
+		chromedp.Flag("disable-dev-shm-usage", true),
+		chromedp.Flag("disable-blink-features", "AutomationControlled"),
+		chromedp.Flag("disable-infobars", true),
+		chromedp.Flag("window-size", "1920,1080"),
+		// NOTE(review): this turns off TLS certificate validation for a
+		// session that submits account credentials — confirm it is really
+		// required before keeping it.
+		chromedp.Flag("ignore-certificate-errors", true),
+		chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
+	}
+	if b.chromePath != "" {
+		opts = append(opts, chromedp.ExecPath(b.chromePath))
+	}
+
+	allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
+	defer cancelAlloc()
+
+	chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
+	defer cancelChrome()
+
+	timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
+	defer cancelTimeout()
+
+	if err := b.login(timeoutCtx); err != nil {
+		return nil, fmt.Errorf("bloomberg login: %w", err)
+	}
+
+	var (
+		articles  []scraper.Article
+		succeeded int   // number of pages scraped without error
+		lastErr   error // most recent page failure, reported if nothing succeeded
+	)
+	pages := []string{
+		"https://www.bloomberg.com/markets",
+		"https://www.bloomberg.com/technology",
+		"https://www.bloomberg.com/economics",
+	}
+	for _, u := range pages {
+		// Once the session context is done every remaining page would fail
+		// the same way, so stop instead of attempting each of them.
+		if err := timeoutCtx.Err(); err != nil {
+			fmt.Printf("bloomberg scrape %s: %v\n", u, err)
+			lastErr = err
+			break
+		}
+		pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
+		if err != nil {
+			fmt.Printf("bloomberg scrape %s: %v\n", u, err)
+			lastErr = err
+			continue
+		}
+		succeeded++
+		articles = append(articles, pageArticles...)
+	}
+	// Distinguish "every page failed" from "pages loaded but had no news" so
+	// the caller does not mistake a broken scraper for an empty result.
+	if succeeded == 0 {
+		return nil, fmt.Errorf("bloomberg: no page could be scraped: %w", lastErr)
+	}
+	fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
+	return articles, nil
+}
+
+// login drives the Bloomberg sign-in form in the browser attached to ctx:
+// it opens the sign-in page, types the username, submits, types the
+// password and submits again. The whole sequence is bounded to two minutes.
+//
+// It returns an error when navigation fails, when no known email or password
+// input can be found, or when the context ends. It does not verify that the
+// account was actually authenticated afterwards.
+func (b *Bloomberg) login(ctx context.Context) error {
+	loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+	defer cancel()
+
+	// Hide automation detection via JS. This is best effort: a failure is
+	// logged and the login continues.
+	// NOTE(review): this runs before the navigation below, so the override
+	// likely does not survive the page load — confirm; a script registered
+	// for new documents would be needed to make it stick.
+	if err := chromedp.Run(loginCtx,
+		chromedp.ActionFunc(func(ctx context.Context) error {
+			return chromedp.Evaluate(`
+				Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+				window.chrome = { runtime: {} };
+			`, nil).Do(ctx)
+		}),
+	); err != nil {
+		fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
+	}
+
+	return chromedp.Run(loginCtx,
+		chromedp.Navigate("https://www.bloomberg.com/account/signin"),
+		chromedp.Sleep(2*time.Second),
+		// Try several selectors for the email field.
+		fillFirstInput("email", []string{
+			`input[name="email"]`,
+			`input[type="email"]`,
+			`input[data-type="email"]`,
+			`input[placeholder*="email" i]`,
+			`input[placeholder*="mail" i]`,
+		}, b.username, "could not find email input — Bloomberg login page structure may have changed"),
+		chromedp.Sleep(500*time.Millisecond),
+		// Submit the email step.
+		submitOrPressEnter([]string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}),
+		chromedp.Sleep(2*time.Second),
+		// Password step.
+		fillFirstInput("password", []string{`input[type="password"]`, `input[name="password"]`},
+			b.password, "could not find password input"),
+		chromedp.Sleep(500*time.Millisecond),
+		submitOrPressEnter([]string{`button[type="submit"]`, `input[type="submit"]`}),
+		chromedp.Sleep(3*time.Second),
+	)
+}
+
+// firstPresentSelector returns the first selector in selectors for which
+// document.querySelectorAll reports at least one element in the current
+// page. Evaluation errors are treated as "not present". The selectors are
+// interpolated into a single-quoted JS string and therefore must not contain
+// single quotes.
+func firstPresentSelector(ctx context.Context, selectors []string) (string, bool) {
+	for _, sel := range selectors {
+		var count int
+		js := fmt.Sprintf(`document.querySelectorAll('%s').length`, sel)
+		if err := chromedp.Evaluate(js, &count).Do(ctx); err == nil && count > 0 {
+			return sel, true
+		}
+	}
+	return "", false
+}
+
+// fillFirstInput returns an action that types value into the first input
+// matched by selectors. kind only labels the log line; notFoundMsg is the
+// error text returned when none of the selectors matches.
+func fillFirstInput(kind string, selectors []string, value, notFoundMsg string) chromedp.ActionFunc {
+	return func(ctx context.Context) error {
+		sel, ok := firstPresentSelector(ctx, selectors)
+		if !ok {
+			// Probing swallows evaluation errors, so report a cancelled or
+			// expired context as such instead of blaming the page layout.
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			return fmt.Errorf("%s", notFoundMsg)
+		}
+		fmt.Printf("bloomberg: using %s selector: %s\n", kind, sel)
+		return chromedp.SendKeys(sel, value, chromedp.ByQuery).Do(ctx)
+	}
+}
+
+// submitOrPressEnter returns an action that clicks the first submit control
+// matched by selectors, or sends an Enter key event when none is present.
+func submitOrPressEnter(selectors []string) chromedp.ActionFunc {
+	return func(ctx context.Context) error {
+		if sel, ok := firstPresentSelector(ctx, selectors); ok {
+			return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
+		}
+		// Fallback: press Enter.
+		return chromedp.KeyEvent("\r").Do(ctx)
+	}
+}
+
+// scrapePage loads pageURL in the browser attached to ctx, waits three
+// seconds, and extracts up to 25 distinct article links with an in-page
+// script. The page visit is bounded to 60 seconds.
+//
+// Each returned article uses the link text as both Title and Content, is
+// tagged with the result of scraper.DetectSymbols(title, symbols), and gets
+// the time of the scrape as PublishedAt (the page's real publication time is
+// not extracted). An error is returned when the browser actions fail.
+func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
+	pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
+	defer cancel()
+
+	var articleNodes []map[string]string
+	err := chromedp.Run(pageCtx,
+		chromedp.Navigate(pageURL),
+		chromedp.Sleep(3*time.Second),
+		chromedp.Evaluate(`
+			(function() {
+				var items = [];
+				var seen = new Set();
+				var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
+				links.forEach(function(a) {
+					if (seen.has(a.href)) return;
+					seen.add(a.href);
+					var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
+					var text = title ? title.innerText.trim() : a.innerText.trim();
+					if (text.length > 20 && a.href.includes('bloomberg.com')) {
+						items.push({title: text, url: a.href});
+					}
+				});
+				return items.slice(0, 25);
+			})()
+		`, &articleNodes),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
+	}
+
+	var articles []scraper.Article
+	now := time.Now()
+	for _, node := range articleNodes {
+		title := strings.TrimSpace(node["title"])
+		link := node["url"]
+		if title == "" || link == "" || !strings.Contains(link, "bloomberg.com") {
+			continue
+		}
+		// Give every article its own time.Time so that a write through one
+		// article's PublishedAt pointer cannot change the others.
+		publishedAt := now
+		articles = append(articles, scraper.Article{
+			Title:       title,
+			Content:     title, // minimal content — the full article requires paid access
+			URL:         link,
+			PublishedAt: &publishedAt,
+			Symbols:     scraper.DetectSymbols(title, symbols),
+		})
+	}
+	return articles, nil
+}