feat: add frontend + backend + database to retrieve and compute news from Yahoo

This commit is contained in:
2026-04-18 23:53:57 +02:00
parent f9b6d35c49
commit 93668273ff
84 changed files with 15431 additions and 0 deletions

View File

@ -0,0 +1,206 @@
package bloomberg
import (
"context"
"fmt"
"strings"
"time"
"github.com/chromedp/chromedp"
"github.com/tradarr/backend/internal/scraper"
)
// Bloomberg scrapes bloomberg.com news pages through a headless Chrome
// session driven by chromedp, signing in with the configured account.
type Bloomberg struct {
	username   string
	password   string
	chromePath string
}

// New returns a Bloomberg scraper bound to the given credentials and,
// optionally, an explicit Chrome binary path (empty means auto-detect).
func New(username, password, chromePath string) *Bloomberg {
	b := &Bloomberg{
		username:   username,
		password:   password,
		chromePath: chromePath,
	}
	return b
}

// Name identifies this scraper in the registry.
func (b *Bloomberg) Name() string {
	return "bloomberg"
}
// Scrape logs in to bloomberg.com via a headless Chrome session and then
// collects headline articles from a fixed list of section pages. Pages
// that fail are logged and skipped, so a partial result with a nil error
// is possible. The whole run (login + all pages) is capped at 5 minutes.
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	// A login is mandatory for this source; bail out early without credentials.
	if b.username == "" || b.password == "" {
		return nil, fmt.Errorf("bloomberg credentials not configured")
	}
	// Chrome flags: container-friendly (no-sandbox, disable-dev-shm-usage)
	// plus a reduced automation fingerprint (AutomationControlled blink
	// feature disabled, desktop window size and user agent).
	opts := []chromedp.ExecAllocatorOption{
		chromedp.NoFirstRun,
		chromedp.NoDefaultBrowserCheck,
		chromedp.Headless,
		chromedp.DisableGPU,
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("disable-setuid-sandbox", true),
		chromedp.Flag("disable-dev-shm-usage", true),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("disable-infobars", true),
		chromedp.Flag("window-size", "1920,1080"),
		chromedp.Flag("ignore-certificate-errors", true),
		chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
	}
	// Use an explicitly configured Chrome binary when one is set.
	if b.chromePath != "" {
		opts = append(opts, chromedp.ExecPath(b.chromePath))
	}
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
	defer cancelAlloc()
	chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
	defer cancelChrome()
	// Hard cap for the full session so a hung page cannot block the job.
	timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
	defer cancelTimeout()
	if err := b.login(timeoutCtx); err != nil {
		return nil, fmt.Errorf("bloomberg login: %w", err)
	}
	var articles []scraper.Article
	pages := []string{
		"https://www.bloomberg.com/markets",
		"https://www.bloomberg.com/technology",
		"https://www.bloomberg.com/economics",
	}
	for _, u := range pages {
		pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
		if err != nil {
			// Best-effort: log and continue with the remaining sections.
			fmt.Printf("bloomberg scrape %s: %v\n", u, err)
			continue
		}
		articles = append(articles, pageArticles...)
	}
	fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
	return articles, nil
}
// login walks the Bloomberg sign-in flow: inject stealth JS, fill the
// email field, submit, fill the password field, submit. Several CSS
// selectors are tried for each input because the page markup changes
// frequently; pressing Enter is the fallback when no submit button is
// found. The flow is capped at 2 minutes on top of the caller's timeout.
func (b *Bloomberg) login(ctx context.Context) error {
	loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
	defer cancel()
	// Hide automation markers via JS (best-effort: a failure here is
	// logged but does not abort the login).
	if err := chromedp.Run(loginCtx,
		chromedp.ActionFunc(func(ctx context.Context) error {
			return chromedp.Evaluate(`
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = { runtime: {} };
`, nil).Do(ctx)
		}),
	); err != nil {
		fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
	}
	err := chromedp.Run(loginCtx,
		chromedp.Navigate("https://www.bloomberg.com/account/signin"),
		chromedp.Sleep(2*time.Second),
		// Try several selectors for the email input.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{
				`input[name="email"]`,
				`input[type="email"]`,
				`input[data-type="email"]`,
				`input[placeholder*="email" i]`,
				`input[placeholder*="mail" i]`,
			}
			for _, sel := range selectors {
				var count int
				// Probe with querySelectorAll first so SendKeys only runs
				// against a selector that actually matches something.
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					fmt.Printf("bloomberg: using email selector: %s\n", sel)
					return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
				}
			}
			return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
		}),
		chromedp.Sleep(500*time.Millisecond),
		// Submit the email step.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
				}
			}
			// Fallback: press Enter.
			return chromedp.KeyEvent("\r").Do(ctx)
		}),
		chromedp.Sleep(2*time.Second),
		// Password field, again with fallback selectors.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`input[type="password"]`, `input[name="password"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					fmt.Printf("bloomberg: using password selector: %s\n", sel)
					return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
				}
			}
			return fmt.Errorf("could not find password input")
		}),
		chromedp.Sleep(500*time.Millisecond),
		// Submit the password form (Enter key as a fallback).
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
				}
			}
			return chromedp.KeyEvent("\r").Do(ctx)
		}),
		// Give the post-login redirect time to settle.
		chromedp.Sleep(3*time.Second),
	)
	return err
}
// scrapePage opens one Bloomberg section page and extracts up to 25
// unique headline links via in-page JS, then tags each headline with any
// watched symbols it mentions. PublishedAt is set to the scrape time
// because the listing page exposes no real publication date.
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
	pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
	defer cancel()
	var articleNodes []map[string]string
	err := chromedp.Run(pageCtx,
		chromedp.Navigate(pageURL),
		chromedp.Sleep(3*time.Second),
		// Collect deduplicated {title, url} pairs for article-looking links;
		// headline text is preferred over the raw anchor text, and short
		// strings (<= 20 chars) are dropped as likely navigation labels.
		chromedp.Evaluate(`
(function() {
	var items = [];
	var seen = new Set();
	var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
	links.forEach(function(a) {
		if (seen.has(a.href)) return;
		seen.add(a.href);
		var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
		var text = title ? title.innerText.trim() : a.innerText.trim();
		if (text.length > 20 && a.href.includes('bloomberg.com')) {
			items.push({title: text, url: a.href});
		}
	});
	return items.slice(0, 25);
})()
`, &articleNodes),
	)
	if err != nil {
		return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
	}
	var articles []scraper.Article
	now := time.Now()
	for _, node := range articleNodes {
		title := strings.TrimSpace(node["title"])
		url := node["url"]
		if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
			continue
		}
		syms := scraper.DetectSymbols(title, symbols)
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     title, // minimal content — the full article requires a paid subscription
			URL:         url,
			PublishedAt: &now,
			Symbols:     syms,
		})
	}
	return articles, nil
}

View File

@ -0,0 +1,50 @@
package bloomberg
import (
"context"
"fmt"
"github.com/tradarr/backend/internal/crypto"
"github.com/tradarr/backend/internal/models"
"github.com/tradarr/backend/internal/scraper"
)
// DynamicBloomberg resolves the Bloomberg credentials from the database
// before every scrape, so changes made in the admin panel take effect
// without a restart.
type DynamicBloomberg struct {
	repo       *models.Repository
	enc        *crypto.Encryptor
	chromePath string
}

// NewDynamic wires a DynamicBloomberg to the repository, the credential
// encryptor, and an optional explicit Chrome binary path.
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
	d := &DynamicBloomberg{
		repo:       repo,
		enc:        enc,
		chromePath: chromePath,
	}
	return d
}

// Name identifies this scraper in the registry.
func (d *DynamicBloomberg) Name() string {
	return "bloomberg"
}
// Scrape loads the Bloomberg credentials from the database, decrypts the
// stored password, and delegates to a freshly built Bloomberg scraper.
func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	// Resolve the Bloomberg source row. The lookup error is wrapped rather
	// than discarded so DB failures are distinguishable from a missing row.
	source, err := d.repo.GetSourceByType("bloomberg")
	if err != nil {
		return nil, fmt.Errorf("get bloomberg source: %w", err)
	}
	if source == nil {
		return nil, fmt.Errorf("bloomberg source not found")
	}
	cred, err := d.repo.GetCredentials(source.ID)
	if err != nil {
		return nil, fmt.Errorf("get bloomberg credentials: %w", err)
	}
	if cred == nil || cred.Username == "" {
		return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
	}
	// The password may legitimately be empty; only decrypt when present.
	password := ""
	if cred.PasswordEncrypted != "" {
		password, err = d.enc.Decrypt(cred.PasswordEncrypted)
		if err != nil {
			return nil, fmt.Errorf("decrypt bloomberg password: %w", err)
		}
	}
	b := New(cred.Username, password, d.chromePath)
	return b.Scrape(ctx, symbols)
}

View File

@ -0,0 +1,106 @@
package scraper
import (
"context"
"fmt"
"time"
"github.com/tradarr/backend/internal/models"
)
// Registry keeps the set of available scrapers, keyed by their Name(),
// and persists their output through the repository.
type Registry struct {
	scrapers map[string]Scraper
	repo     *models.Repository
}

// NewRegistry builds an empty Registry backed by repo.
func NewRegistry(repo *models.Repository) *Registry {
	reg := &Registry{repo: repo}
	reg.scrapers = make(map[string]Scraper)
	return reg
}

// Register adds (or replaces) a scraper under its own name.
func (r *Registry) Register(s Scraper) {
	r.scrapers[s.Name()] = s
}
// Run executes the scraper registered for the type of the source
// identified by sourceID, records the job lifecycle (created → running →
// done/error), and persists every fetched article together with its
// detected symbols. The scrape itself is capped at 5 minutes.
func (r *Registry) Run(sourceID string) error {
	sources, err := r.repo.ListSources()
	if err != nil {
		return err
	}
	var source *models.Source
	for i := range sources {
		if sources[i].ID == sourceID {
			source = &sources[i]
			break
		}
	}
	if source == nil {
		return fmt.Errorf("source %s not found", sourceID)
	}
	scrpr, ok := r.scrapers[source.Type]
	if !ok {
		return fmt.Errorf("no scraper for type %s", source.Type)
	}
	// Create the job row, then mark it as running.
	job, err := r.repo.CreateScrapeJob(sourceID)
	if err != nil {
		return err
	}
	if err := r.repo.UpdateScrapeJob(job.ID, "running", 0, ""); err != nil {
		return err
	}
	// Symbols currently watched by users; scrapers use them for tagging.
	symbols, err := r.repo.GetAllWatchedSymbols()
	if err != nil {
		return err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	articles, scrapeErr := scrpr.Scrape(ctx, symbols)
	if scrapeErr != nil {
		_ = r.repo.UpdateScrapeJob(job.ID, "error", 0, scrapeErr.Error())
		return scrapeErr
	}
	// Persist the articles. A failed upsert is logged instead of being
	// silently dropped, so data loss is visible in the logs.
	count := 0
	for _, a := range articles {
		saved, err := r.repo.UpsertArticle(sourceID, a.Title, a.Content, a.URL, a.PublishedAt)
		if err != nil {
			fmt.Printf("scraper %s: upsert article %q: %v\n", source.Name, a.Title, err)
			continue
		}
		count++
		for _, sym := range a.Symbols {
			// Best-effort symbol linking; the article itself is already saved.
			_ = r.repo.AddArticleSymbol(saved.ID, sym)
		}
	}
	return r.repo.UpdateScrapeJob(job.ID, "done", count, "")
}
// RunAll runs every enabled source sequentially. Failures of individual
// sources are logged and do not stop the remaining ones; the return
// value only reflects the initial source listing.
func (r *Registry) RunAll() error {
	sources, err := r.repo.ListSources()
	if err != nil {
		return err
	}
	for _, src := range sources {
		if !src.Enabled {
			continue
		}
		runErr := r.Run(src.ID)
		if runErr != nil {
			fmt.Printf("scraper %s error: %v\n", src.Name, runErr)
		}
	}
	return nil
}

View File

@ -0,0 +1,75 @@
package scraper
import (
"context"
"time"
"github.com/tradarr/backend/internal/models"
)
// Article is one normalized news item produced by any scraper.
type Article struct {
	Title       string     // headline text
	Content     string     // body text; may equal Title when only headlines are available
	URL         string     // link to the original item
	PublishedAt *time.Time // nil when the source provides no usable timestamp
	Symbols     []string   // watched symbols detected in this article
}

// Scraper is implemented by every news-source backend.
type Scraper interface {
	// Name returns the source-type key used by the registry.
	Name() string
	// Scrape fetches articles, tagging any of the given watched symbols.
	Scrape(ctx context.Context, symbols []string) ([]Article, error)
}
// DetectSymbols returns the watchlist symbols that occur as whole words
// in text. Matching is ASCII case-insensitive on both sides, and each
// symbol is reported at most once. Result order is unspecified (map
// iteration order).
func DetectSymbols(text string, watchlist []string) []string {
	found := map[string]bool{}
	for _, s := range watchlist {
		if containsWord(text, s) {
			found[s] = true
		}
	}
	result := make([]string, 0, len(found))
	for s := range found {
		result = append(result, s)
	}
	return result
}

// containsWord reports whether word occurs in text as a whole word
// (bounded by non-alphanumeric bytes). The comparison folds ASCII case on
// BOTH sides — previously only the text side was folded, so lowercase
// watchlist entries could never match. An empty word never matches.
func containsWord(text, word string) bool {
	t := []byte(text)
	w := []byte(word)
	if len(w) == 0 {
		return false
	}
	for i := 0; i <= len(t)-len(w); i++ {
		match := true
		for j := range w {
			if toUpperASCII(t[i+j]) != toUpperASCII(w[j]) {
				match = false
				break
			}
		}
		if match {
			// Require word boundaries so e.g. "AAPL" does not match "AAPLE".
			before := i == 0 || !isAlphaNum(t[i-1])
			after := i+len(w) >= len(t) || !isAlphaNum(t[i+len(w)])
			if before && after {
				return true
			}
		}
	}
	return false
}

// toUpperASCII uppercases a single ASCII letter, leaving other bytes as-is.
func toUpperASCII(b byte) byte {
	if b >= 'a' && b <= 'z' {
		return b - 32
	}
	return b
}

// isAlphaNum reports whether b is an ASCII letter or digit.
func isAlphaNum(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}
// ScraperResult bundles the outcome of one scraping job: the source it
// ran for, the articles it produced, and the error if it failed.
type ScraperResult struct {
	Source   *models.Source
	Articles []Article
	Err      error
}

View File

@ -0,0 +1,128 @@
package stocktwits
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// apiBase is the root of the public StockTwits v2 REST API.
const apiBase = "https://api.stocktwits.com/api/2"

// StockTwits pulls recent messages from the public StockTwits streams API.
type StockTwits struct {
	client *http.Client
}

// New builds a StockTwits scraper with a 15-second HTTP timeout.
func New() *StockTwits {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &StockTwits{client: httpClient}
}

// Name identifies this scraper in the registry.
func (s *StockTwits) Name() string {
	return "stocktwits"
}
// apiResponse mirrors the subset of the StockTwits v2 stream payload this
// scraper consumes. The API reports a status/error envelope inside the
// body even on HTTP 200, so both are decoded here.
type apiResponse struct {
	Response struct {
		Status int    `json:"status"`
		Error  string `json:"error,omitempty"`
	} `json:"response"`
	Messages []struct {
		ID        int    `json:"id"`
		Body      string `json:"body"`
		CreatedAt string `json:"created_at"`
		User      struct {
			Username string `json:"username"`
		} `json:"user"`
		Entities struct {
			// Sentiment is nil when the author tagged no bullish/bearish stance.
			Sentiment *struct {
				Basic string `json:"basic"`
			} `json:"sentiment"`
		} `json:"entities"`
	} `json:"messages"`
}
// Scrape fetches the recent message stream for every watched symbol.
// Per-symbol failures are logged and skipped; cancellation of ctx stops
// the loop early and returns what was collected so far.
func (s *StockTwits) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	for idx, sym := range symbols {
		// Throttle between requests to avoid tripping the API rate limit.
		if idx > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(500 * time.Millisecond):
			}
		}
		batch, err := s.fetchSymbol(ctx, sym)
		if err != nil {
			fmt.Printf("stocktwits %s: %v\n", sym, err)
			continue
		}
		collected = append(collected, batch...)
	}
	return collected, nil
}
// fetchSymbol downloads the recent public message stream for one symbol
// and converts each message into an Article. The message body becomes the
// article content; the synthetic title carries the symbol, the author and
// (when present) the bullish/bearish sentiment tag.
func (s *StockTwits) fetchSymbol(ctx context.Context, symbol string) ([]scraper.Article, error) {
	url := fmt.Sprintf("%s/streams/symbol/%s.json", apiBase, symbol)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	// Browser-like UA — presumably the API is stricter with default Go
	// user agents; NOTE(review): confirm this is still required.
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36")
	resp, err := s.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode == http.StatusTooManyRequests {
		return nil, fmt.Errorf("rate limited by StockTwits for %s", symbol)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("StockTwits returned HTTP %d for %s: %s", resp.StatusCode, symbol, string(body))
	}
	var data apiResponse
	if err := json.Unmarshal(body, &data); err != nil {
		return nil, fmt.Errorf("parse response for %s: %w", symbol, err)
	}
	// The StockTwits API reports errors in the body even on HTTP 200.
	if data.Response.Status != 0 && data.Response.Status != 200 {
		return nil, fmt.Errorf("StockTwits API error %d for %s: %s", data.Response.Status, symbol, data.Response.Error)
	}
	var articles []scraper.Article
	for _, msg := range data.Messages {
		if msg.Body == "" {
			continue
		}
		sentiment := ""
		if msg.Entities.Sentiment != nil {
			sentiment = " [" + msg.Entities.Sentiment.Basic + "]"
		}
		title := fmt.Sprintf("$%s — @%s%s", symbol, msg.User.Username, sentiment)
		// Leave PublishedAt nil when the timestamp cannot be parsed instead
		// of storing a pointer to the zero time (fixes bogus year-1 dates
		// that the previous ignored-error time.Parse produced).
		var publishedAt *time.Time
		if t, err := time.Parse(time.RFC3339, msg.CreatedAt); err == nil {
			publishedAt = &t
		}
		msgURL := fmt.Sprintf("https://stocktwits.com/%s/message/%d", msg.User.Username, msg.ID)
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     msg.Body,
			URL:         msgURL,
			PublishedAt: publishedAt,
			Symbols:     []string{symbol},
		})
	}
	fmt.Printf("stocktwits %s: %d messages fetched\n", symbol, len(articles))
	return articles, nil
}

View File

@ -0,0 +1,126 @@
package yahoofinance
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// YahooFinance fetches per-symbol headline RSS feeds from Yahoo Finance.
type YahooFinance struct {
	client *http.Client
}

// New builds a YahooFinance scraper with a 15-second HTTP timeout.
func New() *YahooFinance {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &YahooFinance{client: httpClient}
}

// Name deliberately reuses the "stocktwits" source type so existing DB
// rows keep working.
func (y *YahooFinance) Name() string {
	return "stocktwits"
}
// rssFeed maps the subset of the Yahoo Finance RSS 2.0 document we read.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			PubDate     string `xml:"pubDate"`
			GUID        string `xml:"guid"`
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape downloads the headline RSS feed for every watched symbol.
// Per-symbol failures are logged and skipped; cancellation of ctx aborts
// the loop and returns whatever was collected so far.
func (y *YahooFinance) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	for idx, sym := range symbols {
		// Small pause between feeds to stay polite with the endpoint.
		if idx > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		items, err := y.fetchSymbol(ctx, sym)
		if err != nil {
			fmt.Printf("yahoofinance %s: %v\n", sym, err)
			continue
		}
		collected = append(collected, items...)
		fmt.Printf("yahoofinance %s: %d articles fetched\n", sym, len(items))
	}
	return collected, nil
}
// fetchSymbol downloads and parses the Yahoo Finance headline RSS feed
// for one symbol, returning one Article per feed item. Items missing a
// title or link are skipped; an unparseable pubDate yields a nil
// PublishedAt rather than an error.
func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scraper.Article, error) {
	feedURL := fmt.Sprintf(
		"https://feeds.finance.yahoo.com/rss/2.0/headline?s=%s&region=US&lang=en-US",
		symbol,
	)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := y.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short excerpt of the body to aid debugging.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(body))
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		var publishedAt *time.Time
		if item.PubDate != "" {
			// RSS pubDate is typically RFC 1123 with a numeric zone; also
			// accept the named-zone and RFC 3339 variants. (The previous
			// list repeated RFC1123Z as a spelled-out duplicate layout.)
			formats := []string{
				time.RFC1123Z,
				time.RFC1123,
				time.RFC3339,
			}
			for _, f := range formats {
				if t, err := time.Parse(f, item.PubDate); err == nil {
					publishedAt = &t
					break
				}
			}
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title // some items ship an empty description
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
			Symbols:     []string{symbol},
		})
	}
	return articles, nil
}