feat: add frontend + backend + database to retrieve and compute news from Yahoo

This commit is contained in:
2026-04-18 23:53:57 +02:00
parent f9b6d35c49
commit 93668273ff
84 changed files with 15431 additions and 0 deletions

View File

@ -0,0 +1,206 @@
package bloomberg
import (
"context"
"fmt"
"strings"
"time"
"github.com/chromedp/chromedp"
"github.com/tradarr/backend/internal/scraper"
)
// Bloomberg scrapes bloomberg.com news pages through a headless Chrome
// session driven by chromedp, signing in with the configured account.
type Bloomberg struct {
	username   string
	password   string
	chromePath string
}

// New returns a Bloomberg scraper bound to the given credentials and,
// optionally, an explicit Chrome binary path (empty means auto-detect).
func New(username, password, chromePath string) *Bloomberg {
	b := &Bloomberg{
		username:   username,
		password:   password,
		chromePath: chromePath,
	}
	return b
}

// Name identifies this scraper in the registry.
func (b *Bloomberg) Name() string {
	return "bloomberg"
}
// Scrape logs in to bloomberg.com via a headless Chrome session and then
// collects headline articles from a fixed list of section pages. Pages
// that fail are logged and skipped, so a partial result with a nil error
// is possible. The whole run (login + all pages) is capped at 5 minutes.
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	// A login is mandatory for this source; bail out early without credentials.
	if b.username == "" || b.password == "" {
		return nil, fmt.Errorf("bloomberg credentials not configured")
	}
	// Chrome flags: container-friendly (no-sandbox, disable-dev-shm-usage)
	// plus a reduced automation fingerprint (AutomationControlled blink
	// feature disabled, desktop window size and user agent).
	opts := []chromedp.ExecAllocatorOption{
		chromedp.NoFirstRun,
		chromedp.NoDefaultBrowserCheck,
		chromedp.Headless,
		chromedp.DisableGPU,
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("disable-setuid-sandbox", true),
		chromedp.Flag("disable-dev-shm-usage", true),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("disable-infobars", true),
		chromedp.Flag("window-size", "1920,1080"),
		chromedp.Flag("ignore-certificate-errors", true),
		chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
	}
	// Use an explicitly configured Chrome binary when one is set.
	if b.chromePath != "" {
		opts = append(opts, chromedp.ExecPath(b.chromePath))
	}
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
	defer cancelAlloc()
	chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
	defer cancelChrome()
	// Hard cap for the full session so a hung page cannot block the job.
	timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
	defer cancelTimeout()
	if err := b.login(timeoutCtx); err != nil {
		return nil, fmt.Errorf("bloomberg login: %w", err)
	}
	var articles []scraper.Article
	pages := []string{
		"https://www.bloomberg.com/markets",
		"https://www.bloomberg.com/technology",
		"https://www.bloomberg.com/economics",
	}
	for _, u := range pages {
		pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
		if err != nil {
			// Best-effort: log and continue with the remaining sections.
			fmt.Printf("bloomberg scrape %s: %v\n", u, err)
			continue
		}
		articles = append(articles, pageArticles...)
	}
	fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
	return articles, nil
}
// login walks the Bloomberg sign-in flow: inject stealth JS, fill the
// email field, submit, fill the password field, submit. Several CSS
// selectors are tried for each input because the page markup changes
// frequently; pressing Enter is the fallback when no submit button is
// found. The flow is capped at 2 minutes on top of the caller's timeout.
func (b *Bloomberg) login(ctx context.Context) error {
	loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
	defer cancel()
	// Hide automation markers via JS (best-effort: a failure here is
	// logged but does not abort the login).
	if err := chromedp.Run(loginCtx,
		chromedp.ActionFunc(func(ctx context.Context) error {
			return chromedp.Evaluate(`
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = { runtime: {} };
`, nil).Do(ctx)
		}),
	); err != nil {
		fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
	}
	err := chromedp.Run(loginCtx,
		chromedp.Navigate("https://www.bloomberg.com/account/signin"),
		chromedp.Sleep(2*time.Second),
		// Try several selectors for the email input.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{
				`input[name="email"]`,
				`input[type="email"]`,
				`input[data-type="email"]`,
				`input[placeholder*="email" i]`,
				`input[placeholder*="mail" i]`,
			}
			for _, sel := range selectors {
				var count int
				// Probe with querySelectorAll first so SendKeys only runs
				// against a selector that actually matches something.
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					fmt.Printf("bloomberg: using email selector: %s\n", sel)
					return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
				}
			}
			return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
		}),
		chromedp.Sleep(500*time.Millisecond),
		// Submit the email step.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
				}
			}
			// Fallback: press Enter.
			return chromedp.KeyEvent("\r").Do(ctx)
		}),
		chromedp.Sleep(2*time.Second),
		// Password field, again with fallback selectors.
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`input[type="password"]`, `input[name="password"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					fmt.Printf("bloomberg: using password selector: %s\n", sel)
					return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
				}
			}
			return fmt.Errorf("could not find password input")
		}),
		chromedp.Sleep(500*time.Millisecond),
		// Submit the password form (Enter key as a fallback).
		chromedp.ActionFunc(func(ctx context.Context) error {
			selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
			for _, sel := range selectors {
				var count int
				if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
					return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
				}
			}
			return chromedp.KeyEvent("\r").Do(ctx)
		}),
		// Give the post-login redirect time to settle.
		chromedp.Sleep(3*time.Second),
	)
	return err
}
// scrapePage opens one Bloomberg section page and extracts up to 25
// unique headline links via in-page JS, then tags each headline with any
// watched symbols it mentions. PublishedAt is set to the scrape time
// because the listing page exposes no real publication date.
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
	pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
	defer cancel()
	var articleNodes []map[string]string
	err := chromedp.Run(pageCtx,
		chromedp.Navigate(pageURL),
		chromedp.Sleep(3*time.Second),
		// Collect deduplicated {title, url} pairs for article-looking links;
		// headline text is preferred over the raw anchor text, and short
		// strings (<= 20 chars) are dropped as likely navigation labels.
		chromedp.Evaluate(`
(function() {
	var items = [];
	var seen = new Set();
	var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
	links.forEach(function(a) {
		if (seen.has(a.href)) return;
		seen.add(a.href);
		var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
		var text = title ? title.innerText.trim() : a.innerText.trim();
		if (text.length > 20 && a.href.includes('bloomberg.com')) {
			items.push({title: text, url: a.href});
		}
	});
	return items.slice(0, 25);
})()
`, &articleNodes),
	)
	if err != nil {
		return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
	}
	var articles []scraper.Article
	now := time.Now()
	for _, node := range articleNodes {
		title := strings.TrimSpace(node["title"])
		url := node["url"]
		if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
			continue
		}
		syms := scraper.DetectSymbols(title, symbols)
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     title, // minimal content — the full article requires a paid subscription
			URL:         url,
			PublishedAt: &now,
			Symbols:     syms,
		})
	}
	return articles, nil
}

View File

@ -0,0 +1,50 @@
package bloomberg
import (
"context"
"fmt"
"github.com/tradarr/backend/internal/crypto"
"github.com/tradarr/backend/internal/models"
"github.com/tradarr/backend/internal/scraper"
)
// DynamicBloomberg resolves the Bloomberg credentials from the database
// before every scrape, so changes made in the admin panel take effect
// without a restart.
type DynamicBloomberg struct {
	repo       *models.Repository
	enc        *crypto.Encryptor
	chromePath string
}

// NewDynamic wires a DynamicBloomberg to the repository, the credential
// encryptor, and an optional explicit Chrome binary path.
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
	d := &DynamicBloomberg{
		repo:       repo,
		enc:        enc,
		chromePath: chromePath,
	}
	return d
}

// Name identifies this scraper in the registry.
func (d *DynamicBloomberg) Name() string {
	return "bloomberg"
}
// Scrape loads the Bloomberg credentials from the database, decrypts the
// stored password, and delegates to a freshly built Bloomberg scraper.
func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	// Resolve the Bloomberg source row. The lookup error is wrapped rather
	// than discarded so DB failures are distinguishable from a missing row.
	source, err := d.repo.GetSourceByType("bloomberg")
	if err != nil {
		return nil, fmt.Errorf("get bloomberg source: %w", err)
	}
	if source == nil {
		return nil, fmt.Errorf("bloomberg source not found")
	}
	cred, err := d.repo.GetCredentials(source.ID)
	if err != nil {
		return nil, fmt.Errorf("get bloomberg credentials: %w", err)
	}
	if cred == nil || cred.Username == "" {
		return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
	}
	// The password may legitimately be empty; only decrypt when present.
	password := ""
	if cred.PasswordEncrypted != "" {
		password, err = d.enc.Decrypt(cred.PasswordEncrypted)
		if err != nil {
			return nil, fmt.Errorf("decrypt bloomberg password: %w", err)
		}
	}
	b := New(cred.Username, password, d.chromePath)
	return b.Scrape(ctx, symbols)
}

View File

@ -0,0 +1,106 @@
package scraper
import (
"context"
"fmt"
"time"
"github.com/tradarr/backend/internal/models"
)
// Registry keeps the set of available scrapers, keyed by their Name(),
// and persists their output through the repository.
type Registry struct {
	scrapers map[string]Scraper
	repo     *models.Repository
}

// NewRegistry builds an empty Registry backed by repo.
func NewRegistry(repo *models.Repository) *Registry {
	reg := &Registry{repo: repo}
	reg.scrapers = make(map[string]Scraper)
	return reg
}

// Register adds (or replaces) a scraper under its own name.
func (r *Registry) Register(s Scraper) {
	r.scrapers[s.Name()] = s
}
// Run executes the scraper registered for the type of the source
// identified by sourceID, records the job lifecycle (created → running →
// done/error), and persists every fetched article together with its
// detected symbols. The scrape itself is capped at 5 minutes.
func (r *Registry) Run(sourceID string) error {
	sources, err := r.repo.ListSources()
	if err != nil {
		return err
	}
	var source *models.Source
	for i := range sources {
		if sources[i].ID == sourceID {
			source = &sources[i]
			break
		}
	}
	if source == nil {
		return fmt.Errorf("source %s not found", sourceID)
	}
	scrpr, ok := r.scrapers[source.Type]
	if !ok {
		return fmt.Errorf("no scraper for type %s", source.Type)
	}
	// Create the job row, then mark it as running.
	job, err := r.repo.CreateScrapeJob(sourceID)
	if err != nil {
		return err
	}
	if err := r.repo.UpdateScrapeJob(job.ID, "running", 0, ""); err != nil {
		return err
	}
	// Symbols currently watched by users; scrapers use them for tagging.
	symbols, err := r.repo.GetAllWatchedSymbols()
	if err != nil {
		return err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	articles, scrapeErr := scrpr.Scrape(ctx, symbols)
	if scrapeErr != nil {
		_ = r.repo.UpdateScrapeJob(job.ID, "error", 0, scrapeErr.Error())
		return scrapeErr
	}
	// Persist the articles. A failed upsert is logged instead of being
	// silently dropped, so data loss is visible in the logs.
	count := 0
	for _, a := range articles {
		saved, err := r.repo.UpsertArticle(sourceID, a.Title, a.Content, a.URL, a.PublishedAt)
		if err != nil {
			fmt.Printf("scraper %s: upsert article %q: %v\n", source.Name, a.Title, err)
			continue
		}
		count++
		for _, sym := range a.Symbols {
			// Best-effort symbol linking; the article itself is already saved.
			_ = r.repo.AddArticleSymbol(saved.ID, sym)
		}
	}
	return r.repo.UpdateScrapeJob(job.ID, "done", count, "")
}
// RunAll runs every enabled source sequentially. Failures of individual
// sources are logged and do not stop the remaining ones; the return
// value only reflects the initial source listing.
func (r *Registry) RunAll() error {
	sources, err := r.repo.ListSources()
	if err != nil {
		return err
	}
	for _, src := range sources {
		if !src.Enabled {
			continue
		}
		runErr := r.Run(src.ID)
		if runErr != nil {
			fmt.Printf("scraper %s error: %v\n", src.Name, runErr)
		}
	}
	return nil
}

View File

@ -0,0 +1,75 @@
package scraper
import (
"context"
"time"
"github.com/tradarr/backend/internal/models"
)
// Article is one normalized news item produced by any scraper.
type Article struct {
	Title       string     // headline text
	Content     string     // body text; may equal Title when only headlines are available
	URL         string     // link to the original item
	PublishedAt *time.Time // nil when the source provides no usable timestamp
	Symbols     []string   // watched symbols detected in this article
}

// Scraper is implemented by every news-source backend.
type Scraper interface {
	// Name returns the source-type key used by the registry.
	Name() string
	// Scrape fetches articles, tagging any of the given watched symbols.
	Scrape(ctx context.Context, symbols []string) ([]Article, error)
}
// DetectSymbols returns the watchlist symbols that occur as whole words
// in text. Matching is ASCII case-insensitive on both sides, and each
// symbol is reported at most once. Result order is unspecified (map
// iteration order).
func DetectSymbols(text string, watchlist []string) []string {
	found := map[string]bool{}
	for _, s := range watchlist {
		if containsWord(text, s) {
			found[s] = true
		}
	}
	result := make([]string, 0, len(found))
	for s := range found {
		result = append(result, s)
	}
	return result
}

// containsWord reports whether word occurs in text as a whole word
// (bounded by non-alphanumeric bytes). The comparison folds ASCII case on
// BOTH sides — previously only the text side was folded, so lowercase
// watchlist entries could never match. An empty word never matches.
func containsWord(text, word string) bool {
	t := []byte(text)
	w := []byte(word)
	if len(w) == 0 {
		return false
	}
	for i := 0; i <= len(t)-len(w); i++ {
		match := true
		for j := range w {
			if toUpperASCII(t[i+j]) != toUpperASCII(w[j]) {
				match = false
				break
			}
		}
		if match {
			// Require word boundaries so e.g. "AAPL" does not match "AAPLE".
			before := i == 0 || !isAlphaNum(t[i-1])
			after := i+len(w) >= len(t) || !isAlphaNum(t[i+len(w)])
			if before && after {
				return true
			}
		}
	}
	return false
}

// toUpperASCII uppercases a single ASCII letter, leaving other bytes as-is.
func toUpperASCII(b byte) byte {
	if b >= 'a' && b <= 'z' {
		return b - 32
	}
	return b
}

// isAlphaNum reports whether b is an ASCII letter or digit.
func isAlphaNum(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}
// ScraperResult bundles the outcome of one scraping job: the source it
// ran for, the articles it produced, and the error if it failed.
type ScraperResult struct {
	Source   *models.Source
	Articles []Article
	Err      error
}

View File

@ -0,0 +1,128 @@
package stocktwits
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// apiBase is the root of the public StockTwits v2 REST API.
const apiBase = "https://api.stocktwits.com/api/2"

// StockTwits pulls recent messages from the public StockTwits streams API.
type StockTwits struct {
	client *http.Client
}

// New builds a StockTwits scraper with a 15-second HTTP timeout.
func New() *StockTwits {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &StockTwits{client: httpClient}
}

// Name identifies this scraper in the registry.
func (s *StockTwits) Name() string {
	return "stocktwits"
}
// apiResponse mirrors the subset of the StockTwits v2 stream payload this
// scraper consumes. The API reports a status/error envelope inside the
// body even on HTTP 200, so both are decoded here.
type apiResponse struct {
	Response struct {
		Status int    `json:"status"`
		Error  string `json:"error,omitempty"`
	} `json:"response"`
	Messages []struct {
		ID        int    `json:"id"`
		Body      string `json:"body"`
		CreatedAt string `json:"created_at"`
		User      struct {
			Username string `json:"username"`
		} `json:"user"`
		Entities struct {
			// Sentiment is nil when the author tagged no bullish/bearish stance.
			Sentiment *struct {
				Basic string `json:"basic"`
			} `json:"sentiment"`
		} `json:"entities"`
	} `json:"messages"`
}
// Scrape fetches the recent message stream for every watched symbol.
// Per-symbol failures are logged and skipped; cancellation of ctx stops
// the loop early and returns what was collected so far.
func (s *StockTwits) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	for idx, sym := range symbols {
		// Throttle between requests to avoid tripping the API rate limit.
		if idx > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(500 * time.Millisecond):
			}
		}
		batch, err := s.fetchSymbol(ctx, sym)
		if err != nil {
			fmt.Printf("stocktwits %s: %v\n", sym, err)
			continue
		}
		collected = append(collected, batch...)
	}
	return collected, nil
}
// fetchSymbol downloads the recent public message stream for one symbol
// and converts each message into an Article. The message body becomes the
// article content; the synthetic title carries the symbol, the author and
// (when present) the bullish/bearish sentiment tag.
func (s *StockTwits) fetchSymbol(ctx context.Context, symbol string) ([]scraper.Article, error) {
	url := fmt.Sprintf("%s/streams/symbol/%s.json", apiBase, symbol)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	// Browser-like UA — presumably the API is stricter with default Go
	// user agents; NOTE(review): confirm this is still required.
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36")
	resp, err := s.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode == http.StatusTooManyRequests {
		return nil, fmt.Errorf("rate limited by StockTwits for %s", symbol)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("StockTwits returned HTTP %d for %s: %s", resp.StatusCode, symbol, string(body))
	}
	var data apiResponse
	if err := json.Unmarshal(body, &data); err != nil {
		return nil, fmt.Errorf("parse response for %s: %w", symbol, err)
	}
	// The StockTwits API reports errors in the body even on HTTP 200.
	if data.Response.Status != 0 && data.Response.Status != 200 {
		return nil, fmt.Errorf("StockTwits API error %d for %s: %s", data.Response.Status, symbol, data.Response.Error)
	}
	var articles []scraper.Article
	for _, msg := range data.Messages {
		if msg.Body == "" {
			continue
		}
		sentiment := ""
		if msg.Entities.Sentiment != nil {
			sentiment = " [" + msg.Entities.Sentiment.Basic + "]"
		}
		title := fmt.Sprintf("$%s — @%s%s", symbol, msg.User.Username, sentiment)
		// Leave PublishedAt nil when the timestamp cannot be parsed instead
		// of storing a pointer to the zero time (fixes bogus year-1 dates
		// that the previous ignored-error time.Parse produced).
		var publishedAt *time.Time
		if t, err := time.Parse(time.RFC3339, msg.CreatedAt); err == nil {
			publishedAt = &t
		}
		msgURL := fmt.Sprintf("https://stocktwits.com/%s/message/%d", msg.User.Username, msg.ID)
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     msg.Body,
			URL:         msgURL,
			PublishedAt: publishedAt,
			Symbols:     []string{symbol},
		})
	}
	fmt.Printf("stocktwits %s: %d messages fetched\n", symbol, len(articles))
	return articles, nil
}

View File

@ -0,0 +1,126 @@
package yahoofinance
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// YahooFinance fetches per-symbol headline RSS feeds from Yahoo Finance.
type YahooFinance struct {
	client *http.Client
}

// New builds a YahooFinance scraper with a 15-second HTTP timeout.
func New() *YahooFinance {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &YahooFinance{client: httpClient}
}

// Name deliberately reuses the "stocktwits" source type so existing DB
// rows keep working.
func (y *YahooFinance) Name() string {
	return "stocktwits"
}
// rssFeed maps the subset of the Yahoo Finance RSS 2.0 document we read.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			PubDate     string `xml:"pubDate"`
			GUID        string `xml:"guid"`
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape downloads the headline RSS feed for every watched symbol.
// Per-symbol failures are logged and skipped; cancellation of ctx aborts
// the loop and returns whatever was collected so far.
func (y *YahooFinance) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	for idx, sym := range symbols {
		// Small pause between feeds to stay polite with the endpoint.
		if idx > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		items, err := y.fetchSymbol(ctx, sym)
		if err != nil {
			fmt.Printf("yahoofinance %s: %v\n", sym, err)
			continue
		}
		collected = append(collected, items...)
		fmt.Printf("yahoofinance %s: %d articles fetched\n", sym, len(items))
	}
	return collected, nil
}
// fetchSymbol downloads and parses the Yahoo Finance headline RSS feed
// for one symbol, returning one Article per feed item. Items missing a
// title or link are skipped; an unparseable pubDate yields a nil
// PublishedAt rather than an error.
func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scraper.Article, error) {
	feedURL := fmt.Sprintf(
		"https://feeds.finance.yahoo.com/rss/2.0/headline?s=%s&region=US&lang=en-US",
		symbol,
	)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := y.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short excerpt of the body to aid debugging.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(body))
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		var publishedAt *time.Time
		if item.PubDate != "" {
			// RSS pubDate is typically RFC 1123 with a numeric zone; also
			// accept the named-zone and RFC 3339 variants. (The previous
			// list repeated RFC1123Z as a spelled-out duplicate layout.)
			formats := []string{
				time.RFC1123Z,
				time.RFC1123,
				time.RFC3339,
			}
			for _, f := range formats {
				if t, err := time.Parse(f, item.PubDate); err == nil {
					publishedAt = &t
					break
				}
			}
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title // some items ship an empty description
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
			Symbols:     []string{symbol},
		})
	}
	return articles, nil
}