207 lines
6.7 KiB
Go
207 lines
6.7 KiB
Go
package bloomberg
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/chromedp/chromedp"
|
|
"github.com/tradarr/backend/internal/scraper"
|
|
)
|
|
|
|
// Bloomberg is a scraper that signs in to bloomberg.com through a
// chromedp-driven Chrome instance and collects article headlines.
type Bloomberg struct {
	// username is the account e-mail typed into the sign-in form.
	username string
	// password is the account password typed into the sign-in form.
	password string
	// chromePath is an optional path to the Chrome executable; when empty,
	// no explicit executable path is passed to chromedp.
	chromePath string
}
|
|
|
|
func New(username, password, chromePath string) *Bloomberg {
|
|
return &Bloomberg{username: username, password: password, chromePath: chromePath}
|
|
}
|
|
|
|
// Name returns the fixed identifier of this scraper, "bloomberg".
func (b *Bloomberg) Name() string {
	const sourceName = "bloomberg"
	return sourceName
}
|
|
|
|
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
|
|
if b.username == "" || b.password == "" {
|
|
return nil, fmt.Errorf("bloomberg credentials not configured")
|
|
}
|
|
|
|
opts := []chromedp.ExecAllocatorOption{
|
|
chromedp.NoFirstRun,
|
|
chromedp.NoDefaultBrowserCheck,
|
|
chromedp.Headless,
|
|
chromedp.DisableGPU,
|
|
chromedp.Flag("no-sandbox", true),
|
|
chromedp.Flag("disable-setuid-sandbox", true),
|
|
chromedp.Flag("disable-dev-shm-usage", true),
|
|
chromedp.Flag("disable-blink-features", "AutomationControlled"),
|
|
chromedp.Flag("disable-infobars", true),
|
|
chromedp.Flag("window-size", "1920,1080"),
|
|
chromedp.Flag("ignore-certificate-errors", true),
|
|
chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
|
|
}
|
|
if b.chromePath != "" {
|
|
opts = append(opts, chromedp.ExecPath(b.chromePath))
|
|
}
|
|
|
|
allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
|
|
defer cancelAlloc()
|
|
|
|
chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
|
|
defer cancelChrome()
|
|
|
|
timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
|
|
defer cancelTimeout()
|
|
|
|
if err := b.login(timeoutCtx); err != nil {
|
|
return nil, fmt.Errorf("bloomberg login: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
pages := []string{
|
|
"https://www.bloomberg.com/markets",
|
|
"https://www.bloomberg.com/technology",
|
|
"https://www.bloomberg.com/economics",
|
|
}
|
|
for _, u := range pages {
|
|
pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
|
|
if err != nil {
|
|
fmt.Printf("bloomberg scrape %s: %v\n", u, err)
|
|
continue
|
|
}
|
|
articles = append(articles, pageArticles...)
|
|
}
|
|
fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
|
|
return articles, nil
|
|
}
|
|
|
|
func (b *Bloomberg) login(ctx context.Context) error {
|
|
loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
|
|
defer cancel()
|
|
|
|
// Masquer la détection d'automation via JS
|
|
if err := chromedp.Run(loginCtx,
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
return chromedp.Evaluate(`
|
|
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
|
|
window.chrome = { runtime: {} };
|
|
`, nil).Do(ctx)
|
|
}),
|
|
); err != nil {
|
|
fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
|
|
}
|
|
|
|
err := chromedp.Run(loginCtx,
|
|
chromedp.Navigate("https://www.bloomberg.com/account/signin"),
|
|
chromedp.Sleep(2*time.Second),
|
|
// Essayer plusieurs sélecteurs pour l'email
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
selectors := []string{
|
|
`input[name="email"]`,
|
|
`input[type="email"]`,
|
|
`input[data-type="email"]`,
|
|
`input[placeholder*="email" i]`,
|
|
`input[placeholder*="mail" i]`,
|
|
}
|
|
for _, sel := range selectors {
|
|
var count int
|
|
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
|
fmt.Printf("bloomberg: using email selector: %s\n", sel)
|
|
return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
|
|
}
|
|
}
|
|
return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
|
|
}),
|
|
chromedp.Sleep(500*time.Millisecond),
|
|
// Submit email
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
|
|
for _, sel := range selectors {
|
|
var count int
|
|
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
|
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
|
|
}
|
|
}
|
|
// Fallback: press Enter
|
|
return chromedp.KeyEvent("\r").Do(ctx)
|
|
}),
|
|
chromedp.Sleep(2*time.Second),
|
|
// Password
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
selectors := []string{`input[type="password"]`, `input[name="password"]`}
|
|
for _, sel := range selectors {
|
|
var count int
|
|
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
|
fmt.Printf("bloomberg: using password selector: %s\n", sel)
|
|
return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
|
|
}
|
|
}
|
|
return fmt.Errorf("could not find password input")
|
|
}),
|
|
chromedp.Sleep(500*time.Millisecond),
|
|
chromedp.ActionFunc(func(ctx context.Context) error {
|
|
selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
|
|
for _, sel := range selectors {
|
|
var count int
|
|
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
|
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
|
|
}
|
|
}
|
|
return chromedp.KeyEvent("\r").Do(ctx)
|
|
}),
|
|
chromedp.Sleep(3*time.Second),
|
|
)
|
|
return err
|
|
}
|
|
|
|
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
|
|
pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
|
defer cancel()
|
|
|
|
var articleNodes []map[string]string
|
|
err := chromedp.Run(pageCtx,
|
|
chromedp.Navigate(pageURL),
|
|
chromedp.Sleep(3*time.Second),
|
|
chromedp.Evaluate(`
|
|
(function() {
|
|
var items = [];
|
|
var seen = new Set();
|
|
var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
|
|
links.forEach(function(a) {
|
|
if (seen.has(a.href)) return;
|
|
seen.add(a.href);
|
|
var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
|
|
var text = title ? title.innerText.trim() : a.innerText.trim();
|
|
if (text.length > 20 && a.href.includes('bloomberg.com')) {
|
|
items.push({title: text, url: a.href});
|
|
}
|
|
});
|
|
return items.slice(0, 25);
|
|
})()
|
|
`, &articleNodes),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
now := time.Now()
|
|
for _, node := range articleNodes {
|
|
title := strings.TrimSpace(node["title"])
|
|
url := node["url"]
|
|
if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
|
|
continue
|
|
}
|
|
syms := scraper.DetectSymbols(title, symbols)
|
|
articles = append(articles, scraper.Article{
|
|
Title: title,
|
|
Content: title, // contenu minimal — l'article complet nécessite un accès payant
|
|
URL: url,
|
|
PublishedAt: &now,
|
|
Symbols: syms,
|
|
})
|
|
}
|
|
return articles, nil
|
|
}
|