feat: add sources to retrieve news and split the AI reflection into 2 steps to limit the number of news articles

This commit is contained in:
2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions

View File

@ -15,6 +15,8 @@ import (
"github.com/tradarr/backend/internal/scheduler"
"github.com/tradarr/backend/internal/scraper"
"github.com/tradarr/backend/internal/scraper/bloomberg"
"github.com/tradarr/backend/internal/scraper/reuters"
"github.com/tradarr/backend/internal/scraper/watcherguru"
"github.com/tradarr/backend/internal/scraper/yahoofinance"
)
@ -38,30 +40,23 @@ func main() {
enc := crypto.New(cfg.EncryptionKey)
pipeline := ai.NewPipeline(repo, enc)
// Créer le compte admin initial si nécessaire
if err := ensureAdmin(repo, cfg); err != nil {
log.Printf("ensure admin: %v", err)
}
// Configurer les scrapers
registry := scraper.NewRegistry(repo)
registry.Register(bloomberg.NewDynamic(repo, enc, cfg.ScraperURL))
registry.Register(yahoofinance.New())
registry.Register(reuters.New())
registry.Register(watcherguru.New())
// Bloomberg (credentials chargés depuis la DB à chaque run)
bbScraper := bloomberg.NewDynamic(repo, enc, cfg.ChromePath)
registry.Register(bbScraper)
stScraper := yahoofinance.New()
registry.Register(stScraper)
// Scheduler
sched := scheduler.New(registry, pipeline, repo)
if err := sched.Start(); err != nil {
log.Printf("scheduler: %v", err)
}
defer sched.Stop()
// API
h := handlers.New(repo, cfg, enc, registry, pipeline)
h := handlers.New(repo, cfg, enc, registry, pipeline, sched)
r := api.SetupRouter(h, cfg.JWTSecret)
addr := fmt.Sprintf(":%s", cfg.Port)

View File

@ -3,6 +3,7 @@ package ai
import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"
@ -31,7 +32,6 @@ func NewPipeline(repo *models.Repository, enc *crypto.Encryptor) *Pipeline {
return &Pipeline{repo: repo, enc: enc}
}
// BuildProvider instancie un provider à partir de ses paramètres
func (p *Pipeline) BuildProvider(name, apiKey, endpoint string) (Provider, error) {
provider, err := p.repo.GetActiveAIProvider()
if err != nil {
@ -44,9 +44,7 @@ func (p *Pipeline) BuildProvider(name, apiKey, endpoint string) (Provider, error
return NewProvider(name, apiKey, model, endpoint)
}
// GenerateForUser génère un résumé personnalisé pour un utilisateur
func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.Summary, error) {
// Récupérer le provider actif
providerCfg, err := p.repo.GetActiveAIProvider()
if err != nil {
return nil, fmt.Errorf("get active provider: %w", err)
@ -68,7 +66,6 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
return nil, fmt.Errorf("build provider: %w", err)
}
// Récupérer la watchlist de l'utilisateur (pour le contexte IA uniquement)
assets, err := p.repo.GetUserAssets(userID)
if err != nil {
return nil, fmt.Errorf("get user assets: %w", err)
@ -78,7 +75,6 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
symbols[i] = a.Symbol
}
// Récupérer TOUS les articles récents, toutes sources confondues
hoursStr, _ := p.repo.GetSetting("articles_lookback_hours")
hours, _ := strconv.Atoi(hoursStr)
if hours == 0 {
@ -98,16 +94,21 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
if maxArticles == 0 {
maxArticles = 50
}
// Passe 1 : filtrage par pertinence sur les titres si trop d'articles
if len(articles) > maxArticles {
articles = articles[:maxArticles]
fmt.Printf("pipeline: %d articles → filtering to %d via AI\n", len(articles), maxArticles)
articles = p.filterByRelevance(ctx, provider, symbols, articles, maxArticles)
fmt.Printf("pipeline: %d articles retained after filtering\n", len(articles))
}
systemPrompt, _ := p.repo.GetSetting("ai_system_prompt")
if systemPrompt == "" {
systemPrompt = DefaultSystemPrompt
}
prompt := buildPrompt(systemPrompt, symbols, articles)
// Passe 2 : résumé complet
prompt := buildPrompt(systemPrompt, symbols, articles)
summary, err := provider.Summarize(ctx, prompt)
if err != nil {
return nil, fmt.Errorf("AI summarize: %w", err)
@ -116,7 +117,77 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
return p.repo.CreateSummary(userID, summary, &providerCfg.ID)
}
// GenerateForAll génère les résumés pour tous les utilisateurs ayant une watchlist
// filterByRelevance asks the AI to select the most relevant articles by
// sending only the titles (very short prompt = fast). On any AI or parse
// failure it falls back to simple truncation so the pipeline still runs.
func (p *Pipeline) filterByRelevance(ctx context.Context, provider Provider, symbols []string, articles []models.Article, max int) []models.Article {
	// Defensive: nothing to filter when the list already fits. This also
	// protects the articles[:max] fallbacks below from an out-of-range slice
	// if a future caller stops pre-checking the length.
	if max >= len(articles) {
		return articles
	}
	prompt := buildFilterPrompt(symbols, articles, max)
	response, err := provider.Summarize(ctx, prompt)
	if err != nil {
		fmt.Printf("pipeline: filter AI call failed (%v), falling back to truncation\n", err)
		return articles[:max]
	}
	indices := parseIndexArray(response, len(articles))
	if len(indices) == 0 {
		fmt.Printf("pipeline: could not parse filter response, falling back to truncation\n")
		return articles[:max]
	}
	// Indices are already validated (in range, de-duplicated) by parseIndexArray.
	filtered := make([]models.Article, 0, len(indices))
	for _, i := range indices {
		filtered = append(filtered, articles[i])
		if len(filtered) >= max {
			break
		}
	}
	return filtered
}
// buildFilterPrompt builds the short, title-only prompt that asks the AI to
// pick the max most relevant articles. The AI must answer with a bare JSON
// array of 0-based indices (parsed by parseIndexArray).
func buildFilterPrompt(symbols []string, articles []models.Article, max int) string {
	var sb strings.Builder
	sb.WriteString("Tu es un assistant de trading financier. ")
	sb.WriteString(fmt.Sprintf("Parmi les %d articles ci-dessous, sélectionne les %d plus pertinents pour un trader actif.\n", len(articles), max))
	if len(symbols) > 0 {
		sb.WriteString("Actifs surveillés (priorité haute) : ")
		sb.WriteString(strings.Join(symbols, ", "))
		sb.WriteString("\n")
	}
	// Plain WriteString: the previous fmt.Sprintf had no formatting verbs
	// (go vet: "call has arguments but no formatting directives" class issue).
	sb.WriteString("\nRéponds UNIQUEMENT avec un tableau JSON des indices sélectionnés (base 0), exemple : [0, 3, 7, 12]\n")
	sb.WriteString("N'ajoute aucun texte avant ou après le tableau JSON.\n\n")
	sb.WriteString("Articles :\n")
	for i, a := range articles {
		sb.WriteString(fmt.Sprintf("[%d] %s (%s)\n", i, a.Title, a.SourceName))
	}
	return sb.String()
}
// jsonArrayRe matches a bare JSON array of integers, e.g. "[0, 3, 7, 12]".
var jsonArrayRe = regexp.MustCompile(`\[[\d\s,]+\]`)

// parseIndexArray extracts the unique, in-range (0 <= i < maxIndex) indices
// from an AI response expected to contain a JSON integer array. It returns
// nil when no array can be found, preserving first-seen order otherwise.
func parseIndexArray(response string, maxIndex int) []int {
	raw := jsonArrayRe.FindString(response)
	if raw == "" {
		return nil
	}
	fields := strings.Split(strings.Trim(raw, "[]"), ",")
	used := make(map[int]bool, len(fields))
	var out []int
	for _, field := range fields {
		idx, convErr := strconv.Atoi(strings.TrimSpace(field))
		if convErr != nil {
			continue
		}
		if idx < 0 || idx >= maxIndex || used[idx] {
			continue
		}
		used[idx] = true
		out = append(out, idx)
	}
	return out
}
func (p *Pipeline) GenerateForAll(ctx context.Context) error {
users, err := p.repo.ListUsers()
if err != nil {

View File

@ -287,6 +287,45 @@ func (h *Handler) UpdateSettings(c *gin.Context) {
httputil.OK(c, gin.H{"ok": true})
}
// ── Schedule ───────────────────────────────────────────────────────────────
// GetSchedule returns every configured scraping schedule slot as JSON.
func (h *Handler) GetSchedule(c *gin.Context) {
	schedule, listErr := h.repo.ListScheduleSlots()
	if listErr != nil {
		httputil.InternalError(c, listErr)
		return
	}
	httputil.OK(c, schedule)
}
// scheduleRequest is the JSON payload for replacing the scraping schedule.
type scheduleRequest struct {
	Slots []struct {
		DayOfWeek int `json:"day_of_week"`
		Hour      int `json:"hour"`
		Minute    int `json:"minute"`
	} `json:"slots"`
}

// UpdateSchedule replaces the whole scraping schedule with the submitted
// slots, then asks the scheduler to re-register its cron entries.
func (h *Handler) UpdateSchedule(c *gin.Context) {
	var payload scheduleRequest
	if bindErr := c.ShouldBindJSON(&payload); bindErr != nil {
		httputil.BadRequest(c, bindErr)
		return
	}
	newSlots := make([]models.ScheduleSlot, 0, len(payload.Slots))
	for _, in := range payload.Slots {
		newSlots = append(newSlots, models.ScheduleSlot{
			DayOfWeek: in.DayOfWeek,
			Hour:      in.Hour,
			Minute:    in.Minute,
		})
	}
	if saveErr := h.repo.ReplaceSchedule(newSlots); saveErr != nil {
		httputil.InternalError(c, saveErr)
		return
	}
	if reloadErr := h.scheduler.Reload(); reloadErr != nil {
		// Non-fatal: the DB was already updated; only the cron re-registration
		// lagged, so we log and still report success.
		fmt.Printf("schedule reload: %v\n", reloadErr)
	}
	httputil.OK(c, gin.H{"ok": true})
}
// GetDefaultSystemPrompt returns the built-in AI system prompt so the admin
// UI can offer a "reset to default" action.
func (h *Handler) GetDefaultSystemPrompt(c *gin.Context) {
	httputil.OK(c, gin.H{"prompt": ai.DefaultSystemPrompt})
}

View File

@ -5,6 +5,7 @@ import (
"github.com/tradarr/backend/internal/config"
"github.com/tradarr/backend/internal/crypto"
"github.com/tradarr/backend/internal/models"
"github.com/tradarr/backend/internal/scheduler"
"github.com/tradarr/backend/internal/scraper"
)
@ -14,6 +15,7 @@ type Handler struct {
enc *crypto.Encryptor
registry *scraper.Registry
pipeline *ai.Pipeline
scheduler *scheduler.Scheduler
}
func New(
@ -22,6 +24,7 @@ func New(
enc *crypto.Encryptor,
registry *scraper.Registry,
pipeline *ai.Pipeline,
sched *scheduler.Scheduler,
) *Handler {
return &Handler{
repo: repo,
@ -29,5 +32,6 @@ func New(
enc: enc,
registry: registry,
pipeline: pipeline,
scheduler: sched,
}
}

View File

@ -65,6 +65,9 @@ func SetupRouter(h *handlers.Handler, jwtSecret string) *gin.Engine {
admin.PUT("/settings", h.UpdateSettings)
admin.GET("/settings/default-prompt", h.GetDefaultSystemPrompt)
admin.GET("/schedule", h.GetSchedule)
admin.PUT("/schedule", h.UpdateSchedule)
admin.GET("/users", h.ListUsers)
admin.PUT("/users/:id", h.UpdateAdminUser)
admin.DELETE("/users/:id", h.DeleteAdminUser)

View File

@ -11,7 +11,7 @@ type Config struct {
JWTSecret string
EncryptionKey []byte
Port string
ChromePath string
ScraperURL string
AdminEmail string
AdminPassword string
}
@ -41,12 +41,17 @@ func Load() (*Config, error) {
port = "8080"
}
scraperURL := os.Getenv("SCRAPER_URL")
if scraperURL == "" {
scraperURL = "http://scraper:3001"
}
return &Config{
DatabaseURL: dbURL,
JWTSecret: jwtSecret,
EncryptionKey: encKey,
Port: port,
ChromePath: os.Getenv("CHROME_PATH"),
ScraperURL: scraperURL,
AdminEmail: os.Getenv("ADMIN_EMAIL"),
AdminPassword: os.Getenv("ADMIN_PASSWORD"),
}, nil

View File

@ -21,7 +21,7 @@ CREATE TABLE user_assets (
CREATE TABLE sources (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name TEXT NOT NULL,
type TEXT NOT NULL CHECK (type IN ('bloomberg', 'stocktwits')),
type TEXT NOT NULL CHECK (type IN ('bloomberg', 'stocktwits', 'reuters', 'watcherguru')),
enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
@ -97,7 +97,7 @@ CREATE INDEX idx_user_assets_user_id ON user_assets(user_id);
-- Sources initiales
INSERT INTO sources (name, type, enabled) VALUES
('Bloomberg', 'bloomberg', TRUE),
('StockTwits', 'stocktwits', TRUE);
('Yahoo Finance', 'stocktwits', TRUE);
-- Paramètres par défaut
INSERT INTO settings (key, value) VALUES

View File

@ -0,0 +1 @@
DELETE FROM sources WHERE type IN ('reuters', 'watcherguru');

View File

@ -0,0 +1,4 @@
INSERT INTO sources (name, type, enabled) VALUES
('Reuters', 'reuters', true),
('Watcher.Guru', 'watcherguru', true)
ON CONFLICT DO NOTHING;

View File

@ -0,0 +1 @@
DROP TABLE IF EXISTS scrape_schedules;

View File

@ -0,0 +1,17 @@
-- Weekly scraping schedule: one row per (day-of-week, time-of-day) slot.
CREATE TABLE scrape_schedules (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
day_of_week SMALLINT NOT NULL CHECK (day_of_week BETWEEN 0 AND 6), -- 0=Sunday ... 6=Saturday
hour SMALLINT NOT NULL CHECK (hour BETWEEN 0 AND 23),
minute SMALLINT NOT NULL DEFAULT 0 CHECK (minute BETWEEN 0 AND 59),
UNIQUE (day_of_week, hour, minute)
);
-- Default schedule: Mon-Fri at 06:00 and 15:00, weekends at 06:00 only
INSERT INTO scrape_schedules (day_of_week, hour, minute) VALUES
(1, 6, 0), (1, 15, 0),
(2, 6, 0), (2, 15, 0),
(3, 6, 0), (3, 15, 0),
(4, 6, 0), (4, 15, 0),
(5, 6, 0), (5, 15, 0),
(6, 6, 0),
(0, 6, 0);

View File

@ -97,3 +97,10 @@ type Setting struct {
Key string `json:"key"`
Value string `json:"value"`
}
// ScheduleSlot is one weekly scraping time slot (a day of week plus a time of
// day), mirroring a row of the scrape_schedules table.
type ScheduleSlot struct {
	ID        string `json:"id"`
	DayOfWeek int    `json:"day_of_week"` // 0=Sunday, 1=Monday ... 6=Saturday
	Hour      int    `json:"hour"`
	Minute    int    `json:"minute"`
}

View File

@ -520,6 +520,51 @@ func (r *Repository) SetSetting(key, value string) error {
return err
}
// ── Schedule ───────────────────────────────────────────────────────────────
// ListScheduleSlots returns all scraping schedule slots ordered by day of
// week, then hour, then minute.
func (r *Repository) ListScheduleSlots() ([]ScheduleSlot, error) {
	rows, err := r.db.Query(`
		SELECT id, day_of_week, hour, minute FROM scrape_schedules
		ORDER BY day_of_week, hour, minute`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var slots []ScheduleSlot
	for rows.Next() {
		var s ScheduleSlot
		if err := rows.Scan(&s.ID, &s.DayOfWeek, &s.Hour, &s.Minute); err != nil {
			return nil, err
		}
		slots = append(slots, s)
	}
	// rows.Err surfaces errors hit mid-iteration (e.g. a dropped connection)
	// that rows.Next otherwise turns into a silent, truncated result set.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return slots, nil
}
// ReplaceSchedule atomically swaps the entire scraping schedule for the given
// slots. Delete + inserts run in one transaction, so readers never observe a
// half-written schedule; duplicate (day, hour, minute) entries are ignored.
func (r *Repository) ReplaceSchedule(slots []ScheduleSlot) error {
	tx, err := r.db.Begin()
	if err != nil {
		return err
	}
	// No-op after a successful Commit; undoes everything on early return.
	defer tx.Rollback()
	if _, err := tx.Exec(`DELETE FROM scrape_schedules`); err != nil {
		return err
	}
	// Prepare once instead of re-parsing the same INSERT for every slot.
	stmt, err := tx.Prepare(
		`INSERT INTO scrape_schedules (day_of_week, hour, minute) VALUES ($1, $2, $3)
		 ON CONFLICT (day_of_week, hour, minute) DO NOTHING`)
	if err != nil {
		return err
	}
	defer stmt.Close()
	for _, s := range slots {
		if _, err := stmt.Exec(s.DayOfWeek, s.Hour, s.Minute); err != nil {
			return err
		}
	}
	return tx.Commit()
}
// ── Settings ───────────────────────────────────────────────────────────────
func (r *Repository) ListSettings() ([]Setting, error) {
rows, err := r.db.Query(`SELECT key, value FROM settings ORDER BY key`)
if err != nil {

View File

@ -3,7 +3,6 @@ package scheduler
import (
"context"
"fmt"
"strconv"
"github.com/robfig/cron/v3"
"github.com/tradarr/backend/internal/ai"
@ -16,7 +15,7 @@ type Scheduler struct {
registry *scraper.Registry
pipeline *ai.Pipeline
repo *models.Repository
entryID cron.EntryID
entryIDs []cron.EntryID
}
func New(registry *scraper.Registry, pipeline *ai.Pipeline, repo *models.Repository) *Scheduler {
@ -29,19 +28,10 @@ func New(registry *scraper.Registry, pipeline *ai.Pipeline, repo *models.Reposit
}
func (s *Scheduler) Start() error {
interval, err := s.getInterval()
if err != nil {
if err := s.loadSchedule(); err != nil {
return err
}
spec := fmt.Sprintf("@every %dm", interval)
s.entryID, err = s.cron.AddFunc(spec, s.run)
if err != nil {
return fmt.Errorf("add cron: %w", err)
}
s.cron.Start()
fmt.Printf("scheduler started, running every %d minutes\n", interval)
return nil
}
@ -50,39 +40,46 @@ func (s *Scheduler) Stop() {
}
func (s *Scheduler) Reload() error {
s.cron.Remove(s.entryID)
interval, err := s.getInterval()
if err != nil {
return err
for _, id := range s.entryIDs {
s.cron.Remove(id)
}
spec := fmt.Sprintf("@every %dm", interval)
s.entryID, err = s.cron.AddFunc(spec, s.run)
return err
s.entryIDs = nil
return s.loadSchedule()
}
// loadSchedule registers one cron entry per schedule slot stored in the DB.
// Slots that yield an invalid cron spec are logged and skipped; an empty
// schedule simply disables scraping (no entries registered).
func (s *Scheduler) loadSchedule() error {
	slots, err := s.repo.ListScheduleSlots()
	if err != nil {
		return fmt.Errorf("load schedule: %w", err)
	}
	if len(slots) == 0 {
		fmt.Println("scheduler: no schedule configured, scraping disabled")
		return nil
	}
	for _, slot := range slots {
		// Standard 5-field cron layout: "minute hour dom month dow".
		spec := fmt.Sprintf("%d %d * * %d", slot.Minute, slot.Hour, slot.DayOfWeek)
		entry, addErr := s.cron.AddFunc(spec, s.run)
		if addErr != nil {
			fmt.Printf("scheduler: invalid cron spec %q: %v\n", spec, addErr)
			continue
		}
		s.entryIDs = append(s.entryIDs, entry)
	}
	fmt.Printf("scheduler: %d time slots loaded\n", len(s.entryIDs))
	return nil
}
func (s *Scheduler) run() {
fmt.Println("scheduler: running scraping cycle")
fmt.Println("scheduler: starting scraping cycle")
if err := s.registry.RunAll(); err != nil {
fmt.Printf("scheduler scrape error: %v\n", err)
return
}
fmt.Println("scheduler: running AI summaries")
fmt.Println("scheduler: starting AI summaries")
if err := s.pipeline.GenerateForAll(context.Background()); err != nil {
fmt.Printf("scheduler summary error: %v\n", err)
}
}
// getInterval reads the "scrape_interval_minutes" setting and returns it in
// minutes. Any problem (missing setting, empty value, non-numeric or < 1)
// falls back to the 60-minute default instead of returning an error.
func (s *Scheduler) getInterval() (int, error) {
	v, err := s.repo.GetSetting("scrape_interval_minutes")
	if err != nil {
		return 60, nil
	}
	if v == "" {
		return 60, nil
	}
	n, err := strconv.Atoi(v)
	if err != nil || n < 1 {
		return 60, nil
	}
	return n, nil
}

View File

@ -1,206 +1,94 @@
package bloomberg
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/chromedp/chromedp"
"github.com/tradarr/backend/internal/scraper"
)
type Bloomberg struct {
username string
password string
chromePath string
scraperURL string
client *http.Client
}
func New(username, password, chromePath string) *Bloomberg {
return &Bloomberg{username: username, password: password, chromePath: chromePath}
func New(scraperURL string) *Bloomberg {
if scraperURL == "" {
scraperURL = "http://scraper:3001"
}
return &Bloomberg{
scraperURL: scraperURL,
client: &http.Client{Timeout: 10 * time.Minute},
}
}
func (b *Bloomberg) Name() string { return "bloomberg" }
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
if b.username == "" || b.password == "" {
return nil, fmt.Errorf("bloomberg credentials not configured")
}
opts := []chromedp.ExecAllocatorOption{
chromedp.NoFirstRun,
chromedp.NoDefaultBrowserCheck,
chromedp.Headless,
chromedp.DisableGPU,
chromedp.Flag("no-sandbox", true),
chromedp.Flag("disable-setuid-sandbox", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-blink-features", "AutomationControlled"),
chromedp.Flag("disable-infobars", true),
chromedp.Flag("window-size", "1920,1080"),
chromedp.Flag("ignore-certificate-errors", true),
chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
}
if b.chromePath != "" {
opts = append(opts, chromedp.ExecPath(b.chromePath))
}
allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
defer cancelAlloc()
chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
defer cancelChrome()
timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
defer cancelTimeout()
if err := b.login(timeoutCtx); err != nil {
return nil, fmt.Errorf("bloomberg login: %w", err)
}
var articles []scraper.Article
pages := []string{
"https://www.bloomberg.com/markets",
"https://www.bloomberg.com/technology",
"https://www.bloomberg.com/economics",
}
for _, u := range pages {
pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
if err != nil {
fmt.Printf("bloomberg scrape %s: %v\n", u, err)
continue
}
articles = append(articles, pageArticles...)
}
fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
return articles, nil
// scraperRequest is the JSON body POSTed to the external scraper service,
// carrying the Bloomberg credentials it should log in with.
type scraperRequest struct {
	Username string `json:"username"`
	Password string `json:"password"`
}
func (b *Bloomberg) login(ctx context.Context) error {
loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
defer cancel()
// Masquer la détection d'automation via JS
if err := chromedp.Run(loginCtx,
chromedp.ActionFunc(func(ctx context.Context) error {
return chromedp.Evaluate(`
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = { runtime: {} };
`, nil).Do(ctx)
}),
); err != nil {
fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
}
err := chromedp.Run(loginCtx,
chromedp.Navigate("https://www.bloomberg.com/account/signin"),
chromedp.Sleep(2*time.Second),
// Essayer plusieurs sélecteurs pour l'email
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{
`input[name="email"]`,
`input[type="email"]`,
`input[data-type="email"]`,
`input[placeholder*="email" i]`,
`input[placeholder*="mail" i]`,
}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
fmt.Printf("bloomberg: using email selector: %s\n", sel)
return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
}
}
return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
}),
chromedp.Sleep(500*time.Millisecond),
// Submit email
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
}
}
// Fallback: press Enter
return chromedp.KeyEvent("\r").Do(ctx)
}),
chromedp.Sleep(2*time.Second),
// Password
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`input[type="password"]`, `input[name="password"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
fmt.Printf("bloomberg: using password selector: %s\n", sel)
return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
}
}
return fmt.Errorf("could not find password input")
}),
chromedp.Sleep(500*time.Millisecond),
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
}
}
return chromedp.KeyEvent("\r").Do(ctx)
}),
chromedp.Sleep(3*time.Second),
)
return err
// scraperArticle is one article entry in the scraper service's JSON response.
type scraperArticle struct {
	Title string `json:"title"`
	URL   string `json:"url"`
}
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
// scraperResponse is the scraper service's JSON reply: either a list of
// articles or a non-empty application-level error message.
type scraperResponse struct {
	Articles []scraperArticle `json:"articles"`
	Error    string           `json:"error,omitempty"`
}
var articleNodes []map[string]string
err := chromedp.Run(pageCtx,
chromedp.Navigate(pageURL),
chromedp.Sleep(3*time.Second),
chromedp.Evaluate(`
(function() {
var items = [];
var seen = new Set();
var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
links.forEach(function(a) {
if (seen.has(a.href)) return;
seen.add(a.href);
var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
var text = title ? title.innerText.trim() : a.innerText.trim();
if (text.length > 20 && a.href.includes('bloomberg.com')) {
items.push({title: text, url: a.href});
}
});
return items.slice(0, 25);
})()
`, &articleNodes),
)
func (b *Bloomberg) ScrapeWithCredentials(ctx context.Context, username, password string, symbols []string) ([]scraper.Article, error) {
payload, _ := json.Marshal(scraperRequest{Username: username, Password: password})
req, err := http.NewRequestWithContext(ctx, http.MethodPost, b.scraperURL+"/bloomberg/scrape", bytes.NewReader(payload))
if err != nil {
return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
return nil, err
}
req.Header.Set("Content-Type", "application/json")
resp, err := b.client.Do(req)
if err != nil {
return nil, fmt.Errorf("scraper service unreachable: %w", err)
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("scraper service HTTP %d: %s", resp.StatusCode, body)
}
var result scraperResponse
if err := json.Unmarshal(body, &result); err != nil {
return nil, fmt.Errorf("parse scraper response: %w", err)
}
if result.Error != "" {
return nil, fmt.Errorf("bloomberg: %s", result.Error)
}
var articles []scraper.Article
now := time.Now()
for _, node := range articleNodes {
title := strings.TrimSpace(node["title"])
url := node["url"]
if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
var articles []scraper.Article
for _, a := range result.Articles {
title := strings.TrimSpace(a.Title)
url := a.URL
if title == "" || url == "" {
continue
}
syms := scraper.DetectSymbols(title, symbols)
articles = append(articles, scraper.Article{
Title: title,
Content: title, // contenu minimal — l'article complet nécessite un accès payant
Content: title,
URL: url,
PublishedAt: &now,
Symbols: syms,
})
}
fmt.Printf("bloomberg: %d articles fetched\n", len(articles))
return articles, nil
}

View File

@ -9,21 +9,19 @@ import (
"github.com/tradarr/backend/internal/scraper"
)
// DynamicBloomberg charge les credentials depuis la DB avant chaque scraping
type DynamicBloomberg struct {
repo *models.Repository
enc *crypto.Encryptor
chromePath string
scraperURL string
}
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
return &DynamicBloomberg{repo: repo, enc: enc, chromePath: chromePath}
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, scraperURL string) *DynamicBloomberg {
return &DynamicBloomberg{repo: repo, enc: enc, scraperURL: scraperURL}
}
func (d *DynamicBloomberg) Name() string { return "bloomberg" }
func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
// Récupérer la source Bloomberg
source, err := d.repo.GetSourceByType("bloomberg")
if err != nil || source == nil {
return nil, fmt.Errorf("bloomberg source not found")
@ -34,7 +32,7 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
return nil, fmt.Errorf("get bloomberg credentials: %w", err)
}
if cred == nil || cred.Username == "" {
return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
return nil, fmt.Errorf("bloomberg credentials not configured — configure them in the admin panel")
}
password := ""
@ -45,6 +43,6 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
}
}
b := New(cred.Username, password, d.chromePath)
return b.Scrape(ctx, symbols)
b := New(d.scraperURL)
return b.ScrapeWithCredentials(ctx, cred.Username, password, symbols)
}

View File

@ -0,0 +1,129 @@
package reuters
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// Reuters RSS est bloqué par Cloudflare. On utilise des flux RSS financiers
// publics fiables à la place : MarketWatch, CNBC, Seeking Alpha.
var feeds = []struct {
	name string // human-readable label, used only in log output
	url  string // public RSS endpoint
}{
	{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
type Reuters struct {
client *http.Client
}
func New() *Reuters {
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
}
func (r *Reuters) Name() string { return "reuters" }
// rssFeed mirrors the subset of the RSS 2.0 schema this scraper reads.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title       string `xml:"title"`
			Link        string `xml:"link"`
			Description string `xml:"description"`
			PubDate     string `xml:"pubDate"`
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape walks every configured feed and merges the results, de-duplicating
// articles by URL. A short pause between feeds keeps requests polite; context
// cancellation is honored during that pause. Symbols are ignored: these are
// general-market feeds.
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	var collected []scraper.Article
	seenURLs := make(map[string]bool)
	for i, feed := range feeds {
		if i > 0 {
			select {
			case <-ctx.Done():
				return collected, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		items, fetchErr := r.fetchFeed(ctx, feed.url)
		if fetchErr != nil {
			fmt.Printf("reuters/financial %s: %v\n", feed.name, fetchErr)
			continue
		}
		for _, item := range items {
			if seenURLs[item.URL] {
				continue
			}
			seenURLs[item.URL] = true
			collected = append(collected, item)
		}
		fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
	}
	return collected, nil
}
// fetchFeed downloads and parses one RSS feed, returning its items as
// articles. Items missing a title or link are skipped; pub dates that match
// no known layout leave PublishedAt nil rather than failing the feed.
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short body excerpt to help diagnose blocks/redirects.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		var publishedAt *time.Time
		// time.RFC1123Z is exactly "Mon, 02 Jan 2006 15:04:05 -0700", so the
		// previously listed third layout was a duplicate and has been removed.
		for _, f := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(f, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}

View File

@ -0,0 +1,200 @@
package watcherguru
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"golang.org/x/net/html"
"github.com/tradarr/backend/internal/scraper"
)
const baseURL = "https://watcher.guru"
type WatcherGuru struct {
client *http.Client
}
func New() *WatcherGuru {
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
}
func (w *WatcherGuru) Name() string { return "watcherguru" }
// rssFeed mirrors the subset of the RSS 2.0 schema this scraper reads.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title   string `xml:"title"`
			Link    string `xml:"link"`
			PubDate string `xml:"pubDate"`
			Desc    string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape fetches Watcher.Guru articles, preferring the RSS feeds and falling
// back to scraping the news index HTML when no feed yields results. Symbols
// are ignored: this is a general news source.
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	feedURLs := []string{
		baseURL + "/feed/",
		baseURL + "/news/feed/",
	}
	for _, feedURL := range feedURLs {
		if items, rssErr := w.fetchRSS(ctx, feedURL); rssErr == nil && len(items) > 0 {
			fmt.Printf("watcherguru rss: %d articles\n", len(items))
			return items, nil
		}
	}
	htmlArticles, htmlErr := w.scrapeHTML(ctx)
	if htmlErr != nil {
		return nil, fmt.Errorf("watcherguru: %w", htmlErr)
	}
	fmt.Printf("watcherguru html: %d articles\n", len(htmlArticles))
	return htmlArticles, nil
}
// fetchRSS downloads and parses one RSS feed URL into articles. Items missing
// a title or link are skipped; pub dates that match no known layout leave
// PublishedAt nil rather than failing the feed.
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		var publishedAt *time.Time
		// time.RFC1123Z is exactly "Mon, 02 Jan 2006 15:04:05 -0700", so the
		// previously listed third layout was a duplicate and has been removed.
		for _, f := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(f, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Desc)
		if content == "" {
			content = title
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}
// scrapeHTML is the fallback when no RSS feed works: fetch the news index
// page and harvest anchor tags that look like article links.
//
// Fixes vs the previous version: the traversal now always recurses into ALL
// children (the old early-return path for an empty/seen href descended only
// into the first child, silently dropping its siblings), and de-duplication
// is keyed on the normalized absolute URL so relative and absolute forms of
// the same link collapse. The dead `n.Data == "article"` branch, which
// matched but did nothing, was removed.
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse HTML: %w", err)
	}
	var articles []scraper.Article
	seen := make(map[string]bool)
	now := time.Now()
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			href := attrVal(n, "href")
			// Keep only links that look like news articles.
			if href != "" && (strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru")) {
				text := strings.TrimSpace(nodeText(n))
				// Very short anchor text is navigation, not a headline.
				if len(text) > 20 {
					url := href
					if !strings.HasPrefix(url, "http") {
						url = baseURL + url
					}
					if !seen[url] {
						seen[url] = true
						articles = append(articles, scraper.Article{
							Title:       text,
							Content:     text,
							URL:         url,
							PublishedAt: &now,
						})
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	// Cap the result to keep the downstream AI prompt bounded.
	if len(articles) > 40 {
		articles = articles[:40]
	}
	return articles, nil
}
// attrVal returns the value of the named attribute on n, or "" when the
// attribute is absent.
func attrVal(n *html.Node, key string) string {
	for i := range n.Attr {
		if n.Attr[i].Key == key {
			return n.Attr[i].Val
		}
	}
	return ""
}
// nodeText concatenates every text node in the subtree rooted at n,
// in document order.
func nodeText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}
	var b strings.Builder
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		b.WriteString(nodeText(child))
	}
	return b.String()
}

View File

@ -86,8 +86,13 @@ func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scrape
return nil, fmt.Errorf("parse RSS: %w", err)
}
const maxPerSymbol = 5
var articles []scraper.Article
for _, item := range feed.Channel.Items {
if len(articles) >= maxPerSymbol {
break
}
title := strings.TrimSpace(item.Title)
link := strings.TrimSpace(item.Link)
if title == "" || link == "" {

View File

@ -14,6 +14,14 @@ services:
timeout: 5s
retries: 5
scraper:
build:
context: ./scraper-service
dockerfile: Dockerfile
restart: unless-stopped
expose:
- "3001"
backend:
build:
context: ./backend
@ -22,11 +30,14 @@ services:
depends_on:
postgres:
condition: service_healthy
scraper:
condition: service_started
environment:
DATABASE_URL: "host=postgres port=5432 user=${POSTGRES_USER:-tradarr} password=${POSTGRES_PASSWORD} dbname=${POSTGRES_DB:-tradarr} sslmode=disable"
JWT_SECRET: ${JWT_SECRET:?JWT_SECRET is required}
ENCRYPTION_KEY: ${ENCRYPTION_KEY:?ENCRYPTION_KEY must be 32 bytes hex}
PORT: "8080"
SCRAPER_URL: "http://scraper:3001"
ADMIN_EMAIL: ${ADMIN_EMAIL:-admin@tradarr.local}
ADMIN_PASSWORD: ${ADMIN_PASSWORD:-changeme}
expose:

View File

@ -12,6 +12,7 @@ export interface ScrapeJob {
articles_found: number; error_msg: string; created_at: string
}
export interface Setting { key: string; value: string }
export interface ScheduleSlot { id?: string; day_of_week: number; hour: number; minute: number }
export interface AdminUser { id: string; email: string; role: string; created_at: string }
export interface Credential { source_id: string; source_name: string; username: string; has_password: boolean }
@ -44,6 +45,10 @@ export const adminApi = {
updateSettings: (settings: Setting[]) => api.put<void>('/admin/settings', { settings }),
getDefaultPrompt: () => api.get<{ prompt: string }>('/admin/settings/default-prompt'),
// Schedule
getSchedule: () => api.get<ScheduleSlot[]>('/admin/schedule'),
updateSchedule: (slots: ScheduleSlot[]) => api.put<void>('/admin/schedule', { slots }),
// Users
listUsers: () => api.get<AdminUser[]>('/admin/users'),
updateUser: (id: string, email: string, role: string) =>

View File

@ -1,5 +1,5 @@
import { NavLink } from 'react-router-dom'
import { LayoutDashboard, Newspaper, Star, Settings, Key, Cpu, Database, ClipboardList, Users, LogOut, TrendingUp } from 'lucide-react'
import { LayoutDashboard, Newspaper, Star, Settings, Key, Cpu, Database, ClipboardList, Users, LogOut, TrendingUp, CalendarDays } from 'lucide-react'
import { useAuth } from '@/lib/auth'
import { cn } from '@/lib/cn'
@ -15,6 +15,7 @@ const adminItems = [
{ to: '/admin/sources', icon: Database, label: 'Sources' },
{ to: '/admin/jobs', icon: ClipboardList, label: 'Jobs' },
{ to: '/admin/users', icon: Users, label: 'Utilisateurs' },
{ to: '/admin/schedule', icon: CalendarDays, label: 'Planning' },
{ to: '/admin/settings', icon: Settings, label: 'Paramètres' },
]

View File

@ -11,6 +11,7 @@ import { Sources } from '@/pages/admin/Sources'
import { Jobs } from '@/pages/admin/Jobs'
import { AdminUsers } from '@/pages/admin/AdminUsers'
import { AdminSettings } from '@/pages/admin/AdminSettings'
import { Schedule } from '@/pages/admin/Schedule'
export const router = createBrowserRouter([
{ path: '/login', element: <Login /> },
@ -31,6 +32,7 @@ export const router = createBrowserRouter([
{ path: 'jobs', element: <Jobs /> },
{ path: 'users', element: <AdminUsers /> },
{ path: 'settings', element: <AdminSettings /> },
{ path: 'schedule', element: <Schedule /> },
],
},
],

View File

@ -0,0 +1,145 @@
import { useState, useEffect } from 'react'
import { Plus, Trash2, Save } from 'lucide-react'
import { adminApi, type ScheduleSlot } from '@/api/admin'
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
import { Button } from '@/components/ui/button'
import { Spinner } from '@/components/ui/spinner'
// Days of the week in display order (Monday-first for a French UI).
// `value` follows the backend's day_of_week convention where Sunday = 0.
const DAYS = [
  { label: 'Lundi', short: 'LUN', value: 1 },
  { label: 'Mardi', short: 'MAR', value: 2 },
  { label: 'Mercredi', short: 'MER', value: 3 },
  { label: 'Jeudi', short: 'JEU', value: 4 },
  { label: 'Vendredi', short: 'VEN', value: 5 },
  { label: 'Samedi', short: 'SAM', value: 6 },
  { label: 'Dimanche', short: 'DIM', value: 0 },
]

// Canonical identity of a slot: two slots with the same day/hour/minute
// are considered the same slot (used for dedupe and removal).
type SlotKey = `${number}-${number}-${number}`

function toKey(s: ScheduleSlot): SlotKey {
  return `${s.day_of_week}-${s.hour}-${s.minute}`
}

// Formats an hour/minute pair as zero-padded "HH:MM".
function fmt(h: number, m: number) {
  return `${String(h).padStart(2, '0')}:${String(m).padStart(2, '0')}`
}
/**
 * Weekly scraping-schedule editor: one card per day of the week, each
 * listing its scrape + AI-summary time slots with add/remove controls,
 * plus a single global save of the whole list.
 */
export function Schedule() {
  const [slots, setSlots] = useState<ScheduleSlot[]>([])
  const [loading, setLoading] = useState(true)
  const [saving, setSaving] = useState(false)
  const [saved, setSaved] = useState(false)
  // Pending "new slot" time input value, keyed by day_of_week.
  const [newTimes, setNewTimes] = useState<Record<number, string>>({})

  useEffect(() => { load() }, [])

  // Fetch the persisted schedule; an empty/null payload becomes [].
  async function load() {
    setLoading(true)
    try { setSlots((await adminApi.getSchedule()) ?? []) } finally { setLoading(false) }
  }

  // All slots for one day, sorted chronologically.
  function slotsForDay(day: number) {
    return slots
      .filter(s => s.day_of_week === day)
      .sort((a, b) => a.hour !== b.hour ? a.hour - b.hour : a.minute - b.minute)
  }

  // Add the pending time for `day`; ignores duplicates and malformed input.
  function addSlot(day: number) {
    const time = newTimes[day] || '06:00'
    const [h, m] = time.split(':').map(Number)
    // Guard against a malformed value (a browser that does not enforce the
    // time-input format would otherwise produce NaN hour/minute slots).
    if (Number.isNaN(h) || Number.isNaN(m)) return
    const newSlot: ScheduleSlot = { day_of_week: day, hour: h, minute: m }
    if (slots.some(s => toKey(s) === toKey(newSlot))) return
    setSlots(prev => [...prev, newSlot])
    setNewTimes(p => ({ ...p, [day]: '06:00' }))
  }

  function removeSlot(slot: ScheduleSlot) {
    setSlots(prev => prev.filter(s => toKey(s) !== toKey(slot)))
  }

  // Persist the full slot list. The finally block guarantees the save button
  // is re-enabled even when the request fails (previously `saving` stayed
  // true forever on error, permanently disabling the button).
  async function save() {
    setSaving(true); setSaved(false)
    try {
      await adminApi.updateSchedule(slots)
      setSaved(true)
      setTimeout(() => setSaved(false), 2000)
    } finally {
      setSaving(false)
    }
  }

  if (loading) return <div className="flex justify-center py-20"><Spinner /></div>
  return (
    <div className="space-y-6">
      <div className="flex items-center justify-between">
        <div>
          <h1 className="text-2xl font-bold">Planning hebdomadaire</h1>
          <p className="text-muted-foreground text-sm">
            Définissez les créneaux de scraping + résumé IA pour chaque jour
          </p>
        </div>
        <Button onClick={save} disabled={saving}>
          {saving ? <Spinner className="h-4 w-4" /> : <Save className="h-4 w-4" />}
          {saved ? 'Enregistré !' : 'Enregistrer'}
        </Button>
      </div>
      <div className="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-4 xl:grid-cols-7 gap-3">
        {DAYS.map(day => {
          const daySlots = slotsForDay(day.value)
          const isWeekend = day.value === 0 || day.value === 6
          return (
            <Card key={day.value} className={isWeekend ? 'border-muted' : ''}>
              <CardHeader className="pb-2 pt-4 px-4">
                <CardTitle className="text-sm font-semibold">
                  <span className="hidden xl:block">{day.label}</span>
                  <span className="xl:hidden">{day.short}</span>
                </CardTitle>
              </CardHeader>
              <CardContent className="px-4 pb-4 space-y-2">
                {/* Créneaux existants */}
                {daySlots.length === 0 && (
                  <p className="text-xs text-muted-foreground italic">Aucun créneau</p>
                )}
                {daySlots.map(slot => (
                  <div
                    key={toKey(slot)}
                    className="flex items-center justify-between rounded bg-primary/10 px-2 py-1"
                  >
                    <span className="text-sm font-mono font-medium">
                      {fmt(slot.hour, slot.minute)}
                    </span>
                    <button
                      onClick={() => removeSlot(slot)}
                      className="text-muted-foreground hover:text-destructive transition-colors ml-2"
                    >
                      <Trash2 className="h-3 w-3" />
                    </button>
                  </div>
                ))}
                {/* Ajout d'un créneau */}
                <div className="flex items-center gap-1 pt-1">
                  <input
                    type="time"
                    value={newTimes[day.value] ?? '06:00'}
                    onChange={e => setNewTimes(p => ({ ...p, [day.value]: e.target.value }))}
                    className="flex-1 min-w-0 rounded border border-input bg-background px-2 py-1 text-xs font-mono focus:outline-none focus:ring-1 focus:ring-ring"
                  />
                  <button
                    onClick={() => addSlot(day.value)}
                    className="rounded bg-primary/10 p-1 hover:bg-primary/20 transition-colors"
                  >
                    <Plus className="h-3 w-3" />
                  </button>
                </div>
              </CardContent>
            </Card>
          )
        })}
      </div>
      <p className="text-xs text-muted-foreground">
        À chaque créneau, le service lance le scraping de toutes les sources actives puis génère les résumés IA.
      </p>
    </div>
  )
}

View File

@ -15,8 +15,7 @@
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"baseUrl": ".",
"paths": { "@/*": ["src/*"] }
"paths": { "@/*": ["./src/*"] }
},
"include": ["src"]
}

View File

@ -0,0 +1,46 @@
# Scraper microservice image: Node 20 plus a system Chromium for Puppeteer.
FROM node:20-slim

# Install the distro Chromium and the shared libraries it needs to run
# headless (fonts, X11/GTK/NSS stacks). --no-install-recommends keeps the
# image small; the apt lists are removed in the same layer.
RUN apt-get update && apt-get install -y \
    chromium \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libexpat1 \
    libfontconfig1 \
    libgbm1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    libx11-6 \
    libx11-xcb1 \
    libxcb1 \
    libxcomposite1 \
    libxcursor1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxi6 \
    libxrandr2 \
    libxrender1 \
    libxss1 \
    libxtst6 \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

# Use the distro Chromium instead of letting Puppeteer download its own
# browser at npm-install time.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium

WORKDIR /app

# Install production dependencies first so this layer stays cached until
# package*.json changes.
COPY package*.json ./
RUN npm install --omit=dev

COPY . .

EXPOSE 3001
CMD ["node", "index.js"]

205
scraper-service/index.js Normal file
View File

@ -0,0 +1,205 @@
// Standalone scraping microservice: wraps puppeteer-extra with the stealth
// plugin so the Go backend can delegate browser-based scraping over HTTP.
const express = require('express')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

// The stealth plugin patches common headless-detection vectors.
puppeteer.use(StealthPlugin())

const app = express()
app.use(express.json())

// Chromium binary path (set in the Dockerfile) and HTTP listen port.
const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
const PORT = process.env.PORT || 3001
// launchBrowser starts a headless Chromium with flags suited to running
// inside a container (no sandbox, /dev/shm workaround) and with the
// automation fingerprint reduced. Returns the puppeteer launch promise.
function launchBrowser() {
  const launchArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--window-size=1920,1080',
    '--disable-blink-features=AutomationControlled',
  ]
  return puppeteer.launch({
    executablePath: CHROME_PATH,
    headless: true,
    args: launchArgs,
  })
}
// tryClick attempts to click the first selector that resolves to an element.
// When no selector can be clicked, it falls back to pressing Enter.
// Returns true only when an actual click landed.
async function tryClick(page, selectors) {
  for (const selector of selectors) {
    try {
      const handle = await page.$(selector)
      if (handle) {
        await handle.click()
        return true
      }
    } catch {
      // Element vanished or was not clickable — try the next candidate.
    }
  }
  await page.keyboard.press('Enter')
  return false
}
// tryType waits (up to 4s each) for the first selector that appears and
// types `text` into it with a human-ish 60ms key delay.
// Returns true on success, false when no candidate selector showed up.
async function tryType(page, selectors, text) {
  for (const selector of selectors) {
    const ok = await page
      .waitForSelector(selector, { timeout: 4000 })
      .then(() => page.type(selector, text, { delay: 60 }))
      .then(() => true)
      .catch(() => false)
    if (ok) return true
  }
  return false
}
// Liveness probe used by docker-compose / the backend to check the service.
app.get('/health', (_, res) => res.json({ ok: true }))
// POST /bloomberg/scrape — logs into Bloomberg with the supplied credentials,
// then harvests headline links from a few section pages.
// Body: { username, password }. Response: { articles: [{ title, url }] }.
// NOTE(review): the login flow is best-effort against Bloomberg's current
// markup; the debug dumps below exist to diagnose selector drift.
app.post('/bloomberg/scrape', async (req, res) => {
  const { username, password } = req.body || {}
  if (!username || !password) {
    return res.status(400).json({ error: 'username and password required' })
  }
  let browser
  try {
    browser = await launchBrowser()
    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    // Hide automation signals
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
      window.chrome = { runtime: {} }
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] })
    })
    console.log('[bloomberg] navigating to login page')
    await page.goto('https://www.bloomberg.com/account/signin', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    })
    await new Promise(r => setTimeout(r, 2000))
    // Debug: dump the page state before looking for the email input.
    const pageInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null
      }))
    )
    console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
    const pageTitle = await page.title()
    console.log('[bloomberg] page title:', pageTitle)
    console.log('[bloomberg] entering email')
    // Selector candidates ordered from most to least specific.
    const emailSelectors = [
      '#email-form-input',
      'input[id="email-form-input"]',
      'input[type="email"]',
      'input[name="text-input"]',
      'input[placeholder*="email" i]',
    ]
    const emailOk = await tryType(page, emailSelectors, username)
    if (!emailOk) throw new Error('could not find email input')
    await new Promise(r => setTimeout(r, 800))
    // Click submit via in-page JS to work around disabled buttons.
    const submitted = await page.evaluate(() => {
      const btns = Array.from(document.querySelectorAll('button'))
      const btn = btns.find(b =>
        b.type === 'submit' ||
        /continue|next|sign.?in/i.test(b.textContent)
      )
      if (btn) { btn.click(); return true }
      const form = document.querySelector('form')
      if (form) { form.submit(); return true }
      return false
    })
    if (!submitted) await page.keyboard.press('Enter')
    // Wait for the page to change (password input appears or navigation).
    try {
      await page.waitForFunction(
        () => document.querySelector('input[type="password"]') !== null,
        { timeout: 10000 }
      )
    } catch {
      // Fall back to a fixed delay when the password field never shows up.
      await new Promise(r => setTimeout(r, 3000))
    }
    console.log('[bloomberg] after email submit, url:', page.url())
    // Debug: available inputs after the email step.
    const allInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder
      }))
    )
    console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs))
    console.log('[bloomberg] entering password')
    const pwdOk = await tryType(page, [
      'input[type="password"]',
      'input[name="password"]',
      'input[autocomplete="current-password"]',
      'input[autocomplete="password"]',
    ], password)
    if (!pwdOk) throw new Error('could not find password input — check logs above for available inputs')
    await new Promise(r => setTimeout(r, 500))
    await tryClick(page, ['button[type="submit"]', 'input[type="submit"]'])
    await new Promise(r => setTimeout(r, 3000))
    const currentURL = page.url()
    console.log('[bloomberg] after login, url:', currentURL)
    // Section pages to harvest headlines from once logged in.
    const pages = [
      'https://www.bloomberg.com/markets',
      'https://www.bloomberg.com/technology',
      'https://www.bloomberg.com/economics',
    ]
    const articles = []
    const seen = new Set()
    for (const url of pages) {
      try {
        console.log('[bloomberg] scraping', url)
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 })
        await new Promise(r => setTimeout(r, 2000))
        // Collect headline links in-page: dedupe per page, require a
        // plausible title length, cap at 25 per section.
        const items = await page.evaluate(() => {
          const results = []
          const seen = new Set()
          const links = document.querySelectorAll(
            'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]'
          )
          links.forEach(a => {
            if (seen.has(a.href)) return
            seen.add(a.href)
            const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]')
            const text = titleEl ? titleEl.innerText.trim() : a.innerText.trim()
            if (text.length > 20 && a.href.includes('bloomberg.com')) {
              results.push({ title: text, url: a.href })
            }
          })
          return results.slice(0, 25)
        })
        // Dedupe across sections before appending.
        for (const item of items) {
          if (!seen.has(item.url) && item.title && item.url) {
            seen.add(item.url)
            articles.push(item)
          }
        }
        console.log('[bloomberg]', url, '->', items.length, 'articles')
      } catch (e) {
        // A failing section must not abort the whole scrape run.
        console.error('[bloomberg] error on', url, ':', e.message)
      }
    }
    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (e) {
    console.error('[bloomberg] scrape error:', e.message)
    res.status(500).json({ error: e.message })
  } finally {
    // Always release the browser, even when the scrape failed part-way.
    if (browser) await browser.close()
  }
})
// Start the HTTP API; docker-compose exposes this on the internal network only.
app.listen(PORT, () => console.log(`scraper-service listening on :${PORT}`))

View File

@ -0,0 +1,14 @@
{
"name": "tradarr-scraper-service",
"version": "1.0.0",
"main": "index.js",
"scripts": {
"start": "node index.js"
},
"dependencies": {
"express": "^4.19.2",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer": "^22.0.0"
}
}