feat: add news sources and split the AI reasoning into 2 steps to limit the number of news articles

This commit is contained in:
2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions

View File

@ -15,6 +15,8 @@ import (
"github.com/tradarr/backend/internal/scheduler"
"github.com/tradarr/backend/internal/scraper"
"github.com/tradarr/backend/internal/scraper/bloomberg"
"github.com/tradarr/backend/internal/scraper/reuters"
"github.com/tradarr/backend/internal/scraper/watcherguru"
"github.com/tradarr/backend/internal/scraper/yahoofinance"
)
@ -38,30 +40,23 @@ func main() {
enc := crypto.New(cfg.EncryptionKey)
pipeline := ai.NewPipeline(repo, enc)
// Créer le compte admin initial si nécessaire
if err := ensureAdmin(repo, cfg); err != nil {
log.Printf("ensure admin: %v", err)
}
// Configurer les scrapers
registry := scraper.NewRegistry(repo)
registry.Register(bloomberg.NewDynamic(repo, enc, cfg.ScraperURL))
registry.Register(yahoofinance.New())
registry.Register(reuters.New())
registry.Register(watcherguru.New())
// Bloomberg (credentials chargés depuis la DB à chaque run)
bbScraper := bloomberg.NewDynamic(repo, enc, cfg.ChromePath)
registry.Register(bbScraper)
stScraper := yahoofinance.New()
registry.Register(stScraper)
// Scheduler
sched := scheduler.New(registry, pipeline, repo)
if err := sched.Start(); err != nil {
log.Printf("scheduler: %v", err)
}
defer sched.Stop()
// API
h := handlers.New(repo, cfg, enc, registry, pipeline)
h := handlers.New(repo, cfg, enc, registry, pipeline, sched)
r := api.SetupRouter(h, cfg.JWTSecret)
addr := fmt.Sprintf(":%s", cfg.Port)

View File

@ -3,6 +3,7 @@ package ai
import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"
@ -31,7 +32,6 @@ func NewPipeline(repo *models.Repository, enc *crypto.Encryptor) *Pipeline {
return &Pipeline{repo: repo, enc: enc}
}
// BuildProvider instancie un provider à partir de ses paramètres
func (p *Pipeline) BuildProvider(name, apiKey, endpoint string) (Provider, error) {
provider, err := p.repo.GetActiveAIProvider()
if err != nil {
@ -44,9 +44,7 @@ func (p *Pipeline) BuildProvider(name, apiKey, endpoint string) (Provider, error
return NewProvider(name, apiKey, model, endpoint)
}
// GenerateForUser génère un résumé personnalisé pour un utilisateur
func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.Summary, error) {
// Récupérer le provider actif
providerCfg, err := p.repo.GetActiveAIProvider()
if err != nil {
return nil, fmt.Errorf("get active provider: %w", err)
@ -68,7 +66,6 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
return nil, fmt.Errorf("build provider: %w", err)
}
// Récupérer la watchlist de l'utilisateur (pour le contexte IA uniquement)
assets, err := p.repo.GetUserAssets(userID)
if err != nil {
return nil, fmt.Errorf("get user assets: %w", err)
@ -78,7 +75,6 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
symbols[i] = a.Symbol
}
// Récupérer TOUS les articles récents, toutes sources confondues
hoursStr, _ := p.repo.GetSetting("articles_lookback_hours")
hours, _ := strconv.Atoi(hoursStr)
if hours == 0 {
@ -98,16 +94,21 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
if maxArticles == 0 {
maxArticles = 50
}
// Passe 1 : filtrage par pertinence sur les titres si trop d'articles
if len(articles) > maxArticles {
articles = articles[:maxArticles]
fmt.Printf("pipeline: %d articles → filtering to %d via AI\n", len(articles), maxArticles)
articles = p.filterByRelevance(ctx, provider, symbols, articles, maxArticles)
fmt.Printf("pipeline: %d articles retained after filtering\n", len(articles))
}
systemPrompt, _ := p.repo.GetSetting("ai_system_prompt")
if systemPrompt == "" {
systemPrompt = DefaultSystemPrompt
}
prompt := buildPrompt(systemPrompt, symbols, articles)
// Passe 2 : résumé complet
prompt := buildPrompt(systemPrompt, symbols, articles)
summary, err := provider.Summarize(ctx, prompt)
if err != nil {
return nil, fmt.Errorf("AI summarize: %w", err)
@ -116,7 +117,77 @@ func (p *Pipeline) GenerateForUser(ctx context.Context, userID string) (*models.
return p.repo.CreateSummary(userID, summary, &providerCfg.ID)
}
// GenerateForAll génère les résumés pour tous les utilisateurs ayant une watchlist
// filterByRelevance asks the AI to select the most relevant articles by
// sending only their titles (very short prompt = fast call). On any failure
// (AI error or an unparseable response) it degrades gracefully to plain
// truncation so the pipeline keeps working. At most max articles are returned.
func (p *Pipeline) filterByRelevance(ctx context.Context, provider Provider, symbols []string, articles []models.Article, max int) []models.Article {
	// Defensive clamp: callers currently only invoke this when
	// len(articles) > max, but clamping keeps the truncation fallbacks
	// below panic-free for any caller.
	if max > len(articles) {
		max = len(articles)
	}
	prompt := buildFilterPrompt(symbols, articles, max)
	response, err := provider.Summarize(ctx, prompt)
	if err != nil {
		fmt.Printf("pipeline: filter AI call failed (%v), falling back to truncation\n", err)
		return articles[:max]
	}
	// parseIndexArray already drops out-of-range and duplicate indices.
	indices := parseIndexArray(response, len(articles))
	if len(indices) == 0 {
		fmt.Printf("pipeline: could not parse filter response, falling back to truncation\n")
		return articles[:max]
	}
	filtered := make([]models.Article, 0, len(indices))
	for _, i := range indices {
		filtered = append(filtered, articles[i])
		if len(filtered) >= max {
			break
		}
	}
	return filtered
}
// buildFilterPrompt builds a short, title-only prompt (in French, matching the
// product's language) asking the model to return the indices of the most
// relevant articles as a bare JSON array, e.g. [0, 3, 7, 12].
func buildFilterPrompt(symbols []string, articles []models.Article, max int) string {
	var sb strings.Builder
	sb.WriteString("Tu es un assistant de trading financier. ")
	sb.WriteString(fmt.Sprintf("Parmi les %d articles ci-dessous, sélectionne les %d plus pertinents pour un trader actif.\n", len(articles), max))
	if len(symbols) > 0 {
		sb.WriteString("Actifs surveillés (priorité haute) : ")
		sb.WriteString(strings.Join(symbols, ", "))
		sb.WriteString("\n")
	}
	// Plain WriteString: the previous fmt.Sprintf had no verbs or arguments
	// (flagged by go vet / staticcheck S1039).
	sb.WriteString("\nRéponds UNIQUEMENT avec un tableau JSON des indices sélectionnés (base 0), exemple : [0, 3, 7, 12]\n")
	sb.WriteString("N'ajoute aucun texte avant ou après le tableau JSON.\n\n")
	sb.WriteString("Articles :\n")
	for i, a := range articles {
		sb.WriteString(fmt.Sprintf("[%d] %s (%s)\n", i, a.Title, a.SourceName))
	}
	return sb.String()
}
// jsonArrayRe matches the first bracketed run of digits, whitespace and
// commas, e.g. "[0, 3, 7]".
var jsonArrayRe = regexp.MustCompile(`\[[\d\s,]+\]`)

// parseIndexArray extracts a JSON-like integer array from an AI response and
// returns the valid indices in order of first appearance. Values that fail to
// parse, fall outside [0, maxIndex), or repeat are dropped. Returns nil when
// no array is present or nothing valid remains.
func parseIndexArray(response string, maxIndex int) []int {
	raw := jsonArrayRe.FindString(response)
	if raw == "" {
		return nil
	}
	tokens := strings.Split(strings.Trim(raw, "[]"), ",")
	seen := make(map[int]bool, len(tokens))
	var indices []int
	for _, tok := range tokens {
		idx, err := strconv.Atoi(strings.TrimSpace(tok))
		if err != nil || idx < 0 || idx >= maxIndex || seen[idx] {
			continue
		}
		seen[idx] = true
		indices = append(indices, idx)
	}
	return indices
}
func (p *Pipeline) GenerateForAll(ctx context.Context) error {
users, err := p.repo.ListUsers()
if err != nil {

View File

@ -287,6 +287,45 @@ func (h *Handler) UpdateSettings(c *gin.Context) {
httputil.OK(c, gin.H{"ok": true})
}
// ── Schedule ───────────────────────────────────────────────────────────────
// GetSchedule returns every configured scraping schedule slot
// (GET /admin/schedule).
func (h *Handler) GetSchedule(c *gin.Context) {
	slots, err := h.repo.ListScheduleSlots()
	if err != nil {
		httputil.InternalError(c, err)
		return
	}
	httputil.OK(c, slots)
}
// scheduleRequest is the JSON payload of PUT /admin/schedule: the full set of
// weekly time slots that replaces the current schedule.
type scheduleRequest struct {
	Slots []struct {
		DayOfWeek int `json:"day_of_week"` // 0-6, see models.ScheduleSlot
		Hour      int `json:"hour"`
		Minute    int `json:"minute"`
	} `json:"slots"`
}
// UpdateSchedule replaces the entire scraping schedule with the slots from the
// request body, then reloads the scheduler so the new cron entries take effect
// immediately.
func (h *Handler) UpdateSchedule(c *gin.Context) {
	var req scheduleRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		httputil.BadRequest(c, err)
		return
	}
	slots := make([]models.ScheduleSlot, len(req.Slots))
	for i, s := range req.Slots {
		slots[i] = models.ScheduleSlot{DayOfWeek: s.DayOfWeek, Hour: s.Hour, Minute: s.Minute}
	}
	if err := h.repo.ReplaceSchedule(slots); err != nil {
		httputil.InternalError(c, err)
		return
	}
	// Reload failure is logged but not surfaced as an HTTP error: the schedule
	// is already persisted, so the request itself succeeded.
	if err := h.scheduler.Reload(); err != nil {
		fmt.Printf("schedule reload: %v\n", err)
	}
	httputil.OK(c, gin.H{"ok": true})
}
func (h *Handler) GetDefaultSystemPrompt(c *gin.Context) {
httputil.OK(c, gin.H{"prompt": ai.DefaultSystemPrompt})
}

View File

@ -5,6 +5,7 @@ import (
"github.com/tradarr/backend/internal/config"
"github.com/tradarr/backend/internal/crypto"
"github.com/tradarr/backend/internal/models"
"github.com/tradarr/backend/internal/scheduler"
"github.com/tradarr/backend/internal/scraper"
)
@ -14,6 +15,7 @@ type Handler struct {
enc *crypto.Encryptor
registry *scraper.Registry
pipeline *ai.Pipeline
scheduler *scheduler.Scheduler
}
func New(
@ -22,12 +24,14 @@ func New(
enc *crypto.Encryptor,
registry *scraper.Registry,
pipeline *ai.Pipeline,
sched *scheduler.Scheduler,
) *Handler {
return &Handler{
repo: repo,
cfg: cfg,
enc: enc,
registry: registry,
pipeline: pipeline,
repo: repo,
cfg: cfg,
enc: enc,
registry: registry,
pipeline: pipeline,
scheduler: sched,
}
}

View File

@ -65,6 +65,9 @@ func SetupRouter(h *handlers.Handler, jwtSecret string) *gin.Engine {
admin.PUT("/settings", h.UpdateSettings)
admin.GET("/settings/default-prompt", h.GetDefaultSystemPrompt)
admin.GET("/schedule", h.GetSchedule)
admin.PUT("/schedule", h.UpdateSchedule)
admin.GET("/users", h.ListUsers)
admin.PUT("/users/:id", h.UpdateAdminUser)
admin.DELETE("/users/:id", h.DeleteAdminUser)

View File

@ -7,13 +7,13 @@ import (
)
type Config struct {
DatabaseURL string
JWTSecret string
EncryptionKey []byte
Port string
ChromePath string
AdminEmail string
AdminPassword string
DatabaseURL string
JWTSecret string
EncryptionKey []byte
Port string
ScraperURL string
AdminEmail string
AdminPassword string
}
func Load() (*Config, error) {
@ -41,12 +41,17 @@ func Load() (*Config, error) {
port = "8080"
}
scraperURL := os.Getenv("SCRAPER_URL")
if scraperURL == "" {
scraperURL = "http://scraper:3001"
}
return &Config{
DatabaseURL: dbURL,
JWTSecret: jwtSecret,
EncryptionKey: encKey,
Port: port,
ChromePath: os.Getenv("CHROME_PATH"),
ScraperURL: scraperURL,
AdminEmail: os.Getenv("ADMIN_EMAIL"),
AdminPassword: os.Getenv("ADMIN_PASSWORD"),
}, nil

View File

@ -21,7 +21,7 @@ CREATE TABLE user_assets (
CREATE TABLE sources (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name TEXT NOT NULL,
type TEXT NOT NULL CHECK (type IN ('bloomberg', 'stocktwits')),
type TEXT NOT NULL CHECK (type IN ('bloomberg', 'stocktwits', 'reuters', 'watcherguru')),
enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
@ -97,7 +97,7 @@ CREATE INDEX idx_user_assets_user_id ON user_assets(user_id);
-- Sources initiales
INSERT INTO sources (name, type, enabled) VALUES
('Bloomberg', 'bloomberg', TRUE),
('StockTwits', 'stocktwits', TRUE);
('Yahoo Finance', 'stocktwits', TRUE);
-- Paramètres par défaut
INSERT INTO settings (key, value) VALUES

View File

@ -0,0 +1 @@
DELETE FROM sources WHERE type IN ('reuters', 'watcherguru');

View File

@ -0,0 +1,4 @@
INSERT INTO sources (name, type, enabled) VALUES
('Reuters', 'reuters', true),
('Watcher.Guru', 'watcherguru', true)
ON CONFLICT DO NOTHING;

View File

@ -0,0 +1 @@
DROP TABLE IF EXISTS scrape_schedules;

View File

@ -0,0 +1,17 @@
CREATE TABLE scrape_schedules (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
day_of_week SMALLINT NOT NULL CHECK (day_of_week BETWEEN 0 AND 6),
hour SMALLINT NOT NULL CHECK (hour BETWEEN 0 AND 23),
minute SMALLINT NOT NULL DEFAULT 0 CHECK (minute BETWEEN 0 AND 59),
UNIQUE (day_of_week, hour, minute)
);
-- Planning par défaut : lun-ven à 6h et 15h, week-end à 6h uniquement
INSERT INTO scrape_schedules (day_of_week, hour, minute) VALUES
(1, 6, 0), (1, 15, 0),
(2, 6, 0), (2, 15, 0),
(3, 6, 0), (3, 15, 0),
(4, 6, 0), (4, 15, 0),
(5, 6, 0), (5, 15, 0),
(6, 6, 0),
(0, 6, 0);

View File

@ -97,3 +97,10 @@ type Setting struct {
Key string `json:"key"`
Value string `json:"value"`
}
// ScheduleSlot is one weekly scraping time slot; the scheduler registers one
// cron entry per slot.
type ScheduleSlot struct {
	ID        string `json:"id"`
	DayOfWeek int    `json:"day_of_week"` // 0=Sunday, 1=Monday ... 6=Saturday
	Hour      int    `json:"hour"`        // 0-23 (enforced by DB CHECK)
	Minute    int    `json:"minute"`      // 0-59 (enforced by DB CHECK)
}

View File

@ -520,6 +520,51 @@ func (r *Repository) SetSetting(key, value string) error {
return err
}
// ── Schedule ───────────────────────────────────────────────────────────────
// ListScheduleSlots returns all scraping schedule slots ordered by day of
// week, then hour, then minute.
func (r *Repository) ListScheduleSlots() ([]ScheduleSlot, error) {
	rows, err := r.db.Query(`
SELECT id, day_of_week, hour, minute FROM scrape_schedules
ORDER BY day_of_week, hour, minute`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var slots []ScheduleSlot
	for rows.Next() {
		var s ScheduleSlot
		if err := rows.Scan(&s.ID, &s.DayOfWeek, &s.Hour, &s.Minute); err != nil {
			return nil, err
		}
		slots = append(slots, s)
	}
	// rows.Next() reports mid-iteration failures (e.g. a dropped connection)
	// only through rows.Err(); the original silently ignored them.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return slots, nil
}
// ReplaceSchedule atomically swaps the whole scraping schedule: existing slots
// are deleted and the provided ones inserted inside a single transaction.
// Duplicate (day, hour, minute) triples are collapsed by ON CONFLICT.
func (r *Repository) ReplaceSchedule(slots []ScheduleSlot) error {
	tx, err := r.db.Begin()
	if err != nil {
		return err
	}
	// No-op once Commit succeeds; undoes everything on any early return.
	defer tx.Rollback()
	if _, err = tx.Exec(`DELETE FROM scrape_schedules`); err != nil {
		return err
	}
	insert := `INSERT INTO scrape_schedules (day_of_week, hour, minute) VALUES ($1, $2, $3)
ON CONFLICT (day_of_week, hour, minute) DO NOTHING`
	for _, slot := range slots {
		if _, err = tx.Exec(insert, slot.DayOfWeek, slot.Hour, slot.Minute); err != nil {
			return err
		}
	}
	return tx.Commit()
}
// ── Settings ───────────────────────────────────────────────────────────────
func (r *Repository) ListSettings() ([]Setting, error) {
rows, err := r.db.Query(`SELECT key, value FROM settings ORDER BY key`)
if err != nil {

View File

@ -3,7 +3,6 @@ package scheduler
import (
"context"
"fmt"
"strconv"
"github.com/robfig/cron/v3"
"github.com/tradarr/backend/internal/ai"
@ -16,7 +15,7 @@ type Scheduler struct {
registry *scraper.Registry
pipeline *ai.Pipeline
repo *models.Repository
entryID cron.EntryID
entryIDs []cron.EntryID
}
func New(registry *scraper.Registry, pipeline *ai.Pipeline, repo *models.Repository) *Scheduler {
@ -29,19 +28,10 @@ func New(registry *scraper.Registry, pipeline *ai.Pipeline, repo *models.Reposit
}
func (s *Scheduler) Start() error {
interval, err := s.getInterval()
if err != nil {
if err := s.loadSchedule(); err != nil {
return err
}
spec := fmt.Sprintf("@every %dm", interval)
s.entryID, err = s.cron.AddFunc(spec, s.run)
if err != nil {
return fmt.Errorf("add cron: %w", err)
}
s.cron.Start()
fmt.Printf("scheduler started, running every %d minutes\n", interval)
return nil
}
@ -50,39 +40,46 @@ func (s *Scheduler) Stop() {
}
func (s *Scheduler) Reload() error {
s.cron.Remove(s.entryID)
interval, err := s.getInterval()
if err != nil {
return err
for _, id := range s.entryIDs {
s.cron.Remove(id)
}
spec := fmt.Sprintf("@every %dm", interval)
s.entryID, err = s.cron.AddFunc(spec, s.run)
return err
s.entryIDs = nil
return s.loadSchedule()
}
// loadSchedule reads the slot table and registers one cron entry per slot.
// An empty table disables scraping entirely; an individually invalid spec is
// logged and skipped instead of aborting the whole load.
func (s *Scheduler) loadSchedule() error {
	slots, err := s.repo.ListScheduleSlots()
	if err != nil {
		return fmt.Errorf("load schedule: %w", err)
	}
	if len(slots) == 0 {
		fmt.Println("scheduler: no schedule configured, scraping disabled")
		return nil
	}
	for _, slot := range slots {
		// Cron layout: "minute hour * * day_of_week".
		spec := fmt.Sprintf("%d %d * * %d", slot.Minute, slot.Hour, slot.DayOfWeek)
		entry, addErr := s.cron.AddFunc(spec, s.run)
		if addErr != nil {
			fmt.Printf("scheduler: invalid cron spec %q: %v\n", spec, addErr)
			continue
		}
		s.entryIDs = append(s.entryIDs, entry)
	}
	fmt.Printf("scheduler: %d time slots loaded\n", len(s.entryIDs))
	return nil
}
func (s *Scheduler) run() {
fmt.Println("scheduler: running scraping cycle")
fmt.Println("scheduler: starting scraping cycle")
if err := s.registry.RunAll(); err != nil {
fmt.Printf("scheduler scrape error: %v\n", err)
return
}
fmt.Println("scheduler: running AI summaries")
fmt.Println("scheduler: starting AI summaries")
if err := s.pipeline.GenerateForAll(context.Background()); err != nil {
fmt.Printf("scheduler summary error: %v\n", err)
}
}
// getInterval returns the configured scrape interval in minutes, defaulting
// to 60 whenever the setting is missing, empty, non-numeric, or below 1.
// It deliberately never returns an error: a bad setting means "use default".
func (s *Scheduler) getInterval() (int, error) {
	const fallback = 60
	raw, err := s.repo.GetSetting("scrape_interval_minutes")
	if err != nil || raw == "" {
		return fallback, nil
	}
	minutes, convErr := strconv.Atoi(raw)
	if convErr != nil || minutes < 1 {
		return fallback, nil
	}
	return minutes, nil
}

View File

@ -1,206 +1,94 @@
package bloomberg
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/chromedp/chromedp"
"github.com/tradarr/backend/internal/scraper"
)
type Bloomberg struct {
username string
password string
chromePath string
scraperURL string
client *http.Client
}
func New(username, password, chromePath string) *Bloomberg {
return &Bloomberg{username: username, password: password, chromePath: chromePath}
func New(scraperURL string) *Bloomberg {
if scraperURL == "" {
scraperURL = "http://scraper:3001"
}
return &Bloomberg{
scraperURL: scraperURL,
client: &http.Client{Timeout: 10 * time.Minute},
}
}
func (b *Bloomberg) Name() string { return "bloomberg" }
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
if b.username == "" || b.password == "" {
return nil, fmt.Errorf("bloomberg credentials not configured")
}
opts := []chromedp.ExecAllocatorOption{
chromedp.NoFirstRun,
chromedp.NoDefaultBrowserCheck,
chromedp.Headless,
chromedp.DisableGPU,
chromedp.Flag("no-sandbox", true),
chromedp.Flag("disable-setuid-sandbox", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-blink-features", "AutomationControlled"),
chromedp.Flag("disable-infobars", true),
chromedp.Flag("window-size", "1920,1080"),
chromedp.Flag("ignore-certificate-errors", true),
chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
}
if b.chromePath != "" {
opts = append(opts, chromedp.ExecPath(b.chromePath))
}
allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
defer cancelAlloc()
chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
defer cancelChrome()
timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
defer cancelTimeout()
if err := b.login(timeoutCtx); err != nil {
return nil, fmt.Errorf("bloomberg login: %w", err)
}
var articles []scraper.Article
pages := []string{
"https://www.bloomberg.com/markets",
"https://www.bloomberg.com/technology",
"https://www.bloomberg.com/economics",
}
for _, u := range pages {
pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
if err != nil {
fmt.Printf("bloomberg scrape %s: %v\n", u, err)
continue
}
articles = append(articles, pageArticles...)
}
fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
return articles, nil
type scraperRequest struct {
Username string `json:"username"`
Password string `json:"password"`
}
func (b *Bloomberg) login(ctx context.Context) error {
loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
defer cancel()
// Masquer la détection d'automation via JS
if err := chromedp.Run(loginCtx,
chromedp.ActionFunc(func(ctx context.Context) error {
return chromedp.Evaluate(`
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = { runtime: {} };
`, nil).Do(ctx)
}),
); err != nil {
fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
}
err := chromedp.Run(loginCtx,
chromedp.Navigate("https://www.bloomberg.com/account/signin"),
chromedp.Sleep(2*time.Second),
// Essayer plusieurs sélecteurs pour l'email
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{
`input[name="email"]`,
`input[type="email"]`,
`input[data-type="email"]`,
`input[placeholder*="email" i]`,
`input[placeholder*="mail" i]`,
}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
fmt.Printf("bloomberg: using email selector: %s\n", sel)
return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
}
}
return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
}),
chromedp.Sleep(500*time.Millisecond),
// Submit email
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
}
}
// Fallback: press Enter
return chromedp.KeyEvent("\r").Do(ctx)
}),
chromedp.Sleep(2*time.Second),
// Password
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`input[type="password"]`, `input[name="password"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
fmt.Printf("bloomberg: using password selector: %s\n", sel)
return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
}
}
return fmt.Errorf("could not find password input")
}),
chromedp.Sleep(500*time.Millisecond),
chromedp.ActionFunc(func(ctx context.Context) error {
selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
for _, sel := range selectors {
var count int
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
}
}
return chromedp.KeyEvent("\r").Do(ctx)
}),
chromedp.Sleep(3*time.Second),
)
return err
type scraperArticle struct {
Title string `json:"title"`
URL string `json:"url"`
}
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
type scraperResponse struct {
Articles []scraperArticle `json:"articles"`
Error string `json:"error,omitempty"`
}
var articleNodes []map[string]string
err := chromedp.Run(pageCtx,
chromedp.Navigate(pageURL),
chromedp.Sleep(3*time.Second),
chromedp.Evaluate(`
(function() {
var items = [];
var seen = new Set();
var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
links.forEach(function(a) {
if (seen.has(a.href)) return;
seen.add(a.href);
var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
var text = title ? title.innerText.trim() : a.innerText.trim();
if (text.length > 20 && a.href.includes('bloomberg.com')) {
items.push({title: text, url: a.href});
}
});
return items.slice(0, 25);
})()
`, &articleNodes),
)
func (b *Bloomberg) ScrapeWithCredentials(ctx context.Context, username, password string, symbols []string) ([]scraper.Article, error) {
payload, _ := json.Marshal(scraperRequest{Username: username, Password: password})
req, err := http.NewRequestWithContext(ctx, http.MethodPost, b.scraperURL+"/bloomberg/scrape", bytes.NewReader(payload))
if err != nil {
return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
return nil, err
}
req.Header.Set("Content-Type", "application/json")
resp, err := b.client.Do(req)
if err != nil {
return nil, fmt.Errorf("scraper service unreachable: %w", err)
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("scraper service HTTP %d: %s", resp.StatusCode, body)
}
var result scraperResponse
if err := json.Unmarshal(body, &result); err != nil {
return nil, fmt.Errorf("parse scraper response: %w", err)
}
if result.Error != "" {
return nil, fmt.Errorf("bloomberg: %s", result.Error)
}
var articles []scraper.Article
now := time.Now()
for _, node := range articleNodes {
title := strings.TrimSpace(node["title"])
url := node["url"]
if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
var articles []scraper.Article
for _, a := range result.Articles {
title := strings.TrimSpace(a.Title)
url := a.URL
if title == "" || url == "" {
continue
}
syms := scraper.DetectSymbols(title, symbols)
articles = append(articles, scraper.Article{
Title: title,
Content: title, // contenu minimal — l'article complet nécessite un accès payant
Content: title,
URL: url,
PublishedAt: &now,
Symbols: syms,
})
}
fmt.Printf("bloomberg: %d articles fetched\n", len(articles))
return articles, nil
}

View File

@ -9,21 +9,19 @@ import (
"github.com/tradarr/backend/internal/scraper"
)
// DynamicBloomberg charge les credentials depuis la DB avant chaque scraping
type DynamicBloomberg struct {
repo *models.Repository
enc *crypto.Encryptor
chromePath string
scraperURL string
}
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
return &DynamicBloomberg{repo: repo, enc: enc, chromePath: chromePath}
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, scraperURL string) *DynamicBloomberg {
return &DynamicBloomberg{repo: repo, enc: enc, scraperURL: scraperURL}
}
func (d *DynamicBloomberg) Name() string { return "bloomberg" }
func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
// Récupérer la source Bloomberg
source, err := d.repo.GetSourceByType("bloomberg")
if err != nil || source == nil {
return nil, fmt.Errorf("bloomberg source not found")
@ -34,7 +32,7 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
return nil, fmt.Errorf("get bloomberg credentials: %w", err)
}
if cred == nil || cred.Username == "" {
return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
return nil, fmt.Errorf("bloomberg credentials not configured — configure them in the admin panel")
}
password := ""
@ -45,6 +43,6 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
}
}
b := New(cred.Username, password, d.chromePath)
return b.Scrape(ctx, symbols)
b := New(d.scraperURL)
return b.ScrapeWithCredentials(ctx, cred.Username, password, symbols)
}

View File

@ -0,0 +1,129 @@
package reuters
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"github.com/tradarr/backend/internal/scraper"
)
// Reuters RSS est bloqué par Cloudflare. On utilise des flux RSS financiers
// publics fiables à la place : MarketWatch, CNBC, Seeking Alpha.
var feeds = []struct {
name string
url string
}{
{"MarketWatch Top Stories", "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
{"MarketWatch Markets", "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
{"CNBC Top News", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
{"CNBC Finance", "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
type Reuters struct {
client *http.Client
}
func New() *Reuters {
return &Reuters{client: &http.Client{Timeout: 15 * time.Second}}
}
func (r *Reuters) Name() string { return "reuters" }
type rssFeed struct {
Channel struct {
Items []struct {
Title string `xml:"title"`
Link string `xml:"link"`
Description string `xml:"description"`
PubDate string `xml:"pubDate"`
} `xml:"item"`
} `xml:"channel"`
}
// Scrape fetches every configured financial RSS feed and merges the results,
// de-duplicating articles by URL across feeds. A failing feed is logged and
// skipped. The symbols argument is ignored: these are broad market feeds.
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	var articles []scraper.Article
	seen := make(map[string]bool)
	for i, feed := range feeds {
		if i > 0 {
			// Short pause between feeds to stay polite with providers; abort
			// promptly (returning what we have so far) if ctx is cancelled.
			select {
			case <-ctx.Done():
				return articles, ctx.Err()
			case <-time.After(300 * time.Millisecond):
			}
		}
		items, err := r.fetchFeed(ctx, feed.url)
		if err != nil {
			fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
			continue
		}
		for _, a := range items {
			if !seen[a.URL] {
				seen[a.URL] = true
				articles = append(articles, a)
			}
		}
		fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
	}
	return articles, nil
}
// fetchFeed downloads one RSS feed and converts its items into articles.
// Items missing a title or link are skipped; an unparseable pubDate simply
// leaves PublishedAt nil rather than failing the whole feed.
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short body excerpt so upstream blocking is diagnosable.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		// RFC 1123 with or without a numeric zone covers typical RSS feeds.
		// (The previous third layout was byte-identical to time.RFC1123Z and
		// therefore redundant — removed.)
		var publishedAt *time.Time
		for _, layout := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(layout, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Description)
		if content == "" {
			content = title
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}

View File

@ -0,0 +1,200 @@
package watcherguru
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"golang.org/x/net/html"
"github.com/tradarr/backend/internal/scraper"
)
const baseURL = "https://watcher.guru"
type WatcherGuru struct {
client *http.Client
}
func New() *WatcherGuru {
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
}
func (w *WatcherGuru) Name() string { return "watcherguru" }
type rssFeed struct {
Channel struct {
Items []struct {
Title string `xml:"title"`
Link string `xml:"link"`
PubDate string `xml:"pubDate"`
Desc string `xml:"description"`
} `xml:"item"`
} `xml:"channel"`
}
// Scrape tries the site's RSS feeds first and falls back to scraping the HTML
// news page when no feed yields articles. The symbols argument is ignored
// (this is a broad news source).
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	// Try RSS feeds first — cheaper and more structured than HTML scraping.
	for _, feedURL := range []string{
		baseURL + "/feed/",
		baseURL + "/news/feed/",
	} {
		articles, err := w.fetchRSS(ctx, feedURL)
		if err == nil && len(articles) > 0 {
			fmt.Printf("watcherguru rss: %d articles\n", len(articles))
			return articles, nil
		}
	}
	// Fallback: HTML scraping of the news index page.
	articles, err := w.scrapeHTML(ctx)
	if err != nil {
		return nil, fmt.Errorf("watcherguru: %w", err)
	}
	fmt.Printf("watcherguru html: %d articles\n", len(articles))
	return articles, nil
}
// fetchRSS downloads and parses one RSS feed URL into articles. Items missing
// a title or link are skipped; an unparseable pubDate leaves PublishedAt nil.
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		// RFC 1123 with or without a numeric zone covers WordPress-style
		// feeds. (The previous third layout was byte-identical to
		// time.RFC1123Z and therefore redundant — removed.)
		var publishedAt *time.Time
		for _, layout := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(layout, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Desc)
		if content == "" {
			content = title
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}
// scrapeHTML is the fallback path when no RSS feed is reachable: it fetches
// the news index page and harvests anchor links that look like articles.
// Scrape time is used as PublishedAt because the listing page carries no
// per-article dates. The result is capped at 40 articles.
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse HTML: %w", err)
	}
	var articles []scraper.Article
	seen := make(map[string]bool)
	now := time.Now()
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			href := attrVal(n, "href")
			// Collect links that look like news articles.
			if href != "" && !seen[href] && (strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru")) {
				text := strings.TrimSpace(nodeText(n))
				if len(text) > 20 {
					url := href
					if !strings.HasPrefix(url, "http") {
						url = baseURL + url
					}
					if !seen[url] {
						seen[url] = true
						articles = append(articles, scraper.Article{
							Title:       text,
							Content:     text,
							URL:         url,
							PublishedAt: &now,
						})
					}
				}
			}
		}
		// Always descend through all children. The previous version
		// short-circuited with walk(n.FirstChild) for already-seen or
		// href-less anchors, which dereferenced nil on childless <a> tags and
		// skipped the first child's siblings. (It also matched "article"
		// elements without ever using them — dead branch removed.)
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	if len(articles) > 40 {
		articles = articles[:40]
	}
	return articles, nil
}
// attrVal returns the value of the named attribute on n, or "" when the
// attribute is absent.
func attrVal(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}
// nodeText concatenates every text node in n's subtree, depth-first.
func nodeText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}
	var out strings.Builder
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		out.WriteString(nodeText(child))
	}
	return out.String()
}

View File

@ -86,8 +86,13 @@ func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scrape
return nil, fmt.Errorf("parse RSS: %w", err)
}
const maxPerSymbol = 5
var articles []scraper.Article
for _, item := range feed.Channel.Items {
if len(articles) >= maxPerSymbol {
break
}
title := strings.TrimSpace(item.Title)
link := strings.TrimSpace(item.Link)
if title == "" || link == "" {