feat: add news sources and split the AI analysis into two steps to limit the number of articles
This commit is contained in:
@ -1,206 +1,94 @@
|
||||
package bloomberg
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/chromedp/chromedp"
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
type Bloomberg struct {
|
||||
username string
|
||||
password string
|
||||
chromePath string
|
||||
scraperURL string
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func New(username, password, chromePath string) *Bloomberg {
|
||||
return &Bloomberg{username: username, password: password, chromePath: chromePath}
|
||||
func New(scraperURL string) *Bloomberg {
|
||||
if scraperURL == "" {
|
||||
scraperURL = "http://scraper:3001"
|
||||
}
|
||||
return &Bloomberg{
|
||||
scraperURL: scraperURL,
|
||||
client: &http.Client{Timeout: 10 * time.Minute},
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the identifier of this scraper, "bloomberg".
func (b *Bloomberg) Name() string {
	return "bloomberg"
}
|
||||
|
||||
func (b *Bloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
|
||||
if b.username == "" || b.password == "" {
|
||||
return nil, fmt.Errorf("bloomberg credentials not configured")
|
||||
}
|
||||
|
||||
opts := []chromedp.ExecAllocatorOption{
|
||||
chromedp.NoFirstRun,
|
||||
chromedp.NoDefaultBrowserCheck,
|
||||
chromedp.Headless,
|
||||
chromedp.DisableGPU,
|
||||
chromedp.Flag("no-sandbox", true),
|
||||
chromedp.Flag("disable-setuid-sandbox", true),
|
||||
chromedp.Flag("disable-dev-shm-usage", true),
|
||||
chromedp.Flag("disable-blink-features", "AutomationControlled"),
|
||||
chromedp.Flag("disable-infobars", true),
|
||||
chromedp.Flag("window-size", "1920,1080"),
|
||||
chromedp.Flag("ignore-certificate-errors", true),
|
||||
chromedp.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"),
|
||||
}
|
||||
if b.chromePath != "" {
|
||||
opts = append(opts, chromedp.ExecPath(b.chromePath))
|
||||
}
|
||||
|
||||
allocCtx, cancelAlloc := chromedp.NewExecAllocator(ctx, opts...)
|
||||
defer cancelAlloc()
|
||||
|
||||
chromeCtx, cancelChrome := chromedp.NewContext(allocCtx)
|
||||
defer cancelChrome()
|
||||
|
||||
timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 5*time.Minute)
|
||||
defer cancelTimeout()
|
||||
|
||||
if err := b.login(timeoutCtx); err != nil {
|
||||
return nil, fmt.Errorf("bloomberg login: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
pages := []string{
|
||||
"https://www.bloomberg.com/markets",
|
||||
"https://www.bloomberg.com/technology",
|
||||
"https://www.bloomberg.com/economics",
|
||||
}
|
||||
for _, u := range pages {
|
||||
pageArticles, err := b.scrapePage(timeoutCtx, u, symbols)
|
||||
if err != nil {
|
||||
fmt.Printf("bloomberg scrape %s: %v\n", u, err)
|
||||
continue
|
||||
}
|
||||
articles = append(articles, pageArticles...)
|
||||
}
|
||||
fmt.Printf("bloomberg: %d articles fetched total\n", len(articles))
|
||||
return articles, nil
|
||||
// scraperRequest is the JSON body sent to the scraper service. It carries the
// credentials under the keys "username" and "password".
type scraperRequest struct {
	Username string `json:"username"`
	Password string `json:"password"`
}
|
||||
|
||||
func (b *Bloomberg) login(ctx context.Context) error {
|
||||
loginCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
// Masquer la détection d'automation via JS
|
||||
if err := chromedp.Run(loginCtx,
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
return chromedp.Evaluate(`
|
||||
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
|
||||
window.chrome = { runtime: {} };
|
||||
`, nil).Do(ctx)
|
||||
}),
|
||||
); err != nil {
|
||||
fmt.Printf("bloomberg: could not inject stealth JS: %v\n", err)
|
||||
}
|
||||
|
||||
err := chromedp.Run(loginCtx,
|
||||
chromedp.Navigate("https://www.bloomberg.com/account/signin"),
|
||||
chromedp.Sleep(2*time.Second),
|
||||
// Essayer plusieurs sélecteurs pour l'email
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
selectors := []string{
|
||||
`input[name="email"]`,
|
||||
`input[type="email"]`,
|
||||
`input[data-type="email"]`,
|
||||
`input[placeholder*="email" i]`,
|
||||
`input[placeholder*="mail" i]`,
|
||||
}
|
||||
for _, sel := range selectors {
|
||||
var count int
|
||||
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
||||
fmt.Printf("bloomberg: using email selector: %s\n", sel)
|
||||
return chromedp.SendKeys(sel, b.username, chromedp.ByQuery).Do(ctx)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("could not find email input — Bloomberg login page structure may have changed")
|
||||
}),
|
||||
chromedp.Sleep(500*time.Millisecond),
|
||||
// Submit email
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
selectors := []string{`button[type="submit"]`, `input[type="submit"]`, `button[data-testid*="submit"]`}
|
||||
for _, sel := range selectors {
|
||||
var count int
|
||||
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
||||
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
|
||||
}
|
||||
}
|
||||
// Fallback: press Enter
|
||||
return chromedp.KeyEvent("\r").Do(ctx)
|
||||
}),
|
||||
chromedp.Sleep(2*time.Second),
|
||||
// Password
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
selectors := []string{`input[type="password"]`, `input[name="password"]`}
|
||||
for _, sel := range selectors {
|
||||
var count int
|
||||
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
||||
fmt.Printf("bloomberg: using password selector: %s\n", sel)
|
||||
return chromedp.SendKeys(sel, b.password, chromedp.ByQuery).Do(ctx)
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("could not find password input")
|
||||
}),
|
||||
chromedp.Sleep(500*time.Millisecond),
|
||||
chromedp.ActionFunc(func(ctx context.Context) error {
|
||||
selectors := []string{`button[type="submit"]`, `input[type="submit"]`}
|
||||
for _, sel := range selectors {
|
||||
var count int
|
||||
if err := chromedp.Evaluate(fmt.Sprintf(`document.querySelectorAll('%s').length`, sel), &count).Do(ctx); err == nil && count > 0 {
|
||||
return chromedp.Click(sel, chromedp.ByQuery).Do(ctx)
|
||||
}
|
||||
}
|
||||
return chromedp.KeyEvent("\r").Do(ctx)
|
||||
}),
|
||||
chromedp.Sleep(3*time.Second),
|
||||
)
|
||||
return err
|
||||
// scraperArticle is one headline entry decoded from the scraper service's
// JSON, read from the keys "title" and "url".
type scraperArticle struct {
	Title string `json:"title"`
	URL   string `json:"url"`
}
|
||||
|
||||
func (b *Bloomberg) scrapePage(ctx context.Context, pageURL string, symbols []string) ([]scraper.Article, error) {
|
||||
pageCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
defer cancel()
|
||||
type scraperResponse struct {
|
||||
Articles []scraperArticle `json:"articles"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
var articleNodes []map[string]string
|
||||
err := chromedp.Run(pageCtx,
|
||||
chromedp.Navigate(pageURL),
|
||||
chromedp.Sleep(3*time.Second),
|
||||
chromedp.Evaluate(`
|
||||
(function() {
|
||||
var items = [];
|
||||
var seen = new Set();
|
||||
var links = document.querySelectorAll('a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]');
|
||||
links.forEach(function(a) {
|
||||
if (seen.has(a.href)) return;
|
||||
seen.add(a.href);
|
||||
var title = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]');
|
||||
var text = title ? title.innerText.trim() : a.innerText.trim();
|
||||
if (text.length > 20 && a.href.includes('bloomberg.com')) {
|
||||
items.push({title: text, url: a.href});
|
||||
}
|
||||
});
|
||||
return items.slice(0, 25);
|
||||
})()
|
||||
`, &articleNodes),
|
||||
)
|
||||
func (b *Bloomberg) ScrapeWithCredentials(ctx context.Context, username, password string, symbols []string) ([]scraper.Article, error) {
|
||||
payload, _ := json.Marshal(scraperRequest{Username: username, Password: password})
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, b.scraperURL+"/bloomberg/scrape", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("navigate %s: %w", pageURL, err)
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := b.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("scraper service unreachable: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("scraper service HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
var result scraperResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, fmt.Errorf("parse scraper response: %w", err)
|
||||
}
|
||||
if result.Error != "" {
|
||||
return nil, fmt.Errorf("bloomberg: %s", result.Error)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
now := time.Now()
|
||||
for _, node := range articleNodes {
|
||||
title := strings.TrimSpace(node["title"])
|
||||
url := node["url"]
|
||||
if title == "" || url == "" || !strings.Contains(url, "bloomberg.com") {
|
||||
var articles []scraper.Article
|
||||
for _, a := range result.Articles {
|
||||
title := strings.TrimSpace(a.Title)
|
||||
url := a.URL
|
||||
if title == "" || url == "" {
|
||||
continue
|
||||
}
|
||||
syms := scraper.DetectSymbols(title, symbols)
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: title, // contenu minimal — l'article complet nécessite un accès payant
|
||||
Content: title,
|
||||
URL: url,
|
||||
PublishedAt: &now,
|
||||
Symbols: syms,
|
||||
})
|
||||
}
|
||||
fmt.Printf("bloomberg: %d articles fetched\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
@ -9,21 +9,19 @@ import (
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// DynamicBloomberg charge les credentials depuis la DB avant chaque scraping
|
||||
type DynamicBloomberg struct {
|
||||
repo *models.Repository
|
||||
enc *crypto.Encryptor
|
||||
chromePath string
|
||||
scraperURL string
|
||||
}
|
||||
|
||||
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, chromePath string) *DynamicBloomberg {
|
||||
return &DynamicBloomberg{repo: repo, enc: enc, chromePath: chromePath}
|
||||
func NewDynamic(repo *models.Repository, enc *crypto.Encryptor, scraperURL string) *DynamicBloomberg {
|
||||
return &DynamicBloomberg{repo: repo, enc: enc, scraperURL: scraperURL}
|
||||
}
|
||||
|
||||
// Name returns the identifier of this scraper, "bloomberg".
func (d *DynamicBloomberg) Name() string {
	return "bloomberg"
}
|
||||
|
||||
func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scraper.Article, error) {
|
||||
// Récupérer la source Bloomberg
|
||||
source, err := d.repo.GetSourceByType("bloomberg")
|
||||
if err != nil || source == nil {
|
||||
return nil, fmt.Errorf("bloomberg source not found")
|
||||
@ -34,7 +32,7 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
|
||||
return nil, fmt.Errorf("get bloomberg credentials: %w", err)
|
||||
}
|
||||
if cred == nil || cred.Username == "" {
|
||||
return nil, fmt.Errorf("bloomberg credentials not configured — please set them in the admin panel")
|
||||
return nil, fmt.Errorf("bloomberg credentials not configured — configure them in the admin panel")
|
||||
}
|
||||
|
||||
password := ""
|
||||
@ -45,6 +43,6 @@ func (d *DynamicBloomberg) Scrape(ctx context.Context, symbols []string) ([]scra
|
||||
}
|
||||
}
|
||||
|
||||
b := New(cred.Username, password, d.chromePath)
|
||||
return b.Scrape(ctx, symbols)
|
||||
b := New(d.scraperURL)
|
||||
return b.ScrapeWithCredentials(ctx, cred.Username, password, symbols)
|
||||
}
|
||||
|
||||
129
backend/internal/scraper/reuters/reuters.go
Normal file
129
backend/internal/scraper/reuters/reuters.go
Normal file
@ -0,0 +1,129 @@
|
||||
package reuters
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// feeds lists the RSS endpoints read by this package. Reuters' own RSS is
// blocked by Cloudflare, so reliable public financial feeds are used instead
// (MarketWatch and CNBC).
var feeds = []struct {
	name string
	url  string
}{
	{name: "MarketWatch Top Stories", url: "https://feeds.content.dowjones.io/public/rss/mw_topstories"},
	{name: "MarketWatch Markets", url: "https://feeds.content.dowjones.io/public/rss/mw_marketpulse"},
	{name: "CNBC Top News", url: "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews"},
	{name: "CNBC Finance", url: "https://search.cnbc.com/rs/search/combinedcombined/rss/topNews?tag=Finance"},
}
|
||||
|
||||
// Reuters is the scraper registered under the name "reuters". It holds the
// HTTP client used for its feed requests.
type Reuters struct {
	client *http.Client
}

// New returns a Reuters scraper whose HTTP client has a 15-second timeout.
func New() *Reuters {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &Reuters{client: httpClient}
}

// Name returns the identifier of this scraper, "reuters".
func (r *Reuters) Name() string {
	return "reuters"
}
|
||||
|
||||
// rssItem is one <item> element of an RSS channel.
type rssItem struct {
	Title       string `xml:"title"`
	Link        string `xml:"link"`
	Description string `xml:"description"`
	PubDate     string `xml:"pubDate"`
}

// rssFeed is the subset of an RSS document decoded by this package: the
// <item> elements found under <channel>.
type rssFeed struct {
	Channel struct {
		Items []rssItem `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (r *Reuters) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for i, feed := range feeds {
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return articles, ctx.Err()
|
||||
case <-time.After(300 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
items, err := r.fetchFeed(ctx, feed.url)
|
||||
if err != nil {
|
||||
fmt.Printf("reuters/financial %s: %v\n", feed.name, err)
|
||||
continue
|
||||
}
|
||||
for _, a := range items {
|
||||
if !seen[a.URL] {
|
||||
seen[a.URL] = true
|
||||
articles = append(articles, a)
|
||||
}
|
||||
}
|
||||
fmt.Printf("reuters/financial %s: %d articles\n", feed.name, len(items))
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (r *Reuters) fetchFeed(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
req.Header.Set("Accept", "application/rss+xml, application/xml, text/xml")
|
||||
|
||||
resp, err := r.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
content := strings.TrimSpace(item.Description)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
@ -0,0 +1,200 @@
|
||||
package watcherguru
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// baseURL is the root of the Watcher Guru website; request URLs in this
// package are built from it.
const baseURL = "https://watcher.guru"

// WatcherGuru is the scraper registered under the name "watcherguru". It holds
// the HTTP client used for its requests.
type WatcherGuru struct {
	client *http.Client
}

// New returns a WatcherGuru scraper whose HTTP client has a 15-second timeout.
func New() *WatcherGuru {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &WatcherGuru{client: httpClient}
}

// Name returns the identifier of this scraper, "watcherguru".
func (w *WatcherGuru) Name() string {
	return "watcherguru"
}
|
||||
|
||||
// feedItem is one <item> element of an RSS channel.
type feedItem struct {
	Title   string `xml:"title"`
	Link    string `xml:"link"`
	PubDate string `xml:"pubDate"`
	Desc    string `xml:"description"`
}

// rssFeed is the subset of an RSS document decoded by this package: the
// <item> elements found under <channel>.
type rssFeed struct {
	Channel struct {
		Items []feedItem `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
// Try RSS feeds first
|
||||
for _, feedURL := range []string{
|
||||
baseURL + "/feed/",
|
||||
baseURL + "/news/feed/",
|
||||
} {
|
||||
articles, err := w.fetchRSS(ctx, feedURL)
|
||||
if err == nil && len(articles) > 0 {
|
||||
fmt.Printf("watcherguru rss: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: HTML scraping
|
||||
articles, err := w.scrapeHTML(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("watcherguru: %w", err)
|
||||
}
|
||||
fmt.Printf("watcherguru html: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
content := strings.TrimSpace(item.Desc)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse HTML: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
now := time.Now()
|
||||
|
||||
var walk func(*html.Node)
|
||||
walk = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "article") {
|
||||
if n.Data == "a" {
|
||||
href := attrVal(n, "href")
|
||||
if href == "" || seen[href] {
|
||||
walk(n.FirstChild)
|
||||
return
|
||||
}
|
||||
// Collect links that look like news articles
|
||||
if strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru") {
|
||||
text := strings.TrimSpace(nodeText(n))
|
||||
if len(text) > 20 {
|
||||
url := href
|
||||
if !strings.HasPrefix(url, "http") {
|
||||
url = baseURL + url
|
||||
}
|
||||
if !seen[url] {
|
||||
seen[url] = true
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: text,
|
||||
Content: text,
|
||||
URL: url,
|
||||
PublishedAt: &now,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walk(c)
|
||||
}
|
||||
}
|
||||
walk(doc)
|
||||
|
||||
if len(articles) > 40 {
|
||||
articles = articles[:40]
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func attrVal(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func nodeText(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
}
|
||||
var sb strings.Builder
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
sb.WriteString(nodeText(c))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
@ -86,8 +86,13 @@ func (y *YahooFinance) fetchSymbol(ctx context.Context, symbol string) ([]scrape
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
const maxPerSymbol = 5
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
if len(articles) >= maxPerSymbol {
|
||||
break
|
||||
}
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
|
||||
Reference in New Issue
Block a user