feat: add news sources and split the AI reasoning into 2 steps to limit the number of news items

This commit is contained in:
2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions

View File

@ -0,0 +1,200 @@
package watcherguru
import (
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"strings"
"time"
"golang.org/x/net/html"
"github.com/tradarr/backend/internal/scraper"
)
// baseURL is the root of the Watcher Guru site; feed and page URLs are
// built relative to it.
const baseURL = "https://watcher.guru"

// WatcherGuru scrapes news from watcher.guru, preferring its RSS feeds
// and falling back to parsing the news listing HTML.
type WatcherGuru struct {
	client *http.Client // shared HTTP client carrying the request timeout
}
// New returns a WatcherGuru scraper whose HTTP client enforces a
// 15-second per-request timeout.
func New() *WatcherGuru {
	httpClient := &http.Client{Timeout: 15 * time.Second}
	return &WatcherGuru{client: httpClient}
}
// Name returns the identifier under which this scraper is registered.
func (w *WatcherGuru) Name() string {
	return "watcherguru"
}
// rssFeed mirrors the subset of the RSS 2.0 schema this scraper reads:
// a <channel> holding a list of <item> entries.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title   string `xml:"title"`       // article headline
			Link    string `xml:"link"`        // canonical article URL
			PubDate string `xml:"pubDate"`     // RFC1123-style publication date
			Desc    string `xml:"description"` // summary/excerpt, may be empty
		} `xml:"item"`
	} `xml:"channel"`
}
// Scrape collects recent articles from watcher.guru. It tries the RSS
// feeds first (cheaper and better structured) and falls back to scraping
// the HTML news listing when no feed yields results. The topics argument
// is accepted for interface compatibility and ignored.
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	// Try RSS feeds first.
	for _, feedURL := range []string{
		baseURL + "/feed/",
		baseURL + "/news/feed/",
	} {
		// Feed failures are deliberately swallowed so we can fall back to
		// HTML — but a cancelled context must surface, not be retried.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		articles, err := w.fetchRSS(ctx, feedURL)
		if err == nil && len(articles) > 0 {
			fmt.Printf("watcherguru rss: %d articles\n", len(articles))
			return articles, nil
		}
	}
	// Fallback: HTML scraping.
	articles, err := w.scrapeHTML(ctx)
	if err != nil {
		return nil, fmt.Errorf("watcherguru: %w", err)
	}
	fmt.Printf("watcherguru html: %d articles\n", len(articles))
	return articles, nil
}
// fetchRSS downloads and parses one RSS feed, converting each <item>
// into a scraper.Article. Items without a title or link are skipped;
// items whose pubDate cannot be parsed get a nil PublishedAt.
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}
	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}
	articles := make([]scraper.Article, 0, len(feed.Channel.Items))
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}
		// WordPress-style feeds emit RFC1123 dates, with or without a
		// numeric zone. (The previous third layout was a byte-for-byte
		// duplicate of time.RFC1123Z and has been removed.)
		var publishedAt *time.Time
		for _, layout := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(layout, item.PubDate); err == nil {
				publishedAt = &t
				break
			}
		}
		content := strings.TrimSpace(item.Desc)
		if content == "" {
			content = title // description is optional; fall back to the headline
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}
// scrapeHTML fetches the /news/ listing page and extracts anchor tags
// that look like article links. Because the listing page carries no
// per-article timestamps, every article is stamped with the scrape time.
// At most 40 articles are returned.
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse HTML: %w", err)
	}
	var articles []scraper.Article
	seen := make(map[string]bool)
	now := time.Now()
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			href := attrVal(n, "href")
			// Collect links that look like news articles.
			if href != "" && (strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru")) {
				text := strings.TrimSpace(nodeText(n))
				// Short anchor text is navigation chrome, not a headline.
				if len(text) > 20 {
					url := href
					if !strings.HasPrefix(url, "http") {
						url = baseURL + url
					}
					// Dedup on the normalized absolute URL.
					if !seen[url] {
						seen[url] = true
						articles = append(articles, scraper.Article{
							Title:       text,
							Content:     text,
							URL:         url,
							PublishedAt: &now,
						})
					}
				}
			}
		}
		// Always descend through every child here. The previous version
		// special-cased empty/seen hrefs with walk(n.FirstChild), which
		// panicked on childless anchors (nil FirstChild) and silently
		// skipped the first child's siblings.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	if len(articles) > 40 {
		articles = articles[:40]
	}
	return articles, nil
}
// attrVal returns the value of the attribute named key on node n, or
// the empty string when no such attribute is present.
func attrVal(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key != key {
			continue
		}
		return attr.Val
	}
	return ""
}
// nodeText concatenates the text of every text node in n's subtree,
// in document order, with no separators inserted.
func nodeText(n *html.Node) string {
	var b strings.Builder
	var collect func(*html.Node)
	collect = func(node *html.Node) {
		if node.Type == html.TextNode {
			b.WriteString(node.Data)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			collect(child)
		}
	}
	collect(n)
	return b.String()
}