feat: add sources to retrieve news and split the AI reflections into 2 steps to limit the number of news items
This commit is contained in:
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
@ -0,0 +1,200 @@
|
||||
package watcherguru
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// baseURL is the root address of the watcher.guru site; feed and page
// paths are appended to it.
const baseURL = "https://watcher.guru"
|
||||
|
||||
// WatcherGuru scrapes news articles from watcher.guru via its RSS feeds,
// falling back to HTML scraping of the news listing page.
type WatcherGuru struct {
	client *http.Client // shared HTTP client carrying the request timeout
}
|
||||
|
||||
func New() *WatcherGuru {
|
||||
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
|
||||
}
|
||||
|
||||
// Name identifies this scraper source.
func (w *WatcherGuru) Name() string { return "watcherguru" }
|
||||
|
||||
// rssFeed mirrors the subset of the RSS 2.0 schema this scraper reads:
// the channel's items with their title, link, publication date, and
// description.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title   string `xml:"title"`
			Link    string `xml:"link"`
			PubDate string `xml:"pubDate"`
			Desc    string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
// Try RSS feeds first
|
||||
for _, feedURL := range []string{
|
||||
baseURL + "/feed/",
|
||||
baseURL + "/news/feed/",
|
||||
} {
|
||||
articles, err := w.fetchRSS(ctx, feedURL)
|
||||
if err == nil && len(articles) > 0 {
|
||||
fmt.Printf("watcherguru rss: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: HTML scraping
|
||||
articles, err := w.scrapeHTML(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("watcherguru: %w", err)
|
||||
}
|
||||
fmt.Printf("watcherguru html: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
content := strings.TrimSpace(item.Desc)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse HTML: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
now := time.Now()
|
||||
|
||||
var walk func(*html.Node)
|
||||
walk = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "article") {
|
||||
if n.Data == "a" {
|
||||
href := attrVal(n, "href")
|
||||
if href == "" || seen[href] {
|
||||
walk(n.FirstChild)
|
||||
return
|
||||
}
|
||||
// Collect links that look like news articles
|
||||
if strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru") {
|
||||
text := strings.TrimSpace(nodeText(n))
|
||||
if len(text) > 20 {
|
||||
url := href
|
||||
if !strings.HasPrefix(url, "http") {
|
||||
url = baseURL + url
|
||||
}
|
||||
if !seen[url] {
|
||||
seen[url] = true
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: text,
|
||||
Content: text,
|
||||
URL: url,
|
||||
PublishedAt: &now,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walk(c)
|
||||
}
|
||||
}
|
||||
walk(doc)
|
||||
|
||||
if len(articles) > 40 {
|
||||
articles = articles[:40]
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func attrVal(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func nodeText(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
}
|
||||
var sb strings.Builder
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
sb.WriteString(nodeText(c))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
Reference in New Issue
Block a user