// Package watcherguru implements a news scraper for watcher.guru.
package watcherguru
|
|
|
|
import (
|
|
"context"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/tradarr/backend/internal/scraper"
|
|
)
|
|
|
|
// baseURL is the root of the Watcher Guru site; feed URLs and relative
// article links are resolved against it.
const baseURL = "https://watcher.guru"
|
|
|
|
// WatcherGuru scrapes news from watcher.guru, preferring the site's RSS
// feeds and falling back to parsing the HTML news listing.
type WatcherGuru struct {
	// client is reused for all requests; New configures its timeout.
	client *http.Client
}
|
|
|
|
func New() *WatcherGuru {
|
|
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
|
|
}
|
|
|
|
// Name reports the identifier this scraper registers under.
func (w *WatcherGuru) Name() string {
	const sourceName = "watcherguru"
	return sourceName
}
|
|
|
|
// rssFeed mirrors the subset of an RSS 2.0 document that fetchRSS needs:
// the channel's items with their title, link, publication date, and
// description. All other feed elements are ignored by the decoder.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title string `xml:"title"`
			Link string `xml:"link"`
			// PubDate is kept as a raw string; fetchRSS tries several
			// layouts when parsing it.
			PubDate string `xml:"pubDate"`
			Desc string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
|
|
|
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
|
// Try RSS feeds first
|
|
for _, feedURL := range []string{
|
|
baseURL + "/feed/",
|
|
baseURL + "/news/feed/",
|
|
} {
|
|
articles, err := w.fetchRSS(ctx, feedURL)
|
|
if err == nil && len(articles) > 0 {
|
|
fmt.Printf("watcherguru rss: %d articles\n", len(articles))
|
|
return articles, nil
|
|
}
|
|
}
|
|
|
|
// Fallback: HTML scraping
|
|
articles, err := w.scrapeHTML(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("watcherguru: %w", err)
|
|
}
|
|
fmt.Printf("watcherguru html: %d articles\n", len(articles))
|
|
return articles, nil
|
|
}
|
|
|
|
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
|
|
|
resp, err := w.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
var feed rssFeed
|
|
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
|
return nil, fmt.Errorf("parse RSS: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
for _, item := range feed.Channel.Items {
|
|
title := strings.TrimSpace(item.Title)
|
|
link := strings.TrimSpace(item.Link)
|
|
if title == "" || link == "" {
|
|
continue
|
|
}
|
|
var publishedAt *time.Time
|
|
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
|
if t, err := time.Parse(f, item.PubDate); err == nil {
|
|
publishedAt = &t
|
|
break
|
|
}
|
|
}
|
|
content := strings.TrimSpace(item.Desc)
|
|
if content == "" {
|
|
content = title
|
|
}
|
|
articles = append(articles, scraper.Article{
|
|
Title: title,
|
|
Content: content,
|
|
URL: link,
|
|
PublishedAt: publishedAt,
|
|
})
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
|
|
|
resp, err := w.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
|
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
|
}
|
|
|
|
doc, err := html.Parse(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse HTML: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
seen := make(map[string]bool)
|
|
now := time.Now()
|
|
|
|
var walk func(*html.Node)
|
|
walk = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "article") {
|
|
if n.Data == "a" {
|
|
href := attrVal(n, "href")
|
|
if href == "" || seen[href] {
|
|
walk(n.FirstChild)
|
|
return
|
|
}
|
|
// Collect links that look like news articles
|
|
if strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru") {
|
|
text := strings.TrimSpace(nodeText(n))
|
|
if len(text) > 20 {
|
|
url := href
|
|
if !strings.HasPrefix(url, "http") {
|
|
url = baseURL + url
|
|
}
|
|
if !seen[url] {
|
|
seen[url] = true
|
|
articles = append(articles, scraper.Article{
|
|
Title: text,
|
|
Content: text,
|
|
URL: url,
|
|
PublishedAt: &now,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(doc)
|
|
|
|
if len(articles) > 40 {
|
|
articles = articles[:40]
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func attrVal(n *html.Node, key string) string {
|
|
for _, a := range n.Attr {
|
|
if a.Key == key {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func nodeText(n *html.Node) string {
|
|
if n.Type == html.TextNode {
|
|
return n.Data
|
|
}
|
|
var sb strings.Builder
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
sb.WriteString(nodeText(c))
|
|
}
|
|
return sb.String()
|
|
}
|