package watcherguru

import (
	"context"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"

	"github.com/tradarr/backend/internal/scraper"
)

// baseURL is the root of the watcher.guru site.
const baseURL = "https://watcher.guru"

// WatcherGuru scrapes news from watcher.guru, preferring its RSS feeds
// and falling back to HTML scraping of the /news/ index page.
type WatcherGuru struct {
	client *http.Client
}

// New returns a WatcherGuru scraper with a 15-second HTTP timeout.
func New() *WatcherGuru {
	return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
}

// Name identifies this scraper source.
func (w *WatcherGuru) Name() string { return "watcherguru" }

// rssFeed mirrors the subset of the RSS 2.0 schema this scraper consumes.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title   string `xml:"title"`
			Link    string `xml:"link"`
			PubDate string `xml:"pubDate"`
			Desc    string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}

// Scrape fetches articles from watcher.guru. It tries the site's RSS feeds
// first (structured titles, links and dates) and falls back to scraping the
// HTML news index if neither feed yields results. The symbols argument is
// unused; this source is not filterable by symbol.
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
	// Try RSS feeds first.
	for _, feedURL := range []string{
		baseURL + "/feed/",
		baseURL + "/news/feed/",
	} {
		articles, err := w.fetchRSS(ctx, feedURL)
		if err == nil && len(articles) > 0 {
			fmt.Printf("watcherguru rss: %d articles\n", len(articles))
			return articles, nil
		}
	}

	// Fallback: HTML scraping.
	articles, err := w.scrapeHTML(ctx)
	if err != nil {
		return nil, fmt.Errorf("watcherguru: %w", err)
	}
	fmt.Printf("watcherguru html: %d articles\n", len(articles))
	return articles, nil
}

// fetchRSS downloads and parses one RSS feed into articles. Items missing a
// title or link are skipped; an unparsable pubDate leaves PublishedAt nil.
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")

	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	var feed rssFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return nil, fmt.Errorf("parse RSS: %w", err)
	}

	var articles []scraper.Article
	for _, item := range feed.Channel.Items {
		title := strings.TrimSpace(item.Title)
		link := strings.TrimSpace(item.Link)
		if title == "" || link == "" {
			continue
		}

		// RFC1123Z is exactly "Mon, 02 Jan 2006 15:04:05 -0700", so the
		// original third (identical) layout was redundant and is dropped.
		// RFC1123 covers feeds that use a zone name instead of an offset.
		var publishedAt *time.Time
		pubDate := strings.TrimSpace(item.PubDate)
		for _, layout := range []string{time.RFC1123Z, time.RFC1123} {
			if t, err := time.Parse(layout, pubDate); err == nil {
				publishedAt = &t
				break
			}
		}

		content := strings.TrimSpace(item.Desc)
		if content == "" {
			content = title
		}
		articles = append(articles, scraper.Article{
			Title:       title,
			Content:     content,
			URL:         link,
			PublishedAt: publishedAt,
		})
	}
	return articles, nil
}

// scrapeHTML is the RSS fallback: it fetches the news index page and
// collects anchor links that look like article URLs. Article timestamps
// are unavailable in this mode, so PublishedAt is set to scrape time.
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml")

	resp, err := w.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// Include a short body excerpt to aid debugging of block pages.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse HTML: %w", err)
	}

	var articles []scraper.Article
	seen := make(map[string]bool)
	now := time.Now()

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		// Guard against nil: the original recursed into a possibly-nil
		// FirstChild of childless anchors, which would panic, and its
		// early return also skipped that child's siblings entirely.
		if n == nil {
			return
		}
		if n.Type == html.ElementNode && n.Data == "a" {
			href := attrVal(n, "href")
			// Collect links that look like news articles.
			if href != "" && (strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru")) {
				text := strings.TrimSpace(nodeText(n))
				if len(text) > 20 {
					// Normalize before the dedupe check so relative and
					// absolute forms of the same link share one key.
					url := href
					if !strings.HasPrefix(url, "http") {
						url = baseURL + url
					}
					if !seen[url] {
						seen[url] = true
						articles = append(articles, scraper.Article{
							Title:       text,
							Content:     text,
							URL:         url,
							PublishedAt: &now,
						})
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Cap the fallback result set to keep downstream processing bounded.
	if len(articles) > 40 {
		articles = articles[:40]
	}
	return articles, nil
}

// attrVal returns the value of the named attribute on n, or "" if absent.
func attrVal(n *html.Node, key string) string {
	for _, a := range n.Attr {
		if a.Key == key {
			return a.Val
		}
	}
	return ""
}

// nodeText concatenates every text node in the subtree rooted at n.
func nodeText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}
	var sb strings.Builder
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		sb.WriteString(nodeText(c))
	}
	return sb.String()
}