// Package watcherguru implements a news scraper for watcher.guru.
package watcherguru
|
|
|
|
import (
|
|
"context"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/tradarr/backend/internal/scraper"
|
|
)
|
|
|
|
// baseURL is the root of the Watcher Guru site; feed URLs and relative
// article links are resolved against it.
const baseURL = "https://watcher.guru"
|
|
|
|
// WatcherGuru scrapes news from watcher.guru, preferring the site's RSS
// feeds and falling back to parsing the HTML news listing.
type WatcherGuru struct {
	// client is reused for all requests; New configures its timeout.
	client *http.Client
}
|
|
|
|
func New() *WatcherGuru {
|
|
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
|
|
}
|
|
|
|
// Name reports the identifier this scraper registers under.
func (w *WatcherGuru) Name() string {
	const sourceName = "watcherguru"
	return sourceName
}
|
|
|
|
// rssFeed mirrors the subset of an RSS 2.0 document that fetchRSS needs:
// the channel's items with their title, link, publication date, and
// description. All other feed elements are ignored by the decoder.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title string `xml:"title"`
			Link string `xml:"link"`
			// PubDate is kept as a raw string; fetchRSS tries several
			// layouts when parsing it.
			PubDate string `xml:"pubDate"`
			Desc string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
|
|
|
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
|
// Try RSS feeds first
|
|
for _, feedURL := range []string{
|
|
baseURL + "/feed/",
|
|
baseURL + "/news/feed/",
|
|
} {
|
|
articles, err := w.fetchRSS(ctx, feedURL)
|
|
if err == nil && len(articles) > 0 {
|
|
fmt.Printf("watcherguru rss: %d articles\n", len(articles))
|
|
return articles, nil
|
|
}
|
|
}
|
|
|
|
// Fallback: HTML scraping
|
|
articles, err := w.scrapeHTML(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("watcherguru: %w", err)
|
|
}
|
|
fmt.Printf("watcherguru html: %d articles\n", len(articles))
|
|
return articles, nil
|
|
}
|
|
|
|
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
|
|
|
resp, err := w.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
var feed rssFeed
|
|
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
|
return nil, fmt.Errorf("parse RSS: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
for _, item := range feed.Channel.Items {
|
|
title := strings.TrimSpace(item.Title)
|
|
link := strings.TrimSpace(item.Link)
|
|
if title == "" || link == "" {
|
|
continue
|
|
}
|
|
var publishedAt *time.Time
|
|
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
|
if t, err := time.Parse(f, item.PubDate); err == nil {
|
|
publishedAt = &t
|
|
break
|
|
}
|
|
}
|
|
content := strings.TrimSpace(item.Desc)
|
|
if content == "" {
|
|
content = title
|
|
}
|
|
articles = append(articles, scraper.Article{
|
|
Title: title,
|
|
Content: content,
|
|
URL: link,
|
|
PublishedAt: publishedAt,
|
|
})
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
|
|
|
resp, err := w.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
|
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
|
}
|
|
|
|
doc, err := html.Parse(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse HTML: %w", err)
|
|
}
|
|
|
|
var articles []scraper.Article
|
|
seen := make(map[string]bool)
|
|
now := time.Now()
|
|
|
|
var walk func(*html.Node)
|
|
walk = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "article") {
|
|
if n.Data == "a" {
|
|
href := attrVal(n, "href")
|
|
if href == "" || seen[href] {
|
|
walk(n.FirstChild)
|
|
return
|
|
}
|
|
// Collect links that look like news articles
|
|
if strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru") {
|
|
text := strings.TrimSpace(nodeText(n))
|
|
if len(text) > 20 {
|
|
url := href
|
|
if !strings.HasPrefix(url, "http") {
|
|
url = baseURL + url
|
|
}
|
|
if !seen[url] {
|
|
seen[url] = true
|
|
articles = append(articles, scraper.Article{
|
|
Title: text,
|
|
Content: text,
|
|
URL: url,
|
|
PublishedAt: &now,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(doc)
|
|
|
|
if len(articles) > 40 {
|
|
articles = articles[:40]
|
|
}
|
|
return articles, nil
|
|
}
|
|
|
|
func attrVal(n *html.Node, key string) string {
|
|
for _, a := range n.Attr {
|
|
if a.Key == key {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func nodeText(n *html.Node) string {
|
|
if n.Type == html.TextNode {
|
|
return n.Data
|
|
}
|
|
var sb strings.Builder
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
sb.WriteString(nodeText(c))
|
|
}
|
|
return sb.String()
|
|
}
|