feat: add sources to retrieve news and split the AI reflections into 2 steps to limit the number of news items
This commit is contained in:
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
200
backend/internal/scraper/watcherguru/watcherguru.go
Normal file
@ -0,0 +1,200 @@
|
||||
package watcherguru
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"github.com/tradarr/backend/internal/scraper"
|
||||
)
|
||||
|
||||
// baseURL is the root address of the watcher.guru site; feed and page
// paths are appended to it.
const baseURL = "https://watcher.guru"
|
||||
|
||||
// WatcherGuru scrapes news articles from watcher.guru via its RSS feeds,
// falling back to HTML scraping of the news listing page.
type WatcherGuru struct {
	client *http.Client // shared HTTP client carrying the request timeout
}
|
||||
|
||||
func New() *WatcherGuru {
|
||||
return &WatcherGuru{client: &http.Client{Timeout: 15 * time.Second}}
|
||||
}
|
||||
|
||||
// Name identifies this scraper source.
func (w *WatcherGuru) Name() string { return "watcherguru" }
|
||||
|
||||
// rssFeed mirrors the subset of the RSS 2.0 schema this scraper reads:
// the channel's items with their title, link, publication date, and
// description.
type rssFeed struct {
	Channel struct {
		Items []struct {
			Title   string `xml:"title"`
			Link    string `xml:"link"`
			PubDate string `xml:"pubDate"`
			Desc    string `xml:"description"`
		} `xml:"item"`
	} `xml:"channel"`
}
|
||||
|
||||
func (w *WatcherGuru) Scrape(ctx context.Context, _ []string) ([]scraper.Article, error) {
|
||||
// Try RSS feeds first
|
||||
for _, feedURL := range []string{
|
||||
baseURL + "/feed/",
|
||||
baseURL + "/news/feed/",
|
||||
} {
|
||||
articles, err := w.fetchRSS(ctx, feedURL)
|
||||
if err == nil && len(articles) > 0 {
|
||||
fmt.Printf("watcherguru rss: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: HTML scraping
|
||||
articles, err := w.scrapeHTML(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("watcherguru: %w", err)
|
||||
}
|
||||
fmt.Printf("watcherguru html: %d articles\n", len(articles))
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) fetchRSS(ctx context.Context, feedURL string) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; Tradarr/1.0)")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var feed rssFeed
|
||||
if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
|
||||
return nil, fmt.Errorf("parse RSS: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
for _, item := range feed.Channel.Items {
|
||||
title := strings.TrimSpace(item.Title)
|
||||
link := strings.TrimSpace(item.Link)
|
||||
if title == "" || link == "" {
|
||||
continue
|
||||
}
|
||||
var publishedAt *time.Time
|
||||
for _, f := range []string{time.RFC1123Z, time.RFC1123, "Mon, 02 Jan 2006 15:04:05 -0700"} {
|
||||
if t, err := time.Parse(f, item.PubDate); err == nil {
|
||||
publishedAt = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
content := strings.TrimSpace(item.Desc)
|
||||
if content == "" {
|
||||
content = title
|
||||
}
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: title,
|
||||
Content: content,
|
||||
URL: link,
|
||||
PublishedAt: publishedAt,
|
||||
})
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func (w *WatcherGuru) scrapeHTML(ctx context.Context) ([]scraper.Article, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/news/", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml")
|
||||
|
||||
resp, err := w.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, body)
|
||||
}
|
||||
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse HTML: %w", err)
|
||||
}
|
||||
|
||||
var articles []scraper.Article
|
||||
seen := make(map[string]bool)
|
||||
now := time.Now()
|
||||
|
||||
var walk func(*html.Node)
|
||||
walk = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && (n.Data == "a" || n.Data == "article") {
|
||||
if n.Data == "a" {
|
||||
href := attrVal(n, "href")
|
||||
if href == "" || seen[href] {
|
||||
walk(n.FirstChild)
|
||||
return
|
||||
}
|
||||
// Collect links that look like news articles
|
||||
if strings.Contains(href, "/news/") || strings.Contains(href, "watcher.guru") {
|
||||
text := strings.TrimSpace(nodeText(n))
|
||||
if len(text) > 20 {
|
||||
url := href
|
||||
if !strings.HasPrefix(url, "http") {
|
||||
url = baseURL + url
|
||||
}
|
||||
if !seen[url] {
|
||||
seen[url] = true
|
||||
articles = append(articles, scraper.Article{
|
||||
Title: text,
|
||||
Content: text,
|
||||
URL: url,
|
||||
PublishedAt: &now,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walk(c)
|
||||
}
|
||||
}
|
||||
walk(doc)
|
||||
|
||||
if len(articles) > 40 {
|
||||
articles = articles[:40]
|
||||
}
|
||||
return articles, nil
|
||||
}
|
||||
|
||||
func attrVal(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func nodeText(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
}
|
||||
var sb strings.Builder
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
sb.WriteString(nodeText(c))
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
Reference in New Issue
Block a user