From 788f0290d5afd992919c804a2711b52bc8858487 Mon Sep 17 00:00:00 2001 From: Blomios Date: Fri, 1 May 2026 00:33:27 +0200 Subject: [PATCH] fix: fix bloomberg scraping using byparr --- docker-compose.prod.yml | 10 ++ docker-compose.yml | 10 ++ scraper-service/index.js | 194 ++++++++++++++++++++++++----------- scraper-service/package.json | 1 + 4 files changed, 157 insertions(+), 58 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 2dfc9f3..44b9cd7 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -14,9 +14,19 @@ services: timeout: 5s retries: 5 + byparr: + image: ghcr.io/thephaseless/byparr:latest + restart: unless-stopped + expose: + - "8191" + scraper: image: gitea.anthonybouteiller.ovh/blomios/tradarr-scraper:v1.0.0 restart: unless-stopped + depends_on: + - byparr + environment: + BYPARR_URL: "http://byparr:8191" expose: - "3001" diff --git a/docker-compose.yml b/docker-compose.yml index 6d32ccf..b01f768 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,11 +14,21 @@ services: timeout: 5s retries: 5 + byparr: + image: ghcr.io/thephaseless/byparr:latest + restart: unless-stopped + expose: + - "8191" + scraper: build: context: ./scraper-service dockerfile: Dockerfile restart: unless-stopped + depends_on: + - byparr + environment: + BYPARR_URL: "http://byparr:8191" expose: - "3001" diff --git a/scraper-service/index.js b/scraper-service/index.js index ca048c8..8747a94 100644 --- a/scraper-service/index.js +++ b/scraper-service/index.js @@ -1,4 +1,5 @@ const express = require('express') +const cheerio = require('cheerio') const puppeteer = require('puppeteer-extra') const StealthPlugin = require('puppeteer-extra-plugin-stealth') @@ -9,6 +10,93 @@ app.use(express.json()) const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium' const PORT = process.env.PORT || 3001 +const BYPARR_URL = process.env.BYPARR_URL || '' + +// ---------- Byparr helpers ---------- + +async function byparrRequest(body) { + const resp = await fetch(`${BYPARR_URL}/v1`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }) + if (!resp.ok) throw new Error(`byparr HTTP ${resp.status}: ${await resp.text()}`) + return resp.json() +} + +async function byparrFetch(url) { + console.log('[byparr] GET', url) + const data = await byparrRequest({ cmd: 'request.get', url, maxTimeout: 60000 }) + const html = data.solution?.response + if (!html) throw new Error(`byparr returned no HTML for ${url}`) + return { html } +} + +// ---------- Article parsing ---------- + +function parseArticlesFromHtml(html) { + const $ = cheerio.load(html) + const results = [] + const seen = new Set() + + $('a[data-component="story-link"]').each((_, el) => { + const a = $(el) + const href = a.attr('href') || '' + const absolute = href.startsWith('http') ? href : `https://www.bloomberg.com${href}` + const clean = absolute.split('?')[0] + + if (seen.has(clean)) return + if (!/\/(news\/articles|news\/features|features)\//.test(clean)) return + + const title = a.find('[data-testid="headline"] span').text().trim() + if (!title || title.length < 10) return + + seen.add(clean) + results.push({ title, url: clean }) + }) + + return results +} + +// ---------- Bloomberg scraping via Byparr ---------- + +async function scrapeBloombergViaByparr(username, password) { + // /markets main page is excluded: live ticker widgets cause continuous network activity + // so Byparr's networkidle wait never resolves. Sub-sections below are editorial pages + // without live data — they behave like /technology and /economics (~5s each). + const byparrPages = [ + 'https://www.bloomberg.com/markets/stocks', + 'https://www.bloomberg.com/markets/currencies', + 'https://www.bloomberg.com/markets/rates-bonds', + 'https://www.bloomberg.com/markets/commodities', + 'https://www.bloomberg.com/technology', + 'https://www.bloomberg.com/economics', + 'https://www.bloomberg.com/politics', + ] + + const articles = [] + const seen = new Set() + + for (const url of byparrPages) { + try { + const { html } = await byparrFetch(url) + const items = parseArticlesFromHtml(html) + for (const item of items) { + if (!seen.has(item.url)) { + seen.add(item.url) + articles.push(item) + } + } + console.log('[bloomberg]', url, '->', items.length, 'articles') + } catch (e) { + console.error('[bloomberg] error on', url, ':', e.message) + } + } + + return articles +} + +// ---------- Bloomberg scraping via Puppeteer (fallback, may be blocked) ---------- function launchBrowser() { return puppeteer.launch({ @@ -47,21 +135,13 @@ async function tryType(page, selectors, text) { return false } -app.get('/health', (_, res) => res.json({ ok: true })) - -app.post('/bloomberg/scrape', async (req, res) => { - const { username, password } = req.body || {} - if (!username || !password) { - return res.status(400).json({ error: 'username and password required' }) - } - +async function scrapeBloombergViaPuppeteer(username, password) { let browser try { browser = await launchBrowser() const page = await browser.newPage() await page.setViewport({ width: 1920, height: 1080 }) - // Hide automation signals await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) window.chrome = { runtime: {} } @@ -75,35 +155,28 @@ app.post('/bloomberg/scrape', async (req, res) => { }) await new Promise(r => setTimeout(r, 2000)) - // Debug: état de la page avant de chercher l'email const pageInputs = await page.evaluate(() => Array.from(document.querySelectorAll('input')).map(i => ({ type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null })) ) console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs)) - const pageTitle = await page.title() - console.log('[bloomberg] page title:', pageTitle) + console.log('[bloomberg] page title:', await page.title()) - console.log('[bloomberg] entering email') - const emailSelectors = [ + const emailOk = await tryType(page, [ '#email-form-input', 'input[id="email-form-input"]', 'input[type="email"]', 'input[name="text-input"]', 'input[placeholder*="email" i]', - ] - const emailOk = await tryType(page, emailSelectors, username) + ], username) if (!emailOk) throw new Error('could not find email input') await new Promise(r => setTimeout(r, 800)) - // Click submit via JS pour contourner les boutons désactivés const submitted = await page.evaluate(() => { - const btns = Array.from(document.querySelectorAll('button')) - const btn = btns.find(b => - b.type === 'submit' || - /continue|next|sign.?in/i.test(b.textContent) + const btn = Array.from(document.querySelectorAll('button')).find(b => + b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent) ) if (btn) { btn.click(); return true } const form = document.querySelector('form') @@ -112,7 +185,6 @@ app.post('/bloomberg/scrape', async (req, res) => { }) if (!submitted) await page.keyboard.press('Enter') - // Attendre que la page change (password input apparaît ou navigation) try { await page.waitForFunction( () => document.querySelector('input[type="password"]') !== null, @@ -123,31 +195,28 @@ app.post('/bloomberg/scrape', async (req, res) => { } console.log('[bloomberg] after email submit, url:', page.url()) - // Debug inputs disponibles - const allInputs = await page.evaluate(() => - Array.from(document.querySelectorAll('input')).map(i => ({ - type: i.type, name: i.name, id: i.id, placeholder: i.placeholder - })) - ) - console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs)) - - console.log('[bloomberg] entering password') const pwdOk = await tryType(page, [ 'input[type="password"]', 'input[name="password"]', 'input[autocomplete="current-password"]', 'input[autocomplete="password"]', ], password) - if (!pwdOk) throw new Error('could not find password input — check logs above for available inputs') + if (!pwdOk) throw new Error('could not find password input') await new Promise(r => setTimeout(r, 500)) - await tryClick(page, ['button[type="submit"]', 'input[type="submit"]']) - await new Promise(r => setTimeout(r, 3000)) + await Promise.all([ + page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 20000 }).catch(() => {}), + tryClick(page, ['button[type="submit"]', 'input[type="submit"]']), + ]) const currentURL = page.url() console.log('[bloomberg] after login, url:', currentURL) + if (currentURL.includes('/account/signin')) { + const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500)) + throw new Error(`login did not redirect — page content: ${bodyText}`) + } - const pages = [ + const targetPages = [ 'https://www.bloomberg.com/markets', 'https://www.bloomberg.com/technology', 'https://www.bloomberg.com/economics', @@ -156,32 +225,17 @@ app.post('/bloomberg/scrape', async (req, res) => { const articles = [] const seen = new Set() - for (const url of pages) { + for (const url of targetPages) { try { console.log('[bloomberg] scraping', url) - await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 }) - await new Promise(r => setTimeout(r, 2000)) + await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 }) + await page.waitForSelector('a[data-component="story-link"]', { timeout: 15000 }).catch(() => {}) - const items = await page.evaluate(() => { - const results = [] - const seen = new Set() - const links = document.querySelectorAll( - 'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]' - ) - links.forEach(a => { - if (seen.has(a.href)) return - seen.add(a.href) - const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]') - const text = titleEl ? titleEl.innerText.trim() : a.innerText.trim() - if (text.length > 20 && a.href.includes('bloomberg.com')) { - results.push({ title: text, url: a.href }) - } - }) - return results.slice(0, 25) - }) + const html = await page.content() + const items = parseArticlesFromHtml(html) for (const item of items) { - if (!seen.has(item.url) && item.title && item.url) { + if (!seen.has(item.url)) { seen.add(item.url) articles.push(item) } @@ -192,13 +246,37 @@ app.post('/bloomberg/scrape', async (req, res) => { } } + return articles + } finally { + if (browser) await browser.close() + } +} + +// ---------- Routes ---------- + +app.get('/health', (_, res) => res.json({ ok: true })) + +app.post('/bloomberg/scrape', async (req, res) => { + const { username, password } = req.body || {} + if (!username || !password) { + return res.status(400).json({ error: 'username and password required' }) + } + + try { + let articles + if (BYPARR_URL) { + console.log('[bloomberg] using Byparr (nodriver) to bypass bot detection') + articles = await scrapeBloombergViaByparr(username, password) + } else { + console.log('[bloomberg] using Puppeteer (Byparr not configured)') + articles = await scrapeBloombergViaPuppeteer(username, password) + } + console.log('[bloomberg] total:', articles.length, 'articles') res.json({ articles }) } catch (e) { console.error('[bloomberg] scrape error:', e.message) res.status(500).json({ error: e.message }) - } finally { - if (browser) await browser.close() } }) diff --git a/scraper-service/package.json b/scraper-service/package.json index 42602d8..401ac4b 100644 --- a/scraper-service/package.json +++ b/scraper-service/package.json @@ -6,6 +6,7 @@ "start": "node index.js" }, "dependencies": { + "cheerio": "^1.0.0", "express": "^4.19.2", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-stealth": "^2.11.2",