From 788f0290d5afd992919c804a2711b52bc8858487 Mon Sep 17 00:00:00 2001
From: Blomios <blomios@gmail.com>
Date: Fri, 1 May 2026 00:33:27 +0200
Subject: [PATCH] fix: restore Bloomberg scraping by routing it through Byparr

---
 docker-compose.prod.yml      |  10 ++
 docker-compose.yml           |  10 ++
 scraper-service/index.js     | 194 ++++++++++++++++++++++++-----------
 scraper-service/package.json |   1 +
 4 files changed, 157 insertions(+), 58 deletions(-)

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 2dfc9f3..44b9cd7 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -14,9 +14,19 @@ services:
       timeout: 5s
       retries: 5
 
+  byparr:  # Byparr service; the scraper reaches it through BYPARR_URL (http://byparr:8191)
+    image: ghcr.io/thephaseless/byparr:latest  # NOTE(review): floating tag in prod — consider pinning a release
+    restart: unless-stopped
+    expose:
+      - "8191"  # same port as in the scraper's BYPARR_URL; expose does not publish it on the host
+
   scraper:
     image: gitea.anthonybouteiller.ovh/blomios/tradarr-scraper:v1.0.0
     restart: unless-stopped
+    depends_on:
+      - byparr
+    environment:
+      BYPARR_URL: "http://byparr:8191"
     expose:
       - "3001"
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 6d32ccf..b01f768 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,11 +14,21 @@ services:
       timeout: 5s
       retries: 5
 
+  byparr:  # Byparr service; the scraper reaches it through BYPARR_URL (http://byparr:8191)
+    image: ghcr.io/thephaseless/byparr:latest  # NOTE(review): floating tag — builds are not reproducible
+    restart: unless-stopped
+    expose:
+      - "8191"  # same port as in the scraper's BYPARR_URL; expose does not publish it on the host
+
   scraper:
     build:
       context: ./scraper-service
       dockerfile: Dockerfile
     restart: unless-stopped
+    depends_on:
+      - byparr
+    environment:
+      BYPARR_URL: "http://byparr:8191"
     expose:
       - "3001"
 
diff --git a/scraper-service/index.js b/scraper-service/index.js
index ca048c8..8747a94 100644
--- a/scraper-service/index.js
+++ b/scraper-service/index.js
@@ -1,4 +1,5 @@
 const express = require('express')
+const cheerio = require('cheerio')
 const puppeteer = require('puppeteer-extra')
 const StealthPlugin = require('puppeteer-extra-plugin-stealth')
 
@@ -9,6 +10,93 @@ app.use(express.json())
 
 const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
 const PORT = process.env.PORT || 3001
+const BYPARR_URL = process.env.BYPARR_URL || ''
+
+// ---------- Byparr helpers ----------
+
+async function byparrRequest(body) {
+  const resp = await fetch(`${BYPARR_URL}/v1`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(body),
+  })
+  if (!resp.ok) throw new Error(`byparr HTTP ${resp.status}: ${await resp.text()}`)
+  return resp.json()
+}
+
+async function byparrFetch(url) {
+  console.log('[byparr] GET', url)
+  const data = await byparrRequest({ cmd: 'request.get', url, maxTimeout: 60000 })
+  const html = data.solution?.response
+  if (!html) throw new Error(`byparr returned no HTML for ${url}`)
+  return { html }
+}
+
+// ---------- Article parsing ----------
+
+function parseArticlesFromHtml(html) {
+  const $ = cheerio.load(html)
+  const results = []
+  const seen = new Set()
+
+  $('a[data-component="story-link"]').each((_, el) => {
+    const a = $(el)
+    const href = a.attr('href') || ''
+    const absolute = href.startsWith('http') ? href : `https://www.bloomberg.com${href}`
+    const clean = absolute.split('?')[0]
+
+    if (seen.has(clean)) return
+    if (!/\/(news\/articles|news\/features|features)\//.test(clean)) return
+
+    const title = a.find('[data-testid="headline"] span').text().trim()
+    if (!title || title.length < 10) return
+
+    seen.add(clean)
+    results.push({ title, url: clean })
+  })
+
+  return results
+}
+
+// ---------- Bloomberg scraping via Byparr ----------
+
+async function scrapeBloombergViaByparr(username, password) {
+  // /markets main page is excluded: live ticker widgets cause continuous network activity
+  // so Byparr's networkidle wait never resolves. Sub-sections below are editorial pages
+  // without live data — they behave like /technology and /economics (~5s each).
+  const byparrPages = [
+    'https://www.bloomberg.com/markets/stocks',
+    'https://www.bloomberg.com/markets/currencies',
+    'https://www.bloomberg.com/markets/rates-bonds',
+    'https://www.bloomberg.com/markets/commodities',
+    'https://www.bloomberg.com/technology',
+    'https://www.bloomberg.com/economics',
+    'https://www.bloomberg.com/politics',
+  ]
+
+  const articles = []
+  const seen = new Set()
+
+  for (const url of byparrPages) {
+    try {
+      const { html } = await byparrFetch(url)
+      const items = parseArticlesFromHtml(html)
+      for (const item of items) {
+        if (!seen.has(item.url)) {
+          seen.add(item.url)
+          articles.push(item)
+        }
+      }
+      console.log('[bloomberg]', url, '->', items.length, 'articles')
+    } catch (e) {
+      console.error('[bloomberg] error on', url, ':', e.message)
+    }
+  }
+
+  return articles
+}
+
+// ---------- Bloomberg scraping via Puppeteer (fallback, may be blocked) ----------
 
 function launchBrowser() {
   return puppeteer.launch({
@@ -47,21 +135,13 @@ async function tryType(page, selectors, text) {
   return false
 }
 
-app.get('/health', (_, res) => res.json({ ok: true }))
-
-app.post('/bloomberg/scrape', async (req, res) => {
-  const { username, password } = req.body || {}
-  if (!username || !password) {
-    return res.status(400).json({ error: 'username and password required' })
-  }
-
+async function scrapeBloombergViaPuppeteer(username, password) {
   let browser
   try {
     browser = await launchBrowser()
     const page = await browser.newPage()
     await page.setViewport({ width: 1920, height: 1080 })
 
-    // Hide automation signals
     await page.evaluateOnNewDocument(() => {
       Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
       window.chrome = { runtime: {} }
@@ -75,35 +155,28 @@ app.post('/bloomberg/scrape', async (req, res) => {
     })
     await new Promise(r => setTimeout(r, 2000))
 
-    // Debug: état de la page avant de chercher l'email
     const pageInputs = await page.evaluate(() =>
       Array.from(document.querySelectorAll('input')).map(i => ({
         type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null
       }))
     )
     console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
-    const pageTitle = await page.title()
-    console.log('[bloomberg] page title:', pageTitle)
+    console.log('[bloomberg] page title:', await page.title())
 
-    console.log('[bloomberg] entering email')
-    const emailSelectors = [
+    const emailOk = await tryType(page, [
       '#email-form-input',
       'input[id="email-form-input"]',
       'input[type="email"]',
       'input[name="text-input"]',
       'input[placeholder*="email" i]',
-    ]
-    const emailOk = await tryType(page, emailSelectors, username)
+    ], username)
     if (!emailOk) throw new Error('could not find email input')
 
     await new Promise(r => setTimeout(r, 800))
 
-    // Click submit via JS pour contourner les boutons désactivés
     const submitted = await page.evaluate(() => {
-      const btns = Array.from(document.querySelectorAll('button'))
-      const btn = btns.find(b =>
-        b.type === 'submit' ||
-        /continue|next|sign.?in/i.test(b.textContent)
+      const btn = Array.from(document.querySelectorAll('button')).find(b =>
+        b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent)
       )
       if (btn) { btn.click(); return true }
       const form = document.querySelector('form')
@@ -112,7 +185,6 @@ app.post('/bloomberg/scrape', async (req, res) => {
     })
     if (!submitted) await page.keyboard.press('Enter')
 
-    // Attendre que la page change (password input apparaît ou navigation)
     try {
       await page.waitForFunction(
         () => document.querySelector('input[type="password"]') !== null,
@@ -123,31 +195,28 @@ app.post('/bloomberg/scrape', async (req, res) => {
     }
     console.log('[bloomberg] after email submit, url:', page.url())
 
-    // Debug inputs disponibles
-    const allInputs = await page.evaluate(() =>
-      Array.from(document.querySelectorAll('input')).map(i => ({
-        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder
-      }))
-    )
-    console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs))
-
-    console.log('[bloomberg] entering password')
     const pwdOk = await tryType(page, [
       'input[type="password"]',
       'input[name="password"]',
       'input[autocomplete="current-password"]',
       'input[autocomplete="password"]',
     ], password)
-    if (!pwdOk) throw new Error('could not find password input — check logs above for available inputs')
+    if (!pwdOk) throw new Error('could not find password input')
 
     await new Promise(r => setTimeout(r, 500))
-    await tryClick(page, ['button[type="submit"]', 'input[type="submit"]'])
-    await new Promise(r => setTimeout(r, 3000))
+    await Promise.all([
+      page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 20000 }).catch(() => {}),
+      tryClick(page, ['button[type="submit"]', 'input[type="submit"]']),
+    ])
 
     const currentURL = page.url()
     console.log('[bloomberg] after login, url:', currentURL)
+    if (currentURL.includes('/account/signin')) {
+      const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500))
+      throw new Error(`login did not redirect — page content: ${bodyText}`)
+    }
 
-    const pages = [
+    const targetPages = [
       'https://www.bloomberg.com/markets',
       'https://www.bloomberg.com/technology',
       'https://www.bloomberg.com/economics',
@@ -156,32 +225,17 @@ app.post('/bloomberg/scrape', async (req, res) => {
     const articles = []
     const seen = new Set()
 
-    for (const url of pages) {
+    for (const url of targetPages) {
       try {
         console.log('[bloomberg] scraping', url)
-        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 })
-        await new Promise(r => setTimeout(r, 2000))
+        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
+        await page.waitForSelector('a[data-component="story-link"]', { timeout: 15000 }).catch(() => {})
 
-        const items = await page.evaluate(() => {
-          const results = []
-          const seen = new Set()
-          const links = document.querySelectorAll(
-            'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]'
-          )
-          links.forEach(a => {
-            if (seen.has(a.href)) return
-            seen.add(a.href)
-            const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]')
-            const text = titleEl ? titleEl.innerText.trim() : a.innerText.trim()
-            if (text.length > 20 && a.href.includes('bloomberg.com')) {
-              results.push({ title: text, url: a.href })
-            }
-          })
-          return results.slice(0, 25)
-        })
+        const html = await page.content()
+        const items = parseArticlesFromHtml(html)
 
         for (const item of items) {
-          if (!seen.has(item.url) && item.title && item.url) {
+          if (!seen.has(item.url)) {
             seen.add(item.url)
             articles.push(item)
           }
@@ -192,13 +246,37 @@ app.post('/bloomberg/scrape', async (req, res) => {
       }
     }
 
+    return articles
+  } finally {
+    if (browser) await browser.close()
+  }
+}
+
+// ---------- Routes ----------
+
+app.get('/health', (_, res) => res.json({ ok: true })) // health endpoint: always replies { ok: true }
+
+app.post('/bloomberg/scrape', async (req, res) => { // body { username, password } -> 200 { articles } | 400/500 { error }
+  const { username, password } = req.body || {}
+  if (!username || !password) {
+    return res.status(400).json({ error: 'username and password required' })
+  }
+
+  try {
+    let articles
+    if (BYPARR_URL) { // a non-empty BYPARR_URL env value selects the Byparr path
+      console.log('[bloomberg] using Byparr (nodriver) to bypass bot detection')
+      articles = await scrapeBloombergViaByparr(username, password)
+    } else { // otherwise the Puppeteer login-and-scrape path is used
+      console.log('[bloomberg] using Puppeteer (Byparr not configured)')
+      articles = await scrapeBloombergViaPuppeteer(username, password)
+    }
+
     console.log('[bloomberg] total:', articles.length, 'articles')
     res.json({ articles })
   } catch (e) {
     console.error('[bloomberg] scrape error:', e.message)
     res.status(500).json({ error: e.message })
-  } finally {
-    if (browser) await browser.close()
   }
 })
 
diff --git a/scraper-service/package.json b/scraper-service/package.json
index 42602d8..401ac4b 100644
--- a/scraper-service/package.json
+++ b/scraper-service/package.json
@@ -6,6 +6,7 @@
     "start": "node index.js"
   },
   "dependencies": {
+    "cheerio": "^1.0.0",
     "express": "^4.19.2",
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",