/**
 * scraper-service: small Express app exposing POST /bloomberg/scrape.
 *
 * Bloomberg section pages are fetched either through a Byparr instance
 * (when BYPARR_URL is set) or through a local Puppeteer/Chromium session,
 * and the story links found on them are returned as `{ title, url }` items.
 */
const express = require('express')
const cheerio = require('cheerio')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

puppeteer.use(StealthPlugin())

const app = express()
app.use(express.json())

const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
const PORT = process.env.PORT || 3001
// Trailing slashes are trimmed so `${BYPARR_URL}/v1` never becomes `...//v1`.
const BYPARR_URL = (process.env.BYPARR_URL || '').replace(/\/+$/, '')

const BLOOMBERG_ORIGIN = 'https://www.bloomberg.com'
const STORY_LINK_SELECTOR = 'a[data-component="story-link"]'
const ARTICLE_PATH_RE = /\/(news\/articles|news\/features|features)\//

// `maxTimeout` sent to Byparr with every request (milliseconds).
const BYPARR_MAX_TIMEOUT_MS = 60000
// Client-side cap on the HTTP call itself; kept above BYPARR_MAX_TIMEOUT_MS so
// Byparr gets the chance to answer first, while a stalled connection cannot
// hang the request forever.
const BYPARR_HTTP_TIMEOUT_MS = BYPARR_MAX_TIMEOUT_MS + 30000

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

/**
 * Append the items whose `url` has not been seen yet, recording them in `seen`.
 * Mutates both `articles` and `seen`.
 * @param {Array<{title: string, url: string}>} articles - accumulator
 * @param {Set<string>} seen - URLs already present in `articles`
 * @param {Array<{title: string, url: string}>} items - candidates to merge
 */
function appendUniqueArticles(articles, seen, items) {
  for (const item of items) {
    if (seen.has(item.url)) continue
    seen.add(item.url)
    articles.push(item)
  }
}

// ---------- Byparr helpers ----------

/**
 * POST a command to the Byparr `/v1` endpoint and return the decoded JSON reply.
 * @param {object} body - command payload, serialized as JSON
 * @returns {Promise<object>} parsed response body
 * @throws {Error} on a non-2xx HTTP status (message includes the response text),
 *   or whatever `fetch` rejects with (network failure, client-side timeout)
 */
async function byparrRequest(body) {
  const resp = await fetch(`${BYPARR_URL}/v1`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(BYPARR_HTTP_TIMEOUT_MS),
  })
  if (!resp.ok) throw new Error(`byparr HTTP ${resp.status}: ${await resp.text()}`)
  return resp.json()
}

/**
 * Fetch a page through Byparr using the `request.get` command.
 * @param {string} url - page to fetch
 * @returns {Promise<{html: string}>} the HTML found at `solution.response`
 * @throws {Error} when the reply carries no HTML
 */
async function byparrFetch(url) {
  console.log('[byparr] GET', url)
  const data = await byparrRequest({ cmd: 'request.get', url, maxTimeout: BYPARR_MAX_TIMEOUT_MS })
  const html = data.solution?.response
  if (!html) throw new Error(`byparr returned no HTML for ${url}`)
  return { html }
}

// ---------- Article parsing ----------

/**
 * Turn a link `href` into an absolute URL with query string and fragment removed.
 * Relative and protocol-relative hrefs are resolved against bloomberg.com.
 * @param {string} href
 * @returns {string|null} normalized URL, or null for unparsable / non-http(s) hrefs
 */
function normalizeArticleUrl(href) {
  try {
    const parsed = new URL(href, BLOOMBERG_ORIGIN)
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null
    return `${parsed.origin}${parsed.pathname}`
  } catch {
    return null
  }
}

/**
 * Extract story links from a section page.
 *
 * A link is kept when its normalized URL matches one of the article path
 * patterns and its headline text is at least 10 characters long. Each URL is
 * returned at most once.
 * @param {string} html - page markup
 * @returns {Array<{title: string, url: string}>}
 */
function parseArticlesFromHtml(html) {
  const $ = cheerio.load(html)
  const results = []
  const seen = new Set()

  $(STORY_LINK_SELECTOR).each((_, el) => {
    const link = $(el)
    const clean = normalizeArticleUrl(link.attr('href') || '')
    if (!clean || seen.has(clean)) return
    if (!ARTICLE_PATH_RE.test(clean)) return

    const title = link.find('[data-testid="headline"] span').text().trim()
    if (!title || title.length < 10) return

    seen.add(clean)
    results.push({ title, url: clean })
  })

  return results
}

// ---------- Bloomberg scraping via Byparr ----------

/**
 * Scrape the configured section pages through Byparr, one after another.
 * A failure on one page is logged and does not abort the remaining pages.
 * @param {string} username - not used on this path (no login is performed);
 *   kept so both scrapers share one signature
 * @param {string} password - not used on this path
 * @returns {Promise<Array<{title: string, url: string}>>} de-duplicated articles
 */
async function scrapeBloombergViaByparr(username, password) {
  // /markets main page is excluded: live ticker widgets cause continuous network activity
  // so Byparr's networkidle wait never resolves. Sub-sections below are editorial pages
  // without live data — they behave like /technology and /economics (~5s each).
  const byparrPages = [
    'https://www.bloomberg.com/markets/stocks',
    'https://www.bloomberg.com/markets/currencies',
    'https://www.bloomberg.com/markets/rates-bonds',
    'https://www.bloomberg.com/markets/commodities',
    'https://www.bloomberg.com/technology',
    'https://www.bloomberg.com/economics',
    'https://www.bloomberg.com/politics',
  ]

  const articles = []
  const seen = new Set()

  for (const url of byparrPages) {
    try {
      const { html } = await byparrFetch(url)
      const items = parseArticlesFromHtml(html)
      appendUniqueArticles(articles, seen, items)
      console.log('[bloomberg]', url, '->', items.length, 'articles')
    } catch (e) {
      console.error('[bloomberg] error on', url, ':', e.message)
    }
  }

  return articles
}

// ---------- Bloomberg scraping via Puppeteer (fallback, may be blocked) ----------

/**
 * Launch headless Chromium from CHROME_PATH.
 * @returns {Promise<object>} Puppeteer browser handle
 */
function launchBrowser() {
  return puppeteer.launch({
    executablePath: CHROME_PATH,
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--window-size=1920,1080',
      '--disable-blink-features=AutomationControlled',
    ],
  })
}

/**
 * Click the first element matched by any of `selectors`; when none can be
 * clicked, press Enter on the page instead.
 * @param {object} page - Puppeteer page
 * @param {string[]} selectors - candidates, tried in order
 * @returns {Promise<boolean>} true if an element was clicked, false if Enter was used
 */
async function tryClick(page, selectors) {
  for (const sel of selectors) {
    try {
      const el = await page.$(sel)
      if (el) {
        await el.click()
        return true
      }
    } catch {
      // Best effort: a selector that fails simply falls through to the next one.
    }
  }
  await page.keyboard.press('Enter')
  return false
}

/**
 * Type `text` into the first selector that appears within 4 seconds.
 * @param {object} page - Puppeteer page
 * @param {string[]} selectors - candidates, tried in order
 * @param {string} text - value to type
 * @returns {Promise<boolean>} true once typed, false if no selector worked
 */
async function tryType(page, selectors, text) {
  for (const sel of selectors) {
    try {
      await page.waitForSelector(sel, { timeout: 4000 })
      await page.type(sel, text, { delay: 60 })
      return true
    } catch {
      // Best effort: selector absent or not typable, try the next one.
    }
  }
  return false
}

/**
 * Sign in to Bloomberg with Puppeteer and scrape the section pages.
 * The browser is always closed, whether the scrape succeeds or throws.
 * @param {string} username - account email typed into the sign-in form
 * @param {string} password - account password
 * @returns {Promise<Array<{title: string, url: string}>>} de-duplicated articles
 * @throws {Error} when the email or password field cannot be found, or when the
 *   page is still on /account/signin after submitting
 */
async function scrapeBloombergViaPuppeteer(username, password) {
  let browser
  try {
    browser = await launchBrowser()
    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
      window.chrome = { runtime: {} }
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] })
    })

    console.log('[bloomberg] navigating to login page')
    await page.goto('https://www.bloomberg.com/account/signin', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    })
    await sleep(2000)

    // Diagnostic dump of the form fields, useful when selectors stop matching.
    const pageInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type,
        name: i.name,
        id: i.id,
        placeholder: i.placeholder,
        visible: i.offsetParent !== null,
      }))
    )
    console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
    console.log('[bloomberg] page title:', await page.title())

    const emailOk = await tryType(page, [
      '#email-form-input',
      'input[id="email-form-input"]',
      'input[type="email"]',
      'input[name="text-input"]',
      'input[placeholder*="email" i]',
    ], username)
    if (!emailOk) throw new Error('could not find email input')

    await sleep(800)

    // Submit the email step: prefer a submit-like button, then the form itself,
    // and finally a plain Enter key press.
    const submitted = await page.evaluate(() => {
      const btn = Array.from(document.querySelectorAll('button')).find(b =>
        b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent)
      )
      if (btn) {
        btn.click()
        return true
      }
      const form = document.querySelector('form')
      if (form) {
        form.submit()
        return true
      }
      return false
    })
    if (!submitted) await page.keyboard.press('Enter')

    try {
      await page.waitForFunction(
        () => document.querySelector('input[type="password"]') !== null,
        { timeout: 10000 }
      )
    } catch {
      // No password field yet: wait a little longer and let tryType decide.
      await sleep(3000)
    }

    console.log('[bloomberg] after email submit, url:', page.url())

    const pwdOk = await tryType(page, [
      'input[type="password"]',
      'input[name="password"]',
      'input[autocomplete="current-password"]',
      'input[autocomplete="password"]',
    ], password)
    if (!pwdOk) throw new Error('could not find password input')

    await sleep(500)

    // Start waiting for navigation before clicking so the event is not missed.
    // A navigation timeout is tolerated; the URL check below is the real test.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 20000 }).catch(() => {}),
      tryClick(page, ['button[type="submit"]', 'input[type="submit"]']),
    ])

    const currentURL = page.url()
    console.log('[bloomberg] after login, url:', currentURL)

    if (currentURL.includes('/account/signin')) {
      const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500))
      throw new Error(`login did not redirect — page content: ${bodyText}`)
    }

    const targetPages = [
      'https://www.bloomberg.com/markets',
      'https://www.bloomberg.com/technology',
      'https://www.bloomberg.com/economics',
    ]

    const articles = []
    const seen = new Set()

    for (const url of targetPages) {
      try {
        console.log('[bloomberg] scraping', url)
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
        // Missing story links are not fatal: the page is parsed as-is.
        await page.waitForSelector(STORY_LINK_SELECTOR, { timeout: 15000 }).catch(() => {})
        const html = await page.content()
        const items = parseArticlesFromHtml(html)
        appendUniqueArticles(articles, seen, items)
        console.log('[bloomberg]', url, '->', items.length, 'articles')
      } catch (e) {
        console.error('[bloomberg] error on', url, ':', e.message)
      }
    }

    return articles
  } finally {
    if (browser) {
      // A failing close must not replace the error (or result) of the scrape itself.
      await browser.close().catch((e) => {
        console.error('[bloomberg] browser close failed:', e.message)
      })
    }
  }
}

// ---------- Routes ----------

app.get('/health', (_, res) => res.json({ ok: true }))

// Body: { username, password }. Responds with { articles } on success,
// 400 when credentials are missing, 500 with { error } when scraping throws.
app.post('/bloomberg/scrape', async (req, res) => {
  const { username, password } = req.body || {}
  if (!username || !password) {
    return res.status(400).json({ error: 'username and password required' })
  }

  try {
    let articles
    if (BYPARR_URL) {
      console.log('[bloomberg] using Byparr (nodriver) to bypass bot detection')
      articles = await scrapeBloombergViaByparr(username, password)
    } else {
      console.log('[bloomberg] using Puppeteer (Byparr not configured)')
      articles = await scrapeBloombergViaPuppeteer(username, password)
    }
    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (e) {
    console.error('[bloomberg] scrape error:', e.message)
    res.status(500).json({ error: e.message })
  }
})

app.listen(PORT, () => console.log(`scraper-service listening on :${PORT}`))