/**
 * scraper-service
 *
 * Small Express service that drives a headless Chromium (via puppeteer-extra
 * with the stealth plugin) to sign in to bloomberg.com and collect article
 * links from a fixed list of section pages.
 *
 * Routes:
 *   GET  /health            -> { ok: true }
 *   POST /bloomberg/scrape  -> { articles: [{ title, url }, ...] }
 *                              body: { username, password }
 */
const express = require('express')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

puppeteer.use(StealthPlugin())

const app = express()
app.use(express.json())

// `||` (not `??`) on purpose: an empty environment variable also falls back.
const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
const PORT = process.env.PORT || 3001

const LOGIN_URL = 'https://www.bloomberg.com/account/signin'

// Section pages visited, in order, after the login attempt.
const SECTION_URLS = [
  'https://www.bloomberg.com/markets',
  'https://www.bloomberg.com/technology',
  'https://www.bloomberg.com/economics',
]

/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms))
}

/**
 * Launch a headless Chromium instance using the executable at CHROME_PATH.
 *
 * @returns {Promise<object>} The puppeteer browser handle.
 */
function launchBrowser() {
  return puppeteer.launch({
    executablePath: CHROME_PATH,
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--window-size=1920,1080',
      '--disable-blink-features=AutomationControlled',
    ],
  })
}

/**
 * Click the first element matched by any of `selectors`.
 * If nothing could be clicked, press Enter on the page keyboard instead.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors, tried in order.
 * @returns {Promise<boolean>} true if an element was clicked, false if the
 *   Enter-key fallback was used.
 */
async function tryClick(page, selectors) {
  for (const sel of selectors) {
    try {
      const el = await page.$(sel)
      if (el) {
        await el.click()
        return true
      }
    } catch {
      // Deliberate best-effort: a failing lookup/click just means we move on
      // to the next selector (and ultimately to the Enter-key fallback).
    }
  }
  await page.keyboard.press('Enter')
  return false
}

/**
 * Type `text` into the first of `selectors` that appears on the page.
 * Each selector is waited on for up to 4 seconds before the next is tried.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors, tried in order.
 * @param {string} text - Text to type (60 ms between keystrokes).
 * @returns {Promise<boolean>} true once text was typed, false if no selector
 *   worked.
 */
async function tryType(page, selectors, text) {
  for (const sel of selectors) {
    try {
      await page.waitForSelector(sel, { timeout: 4000 })
      await page.type(sel, text, { delay: 60 })
      return true
    } catch {
      // Deliberate best-effort: a timeout or typing failure on this selector
      // means we fall through to the next candidate.
    }
  }
  return false
}

/**
 * Open a new 1920x1080 page and register a script, run on every new
 * document, that overrides a few properties commonly used to detect
 * automated browsers.
 *
 * @param {object} browser - Puppeteer browser.
 * @returns {Promise<object>} The prepared page.
 */
async function openStealthPage(browser) {
  const page = await browser.newPage()
  await page.setViewport({ width: 1920, height: 1080 })

  // Hide automation signals
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
    window.chrome = { runtime: {} }
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] })
  })

  return page
}

/**
 * Walk through the two-step sign-in form: email first, then password.
 *
 * Note: the outcome of the login is not verified; the post-login URL is only
 * logged.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} username - Account email.
 * @param {string} password - Account password.
 * @returns {Promise<void>}
 * @throws {Error} If the email or the password input cannot be found.
 */
async function login(page, username, password) {
  console.log('[bloomberg] navigating to login page')
  await page.goto(LOGIN_URL, {
    waitUntil: 'networkidle2',
    timeout: 60000,
  })
  await sleep(2000)

  // Debug: state of the page before looking for the email input
  const pageInputs = await page.evaluate(() =>
    Array.from(document.querySelectorAll('input')).map(i => ({
      type: i.type,
      name: i.name,
      id: i.id,
      placeholder: i.placeholder,
      visible: i.offsetParent !== null,
    }))
  )
  console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
  const pageTitle = await page.title()
  console.log('[bloomberg] page title:', pageTitle)

  console.log('[bloomberg] entering email')
  const emailSelectors = [
    '#email-form-input',
    'input[id="email-form-input"]',
    'input[type="email"]',
    'input[name="text-input"]',
    'input[placeholder*="email" i]',
  ]
  const emailOk = await tryType(page, emailSelectors, username)
  if (!emailOk) throw new Error('could not find email input')
  await sleep(800)

  // Submit from in-page JS; intended to get around disabled buttons.
  // Falls back to form.submit(), then (below) to the Enter key.
  const submitted = await page.evaluate(() => {
    const btns = Array.from(document.querySelectorAll('button'))
    const btn = btns.find(
      b => b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent)
    )
    if (btn) {
      btn.click()
      return true
    }
    const form = document.querySelector('form')
    if (form) {
      form.submit()
      return true
    }
    return false
  })
  if (!submitted) await page.keyboard.press('Enter')

  // Wait for the page to change (password input appears or navigation)
  try {
    await page.waitForFunction(
      () => document.querySelector('input[type="password"]') !== null,
      { timeout: 10000 }
    )
  } catch {
    // No password input showed up in time; give the page a little longer
    // before the selector-based attempt below.
    await sleep(3000)
  }
  console.log('[bloomberg] after email submit, url:', page.url())

  // Debug: inputs available at this point
  const allInputs = await page.evaluate(() =>
    Array.from(document.querySelectorAll('input')).map(i => ({
      type: i.type,
      name: i.name,
      id: i.id,
      placeholder: i.placeholder,
    }))
  )
  console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs))

  console.log('[bloomberg] entering password')
  const passwordSelectors = [
    'input[type="password"]',
    'input[name="password"]',
    'input[autocomplete="current-password"]',
    'input[autocomplete="password"]',
  ]
  const pwdOk = await tryType(page, passwordSelectors, password)
  if (!pwdOk) {
    throw new Error('could not find password input — check logs above for available inputs')
  }
  await sleep(500)

  await tryClick(page, ['button[type="submit"]', 'input[type="submit"]'])
  await sleep(3000)

  const currentURL = page.url()
  console.log('[bloomberg] after login, url:', currentURL)
}

/**
 * Load one section page and collect up to 25 article links from it.
 *
 * A link is kept when its href contains "bloomberg.com" and its headline
 * text (or, failing that, the anchor's own text) is longer than 20
 * characters. Duplicate hrefs within the page are skipped.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} url - Section URL to load.
 * @returns {Promise<Array<{title: string, url: string}>>}
 */
async function scrapeSection(page, url) {
  await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 })
  await sleep(2000)

  // Runs inside the browser: must not reference anything from this module.
  return page.evaluate(() => {
    const results = []
    const seenHrefs = new Set()
    const links = document.querySelectorAll(
      'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]'
    )
    links.forEach(a => {
      if (seenHrefs.has(a.href)) return
      seenHrefs.add(a.href)
      const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]')
      const text = titleEl ? titleEl.innerText.trim() : a.innerText.trim()
      if (text.length > 20 && a.href.includes('bloomberg.com')) {
        results.push({ title: text, url: a.href })
      }
    })
    return results.slice(0, 25)
  })
}

app.get('/health', (_req, res) => res.json({ ok: true }))

/**
 * POST /bloomberg/scrape
 *
 * Body: { username: string, password: string }
 * 200:  { articles: [{ title, url }, ...] }  (de-duplicated by url across sections)
 * 400:  { error } when credentials are missing or not strings
 * 500:  { error } when the browser/login flow fails
 *
 * A failure on a single section page is logged and skipped; it does not fail
 * the whole request.
 */
app.post('/bloomberg/scrape', async (req, res) => {
  const { username, password } = req.body || {}
  // Reject non-string values up front: they would otherwise only fail deep
  // inside the typing helper, after a browser has already been launched.
  const hasCredentials =
    typeof username === 'string' &&
    typeof password === 'string' &&
    username.length > 0 &&
    password.length > 0
  if (!hasCredentials) {
    return res.status(400).json({ error: 'username and password required' })
  }

  let browser
  try {
    browser = await launchBrowser()
    const page = await openStealthPage(browser)

    await login(page, username, password)

    const articles = []
    const seen = new Set()

    for (const url of SECTION_URLS) {
      try {
        console.log('[bloomberg] scraping', url)
        const items = await scrapeSection(page, url)

        for (const item of items) {
          if (!seen.has(item.url) && item.title && item.url) {
            seen.add(item.url)
            articles.push(item)
          }
        }
        console.log('[bloomberg]', url, '->', items.length, 'articles')
      } catch (e) {
        console.error('[bloomberg] error on', url, ':', e.message)
      }
    }

    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (e) {
    console.error('[bloomberg] scrape error:', e.message)
    res.status(500).json({ error: e.message })
  } finally {
    if (browser) {
      try {
        await browser.close()
      } catch (closeErr) {
        // The response has already been sent by now; a rejection escaping
        // this async handler would not be caught by the try/catch above.
        console.error('[bloomberg] failed to close browser:', closeErr.message)
      }
    }
  }
})

app.listen(PORT, () => console.log(`scraper-service listening on :${PORT}`))