fix: fix bloomberg scraping using byparr
This commit is contained in:
@ -1,4 +1,5 @@
|
||||
const express = require('express')
|
||||
const cheerio = require('cheerio')
|
||||
const puppeteer = require('puppeteer-extra')
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
|
||||
|
||||
@ -9,6 +10,93 @@ app.use(express.json())
|
||||
|
||||
const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
|
||||
const PORT = process.env.PORT || 3001
|
||||
const BYPARR_URL = process.env.BYPARR_URL || ''
|
||||
|
||||
// ---------- Byparr helpers ----------
|
||||
|
||||
/**
 * Sends one command to the Byparr service and returns its decoded JSON reply.
 *
 * The payload is POSTed as JSON to the `/v1` endpoint under BYPARR_URL.
 *
 * @param {object} body - Command payload; serialised with JSON.stringify.
 * @returns {Promise<any>} The parsed JSON body of the response.
 * @throws {Error} When BYPARR_URL is empty, or when the response status is not
 *   OK (the message then carries the status code and the response text).
 */
async function byparrRequest(body) {
  // Fail with an explicit message rather than handing fetch the relative URL "/v1".
  if (!BYPARR_URL) throw new Error('byparr is not configured (BYPARR_URL is empty)')

  // Tolerate a trailing slash in the configured base URL so we never request "//v1".
  const endpoint = `${BYPARR_URL.replace(/\/+$/, '')}/v1`

  const resp = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  })
  if (!resp.ok) throw new Error(`byparr HTTP ${resp.status}: ${await resp.text()}`)
  return resp.json()
}
|
||||
|
||||
/**
 * Fetches a page through Byparr and returns the HTML it produced.
 *
 * Issues a `request.get` command with a 60 s `maxTimeout` and reads the HTML
 * from `solution.response` in the reply.
 *
 * @param {string} url - Absolute URL of the page to load.
 * @returns {Promise<{html: string}>} The HTML wrapped in an object.
 * @throws {Error} When the reply carries no HTML, or when the request itself fails.
 */
async function byparrFetch(url) {
  console.log('[byparr] GET', url)
  const data = await byparrRequest({ cmd: 'request.get', url, maxTimeout: 60000 })

  // `data` itself may be null if the service answers with a JSON `null`.
  const html = data?.solution?.response
  if (!html) {
    // NOTE(review): assumes the reply may carry a human-readable `message`
    // (FlareSolverr-style shape) — confirm against the Byparr documentation.
    const reason = typeof data?.message === 'string' && data.message ? ` (${data.message})` : ''
    throw new Error(`byparr returned no HTML for ${url}${reason}`)
  }
  return { html }
}
|
||||
|
||||
// ---------- Article parsing ----------
|
||||
|
||||
/**
 * Extracts article links from a Bloomberg section page.
 *
 * Looks at every `a[data-component="story-link"]` anchor, resolves its href
 * against https://www.bloomberg.com, drops the query string and fragment, and
 * keeps the link when its URL contains an article/feature path segment and its
 * headline text is at least 10 characters long. Each URL is returned once.
 *
 * @param {string} html - Page markup to parse with cheerio.
 * @returns {Array<{title: string, url: string}>} Articles in document order.
 */
function parseArticlesFromHtml(html) {
  const $ = cheerio.load(html)
  const results = []
  const seen = new Set()

  $('a[data-component="story-link"]').each((_, el) => {
    const a = $(el)
    const href = (a.attr('href') || '').trim()
    if (!href) return

    // Resolve with the URL parser instead of string concatenation so that
    // relative paths without a leading slash and protocol-relative links
    // ("//host/path") produce a valid absolute URL.
    let parsed
    try {
      parsed = new URL(href, 'https://www.bloomberg.com')
    } catch {
      return
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return

    // origin + pathname discards both "?query" and "#fragment", so the same
    // story linked with different tracking suffixes de-duplicates correctly.
    const clean = `${parsed.origin}${parsed.pathname}`

    if (seen.has(clean)) return
    if (!/\/(news\/articles|news\/features|features)\//.test(clean)) return

    const title = a.find('[data-testid="headline"] span').text().trim()
    if (!title || title.length < 10) return

    seen.add(clean)
    results.push({ title, url: clean })
  })

  return results
}
|
||||
|
||||
// ---------- Bloomberg scraping via Byparr ----------
|
||||
|
||||
/**
 * Collects Bloomberg article links by loading a fixed list of section pages
 * through Byparr, one after another, and parsing each page's HTML.
 *
 * A page that fails is logged and skipped so the remaining pages still
 * contribute. If every page fails, the function throws instead of returning an
 * empty list, so callers can tell an outage apart from "no articles found".
 *
 * @param {string} username - Not used by this function; kept so the signature
 *   matches the Puppeteer-based scraper.
 * @param {string} password - Not used by this function; see `username`.
 * @returns {Promise<Array<{title: string, url: string}>>} Articles, unique by
 *   URL, in the order they were first encountered.
 * @throws {Error} When none of the pages could be fetched and parsed.
 */
async function scrapeBloombergViaByparr(username, password) {
  // /markets main page is excluded: live ticker widgets cause continuous network activity
  // so Byparr's networkidle wait never resolves. Sub-sections below are editorial pages
  // without live data — they behave like /technology and /economics (~5s each).
  const byparrPages = [
    'https://www.bloomberg.com/markets/stocks',
    'https://www.bloomberg.com/markets/currencies',
    'https://www.bloomberg.com/markets/rates-bonds',
    'https://www.bloomberg.com/markets/commodities',
    'https://www.bloomberg.com/technology',
    'https://www.bloomberg.com/economics',
    'https://www.bloomberg.com/politics',
  ]

  const articles = []
  const seen = new Set()
  let failedPages = 0
  let lastError

  for (const url of byparrPages) {
    try {
      const { html } = await byparrFetch(url)
      const items = parseArticlesFromHtml(html)
      for (const item of items) {
        // The same story is often listed on several section pages.
        if (!seen.has(item.url)) {
          seen.add(item.url)
          articles.push(item)
        }
      }
      console.log('[bloomberg]', url, '->', items.length, 'articles')
    } catch (e) {
      // Best effort per page: record the failure and carry on with the next one.
      failedPages += 1
      lastError = e
      console.error('[bloomberg] error on', url, ':', e.message)
    }
  }

  // Every single page failing means the scrape did not work at all; surface it
  // rather than reporting a successful run with zero articles.
  if (failedPages === byparrPages.length) {
    throw new Error(
      `byparr failed on every Bloomberg page (last error: ${lastError?.message})`,
      { cause: lastError }
    )
  }

  return articles
}
|
||||
|
||||
// ---------- Bloomberg scraping via Puppeteer (fallback, may be blocked) ----------
|
||||
|
||||
function launchBrowser() {
|
||||
return puppeteer.launch({
|
||||
@ -47,21 +135,13 @@ async function tryType(page, selectors, text) {
|
||||
return false
|
||||
}
|
||||
|
||||
app.get('/health', (_, res) => res.json({ ok: true }))
|
||||
|
||||
app.post('/bloomberg/scrape', async (req, res) => {
|
||||
const { username, password } = req.body || {}
|
||||
if (!username || !password) {
|
||||
return res.status(400).json({ error: 'username and password required' })
|
||||
}
|
||||
|
||||
async function scrapeBloombergViaPuppeteer(username, password) {
|
||||
let browser
|
||||
try {
|
||||
browser = await launchBrowser()
|
||||
const page = await browser.newPage()
|
||||
await page.setViewport({ width: 1920, height: 1080 })
|
||||
|
||||
// Hide automation signals
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
|
||||
window.chrome = { runtime: {} }
|
||||
@ -75,35 +155,28 @@ app.post('/bloomberg/scrape', async (req, res) => {
|
||||
})
|
||||
await new Promise(r => setTimeout(r, 2000))
|
||||
|
||||
// Debug: état de la page avant de chercher l'email
|
||||
const pageInputs = await page.evaluate(() =>
|
||||
Array.from(document.querySelectorAll('input')).map(i => ({
|
||||
type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null
|
||||
}))
|
||||
)
|
||||
console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
|
||||
const pageTitle = await page.title()
|
||||
console.log('[bloomberg] page title:', pageTitle)
|
||||
console.log('[bloomberg] page title:', await page.title())
|
||||
|
||||
console.log('[bloomberg] entering email')
|
||||
const emailSelectors = [
|
||||
const emailOk = await tryType(page, [
|
||||
'#email-form-input',
|
||||
'input[id="email-form-input"]',
|
||||
'input[type="email"]',
|
||||
'input[name="text-input"]',
|
||||
'input[placeholder*="email" i]',
|
||||
]
|
||||
const emailOk = await tryType(page, emailSelectors, username)
|
||||
], username)
|
||||
if (!emailOk) throw new Error('could not find email input')
|
||||
|
||||
await new Promise(r => setTimeout(r, 800))
|
||||
|
||||
// Click submit via JS pour contourner les boutons désactivés
|
||||
const submitted = await page.evaluate(() => {
|
||||
const btns = Array.from(document.querySelectorAll('button'))
|
||||
const btn = btns.find(b =>
|
||||
b.type === 'submit' ||
|
||||
/continue|next|sign.?in/i.test(b.textContent)
|
||||
const btn = Array.from(document.querySelectorAll('button')).find(b =>
|
||||
b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent)
|
||||
)
|
||||
if (btn) { btn.click(); return true }
|
||||
const form = document.querySelector('form')
|
||||
@ -112,7 +185,6 @@ app.post('/bloomberg/scrape', async (req, res) => {
|
||||
})
|
||||
if (!submitted) await page.keyboard.press('Enter')
|
||||
|
||||
// Attendre que la page change (password input apparaît ou navigation)
|
||||
try {
|
||||
await page.waitForFunction(
|
||||
() => document.querySelector('input[type="password"]') !== null,
|
||||
@ -123,31 +195,28 @@ app.post('/bloomberg/scrape', async (req, res) => {
|
||||
}
|
||||
console.log('[bloomberg] after email submit, url:', page.url())
|
||||
|
||||
// Debug inputs disponibles
|
||||
const allInputs = await page.evaluate(() =>
|
||||
Array.from(document.querySelectorAll('input')).map(i => ({
|
||||
type: i.type, name: i.name, id: i.id, placeholder: i.placeholder
|
||||
}))
|
||||
)
|
||||
console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs))
|
||||
|
||||
console.log('[bloomberg] entering password')
|
||||
const pwdOk = await tryType(page, [
|
||||
'input[type="password"]',
|
||||
'input[name="password"]',
|
||||
'input[autocomplete="current-password"]',
|
||||
'input[autocomplete="password"]',
|
||||
], password)
|
||||
if (!pwdOk) throw new Error('could not find password input — check logs above for available inputs')
|
||||
if (!pwdOk) throw new Error('could not find password input')
|
||||
|
||||
await new Promise(r => setTimeout(r, 500))
|
||||
await tryClick(page, ['button[type="submit"]', 'input[type="submit"]'])
|
||||
await new Promise(r => setTimeout(r, 3000))
|
||||
await Promise.all([
|
||||
page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 20000 }).catch(() => {}),
|
||||
tryClick(page, ['button[type="submit"]', 'input[type="submit"]']),
|
||||
])
|
||||
|
||||
const currentURL = page.url()
|
||||
console.log('[bloomberg] after login, url:', currentURL)
|
||||
if (currentURL.includes('/account/signin')) {
|
||||
const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500))
|
||||
throw new Error(`login did not redirect — page content: ${bodyText}`)
|
||||
}
|
||||
|
||||
const pages = [
|
||||
const targetPages = [
|
||||
'https://www.bloomberg.com/markets',
|
||||
'https://www.bloomberg.com/technology',
|
||||
'https://www.bloomberg.com/economics',
|
||||
@ -156,32 +225,17 @@ app.post('/bloomberg/scrape', async (req, res) => {
|
||||
const articles = []
|
||||
const seen = new Set()
|
||||
|
||||
for (const url of pages) {
|
||||
for (const url of targetPages) {
|
||||
try {
|
||||
console.log('[bloomberg] scraping', url)
|
||||
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 })
|
||||
await new Promise(r => setTimeout(r, 2000))
|
||||
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
||||
await page.waitForSelector('a[data-component="story-link"]', { timeout: 15000 }).catch(() => {})
|
||||
|
||||
const items = await page.evaluate(() => {
|
||||
const results = []
|
||||
const seen = new Set()
|
||||
const links = document.querySelectorAll(
|
||||
'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]'
|
||||
)
|
||||
links.forEach(a => {
|
||||
if (seen.has(a.href)) return
|
||||
seen.add(a.href)
|
||||
const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]')
|
||||
const text = titleEl ? titleEl.innerText.trim() : a.innerText.trim()
|
||||
if (text.length > 20 && a.href.includes('bloomberg.com')) {
|
||||
results.push({ title: text, url: a.href })
|
||||
}
|
||||
})
|
||||
return results.slice(0, 25)
|
||||
})
|
||||
const html = await page.content()
|
||||
const items = parseArticlesFromHtml(html)
|
||||
|
||||
for (const item of items) {
|
||||
if (!seen.has(item.url) && item.title && item.url) {
|
||||
if (!seen.has(item.url)) {
|
||||
seen.add(item.url)
|
||||
articles.push(item)
|
||||
}
|
||||
@ -192,13 +246,37 @@ app.post('/bloomberg/scrape', async (req, res) => {
|
||||
}
|
||||
}
|
||||
|
||||
return articles
|
||||
} finally {
|
||||
if (browser) await browser.close()
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- Routes ----------
|
||||
|
||||
app.get('/health', (_, res) => res.json({ ok: true }))
|
||||
|
||||
/**
 * POST /bloomberg/scrape
 *
 * Request body: `{ username, password }` (both required, otherwise 400).
 * Uses the Byparr-based scraper when BYPARR_URL is set and the Puppeteer-based
 * scraper otherwise. Responds with `{ articles }` on success and with
 * `{ error }` and status 500 when scraping throws.
 */
app.post('/bloomberg/scrape', async (req, res) => {
  const { username, password } = req.body || {}
  if (!username || !password) {
    return res.status(400).json({ error: 'username and password required' })
  }

  try {
    let articles
    if (BYPARR_URL) {
      console.log('[bloomberg] using Byparr (nodriver) to bypass bot detection')
      articles = await scrapeBloombergViaByparr(username, password)
    } else {
      console.log('[bloomberg] using Puppeteer (Byparr not configured)')
      articles = await scrapeBloombergViaPuppeteer(username, password)
    }

    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (e) {
    console.error('[bloomberg] scrape error:', e.message)
    res.status(500).json({ error: e.message })
  }
  // No `finally` block here: the handler has no `browser` binding of its own.
  // scrapeBloombergViaPuppeteer closes the browser it launches in its own `finally`.
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user