feat: add news sources and split the AI reasoning into 2 steps to limit the number of news items

This commit is contained in:
2026-04-19 10:43:15 +02:00
parent 93668273ff
commit eb1fb5ca78
28 changed files with 1086 additions and 249 deletions

205
scraper-service/index.js Normal file
View File

@ -0,0 +1,205 @@
const express = require('express')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
// Register the stealth plugin on the puppeteer-extra instance before any browser is launched.
puppeteer.use(StealthPlugin())
// Express application; express.json() parses JSON request bodies into req.body.
const app = express()
app.use(express.json())
// Browser executable passed to puppeteer.launch(); overridable via PUPPETEER_EXECUTABLE_PATH.
const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
// Port the HTTP server listens on; overridable via PORT, falls back to 3001.
const PORT = process.env.PORT || 3001
/**
 * Starts a headless browser through puppeteer-extra.
 *
 * Uses the executable configured in CHROME_PATH and a fixed set of
 * command-line flags.
 *
 * @returns {Promise<object>} whatever puppeteer.launch() resolves to.
 */
function launchBrowser() {
  const browserFlags = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--window-size=1920,1080',
    '--disable-blink-features=AutomationControlled',
  ]

  const launchOptions = {
    executablePath: CHROME_PATH,
    headless: true,
    args: browserFlags,
  }

  return puppeteer.launch(launchOptions)
}
/**
 * Clicks the first element that one of the given selectors resolves to.
 *
 * Selectors are probed in order. A selector that matches nothing, or whose
 * lookup or click throws, is skipped. When no selector could be clicked the
 * Enter key is pressed on the page keyboard instead.
 *
 * @param {object} page - page object exposing `$()` and `keyboard.press()`.
 * @param {Iterable<string>} selectors - CSS selectors to probe, in priority order.
 * @returns {Promise<boolean>} true when an element was clicked, false when
 *   the Enter fallback was used.
 */
async function tryClick(page, selectors) {
  // Probes a single selector; every failure mode counts as "not clicked".
  const clickIfPresent = async (selector) => {
    try {
      const handle = await page.$(selector)
      if (!handle) return false
      await handle.click()
      return true
    } catch {
      return false
    }
  }

  for (const selector of selectors) {
    if (await clickIfPresent(selector)) return true
  }

  // Nothing was clickable: submit through the keyboard instead.
  await page.keyboard.press('Enter')
  return false
}
/**
 * Types `text` into the first selector that becomes available on the page.
 *
 * Each selector is awaited for up to 4000 ms and then typed into with a
 * 60 ms delay option. A selector whose wait or typing throws is skipped and
 * the next one is tried.
 *
 * @param {object} page - page object exposing `waitForSelector()` and `type()`.
 * @param {Iterable<string>} selectors - CSS selectors to probe, in priority order.
 * @param {string} text - text to type.
 * @returns {Promise<boolean>} true once typing succeeded, false when every
 *   selector failed.
 */
async function tryType(page, selectors, text) {
  const WAIT_TIMEOUT_MS = 4000
  const KEY_DELAY_MS = 60

  // Probes a single selector; a wait timeout or typing error counts as a miss.
  const typeIfPresent = async (selector) => {
    try {
      await page.waitForSelector(selector, { timeout: WAIT_TIMEOUT_MS })
      await page.type(selector, text, { delay: KEY_DELAY_MS })
      return true
    } catch {
      return false
    }
  }

  for (const selector of selectors) {
    if (await typeIfPresent(selector)) return true
  }
  return false
}
// GET /health: liveness probe, always answers { ok: true }.
app.get('/health', (_req, res) => {
  res.json({ ok: true })
})
// POST /bloomberg/scrape
//
// Signs in to bloomberg.com with the credentials from the JSON body, then
// collects headline links from a fixed list of section pages.
//
// Request body: { username: string, password: string }
// Responses:
//   200 { articles: [{ title, url }] }  de-duplicated by URL across all sections
//   400 { error }                       credentials missing or not strings
//   500 { error }                       launch, navigation or login-form failure
//
// A failure on one section page is logged and that page is skipped; it does
// not fail the whole request. The browser is always closed before returning.
app.post('/bloomberg/scrape', async (req, res) => {
  const { username, password } = req.body || {}
  // Non-string values must be rejected here: they would make page.type()
  // throw inside tryType(), where the error is swallowed, and the request
  // would fail much later with a misleading "could not find email input".
  if (
    typeof username !== 'string' || !username ||
    typeof password !== 'string' || !password
  ) {
    return res.status(400).json({ error: 'username and password required' })
  }

  // Fixed pauses that let client-side scripts settle between steps.
  const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

  let browser
  try {
    browser = await launchBrowser()
    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })

    // Hide automation signals
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
      window.chrome = { runtime: {} }
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] })
    })

    console.log('[bloomberg] navigating to login page')
    await page.goto('https://www.bloomberg.com/account/signin', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    })
    await sleep(2000)

    // Debug: state of the page before looking for the email field
    const pageInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null
      }))
    )
    console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
    const pageTitle = await page.title()
    console.log('[bloomberg] page title:', pageTitle)

    console.log('[bloomberg] entering email')
    const emailSelectors = [
      '#email-form-input',
      'input[id="email-form-input"]',
      'input[type="email"]',
      'input[name="text-input"]',
      'input[placeholder*="email" i]',
    ]
    const emailOk = await tryType(page, emailSelectors, username)
    if (!emailOk) throw new Error('could not find email input')
    await sleep(800)

    // Click submit through in-page JS to get around disabled buttons
    const submitted = await page.evaluate(() => {
      const btns = Array.from(document.querySelectorAll('button'))
      const btn = btns.find(b =>
        b.type === 'submit' ||
        /continue|next|sign.?in/i.test(b.textContent)
      )
      if (btn) { btn.click(); return true }
      const form = document.querySelector('form')
      if (form) { form.submit(); return true }
      return false
    })
    if (!submitted) await page.keyboard.press('Enter')

    // Wait for the page to change (password input appears or navigation)
    try {
      await page.waitForFunction(
        () => document.querySelector('input[type="password"]') !== null,
        { timeout: 10000 }
      )
    } catch {
      // No password field showed up in time; give the page a little longer
      // and let the selector probing below decide.
      await sleep(3000)
    }
    console.log('[bloomberg] after email submit, url:', page.url())

    // Debug: inputs available after the email step
    const allInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder
      }))
    )
    console.log('[bloomberg] inputs after email submit:', JSON.stringify(allInputs))

    console.log('[bloomberg] entering password')
    const pwdOk = await tryType(page, [
      'input[type="password"]',
      'input[name="password"]',
      'input[autocomplete="current-password"]',
      'input[autocomplete="password"]',
    ], password)
    if (!pwdOk) throw new Error('could not find password input — check logs above for available inputs')
    await sleep(500)
    await tryClick(page, ['button[type="submit"]', 'input[type="submit"]'])
    await sleep(3000)

    const currentURL = page.url()
    console.log('[bloomberg] after login, url:', currentURL)

    const pages = [
      'https://www.bloomberg.com/markets',
      'https://www.bloomberg.com/technology',
      'https://www.bloomberg.com/economics',
    ]
    const articles = []
    // URLs already collected, across every section page.
    const seen = new Set()

    for (const url of pages) {
      try {
        console.log('[bloomberg] scraping', url)
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 })
        await sleep(2000)

        // Runs inside the page: it cannot see variables of this handler, so
        // it keeps its own per-page de-duplication set.
        const items = await page.evaluate(() => {
          const results = []
          const seenOnPage = new Set()
          const links = document.querySelectorAll(
            'a[href*="/news/articles"], a[href*="/opinion/"], a[href*="/markets/"]'
          )
          links.forEach(a => {
            if (seenOnPage.has(a.href)) return
            seenOnPage.add(a.href)
            const titleEl = a.querySelector('h1,h2,h3,h4,[class*="headline"],[class*="title"]')
            // innerText can be undefined on non-HTML elements (e.g. SVG).
            const text = ((titleEl || a).innerText || '').trim()
            // Compare the host rather than searching the whole URL: a
            // substring test also accepts off-site links that merely
            // contain "bloomberg.com" in their path or query string.
            const host = a.hostname || ''
            const onBloomberg = host === 'bloomberg.com' || host.endsWith('.bloomberg.com')
            if (text.length > 20 && onBloomberg) {
              results.push({ title: text, url: a.href })
            }
          })
          return results.slice(0, 25)
        })

        for (const item of items) {
          if (!seen.has(item.url) && item.title && item.url) {
            seen.add(item.url)
            articles.push(item)
          }
        }
        console.log('[bloomberg]', url, '->', items.length, 'articles')
      } catch (e) {
        // One failing section must not discard what the others returned.
        console.error('[bloomberg] error on', url, ':', e.message)
      }
    }

    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (e) {
    console.error('[bloomberg] scrape error:', e.message)
    // Never attempt a second response if one was already started.
    if (!res.headersSent) res.status(500).json({ error: e.message })
  } finally {
    if (browser) {
      // A rejection escaping this async handler would be an unhandled
      // rejection (Express does not catch it), so close() is guarded.
      try {
        await browser.close()
      } catch (closeErr) {
        console.error('[bloomberg] failed to close browser:', closeErr.message)
      }
    }
  }
})
// Start the HTTP server and log the port once it is accepting connections.
app.listen(PORT, () => {
  console.log(`scraper-service listening on :${PORT}`)
})