// Tradarr/scraper-service/index.js

const express = require('express')
const cheerio = require('cheerio')
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

// Register the stealth plugin on the shared puppeteer-extra instance at module
// load, i.e. before launchBrowser() can be called.
puppeteer.use(StealthPlugin())

// Express application; incoming request bodies are parsed as JSON.
const app = express()
app.use(express.json())

// Runtime configuration, read once from the environment at startup.
// CHROME_PATH: browser binary handed to puppeteer.launch (default /usr/bin/chromium).
const CHROME_PATH = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium'
// PORT: port passed to app.listen (default 3001).
const PORT = process.env.PORT || 3001
// BYPARR_URL: base URL of the Byparr service. When empty (the default), the
// scrape route uses the Puppeteer path instead.
const BYPARR_URL = process.env.BYPARR_URL || ''

// ---------- Byparr helpers ----------

/**
 * Sends one command to the Byparr HTTP API (`POST <BYPARR_URL>/v1`) and
 * returns the decoded JSON reply.
 *
 * @param {object} body - Command payload; serialised as the JSON request body.
 * @returns {Promise<any>} The parsed JSON response body.
 * @throws {Error} When Byparr answers with a non-2xx status. The message
 *   carries the HTTP status and the raw response text.
 */
async function byparrRequest(body) {
  // Tolerate a base URL configured with trailing slash(es): without this,
  // "http://host:8191/" would produce "http://host:8191//v1".
  const endpoint = `${BYPARR_URL.replace(/\/+$/, '')}/v1`

  const resp = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  })
  if (!resp.ok) throw new Error(`byparr HTTP ${resp.status}: ${await resp.text()}`)
  return resp.json()
}

/**
 * Asks Byparr to load `url` (command `request.get`, maxTimeout 60000) and
 * returns the HTML it reports for the page.
 *
 * @param {string} url - Page to load through Byparr.
 * @returns {Promise<{html: string}>} The HTML taken from `solution.response`.
 * @throws {Error} When the reply has no (or an empty) `solution.response`,
 *   or when the underlying byparrRequest call fails.
 */
async function byparrFetch(url) {
  console.log('[byparr] GET', url)

  const command = { cmd: 'request.get', url, maxTimeout: 60000 }
  const reply = await byparrRequest(command)

  const solution = reply.solution
  const html = solution == null ? undefined : solution.response
  if (html) {
    return { html }
  }
  throw new Error(`byparr returned no HTML for ${url}`)
}

// ---------- Article parsing ----------

/**
 * Extracts article links from the HTML of a Bloomberg section page.
 *
 * Every `a[data-component="story-link"]` anchor is considered. Its href is
 * resolved against https://www.bloomberg.com, query string and fragment are
 * dropped, and the link is kept only if its path contains /news/articles/,
 * /news/features/ or /features/ and its headline text is at least 10
 * characters long. Links are de-duplicated by cleaned URL; the first
 * occurrence wins.
 *
 * @param {string} html - Page markup.
 * @returns {Array<{title: string, url: string}>} Articles in document order.
 */
function parseArticlesFromHtml(html) {
  const $ = cheerio.load(html)
  const results = []
  const seen = new Set()

  $('a[data-component="story-link"]').each((_, el) => {
    const a = $(el)
    const href = (a.attr('href') || '').trim()
    if (!href) return

    // Resolve with the URL parser instead of string concatenation so that
    // relative, root-relative and protocol-relative ("//host/path") hrefs all
    // produce a well-formed absolute URL.
    let target
    try {
      target = new URL(href, 'https://www.bloomberg.com')
    } catch {
      return // unparseable href: not a usable article link
    }
    // Skip javascript:, mailto: and similar non-web links.
    if (target.protocol !== 'https:' && target.protocol !== 'http:') return

    // Tracking parameters and fragments do not identify a different article;
    // removing both keeps de-duplication reliable.
    target.search = ''
    target.hash = ''
    const clean = target.href

    if (seen.has(clean)) return
    if (!/\/(news\/articles|news\/features|features)\//.test(target.pathname)) return

    const title = a.find('[data-testid="headline"] span').text().trim()
    if (title.length < 10) return

    seen.add(clean)
    results.push({ title, url: clean })
  })

  return results
}

// ---------- Bloomberg scraping via Byparr ----------

/**
 * Collects article links from a fixed set of Bloomberg section pages, each
 * loaded through Byparr one after another.
 *
 * A page that fails to load or parse is logged and skipped; the remaining
 * pages are still processed.
 *
 * @param {string} username - Accepted for signature parity with the Puppeteer
 *   scraper; not used by this function.
 * @param {string} password - Accepted for signature parity; not used here.
 * @returns {Promise<Array<{title: string, url: string}>>} Articles across all
 *   pages, de-duplicated by URL, in first-seen order.
 */
async function scrapeBloombergViaByparr(username, password) {
  // The /markets landing page is deliberately left out: its live ticker widgets
  // keep the network busy, so Byparr's networkidle wait never resolves. The
  // sub-sections listed here are editorial pages without live data and behave
  // like /technology and /economics (~5s each).
  const sectionUrls = [
    'https://www.bloomberg.com/markets/stocks',
    'https://www.bloomberg.com/markets/currencies',
    'https://www.bloomberg.com/markets/rates-bonds',
    'https://www.bloomberg.com/markets/commodities',
    'https://www.bloomberg.com/technology',
    'https://www.bloomberg.com/economics',
    'https://www.bloomberg.com/politics',
  ]

  // Insertion-ordered map keyed by article URL; the first occurrence is kept.
  const firstByUrl = new Map()

  for (const sectionUrl of sectionUrls) {
    try {
      const { html: markup } = await byparrFetch(sectionUrl)
      const found = parseArticlesFromHtml(markup)
      for (const article of found) {
        if (firstByUrl.has(article.url)) continue
        firstByUrl.set(article.url, article)
      }
      console.log('[bloomberg]', sectionUrl, '->', found.length, 'articles')
    } catch (err) {
      console.error('[bloomberg] error on', sectionUrl, ':', err.message)
    }
  }

  return Array.from(firstByUrl.values())
}

// ---------- Bloomberg scraping via Puppeteer (fallback, may be blocked) ----------

/**
 * Starts a headless browser from the binary at CHROME_PATH with the fixed set
 * of command-line flags listed below.
 *
 * @returns {Promise<object>} Whatever `puppeteer.launch` returns (the browser).
 */
function launchBrowser() {
  const chromiumFlags = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--window-size=1920,1080',
    '--disable-blink-features=AutomationControlled',
  ]

  const launchOptions = {
    executablePath: CHROME_PATH,
    headless: true,
    args: chromiumFlags,
  }

  return puppeteer.launch(launchOptions)
}

/**
 * Clicks the first element matched by any of `selectors`, tried in order.
 * A selector whose lookup or click throws is skipped. If nothing could be
 * clicked, Enter is pressed on the page keyboard instead.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors in priority order.
 * @returns {Promise<boolean>} true if an element was clicked, false if the
 *   Enter-key fallback was used.
 */
async function tryClick(page, selectors) {
  // Best-effort attempt on one selector; failures count as "not clicked".
  const clickIfPresent = async (selector) => {
    try {
      const handle = await page.$(selector)
      if (!handle) return false
      await handle.click()
      return true
    } catch {
      return false
    }
  }

  for (const selector of selectors) {
    if (await clickIfPresent(selector)) return true
  }

  await page.keyboard.press('Enter')
  return false
}

/**
 * Types `text` into the first of `selectors` that can be typed into. Each
 * selector is given up to 4 seconds to appear; a selector that does not
 * appear, or whose typing throws, is skipped and the next one is tried.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors in priority order.
 * @param {string} text - Text to type (60 ms delay between keystrokes).
 * @returns {Promise<boolean>} true once typing succeeded, false if every
 *   selector failed.
 */
async function tryType(page, selectors, text) {
  // Best-effort attempt on one selector; any failure counts as "not typed".
  const typeInto = async (selector) => {
    try {
      await page.waitForSelector(selector, { timeout: 4000 })
      await page.type(selector, text, { delay: 60 })
      return true
    } catch {
      return false
    }
  }

  for (const selector of selectors) {
    const typed = await typeInto(selector)
    if (typed) return true
  }
  return false
}

/**
 * Fallback scraper: signs in to Bloomberg in a headless browser and collects
 * article links from a fixed list of section pages.
 *
 * Flow: launch browser -> open the sign-in page -> type the email and submit
 * -> type the password and submit -> verify the page left /account/signin ->
 * visit each target page and parse its story links.
 *
 * @param {string} username - Typed into the sign-in email field.
 * @param {string} password - Typed into the sign-in password field.
 * @returns {Promise<Array<{title: string, url: string}>>} Articles from all
 *   target pages, de-duplicated by URL, in first-seen order.
 * @throws {Error} When the email or password input cannot be found, when the
 *   page URL still contains /account/signin after submitting, or when the
 *   browser launch / sign-in navigation itself fails. Errors on individual
 *   target pages are logged and do not abort the run.
 */
async function scrapeBloombergViaPuppeteer(username, password) {
  let browser
  try {
    browser = await launchBrowser()
    const page = await browser.newPage()
    await page.setViewport({ width: 1920, height: 1080 })

    // Runs in every new document before the page's own scripts: hides
    // navigator.webdriver and supplies window.chrome and navigator.plugins values.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined })
      window.chrome = { runtime: {} }
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] })
    })

    console.log('[bloomberg] navigating to login page')
    await page.goto('https://www.bloomberg.com/account/signin', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    })
    // Fixed 2 s pause after load — presumably to let the login form finish
    // rendering; TODO confirm it is still needed.
    await new Promise(r => setTimeout(r, 2000))

    // Diagnostic only: log the attributes (not the values) of every <input>
    // on the page, plus the page title.
    const pageInputs = await page.evaluate(() =>
      Array.from(document.querySelectorAll('input')).map(i => ({
        type: i.type, name: i.name, id: i.id, placeholder: i.placeholder, visible: i.offsetParent !== null
      }))
    )
    console.log('[bloomberg] inputs on login page:', JSON.stringify(pageInputs))
    console.log('[bloomberg] page title:', await page.title())

    // Step 1: email. Selectors are tried in order; the first usable one wins.
    const emailOk = await tryType(page, [
      '#email-form-input',
      'input[id="email-form-input"]',
      'input[type="email"]',
      'input[name="text-input"]',
      'input[placeholder*="email" i]',
    ], username)
    if (!emailOk) throw new Error('could not find email input')

    await new Promise(r => setTimeout(r, 800))

    // Submit the email step from inside the page: click the first button that
    // is a submit button or whose text matches continue/next/sign in;
    // otherwise submit the first <form>. If neither exists, press Enter.
    const submitted = await page.evaluate(() => {
      const btn = Array.from(document.querySelectorAll('button')).find(b =>
        b.type === 'submit' || /continue|next|sign.?in/i.test(b.textContent)
      )
      if (btn) { btn.click(); return true }
      const form = document.querySelector('form')
      if (form) { form.submit(); return true }
      return false
    })
    if (!submitted) await page.keyboard.press('Enter')

    // Wait up to 10 s for a password field to appear. On timeout, pause 3 s
    // and carry on — the tryType call below does its own per-selector waiting.
    try {
      await page.waitForFunction(
        () => document.querySelector('input[type="password"]') !== null,
        { timeout: 10000 }
      )
    } catch {
      await new Promise(r => setTimeout(r, 3000))
    }
    console.log('[bloomberg] after email submit, url:', page.url())

    // Step 2: password.
    const pwdOk = await tryType(page, [
      'input[type="password"]',
      'input[name="password"]',
      'input[autocomplete="current-password"]',
      'input[autocomplete="password"]',
    ], password)
    if (!pwdOk) throw new Error('could not find password input')

    await new Promise(r => setTimeout(r, 500))
    // The navigation wait is set up in the same Promise.all as the click, and
    // is listed first. A navigation timeout (20 s) is swallowed; the URL check
    // below decides whether the login worked.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 20000 }).catch(() => {}),
      tryClick(page, ['button[type="submit"]', 'input[type="submit"]']),
    ])

    // Still on the sign-in URL means the login did not go through; include the
    // first 500 characters of the page text in the error for diagnosis.
    const currentURL = page.url()
    console.log('[bloomberg] after login, url:', currentURL)
    if (currentURL.includes('/account/signin')) {
      const bodyText = await page.evaluate(() => document.body.innerText.slice(0, 500))
      throw new Error(`login did not redirect — page content: ${bodyText}`)
    }

    const targetPages = [
      'https://www.bloomberg.com/markets',
      'https://www.bloomberg.com/technology',
      'https://www.bloomberg.com/economics',
    ]

    const articles = []
    const seen = new Set() // article URLs already collected, across all pages

    for (const url of targetPages) {
      try {
        console.log('[bloomberg] scraping', url)
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
        // Give story links up to 15 s to appear; if none do, parse whatever
        // markup is present anyway.
        await page.waitForSelector('a[data-component="story-link"]', { timeout: 15000 }).catch(() => {})

        const html = await page.content()
        const items = parseArticlesFromHtml(html)

        for (const item of items) {
          if (!seen.has(item.url)) {
            seen.add(item.url)
            articles.push(item)
          }
        }
        console.log('[bloomberg]', url, '->', items.length, 'articles')
      } catch (e) {
        // A failing page is logged and skipped; the other pages still run.
        console.error('[bloomberg] error on', url, ':', e.message)
      }
    }

    return articles
  } finally {
    // Close the browser on success and on every error path.
    if (browser) await browser.close()
  }
}

// ---------- Routes ----------

// Liveness probe: always answers { ok: true }.
app.get('/health', (_req, res) => {
  res.json({ ok: true })
})

// POST /bloomberg/scrape
// Body: { username, password } — both required, otherwise 400.
// Uses the Byparr scraper when BYPARR_URL is set, the Puppeteer scraper
// otherwise. Responds with { articles } on success, or 500 and
// { error: <message> } when the scraper throws.
app.post('/bloomberg/scrape', async (req, res) => {
  const credentials = req.body || {}
  const username = credentials.username
  const password = credentials.password

  if (!(username && password)) {
    return res.status(400).json({ error: 'username and password required' })
  }

  try {
    const useByparr = Boolean(BYPARR_URL)
    console.log(
      useByparr
        ? '[bloomberg] using Byparr (nodriver) to bypass bot detection'
        : '[bloomberg] using Puppeteer (Byparr not configured)'
    )

    const scrape = useByparr ? scrapeBloombergViaByparr : scrapeBloombergViaPuppeteer
    const articles = await scrape(username, password)

    console.log('[bloomberg] total:', articles.length, 'articles')
    res.json({ articles })
  } catch (err) {
    console.error('[bloomberg] scrape error:', err.message)
    res.status(500).json({ error: err.message })
  }
})

// Start the HTTP server.
app.listen(PORT, () => {
  console.log(`scraper-service listening on :${PORT}`)
})