Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
550 views
in Technique[技术] by (71.8m points)

node.js - How to avoid being detected as bot on Puppeteer and Phantomjs?

Puppeteer and PhantomJS are similar. The issue I'm having is happening for both, and the code is also similar.

I'd like to scrape some information from a website that requires authentication to view it. I can't even access the home page because my request is flagged as "suspicious activity", as shown in this screenshot: https://i.imgur.com/p69OIjO.png

I discovered that the problem doesn't happen when I test with Postman, sending a header named Cookie whose value is the cookie captured from a real browser session, but this cookie expires after some time. So I guess Puppeteer and PhantomJS are both failing to obtain cookies, because the site is denying access to the headless browser.

What can I do to bypass this?

// Simple Javascript example (PhantomJS script)
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';

// Open the page and render a screenshot of it. PhantomJS keeps running until
// phantom.exit() is called, so exit on failure as well instead of hanging.
page.open(url, function (status) {
    if (status === "success") {
        page.render("home.png");
        phantom.exit();
    } else {
        console.log('Failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
    }
});
See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

In case anyone needs this in the future for the same problem: use puppeteer-extra.

I have tested the code on a server. On the 2nd run a Google CAPTCHA appeared. You can solve it yourself and restart the bot, or use a CAPTCHA-solving service.

I ran the code more than 10 times and there was no IP ban. I did not get the CAPTCHA again on my subsequent runs.

But you can get captcha again!

// Dependencies (install command as given by the author):
// sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline

// First command-line argument; run() launches the browser headless only when it is the string 'true'.
const headless_mode = process.argv[2];

const readline = require('readline');
const puppeteer = require('puppeteer-extra');

// Register the stealth plugin on the puppeteer-extra instance.
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());

// Register the adblocker plugin with tracker blocking switched on.
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));


async function run () {
  // Chromium command-line switches for the launched browser.
  // NOTE(review): the remote-debugging switches bind the DevTools port 9222
  // to 0.0.0.0, i.e. to every network interface. Only keep this on a trusted
  // or firewalled host.
  const chromium_args = [
    '--window-size=1400,900',
    '--remote-debugging-port=9222',
    '--remote-debugging-address=0.0.0.0',
    '--disable-gpu',
    '--disable-features=IsolateOrigins,site-per-process',
    '--blink-settings=imagesEnabled=true',
  ];

  // Headless only when the first CLI argument is exactly 'true'.
  const browser = await puppeteer.launch({
    headless: headless_mode === 'true',
    ignoreHTTPSErrors: true,
    slowMo: 0,
    args: chromium_args,
  });

  const page = await browser.newPage();

  // Load the home page, wait for the network to go quiet for 3 seconds,
  // then let checking_error decide whether the flow can continue.
  console.log(`Testing expertflyer.com`);
  const start_url = 'https://www.expertflyer.com';
  await goto_Page(start_url);
  await waitForNetworkIdle(page, 3000, 0);
  await checking_error(do_2nd_part);




  /**
   * Second step of the flow: clicks the '#yui-gen2 > a' link, waits five
   * seconds, logs the text of the page heading, saves 'expertflyer1.png' and
   * then hands over to checking_error with do_3nd_part as the next step.
   *
   * The click and the heading lookup are best effort: a failure is logged
   * (instead of being silently discarded) and the step carries on.
   *
   * @returns {Promise<void>}
   */
  async function do_2nd_part(){
    const menu_link = '#yui-gen2 > a';
    try {
      await page.click(menu_link);
    } catch (error) {
      console.log(`Could not click '${menu_link}': ${error}`);
    }
    await page.waitFor(5000);

    const seat = '#headerTitleContainer > h1';
    try {
      console.log(await page.$eval(seat, e => e.innerText));
    } catch (error) {
      console.log(`Could not read '${seat}': ${error}`);
    }
    await page.screenshot({ path: 'expertflyer1.png'});

    await checking_error(do_3nd_part);
  }

  /**
   * Third and last step of the flow: clicks the '#yui-gen1 > a' link, waits
   * five seconds, logs the text of the page heading and saves
   * 'expertflyer2.png'.
   *
   * The click and the heading lookup are best effort: a failure is logged
   * (instead of being silently discarded) and the step carries on.
   *
   * @returns {Promise<void>}
   */
  async function do_3nd_part(){
    const menu_link = '#yui-gen1 > a';
    try {
      await page.click(menu_link);
    } catch (error) {
      console.log(`Could not click '${menu_link}': ${error}`);
    }
    await page.waitFor(5000);

    const pro = '#headerTitleContainer > h1';
    try {
      console.log(await page.$eval(pro, e => e.innerText));
    } catch (error) {
      console.log(`Could not read '${pro}': ${error}`);
    }
    await page.screenshot({ path: 'expertflyer2.png'});

    console.log(`All done, check the screenshots?`);
  }


  /**
   * Decides whether the current page is usable and continues the flow.
   *
   * The page counts as loaded when it contains at least one
   * 'a[class="text yuimenubaritemlabel"]' element; `callback` is then run.
   * With zero such elements the text of 'body > p:nth-child(2)' is compared
   * with the two known block messages:
   *   - CAPTCHA message: the user is asked to confirm via verify_User_answer,
   *     after which `callback` is run exactly once;
   *   - IP-block message: a notice is logged and the flow stops;
   *   - anything else (or a count that could not be read): wait 10 seconds
   *     and check again, keeping the same `callback`.
   *
   * Any unexpected error is logged; this function does not reject.
   *
   * @param {Function} [callback] async step to run once the page is usable.
   * @returns {Promise<void>}
   */
  async function checking_error(callback){
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected.";
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below.";

    // Runs the next step, tolerating a missing callback.
    const proceed = async () => {
      if (typeof callback === 'function') {
        await callback();
      }
    };

    try {
      // Loop (rather than recurse) so that rechecks keep the callback and do
      // not build an ever longer chain of pending calls.
      for (;;) {
        let menu_items;
        try {
          menu_items = await page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length);
        } catch (error) {
          // Leave menu_items undefined: an unknown state is rechecked below
          // instead of being reported as a successful load.
          console.log(`catch error ${error}`);
        }

        if (menu_items > 0) {
          console.log(`Page loaded successfully! You can do things here.`);
          await proceed();
          return;
        }

        if (menu_items === 0) {
          console.log(`Error found`);
          let error_msg_details;
          try {
            error_msg_details = await page.$eval('body > p:nth-child(2)', e => e.innerText);
          } catch {
            // The paragraph is simply absent; handled by the recheck below.
          }

          if (error_msg_details === captcha_msg) {
            console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`);
            // verify_User_answer runs the function it is given once the user
            // confirms, so the next step is passed in rather than called again here.
            await verify_User_answer(proceed);
            return;
          }
          if (error_msg_details === ip_blocked) {
            console.log(`The current ip address is blocked. The only way is change the ip address.`);
            return;
          }
        }

        console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`);
        await page.waitFor(10000);
      }
    } catch (error) {
      console.log(`checking_error failed: ${error}`);
    }
  }

  /**
   * Navigates `page` to `page_URL` (waitUntil 'networkidle2', 30 s timeout)
   * and retries when the navigation throws.
   *
   * @param {string} page_URL address to load.
   * @param {number} [max_retries=Infinity] how many times to retry after the
   *   first failed attempt. The default keeps the original behaviour of
   *   retrying until the page loads.
   * @returns {Promise<void>} resolves once a navigation succeeded.
   * @throws {Error} when `max_retries` is finite and every attempt failed;
   *   the last navigation error is attached as `cause`.
   */
  async function goto_Page(page_URL, max_retries = Infinity){
    // A loop instead of recursion: retries do not pile up pending calls.
    for (let attempt = 0; ; attempt += 1) {
      try {
        await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
        return;
      } catch (error) {
        if (attempt >= max_retries) {
          throw new Error(`Giving up loading ${page_URL} after ${attempt + 1} attempts`, { cause: error });
        }
        console.log(`Error in loading page (${error}), re-trying...`);
      }
    }
  }

  /**
   * Blocks until the user confirms on the console that the CAPTCHA is solved.
   *
   * Prompts via readLine() until the answer is "yes" (surrounding whitespace
   * and letter case are ignored), asking exactly once per attempt, then runs
   * `call_back` if one was supplied.
   *
   * @param {Function} [call_back] optional async step to run after confirmation.
   * @returns {Promise<void>}
   */
  async function verify_User_answer(call_back){
    let user_Answer = await readLine();

    // One prompt per iteration: every typed answer is actually evaluated.
    while (String(user_Answer).trim().toLowerCase() !== 'yes') {
      console.log(`answer not match. try again...`);
      user_Answer = await readLine();
    }

    console.log(`user_Answer is ${user_Answer}, Processing...`);
    // Callers may invoke this without a continuation; only call a real function.
    if (typeof call_back === 'function') {
      await call_back();
    }
  }

    /**
     * Asks the CAPTCHA confirmation question on the console and resolves with
     * whatever the user typed. A readline interface on process.stdin /
     * process.stdout is opened for the question and closed once it is answered.
     *
     * @returns {Promise<string>} the answer passed to the question callback.
     */
    async function readLine() {
      const prompt_text = 'Solve the captcha and type yes to continue: ';
      const console_io = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
      });

      try {
        return await new Promise((resolve) => {
          console_io.question(prompt_text, resolve);
        });
      } finally {
        console_io.close();
      }
    }

  /**
   * Resolves once the page's network activity has been quiet for `timeout` ms.
   *
   * Requests are counted through the page's 'request', 'requestfinished' and
   * 'requestfailed' events. The quiet-period timer is cancelled whenever the
   * count rises above `maxInflightRequests` and restarted when the count
   * drops back to exactly that number. All listeners are removed before the
   * promise resolves.
   *
   * @param {object} page emitter offering on()/removeListener() for the events above.
   * @param {number} timeout length of the required quiet period in milliseconds.
   * @param {number} [maxInflightRequests=0] number of open requests still considered idle.
   * @returns {Promise<undefined>}
   */
  async function waitForNetworkIdle(page, timeout, maxInflightRequests = 0) {
    console.log('waitForNetworkIdle called')

    return new Promise((resolve) => {
      let pending = 0;
      let idleTimer;

      const finish = () => {
        page.removeListener('request', handleStart);
        page.removeListener('requestfinished', handleEnd);
        page.removeListener('requestfailed', handleEnd);
        resolve();
      };

      const handleStart = () => {
        pending += 1;
        if (pending > maxInflightRequests) {
          clearTimeout(idleTimer);
        }
      };

      const handleEnd = () => {
        // Ignore completions of requests that began before we started counting.
        if (pending === 0) {
          return;
        }
        pending -= 1;
        if (pending === maxInflightRequests) {
          idleTimer = setTimeout(finish, timeout);
        }
      };

      page.on('request', handleStart);
      page.on('requestfinished', handleEnd);
      page.on('requestfailed', handleEnd);

      idleTimer = setTimeout(finish, timeout);
    });
  }


  await browser.close()
}
// Start the bot. A rejected run is reported and reflected in the exit code
// instead of being left as an unhandled promise rejection.
run().catch((error) => {
  console.error(`Bot run failed: ${error}`);
  process.exitCode = 1;
});

Please note that the "Solve the captcha and type yes to continue: " prompt does not work as expected and needs some fixing.

Edit: When I re-ran the bot after 10 minutes, I got the CAPTCHA again. I solved it via chrome://inspect/#devices and restarted the bot; everything worked again. No IP ban.


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

2.1m questions

2.1m answers

60 comments

57.0k users

...