How to Bypass Cloudflare With Puppeteer
Web Application Firewalls like Cloudflare are more and more a pain in the ass when checking links.
Automatically checking the links and bypassing all Cloudflare challenges is nearly impossible. Some companies provide scraping solutions as a service, but these are quite expensive and slow.
There are tools like puppeteer that allow running a normal web browser in an automatic process.
Puppeteer could run completely automatically in what's called the headless mode. However, detection of a headless browser by a WAF is possible despite stealth configurations. And with a headless browser, solving challenges manually is not possible.
The following example runs Puppeteer with the full Chrome browser and passes the Cloudflare challenge without human interaction.
const pptr = require('puppeteer-core');
// Launch the full (non-headless) Chrome browser with a dedicated profile
// directory, open the Cloudflare challenge demo page, wait for the challenge
// to resolve, and save a screenshot of the result.
pptr.launch({
  headless: false,
  defaultViewport: null,
  executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
  args: ['--disable-blink-features=AutomationControlled'],
  userDataDir: 'C:\\Users\\xxx\\AppData\\Local\\Google\\Chrome\\User Data\\Puppeteer-1'
}).then(async browser => {
  try {
    const page = await browser.newPage();

    // navigate to a URL
    await page.goto('https://www.scrapingcourse.com/cloudflare-challenge', {
      waitUntil: 'networkidle0',
    });

    // wait for the challenge to resolve (fixed 10-second pause)
    await new Promise((resolve) => {
      setTimeout(resolve, 10000);
    });

    // take page screenshot
    await page.screenshot({ path: 'screenshot.png' });
  } finally {
    // close the browser instance even when navigation or the screenshot
    // failed, so no Chrome process is left behind
    await browser.close();
  }
}).catch((err) => {
  // Surface launch/navigation failures instead of an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Both the Joomla Link Checker and the forked WordPress Broken Link Checker allow fetching broken links from their databases and sending checked links back.
Using node + puppeteer to check the links still requires firing up puppeteer manually. But at least the links are validated automatically.
So all that's needed is a loop through all reported links and opening them in the web browser. Any challenges can be solved manually. And as cookies are saved between sessions, the link should pass the challenge automatically on the next run.
const pptr = require('puppeteer-core');
//I am anything but a javascript programmer
// Link-checking run: fetch the list of links to re-check from the site's
// ajax endpoint, open each one in a real Chrome window, and post the observed
// result (status code, redirect count, final URL, duration) back to the site.
//
// NOTE(review): the browser and the tabs of failed links are intentionally
// left open when the run ends — presumably so remaining challenges can be
// solved by hand and their cookies kept in the profile for the next run.
pptr.launch({
  headless: false,
  defaultViewport: null,
  executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
  args: ['--disable-blink-features=AutomationControlled'],
  userDataDir: 'C:\\Users\\<you>\\AppData\\Local\\Google\\Chrome\\User Data\\Puppeteer'
}).then(async browser => {
  const baseSite = 'https://example.com';
  const baseUrl = `${baseSite}/component/ajax`;

  // Query parameters for the request that lists the links to check.
  const reportRequest = {
    format: 'json',
    tmpl: 'component',
    plugin: 'blcReport',
    token: '<token>',
    limit: 10,
    tocheck: 0,
    orderby: 'last_check_attempt',
    order: 'ASC',
    all: 1,
    working: -1,
    external: 1,
    checked: '500,503,610,601,615,403,613'
  };

  // Query parameters for the request that stores one checked link.
  const apiRequest = {
    format: 'json',
    tmpl: 'component',
    plugin: 'blcUpdate'
  };

  const apiHeaders = {
    'Accept': 'application/json',
    'Content-type': 'application/json',
    'X-Joomla-Token': '<x-joomla-token>',
  };

  const reportUrl = `${baseUrl}?${buildQueryString(reportRequest)}`;
  const apiUrl = `${baseUrl}?${buildQueryString(apiRequest)}`;

  // Fetch the links to check. `links` is a local (the original leaked it as an
  // implicit global, which throws in strict mode / ES modules).
  let links;
  try {
    const res = await fetch(reportUrl);
    const out = await res.json();
    links = out.data;
  } catch (err) {
    console.log(err);
  }

  // `typeof null` is 'object', so null must be rejected explicitly; otherwise
  // Object.keys() below would throw.
  if (typeof links !== 'object' || links === null) {
    process.exit(1);
  }

  let page = await browser.newPage();
  const urls = Object.keys(links);

  if (!urls.length) {
    // nothing to do. Show something
    await page.goto(baseSite);
    return;
  }

  for (const url of urls) {
    const startTime = performance.now();
    const report = {
      http_code: 0,
      url: url,
    };

    try {
      const response = await page.goto(url, { waitUntil: 'load', timeout: 8000 });
      // page.goto() may resolve with null; the property access below then
      // throws and the link is reported through the failure path.
      report.http_code = response.status();
      report.redirect_count = response.request().redirectChain().length;
      // The response URL is the address the navigation ended on. The first
      // entry of the redirect chain is the *initial* request, not the final.
      report.final_url = response.url();
    } catch (err) {
      report.http_code = 613; // timeout pseudo code
      report.redirect_count = 0;
      report.timeout = 1;
      report.final_url = url;
    }

    const endTime = performance.now();
    // Whole milliseconds, expressed in seconds.
    report.request_duration = Math.round(endTime - startTime) / 1000;

    if (!report.http_code) {
      console.log('This weird: ' + url);
      continue;
    }

    if (report.http_code >= 300) {
      // Continue in a fresh tab; the tab with the failed/redirected link
      // stays open (see the note at the top of this script).
      page = await browser.newPage();
    }

    // Report the result. A failing update must not abort the remaining links.
    try {
      const response = await fetch(apiUrl, {
        method: "POST",
        body: JSON.stringify(report),
        headers: apiHeaders
      });
      console.log(await response.json());
    } catch (err) {
      console.log(err);
    }
  }
}).catch((err) => {
  // Surface launch or unexpected failures instead of an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Serialize a flat object into a URL query string (without a leading `?`).
 *
 * Entries whose value is `undefined` are left out. A `null` value is emitted
 * as an empty value (`key=`). Keys and values are percent-encoded with
 * `encodeURIComponent`; pairs are joined with `&` in property order.
 *
 * @param {Object} data - Flat key/value map to serialize.
 * @returns {string} The encoded query string; empty when nothing is emitted.
 */
function buildQueryString(data) {
  return Object.entries(data)
    .filter(([, rawValue]) => rawValue !== undefined)
    .map(([name, rawValue]) => {
      // Null is treated as a special case, equivalent to an empty string.
      const encodedValue = encodeURIComponent(rawValue === null ? '' : rawValue);
      return `${encodeURIComponent(name)}=${encodedValue}`;
    })
    .join('&');
}