AWS Lambda Web Scraper
2020-06-21
This is a small AWS Lambda function that scrapes websites using axios and stores the data in a MongoDB document. You can set up an API Gateway in front of the Lambda function and call it with POST requests carrying a JSON body.
Features
- Randomly selects from a set of headers with each call.
- Automatically sets the Host and Referer headers to the target domain.
- Saves the response to MongoDB.
- Optionally sets the Accept header to JSON if you expect the response to be in JSON format.
- Optionally marks the request as an XMLHttpRequest, which helps when calling APIs that are normally loaded via AJAX (see the example request body below).
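For example, a request body might look like the following (the URL is a placeholder; the json and ajax keys are optional, and only their presence is checked, not their value):
{
"url": "https://example.com/some/page",
"json": true,
"ajax": true
}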
Install the required Node modules: npm install axios mongodb dotenv
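The MongoDB connection string is read from a local .env file. A minimal sketch of that file, assuming a hypothetical Atlas cluster and the MONGO variable name used in the code below:
MONGO=mongodb+srv://user:password@cluster0.example.mongodb.net/scraper?retryWrites=true&w=majority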
const request = require('axios');
const MongoClient = require('mongodb').MongoClient;
const crypto = require('crypto');
// local .env files are loaded into process.env
require('dotenv').config({silent: false});
// load the MongoDB connection string from the .env file
const mongo_host = process.env.MONGO
// database and collection name
const databaseName = 'scraper'
const collectionName = 'rawdata'
// set of headers from which we will randomly select
let headers_list = [
// Firefox 77 Mac
{"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://www.google.com/",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
},
// Firefox 77 Windows
{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.google.com/",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
},
// Chrome 83 Mac
{"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.google.com/",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
},
// Chrome 83 Windows
{"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.google.com/",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9"
}
]
function isValidURL(string) {
var res = string.match(/(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/g);
return (res !== null);
}
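// illustrative check: isValidURL('https://example.com/page') returns true,
// while isValidURL('not a url') returns false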
module.exports.scrape = async event => {
// start by parsing the body, assuming a POST request with a JSON body
let body = JSON.parse(event.body)
// url is required
if (!('url' in body)) {
return {ok: 0, msg: 'Missing URL'}
}
// check the url is valid
if (!isValidURL(body.url)) {
return {ok: 0, msg: 'Invalid URL'}
}
let url = body.url
let host = new URL(url)
// randomly select a header
let headers = headers_list[Math.floor(Math.random() * headers_list.length)]
// the request should look like it is originating from the host
headers['Host'] = host.host
// referer is from the same domain, referers from google.com are often
// redirected, which we want to avoid
headers['Referer'] = host.origin
// set json headers if we expect the response to be in json
if ('json' in body) {
headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
}
// set XMLHttpRequest header, which helps when calling private APIs that
// would typically be loaded by AJAX calls
if ('ajax' in body) {
headers['X-Requested-With'] = 'XMLHttpRequest'
}
// send a GET request with our headers
const response = await request({
'url': url,
'method': 'get',
'headers': headers,
});
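// note: with axios defaults, a non-2xx response rejects the promise, so it never
// reaches the status check below and instead surfaces as a Lambda invocation error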
if (response.status == 200) {
// create a data object containing the response body and headers
let date = new Date()
let data = {
'url': url,
'url_hash': crypto.createHash('md5').update(url).digest("hex"),
'host': host.host,
'data': response.data,
'processed': false,
'scraped_at': date,
'scraped_year': date.getFullYear(),
'scraped_month': date.getMonth() + 1,
'scraped_day': date.getDate(),
'response_headers': response.headers,
'request_headers': headers
}
// create a connection to the MongoDB
const client = await MongoClient.connect(mongo_host, {useUnifiedTopology: true});
// select the database
const db = client.db(databaseName);
// insert data into collection in database
let r = await db.collection(collectionName).insertOne(data);
// close the connection to MongoDB
client.close();
if (r.insertedCount == 1) {
// return the newly created ObjectID if a new document was successfully inserted
return {
ok: 1,
url: url,
insertedId: r.insertedId,
};
}
} else {
return {
ok: 0,
url: url,
status: response.status,
msg: 'Bad response status'
};
}
return {
ok: 0,
url: url
};
};
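To smoke-test the handler locally before deploying, you can call the exported function with a mock API Gateway event. This is a minimal sketch, assuming the code above is saved as handler.js and the MONGO connection string in .env points at a reachable database:
// test.js (hypothetical local smoke test)
const { scrape } = require('./handler');
// mock API Gateway proxy event with a JSON body
const event = { body: JSON.stringify({ url: 'https://example.com' }) };
scrape(event)
.then(result => console.log(result))
.catch(err => console.error(err));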