End of an Era: Travel SEO

on in AJAX and Fetching Data
Last modified on

I’ll just leave this here for reference purposes and I won’t explain any code, as it’s obsolete and I’m not using it anyway. I also won’t go into any details on how to install Node.js or any of the other dependencies.

The main idea was to scrape the content and import it into WordPress as custom posts: hotels, venues, destinations, regions, countries. While I had it set up as I wanted and it was working properly, with AJAX requests and database caching, I had to stop due to lack of time and motivation. I learned a bit of Node.js in the process, so here are my findings.

I was aiming for special offers and featured holidays, so I needed a way to scrape dynamic content and search results.

As a quick note, I know that the code below is pretty basic, but it might help someone in search of a quick solution.

Here’s how I scraped Booking.com hotels and extracted the hotel description:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

// GET /accommodation/
// Fetches a fixed list of hotel pages one after another and responds with an
// array of [pageUrl, summaryText] pairs. Pages whose request reported an
// error are left out of the response.
app.get('/accommodation/', async function (req, res) {
    const hotelPages = [
        'https://www.booking.com/hotel/es/trianflor.en-gb.html',
        'https://www.booking.com/hotel/es/apartamentos-doramar.en-gb.html',
        'https://www.booking.com/hotel/es/oasis-club.en-gb.html',
    ];
    const results = [];

    for (const pageUrl of hotelPages) {
        // Requests are awaited in sequence, so results keep the list order.
        const outcome = await makeRequest(pageUrl);

        if (outcome.err) {
            continue;
        }

        const page = cheerio.load(outcome.body);
        const summaryText = page('#hotel_main_content #summary').text();

        console.log(pageUrl);
        results.push([pageUrl, summaryText]);
    }

    res.send(results);
});

/**
 * Promise adapter around the callback-based `request` function.
 *
 * The returned promise always fulfils (it never rejects): the callback's
 * three arguments are handed back as one object so the caller can inspect
 * `err` itself.
 *
 * @param {string} url - Address passed to `request`.
 * @returns {Promise<{err: *, resp: *, body: *}>} The callback arguments.
 */
function makeRequest(url) {
    return new Promise(function (settle) {
        request(url, function (err, resp, body) {
            const outcome = { err: err, resp: resp, body: body };
            settle(outcome);
        });
    });
}

// Start the server. The message is logged from the listen callback so it is
// only printed once the port has actually been bound.
app.listen('8081', function () {
    console.log('Magic happens on port 8081');
});
// Expose the Express app to anything that require()s this file.
exports = module.exports = app;

And here’s how I scraped more information based on a URL parameter:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

/**
 * Cleans a raw value cut out of page source.
 *
 * Steps, in order: drop every single and double quote; drop every comma when
 * `keepCommas` is exactly `false`; drop one trailing comma (plus any
 * whitespace after it); trim surrounding whitespace.
 *
 * @param {string} str - Raw text to clean.
 * @param {boolean} keepCommas - Pass `false` to strip all commas; any other
 *     value keeps the inner commas.
 * @returns {string} The cleaned text.
 */
function sanitize_string(str, keepCommas) {
    const unquoted = str.replace(/'/g, '').replace(/"/g, '');
    const commaFiltered = keepCommas === false ? unquoted.replace(/,/g, '') : unquoted;

    return commaFiltered.replace(/,\s*$/, '').trim();
}

// Markers looked up in the raw page source. For each field:
//   pattern    - captures from the marker to the end of that line (first hit);
//   strip      - literal fragments removed from the capture, in order;
//   keepCommas - forwarded to sanitize_string().
// The patterns deliberately carry no `g` flag: they are shared between
// requests, and a global regex would remember `lastIndex` from one exec() call
// to the next.
const AOD_FIELDS = {
    latitude: { pattern: /(b_map_center_latitude.*)/, strip: ['b_map_center_latitude =', ';'], keepCommas: false },
    longitude: { pattern: /(b_map_center_longitude.*)/, strip: ['b_map_center_longitude =', ';'], keepCommas: false },
    hotelId: { pattern: /(b_hotel_id.*)/, strip: ['b_hotel_id =', 'b_hotel_id:'], keepCommas: false },
    country: { pattern: /("addressCountry.*)/, strip: ['addressCountry" :'], keepCommas: false },
    region: { pattern: /("addressRegion.*)/, strip: ['addressRegion" :'], keepCommas: false },
    address: { pattern: /("streetAddress.*)/, strip: ['streetAddress" :'], keepCommas: true },
    description: { pattern: /("description" :.*)/, strip: ['description" :'], keepCommas: true },
    photo: { pattern: /(large_url.*)/, strip: ['large_url:'], keepCommas: false },
};

/**
 * Pulls one field out of the raw page source.
 *
 * @param {string} body - Page source to search.
 * @param {{pattern: RegExp, strip: string[], keepCommas: boolean}} field -
 *     One entry of AOD_FIELDS.
 * @returns {string} The cleaned value, or '' when the marker is not present
 *     (previously a missing marker threw a TypeError on the null match).
 */
function extractPageField(body, field) {
    const match = field.pattern.exec(body);

    if (match === null) {
        return '';
    }

    let value = match[1];
    for (const fragment of field.strip) {
        value = value.replace(fragment, '');
    }

    return sanitize_string(value, field.keepCommas);
}

// GET /aod/:uri
// Fetches the page named by the `uri` path parameter and responds with
// [[uri, summary, address, country, region, description, photo, latitude,
// longitude, hotelId]], or [] when the request reported an error.
//
// NOTE(review): the target URL comes straight from the request path, so any
// caller can make this server fetch an arbitrary address (SSRF). Restrict the
// accepted hosts before exposing this route beyond a trusted network.
app.get('/aod/:uri', async function (req, res) {
    const url = req.params.uri;
    const accommodation = [];

    // An exception escaping an async handler would otherwise leave the request
    // without any response, so failures are turned into a 500.
    try {
        const { err, body } = await makeRequest(url);

        if (!err) {
            const $ = cheerio.load(body);
            const summary = $('#hotel_main_content #summary').text().trim();
            const read = (name) => extractPageField(body, AOD_FIELDS[name]);

            accommodation.push([
                url,
                summary,
                read('address'),
                read('country'),
                read('region'),
                read('description'),
                read('photo'),
                read('latitude'),
                read('longitude'),
                read('hotelId'),
            ]);
        }

        res.send(accommodation);
    } catch (error) {
        console.error(error);
        res.status(500).send({ error: `Failed to scrape ${url}` });
    }
});

/**
 * Wraps the callback-style `request` call in a promise.
 *
 * The promise is always fulfilled, never rejected; the caller receives the
 * callback's error, response and body together and checks `err` itself.
 *
 * @param {string} url - Address passed to `request`.
 * @returns {Promise<{err: *, resp: *, body: *}>} The callback arguments.
 */
function makeRequest(url) {
    const executor = (fulfil) => {
        const onDone = (err, resp, body) => {
            fulfil({ err, resp, body });
        };
        request(url, onDone);
    };

    return new Promise(executor);
}

// Start the server. The message is logged from the listen callback so it is
// only printed once the port has actually been bound.
app.listen('8081', function () {
    console.log('Magic happens on port 8081');
});
// Expose the Express app to anything that require()s this file.
exports = module.exports = app;

And, finally, here’s a more specific scraping script for specific travel sites with different DOM structures:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

// Placeholder with an empty body: takes no arguments, returns undefined, and
// is not called anywhere in the visible script.
function extract_details() {
    //
}

// Label placed in the first column of every full offer row.
const OFFER_SOURCE = 'Example Travel';
// Origin prepended to site-relative links and image paths.
const OFFER_SOURCE_URI = 'https://www.example.com';

// Trimmed text of everything under `scope` that matches `selector`.
function textOf(scope, selector) {
    return scope.find(selector).text().trim();
}

// Trimmed text of the selection itself with all child elements removed
// (works on a clone, so the parsed document is left untouched).
function ownText(selection) {
    return selection
        .clone()
        .children()
        .remove()
        .end()
        .text()
        .trim();
}

// Runs a URI decoder, falling back to the undecoded text when the input
// contains a malformed escape sequence (the decoders throw URIError).
function decodeOrRaw(decode, value) {
    try {
        return decode(value);
    } catch (error) {
        return String(value);
    }
}

// Decodes an image address and strips the site's image-proxy prefix so the
// direct image URL remains.
function unwrapProxiedImage(rawUri) {
    return decodeOrRaw(decodeURIComponent, rawUri)
        .replace('https://www.example.com/img.evolve?u=', '')
        .replace('https://www.example.comhttp', 'http');
}

// Splits "<date> for <duration>" text on the word "for".
// NOTE(review): the original replaced what looked like a space with a space,
// presumably a non-breaking space lost in copy/paste - confirm.
function splitDateDuration(text) {
    return text.replace(/\u00a0/g, ' ').trim().split('for');
}

// Builds the result list for one page: `toRow` receives each element matching
// `selector` (wrapped by cheerio) and returns a row, or null to skip it.
function collectOffers($, selector, toRow) {
    const rows = [];

    $(selector).each(function (index, element) {
        const row = toRow($(element));

        if (row !== null) {
            rows.push(row);
        }
    });

    return rows;
}

// id 1: family holidays listing.
function parseFamilyHolidays($) {
    return collectOffers($, '.cms-module-section', function (offer) {
        const title = textOf(offer, 'h3.cms-txt1');

        if (!title.length) {
            return null;
        }

        const price = textOf(offer, '.price');
        const bb = textOf(offer, 'p.cms-rating + p');
        const dateDuration = splitDateDuration(textOf(offer, 'p.cms-rating + p + p'));
        const origin = textOf(offer, 'p.cms-rating + p + p + p');
        const uri = OFFER_SOURCE_URI + offer.find('.cms-box-main a').attr('href');
        const imageUri = unwrapProxiedImage(OFFER_SOURCE_URI + offer.find('.offer-img').attr('src'));
        const resort = textOf(offer, 'h2.cms-nowrap a');

        return [OFFER_SOURCE, title, bb, dateDuration[0], dateDuration[1], origin, price, imageUri, uri, resort];
    });
}

// id 2: sun holidays listing.
function parseSunHolidays($) {
    return collectOffers($, '.cms-module', function (offer) {
        const title = textOf(offer, '.cms-module-main h3');

        if (!title.length) {
            return null;
        }

        const price = textOf(offer, '.price');
        const bb = '';
        const uri = OFFER_SOURCE_URI + offer.find('.cms-module-view a').attr('href');
        const imageUri = decodeOrRaw(decodeURI, offer.find('.cms-module-main > img').attr('src'));
        const resort = textOf(offer, '.cms-module-main .theme-box-content .content h3 + div');

        // First line is "<date> for <duration>", second line is the origin.
        const lines = ownText(offer.find('.cms-module-main .theme-box-content .content')).split('\n');
        const duration = lines[0].split('for');
        const origin = lines[1];

        // The second part is missing when the line has no "for"; fall back to
        // an empty string instead of throwing on undefined.
        return [OFFER_SOURCE, title, bb, duration[0].trim(), (duration[1] || '').trim(), origin, price, imageUri, uri, resort];
    });
}

// id 3: special offers listing. Only title, price and link are extracted.
function parseSpecialOffers($) {
    return collectOffers($, '.cms-module', function (offer) {
        const title = textOf(offer, '.cms-box-info h3');

        if (!title.length) {
            return null;
        }

        const price = textOf(offer, '.cms-price');
        const uri = offer.find('.cms-box-bottom a').attr('href');

        // TODO: extract board basis, dates, origin, image and resort once the
        // selectors for this page's HTML structure are worked out. The lines
        // copied from the family-holidays parser redeclared `uri` (a
        // SyntaxError) and used an undefined `sourceUri`, so they were removed.
        return [title, price, uri];
    });
}

// id 4: winter sun listing. Offer links are relative to the listing URL.
function parseWinterSun($, pageUrl) {
    return collectOffers($, '.offersList ul li', function (offer) {
        let title = textOf(offer, 'h2');
        let resort = '';

        // Headings look like "<resort> - <title>".
        if (title.includes('-')) {
            const headingParts = title.split('-');
            resort = headingParts[0].trim();
            title = headingParts[1].trim();
        }

        if (!title.length) {
            return null;
        }

        let price = textOf(offer, '.price');
        let bb = '';

        if (price.includes('pp')) {
            const priceParts = price.split('pp');
            bb = priceParts[1].trim();
            price = priceParts[0].trim();
        } else if (price.includes('per')) {
            // "per" may appear without the full "per family" separator, in
            // which case there is no second part to read.
            const priceParts = price.split('per family');
            bb = (priceParts[1] || '').trim();
            price = priceParts[0].trim();
        }

        if (bb.includes('(') || bb.includes('based')) {
            bb = '';
        }

        const uri = pageUrl + '/' + offer.find('a.inner').attr('href');
        const holidayDate = textOf(offer, '.badgeposition2').replace('Book by', '').trim();
        const duration = textOf(offer, '.infoList li');
        const origin = textOf(offer, '.badgeposition1.type1').replace('From', '').trim();
        const imageUri = OFFER_SOURCE_URI + '/' + offer.find('.image img').attr('src');

        return [OFFER_SOURCE, title, bb, holidayDate, duration, origin, price, imageUri, uri, resort];
    });
}

// id 5: seven-night Lanzarote listing.
function parseLanzaroteHolidays($) {
    return collectOffers($, '.offers_iterator .flight_click_link', function (offer) {
        const title = textOf(offer, 'div.where span');

        if (!title.length) {
            return null;
        }

        const bb = '';
        const price = textOf(offer, 'div.from strong');
        const uri = OFFER_SOURCE_URI + offer.find('div.prices_from a').attr('href');
        const holidayDate = ownText(offer.find('div.from')).replace('from', '').trim();
        const imageUri = offer.find('img.dest_image').attr('src');
        const resort = ownText(offer.find('div.where'));

        // Text looks like "<duration> from <origin>"; the origin part is
        // missing when the word "from" is absent.
        const howLong = textOf(offer, 'div.howlong').split('from');
        const origin = (howLong[1] || '').trim();
        const duration = howLong[0].trim();

        return [OFFER_SOURCE, title, bb, holidayDate, duration, origin, price, imageUri, uri, resort];
    });
}

// Numeric route id -> page to fetch and the parser for that page's markup.
const SCRAPERS = new Map([
    [1, { url: 'https://www.example.com/family-holidays/', parse: parseFamilyHolidays }],
    [2, { url: 'https://www.example.com/sun-holidays/sun-holidays-2019/', parse: parseSunHolidays }],
    [3, { url: 'https://www.example.com/special-offers/', parse: parseSpecialOffers }],
    [4, { url: 'https://www.example.com/winter-sun', parse: parseWinterSun }],
    [5, { url: 'https://www.example.com/cheap-holidays/lanzarote/7-Night-holidays-to-lanzarote/', parse: parseLanzaroteHolidays }],
]);

// GET /scrape/:id
// Fetches the page registered for `id` (1-5) and responds with an array of
// offer rows. Ids 1, 2, 4 and 5 produce
// [source, title, bb, date, duration, origin, price, imageUri, uri, resort];
// id 3 produces [title, price, uri].
// Every path now ends in a response: 404 for an unknown id, 502 when the page
// cannot be fetched, 500 when parsing throws. Previously those cases left the
// client waiting indefinitely.
app.get('/scrape/:id', function (req, res) {
    // Number() keeps the loose matching of the original `==` comparisons
    // (e.g. "01" still selects scraper 1).
    const scraper = SCRAPERS.get(Number(req.params.id));

    if (!scraper) {
        res.status(404).send({ error: `Unknown scraper id: ${req.params.id}` });
        return;
    }

    request(scraper.url, function (error, response, html) {
        if (error) {
            res.status(502).send({ error: `Failed to fetch ${scraper.url}` });
            return;
        }

        try {
            res.send(scraper.parse(cheerio.load(html), scraper.url));
        } catch (parseError) {
            console.error(parseError);
            res.status(500).send({ error: `Failed to parse ${scraper.url}` });
        }
    });
});

// Start the server. The message is logged from the listen callback so it is
// only printed once the port has actually been bound.
app.listen('8082', function () {
    console.log('Magic happens on port 8082');
});
// Expose the Express app to anything that require()s this file.
exports = module.exports = app;

Finally, here’s my package.json file:

{
  "name": "node-web-scrape",
  "version": "0.0.1",
  "description": "Scraper.",
  "main": "server.js",
  "author": "Me",
  "dependencies": {
    "express": "latest",
    "request": "latest",
    "cheerio": "latest"
  }
}

Related posts