// Home Identifier Source Test Repository

// src/checker.es6

import _ from 'lodash';
import got  from 'got';
import Base  from './base';
import BasedOptions  from './based-option';
import Model from './model/model';
import Document  from './model/document';
import Statistic  from './model/statistic';
import LinkAnalyzer from './link-analyzer';

// Set the socket cap of both global agents to Infinity (Node's `agent.maxSockets`
// is the per-host limit on concurrent sockets).
const { globalAgent: httpAgent } = require('http');
const { globalAgent: httpsAgent } = require('https');
httpAgent.maxSockets = Infinity;
httpsAgent.maxSockets = Infinity;

// NOTE(review): per the Node.js docs, the value "0" disables TLS certificate
// verification for the whole process, not only for requests issued by this module.
// Presumably intended to let links on hosts with self-signed or expired
// certificates be checked - confirm that the process-wide effect is acceptable.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";

export default class Checker extends Base {
    /**
     * Constructor
     * @param {Object}    [options]                            - configuration object
     * @param {String}    [options.mode]                       - mode of checking ("website", "section" or "page")
     * @param {Number}    [options.concurrent]                 - number of concurrent requests
     * @param {Object}    [options.requestHeaders]             - set custom request headers for crawler requests
     * @param {Number}    [options.requestRetriesAmount]       - number of attempts for request if it fails at first
     * @param {Number}    [options.requestTimeout]             - request timeout (in milliseconds)
     * @param {Function}  [options.onDone]                     - set custom done handler function
     * @param {String[]}  [options.acceptedSchemes]            - set array of accepted request schemes
     * @param {Boolean}   [options.checkExternalUrls]          - set `true` to check outer links
     * @param {RegExp[]}  [options.excludeLinkPatterns]        - array of regular expressions. Urls that match
     * any of these regular expressions are excluded from verification
     * @constructor
     */
    constructor(options = {}) {
        super(options, module);

        this.logger.info('Initialize crawler instance');

        /**
         * Checker options
         * @type {BasedOptions}
         */
        this._options = new BasedOptions();

        // Each option is registered together with its value from Checker.DEFAULT
        // (passed as the third argument of setOption); `onDone` gets the bound
        // instance method instead.
        const def = this.constructor.DEFAULT;
        this.options
            .setOption(options, 'mode', def.mode)
            .setOption(options, 'concurrent', def.concurrent)
            .setOption(options, 'requestHeaders', def.requestHeaders)
            .setOption(options, 'requestTimeout', def.requestTimeout)
            .setOption(options, 'acceptedSchemes', def.acceptedSchemes)
            .setOption(options, 'checkExternalUrls', def.checkExternalUrls)
            .setOption(options, 'excludeLinkPatterns', def.excludeLinkPatterns)
            .setOption(options, 'requestRetriesAmount', def.requestRetriesAmount)
            .setOption(options, 'onDone', this.onDone.bind(this));
    }

    /**
     * Getter function for options
     * @returns {BasedOptions}
     */
    get options() {
        return this._options;
    }

    /**
     * Returns logger instance (reads `this._logger`, which is not assigned in this class;
     * presumably it is set up by Base - verify there)
     * @return {Logger} logger
     */
    get logger() {
        return this._logger;
    }

    /**
     * Returns model instance that was set by initModel (start() does this)
     * @return {Model} model
     */
    get model() {
        return this._model;
    }

    /**
     * Returns instance of LinkAnalyzer class that was set by initLinkAnalyzer (start() does this)
     * @return {LinkAnalyzer} linkAnalyzer
     */
    get linkAnalyzer() {
        return this._linkAnalyzer;
    }

    /**
     * Returns instance of Statistic class that was set by initStatistic (start() does this)
     * @return {Statistic} statistic
     */
    get statistic() {
        return this._statistic;
    }

    /**
     * Sets model instance
     * @param {Model} model instance
     * @return {Checker}
     */
    initModel(model) {
        this._model = model;
        return this;
    }

    /**
     * Sets linkAnalyzer instance
     * @param {LinkAnalyzer} linkAnalyzer
     * @return {Checker}
     */
    initLinkAnalyzer(linkAnalyzer) {
        this._linkAnalyzer = linkAnalyzer;
        return this;
    }

    /**
     * Sets Statistic instance
     * @param  {Statistic} statistic
     * @return {Checker}
     */
    initStatistic(statistic) {
        this._statistic = statistic;
        return this;
    }

    /**
     * Returns application default options.
     * A fresh object is built on every access, so mutating the result
     * does not affect later reads.
     * @returns {Object}
     * @static
     */
    static get DEFAULT() {
        return {
            mode: 'website',
            concurrent: 100,
            requestHeaders: { 'user-agent': 'node-spider' },
            requestRetriesAmount: 5,
            requestTimeout: 5000,
            acceptedSchemes: ['http:', 'https:'],
            checkExternalUrls: false,
            excludeLinkPatterns: []
        };
    }

    /**
     * Returns application constants model
     * @returns {Object}
     * @static
     */
    static get CONSTANTS() {
        return {
            // NOTE: the pattern is not anchored, so it matches any string that merely
            // contains "http://" or "https://" followed by a word character.
            URL_REGEXP: /https?\:\/\/\w+((\:\d+)?\/\S*)?/,
            MODE: {
                WEBSITE: 'website',
                SECTION: 'section',
                PAGE: 'page'
            }
        };
    }

    /**
     * Processes loaded document: walks over all `a` elements, resolves their `href`
     * (without the fragment part) against the document and either remembers the link
     * as external or puts it into the check queue. Finally reports the document url
     * as finished.
     * @param {Document} document - document model
     * @param {String}   document.url - request url
     * @param {Function} document.$ - selector function used to query and iterate `a` elements
     * @param {Function} document.resolve - resolves given href to url
     * @protected
     */
    processLoadedDocument(document) {
        const self = this;
        const documentUrl = document.url;
        const $ = document.$;

        // A regular function is required here: the current element is read from `this`.
        $('a').each(function () {
            const href = $(this).attr('href');

            if (!href) {
                return;
            }

            // Everything after the first "#" is dropped before resolving.
            const url = document.resolve(href.split('#')[0]);

            if (self.linkAnalyzer.isNeedToSkipUrl(url, documentUrl)) {
                return;
            }

            if (self.linkAnalyzer.isExternal(url)) {
                self.model.addToExternal(url, documentUrl, href);
            } else {
                self._addToQueue(url, { page: documentUrl, href: href });
            }
        });

        this._onFinishLoad(documentUrl);
    }

    /**
     * Start to crawl pages for given url.
     * Creates fresh Statistic, Model and LinkAnalyzer instances on every call.
     * @param {String} url - initial site url for start
     * @throws Error if url is empty or does not match CONSTANTS.URL_REGEXP
     * @public
     */
    start(url) {
        if (!url) {
            throw new Error('Url was not set');
        }

        if (!url.match(this.constructor.CONSTANTS.URL_REGEXP)) {
            throw new Error('Urls is not valid');
        }

        // The chained `.info()` calls assume that logger.info returns the logger itself.
        this
            .initStatistic(new Statistic())
            .initModel(new Model())
            .initLinkAnalyzer(new LinkAnalyzer(url, this.options))
            .logger
            .info('Start to analyze pages for: => %s', url)
            .info('It can be take a long time. Please wait ...');
        this._addToQueue(url, { page: url });
    }

    /**
     * Default onDone callback function; it is registered as the `onDone` option
     * by the constructor and simply returns the statistic it receives.
     * @param {Statistic} statistic model instance
     * @returns {Statistic}
     * @protected
     */
    onDone(statistic) {
        return statistic;
    }

    /**
     * Makes GET request to given internal url and processes the loaded document.
     * An error without `statusCode` is retried while `attempt` is lower than the
     * `requestRetriesAmount` option (so up to `requestRetriesAmount + 1` requests in total);
     * any other error marks the link as broken.
     * @param {String} url - internal url (url that should be requested)
     * @param {Object} advanced - object with advanced data
     * @param {String} advanced.page - url of page where link was found
     * @param {String} [advanced.href] - original href value of link
     * @param {Number} attempt - number of request attempt
     * @private
     */
    _checkInternalLink(url, advanced, attempt = 0) {
        // The url is registered as active only once, not on every retry.
        if (attempt === 0) {
            this.model.addToActive(url);
        }

        got.get(url, this._getRequestOptions(), (error, data, res) => {
            if (error) {
                const canRetry = !error.statusCode &&
                    attempt < this.options.getOption('requestRetriesAmount');

                if (canRetry) {
                    return this._checkInternalLink(url, advanced, attempt + 1);
                }

                this.statistic.increaseInternalCount();
                this.statistic.getBroken().add(url, advanced, error.statusCode);
                this.logger.warn('Broken [%s] link: => %s on page: => %s',
                    error.statusCode, advanced.href, advanced.page);

                return this._onFinishLoad(url);
            }

            this.logger.debug('[%s] [%s] Receive [%s] for url: => %s',
                this.model.getPendingLength(), this.model.getActiveLength(), res ? res.statusCode : -1, url);

            this.statistic.increaseInternalCount();
            this.processLoadedDocument(new Document(url, data));
        });
    }

    /**
     * Checks given external link item with HEAD request.
     * An error without `statusCode` is retried while `attempt` is lower than
     * `requestRetriesAmount - 1` (so up to `requestRetriesAmount` requests in total).
     * An error with `statusCode` marks the link as broken. An error without `statusCode`
     * that is left after the last attempt is NOT added to broken links - the link is only counted.
     * @param {Object} item - external link item object
     * @param {String} item.url - external link url
     * @param {Object} item.advanced - external link advanced meta data object
     * @param {Number} attempt - number of request attempt
     * @returns {Promise} promise resolved with `true` when the check is finished
     * @private
     */
    _checkExternalLink(item, attempt = 0) {
        const url = item.url;
        const advanced = item.advanced;

        // Resolves with `false` when the request has to be repeated, `true` otherwise.
        const request = new Promise(resolve => {
            got.head(url, this._getRequestOptions(), (error, data, res) => {
                if (error) {
                    if (!error.statusCode && attempt < this.options.getOption('requestRetriesAmount') - 1) {
                        return resolve(false);
                    } else if (error.statusCode) {
                        this.statistic.getBroken().add(url, advanced, error.statusCode);
                        this.logger.warn('Broken [%s] link: => %s on page: => %s',
                            error.statusCode, advanced.href, advanced.page);
                    }
                }

                this.logger.debug('[%s] [%s] Receive [%s] for url: => %s',
                    this.model.getPendingLength(), this.model.getActiveLength(), res ? res.statusCode : -1, url);

                this.statistic.increaseExternalCount();
                resolve(true);
            });
        });

        return request.then(done => done || this._checkExternalLink(item, attempt + 1));
    }

    /**
     * Check all collected external links.
     * Links are split into portions; portions are processed one after another and
     * all links of one portion are requested in parallel. The portion size is taken from
     * the `concurrent` option (DEFAULT.concurrent is used when the option is not
     * a finite number >= 1).
     * @returns {Promise}
     * @private
     */
    _checkExternalLinks() {
        if (!this.model.areExternal()) {
            return Promise.resolve();
        }

        this.logger.info('Start to verify external links ...');

        const links = Array.from(this.model.external)
            .map(item => ({ url: item[0], advanced: item[1] }));

        const configured = Math.floor(Number(this.options.getOption('concurrent')));
        const portionSize = Number.isFinite(configured) && configured >= 1 ?
            configured :
            this.constructor.DEFAULT.concurrent;

        return _.chunk(links, portionSize).reduce((prev, portion) => {
            return prev.then(() => {
                // Call with the item only: passing the method directly to map() would
                // hand the array index over as `attempt` and silently reduce retries.
                return Promise.all(portion.map(item => this._checkExternalLink(item)));
            });
        }, Promise.resolve());
    }

    /**
     * Adds item to check queue.
     * A single trailing slash is removed from the url first. Nothing happens when
     * model.addToProcessed returns a falsy value for the url; otherwise the url is either
     * stored as pending (queue is full) or requested immediately.
     * @param {String} url - link url
     * @param {Object} advanced - object with advanced data
     * @private
     */
    _addToQueue(url, advanced) {
        url = url.replace(/\/$/, '');

        if (!this.model.addToProcessed(url)) {
            return;
        }

        if (this.model.isQueueFull(this.options.getOption('concurrent'))) {
            this.model.addToPending(url, advanced);
        } else {
            this._checkInternalLink(url, advanced);
        }
    }

    /**
     * Function which called after request to given url will be finished.
     * Removes the url from active ones and, if the queue is not full, starts the next
     * pending item. When there is no pending item and nothing is active anymore,
     * external links are checked and the `onDone` option is called with the statistic.
     * @param  {String} url which was requested
     * @return {Promise|undefined} promise only for the final (external links + onDone) step
     * @private
     */
    _onFinishLoad(url) {
        this.model.removeFromActive(url);

        if (this.model.isQueueFull(this.options.getOption('concurrent'))) {
            return;
        }

        const next = this.model.removeFromPending();
        if (next) {
            this._checkInternalLink(next.url, next.advanced);
        } else if (!this.model.areActive()) {
            return this._checkExternalLinks().then(() => {
                this.options.getOption('onDone')(this.statistic);
            });
        }
    }

    /**
     * Returns request options hash
     * @returns {{encoding: string, headers: *, timeout: *}}
     * @private
     */
    _getRequestOptions() {
        return {
            encoding: 'utf-8',
            headers: this.options.getOption('requestHeaders'),
            timeout: this.options.getOption('requestTimeout')
        };
    }
}