Node.js 爬虫示例

使用 Node.js 爬取格言网站(juzimi.com),解析 HTML 并将结果存入 SQLite 数据库

/**
 * Crawls writer pages on juzimi.com, extracts each quote together with its
 * source title, and stores the results in a SQLite database.
 */
"use strict";

var sqlite3 = require('sqlite3').verbose();
// Database handle; assigned in createDb() and used by the query/insert helpers.
var db;
// Path of the SQLite database file (hard-coded to a local Windows desktop path).
var dbfile = "C:\\Users\\spike\\Desktop\\test.db";
/**
 * Opens the SQLite database at `dbfile`, stores the handle in `db`, and
 * starts loading the people to crawl once the database has been opened.
 *
 * If opening fails, the error is logged and nothing further is started,
 * instead of issuing queries against a handle that never opened.
 */
function createDb() {
    console.log("createDb chain");
    db = new sqlite3.Database(dbfile, function (err) {
        if (err) {
            console.log(err);
            return;
        }
        readAllRows();
    });
    // Wait up to 5000 ms on a locked database before failing with a busy error.
    db.configure("busyTimeout", 5000);
}
// People to crawl, each stored as an "Id:Name" string (filled by readAllRows()).
var persons = [];


var Crawler = require("crawler");

// Shared crawler instance. No default `callback` is configured here: every
// task queued on it supplies its own callback.
var c = new Crawler({
    // Delay of 2000 ms between requests. Setting `rateLimit` forces
    // `maxConnections` to 1, so no explicit `maxConnections` is given.
    rateLimit: 2000
});




// When the crawler emits 'drain', close the SQLite connection.
c.on('drain', function onQueueDrained() {
    closeDb();
});




// function createTable() {
//     console.log("createTable lorem");
//     db.run("CREATE TABLE IF NOT EXISTS lorem (info TEXT)", insertRows);
// }


/**
 * Inserts one quote into the `Senstence` table.
 *
 * Values are passed as bound parameters rather than spliced into the SQL
 * text, so scraped text containing quotes or other SQL syntax is stored
 * verbatim and cannot alter the statement.
 *
 * @param {string} Senstence - The quote text.
 * @param {string} Source - The source title of the quote.
 * @param {string|number} FamousPersonId - Id of the person the quote belongs to.
 */
function insertRows(Senstence, Source, FamousPersonId) {
    console.log("insertRows  " + Senstence+   "    FamousPersonId:"+FamousPersonId);
    db.run(
        "INSERT INTO Senstence (Senstence,Source,FamousPersonId)  VALUES (?,?,?)",
        [Senstence, Source, FamousPersonId],
        function (err) {
            // Log a failed insert here rather than leaving the error unhandled.
            if (err) {
                console.log(err);
            }
        }
    );
}

/**
 * Loads the people to crawl (FamousPerson rows with 1400 < Id < 1500),
 * appends each one to `persons` as an "Id:Name" string, then starts the
 * crawl via doYourWork().
 *
 * If the query fails or yields no row set, the problem is logged and the
 * crawl is not started; previously this path threw a TypeError on
 * `rows.forEach`.
 */
function readAllRows() {
    console.log("readAllRows Senstence");
    db.all("SELECT Id,Name FROM FamousPerson where Id > 1400 and Id < 1500 ", function (err, rows) {
        if (err) {
            console.log(err);
        }
        if (rows == undefined) {
            console.log("啥也没茶树来");
        }
        if (err || rows == undefined) {
            // Nothing usable came back, so there is nothing to queue.
            return;
        }
        rows.forEach(function (row) {
            persons.push(row.Id + ":" + row.Name);
        });
        doYourWork();
    });
}



/**
 * Queues one landing-page request per entry in `persons`.
 *
 * Each entry is an "Id:Name" string; the name is URI-encoded and appended
 * to the writer URL. When the landing page arrives, the text of its
 * ".pager-last" element is read as the page count and, if it is greater
 * than zero, ccSpyer() is called to queue the individual pages.
 */
function doYourWork() {
    persons.forEach(function (entry) {
        var sep = entry.indexOf(':');
        var writerId = entry.slice(0, sep);
        var writerName = entry.slice(sep + 1);
        var encodedName = encodeURI(writerName);

        c.queue({
            uri: 'https://www.juzimi.com/writer/' + encodedName,
            personId: writerId,
            personName: writerName,
            personNameCode: encodedName,
            count: 0,
            callback: onWriterLanding
        });
    });

    // Handles the landing page of a single writer.
    function onWriterLanding(error, res, done) {
        // `$` is the Cheerio handle attached to the response by the crawler.
        var $ = res.$;

        if (error) {
            console.log(error);
            done();
            return;
        }
        if (res.statusCode == 404) {
            console.log("查询名称的时候发生了404错误");
            done();
            return;
        }

        var lastPageText = $(".pager-last").text();
        var task = res.options;
        console.log(task.personName + "的总页数为" + lastPageText);

        // The pager text is compared numerically through loose coercion.
        var noPages = lastPageText == null || lastPageText == '' || !(lastPageText > 0);
        if (noPages) {
            console.log("没有数据,流程结束!");
        } else {
            console.log("进入子循环:pageCountTemp为" + lastPageText + "personNameCode为" + task.personNameCode + "personIdNum为" + task.personId);
            ccSpyer(lastPageText, task.personNameCode, task.personId);
        }
        done();
    }
}


/**
 * Splits the text of a quote-list page into sentence/source pairs.
 *
 * Each line is expected to look like `sentence《source》`. Spaces and CR/LF
 * are stripped, ASCII semicolons become "。", and parsing stops at the first
 * line that is exactly "喜欢". Empty lines, lines containing a single quote,
 * and lines lacking either "《" or "》" are skipped.
 *
 * @param {string} content - Text of the ".view-content" element.
 * @returns {Array<{sentence: string, source: string}>} Parsed pairs, in page order.
 */
function parseSentenceLines(content) {
    var results = [];
    var lines = content.split('\n');
    for (var i = 0; i < lines.length; i++) {
        var line = lines[i].replace(/ +/g, "").replace(/[\r\n]/g, "");
        if (line == '喜欢') {
            break;
        }
        if (line == '') {
            continue;
        }
        // Lines with a single quote are dropped (presumably to keep them out
        // of quoted SQL text; confirm before relaxing this).
        if (line.indexOf('\'') > -1) {
            continue;
        }
        line = line.replace(/;/g, "。");

        var closeIdx = line.indexOf("》");
        if (closeIdx === -1) {
            continue;
        }
        var quoted = line.slice(0, closeIdx + 1);
        var openIdx = quoted.indexOf("《");
        if (openIdx === -1) {
            // Without "《" there is no source title to split off; slicing with
            // -1 would truncate the sentence and store "》" as the source.
            continue;
        }
        results.push({
            sentence: quoted.slice(0, openIdx),
            source: quoted.slice(openIdx)
        });
    }
    return results;
}

/**
 * Queues every result page of one writer and stores the quotes found on each.
 *
 * @param {number|string} pageCountTemp - Number of pages to fetch (pages 0..n-1).
 * @param {string} personNameCode - URI-encoded writer name used in the URL.
 * @param {string|number} personIdNum - Id passed to insertRows() for each quote.
 */
function ccSpyer(pageCountTemp, personNameCode, personIdNum) {
    for (var j = 0; j < pageCountTemp; j++) {
        console.log("开始第" + j + "次循环")
        c.queue({
            uri: 'https://www.juzimi.com/writer/' + personNameCode + '?page=' + j,
            personId: personIdNum,
            callback: function (error, res, done) {
                // `finally` guarantees the crawler slot is released on every path.
                try {
                    if (error) {
                        // A failed request has no parsed document; stop here
                        // instead of dereferencing `res.$`.
                        console.log(error);
                        return;
                    }
                    if (res.statusCode == 404) {
                        console.log("发生了404错误");
                        return;
                    }
                    var $ = res.$;
                    // Treat a response without a parsed document as empty content.
                    var content = typeof $ === 'function' ? $(".view-content").text() : "";
                    if (content == "" || content == undefined || content == null) {
                        console.log("执行结束")
                        return;
                    }
                    console.log("开始内部循环")
                    var personId = res.options.personId;
                    parseSentenceLines(content).forEach(function (item) {
                        insertRows(item.sentence, item.source, personId);
                    });
                } finally {
                    done();
                }
            },
        });
    }
}



/**
 * Closes the SQLite connection held in `db`.
 *
 * Does nothing beyond logging when no database has been opened yet, and
 * logs any error reported while closing instead of leaving it unhandled.
 */
function closeDb() {
    console.log("closeDb");
    if (!db) {
        return;
    }
    db.close(function (err) {
        if (err) {
            console.log(err);
        }
    });
}

/**
 * Script entry point: calls createDb(), which opens the database and is
 * given readAllRows as its open callback.
 */
function runChainExample() {
    createDb();
}

// Start the script as soon as the file is loaded.
runChainExample();


作者:spike

分类: Nodejs

创作时间:2024-09-27

更新时间:2024-12-09