使用 Node.js 爬取格言网,解析 HTML 并存入 SQLite 数据库
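/**
* Reads people (Id, Name) from the FamousPerson table of a SQLite database,
* crawls each person's writer pages on juzimi.com, and inserts every parsed
* sentence together with its source into the Senstence table.
*/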
"use strict";
var sqlite3 = require('sqlite3').verbose();
// Shared sqlite3 Database handle. It is assigned in createDb() and used by
// insertRows(), readAllRows() and closeDb().
var db;
// Path of the SQLite database file that createDb() opens (hard-coded).
var dbfile = "C:\\Users\\spike\\Desktop\\test.db";
/**
 * Opens the SQLite database at `dbfile` and stores the handle in the shared
 * `db` variable. Once the database has been opened successfully it calls
 * `readAllRows()`.
 *
 * If opening fails, the error is logged and `readAllRows()` is NOT called,
 * so no query is issued against a handle that failed to open.
 */
function createDb() {
    console.log("createDb chain");
    db = new sqlite3.Database(dbfile, function (err) {
        if (err) {
            console.log(err);
            return;
        }
        readAllRows();
    });
    // Sets SQLite's busy timeout option to 5000.
    db.configure("busyTimeout", 5000);
}
// "Id:Name" strings for the people to crawl. Filled by readAllRows() and
// consumed by doYourWork().
var persons = [];
var Crawler = require("crawler");
// Shared crawler instance; doYourWork() and ccSpyer() queue requests on it.
// NOTE(review): the inline comment below says `rateLimit` forces
// `maxConnections` to 1, so `maxConnections: 10` is presumably ineffective --
// confirm against the crawler version in use.
var c = new Crawler({
rateLimit: 2000, // `maxConnections` will be forced to 1
maxConnections: 10,
// Disabled default callback, kept for reference only. Every request queued in
// this file supplies its own `callback` instead.
// This will be called for each crawled page
// callback : function (error, res, done) {
// console.log(res.options.count);
// if(error){
// console.log(error);
// }else{
// if(res.statusCode==404){
// console.log("发生了404错误");
// }
// var $ = res.$;
// // $ is Cheerio by default
// //a lean implementation of core jQuery designed specifically for the server
// var content = $(".view-content").text();
// if(content==""|| content==undefined|| content==null){
// console.log("执行结束")
// }
// var senstenceArray = content.split('\n');
// for(var i = 0;i<senstenceArray.length;i++){
// var element = senstenceArray[i];
// // strip spaces
// element = element.replace(/\ +/g,"");
// // strip carriage returns and line feeds
// element = element.replace(/[\r\n]/g,"");
// if(element=='喜欢') {
// break;
// }
// if(element==''){
// }else{
// // console.log(element+'\n');
// // process each line
// var personId = res.options.personId;
// var singleSen = element.slice(0,element.indexOf("》")+1);
// var completeSen = singleSen.slice(0,singleSen.indexOf("《"));
// if(singleSen==''||singleSen==null||singleSen==undefined){
// }else{
// // console.log(completeSen+' ');
// var senSource = singleSen.slice(singleSen.indexOf("《"));
// // console.log(senSource);
// insertRows(singleSen,senSource,personId);
// }
// }
// }
// }
// done();
// }
});
// Handler for the crawler's 'drain' event: closes the SQLite database
// connection via closeDb().
function handleCrawlerDrain() {
    closeDb();
}
c.on('drain', handleCrawlerDrain);
// function createTable() {
// console.log("createTable lorem");
// db.run("CREATE TABLE IF NOT EXISTS lorem (info TEXT)", insertRows);
// }
/**
 * Inserts one parsed sentence into the `Senstence` table.
 *
 * The values are bound as statement parameters rather than concatenated into
 * the SQL text, so quotes or other SQL syntax inside scraped text cannot
 * break or alter the statement. Each value is converted with String() first,
 * matching the text values the previous string-built statement produced.
 * Insert failures are logged.
 *
 * @param {string} Senstence - Sentence text to store.
 * @param {string} Source - Source text stored alongside the sentence.
 * @param {string|number} FamousPersonId - Value for the FamousPersonId column.
 */
function insertRows(Senstence, Source, FamousPersonId) {
    console.log("insertRows " + Senstence + " FamousPersonId:" + FamousPersonId);
    db.run(
        "INSERT INTO Senstence (Senstence,Source,FamousPersonId) VALUES (?,?,?)",
        [String(Senstence), String(Source), String(FamousPersonId)],
        function (err) {
            if (err) {
                console.log(err);
            }
        }
    );
}
/**
 * Loads the people to crawl from the `FamousPerson` table (Id between 1400
 * and 1500, exclusive), appends each one to the shared `persons` array as an
 * "Id:Name" string, and then calls `doYourWork()`.
 *
 * If the query fails or yields no row array, the problem is logged and the
 * function stops there instead of iterating over an undefined result.
 */
function readAllRows() {
    console.log("readAllRows Senstence");
    db.all("SELECT Id,Name FROM FamousPerson where Id > 1400 and Id < 1500 ", function (err, rows) {
        if (err) {
            console.log(err);
            return;
        }
        if (rows == undefined) {
            console.log("啥也没茶树来");
            return;
        }
        rows.forEach(function (row) {
            persons.push(row.Id + ":" + row.Name);
        });
        doYourWork();
    });
}
/**
 * Queues one crawler request per entry of the shared `persons` array.
 *
 * Each entry is an "Id:Name" string; it is split at the first ':' and the
 * name is URI-encoded to build the writer page URL. The id, name and encoded
 * name travel with the request options so the callback can read them back
 * from `res.options`.
 */
function doYourWork() {
    for (var x = 0; x < persons.length; x++) {
        var separatorIndex = persons[x].indexOf(':');
        var idTemp = persons[x].slice(0, separatorIndex);
        var nameTemp = persons[x].slice(separatorIndex + 1);
        var uriPersonNameCode = encodeURI(nameTemp);
        c.queue({
            uri: 'https://www.juzimi.com/writer/' + uriPersonNameCode,
            personId: idTemp,
            personName: nameTemp,
            personNameCode: uriPersonNameCode,
            count: 0,
            callback: handleWriterPage
        });
    }
}

/**
 * Crawler callback for a person's writer page: reads the page count from the
 * `.pager-last` element and, when it is a positive number, hands over to
 * `ccSpyer` to queue the individual pages. Always calls `done()`.
 *
 * @param {*} error - Error reported by the crawler, if any.
 * @param {object} res - Crawler response; `res.$` is the Cheerio handle and
 *   `res.options` holds the options the request was queued with.
 * @param {Function} done - Must be called to tell the crawler this task is finished.
 */
function handleWriterPage(error, res, done) {
    if (error) {
        console.log(error);
    } else if (res.statusCode == 404) {
        console.log("查询名称的时候发生了404错误");
    } else {
        // $ is Cheerio by default; only read it once the request succeeded.
        var $ = res.$;
        var pageCountTemp = $(".pager-last").text();
        var personIdNum = res.options.personId;
        var personNameCode = res.options.personNameCode;
        var personName = res.options.personName;
        console.log(personName + "的总页数为" + pageCountTemp);
        // Empty or non-numeric text coerces to 0/NaN and fails this check.
        // NOTE(review): a page without a `.pager-last` element therefore gets
        // nothing crawled for this person -- confirm that is intended.
        if (Number(pageCountTemp) > 0) {
            console.log("进入子循环:pageCountTemp为" + pageCountTemp + "personNameCode为" + personNameCode + "personIdNum为" + personIdNum);
            ccSpyer(pageCountTemp, personNameCode, personIdNum);
        } else {
            console.log("没有数据,流程结束!");
        }
    }
    done();
}
/**
 * Queues one crawler request for every page (0 .. pageCountTemp - 1) of a
 * person's writer listing.
 *
 * @param {number|string} pageCountTemp - Number of pages to fetch.
 * @param {string} personNameCode - URI-encoded person name used in the URL.
 * @param {string|number} personIdNum - Person id passed on to insertRows().
 */
function ccSpyer(pageCountTemp, personNameCode, personIdNum) {
    for (var j = 0; j < pageCountTemp; j++) {
        console.log("开始第" + j + "次循环");
        c.queue({
            uri: 'https://www.juzimi.com/writer/' + personNameCode + '?page=' + j,
            personId: personIdNum,
            callback: handleSentencePage
        });
    }
}

/**
 * Crawler callback for one listing page. On a request error or a 404 it only
 * logs; otherwise it parses the `.view-content` text. A failed request no
 * longer falls through into the parsing branch, where `res.$` is not
 * available. Always calls `done()`.
 *
 * @param {*} error - Error reported by the crawler, if any.
 * @param {object} res - Crawler response (`res.$`, `res.statusCode`, `res.options`).
 * @param {Function} done - Must be called to tell the crawler this task is finished.
 */
function handleSentencePage(error, res, done) {
    if (error) {
        console.log(error);
    } else if (res.statusCode == 404) {
        console.log("发生了404错误");
    } else {
        var $ = res.$;
        saveSentencesFromPage($(".view-content").text(), res.options.personId);
    }
    done();
}

/**
 * Splits the page text into lines and stores every line that ends a "《...》"
 * source marker through insertRows().
 *
 * @param {string} content - Text of the `.view-content` element.
 * @param {string|number} personId - Person id stored with each sentence.
 */
function saveSentencesFromPage(content, personId) {
    if (content == "" || content == undefined || content == null) {
        console.log("执行结束");
        return;
    }
    var lines = content.split('\n');
    console.log("开始内部循环");
    for (var g = 0; g < lines.length; g++) {
        // Strip spaces, then carriage returns / line feeds.
        var element = lines[g].replace(/\ +/g, "").replace(/[\r\n]/g, "");
        // The '喜欢' line ends processing of this page.
        if (element == '喜欢') {
            break;
        }
        // Skip blank lines and lines containing a single quote.
        if (element == '' || element.indexOf('\'') > -1) {
            continue;
        }
        if (element.indexOf(';') > -1) {
            element = element.replace(/;/g, "。");
            // NOTE(review): as written this replaces ',' with the same ','
            // (a no-op); presumably one side was meant to be a full-width
            // comma -- confirm. Kept unchanged.
            element = element.replace(/,/g, ",");
        }
        // Keep everything up to and including the first '》'; empty when the
        // line has no '》'.
        var singleSen = element.slice(0, element.indexOf("》") + 1);
        if (singleSen == '') {
            continue;
        }
        var sourceStart = singleSen.indexOf("《");
        var completeSen = singleSen.slice(0, sourceStart);
        var senSource = singleSen.slice(sourceStart);
        insertRows(completeSen, senSource, personId);
    }
}
/**
 * Closes the shared SQLite database handle `db`.
 *
 * A callback is passed to `close` so that a failure while closing is logged
 * here rather than being raised as an unhandled 'error' event on the
 * database object.
 */
function closeDb() {
    console.log("closeDb");
    db.close(function (err) {
        if (err) {
            console.log(err);
        }
    });
}
/**
 * Script entry point: delegates to createDb().
 */
function runChainExample() {
    createDb();
}

// Run as soon as the file is loaded.
runChainExample();