Developing a crawler application

Use the cheerio module to crawl content from web pages.

First, make sure Node.js is installed. If not, refer to the guide on setting up a Node.js development environment.

Initialize project

Create a new folder (any name will do) and run the following commands in its root directory:

# Initialize the package.json file
npm init
# Install the cheerio module
npm install cheerio --save
# Install the scheduled-task module node-schedule
npm install node-schedule --save
# Install axios
npm install axios --save

Introduction to the cheerio module

Cheerio is a subset of jQuery Core. It implements the browser-independent DOM manipulation APIs of jQuery Core, dropping features such as effects and requests, and keeps only the core DOM operations, so you can work with the DOM as conveniently as with jQuery.
For specific operations, see the cheerio Chinese documentation.
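For example, here is a minimal sketch of loading an HTML fragment and reading it with cheerio (the markup below is made up purely for illustration):

const cheerio = require('cheerio');

//Load an HTML fragment; cheerio returns a jQuery-like $ function
const $ = cheerio.load('<ul><li class="item">first</li><li class="item">second</li></ul>');

//Use familiar jQuery-style selectors and traversal
$('.item').each(function(i, el){
	console.log(i, $(el).text()); //prints: 0 first, 1 second
});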

Introduction to the node-schedule module

node-schedule is a module for running scheduled tasks in Node.js; it makes implementing timed jobs straightforward. Refer to its documentation for detailed usage.
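For instance, a minimal sketch that runs a job once at the start of every minute (node-schedule accepts cron-style rules with an optional seconds field):

const schedule = require('node-schedule');

//'0 * * * * *' means: run whenever the seconds value is 0,
//i.e. once at the start of every minute
schedule.scheduleJob('0 * * * * *', () => {
	console.log('runs once per minute');
});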

HTTP request method encapsulation

Here we use axios to fetch the web content. First, wrap axios so that requests can masquerade as a browser (adding cookies and other headers when needed), and create an api.js file:

const axios = require('axios');

const server = axios.create({
	timeout:10000,
	headers:{//Simulate browser access
		'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
	}
});

module.exports = {
	get(url){
		return server.get(url);
	},
	post(url,data){
		return server.post(url,data);
	}
}
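
A quick usage sketch of this wrapper (the URL below is only a placeholder); if the target site requires a cookie, it can be added to the headers object in the same way as user-agent:

const $http = require('./api.js');

//The wrapper returns the usual axios promise
$http.get('https://example.com').then((res) => {
	console.log(res.status, typeof res.data);
});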

Next, let's implement the crawler code.

The idea is to send a GET request with axios to retrieve the HTML returned by the target website, then extract the data and other information you want with cheerio.
Here we take an article website as an example and crawl both the article list and the article content.
First, inspect the elements on the target site and note the DOM structure, class names, IDs and other details of the content you want to extract.

We need the article title, image list, category, author, article body, jump link and other information. By inspecting the page source we can find the DOM of the article list.
Find the class names and IDs corresponding to these pieces of information, then extract them with cheerio and combine them into the data you need.
For example:

const $http = require('./api.js');
const schedule = require('node-schedule');
const cheerio = require('cheerio');


let list = [];
//Here are some links to the classified list of articles on the target website
let baseList = [
	"https://cpu.baidu.com",
	"https://cpu.baidu.com/?chn=1022&from=list",
	"https://cpu.baidu.com/?chn=1021&from=list",
	"https://cpu.baidu.com/?chn=1001&from=list",
	"https://cpu.baidu.com/?chn=1013&from=list",
	"https://cpu.baidu.com/?chn=1048&from=list",
	"https://cpu.baidu.com/?chn=1006&from=list",
	"https://cpu.baidu.com/?chn=1002&from=list",
	"https://cpu.baidu.com/?chn=1007&from=list"

];
//Fetch the article list from a randomly chosen category
function getList(){
	return new Promise((resolve,reject)=>{
		let index = Math.floor((Math.random()*baseList.length));
		let url = baseList[index];
		$http.get(`${url}`).then((res)=>{
			if(res.status == 200){
				let data = res.data;
				let $ = cheerio.load(data);
				let ls = [];
				$("#main-content .news-list .news-item").each(function(i,e){
					let item = {};
					item.title = $(e).find(".n-title span").text();
					let imglist = [];
					$(e).find('.n-img-wrapper .img').each(function(j,v){
						let img_item = $(v).find('img').attr('data-src');
						//Skip images that have no data-src attribute
						if(!img_item){
							return;
						}
						//Protocol-relative URLs need an explicit protocol
						if(img_item.indexOf('http') < 0){
							img_item = 'https:' + img_item;
						}
						if(imglist.length < 3){
							imglist.push(img_item);
						}
					});
					//Only keep the image list when there are at least three images
					if(imglist.length >= 3){
						item.extra = imglist;
					}
					
					//Get source
					let source_name = $(e).find(".n-desc .info span").eq(1).text();
					item.source_name  = source_name;
					//Details link
					let url = $(e).find('.n-item-link').attr('href');
					//Category name shown on the list item
					let cat_info = $(e).find('.n-desc .info .cat-info').text();
					item.cat_info = cat_info;
					//Map the category name to a custom category id
					const catMap = {
						"entertainment": "20006",
						"science and technology": "20014",
						"Finance and Economics": "20013",
						"Sports": "20076",
						"automobile": "20016",
						"Sociology": "1048",
						"game": "1040",
						"fashion": "20090",
						"Agriculture, rural areas and farmers": "20091",
						"mobile phone": "20092",
						"house property": "20093",
						"Chicken soup Meiwen": "20094",
						"Crosstalk sketch": "20095",
						"delicious food": "20096",
						"constellation": "20097"
					};
					if(catMap[cat_info]){
						item.catid = catMap[cat_info];
					}
					
					item.url = url;
					ls.push(item);
				})
				resolve(ls);
			}else{
				console.log("Request failed with status " + res.status);
				reject(res.status);
			}
		})
	})
}
//Get article details
function getContent(url){
	return new Promise((resolve,reject)=>{
		$http.get(`https://cpu.baidu.com${url}`).then((res)=>{
			if(res.status == 200){
				resolve(res.data);
			}else{
				//Reject on a non-200 status so the caller is not left waiting forever
				reject(res.status);
			}
		}).catch(reject);
	})
}
async function init(){
	if(list.length == 0){
		list = await getList();
	}
	let data = list.pop();
	if(data.title.length > 40){
		console.log('Article title is too long, skipping');
		return;
	}
	if(data.title && data.extra){
		let html = await getContent(data.url);
		let $ = cheerio.load(html,{decodeEntities:false});
		let thumb = data.extra[0] || "";
		data.extra = JSON.stringify(data.extra);
		$("#main-content .article").find('img').each(function(i,e){
			let url = $(this).attr("data-src");
			$(this).attr("src",url);
		});
		$("#main-content .article").find(".content").each(function(i,e){
			$(this).attr("style","");
		})
		let content = $("#main-content .article").html();
		let postdata = {
			catid:data.catid,
			title:data.title,
			source_name:data.source_name,
			thumb:thumb,
			extra:data.extra || '',
			content:content,
		};
		//postdata is the final collected data; persisting it (to a database, API, etc.) is left to you
		console.log(postdata);
	}else{
		console.log("Skipped: missing title or not enough images, category: " + data.cat_info);
	}
}

//Run the collection task on a schedule: at second 0, 15, 30 and 45 of every minute, i.e. every 15 seconds
let rule = new schedule.RecurrenceRule();

let times = [0,15,30,45];

rule.second = times;

schedule.scheduleJob(rule,()=>{
	init();
})
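
As the comment in init() notes, saving postdata is left to you. One possible sketch, assuming a hypothetical HTTP endpoint /api/articles on your own server (the URL and path are made up for illustration):

//Hypothetical example: persist the collected data by posting it to your own API.
//Replace the URL with whatever storage you actually use (database, file, etc.).
const $http = require('./api.js');

function saveArticle(postdata){
	return $http.post('http://localhost:3000/api/articles', postdata)
		.then(() => console.log('saved:', postdata.title))
		.catch((err) => console.log('save failed:', err.message));
}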


At this point, a simple crawler application is complete. The code is very simple and is only meant to give you an idea for reference: Sample project
