Web crawler: grabbing RSS news with Node.js


Node.js is a platform built on Chrome's JavaScript runtime for easily building fast, scalable network applications. It uses an event-driven, non-blocking I/O model that makes it lightweight and efficient, ideal for data-intensive real-time applications that run across distributed devices.


There are plenty of sites that provide RSS feeds: Baidu, NetEase, Sina, Huxiu, and so on. Many RSS crawlers have been written in Java, C++, and PHP; today let's talk about grabbing RSS feeds with Node.js.

We use Node.js to build a web crawler that grabs RSS news. Sites use different encodings, such as GBK, UTF-8, and ISO-8859-1, so the content has to be converted; UTF-8 handles Chinese best. The crawler grabs multiple sites and saves the results to a database, and by taking full advantage of JavaScript's asynchronous programming model, it runs very fast.

This project serves as the backend for a news Android client; I will upload the client's source code later. The project source is on Git.


Environment requirements:

Node.js (required); my version is 0.10.24

MongoDB (optional), or MySQL or another database


IDE: WebStorm


The first step: create a new Node.js project; I created an Express web project.
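If you don't already have a project, the Express 3.x generator (bundled with the express package when it is installed globally) can scaffold one. A minimal sketch; the app name newsApp is arbitrary:

npm install -g express
express newsApp
cd newsApp && npm install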

The second step: add the dependencies to the package.json file:


 "dependencies": {
    "express": "3.4.8",
    "ejs": "*",
    "feedparser":"0.16.6",
    "request":"2.33.0",
    "iconv":"2.0.7",
    "mongoose":"3.8.7",
    "mongodb":"*"
  }

Then run the following command to install the dependencies into node_modules:

npm install -d 

The third step:

Basic preparation is complete; time to write code. The RSS grabbing mainly relies on the feedparser library. GitHub address: http://github.com/danmactough/node-feedparser

First, configure the information for the sites to be grabbed.

Create an rssSite.json file:


{
    "channel":[
        {
            "from":"baidu",
            "name":"civilnews",
            "work":false,
            "title":"Baidu domestic latest news",
            "link":"http://news.baidu.com/n?cmd=4&class=civilnews&tn=rss",
            "typeId":1
        },{
            "from":"netEase",
            "name":"rss_gn",
            "title":"NetEase latest news",
            "link":"…",
            "typeId":2
        }
    ]
}

I want to grab those two sites. The value of channel is an array of objects; if you need more sites, just add entries to the array.

Import the related packages:


var request = require('request')
    , FeedParser = require('feedparser')
    , rssSite = require('../config/rssSite.json')
    , Iconv = require('iconv').Iconv
    , Post = require('../model/Post'); // the mongoose model defined in the fourth step

Then we just traverse the configured channels and fetch each one's link:

var channels = rssSite.channel;
channels.forEach(function(e,i){
    if(e.work != false){
        console.log("begin:"+ e.title);
        fetch(e.link,e.typeId);
    }
});
Sites whose work field is false are not grabbed; it acts as a blacklist. typeId identifies which section a news item belongs to: society, finance, and so on.

The key lies in the fetch function, where the grabbing and parsing happen. Let me walk through the code.


function fetch(feed,typeId) {
    var posts;
    // Define our streams
    var req = request(feed, {timeout: 10000, pool: false});
    req.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    req.setHeader('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')
        .setHeader('accept', 'text/html,application/xhtml+xml');

    var feedparser = new FeedParser();

    // Define our handlers
    req.on('error', done);
    req.on('response', function(res) {
        var stream = this
            , iconv
            , charset;
        posts = new Array();
        if (res.statusCode != 200) return this.emit('error', new Error('Bad status code'));
        charset = getParams(res.headers['content-type'] || '').charset;
        if (!iconv && charset && !/utf-*8/i.test(charset)) {
            try {
                iconv = new Iconv(charset, 'utf-8');
                iconv.on('error', done);
                stream = this.pipe(iconv);
            } catch(err) {
                this.emit('error', err);
            }
        }
        stream.pipe(feedparser);
    });

    feedparser.on('error', done);
    feedparser.on('end', function(err){
        // postService.savePost(posts);    // Save to the database (see the fourth step)
    });
    feedparser.on('readable', function() {
        var post;
        while (post = this.read()) {
            posts.push(transToPost(post));//Save to an array of objects
        }
    });
    function transToPost(post){
        var mPost = new Post({
            title : post.title,
            link : post.link,
            description : post.description,
            pubDate : post.pubDate,
            source : post.source,
            author : post.author,
            typeId : typeId
        });
        return mPost;
    }
    // Shared handler for request, iconv, and feedparser errors
    function done(err) {
        if (err) console.error(err.stack || err);
    }
}

// Parse a header such as "text/xml; charset=GBK" into an object so the
// charset parameter can be read
function getParams(str) {
    return str.split(';').reduce(function (params, param) {
        var parts = param.split('=').map(function (part) { return part.trim(); });
        if (parts.length === 2) {
            params[parts[0]] = parts[1];
        }
        return params;
    }, {});
}



1. Key function: request(url, [options]). This function sends an HTTP request. Address: http://github.com/mikeal/request.git


 var req = request(feed, {timeout: 10000, pool: false});


req needs to listen for two events: response and error. Once the request is issued, a response comes back and the received data chunks are concatenated. Before concatenation, non-UTF-8 content must be converted to UTF-8, using the iconv library. Address: http://github.com/bnoordhuis/node-iconv

charset = getParams(res.headers['content-type'] || '').charset;

This extracts the character encoding of the grabbed site from the content-type response header.
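For example, assuming a feed is served with the hypothetical header content-type: text/xml; charset=GBK, the helper defined above would extract:

var charset = getParams('text/xml; charset=GBK').charset;
console.log(charset); // "GBK"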



The conversion must happen before the data is concatenated, and the smart way to do it is, of course, with pipes.
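A minimal sketch of that pipe approach, assuming a GBK source; gbkStream and utf8Sink are placeholder streams:

var Iconv = require('iconv').Iconv;

// Convert a GBK byte stream to UTF-8 on the fly
var iconv = new Iconv('GBK', 'UTF-8');
gbkStream.pipe(iconv).pipe(utf8Sink);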


The converted stream still isn't something I can operate on directly; this is where feedparser comes in:

 var feedparser = new FeedParser();

feedparser also listens for several events: readable, end, and error.

In the readable callback, each read() returns one record; every record read is pushed into the posts array.

Once all the data has been read, the end callback fires. At this point the site has been grabbed, and the same applies to multiple sites.



The fourth step: save to the database

All the data now sits in the posts array of Post objects; how to deal with it is up to you. Saving to MongoDB takes only a few lines of code, using the mongoose library.

Of course, the mongodb driver package is also required. With mongoose, operating on the MongoDB database feels like having a convenient base DAO.
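One thing the snippets below do not show is opening the connection; here is a minimal sketch, where the database name newsdb is an assumption:

var mongoose = require('mongoose');

// Connect once at application startup; 'newsdb' is a placeholder database name
mongoose.connect('mongodb://localhost/newsdb');
mongoose.connection.on('error', function(err){
    console.error('mongodb connection error: ' + err);
});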

First, define the schema:


var mongoose = require('mongoose');
var PostSchema = new mongoose.Schema({
    title:String,
    link :String,
    description :String,
    pubDate :String,
    source :String,
    author :String,
    typeId : Number
});
module.exports = PostSchema;

Then build the model from it:

var mongoose = require('mongoose');
var PostSchema = require('../schemas/PostSchema');
var Post = mongoose.model('Post',PostSchema);

module.exports = Post;

OK, now we can save to the database. MongoDB does not seem to offer a conditional bulk insert, so I loop over the posts here: if the title does not exist yet, the post is inserted; otherwise the news would be duplicated.

var Post = require('../model/Post');

function savePost(posts){
    posts.forEach(function(post){ // forEach gives each post its own closure for the async callback
        console.log(post.title || "");
        Post.findOne({"title": post.title || ""}, function(err, r){ // Insert only if not already present
            if(err){
                console.error(err.stack);
                return;
            }
            if(!r){
                post.save(function(err){
                    if(err){
                        console.error(err.stack);
                    }
                });
            }
        });
    });
}
exports.savePost = savePost;
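With the service in place, the commented-out line in fetch's end handler can be enabled; the require path below is a guess based on the layout above:

var postService = require('../service/postService'); // hypothetical path

feedparser.on('end', function(err){
    postService.savePost(posts); // persist everything collected in 'readable'
});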


That wraps up the news crawler. Criticism and comments are welcome.

In the next article I will introduce "the HTTP news service and MongoDB paging implementation", of course continuing with Node.js.


Posted by York at May 09, 2014 - 5:10 PM