# 环境

1. express：搭建基本的服务环境（本文以它为例）
2. request：发起 HTTP 请求
3. cheerio：让我们可以像使用 jQuery 一样操作爬取下来的页面数据
4. supervisor：监听代码改动并自动重启服务

# robots

robots.txt 是被爬取方对爬虫行为所做的一些限定，以豆瓣为例：https://www.douban.com/robots.txt

    User-agent: *
    Disallow: /subject_search
    Disallow: /amazon_search
    Disallow: /search
    Disallow: /group/search
    Disallow: /event/search
    Disallow: /celebrities/search
    Disallow: /location/drama/search
    Disallow: /forum/
    Disallow: /new_subject
    Disallow: /service/iframe
    Disallow: /j/
    Disallow: /link2/
    Disallow: /recommend/
    Disallow: /doubanapp/card
    Disallow: /update/topic/
    Sitemap: https://www.douban.com/sitemap_index.xml
    Sitemap: https://www.douban.com/sitemap_updated_index.xml
    # Crawl-delay: 5

    User-agent: Wandoujia Spider
    Disallow: /

以上列出了禁止爬取的路径（Disallow），其余未列出的路径默认允许爬取。

# 简单demo

const express = require('express');
const request = require('request');
const cheerio = require('cheerio');

const app = express();

// Page that is fetched and parsed on every request to "/".
const TARGET_URL = 'https://www.douban.com';

/**
 * GET / — fetches TARGET_URL, parses the returned HTML with cheerio and
 * replies with a small JSON summary (the number of elements matching
 * `.aside-allCategory li` and the text of the page's <title>).
 * Replies with HTTP 502 and an error payload when the fetch itself fails.
 */
app.get('/', (req, res) => {
  request(TARGET_URL, (error, response, body) => {
    console.log('error:', error); // Print the error if one occurred
    console.log('statusCode:', response && response.statusCode); // Print the response status code if a response was received
    console.log('body:', body); // Print the HTML of the fetched page

    // On a failed fetch there is no body to parse; answer with an error
    // instead of handing undefined to cheerio.
    if (error) {
      res.status(502).json({ error: `Failed to fetch ${TARGET_URL}: ${error.message}` });
      return;
    }

    const $ = cheerio.load(body);

    // Respond exactly once: sending a second response (e.g. res.send) after
    // res.json fails with "Cannot set headers after they are sent".
    res.json({
      '1111': '222',
      lessonNum: $('.aside-allCategory li').length,
      titleName: $('title').text(),
    });
  });
});

app.listen(3000, () => {
  console.log('Example app listening on port 3000!');
});

访问 http://localhost:3000/ 后的输出：

    {
      "1111": "222",
      "lessonNum": 24,
      "titleName": "豆瓣"
    }