# 环境
1、express：基本的服务环境，本文以它为例；2、request：用来发起 HTTP 请求；3、cheerio：让我们可以像使用 jQuery 一样操作爬取下来的页面数据；4、supervisor：监听文件改动并自动重启服务。
# robots
robots.txt 是被爬取方对爬虫行为所做的一些限定，例如豆瓣的 https://www.douban.com/robots.txt ：
User-agent: *
Disallow: /subject_search
Disallow: /amazon_search
Disallow: /search
Disallow: /group/search
Disallow: /event/search
Disallow: /celebrities/search
Disallow: /location/drama/search
Disallow: /forum/
Disallow: /new_subject
Disallow: /service/iframe
Disallow: /j/
Disallow: /link2/
Disallow: /recommend/
Disallow: /doubanapp/card
Disallow: /update/topic/
Sitemap: https://www.douban.com/sitemap_index.xml
Sitemap: https://www.douban.com/sitemap_updated_index.xml
# Crawl-delay: 5
User-agent: Wandoujia Spider
Disallow: /
以上规则列出了禁止爬取的路径（Disallow），未被列出的路径即为允许爬取的地方。
# 简单demo
const express = require('express');
const request = require('request');
const cheerio = require('cheerio');

const app = express();

/**
 * GET / — downloads the Douban home page, parses it with cheerio and replies
 * with a small JSON summary (a fixed sample field, the number of elements
 * matching `.aside-allCategory li`, and the page title).
 *
 * Replies with HTTP 502 and a JSON error object when the upstream request
 * reports an error.
 */
app.get('/', (req, res) => {
  request('https://www.douban.com', (error, response, body) => {
    console.log('error:', error); // Print the error if one occurred
    console.log('statusCode:', response && response.statusCode); // Print the status code if a response was received
    console.log('body:', body); // Print the HTML of the fetched page

    // On a failed request there is no page to parse; answer the client
    // explicitly instead of handing a missing body to cheerio.
    if (error) {
      res.status(502).json({ error: 'Failed to fetch https://www.douban.com' });
      return;
    }

    const $ = cheerio.load(body);

    // Exactly one response per request: calling res.send() after res.json()
    // throws "Cannot set headers after they are sent to the client".
    res.json({
      '1111': '222',
      lessonNum: $('.aside-allCategory li').length,
      titleName: $('title').text(),
    });
  });
});

app.listen(3000, () => {
  console.log('Example app listening on port 3000!');
});
输出：
{
"1111": "222",
"lessonNum": 24,
"titleName": "豆瓣"
}