Scraping the Douban Music Top 250 (Multithreading and Multiprocessing)

If you stay positive, you have a shot at a silver lining. ——Silver Linings Playbook

Where mountains and rivers seem to leave no road ahead, a village appears amid shady willows and bright blossoms.

Site Analysis

  The target of this scrape is the Douban Music Top 250 chart at https://music.douban.com/top250 . Opening that URL shows the full ranking.

  Scrolling to the bottom of the page reveals pagination. Switching to page 2 and watching the address bar, the URL changes to https://music.douban.com/top250?start=25 .

  Compared with the original URL there is an extra start parameter, which looks like an offset. Clicking through to the next page changes start to 50, and it keeps growing by 25 with every page change, confirming that start is the offset into the ranking. So the full Top 250 takes only 10 requests, with start set to 0, 25, 50 … 225.

  Next, inspect the source with Chrome's developer tools. Selecting any entry shows that each detail-page link sits in an a node whose class attribute is nbg, so the links can be collected with a single XPath expression:
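
In code, this observation maps to a simple list comprehension; the same expression appears in the full scripts below:

urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
print(urls[0])   # https://music.douban.com/top250?start=0
print(urls[-1])  # https://music.douban.com/top250?start=225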

all_music_url = tree.xpath('//*[@class="nbg"]/@href')

  All of the information we need is on the album detail page. A few fields are awkward to reach with XPath, since each value is a bare text node sandwiched between a <span> label and a <br />, so regular expressions handle those instead. The genre, for example:

styles = re.findall(r'<span class="pl">流派:</span>&nbsp;(.*?)<br />', html, re.S)
if len(styles) == 0:
    style = '未知'  # '未知' = unknown
else:
    style = styles[0].strip()

Full Code

import re
import csv
import requests
from lxml import etree
from datetime import datetime
from fake_useragent import UserAgent


# Start time
# start_time = datetime.now()

# Random request headers
headers = {
    "User-Agent": UserAgent(verify_ssl=False).random,  # Random User-Agent
    "Connection": "close"  # Disable keep-alive
}


# Collect all album detail-page links from one list page
def get_music_url(url):
    html = requests.get(url, headers=headers).text
    tree = etree.HTML(html)
    all_music_url = tree.xpath('//*[@class="nbg"]/@href')
    for music_url in all_music_url:
        get_music_info(music_url)


# Parse an album detail page
def get_music_info(music_url):
    html = requests.get(music_url, headers=headers).text
    tree = etree.HTML(html)
    name = tree.xpath('//*[@id="wrapper"]/h1/span/text()')[0]  # Album title
    author = tree.xpath('//*[@id="info"]/span/span/a/text()')[0]  # Artist
    styles = re.findall(r'<span class="pl">流派:</span>&nbsp;(.*?)<br />', html, re.S)  # Genre
    if len(styles) == 0:
        style = '未知'  # '未知' = unknown
    else:
        style = styles[0].strip()
    time = re.findall(r'<span class="pl">发行时间:</span>&nbsp;(.*?)<br />', html, re.S)[0].strip()  # Release date
    publishers = re.findall(r'<span class="pl">出版者:</span>&nbsp;(.*?)<br />', html, re.S)  # Publisher
    score = tree.xpath('//*[@class="ll rating_num"]/text()')[0]  # Rating
    if len(publishers) == 0:
        publisher = '未知'
    else:
        publisher = publishers[0].strip()
    music_info = {
        "name": name,
        "author": author,
        "style": style,
        "time": time,
        "publisher": publisher,
        "score": score
    }
    print(music_info)
    save_to_csv(filename, music_info)


# Append one row to the CSV file
def save_to_csv(filename, music_info):
    # newline='' prevents blank lines between rows on Windows
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        fieldnames = ["name", "author", "style", "time", "publisher", "score"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow(music_info)


if __name__ == "__main__":
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    filename = 'musicTop250.csv'
    # Create musicTop250.csv and write the header row
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        fieldnames = ["name", "author", "style", "time", "publisher", "score"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
    for url in urls:
        get_music_url(url)

    # End time
    # end_time = datetime.now()
    # print(f'Elapsed: {(end_time - start_time).seconds}s')
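
As a quick sanity check on the output (a minimal sketch, assuming musicTop250.csv was produced by the script above), the file can be read back with csv.DictReader:

import csv

# Print the first five scraped rows
with open('musicTop250.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        print(row['name'], row['score'])
        if i >= 4:
            break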

Multithreaded Version

import re
import csv
import requests
from lxml import etree
from fake_useragent import UserAgent
from threading import Thread, Lock
from datetime import datetime


# Start time
start_time = datetime.now()

# Thread lock protecting the shared URL list
lock = Lock()

# Random request headers
headers = {
    "User-Agent": UserAgent(verify_ssl=False).random,  # Random User-Agent
    "Connection": "close"  # Disable keep-alive
}


# Pop the next URL from the shared list; the check and the pop happen
# under the same lock so two threads can never grab the same URL
def get_url():
    with lock:
        if len(urls) == 0:
            return ""
        return urls.pop(0)


# Collect all album detail-page links from one list page
def get_music_url(url, thread_name):
    html = requests.get(url, headers=headers).text
    tree = etree.HTML(html)
    all_music_url = tree.xpath('//*[@class="nbg"]/@href')
    for music_url in all_music_url:
        get_music_info(music_url, thread_name)


# Parse an album detail page
def get_music_info(music_url, thread_name):
    html = requests.get(music_url, headers=headers).text
    tree = etree.HTML(html)
    name = tree.xpath('//*[@id="wrapper"]/h1/span/text()')[0]  # Album title
    author = tree.xpath('//*[@id="info"]/span/span/a/text()')[0]  # Artist
    styles = re.findall(r'<span class="pl">流派:</span>&nbsp;(.*?)<br />', html, re.S)  # Genre
    if len(styles) == 0:
        style = '未知'  # '未知' = unknown
    else:
        style = styles[0].strip()
    time = re.findall(r'<span class="pl">发行时间:</span>&nbsp;(.*?)<br />', html, re.S)[0].strip()  # Release date
    publishers = re.findall(r'<span class="pl">出版者:</span>&nbsp;(.*?)<br />', html, re.S)  # Publisher
    score = tree.xpath('//*[@class="ll rating_num"]/text()')[0]  # Rating
    if len(publishers) == 0:
        publisher = '未知'
    else:
        publisher = publishers[0].strip()
    music_info = {
        "name": name,
        "author": author,
        "style": style,
        "time": time,
        "publisher": publisher,
        "score": score
    }
    print(thread_name, music_info)
    # save_to_csv(filename, music_info)


# Append one row to the CSV file
def save_to_csv(filename, music_info):
    # newline='' prevents blank lines between rows on Windows
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        fieldnames = ["name", "author", "style", "time", "publisher", "score"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow(music_info)


class MusicSpiderThread(Thread):
    def __init__(self, name):
        Thread.__init__(self)
        self.name = name

    def run(self):
        # Keep pulling URLs until the shared list is empty
        while True:
            url = get_url()
            if url != "":
                get_music_url(url, self.name)
            else:
                break


if __name__ == "__main__":
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    # filename = 'musicTop250.csv'
    # with open(filename, 'a', encoding='utf-8', newline='') as f:
    #     fieldnames = ["name", "author", "style", "time", "publisher", "score"]
    #     writer = csv.DictWriter(f, fieldnames=fieldnames)
    #     writer.writeheader()
    print(len(urls))  # Number of list pages to crawl
    threads = []
    for i in range(1, 5):  # Spawn 4 threads
        thread = MusicSpiderThread('thread-{}'.format(i))
        threads.append(thread)

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    # End time
    end_time = datetime.now()
    print(f'Elapsed: {(end_time - start_time).seconds}s')
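
The hand-rolled get_url() with an explicit Lock works, but the standard library's queue.Queue is already thread-safe, so the same dispatch loop can be written without any manual locking. A minimal sketch (not the original approach; it reuses urls and get_music_url from the script above):

import queue
from threading import Thread

# Fill a thread-safe queue with the list-page URLs
url_queue = queue.Queue()
for url in urls:
    url_queue.put(url)

def worker(thread_name):
    while True:
        try:
            url = url_queue.get_nowait()  # Raises queue.Empty once all URLs are consumed
        except queue.Empty:
            break
        get_music_url(url, thread_name)

threads = [Thread(target=worker, args=('thread-{}'.format(i),)) for i in range(1, 5)]
for t in threads:
    t.start()
for t in threads:
    t.join()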

Multiprocess Version

import re
import csv
import requests
from lxml import etree
from datetime import datetime
from multiprocessing import Pool
from fake_useragent import UserAgent

# Start time
start_time = datetime.now()

# Random request headers
headers = {
    "User-Agent": UserAgent(verify_ssl=False).random,  # Random User-Agent
    "Connection": "close"  # Disable keep-alive
}


# Collect all album detail-page links from one list page
def get_music_url(url):
    html = requests.get(url, headers=headers).text
    tree = etree.HTML(html)
    all_music_url = tree.xpath('//*[@class="nbg"]/@href')
    for music_url in all_music_url:
        get_music_info(music_url)


# Parse an album detail page
def get_music_info(music_url):
    html = requests.get(music_url, headers=headers).text
    tree = etree.HTML(html)
    name = tree.xpath('//*[@id="wrapper"]/h1/span/text()')[0]  # Album title
    author = tree.xpath('//*[@id="info"]/span/span/a/text()')[0]  # Artist
    styles = re.findall(r'<span class="pl">流派:</span>&nbsp;(.*?)<br />', html, re.S)  # Genre
    if len(styles) == 0:
        style = '未知'  # '未知' = unknown
    else:
        style = styles[0].strip()
    time = re.findall(r'<span class="pl">发行时间:</span>&nbsp;(.*?)<br />', html, re.S)[0].strip()  # Release date
    publishers = re.findall(r'<span class="pl">出版者:</span>&nbsp;(.*?)<br />', html, re.S)  # Publisher
    score = tree.xpath('//*[@class="ll rating_num"]/text()')[0]  # Rating
    if len(publishers) == 0:
        publisher = '未知'
    else:
        publisher = publishers[0].strip()
    music_info = {
        "name": name,
        "author": author,
        "style": style,
        "time": time,
        "publisher": publisher,
        "score": score
    }
    print(music_info)
    # save_to_csv(filename, music_info)


# Append one row to the CSV file
def save_to_csv(filename, music_info):
    # newline='' prevents blank lines between rows on Windows
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        fieldnames = ["name", "author", "style", "time", "publisher", "score"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow(music_info)


if __name__ == "__main__":
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    # filename = 'musicTop250.csv'
    # with open(filename, 'a', encoding='utf-8', newline='') as f:
    #     fieldnames = ["name", "author", "style", "time", "publisher", "score"]
    #     writer = csv.DictWriter(f, fieldnames=fieldnames)
    #     writer.writeheader()
    # for url in urls:
    #     get_music_url(url)

    pool = Pool(processes=4)  # Spawn 4 worker processes
    pool.map(get_music_url, urls)  # Blocks until every list page has been processed
    pool.close()
    pool.join()

    # End time
    end_time = datetime.now()
    print(f'Elapsed: {(end_time - start_time).seconds}s')

Timing Comparison

  For the timing test the structured data was printed to the console instead of being written to a file. Running the multithreaded and multiprocess versions above on the same machine, 4 threads took 104 seconds and 4 processes took 123 seconds. That is not especially fast, which may simply be down to my network. Note that after a burst of rapid requests Douban will flag your IP as abnormal and force a login; to work around this, put the cookies from a logged-in session into the request headers.
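
For example, the cookie string from a logged-in browser session (copied out of the developer tools) can be added to the headers dict. The value below is a placeholder, not a real cookie:

headers = {
    "User-Agent": UserAgent(verify_ssl=False).random,  # Random User-Agent
    "Connection": "close",  # Disable keep-alive
    "Cookie": "paste-your-logged-in-cookie-string-here"  # Placeholder value
}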

Closing Thoughts

  Multithreaded and multiprocess crawlers do deliver a real speedup, but the downsides are just as clear: there are more details to get right, the programs are hard to debug, and careless handling can lead to deadlocks or dirty data. For small sites it is usually not worth the trouble; for large sites, or when you genuinely need high-speed crawling, a crawler framework is the better choice.