0%

Python多线程,单线程,协程爬虫某音乐实战对比

  • 单线程,多线程下载某云音乐
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
import urllib.request
import requests
from bs4 import BeautifulSoup
import os
import time
from Threads import BaseThread

def PATH(p):
    """Return the absolute form of *p*, resolved against this file's directory."""
    base_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_dir, p))

'''
https://music.163.com/playlist?id= 得到播放列表
http://music.163.com/song/media/outer/url?id= 得到下载链接
urllib.request.urlretrieve 把远程下载的mp3文件下载到本地
'''


class Music163:
    """Scrape a music.163.com playlist page and download its tracks as mp3 files.

    Songs are represented as ``[song_id, title]`` pairs; files are written to
    an ``mp3`` directory next to this script, named ``<title>.mp3``.
    """

    _PLAYLIST_URL = "https://music.163.com/playlist?id="
    _SONG_URL = 'http://music.163.com/song/media/outer/url?id='
    _USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36 ')
    # Compiled once instead of on every loop iteration.  Captures the song id
    # from the anchor's href and the anchor text (the song title).
    _LINK_PATTERN = re.compile('<a .*?id=(.*?)">(.*?)</a>', re.S)

    def __init__(self):
        pass

    def get_music_163(self, id):
        """Return the songs of playlist *id* as a list of ``[song_id, title]``.

        *id* is the playlist id as a string (it is concatenated into the URL).
        Returns an empty list when the page contains no ``<ul class="f-hide">``
        song list; anchors that do not match the expected markup are skipped.
        """
        headers = {'User-Agent': self._USER_AGENT}
        # ``headers`` must be passed by keyword: the second positional
        # parameter of requests.get() is ``params`` (the query string).
        data = requests.get(self._PLAYLIST_URL + id, headers=headers).text
        soup = BeautifulSoup(data, 'lxml')
        # Attribute filter must be a dict; the original used a set literal.
        song_list = soup.find("ul", {"class": "f-hide"})
        if song_list is None:
            return []
        temp = []
        for link in song_list.find_all("a"):
            items = self._LINK_PATTERN.findall(str(link))
            if items:
                temp.append([items[0][0], items[0][1]])
        return temp

    # Batch download
    def download(self, value):
        """Download every ``[song_id, title]`` pair in *value*, one after another."""
        for song in value:
            self.get(song)

    # Single download
    def get(self, value):
        """Download one song given as ``[song_id, title]``.

        Skips the download (and prints a notice) when the target file already
        exists; otherwise fetches the mp3 and prints a success message.
        """
        song_id, title = value[0], value[1]
        target = PATH("mp3/" + title + ".mp3")
        if os.path.isfile(target):
            print("%s已经被下载了" % title)
            return
        # urlretrieve does not create missing directories; exist_ok keeps this
        # safe when several workers reach this point at the same time.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        url = self._SONG_URL + song_id + '.mp3'
        urllib.request.urlretrieve(url, target)
        print("%s下载成功" % title)


# 多线程
def multi_thread():
    """Download every song of the playlist with one thread per song and print the elapsed time."""
    # Local import so this function stands on its own; the constructor
    # signature of the project's BaseThread is not visible here, so the
    # standard-library Thread is used instead.
    import threading

    playlist_id = "2786226719"  # id of the playlist to download
    start_time = time.time()
    mc = Music163()
    data = mc.get_music_163(playlist_id)
    # Pass the callable and its argument separately.  The previous code wrote
    # BaseThread(mc.get(song)), which ran each download immediately on the
    # main thread and handed the thread object the return value (None).
    threads = [threading.Thread(target=mc.get, args=(song,)) for song in data]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    end_time = time.time()
    print("共耗时%.2f" % (end_time - start_time) + "秒")
    # Original author's measurement: 47 seconds (taken with the earlier code
    # that executed the downloads sequentially).


# 运行单线程
def run():
    """Download the whole playlist sequentially and print the elapsed time."""
    playlist_id = "2786226719"  # id of the playlist to download
    began = time.time()
    client = Music163()
    songs = client.get_music_163(playlist_id)
    client.download(songs)
    elapsed = time.time() - began
    print("共耗时%.2f" % elapsed + "秒")
    # Original author's measurement: single-threaded run took 43 seconds.


if __name__ == "__main__":
    # Runs the multi-threaded benchmark; call run() here instead to time the
    # single-threaded variant.
    multi_thread()
  • 单线程共下载100首歌,耗时9.09秒

dan.png

  • 多线程共下载100首歌,耗时9.60秒

image.png

  • 协程下载的代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import time
from multiprocessing import Process
from gevent import monkey
import urllib.request
import BaseMusic163
monkey.patch_all()
import gevent
import os

def PATH(p):
    """Return the absolute form of *p*, resolved against this file's directory."""
    base_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_dir, p))
'''
协程发请求,
'''
class Producer(object):
    """Download a playlist with one gevent greenlet per song.

    Constructing an instance performs the whole run: it fetches the playlist,
    spawns the download greenlets, waits for them and prints the elapsed time.
    """

    def __init__(self):
        self._rungevent()

    def _rungevent(self):
        """Fetch the playlist, download all songs concurrently and print the timing."""
        playlist_id = "2786226719"  # id of the playlist to download
        start_time = time.time()
        mc = BaseMusic163.Music163()
        data = mc.get_music_163(playlist_id)
        # NOTE (from the original author): Windows has a 1024-port limit, so
        # very large playlists may need to be processed in batches.
        # spawn() takes the callable and its arguments separately.  The
        # previous code wrote gevent.spawn(self.produce(song)), which ran the
        # download synchronously and then tried to spawn its return value (None).
        jobs = [gevent.spawn(self.produce, song) for song in data]
        gevent.joinall(jobs)
        end_time = time.time()
        print("共耗时%.2f" % (end_time - start_time) + "秒")

    def produce(self, value):
        """Download one song given as ``[song_id, title]`` unless its file already exists."""
        song_id, title = value[0], value[1]
        target = PATH("mp3/" + title + ".mp3")
        if os.path.isfile(target):
            print("%s已经被下载了" % title)
            return
        # urlretrieve does not create missing directories; exist_ok keeps this
        # safe when several greenlets reach this point.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        url = 'http://music.163.com/song/media/outer/url?id=' + song_id + '.mp3'
        urllib.request.urlretrieve(url, target)
        print("%s下载成功" % title)


def main():
    """Start a child process whose target is Producer, so the download run happens there."""
    worker = Process(target=Producer)
    worker.start()


# Entry point when executed as a script.
if __name__ == '__main__':
    main()
  • 下载时间
    image.png

结论

  • 昨天测试,发现是协程>多线程>单线程
  • 今天测试却是:多线程>协程>单线程
  • 当然也会出现单线程耗时反而比多线程耗时短的情况
  • 一直流传“多进程+协程”可以解决 Python 的 GIL 问题。本次测试的数据量不大,使用的也是单进程+协程的方式;后续有机会会用大量数据、采用多进程+协程的方式,再对协程进行测试
  • 源码获取