0%

多线程爬虫-BeautifulSoup

线程基类

1
2
3
4
5
6
7
8
import threading
class base_thread(threading.Thread):
    """Minimal worker thread that runs a stored zero-argument callable.

    Parameters
    ----------
    func : callable
        Zero-argument callable executed on the worker thread when the
        thread is started.
    """

    def __init__(self, func):
        threading.Thread.__init__(self)
        self.func = func  # executed by run()

    def run(self):
        # BUG FIX: the original body was the bare expression `self.func`,
        # which evaluates the attribute and discards it — the callable was
        # never invoked, so every thread did nothing. Call it.
        self.func()

爬虫百度贴吧中某帖子图片

  • 先要安装BeautifulSoup
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import urllib.request as request
from bs4 import BeautifulSoup
from Base.Threads import base_thread
path = 'e:/apps/pic/'
def CrawlerFunc(url):
    """Scrape image URLs from a Tieba post at *url* and download them.

    Parameters
    ----------
    url : str
        Address of the page to crawl; handed to getUrl().
    """
    list_img = getUrl(url)
    # BUG FIX: the original passed `downloadImg(list_img)` — i.e. it
    # CALLED downloadImg eagerly on the main thread and handed its None
    # result to every spawned thread. Pass a real callable instead.
    # A single worker processes the whole list because downloadImg
    # numbers output files from 1 on each call, so N concurrent copies
    # would all race on the same 1.jpg, 2.jpg, ... filenames.
    multi_thread(1, lambda: downloadImg(list_img))
def getUrl(url):
    """Fetch *url* and return the ``src`` of every post image on the page.

    Targets ``<img class="BDE_Image">`` tags (Baidu Tieba post images).

    Parameters
    ----------
    url : str
        Page address to download and parse.

    Returns
    -------
    list[str]
        Image source URLs, in document order; empty if none matched.
    """
    # Context manager closes the HTTP response even if decoding fails
    # (the original leaked the connection).
    with request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # results consistent across environments.
    soup = BeautifulSoup(data, 'html.parser')
    list_img = []
    # BUG FIX: the original passed the SET {"class", "BDE_Image"} as the
    # attrs filter (a dict was intended) and shadowed the builtin `list`
    # as the loop variable.
    for tag in soup.find_all("img", {"class": "BDE_Image"}):
        list_img.append(tag.attrs["src"])
    return list_img

def downloadImg(list_img):
    """Download every URL in *list_img* into `path` as 1.jpg, 2.jpg, ...

    Parameters
    ----------
    list_img : list[str]
        Image URLs to fetch; files are numbered by position, from 1.
    """
    for count, src in enumerate(list_img, start=1):
        filepath = path + str(count) + ".jpg"
        print(filepath)
        # BUG FIX: fetch BEFORE opening the output file — the original
        # opened (and thus created/truncated) the file first, leaving an
        # empty file behind whenever the request raised.
        image_data = request.urlopen(src).read()
        with open(filepath, 'wb') as file:
            file.write(image_data)
def multi_thread(count, func):
    """Run *func* on *count* worker threads and block until all finish.

    Parameters
    ----------
    count : int
        Number of base_thread workers to spawn.
    func : callable
        Payload handed to each base_thread instance.
    """
    workers = [base_thread(func) for _ in range(count)]
    # Start every worker before joining any, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
#CrawlerFunc("http://tieba.baidu.com/p/3764230390")

参考