
A First Taste of Multiprocess Crawling in Python

Foreword

Recently, on a teaching assistant's recommendation, I worked through some new Python crawler material on 莫烦Python, mainly multiprocessing and multithreading. Combined with a course assignment, this was my first real taste of the speed that multiprocessing can bring.

Requirements

Given a city name, fetch its basic introduction from Baidu Baike. For example, given 北京, visit https://baike.baidu.com/item/北京 and obtain the following table:

[Figure: the Baike infobox table]

Extract the data from the table and write it to a .txt file, as shown below:

[Figure: the resulting text file]

Basic approach and code

The page is fetched with the requests library, which is more convenient than the urllib library I used before because the Chinese characters in the URL no longer need special handling. BeautifulSoup then extracts the infobox, and the result is written to a file. A short aside on that encoding point first, then the assignment code:
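Purely as an illustration (北京 stands in for any city name): urllib needs the Chinese path segment percent-encoded by hand, while requests accepts the raw URL and encodes it internally.

from urllib.parse import quote
import requests

city = "北京"

# urllib.request.urlopen() only accepts ASCII URLs, so the Chinese path segment
# would have to be percent-encoded by hand first:
url_for_urllib = "https://baike.baidu.com/item/" + quote(city)
print(url_for_urllib)   # https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC

# requests encodes the non-ASCII part internally, so the raw URL can be passed as-is
# (the User-Agent header mirrors the one used in the assignment code below):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
res = requests.get("https://baike.baidu.com/item/" + city, headers=headers)
print(res.status_code)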

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 30 09:52:02 2018

@author: busby
"""
import re
import requests
import bs4


def open_url(url):
    # A browser User-Agent header works around Baike's anti-crawler check
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    res = requests.get(url, headers=headers).content.decode('utf-8')
    soup = bs4.BeautifulSoup(res, 'lxml')
    return soup


def get_details(soup):
    try:  # grab the whole infobox, then its keys, values and footnote markers
        table = soup.find('div', class_="basic-info cmn-clearfix")
        table_dts = table.find_all('dt', class_="basicInfo-item name")
        table_dds = table.find_all('dd', class_="basicInfo-item value")
        table_zs = table.find_all('sup', class_="sup--normal")
    except AttributeError:  # no infobox on this page
        print("请重新输入正确的城市名!")
        return None

    i = 0
    max_i = len(table_dts)  # number of key/value pairs to iterate over
    keys = []
    values = []
    zss = []
    for each in table_zs:  # collect the footnote markers into one list
        zss.append(each.text.strip().replace(" ", ""))
    while i < max_i:
        key = table_dts[i].text.strip().replace(" ", "")  # strip spaces and line breaks
        value = table_dds[i].text.strip().replace(" ", "")  # same as above
        for each in zss:  # remove footnote markers; a no-op when none are present
            key = key.replace(each, "").strip()
            value = value.replace(each, "").strip()
        keys.append(key)
        values.append(value)
        i += 1

    details = [keys, values, max_i]  # bundle everything into one list to pass around
    return details


def show_details(details):
    max_i = details[2]
    i = 0
    while i < max_i:  # print the full infobox
        print(details[0][i] + ":" + details[1][i])
        i += 1

    print('\n\n\n')
    i = 0

    r = re.compile(r'(\d+(\.\d+)?)')  # matches an integer or a decimal
    while i < max_i:  # print only the area and population figures
        if details[0][i] == '面\xa0\xa0\xa0\xa0积':
            area = r.search(details[1][i])[0]
            print("面积:" + str(area))
        if details[0][i] == '人\xa0\xa0\xa0\xa0口':
            pop = r.search(details[1][i])[0]
            print("人口:" + str(pop))
        i += 1


def main():
    while True:
        city_name = input("请输入城市名称(默认为上海,输入exit为退出):")
        if city_name == '':
            city_name = "上海"
        elif city_name == 'exit':
            print("已退出")
            break
        url = "https://baike.baidu.com/item/" + city_name
        soup = open_url(url)
        details = get_details(soup)
        if details is not None:  # skip printing when the lookup failed
            show_details(details)


if __name__ == '__main__':
    main()

An attempt: fetching the info for every Chinese city at or above the county level

1. Getting the list of Chinese cities at or above the county level

After trying several sources, I eventually found the January 2018 table of administrative division codes at or above the county level (2018年1月中华人民共和国县以上行政区划代码) on the Ministry of Civil Affairs website, scraped it, and consolidated the entries by province. A minimal sketch of the grouping idea first, then the full script:
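The consolidation relies on one property of the code table: division codes within a province share the same two-digit prefix (110000 is Beijing itself, 110101 its Dongcheng district, and so on). The rows below are made-up samples, not the scraped table:

from itertools import groupby

# hypothetical (code, name) rows, in the order they appear in the table
rows = [("110000", "北京市"), ("110101", "东城区"),
        ("120000", "天津市"), ("120101", "和平区")]

# group consecutive rows whose codes share the same two-digit province prefix
provinces = [[name for _, name in group]
             for _, group in groupby(rows, key=lambda r: r[0][:2])]

print(provinces)   # [['北京市', '东城区'], ['天津市', '和平区']]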

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 11:53:39 2018

@author: busby
"""

import requests
import bs4


def get_res(url):
    res = requests.get(url).content.decode('utf-8')
    soup = bs4.BeautifulSoup(res, 'lxml')
    content = soup.find_all('td', class_="xl7012452")  # the table cells that hold the data
    id_ = []
    name_ = []
    i = 0
    new_content = []
    for each in content:  # drop the empty cells
        if each.text != "":
            new_content.append(each)

    while i < len(new_content):  # cells alternate: even index = division code, odd index = name
        if i % 2 == 0:
            id_.append(new_content[i].text)
        else:
            name_.append(new_content[i].text)
        i += 1

    provinces = []
    cities = []
    i = 0
    while i < len(id_):
        if i == 0:  # the first row opens the first province
            cities.append(name_[i])
        elif id_[i][0:2] == id_[i-1][0:2]:  # same two-digit prefix: still the same province
            cities.append(name_[i])
        else:  # prefix changed: close the previous province and start a new one
            provinces.append(cities)
            cities = []
            cities.append(name_[i])
        i += 1
    provinces.append(cities)  # keep the last province as well
    return provinces


def write_list(provinces):
    file = open("中华人民共和国县以上行政区划代码.txt", 'a', encoding='utf-8')
    for province in provinces:
        for city in province:
            file.write(city + "\n")
        file.write("\n")  # a blank line separates one province from the next
    file.close()


def main():
    url = 'http://www.mca.gov.cn/article/sj/tjbz/a/2018/201803/201803191002.html'
    provinces = get_res(url)
    write_list(provinces)


if __name__ == '__main__':
    main()

In total this yielded roughly 3,200 cities at or above the county level.

2.1 First attempt: a plain loop to fetch the Baike info for every city

The first attempt did not use multiprocessing: a plain loop took about half an hour to fetch all the cities, though essentially none were lost. The code:

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 16:11:47 2018

@author: busby
"""

import time
import requests
import bs4


def open_url(url):
    # A browser User-Agent plus a Referer work around Baike's anti-crawler check
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
        'Referer': 'https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC/128981?fr=aladdin&fromid=126069&fromtitle=%E5%8C%97%E4%BA%AC%E5%B8%82'
    }

    res = requests.get(url, headers=headers).content.decode('utf-8')
    soup = bs4.BeautifulSoup(res, 'lxml')
    return soup


def get_details(soup):
    try:  # grab the whole infobox, then its keys, values and footnote markers
        table = soup.find('div', class_="basic-info cmn-clearfix")
        table_dts = table.find_all('dt', class_="basicInfo-item name")
        table_dds = table.find_all('dd', class_="basicInfo-item value")
        table_zs = table.find_all('sup', class_="sup--normal")
    except AttributeError:  # no infobox on this page: skip the entry
        return None

    i = 0
    max_i = len(table_dts)  # number of key/value pairs to iterate over
    keys = []
    values = []
    zss = []
    for each in table_zs:  # collect the footnote markers into one list
        zss.append(each.text.strip().replace(" ", ""))
    while i < max_i:
        key = table_dts[i].text.strip().replace(" ", "")  # strip spaces and line breaks
        value = table_dds[i].text.strip().replace(" ", "")  # same as above
        for each in zss:  # remove footnote markers; a no-op when none are present
            key = key.replace(each, "").strip()
            value = value.replace(each, "").strip()
        keys.append(key)
        values.append(value)
        i += 1

    details = [keys, values, max_i]  # bundle everything into one list to pass around
    return details


def save_details(details):
    max_i = details[2]
    i = 0
    file = open('中国县级以上城市信息简表.txt', 'a', encoding='utf-8')
    while i < max_i:  # write the full infobox
        file.write(details[0][i] + ":" + details[1][i] + "\n")
        i += 1
    file.write('\n\n')
    file.close()


def get_list():
    file = open("中华人民共和国县以上行政区划代码.txt", "r", encoding="utf-8")
    all_lines = file.readlines()
    details = []
    provinces = []
    cities = []

    for line in all_lines:
        details.append(line.split())

    i = 0
    while i < len(details):  # blank lines separate one province from the next
        if len(details[i]) != 0:
            cities.append(details[i])
        else:
            provinces.append(cities)
            cities = []
        i += 1
    return provinces


def main():
    st = time.time()
    num = 1
    city_lists = get_list()
    for province in city_lists:
        for city in province:
            time.sleep(0.1)  # throttle the request rate a little
            # drop the trailing 市/县/区 character before building the entry URL
            url = "https://baike.baidu.com/item/" + city[0][:-1]
            print(url)
            soup = open_url(url)
            details = get_details(soup)
            if details is not None:
                save_details(details)
            print("已完成", city[0], "共", num, "座城市的信息获取")
            num += 1

    file = open('中国县级以上城市信息简表.txt', 'a', encoding='utf-8')
    file.write('\n\n\n\n\n\n\n\n\n\n')
    file.close()
    print("用时:", time.time() - st)


if __name__ == '__main__':
    main()

2.2 Second attempt: a multiprocessing pool to fetch the Baike info for every city

Since the plain loop was far too slow, I rewrote the code with the newly learned multiprocessing module. Being a beginner, I struggled for a long time without success; the breakthrough came when I realized that pool.map itself expects a whole list as its argument, which finally solved the problem (a reminder that I should have watched the tutorial videos more carefully and taken proper notes). A minimal sketch of the Pool.map call pattern first, then the full script:
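The worker function and data here are made up just to show the shape of the call: map takes a function plus an entire list, distributes the items across the worker processes, and returns the results as a list in the same order.

from multiprocessing import Pool

def square(x):  # worker function, executed in a child process
    return x * x

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        results = pool.map(square, [1, 2, 3, 4, 5])  # the whole list goes in at once
    print(results)  # [1, 4, 9, 16, 25]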

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 16:11:47 2018

@author: busby
"""

import time
import requests
import bs4
from multiprocessing import Pool


def get_details(soup):
    try:  # grab the whole infobox, then its keys, values and footnote markers
        table = soup.find('div', class_="basic-info cmn-clearfix")
        table_dts = table.find_all('dt', class_="basicInfo-item name")
        table_dds = table.find_all('dd', class_="basicInfo-item value")
        table_zs = table.find_all('sup', class_="sup--normal")
    except AttributeError:  # no infobox on this page: report it as None
        return None

    i = 0
    max_i = len(table_dts)  # number of key/value pairs to iterate over
    keys = []
    values = []
    zss = []
    for each in table_zs:  # collect the footnote markers into one list
        zss.append(each.text.strip().replace(" ", ""))
    while i < max_i:
        key = table_dts[i].text.strip().replace(" ", "")  # strip spaces and line breaks
        value = table_dds[i].text.strip().replace(" ", "")  # same as above
        for each in zss:  # remove footnote markers; a no-op when none are present
            key = key.replace(each, "").strip()
            value = value.replace(each, "").strip()
        keys.append(key)
        values.append(value)
        i += 1

    details = [keys, values, max_i]  # bundle everything into one list to pass around
    return details


def open_url(url):
    # A browser User-Agent plus a Referer work around Baike's anti-crawler check
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
        'Referer': 'https://baike.baidu.com/item/%E5%8C%97%E4%BA%AC/128981?fr=aladdin&fromid=126069&fromtitle=%E5%8C%97%E4%BA%AC%E5%B8%82'
    }

    res = requests.get(url, headers=headers).content.decode('utf-8')
    soup = bs4.BeautifulSoup(res, 'lxml')
    details = get_details(soup)
    return details


def save_details(details):
    max_i = details[2]
    i = 0
    file = open('中国县级以上城市信息简表2.txt', 'a', encoding='utf-8')
    while i < max_i:  # write the full infobox
        file.write(details[0][i] + ":" + details[1][i] + "\n")
        i += 1
    file.write('\n\n')
    file.close()
    return 'Success'


def get_list():
    file = open("中华人民共和国县以上行政区划代码.txt", "r", encoding="utf-8")
    all_lines = file.readlines()
    details = []
    provinces = []
    cities = []

    for line in all_lines:
        details.append(line.split())

    i = 0
    while i < len(details):  # blank lines separate one province from the next
        if len(details[i]) != 0:
            cities.append(details[i])
        else:
            provinces.append(cities)
            cities = []
        i += 1
    return provinces


def make_urls(name):
    name = name[0]  # each item from get_list() is a one-element list holding the city name
    url = "https://baike.baidu.com/item/" + str(name)
    return url


def main():
    num = 1
    st = time.time()
    provinces_name = get_list()
    pool = Pool(processes=8)
    for province in provinces_name:
        provinces_urls = pool.map(make_urls, province)       # build the URLs in parallel
        provinces_txt = pool.map(open_url, provinces_urls)   # fetch and parse in parallel
        for each in provinces_txt:
            try:
                result = save_details(each)
                print(result, num)
            except:  # typically because the details for this city came back as None
                print('failed', each)
            num += 1
    pool.close()
    pool.join()

    print(time.time() - st)


if __name__ == '__main__':
    main()

Multiprocessing did not disappoint: fetching the information for all 3,200-odd cities took only about three minutes, a clear win on speed. The one disappointment is that a few cities came back as None, and I have not yet figured out why. If any expert passes by, please enlighten me, many thanks!
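If I had to guess at the cause, one suspect is that get_details() returns None whenever the fetched page has no basic-info div at all, for example when a name lands on a disambiguation entry or the request is rejected. A small standalone spot-check along these lines might narrow it down (a sketch only, reusing the same selector as above; the two names are placeholders for whichever entries actually came back as None):

import requests
import bs4

def has_infobox(name):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    html = requests.get("https://baike.baidu.com/item/" + name, headers=headers).content.decode('utf-8')
    soup = bs4.BeautifulSoup(html, 'lxml')
    return soup.find('div', class_="basic-info cmn-clearfix") is not None

# replace these placeholders with the names whose results came back as None
for name in ["北京市", "东城区"]:
    print(name, has_infobox(name))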

Final notes

  1. My GitHub page for learning Python: https://github.com/busbyjrj/Python-Learning

  2. The code from this post (GitHub): https://github.com/busbyjrj/Python-Learning/tree/master/multiprocessing/city_list

  3. This crawler is for learning purposes only; please throttle your request rate so that normal Baidu users are not affected. Thank you.

  4. My multiprocessing code still has a problem: it can lose the information for a few cities, and I have not yet found the cause. If any expert passes by, please enlighten me. Many thanks!

  5. Although the example only deals with city information, in my tests the code also works on other Baidu Baike entries, though there may still be a bug to fix there as well. Advice is welcome. Thank you!

Last updated: April 14, 2018